Files
datatools-dev/tests/test_dedup.py
Michael b871ab24fc feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00

259 lines
9.7 KiB
Python

"""Tests for src.core.dedup — matching engine."""
import pandas as pd
import pytest
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
MatchStrategy,
SurvivorRule,
_compute_similarity,
_compare_pair,
_UnionFind,
build_default_strategies,
deduplicate,
)
from src.core.normalizers import NormalizerType
class TestComputeSimilarity:
    """Score expectations for ``_compute_similarity`` under each algorithm used here."""

    def test_exact_match(self):
        """Identical strings score 100.0 with ``Algorithm.EXACT``."""
        similarity = _compute_similarity("hello", "hello", Algorithm.EXACT)
        assert similarity == 100.0

    def test_exact_mismatch(self):
        """Different strings score 0.0 with ``Algorithm.EXACT``."""
        similarity = _compute_similarity("hello", "world", Algorithm.EXACT)
        assert similarity == 0.0

    def test_levenshtein_similar(self):
        """The kitten/sitting pair scores strictly between 50 and 80."""
        left, right = "kitten", "sitting"
        similarity = _compute_similarity(left, right, Algorithm.LEVENSHTEIN)
        assert similarity > 50
        assert similarity < 80

    def test_jaro_winkler_similar(self):
        """A one-letter name variant scores above 80 with Jaro-Winkler."""
        left, right = "john", "jon"
        similarity = _compute_similarity(left, right, Algorithm.JARO_WINKLER)
        assert similarity > 80

    def test_token_set_ratio(self):
        """The same tokens in a different order score a full 100.0."""
        address = "123 main street apt 4"
        reordered = "apt 4 123 main street"
        similarity = _compute_similarity(
            address, reordered, Algorithm.TOKEN_SET_RATIO
        )
        assert similarity == 100.0
class TestUnionFind:
    """Behaviour checks for the ``_UnionFind`` helper over five elements."""

    def test_basic_union(self):
        """Joining 0-1 and then 1-2 leaves 0 and 2 under the same root."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2)):
            forest.union(left, right)
        # Transitivity: 0 and 2 were never joined directly.
        assert forest.find(0) == forest.find(2)

    def test_separate_groups(self):
        """Unions that share no element keep distinct roots."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (3, 4)):
            forest.union(left, right)
        assert forest.find(0) != forest.find(3)

    def test_groups(self):
        """``groups()`` reports one cluster of three and one of two."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2), (3, 4)):
            forest.union(left, right)
        clusters = forest.groups()
        assert len(clusters) == 2
        member_counts = sorted(map(len, clusters.values()))
        assert member_counts == [2, 3]
class TestComparePair:
    """Checks for ``_compare_pair``, which returns (is_match, confidence, columns)."""

    def test_exact_match(self):
        """Equal emails match with confidence 100.0 and report the email column."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        left = pd.Series({"email": "test@example.com"})
        right = pd.Series({"email": "test@example.com"})
        matched, confidence, matched_columns = _compare_pair(left, right, strategy)
        assert matched
        assert confidence == 100.0
        assert matched_columns == ["email"]

    def test_exact_mismatch(self):
        """Different emails do not match under an exact rule."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        left = pd.Series({"email": "a@test.com"})
        right = pd.Series({"email": "b@test.com"})
        # Only the match flag matters here; unpacking still requires a 3-tuple.
        matched, _confidence, _columns = _compare_pair(left, right, strategy)
        assert not matched

    def test_fuzzy_match(self):
        """A close name variant matches at a Jaro-Winkler threshold of 80."""
        name_rule = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=80,
        )
        strategy = MatchStrategy(column_strategies=[name_rule])
        left = pd.Series({"name": "john smith"})
        right = pd.Series({"name": "jon smith"})
        matched, confidence, _columns = _compare_pair(left, right, strategy)
        assert matched
        assert confidence > 80

    def test_and_logic_both_must_match(self):
        """Every column rule in a strategy must match for the pair to match."""
        column_rules = [
            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ]
        strategy = MatchStrategy(column_strategies=column_rules)
        # The names agree but the emails differ.
        left = pd.Series({"name": "alice", "email": "a@test.com"})
        right = pd.Series({"name": "alice", "email": "b@test.com"})
        matched, _confidence, _columns = _compare_pair(left, right, strategy)
        assert not matched
class TestBuildDefaultStrategies:
    """Checks for the strategies ``build_default_strategies`` derives from a frame."""

    def test_detects_email(self):
        """An email column gets the EMAIL normalizer; name only appears paired."""
        df = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
        strats = build_default_strategies(df)
        # email (strong, standalone) + name AND email (weak paired with strong) = 2
        assert len(strats) == 2
        found_email = any(
            cs.column == "email" and cs.normalizer == NormalizerType.EMAIL
            for s in strats
            for cs in s.column_strategies
        )
        assert found_email
        # Name should only appear paired with email, not standalone
        name_strats = [
            s for s in strats
            if any(cs.column == "name" for cs in s.column_strategies)
        ]
        # Without this guard the loop below passes vacuously when no strategy
        # mentions "name", leaving the pairing claim untested.
        assert name_strats, "Expected at least one strategy that includes name"
        for s in name_strats:
            assert len(s.column_strategies) >= 2, "Name should be paired with a strong key"

    def test_fallback_all_columns(self):
        """With no recognised columns, one strategy covers all three columns."""
        df = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
        strats = build_default_strategies(df)
        assert len(strats) == 1
        assert len(strats[0].column_strategies) == 3
class TestDeduplicate:
    """End-to-end checks for ``deduplicate``.

    ``simple_df``, ``merge_df`` and ``sample_df`` are pytest fixtures defined
    outside this file (presumably in ``conftest.py``). Statements about their
    contents come from the original test comments, not from code visible here.
    """

    def test_exact_duplicates(self, simple_df):
        """Rows sharing an email collapse into one survivor and one match group."""
        # Alice appears 3 times with same email
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        result = deduplicate(simple_df, strategies=[strategy])
        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
        assert len(result.deduplicated_df) == 3
        assert result.original_row_count == 5
        assert len(result.match_groups) == 1

    def test_fuzzy_name_match(self):
        """Near-identical names merge under Jaro-Winkler with the NAME normalizer."""
        df = pd.DataFrame({
            "name": ["John Smith", "Jon Smith", "Jane Doe"],
            "email": ["a@test.com", "b@test.com", "c@test.com"],
        })
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(
                column="name",
                algorithm=Algorithm.JARO_WINKLER,
                threshold=85,
                normalizer=NormalizerType.NAME,
            ),
        ])
        result = deduplicate(df, strategies=[strategy])
        assert len(result.deduplicated_df) == 2
        assert len(result.match_groups) == 1

    def test_survivor_keep_last(self, simple_df):
        """``KEEP_LAST`` selects the last row of the group as survivor."""
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        result = deduplicate(
            simple_df,
            strategies=[strategy],
            survivor_rule=SurvivorRule.KEEP_LAST,
        )
        # The last Alice (index 4) should survive
        assert len(result.match_groups) == 1
        assert result.match_groups[0].survivor_index == 4

    def test_survivor_most_complete(self, merge_df):
        """``KEEP_MOST_COMPLETE`` leaves two rows for the ``merge_df`` fixture."""
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        result = deduplicate(
            merge_df,
            strategies=[strategy],
            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE,
        )
        # Row 0 has phone but no address (1 empty)
        # Row 1 has address but no phone (1 empty)
        # Both have 1 empty, so keep_first among ties
        # NOTE(review): only the row count is asserted; the tie-break described
        # above is not verified here. Consider asserting the survivor index.
        assert len(result.deduplicated_df) == 2

    def test_merge_mode(self, merge_df):
        """With ``merge=True`` the survivor carries values from both duplicates."""
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        result = deduplicate(merge_df, strategies=[strategy], merge=True)
        # Survivor should have both phone and address filled
        deduped = result.deduplicated_df
        john_row = deduped[deduped["name"] == "John Doe"].iloc[0]
        assert john_row["phone"] == "555-1111"
        assert john_row["address"] == "123 Main St"

    def test_multi_strategy_or(self):
        """Multiple strategies combine with OR and close transitively."""
        df = pd.DataFrame({
            "name": ["Alice", "Bob", "Alice B."],
            "email": ["a@test.com", "a@test.com", "c@test.com"],
        })
        # Strategy 1: match on email
        # Strategy 2: match on name (fuzzy)
        strat1 = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        strat2 = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=70),
        ])
        result = deduplicate(df, strategies=[strat1, strat2])
        # All three should end up in one group via transitive closure:
        # Alice~Bob (email), Alice~Alice B. (name)
        assert len(result.deduplicated_df) == 1

    def test_confidence_score(self, simple_df):
        """Every reported match group has a confidence within [0, 100]."""
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        result = deduplicate(simple_df, strategies=[strategy])
        # Same fixture and strategy as test_exact_duplicates, which expects one
        # group; without this guard the loop would pass vacuously on no groups.
        assert result.match_groups, "Expected at least one match group to check"
        for group in result.match_groups:
            assert 0 <= group.confidence <= 100

    def test_preview_flag(self, simple_df):
        """``is_preview`` on the result mirrors the ``preview`` argument."""
        previewed = deduplicate(simple_df, preview=True)
        assert previewed.is_preview is True
        applied = deduplicate(simple_df, preview=False)
        assert applied.is_preview is False

    def test_auto_detect_strategies(self, sample_df):
        """Without explicit strategies, duplicates are still found in ``sample_df``."""
        result = deduplicate(sample_df)
        # Should find duplicates in the sample data
        assert len(result.match_groups) > 0
        assert len(result.deduplicated_df) < len(sample_df)

    def test_idempotent(self, sample_df):
        """Running dedup twice with same config produces same output."""
        first_pass = deduplicate(sample_df)
        second_pass = deduplicate(first_pass.deduplicated_df)
        # Second pass should find no new duplicates
        assert len(second_pass.match_groups) == 0
        assert len(second_pass.deduplicated_df) == len(first_pass.deduplicated_df)

    def test_review_callback(self):
        """The review callback's boolean decides whether a match is applied."""
        df = pd.DataFrame({
            "email": ["a@test.com", "a@test.com", "b@test.com"],
        })
        strategy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        # Reject all matches
        rejected = deduplicate(
            df,
            strategies=[strategy],
            review_callback=lambda g, d: False,
        )
        assert len(rejected.deduplicated_df) == 3  # nothing removed
        # Accept all matches
        accepted = deduplicate(
            df,
            strategies=[strategy],
            review_callback=lambda g, d: True,
        )
        assert len(accepted.deduplicated_df) == 2