- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
259 lines
9.7 KiB
Python
259 lines
9.7 KiB
Python
"""Tests for src.core.dedup — matching engine."""
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.dedup import (
|
|
Algorithm,
|
|
ColumnMatchStrategy,
|
|
MatchStrategy,
|
|
SurvivorRule,
|
|
_compute_similarity,
|
|
_compare_pair,
|
|
_UnionFind,
|
|
build_default_strategies,
|
|
deduplicate,
|
|
)
|
|
from src.core.normalizers import NormalizerType
|
|
|
|
|
|
class TestComputeSimilarity:
    """Checks on ``_compute_similarity`` for the algorithms exercised here."""

    def test_exact_match(self):
        """Identical strings are expected to score 100.0 under EXACT."""
        outcome = _compute_similarity("hello", "hello", Algorithm.EXACT)
        assert outcome == 100.0

    def test_exact_mismatch(self):
        """Different strings are expected to score 0.0 under EXACT."""
        outcome = _compute_similarity("hello", "world", Algorithm.EXACT)
        assert outcome == 0.0

    def test_levenshtein_similar(self):
        """The kitten/sitting pair should score strictly between 50 and 80."""
        similarity = _compute_similarity(
            "kitten", "sitting", Algorithm.LEVENSHTEIN
        )
        assert similarity > 50 and similarity < 80

    def test_jaro_winkler_similar(self):
        """The john/jon pair should score above 80 under JARO_WINKLER."""
        similarity = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
        assert similarity > 80

    def test_token_set_ratio(self):
        """The same tokens in a different order should score 100.0."""
        address = "123 main street apt 4"
        reordered = "apt 4 123 main street"
        similarity = _compute_similarity(
            address, reordered, Algorithm.TOKEN_SET_RATIO
        )
        assert similarity == 100.0
|
|
|
|
|
|
class TestUnionFind:
    """Checks on the ``_UnionFind`` helper: ``union``, ``find`` and ``groups``."""

    def test_basic_union(self):
        """Unions 0-1 and 1-2 must leave 0 and 2 with the same representative."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2)):
            forest.union(left, right)
        first_root = forest.find(0)
        last_root = forest.find(2)
        assert first_root == last_root  # transitive

    def test_separate_groups(self):
        """Elements joined in unrelated unions keep different representatives."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (3, 4)):
            forest.union(left, right)
        first_root = forest.find(0)
        other_root = forest.find(3)
        assert first_root != other_root

    def test_groups(self):
        """Five elements joined as {0,1,2} and {3,4} yield group sizes 2 and 3."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2), (3, 4)):
            forest.union(left, right)
        clusters = forest.groups()
        assert len(clusters) == 2
        member_counts = sorted(map(len, clusters.values()))
        assert member_counts == [2, 3]
|
|
|
|
|
|
class TestComparePair:
    """Checks on ``_compare_pair`` for single- and multi-column strategies."""

    def test_exact_match(self):
        """Equal emails match with confidence 100.0 and report the column."""
        email_only = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
            ]
        )
        left = pd.Series({"email": "test@example.com"})
        right = pd.Series({"email": "test@example.com"})
        matched, confidence, matched_columns = _compare_pair(
            left, right, email_only
        )
        assert matched
        assert confidence == 100.0
        assert matched_columns == ["email"]

    def test_exact_mismatch(self):
        """Different emails must not match under an EXACT strategy."""
        email_only = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
            ]
        )
        left = pd.Series({"email": "a@test.com"})
        right = pd.Series({"email": "b@test.com"})
        # Only the match flag is checked; the other two values are ignored.
        matched, _, _ = _compare_pair(left, right, email_only)
        assert not matched

    def test_fuzzy_match(self):
        """Near-identical names match with confidence above the threshold."""
        fuzzy_name = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(
                    column="name",
                    algorithm=Algorithm.JARO_WINKLER,
                    threshold=80,
                ),
            ]
        )
        left = pd.Series({"name": "john smith"})
        right = pd.Series({"name": "jon smith"})
        matched, confidence, _ = _compare_pair(left, right, fuzzy_name)
        assert matched
        assert confidence > 80

    def test_and_logic_both_must_match(self):
        """With two column strategies, one mismatching column blocks the match."""
        name_and_email = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(column=field, algorithm=Algorithm.EXACT)
                for field in ("name", "email")
            ]
        )
        # name matches, email doesn't
        left = pd.Series({"name": "alice", "email": "a@test.com"})
        right = pd.Series({"name": "alice", "email": "b@test.com"})
        matched, _, _ = _compare_pair(left, right, name_and_email)
        assert not matched
|
|
|
|
|
|
class TestBuildDefaultStrategies:
    """Checks on the strategies ``build_default_strategies`` derives from a frame."""

    def test_detects_email(self):
        """An email column gets the EMAIL normalizer; name never stands alone."""
        frame = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
        strategies = build_default_strategies(frame)

        # email (strong, standalone) + name AND email (weak paired with strong) = 2
        assert len(strategies) == 2

        email_seen = False
        for strategy in strategies:
            for col_strategy in strategy.column_strategies:
                if (
                    col_strategy.column == "email"
                    and col_strategy.normalizer == NormalizerType.EMAIL
                ):
                    email_seen = True
                    break
            if email_seen:
                break
        assert email_seen

        # Name should only appear paired with email, not standalone
        for strategy in strategies:
            uses_name = any(
                col_strategy.column == "name"
                for col_strategy in strategy.column_strategies
            )
            if not uses_name:
                continue
            assert len(strategy.column_strategies) >= 2, "Name should be paired with a strong key"

    def test_fallback_all_columns(self):
        """With columns x, y, z only, one strategy covers all three columns."""
        frame = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
        strategies = build_default_strategies(frame)
        assert len(strategies) == 1
        fallback = strategies[0]
        assert len(fallback.column_strategies) == 3
|
|
|
|
|
|
class TestDeduplicate:
    """End-to-end checks on ``deduplicate`` using fixtures and inline frames."""

    @staticmethod
    def _exact_on(column):
        """Build a single-column strategy that matches ``column`` with EXACT."""
        return MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(column=column, algorithm=Algorithm.EXACT),
            ]
        )

    def test_exact_duplicates(self, simple_df):
        """Exact email matching collapses the repeated rows into one group."""
        # Alice appears 3 times with same email
        by_email = self._exact_on("email")
        outcome = deduplicate(simple_df, strategies=[by_email])
        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
        assert len(outcome.deduplicated_df) == 3
        assert outcome.original_row_count == 5
        assert len(outcome.match_groups) == 1

    def test_fuzzy_name_match(self):
        """Two near-identical names merge; the unrelated name is kept."""
        people = pd.DataFrame(
            {
                "name": ["John Smith", "Jon Smith", "Jane Doe"],
                "email": ["a@test.com", "b@test.com", "c@test.com"],
            }
        )
        fuzzy_name = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=85,
            normalizer=NormalizerType.NAME,
        )
        by_name = MatchStrategy(column_strategies=[fuzzy_name])
        outcome = deduplicate(people, strategies=[by_name])
        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 1

    def test_survivor_keep_last(self, simple_df):
        """KEEP_LAST reports index 4 as the survivor of the single group."""
        by_email = self._exact_on("email")
        outcome = deduplicate(
            simple_df,
            strategies=[by_email],
            survivor_rule=SurvivorRule.KEEP_LAST,
        )
        # The last Alice (index 4) should survive
        assert len(outcome.match_groups) == 1
        only_group = outcome.match_groups[0]
        assert only_group.survivor_index == 4

    def test_survivor_most_complete(self, merge_df):
        """KEEP_MOST_COMPLETE leaves two rows for the ``merge_df`` fixture."""
        by_email = self._exact_on("email")
        outcome = deduplicate(
            merge_df,
            strategies=[by_email],
            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE,
        )
        # Row 0 has phone but no address (1 empty)
        # Row 1 has address but no phone (1 empty)
        # Both have 1 empty, so keep_first among ties
        assert len(outcome.deduplicated_df) == 2

    def test_merge_mode(self, merge_df):
        """With ``merge=True`` the surviving row has both phone and address."""
        by_email = self._exact_on("email")
        outcome = deduplicate(merge_df, strategies=[by_email], merge=True)
        # Survivor should have both phone and address filled
        is_john = outcome.deduplicated_df["name"] == "John Doe"
        john = outcome.deduplicated_df[is_john].iloc[0]
        assert john["phone"] == "555-1111"
        assert john["address"] == "123 Main St"

    def test_multi_strategy_or(self):
        """Matches from either strategy are chained into a single group."""
        people = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Alice B."],
                "email": ["a@test.com", "a@test.com", "c@test.com"],
            }
        )
        # Strategy 1: match on email
        by_email = self._exact_on("email")
        # Strategy 2: match on name (fuzzy)
        by_name = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(
                    column="name",
                    algorithm=Algorithm.JARO_WINKLER,
                    threshold=70,
                ),
            ]
        )
        outcome = deduplicate(people, strategies=[by_email, by_name])
        # All three should end up in one group via transitive closure:
        # Alice~Bob (email), Alice~Alice B. (name)
        assert len(outcome.deduplicated_df) == 1

    def test_confidence_score(self, simple_df):
        """Every reported group confidence lies within [0, 100]."""
        by_email = self._exact_on("email")
        outcome = deduplicate(simple_df, strategies=[by_email])
        for match_group in outcome.match_groups:
            assert 0 <= match_group.confidence <= 100

    def test_preview_flag(self, simple_df):
        """The ``preview`` argument is reflected in ``is_preview`` on the result."""
        previewed = deduplicate(simple_df, preview=True)
        assert previewed.is_preview is True
        applied = deduplicate(simple_df, preview=False)
        assert applied.is_preview is False

    def test_auto_detect_strategies(self, sample_df):
        """Without explicit strategies, ``sample_df`` still yields match groups."""
        outcome = deduplicate(sample_df)
        # Should find duplicates in the sample data
        assert len(outcome.match_groups) > 0
        assert len(outcome.deduplicated_df) < len(sample_df)

    def test_idempotent(self, sample_df):
        """A second pass over already-deduplicated output finds nothing new."""
        first_pass = deduplicate(sample_df)
        second_pass = deduplicate(first_pass.deduplicated_df)
        # Second pass should find no new duplicates
        assert len(second_pass.match_groups) == 0
        assert len(second_pass.deduplicated_df) == len(first_pass.deduplicated_df)

    def test_review_callback(self):
        """A callback returning False keeps all rows; returning True drops one."""
        emails = pd.DataFrame(
            {
                "email": ["a@test.com", "a@test.com", "b@test.com"],
            }
        )
        by_email = self._exact_on("email")

        # Parameter names are kept as ``g`` and ``d`` in case the engine
        # passes them by keyword.
        def reject_all(g, d):
            return False

        def accept_all(g, d):
            return True

        # Reject all matches
        outcome = deduplicate(
            emails, strategies=[by_email], review_callback=reject_all
        )
        assert len(outcome.deduplicated_df) == 3  # nothing removed

        # Accept all matches
        outcome = deduplicate(
            emails, strategies=[by_email], review_callback=accept_all
        )
        assert len(outcome.deduplicated_df) == 2
|