feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
258
tests/test_dedup.py
Normal file
258
tests/test_dedup.py
Normal file
@@ -0,0 +1,258 @@
|
||||
"""Tests for src.core.dedup — matching engine."""
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
_compute_similarity,
|
||||
_compare_pair,
|
||||
_UnionFind,
|
||||
build_default_strategies,
|
||||
deduplicate,
|
||||
)
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
class TestComputeSimilarity:
    """Score checks for ``_compute_similarity`` under each algorithm used here."""

    def test_exact_match(self):
        """Two identical strings score 100 under EXACT."""
        score = _compute_similarity("hello", "hello", Algorithm.EXACT)
        assert score == 100.0

    def test_exact_mismatch(self):
        """Two different strings score 0 under EXACT."""
        score = _compute_similarity("hello", "world", Algorithm.EXACT)
        assert score == 0.0

    def test_levenshtein_similar(self):
        """The kitten/sitting pair lands strictly between 50 and 80."""
        result = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN)
        assert result > 50 and result < 80

    def test_jaro_winkler_similar(self):
        """A one-letter name variation scores above 80 under JARO_WINKLER."""
        result = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
        assert result > 80

    def test_token_set_ratio(self):
        """The same tokens in a different order score 100 under TOKEN_SET_RATIO."""
        left = "123 main street apt 4"
        right = "apt 4 123 main street"
        assert _compute_similarity(left, right, Algorithm.TOKEN_SET_RATIO) == 100.0
|
||||
|
||||
|
||||
class TestUnionFind:
    """Behavioural checks for the ``_UnionFind`` helper over five elements."""

    def test_basic_union(self):
        """Unions 0-1 and 1-2 put 0 and 2 under the same root (transitivity)."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2)):
            forest.union(left, right)
        root_of_first = forest.find(0)
        root_of_last = forest.find(2)
        assert root_of_first == root_of_last

    def test_separate_groups(self):
        """Elements joined in unrelated unions keep different roots."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (3, 4)):
            forest.union(left, right)
        root_a = forest.find(0)
        root_b = forest.find(3)
        assert root_a != root_b

    def test_groups(self):
        """``groups()`` reports two groups, of sizes 2 and 3."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2), (3, 4)):
            forest.union(left, right)
        components = forest.groups()
        assert len(components) == 2
        assert sorted(map(len, components.values())) == [2, 3]
|
||||
|
||||
|
||||
class TestComparePair:
    """Checks on the ``(is_match, confidence, columns)`` triple from ``_compare_pair``."""

    def test_exact_match(self):
        """Equal emails match with confidence 100 and report the email column."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        first = pd.Series({"email": "test@example.com"})
        second = pd.Series({"email": "test@example.com"})

        matched, confidence, matched_columns = _compare_pair(first, second, strategy)

        assert matched
        assert confidence == 100.0
        assert matched_columns == ["email"]

    def test_exact_mismatch(self):
        """Different emails do not match under EXACT."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        first = pd.Series({"email": "a@test.com"})
        second = pd.Series({"email": "b@test.com"})

        matched, _confidence, _columns = _compare_pair(first, second, strategy)

        assert not matched

    def test_fuzzy_match(self):
        """A near-identical name matches under JARO_WINKLER with threshold 80."""
        name_rule = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=80,
        )
        strategy = MatchStrategy(column_strategies=[name_rule])
        first = pd.Series({"name": "john smith"})
        second = pd.Series({"name": "jon smith"})

        matched, confidence, _columns = _compare_pair(first, second, strategy)

        assert matched
        assert confidence > 80

    def test_and_logic_both_must_match(self):
        """With two column rules, one matching column alone is not a match."""
        rules = [
            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ]
        strategy = MatchStrategy(column_strategies=rules)
        # Same name on both rows, but the emails differ.
        first = pd.Series({"name": "alice", "email": "a@test.com"})
        second = pd.Series({"name": "alice", "email": "b@test.com"})

        matched, _confidence, _columns = _compare_pair(first, second, strategy)

        assert not matched
|
||||
|
||||
|
||||
class TestBuildDefaultStrategies:
    """Checks on the strategies ``build_default_strategies`` derives from a frame."""

    def test_detects_email(self):
        """An email + name frame yields two strategies; name never stands alone."""
        frame = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
        strategies = build_default_strategies(frame)

        # email (strong, standalone) + name AND email (weak paired with strong) = 2
        assert len(strategies) == 2

        email_rules = [
            rule
            for strategy in strategies
            for rule in strategy.column_strategies
            if rule.column == "email" and rule.normalizer == NormalizerType.EMAIL
        ]
        assert email_rules

        # Any strategy that mentions "name" must carry at least one more column.
        for strategy in strategies:
            columns = [rule.column for rule in strategy.column_strategies]
            if "name" not in columns:
                continue
            assert len(strategy.column_strategies) >= 2, "Name should be paired with a strong key"

    def test_fallback_all_columns(self):
        """Columns x/y/z produce a single strategy spanning all three."""
        frame = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
        strategies = build_default_strategies(frame)

        assert len(strategies) == 1
        only_strategy = strategies[0]
        assert len(only_strategy.column_strategies) == 3
|
||||
|
||||
|
||||
class TestDeduplicate:
    """End-to-end checks of ``deduplicate``.

    Several tests take the ``simple_df``, ``merge_df`` and ``sample_df``
    fixtures, which are defined outside this module; the expected counts
    in those tests depend on the fixture contents.
    """

    def test_exact_duplicates(self, simple_df):
        """Exact email matching collapses the repeated rows into one group."""
        # Per the fixture: Alice appears 3 times with the same email.
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        outcome = deduplicate(simple_df, strategies=[strategy])

        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
        assert len(outcome.deduplicated_df) == 3
        assert outcome.original_row_count == 5
        assert len(outcome.match_groups) == 1

    def test_fuzzy_name_match(self):
        """Two close name spellings merge under a normalised JARO_WINKLER rule."""
        frame = pd.DataFrame(
            {
                "name": ["John Smith", "Jon Smith", "Jane Doe"],
                "email": ["a@test.com", "b@test.com", "c@test.com"],
            }
        )
        name_rule = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=85,
            normalizer=NormalizerType.NAME,
        )
        strategy = MatchStrategy(column_strategies=[name_rule])

        outcome = deduplicate(frame, strategies=[strategy])

        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 1

    def test_survivor_keep_last(self, simple_df):
        """KEEP_LAST picks the final member of the match group as survivor."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        outcome = deduplicate(
            simple_df,
            strategies=[strategy],
            survivor_rule=SurvivorRule.KEEP_LAST,
        )

        # The last Alice (index 4) should survive
        groups = outcome.match_groups
        assert len(groups) == 1
        assert groups[0].survivor_index == 4

    def test_survivor_most_complete(self, merge_df):
        """KEEP_MOST_COMPLETE leaves two rows in the deduplicated output."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        outcome = deduplicate(
            merge_df,
            strategies=[strategy],
            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE,
        )

        # Row 0 has phone but no address (1 empty)
        # Row 1 has address but no phone (1 empty)
        # Both have 1 empty, so keep_first among ties
        assert len(outcome.deduplicated_df) == 2

    def test_merge_mode(self, merge_df):
        """With ``merge=True`` the surviving row has both phone and address."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        outcome = deduplicate(merge_df, strategies=[strategy], merge=True)

        # Survivor should have both phone and address filled
        is_john = outcome.deduplicated_df["name"] == "John Doe"
        survivor = outcome.deduplicated_df[is_john].iloc[0]
        assert survivor["phone"] == "555-1111"
        assert survivor["address"] == "123 Main St"

    def test_multi_strategy_or(self):
        """Matches from separate strategies chain together into one group."""
        frame = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Alice B."],
                "email": ["a@test.com", "a@test.com", "c@test.com"],
            }
        )
        # Strategy 1: match on email
        by_email = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
            ]
        )
        # Strategy 2: match on name (fuzzy)
        by_name = MatchStrategy(
            column_strategies=[
                ColumnMatchStrategy(
                    column="name",
                    algorithm=Algorithm.JARO_WINKLER,
                    threshold=70,
                ),
            ]
        )

        outcome = deduplicate(frame, strategies=[by_email, by_name])

        # All three should end up in one group via transitive closure:
        # Alice~Bob (email), Alice~Alice B. (name)
        assert len(outcome.deduplicated_df) == 1

    def test_confidence_score(self, simple_df):
        """Each match group's confidence lies in the closed range 0..100."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        outcome = deduplicate(simple_df, strategies=[strategy])

        for match_group in outcome.match_groups:
            assert 0 <= match_group.confidence <= 100

    def test_preview_flag(self, simple_df):
        """The ``preview`` argument is reflected in ``result.is_preview``."""
        previewed = deduplicate(simple_df, preview=True)
        assert previewed.is_preview is True

        applied = deduplicate(simple_df, preview=False)
        assert applied.is_preview is False

    def test_auto_detect_strategies(self, sample_df):
        """Without explicit strategies, duplicates are still found and removed."""
        outcome = deduplicate(sample_df)

        # Should find duplicates in the sample data
        assert len(outcome.match_groups) > 0
        assert len(outcome.deduplicated_df) < len(sample_df)

    def test_idempotent(self, sample_df):
        """Deduplicating already-deduplicated output finds nothing further."""
        first_pass = deduplicate(sample_df)
        second_pass = deduplicate(first_pass.deduplicated_df)

        # Second pass should find no new duplicates
        assert len(second_pass.match_groups) == 0
        assert len(second_pass.deduplicated_df) == len(first_pass.deduplicated_df)

    def test_review_callback(self):
        """The review callback's boolean result decides whether a match is applied."""
        frame = pd.DataFrame(
            {
                "email": ["a@test.com", "a@test.com", "b@test.com"],
            }
        )
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])

        # Reject all matches
        rejected = deduplicate(
            frame,
            strategies=[strategy],
            review_callback=lambda g, d: False,
        )
        assert len(rejected.deduplicated_df) == 3  # nothing removed

        # Accept all matches
        accepted = deduplicate(
            frame,
            strategies=[strategy],
            review_callback=lambda g, d: True,
        )
        assert len(accepted.deduplicated_df) == 2
|
||||
Reference in New Issue
Block a user