"""Tests for src.core.dedup — matching engine.""" import pandas as pd import pytest from src.core.dedup import ( Algorithm, ColumnMatchStrategy, MatchStrategy, SurvivorRule, _compute_similarity, _compare_pair, _UnionFind, build_default_strategies, deduplicate, ) from src.core.normalizers import NormalizerType class TestComputeSimilarity: def test_exact_match(self): assert _compute_similarity("hello", "hello", Algorithm.EXACT) == 100.0 def test_exact_mismatch(self): assert _compute_similarity("hello", "world", Algorithm.EXACT) == 0.0 def test_levenshtein_similar(self): score = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN) assert 50 < score < 80 def test_jaro_winkler_similar(self): score = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER) assert score > 80 def test_token_set_ratio(self): score = _compute_similarity( "123 main street apt 4", "apt 4 123 main street", Algorithm.TOKEN_SET_RATIO, ) assert score == 100.0 class TestUnionFind: def test_basic_union(self): uf = _UnionFind(5) uf.union(0, 1) uf.union(1, 2) assert uf.find(0) == uf.find(2) # transitive def test_separate_groups(self): uf = _UnionFind(5) uf.union(0, 1) uf.union(3, 4) assert uf.find(0) != uf.find(3) def test_groups(self): uf = _UnionFind(5) uf.union(0, 1) uf.union(1, 2) uf.union(3, 4) groups = uf.groups() assert len(groups) == 2 sizes = sorted(len(v) for v in groups.values()) assert sizes == [2, 3] class TestComparePair: def test_exact_match(self): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) row_a = pd.Series({"email": "test@example.com"}) row_b = pd.Series({"email": "test@example.com"}) is_match, conf, cols = _compare_pair(row_a, row_b, strategy) assert is_match assert conf == 100.0 assert cols == ["email"] def test_exact_mismatch(self): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) row_a = pd.Series({"email": "a@test.com"}) row_b = pd.Series({"email": "b@test.com"}) is_match, conf, cols = _compare_pair(row_a, row_b, strategy) assert not is_match def test_fuzzy_match(self): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=80), ]) row_a = pd.Series({"name": "john smith"}) row_b = pd.Series({"name": "jon smith"}) is_match, conf, cols = _compare_pair(row_a, row_b, strategy) assert is_match assert conf > 80 def test_and_logic_both_must_match(self): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT), ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) # name matches, email doesn't row_a = pd.Series({"name": "alice", "email": "a@test.com"}) row_b = pd.Series({"name": "alice", "email": "b@test.com"}) is_match, conf, cols = _compare_pair(row_a, row_b, strategy) assert not is_match class TestBuildDefaultStrategies: def test_detects_email(self): df = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]}) strats = build_default_strategies(df) # email (strong, standalone) + name AND email (weak paired with strong) = 2 assert len(strats) == 2 found_email = any( cs.column == "email" and cs.normalizer == NormalizerType.EMAIL for s in strats for cs in s.column_strategies ) assert found_email # Name should only appear paired with email, not standalone name_strats = [s for s in strats if any(cs.column == "name" for cs in s.column_strategies)] for s in name_strats: assert len(s.column_strategies) >= 2, "Name should be paired with a strong key" def test_fallback_all_columns(self): df = pd.DataFrame({"x": [1], "y": [2], "z": [3]}) strats = build_default_strategies(df) assert len(strats) == 1 assert len(strats[0].column_strategies) == 3 class TestDeduplicate: def test_exact_duplicates(self, simple_df): # Alice appears 3 times with same email strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) result = deduplicate(simple_df, strategies=[strategy]) # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows assert len(result.deduplicated_df) == 3 assert result.original_row_count == 5 assert len(result.match_groups) == 1 def test_fuzzy_name_match(self): df = pd.DataFrame({ "name": ["John Smith", "Jon Smith", "Jane Doe"], "email": ["a@test.com", "b@test.com", "c@test.com"], }) strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy( column="name", algorithm=Algorithm.JARO_WINKLER, threshold=85, normalizer=NormalizerType.NAME, ), ]) result = deduplicate(df, strategies=[strategy]) assert len(result.deduplicated_df) == 2 assert len(result.match_groups) == 1 def test_survivor_keep_last(self, simple_df): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) result = deduplicate(simple_df, strategies=[strategy], survivor_rule=SurvivorRule.KEEP_LAST) # The last Alice (index 4) should survive assert len(result.match_groups) == 1 assert result.match_groups[0].survivor_index == 4 def test_survivor_most_complete(self, merge_df): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) result = deduplicate(merge_df, strategies=[strategy], survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE) # Row 0 has phone but no address (1 empty) # Row 1 has address but no phone (1 empty) # Both have 1 empty, so keep_first among ties assert len(result.deduplicated_df) == 2 def test_merge_mode(self, merge_df): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) result = deduplicate(merge_df, strategies=[strategy], merge=True) # Survivor should have both phone and address filled john_row = result.deduplicated_df[ result.deduplicated_df["name"] == "John Doe" ].iloc[0] assert john_row["phone"] == "555-1111" assert john_row["address"] == "123 Main St" def test_multi_strategy_or(self): df = pd.DataFrame({ "name": ["Alice", "Bob", "Alice B."], "email": ["a@test.com", "a@test.com", "c@test.com"], }) # Strategy 1: match on email # Strategy 2: match on name (fuzzy) strat1 = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) strat2 = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=70), ]) result = deduplicate(df, strategies=[strat1, strat2]) # All three should end up in one group via transitive closure: # Alice~Bob (email), Alice~Alice B. (name) assert len(result.deduplicated_df) == 1 def test_confidence_score(self, simple_df): strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) result = deduplicate(simple_df, strategies=[strategy]) for group in result.match_groups: assert 0 <= group.confidence <= 100 def test_preview_flag(self, simple_df): result = deduplicate(simple_df, preview=True) assert result.is_preview is True result2 = deduplicate(simple_df, preview=False) assert result2.is_preview is False def test_auto_detect_strategies(self, sample_df): result = deduplicate(sample_df) # Should find duplicates in the sample data assert len(result.match_groups) > 0 assert len(result.deduplicated_df) < len(sample_df) def test_idempotent(self, sample_df): """Running dedup twice with same config produces same output.""" result1 = deduplicate(sample_df) result2 = deduplicate(result1.deduplicated_df) # Second pass should find no new duplicates assert len(result2.match_groups) == 0 assert len(result2.deduplicated_df) == len(result1.deduplicated_df) def test_review_callback(self): df = pd.DataFrame({ "email": ["a@test.com", "a@test.com", "b@test.com"], }) strategy = MatchStrategy(column_strategies=[ ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT), ]) # Reject all matches result = deduplicate(df, strategies=[strategy], review_callback=lambda g, d: False) assert len(result.deduplicated_df) == 3 # nothing removed # Accept all matches result = deduplicate(df, strategies=[strategy], review_callback=lambda g, d: True) assert len(result.deduplicated_df) == 2