feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -0,0 +1,258 @@
+"""Tests for src.core.dedup — matching engine."""
+
+import pandas as pd
+import pytest
+
+from src.core.dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    MatchStrategy,
+    SurvivorRule,
+    _compute_similarity,
+    _compare_pair,
+    _UnionFind,
+    build_default_strategies,
+    deduplicate,
+)
+from src.core.normalizers import NormalizerType
+
+
+class TestComputeSimilarity:  # one check per Algorithm member passed to _compute_similarity
+    def test_exact_match(self):
+        assert _compute_similarity("hello", "hello", Algorithm.EXACT) == 100.0  # identical strings score 100.0
+
+    def test_exact_mismatch(self):
+        assert _compute_similarity("hello", "world", Algorithm.EXACT) == 0.0  # differing strings score 0.0 under EXACT
+
+    def test_levenshtein_similar(self):
+        score = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN)
+        assert 50 < score < 80  # asserts a band, not a single value, so the exact ratio formula is not pinned
+
+    def test_jaro_winkler_similar(self):
+        score = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
+        assert score > 80  # lower bound only
+
+    def test_token_set_ratio(self):
+        score = _compute_similarity(
+            "123 main street apt 4",
+            "apt 4 123 main street",  # same tokens as the first argument, in a different order
+            Algorithm.TOKEN_SET_RATIO,
+        )
+        assert score == 100.0  # reordering the tokens must not lower the score
+
+
+class TestUnionFind:  # exercises _UnionFind.union / .find / .groups
+    def test_basic_union(self):
+        uf = _UnionFind(5)  # presumably 5 elements indexed 0..4 - only indices 0-4 are used in these tests
+        uf.union(0, 1)
+        uf.union(1, 2)
+        assert uf.find(0) == uf.find(2)  # transitive
+
+    def test_separate_groups(self):
+        uf = _UnionFind(5)
+        uf.union(0, 1)
+        uf.union(3, 4)
+        assert uf.find(0) != uf.find(3)  # never unioned with each other, so the representatives must differ
+
+    def test_groups(self):
+        uf = _UnionFind(5)
+        uf.union(0, 1)
+        uf.union(1, 2)
+        uf.union(3, 4)  # all five indices are now covered: {0, 1, 2} and {3, 4}
+        groups = uf.groups()  # used below as a mapping whose values are sized collections
+        assert len(groups) == 2
+        sizes = sorted(len(v) for v in groups.values())  # sorted so the check ignores group order
+        assert sizes == [2, 3]
+
+
+class TestComparePair:  # _compare_pair(row_a, row_b, strategy) is unpacked below as (is_match, conf, cols)
+    def test_exact_match(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        row_a = pd.Series({"email": "test@example.com"})
+        row_b = pd.Series({"email": "test@example.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert is_match
+        assert conf == 100.0
+        assert cols == ["email"]  # third element lists the column names that matched
+
+    def test_exact_mismatch(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        row_a = pd.Series({"email": "a@test.com"})
+        row_b = pd.Series({"email": "b@test.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)  # conf and cols are unpacked but not asserted
+        assert not is_match
+
+    def test_fuzzy_match(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=80),
+        ])
+        row_a = pd.Series({"name": "john smith"})
+        row_b = pd.Series({"name": "jon smith"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert is_match
+        assert conf > 80  # same number as the threshold passed to the column strategy above
+
+    def test_and_logic_both_must_match(self):
+        strategy = MatchStrategy(column_strategies=[  # two column strategies inside one MatchStrategy
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        # name matches, email doesn't
+        row_a = pd.Series({"name": "alice", "email": "a@test.com"})
+        row_b = pd.Series({"name": "alice", "email": "b@test.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)  # conf and cols are unpacked but not asserted
+        assert not is_match  # one failing column is enough to reject the pair
+
+
+class TestBuildDefaultStrategies:  # strategies that build_default_strategies derives from a DataFrame
+    def test_detects_email(self):
+        df = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
+        strats = build_default_strategies(df)
+        # email (strong, standalone) + name AND email (weak paired with strong) = 2
+        assert len(strats) == 2
+        found_email = any(
+            cs.column == "email" and cs.normalizer == NormalizerType.EMAIL
+            for s in strats for cs in s.column_strategies  # flattens every column strategy of every strategy
+        )
+        assert found_email
+        # Name should only appear paired with email, not standalone
+        name_strats = [s for s in strats
+                       if any(cs.column == "name" for cs in s.column_strategies)]
+        for s in name_strats:
+            assert len(s.column_strategies) >= 2, "Name should be paired with a strong key"
+
+    def test_fallback_all_columns(self):
+        df = pd.DataFrame({"x": [1], "y": [2], "z": [3]})  # column names x, y, z and integer values
+        strats = build_default_strategies(df)
+        assert len(strats) == 1  # a single strategy is expected
+        assert len(strats[0].column_strategies) == 3  # one column strategy per DataFrame column
+
+
+class TestDeduplicate:  # deduplicate() end to end; simple_df / merge_df / sample_df are fixtures defined outside this file
+    def test_exact_duplicates(self, simple_df):
+        # Alice appears 3 times with same email
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy])
+        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
+        assert len(result.deduplicated_df) == 3
+        assert result.original_row_count == 5
+        assert len(result.match_groups) == 1  # only the Alice rows form a group
+
+    def test_fuzzy_name_match(self):
+        df = pd.DataFrame({
+            "name": ["John Smith", "Jon Smith", "Jane Doe"],
+            "email": ["a@test.com", "b@test.com", "c@test.com"],  # all distinct, so only the name column can match
+        })
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(
+                column="name",
+                algorithm=Algorithm.JARO_WINKLER,
+                threshold=85,
+                normalizer=NormalizerType.NAME,
+            ),
+        ])
+        result = deduplicate(df, strategies=[strategy])
+        assert len(result.deduplicated_df) == 2  # John/Jon collapse into one row, Jane Doe stays
+        assert len(result.match_groups) == 1
+
+    def test_survivor_keep_last(self, simple_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy],
+                            survivor_rule=SurvivorRule.KEEP_LAST)
+        # The last Alice (index 4) should survive
+        assert len(result.match_groups) == 1
+        assert result.match_groups[0].survivor_index == 4  # depends on the row order of the simple_df fixture
+
+    def test_survivor_most_complete(self, merge_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(merge_df, strategies=[strategy],
+                            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE)
+        # Row 0 has phone but no address (1 empty)
+        # Row 1 has address but no phone (1 empty)
+        # Both have 1 empty, so keep_first among ties
+        assert len(result.deduplicated_df) == 2  # NOTE(review): only the row count is checked; the tie-break survivor is not asserted
+
+    def test_merge_mode(self, merge_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(merge_df, strategies=[strategy], merge=True)
+        # Survivor should have both phone and address filled
+        john_row = result.deduplicated_df[
+            result.deduplicated_df["name"] == "John Doe"
+        ].iloc[0]  # first row named "John Doe"; raises IndexError if no such row survives
+        assert john_row["phone"] == "555-1111"
+        assert john_row["address"] == "123 Main St"
+
+    def test_multi_strategy_or(self):
+        df = pd.DataFrame({
+            "name": ["Alice", "Bob", "Alice B."],
+            "email": ["a@test.com", "a@test.com", "c@test.com"],
+        })
+        # Strategy 1: match on email
+        # Strategy 2: match on name (fuzzy)
+        strat1 = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        strat2 = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=70),
+        ])
+        result = deduplicate(df, strategies=[strat1, strat2])  # a pair only has to satisfy one of the strategies
+        # All three should end up in one group via transitive closure:
+        # Alice~Bob (email), Alice~Alice B. (name)
+        assert len(result.deduplicated_df) == 1
+
+    def test_confidence_score(self, simple_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy])
+        for group in result.match_groups:  # passes vacuously if match_groups is empty
+            assert 0 <= group.confidence <= 100
+
+    def test_preview_flag(self, simple_df):
+        result = deduplicate(simple_df, preview=True)  # no strategies argument is passed in this test
+        assert result.is_preview is True
+        result2 = deduplicate(simple_df, preview=False)
+        assert result2.is_preview is False
+
+    def test_auto_detect_strategies(self, sample_df):
+        result = deduplicate(sample_df)  # no strategies argument is passed
+        # Should find duplicates in the sample data
+        assert len(result.match_groups) > 0
+        assert len(result.deduplicated_df) < len(sample_df)
+
+    def test_idempotent(self, sample_df):  # NOTE(review): the second call runs on the first call's output, not on sample_df again
+        """Running dedup twice with same config produces same output."""
+        result1 = deduplicate(sample_df)
+        result2 = deduplicate(result1.deduplicated_df)  # second pass over the already-deduplicated frame
+        # Second pass should find no new duplicates
+        assert len(result2.match_groups) == 0
+        assert len(result2.deduplicated_df) == len(result1.deduplicated_df)  # row counts only; cell contents are not compared
+
+    def test_review_callback(self):
+        df = pd.DataFrame({
+            "email": ["a@test.com", "a@test.com", "b@test.com"],  # rows 0 and 1 share an email
+        })
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        # Reject all matches
+        result = deduplicate(df, strategies=[strategy],
+                            review_callback=lambda g, d: False)  # callback takes two positional args; presumably (group, dataframe) - TODO confirm
+        assert len(result.deduplicated_df) == 3  # nothing removed
+
+        # Accept all matches
+        result = deduplicate(df, strategies=[strategy],
+                            review_callback=lambda g, d: True)
+        assert len(result.deduplicated_df) == 2  # the two a@test.com rows collapse into one