feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,47 @@
+"""Shared test fixtures."""
+
+import pandas as pd
+import pytest
+from pathlib import Path
+
+# ``samples`` directory located one level above the directory containing this file.
+SAMPLES_DIR = Path(__file__).parent.parent / "samples"
+
+
+@pytest.fixture
+def sample_csv_path():
+    return SAMPLES_DIR / "messy_sales.csv"
+
+
+@pytest.fixture
+def sample_df(sample_csv_path):
+    return pd.read_csv(sample_csv_path, dtype=str, keep_default_na=False)
+
+
+@pytest.fixture
+def simple_df():
+    """Small DataFrame with obvious duplicates for unit testing."""
+    return pd.DataFrame({
+        "name": ["Alice", "alice", "Bob", "Charlie", "ALICE"],
+        "email": ["alice@test.com", "alice@test.com", "bob@test.com",
+                  "charlie@test.com", "alice@test.com"],
+        "phone": ["555-1234", "555-1234", "555-5678", "555-9012", "555-1234"],
+    })
+
+
+@pytest.fixture
+def merge_df():
+    """DataFrame with partial records that benefit from merge."""
+    return pd.DataFrame({
+        "name": ["John Doe", "John Doe", "Jane Smith"],
+        "email": ["john@test.com", "john@test.com", "jane@test.com"],
+        "phone": ["555-1111", "", "555-3333"],
+        "address": ["", "123 Main St", "456 Oak Ave"],
+    })
+
+
+@pytest.fixture
+def tmp_csv(tmp_path, simple_df):
+    """Write simple_df to a temp CSV and return the path."""
+    path = tmp_path / "test_input.csv"
+    simple_df.to_csv(path, index=False)
+    return path
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -0,0 +1,147 @@
+"""Integration tests for the CLI via Typer's CliRunner."""
+
+import pytest
+from pathlib import Path
+from typer.testing import CliRunner
+
+from src.cli import app
+
+# Single module-level CliRunner; the tests in this file invoke ``app`` through it.
+runner = CliRunner()
+
+
+class TestCliPreview:
+    def test_preview_default(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv)])
+        assert result.exit_code == 0
+        assert "preview" in result.output.lower() or "Rows in" in result.output
+
+    def test_preview_shows_row_counts(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv)])
+        assert result.exit_code == 0
+        assert "Rows in" in result.output
+        assert "Rows out" in result.output
+
+    def test_file_not_found(self):
+        result = runner.invoke(app, ["/tmp/nonexistent_xyz_abc.csv"])
+        assert result.exit_code != 0
+        assert "not found" in result.output.lower()
+
+
+class TestCliApply:
+    def test_apply_writes_output(self, tmp_csv, tmp_path):
+        out = tmp_path / "output.csv"
+        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
+        assert result.exit_code == 0
+        assert out.exists()
+
+    def test_apply_default_output_name(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--apply"])
+        assert result.exit_code == 0
+        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
+        assert expected.exists()
+
+    def test_apply_creates_removed_file(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--apply"])
+        assert result.exit_code == 0
+        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
+        # May or may not exist depending on whether duplicates were found
+        # with default auto-detect on simple_df
+
+
+class TestCliFuzzy:
+    def test_fuzzy_flag(self, tmp_csv):
+        result = runner.invoke(app, [
+            str(tmp_csv), "--fuzzy", "name", "--threshold", "80",
+        ])
+        assert result.exit_code == 0
+
+    def test_subset_flag(self, tmp_csv):
+        result = runner.invoke(app, [
+            str(tmp_csv), "--subset", "email",
+        ])
+        assert result.exit_code == 0
+
+    def test_bad_column_error(self, tmp_csv):
+        result = runner.invoke(app, [
+            str(tmp_csv), "--subset", "nonexistent_column",
+        ])
+        assert result.exit_code != 0
+        assert "not found" in result.output.lower()
+
+
+class TestCliConfig:
+    def test_save_and_load_config(self, tmp_csv, tmp_path):
+        cfg_path = tmp_path / "my_config.json"
+        # Save
+        result = runner.invoke(app, [
+            str(tmp_csv), "--subset", "email", "--save-config", str(cfg_path),
+        ])
+        assert result.exit_code == 0
+        assert cfg_path.exists()
+
+        # Load and apply
+        result = runner.invoke(app, [
+            str(tmp_csv), "--config", str(cfg_path), "--apply",
+        ])
+        assert result.exit_code == 0
+
+
+class TestCliSurvivor:
+    def test_survivor_last(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--survivor", "last"])
+        assert result.exit_code == 0
+
+    def test_survivor_most_complete(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--survivor", "most-complete"])
+        assert result.exit_code == 0
+
+    def test_invalid_survivor(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--survivor", "bogus"])
+        assert result.exit_code != 0
+
+
+class TestCliMerge:
+    def test_merge_flag(self, tmp_csv):
+        result = runner.invoke(app, [str(tmp_csv), "--merge", "--apply"])
+        assert result.exit_code == 0
+
+
+class TestCliSampleData:
+    def test_sample_preview(self, sample_csv_path):
+        result = runner.invoke(app, [str(sample_csv_path)])
+        assert result.exit_code == 0
+        assert "Rows in:   50" in result.output
+        # Should find duplicates
+        assert "Removed:" in result.output
+
+    def test_sample_apply(self, sample_csv_path, tmp_path):
+        out = tmp_path / "deduped.csv"
+        result = runner.invoke(app, [
+            str(sample_csv_path), "--apply", "-o", str(out),
+        ])
+        assert result.exit_code == 0
+        assert out.exists()
+        import pandas as pd
+        df = pd.read_csv(out, encoding="utf-8-sig")
+        # Should have fewer than 50 rows
+        assert len(df) < 50
+
+    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
+        out = tmp_path / "fuzzy_merged.csv"
+        result = runner.invoke(app, [
+            str(sample_csv_path),
+            "--fuzzy", "customer_name",
+            "--threshold", "80",
+            "--merge",
+            "--apply",
+            "-o", str(out),
+        ])
+        assert result.exit_code == 0
+        assert out.exists()
+
+
+class TestCliHelp:
+    def test_help(self):
+        result = runner.invoke(app, ["--help"])
+        assert result.exit_code == 0
+        assert "--apply" in result.output
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -0,0 +1,102 @@
+"""Tests for src.core.config — save/load configuration profiles."""
+
+import json
+import pytest
+from pathlib import Path
+
+from src.core.config import (
+    DeduplicationConfig,
+    StrategyConfig,
+    ColumnStrategyConfig,
+)
+from src.core.dedup import Algorithm, SurvivorRule
+from src.core.normalizers import NormalizerType
+
+
+class TestDeduplicationConfig:
+    def test_default(self):
+        cfg = DeduplicationConfig.default()
+        assert cfg.survivor_rule == "first"
+        assert cfg.merge is False
+        assert cfg.strategies == []
+
+    def test_to_dict_roundtrip(self):
+        cfg = DeduplicationConfig(
+            strategies=[
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(
+                        column="email",
+                        algorithm="exact",
+                        threshold=100.0,
+                        normalizer="email",
+                    ),
+                ]),
+            ],
+            survivor_rule="most_complete",
+            merge=True,
+        )
+        d = cfg.to_dict()
+        cfg2 = DeduplicationConfig.from_dict(d)
+        assert cfg2.survivor_rule == "most_complete"
+        assert cfg2.merge is True
+        assert len(cfg2.strategies) == 1
+        assert cfg2.strategies[0].columns[0].column == "email"
+
+    def test_to_file_from_file(self, tmp_path):
+        cfg = DeduplicationConfig(
+            strategies=[
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(column="name", algorithm="jaro_winkler",
+                                       threshold=85.0, normalizer="name"),
+                ]),
+            ],
+            survivor_rule="last",
+        )
+        path = tmp_path / "test_config.json"
+        cfg.to_file(path)
+
+        loaded = DeduplicationConfig.from_file(path)
+        assert loaded.survivor_rule == "last"
+        assert len(loaded.strategies) == 1
+        assert loaded.strategies[0].columns[0].algorithm == "jaro_winkler"
+
+    def test_to_strategies(self):
+        cfg = DeduplicationConfig(
+            strategies=[
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(column="email", algorithm="exact",
+                                       threshold=100.0, normalizer="email"),
+                    ColumnStrategyConfig(column="phone", algorithm="exact",
+                                       threshold=100.0, normalizer="phone"),
+                ]),
+            ],
+        )
+        strats = cfg.to_strategies()
+        assert strats is not None
+        assert len(strats) == 1
+        assert len(strats[0].column_strategies) == 2
+        assert strats[0].column_strategies[0].algorithm == Algorithm.EXACT
+        assert strats[0].column_strategies[0].normalizer == NormalizerType.EMAIL
+
+    def test_to_strategies_empty(self):
+        cfg = DeduplicationConfig.default()
+        assert cfg.to_strategies() is None
+
+    def test_to_survivor_rule(self):
+        cfg = DeduplicationConfig(survivor_rule="most_complete")
+        assert cfg.to_survivor_rule() == SurvivorRule.KEEP_MOST_COMPLETE
+
+    def test_json_is_valid(self, tmp_path):
+        cfg = DeduplicationConfig(
+            strategies=[
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(column="x", algorithm="exact"),
+                ]),
+            ],
+            normalize_map={"email": "email"},
+        )
+        path = tmp_path / "valid.json"
+        cfg.to_file(path)
+        data = json.loads(path.read_text())
+        assert "strategies" in data
+        assert "normalize_map" in data
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -0,0 +1,258 @@
+"""Tests for src.core.dedup — matching engine."""
+
+import pandas as pd
+import pytest
+
+from src.core.dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    MatchStrategy,
+    SurvivorRule,
+    _compute_similarity,
+    _compare_pair,
+    _UnionFind,
+    build_default_strategies,
+    deduplicate,
+)
+from src.core.normalizers import NormalizerType
+
+
+class TestComputeSimilarity:
+    def test_exact_match(self):
+        assert _compute_similarity("hello", "hello", Algorithm.EXACT) == 100.0
+
+    def test_exact_mismatch(self):
+        assert _compute_similarity("hello", "world", Algorithm.EXACT) == 0.0
+
+    def test_levenshtein_similar(self):
+        score = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN)
+        assert 50 < score < 80
+
+    def test_jaro_winkler_similar(self):
+        score = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
+        assert score > 80
+
+    def test_token_set_ratio(self):
+        score = _compute_similarity(
+            "123 main street apt 4",
+            "apt 4 123 main street",
+            Algorithm.TOKEN_SET_RATIO,
+        )
+        assert score == 100.0
+
+
+class TestUnionFind:
+    def test_basic_union(self):
+        uf = _UnionFind(5)
+        uf.union(0, 1)
+        uf.union(1, 2)
+        assert uf.find(0) == uf.find(2)  # transitive
+
+    def test_separate_groups(self):
+        uf = _UnionFind(5)
+        uf.union(0, 1)
+        uf.union(3, 4)
+        assert uf.find(0) != uf.find(3)
+
+    def test_groups(self):
+        uf = _UnionFind(5)
+        uf.union(0, 1)
+        uf.union(1, 2)
+        uf.union(3, 4)
+        groups = uf.groups()
+        assert len(groups) == 2
+        sizes = sorted(len(v) for v in groups.values())
+        assert sizes == [2, 3]
+
+
+class TestComparePair:
+    def test_exact_match(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        row_a = pd.Series({"email": "test@example.com"})
+        row_b = pd.Series({"email": "test@example.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert is_match
+        assert conf == 100.0
+        assert cols == ["email"]
+
+    def test_exact_mismatch(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        row_a = pd.Series({"email": "a@test.com"})
+        row_b = pd.Series({"email": "b@test.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert not is_match
+
+    def test_fuzzy_match(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=80),
+        ])
+        row_a = pd.Series({"name": "john smith"})
+        row_b = pd.Series({"name": "jon smith"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert is_match
+        assert conf > 80
+
+    def test_and_logic_both_must_match(self):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        # name matches, email doesn't
+        row_a = pd.Series({"name": "alice", "email": "a@test.com"})
+        row_b = pd.Series({"name": "alice", "email": "b@test.com"})
+        is_match, conf, cols = _compare_pair(row_a, row_b, strategy)
+        assert not is_match
+
+
+class TestBuildDefaultStrategies:
+    def test_detects_email(self):
+        df = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
+        strats = build_default_strategies(df)
+        # email (strong, standalone) + name AND email (weak paired with strong) = 2
+        assert len(strats) == 2
+        found_email = any(
+            cs.column == "email" and cs.normalizer == NormalizerType.EMAIL
+            for s in strats for cs in s.column_strategies
+        )
+        assert found_email
+        # Name should only appear paired with email, not standalone
+        name_strats = [s for s in strats
+                       if any(cs.column == "name" for cs in s.column_strategies)]
+        for s in name_strats:
+            assert len(s.column_strategies) >= 2, "Name should be paired with a strong key"
+
+    def test_fallback_all_columns(self):
+        df = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
+        strats = build_default_strategies(df)
+        assert len(strats) == 1
+        assert len(strats[0].column_strategies) == 3
+
+
+class TestDeduplicate:
+    def test_exact_duplicates(self, simple_df):
+        # Alice appears 3 times with same email
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy])
+        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
+        assert len(result.deduplicated_df) == 3
+        assert result.original_row_count == 5
+        assert len(result.match_groups) == 1
+
+    def test_fuzzy_name_match(self):
+        df = pd.DataFrame({
+            "name": ["John Smith", "Jon Smith", "Jane Doe"],
+            "email": ["a@test.com", "b@test.com", "c@test.com"],
+        })
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(
+                column="name",
+                algorithm=Algorithm.JARO_WINKLER,
+                threshold=85,
+                normalizer=NormalizerType.NAME,
+            ),
+        ])
+        result = deduplicate(df, strategies=[strategy])
+        assert len(result.deduplicated_df) == 2
+        assert len(result.match_groups) == 1
+
+    def test_survivor_keep_last(self, simple_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy],
+                            survivor_rule=SurvivorRule.KEEP_LAST)
+        # The last Alice (index 4) should survive
+        assert len(result.match_groups) == 1
+        assert result.match_groups[0].survivor_index == 4
+
+    def test_survivor_most_complete(self, merge_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(merge_df, strategies=[strategy],
+                            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE)
+        # Row 0 has phone but no address (1 empty)
+        # Row 1 has address but no phone (1 empty)
+        # Both have 1 empty, so keep_first among ties
+        assert len(result.deduplicated_df) == 2
+
+    def test_merge_mode(self, merge_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(merge_df, strategies=[strategy], merge=True)
+        # Survivor should have both phone and address filled
+        john_row = result.deduplicated_df[
+            result.deduplicated_df["name"] == "John Doe"
+        ].iloc[0]
+        assert john_row["phone"] == "555-1111"
+        assert john_row["address"] == "123 Main St"
+
+    def test_multi_strategy_or(self):
+        df = pd.DataFrame({
+            "name": ["Alice", "Bob", "Alice B."],
+            "email": ["a@test.com", "a@test.com", "c@test.com"],
+        })
+        # Strategy 1: match on email
+        # Strategy 2: match on name (fuzzy)
+        strat1 = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        strat2 = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="name", algorithm=Algorithm.JARO_WINKLER, threshold=70),
+        ])
+        result = deduplicate(df, strategies=[strat1, strat2])
+        # All three should end up in one group via transitive closure:
+        # Alice~Bob (email), Alice~Alice B. (name)
+        assert len(result.deduplicated_df) == 1
+
+    def test_confidence_score(self, simple_df):
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        result = deduplicate(simple_df, strategies=[strategy])
+        for group in result.match_groups:
+            assert 0 <= group.confidence <= 100
+
+    def test_preview_flag(self, simple_df):
+        result = deduplicate(simple_df, preview=True)
+        assert result.is_preview is True
+        result2 = deduplicate(simple_df, preview=False)
+        assert result2.is_preview is False
+
+    def test_auto_detect_strategies(self, sample_df):
+        result = deduplicate(sample_df)
+        # Should find duplicates in the sample data
+        assert len(result.match_groups) > 0
+        assert len(result.deduplicated_df) < len(sample_df)
+
+    def test_idempotent(self, sample_df):
+        """Running dedup twice with same config produces same output."""
+        result1 = deduplicate(sample_df)
+        result2 = deduplicate(result1.deduplicated_df)
+        # Second pass should find no new duplicates
+        assert len(result2.match_groups) == 0
+        assert len(result2.deduplicated_df) == len(result1.deduplicated_df)
+
+    def test_review_callback(self):
+        df = pd.DataFrame({
+            "email": ["a@test.com", "a@test.com", "b@test.com"],
+        })
+        strategy = MatchStrategy(column_strategies=[
+            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
+        ])
+        # Reject all matches
+        result = deduplicate(df, strategies=[strategy],
+                            review_callback=lambda g, d: False)
+        assert len(result.deduplicated_df) == 3  # nothing removed
+
+        # Accept all matches
+        result = deduplicate(df, strategies=[strategy],
+                            review_callback=lambda g, d: True)
+        assert len(result.deduplicated_df) == 2
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -0,0 +1,130 @@
+"""Tests for src.core.io — file reading, encoding/delimiter detection."""
+
+import pandas as pd
+import pytest
+from pathlib import Path
+
+from src.core.io import (
+    detect_encoding,
+    detect_delimiter,
+    detect_header_row,
+    read_file,
+    write_file,
+    list_sheets,
+)
+
+
+class TestDetectEncoding:
+    def test_utf8_file(self, sample_csv_path):
+        enc = detect_encoding(sample_csv_path)
+        assert enc.lower().replace("-", "") in ("utf8", "ascii", "utf8sig")
+
+    def test_empty_file(self, tmp_path):
+        f = tmp_path / "empty.csv"
+        f.write_bytes(b"")
+        assert detect_encoding(f) == "utf-8"
+
+    def test_bom_file(self, tmp_path):
+        f = tmp_path / "bom.csv"
+        f.write_bytes(b"\xef\xbb\xbfname,email\nAlice,a@b.com\n")
+        assert detect_encoding(f) == "utf-8-sig"
+
+    def test_latin1_file(self, tmp_path):
+        f = tmp_path / "latin.csv"
+        content = "name,city\nJosé,São Paulo\n".encode("latin-1")
+        f.write_bytes(content)
+        enc = detect_encoding(f)
+        # Should detect something compatible with latin-1 family
+        assert enc in ("iso-8859-1", "latin-1", "windows-1252", "cp1252",
+                       "iso-8859-9", "cp1250", "iso-8859-15", "utf-8")
+
+
+class TestDetectDelimiter:
+    def test_comma(self, sample_csv_path):
+        assert detect_delimiter(sample_csv_path) == ","
+
+    def test_tab(self, tmp_path):
+        f = tmp_path / "tabs.tsv"
+        f.write_text("name\temail\nAlice\ta@b.com\n")
+        assert detect_delimiter(f) == "\t"
+
+    def test_semicolon(self, tmp_path):
+        f = tmp_path / "semi.csv"
+        f.write_text("name;email;phone\nAlice;a@b.com;555\n")
+        assert detect_delimiter(f) == ";"
+
+    def test_pipe(self, tmp_path):
+        f = tmp_path / "pipe.csv"
+        f.write_text("name|email|phone\nAlice|a@b.com|555\n")
+        assert detect_delimiter(f) == "|"
+
+
+class TestDetectHeaderRow:
+    def test_standard_csv(self, sample_csv_path):
+        assert detect_header_row(sample_csv_path) == 0
+
+    def test_with_junk_rows(self, tmp_path):
+        f = tmp_path / "junk.csv"
+        f.write_text("Report generated 2024-01-01\n\nname,email,phone\nAlice,a@b.com,555\n")
+        # Row 0 has "Report generated..." which is a single non-numeric string
+        # Row 2 has "name,email,phone" which looks like headers
+        # The heuristic checks all cells, so row 0 may match if it's a single cell
+        hdr = detect_header_row(f)
+        assert hdr in (0, 2)  # depends on delimiter detection
+
+
+class TestReadFile:
+    def test_read_csv(self, sample_csv_path):
+        df = read_file(sample_csv_path)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 50
+        assert "customer_name" in df.columns
+
+    def test_read_nonexistent(self):
+        with pytest.raises(FileNotFoundError):
+            read_file("/tmp/nonexistent_file_xyz.csv")
+
+    def test_read_with_encoding_override(self, sample_csv_path):
+        df = read_file(sample_csv_path, encoding="utf-8")
+        assert len(df) == 50
+
+    def test_chunked_reading(self, sample_csv_path):
+        chunks = read_file(sample_csv_path, chunk_size=10)
+        # Should be a generator
+        all_chunks = list(chunks)
+        assert len(all_chunks) == 5
+        total_rows = sum(len(c) for c in all_chunks)
+        assert total_rows == 50
+
+
+class TestWriteFile:
+    def test_write_csv(self, tmp_path, simple_df):
+        out = tmp_path / "output.csv"
+        write_file(simple_df, out)
+        assert out.exists()
+        # Read back
+        df = pd.read_csv(out, encoding="utf-8-sig")
+        assert len(df) == len(simple_df)
+
+    def test_write_xlsx(self, tmp_path, simple_df):
+        out = tmp_path / "output.xlsx"
+        write_file(simple_df, out)
+        assert out.exists()
+        df = pd.read_excel(out)
+        assert len(df) == len(simple_df)
+
+    def test_utf8_bom_default(self, tmp_path, simple_df):
+        out = tmp_path / "bom.csv"
+        write_file(simple_df, out)
+        raw = out.read_bytes()
+        assert raw[:3] == b"\xef\xbb\xbf"
+
+
+class TestListSheets:
+    def test_list_sheets(self, tmp_path, simple_df):
+        path = tmp_path / "multi.xlsx"
+        with pd.ExcelWriter(path, engine="openpyxl") as writer:
+            simple_df.to_excel(writer, sheet_name="Sheet1", index=False)
+            simple_df.to_excel(writer, sheet_name="Sheet2", index=False)
+        sheets = list_sheets(path)
+        assert sheets == ["Sheet1", "Sheet2"]
--- a/tests/test_normalizers.py
+++ b/tests/test_normalizers.py
@@ -0,0 +1,158 @@
+"""Tests for src.core.normalizers."""
+
+import pytest
+from src.core.normalizers import (
+    NormalizerType,
+    get_normalizer,
+    normalize_email,
+    normalize_phone,
+    normalize_name,
+    normalize_address,
+    normalize_string,
+)
+
+
+class TestNormalizeEmail:
+    def test_basic_lowercase(self):
+        assert normalize_email("John@Example.COM") == "john@example.com"
+
+    def test_strip_whitespace(self):
+        assert normalize_email("  alice@test.com  ") == "alice@test.com"
+
+    def test_strip_gmail_dots(self):
+        assert normalize_email("j.o.h.n@gmail.com") == "john@gmail.com"
+
+    def test_strip_plus_tag(self):
+        assert normalize_email("alice+promo@test.com") == "alice@test.com"
+
+    def test_gmail_dots_and_plus(self):
+        assert normalize_email("j.smith+tag@gmail.com") == "jsmith@gmail.com"
+
+    def test_non_gmail_keeps_dots(self):
+        assert normalize_email("j.smith@company.com") == "j.smith@company.com"
+
+    def test_empty(self):
+        assert normalize_email("") == ""
+        assert normalize_email(None) == ""
+
+    def test_no_at_sign(self):
+        assert normalize_email("not-an-email") == "not-an-email"
+
+    def test_idempotent(self):
+        result = normalize_email("J.Smith+tag@Gmail.com")
+        assert normalize_email(result) == result
+
+
+class TestNormalizePhone:
+    def test_us_formatted(self):
+        assert normalize_phone("(555) 123-4567") == "+15551234567"
+
+    def test_dashes(self):
+        assert normalize_phone("555-123-4567") == "+15551234567"
+
+    def test_dots(self):
+        assert normalize_phone("555.123.4567") == "+15551234567"
+
+    def test_with_country_code(self):
+        assert normalize_phone("+1 555-123-4567") == "+15551234567"
+
+    def test_digits_only_input(self):
+        assert normalize_phone("5551234567") == "+15551234567"
+
+    def test_empty(self):
+        assert normalize_phone("") == ""
+        assert normalize_phone(None) == ""
+
+    def test_invalid_fallback_digits(self):
+        # Very short number that phonenumbers rejects
+        result = normalize_phone("123")
+        assert result == "123"
+
+    def test_idempotent(self):
+        result = normalize_phone("(555) 123-4567")
+        assert normalize_phone(result) == result
+
+
+class TestNormalizeName:
+    def test_strip_mr(self):
+        assert normalize_name("Mr. John Smith") == "john smith"
+
+    def test_strip_dr(self):
+        assert normalize_name("Dr. Jane Doe") == "jane doe"
+
+    def test_strip_suffix(self):
+        assert normalize_name("Robert Brown Jr.") == "robert brown"
+
+    def test_strip_numeral_suffix(self):
+        assert normalize_name("James Wilson III") == "james wilson"
+
+    def test_title_and_suffix(self):
+        assert normalize_name("Dr. Michael Williams III") == "michael williams"
+
+    def test_collapse_whitespace(self):
+        assert normalize_name("  John   Smith  ") == "john smith"
+
+    def test_case_fold(self):
+        assert normalize_name("JOHN SMITH") == "john smith"
+
+    def test_empty(self):
+        assert normalize_name("") == ""
+        assert normalize_name(None) == ""
+
+    def test_idempotent(self):
+        result = normalize_name("Mr. John Smith Jr.")
+        assert normalize_name(result) == result
+
+
+class TestNormalizeAddress:
+    def test_street_abbreviation(self):
+        assert normalize_address("123 Main Street") == "123 main st"
+
+    def test_avenue_abbreviation(self):
+        assert normalize_address("456 Oak Avenue") == "456 oak ave"
+
+    def test_boulevard_abbreviation(self):
+        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"
+
+    def test_apartment(self):
+        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"
+
+    def test_direction(self):
+        assert normalize_address("111 First Street North") == "111 first st n"
+
+    def test_collapse_whitespace(self):
+        assert normalize_address("  123   Main   Street  ") == "123 main st"
+
+    def test_empty(self):
+        assert normalize_address("") == ""
+        assert normalize_address(None) == ""
+
+    def test_idempotent(self):
+        result = normalize_address("123 Main Street Apartment 4")
+        assert normalize_address(result) == result
+
+
+class TestNormalizeString:
+    def test_trim_and_casefold(self):
+        assert normalize_string("  Hello World  ") == "hello world"
+
+    def test_collapse_whitespace(self):
+        assert normalize_string("a   b   c") == "a b c"
+
+    def test_empty(self):
+        assert normalize_string("") == ""
+        assert normalize_string(None) == ""
+
+
+class TestGetNormalizer:
+    def test_get_by_enum(self):
+        fn = get_normalizer(NormalizerType.EMAIL)
+        assert fn("TEST@Gmail.com") == "test@gmail.com"
+
+    def test_get_by_string(self):
+        fn = get_normalizer("phone")
+        assert fn("(555) 123-4567") == "+15551234567"
+
+    def test_unknown_raises(self):
+        with pytest.raises(ValueError):
+            get_normalizer("unknown_type")