feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -0,0 +1,147 @@
+"""Integration tests for the CLI via Typer's CliRunner."""
+
+import pytest
+from pathlib import Path
+from typer.testing import CliRunner
+
+from src.cli import app
+
+# Shared CliRunner instance; every test in this module calls runner.invoke(app, [...]).
+runner = CliRunner()
+
+
+class TestCliPreview:
+    """Invocations without ``--apply``: exit status and printed summary."""
+
+    def test_preview_default(self, tmp_csv):
+        """A bare run on the CSV exits 0 and prints preview-style output."""
+        outcome = runner.invoke(app, [str(tmp_csv)])
+        assert outcome.exit_code == 0
+        text = outcome.output
+        assert "Rows in" in text or "preview" in text.lower()
+
+    def test_preview_shows_row_counts(self, tmp_csv):
+        """The output carries both the input and output row-count labels."""
+        outcome = runner.invoke(app, [str(tmp_csv)])
+        assert outcome.exit_code == 0
+        for label in ("Rows in", "Rows out"):
+            assert label in outcome.output
+
+    def test_file_not_found(self):
+        """A path that does not exist yields a non-zero exit and a 'not found' message."""
+        missing_path = "/tmp/nonexistent_xyz_abc.csv"
+        outcome = runner.invoke(app, [missing_path])
+        assert outcome.exit_code != 0
+        assert "not found" in outcome.output.lower()
+
+
+class TestCliApply:
+    """Runs with ``--apply``: exit status and the files left on disk."""
+
+    def test_apply_writes_output(self, tmp_csv, tmp_path):
+        """``--apply -o PATH`` exits 0 and leaves a file at PATH."""
+        out = tmp_path / "output.csv"
+        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
+        assert result.exit_code == 0
+        assert out.exists()
+
+    def test_apply_default_output_name(self, tmp_csv):
+        """Without ``-o``, ``<stem>_deduplicated.csv`` appears next to the input."""
+        result = runner.invoke(app, [str(tmp_csv), "--apply"])
+        assert result.exit_code == 0
+        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
+        assert expected.exists()
+
+    def test_apply_creates_removed_file(self, tmp_csv):
+        """``--apply`` exits 0; a ``<stem>_removed.csv`` file, if written, has content."""
+        result = runner.invoke(app, [str(tmp_csv), "--apply"])
+        assert result.exit_code == 0
+        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
+        # The file may or may not exist depending on whether duplicates were
+        # found with default auto-detect on simple_df, so existence is not
+        # required. When it is present it must be a real, non-empty file.
+        if removed.exists():
+            assert removed.is_file()
+            assert removed.stat().st_size > 0
+
+
+class TestCliFuzzy:
+    """Column-selection flags: ``--fuzzy``/``--threshold`` and ``--subset``."""
+
+    def test_fuzzy_flag(self, tmp_csv):
+        """``--fuzzy name --threshold 80`` is accepted and exits 0."""
+        argv = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+
+    def test_subset_flag(self, tmp_csv):
+        """``--subset email`` is accepted and exits 0."""
+        argv = [str(tmp_csv), "--subset", "email"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+
+    def test_bad_column_error(self, tmp_csv):
+        """``--subset`` with an unknown column fails with a 'not found' message."""
+        argv = [str(tmp_csv), "--subset", "nonexistent_column"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code != 0
+        assert "not found" in outcome.output.lower()
+
+
+class TestCliConfig:
+    """Round trip through ``--save-config`` and ``--config``."""
+
+    def test_save_and_load_config(self, tmp_csv, tmp_path):
+        """Saving a config writes the file; a later run loading it with ``--apply`` exits 0."""
+        cfg_path = tmp_path / "my_config.json"
+        csv_arg = str(tmp_csv)
+        cfg_arg = str(cfg_path)
+
+        # Step 1: run with --save-config and confirm the file was written.
+        saved = runner.invoke(
+            app, [csv_arg, "--subset", "email", "--save-config", cfg_arg]
+        )
+        assert saved.exit_code == 0
+        assert cfg_path.exists()
+
+        # Step 2: feed the saved file back through --config together with --apply.
+        loaded = runner.invoke(app, [csv_arg, "--config", cfg_arg, "--apply"])
+        assert loaded.exit_code == 0
+
+
+class TestCliSurvivor:
+    """Accepted and rejected values for the ``--survivor`` option."""
+
+    def test_survivor_last(self, tmp_csv):
+        """``--survivor last`` exits 0."""
+        argv = [str(tmp_csv), "--survivor", "last"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+
+    def test_survivor_most_complete(self, tmp_csv):
+        """``--survivor most-complete`` exits 0."""
+        argv = [str(tmp_csv), "--survivor", "most-complete"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+
+    def test_invalid_survivor(self, tmp_csv):
+        """An unrecognised ``--survivor`` value produces a non-zero exit code."""
+        argv = [str(tmp_csv), "--survivor", "bogus"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code != 0
+
+
+class TestCliMerge:
+    """Smoke test for the ``--merge`` flag."""
+
+    def test_merge_flag(self, tmp_csv):
+        """``--merge`` combined with ``--apply`` exits 0."""
+        argv = [str(tmp_csv), "--merge", "--apply"]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+
+
+class TestCliSampleData:
+    """Runs against the CSV supplied by the ``sample_csv_path`` fixture."""
+
+    def test_sample_preview(self, sample_csv_path):
+        """The summary reports 50 input rows and includes a 'Removed:' line."""
+        outcome = runner.invoke(app, [str(sample_csv_path)])
+        assert outcome.exit_code == 0
+        text = outcome.output
+        assert "Rows in:   50" in text
+        # A "Removed:" line is expected because the sample should contain duplicates.
+        assert "Removed:" in text
+
+    def test_sample_apply(self, sample_csv_path, tmp_path):
+        """``--apply -o`` writes a CSV holding fewer than the 50 input rows."""
+        import pandas as pd
+
+        out = tmp_path / "deduped.csv"
+        argv = [str(sample_csv_path), "--apply", "-o", str(out)]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+        assert out.exists()
+        written = pd.read_csv(out, encoding="utf-8-sig")
+        # Deduplication must have dropped at least one of the 50 rows.
+        assert len(written) < 50
+
+    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
+        """Fuzzy matching on ``customer_name`` with ``--merge --apply`` writes the output file."""
+        out = tmp_path / "fuzzy_merged.csv"
+        argv = [str(sample_csv_path)]
+        argv += ["--fuzzy", "customer_name"]
+        argv += ["--threshold", "80"]
+        argv += ["--merge", "--apply"]
+        argv += ["-o", str(out)]
+        outcome = runner.invoke(app, argv)
+        assert outcome.exit_code == 0
+        assert out.exists()
+
+
+class TestCliHelp:
+    """Checks on the ``--help`` output."""
+
+    def test_help(self):
+        """``--help`` exits 0 and its text mentions the ``--apply`` option."""
+        outcome = runner.invoke(app, ["--help"])
+        assert outcome.exit_code == 0
+        help_text = outcome.output
+        assert "--apply" in help_text