feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
147
tests/test_cli.py
Normal file
147
tests/test_cli.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Integration tests for the CLI via Typer's CliRunner."""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from src.cli import app
|
||||
|
||||
# Module-level CliRunner shared by every test below to invoke ``app``.
runner = CliRunner()
|
||||
|
||||
|
||||
class TestCliPreview:
    """Invocations without ``--apply``, plus the missing-input error path."""

    def test_preview_default(self, tmp_csv):
        """A bare CSV path exits 0 and prints preview or row-count text."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        text = outcome.output

        assert outcome.exit_code == 0
        assert "preview" in text.lower() or "Rows in" in text

    def test_preview_shows_row_counts(self, tmp_csv):
        """The output reports both the input and the output row counts."""
        outcome = runner.invoke(app, [str(tmp_csv)])

        assert outcome.exit_code == 0
        for label in ("Rows in", "Rows out"):
            assert label in outcome.output

    def test_file_not_found(self):
        """A path that does not exist yields a non-zero exit and a message."""
        missing_path = "/tmp/nonexistent_xyz_abc.csv"
        outcome = runner.invoke(app, [missing_path])

        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
|
||||
|
||||
|
||||
class TestCliApply:
    """Runs with ``--apply``, checking the files written to disk."""

    def test_apply_writes_output(self, tmp_csv, tmp_path):
        """``--apply -o PATH`` exits 0 and creates PATH."""
        out = tmp_path / "output.csv"
        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
        assert result.exit_code == 0
        assert out.exists()

    def test_apply_default_output_name(self, tmp_csv):
        """Without ``-o`` the output lands next to the input as ``<stem>_deduplicated.csv``."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
        assert expected.exists()

    def test_apply_creates_removed_file(self, tmp_csv):
        """Check the ``<stem>_removed.csv`` side file whenever it is produced.

        The file may or may not exist depending on whether duplicates were
        found with default auto-detect on the fixture data, so its absence is
        not a failure. Previously the path was computed and then ignored, so
        the test asserted nothing about the file at all.
        """
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
        if removed.exists():
            # When present it should be a real file that holds the removed
            # rows, not a directory or a zero-byte placeholder.
            assert removed.is_file()
            assert removed.stat().st_size > 0
|
||||
|
||||
|
||||
class TestCliFuzzy:
    """Column-selection flags: ``--fuzzy``/``--threshold`` and ``--subset``."""

    def test_fuzzy_flag(self, tmp_csv):
        """``--fuzzy name --threshold 80`` is accepted and exits 0."""
        argv = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0

    def test_subset_flag(self, tmp_csv):
        """``--subset email`` is accepted and exits 0."""
        argv = [str(tmp_csv), "--subset", "email"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0

    def test_bad_column_error(self, tmp_csv):
        """An unknown ``--subset`` column fails with a "not found" message."""
        argv = [str(tmp_csv), "--subset", "nonexistent_column"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
|
||||
|
||||
|
||||
class TestCliConfig:
    """Round trip of ``--save-config`` followed by ``--config``."""

    def test_save_and_load_config(self, tmp_csv, tmp_path):
        """Saving a config creates the file; a later run can load it with ``--apply``."""
        cfg_path = tmp_path / "my_config.json"
        csv_arg = str(tmp_csv)
        cfg_arg = str(cfg_path)

        # Step 1: persist the options of a --subset run.
        saved = runner.invoke(
            app, [csv_arg, "--subset", "email", "--save-config", cfg_arg]
        )
        assert saved.exit_code == 0
        assert cfg_path.exists()

        # Step 2: feed the saved file back in and apply.
        loaded = runner.invoke(app, [csv_arg, "--config", cfg_arg, "--apply"])
        assert loaded.exit_code == 0
|
||||
|
||||
|
||||
class TestCliSurvivor:
    """Accepted and rejected values of the ``--survivor`` option."""

    def test_survivor_last(self, tmp_csv):
        """``--survivor last`` exits 0."""
        argv = [str(tmp_csv), "--survivor", "last"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0

    def test_survivor_most_complete(self, tmp_csv):
        """``--survivor most-complete`` exits 0."""
        argv = [str(tmp_csv), "--survivor", "most-complete"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0

    def test_invalid_survivor(self, tmp_csv):
        """An unrecognised survivor value produces a non-zero exit code."""
        argv = [str(tmp_csv), "--survivor", "bogus"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code != 0
|
||||
|
||||
|
||||
class TestCliMerge:
    """The ``--merge`` flag combined with ``--apply``."""

    def test_merge_flag(self, tmp_csv):
        """``--merge --apply`` on the fixture CSV exits 0."""
        argv = [str(tmp_csv), "--merge", "--apply"]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
|
||||
|
||||
|
||||
class TestCliSampleData:
    """Runs against the CSV supplied by the ``sample_csv_path`` fixture."""

    def test_sample_preview(self, sample_csv_path):
        """Preview reports 50 input rows and a ``Removed:`` line."""
        outcome = runner.invoke(app, [str(sample_csv_path)])
        text = outcome.output

        assert outcome.exit_code == 0
        assert "Rows in: 50" in text
        # The sample data is expected to contain duplicates.
        assert "Removed:" in text

    def test_sample_apply(self, sample_csv_path, tmp_path):
        """Applying writes an output CSV with fewer than 50 rows."""
        out = tmp_path / "deduped.csv"
        argv = [str(sample_csv_path), "--apply", "-o", str(out)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert out.exists()

        import pandas as pd

        deduped = pd.read_csv(out, encoding="utf-8-sig")
        # Deduplication must have dropped at least one of the 50 input rows.
        assert len(deduped) < 50

    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
        """Fuzzy matching on ``customer_name`` with merge and apply writes the output."""
        out = tmp_path / "fuzzy_merged.csv"
        argv = [str(sample_csv_path)]
        argv += ["--fuzzy", "customer_name"]
        argv += ["--threshold", "80"]
        argv += ["--merge", "--apply"]
        argv += ["-o", str(out)]
        outcome = runner.invoke(app, argv)

        assert outcome.exit_code == 0
        assert out.exists()
|
||||
|
||||
|
||||
class TestCliHelp:
    """The ``--help`` screen."""

    def test_help(self):
        """``--help`` exits 0 and mentions the ``--apply`` option."""
        outcome = runner.invoke(app, ["--help"])

        assert outcome.exit_code == 0
        assert "--apply" in outcome.output
|
||||
Reference in New Issue
Block a user