- Rewrite README.md with project overview, quick-start, and CLI summary - Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections - Add docs/DEVELOPER.md with architecture, data flow, and extension guides - Rewrite src/core/__init__.py with public API exports and module docstring - Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons - Add .gitignore, requirements.txt, all source code, tests, and sample data - Add streamlit to requirements.txt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
148 lines
4.7 KiB
Python
148 lines
4.7 KiB
Python
"""Integration tests for the CLI via Typer's CliRunner."""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from typer.testing import CliRunner
|
|
|
|
from src.cli import app
|
|
|
|
# Module-level CliRunner shared by the test classes in this file; each test
# calls ``runner.invoke(app, [...])`` and inspects the returned result.
runner = CliRunner()
|
|
|
|
|
|
class TestCliPreview:
    """Invocations that pass only an input path (no ``--apply``)."""

    def test_preview_default(self, tmp_csv):
        """A bare invocation exits 0 and prints preview-style output."""
        argv = [str(tmp_csv)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        text = outcome.output
        # Either marker is accepted as evidence of the preview summary.
        assert "Rows in" in text or "preview" in text.lower()

    def test_preview_shows_row_counts(self, tmp_csv):
        """Both the input and output row-count labels appear in the output."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        assert outcome.exit_code == 0
        for label in ("Rows in", "Rows out"):
            assert label in outcome.output

    def test_file_not_found(self):
        """A path that does not exist yields a non-zero exit and an error."""
        missing_path = "/tmp/nonexistent_xyz_abc.csv"
        outcome = runner.invoke(app, [missing_path])
        assert outcome.exit_code != 0
        lowered = outcome.output.lower()
        assert "not found" in lowered
|
|
|
|
|
|
class TestCliApply:
    """Invocations that pass ``--apply`` and are expected to write files."""

    def test_apply_writes_output(self, tmp_csv, tmp_path):
        """``--apply -o PATH`` exits 0 and creates the requested output file."""
        out = tmp_path / "output.csv"
        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
        assert result.exit_code == 0
        assert out.exists()

    def test_apply_default_output_name(self, tmp_csv):
        """Without ``-o``, ``<stem>_deduplicated.csv`` appears next to the input."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
        assert expected.exists()

    def test_apply_creates_removed_file(self, tmp_csv):
        """``--apply`` exits 0; a ``<stem>_removed.csv``, if present, is a file.

        The removed-rows file may or may not exist depending on whether
        duplicates were found with default auto-detect on simple_df, so its
        presence is not required. Previously the path was computed but never
        checked; now it is at least validated when it does exist.
        """
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
        # Guarded check: absence is acceptable, but an existing path must be
        # a regular file rather than, say, a directory.
        if removed.exists():
            assert removed.is_file()
|
|
|
|
|
|
class TestCliFuzzy:
    """Column-selection flags: ``--fuzzy``/``--threshold`` and ``--subset``."""

    def test_fuzzy_flag(self, tmp_csv):
        """``--fuzzy name --threshold 80`` is accepted and exits 0."""
        argv = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_subset_flag(self, tmp_csv):
        """``--subset email`` is accepted and exits 0."""
        argv = [str(tmp_csv), "--subset", "email"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_bad_column_error(self, tmp_csv):
        """An unknown ``--subset`` column fails with a "not found" message."""
        argv = [str(tmp_csv), "--subset", "nonexistent_column"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
        lowered = outcome.output.lower()
        assert "not found" in lowered
|
|
|
|
|
|
class TestCliConfig:
    """Round trip through ``--save-config`` and ``--config``."""

    def test_save_and_load_config(self, tmp_csv, tmp_path):
        """Saving a config creates the file; reusing it with ``--apply`` exits 0."""
        config_file = tmp_path / "my_config.json"
        csv_arg = str(tmp_csv)
        config_arg = str(config_file)

        # Step 1: run with --save-config and confirm the file was written.
        save_argv = [csv_arg, "--subset", "email", "--save-config", config_arg]
        saved = runner.invoke(app, save_argv)
        assert saved.exit_code == 0
        assert config_file.exists()

        # Step 2: feed the saved config back in together with --apply.
        load_argv = [csv_arg, "--config", config_arg, "--apply"]
        loaded = runner.invoke(app, load_argv)
        assert loaded.exit_code == 0
|
|
|
|
|
|
class TestCliSurvivor:
    """Accepted and rejected values of the ``--survivor`` option."""

    def test_survivor_last(self, tmp_csv):
        """``--survivor last`` exits 0."""
        argv = [str(tmp_csv), "--survivor", "last"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_survivor_most_complete(self, tmp_csv):
        """``--survivor most-complete`` exits 0."""
        argv = [str(tmp_csv), "--survivor", "most-complete"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_invalid_survivor(self, tmp_csv):
        """An unrecognised ``--survivor`` value produces a non-zero exit."""
        argv = [str(tmp_csv), "--survivor", "bogus"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
|
|
|
|
|
|
class TestCliMerge:
    """The ``--merge`` flag combined with ``--apply``."""

    def test_merge_flag(self, tmp_csv):
        """``--merge --apply`` on the temporary CSV exits 0."""
        argv = [str(tmp_csv), "--merge", "--apply"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
|
|
|
|
|
|
class TestCliSampleData:
    """Runs against the CSV supplied by the ``sample_csv_path`` fixture."""

    def test_sample_preview(self, sample_csv_path):
        """Output reports 50 input rows and includes a "Removed:" line."""
        outcome = runner.invoke(app, [str(sample_csv_path)])
        assert outcome.exit_code == 0
        text = outcome.output
        assert "Rows in: 50" in text
        # The sample is expected to contain duplicates.
        assert "Removed:" in text

    def test_sample_apply(self, sample_csv_path, tmp_path):
        """``--apply`` writes an output CSV with fewer than 50 rows."""
        destination = tmp_path / "deduped.csv"
        argv = [str(sample_csv_path), "--apply", "-o", str(destination)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert destination.exists()

        import pandas as pd

        # Read back with the BOM-tolerant codec and count surviving rows.
        row_count = len(pd.read_csv(destination, encoding="utf-8-sig"))
        assert row_count < 50

    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
        """Fuzzy matching plus ``--merge --apply`` exits 0 and writes output."""
        destination = tmp_path / "fuzzy_merged.csv"
        argv = [str(sample_csv_path)]
        argv += ["--fuzzy", "customer_name"]
        argv += ["--threshold", "80"]
        argv += ["--merge", "--apply"]
        argv += ["-o", str(destination)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert destination.exists()
|
|
|
|
|
|
class TestCliHelp:
    """The ``--help`` screen."""

    def test_help(self):
        """``--help`` exits 0 and its text mentions the ``--apply`` option."""
        outcome = runner.invoke(app, ["--help"])
        assert outcome.exit_code == 0
        help_text = outcome.output
        assert "--apply" in help_text
|