Files
datatools-dev/tests/test_cli.py
Michael b871ab24fc feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00

148 lines
4.7 KiB
Python

"""Integration tests for the CLI via Typer's CliRunner."""
import pytest
from pathlib import Path
from typer.testing import CliRunner
from src.cli import app
# Shared Typer CliRunner; every test in this module invokes `app` through it.
runner = CliRunner()
class TestCliPreview:
    """Invocations without ``--apply``: only the input path is passed."""

    def test_preview_default(self, tmp_csv):
        """A bare invocation exits cleanly and mentions a preview or row count."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        assert outcome.exit_code == 0
        text = outcome.output
        assert "preview" in text.lower() or "Rows in" in text

    def test_preview_shows_row_counts(self, tmp_csv):
        """Both the input and output row counts are printed."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        assert outcome.exit_code == 0
        text = outcome.output
        assert "Rows in" in text
        assert "Rows out" in text

    def test_file_not_found(self):
        """A path that does not exist fails with a 'not found' message."""
        missing_path = "/tmp/nonexistent_xyz_abc.csv"
        outcome = runner.invoke(app, [missing_path])
        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
class TestCliApply:
    """Runs with ``--apply``, which are expected to write result files."""

    def test_apply_writes_output(self, tmp_csv, tmp_path):
        """An explicit ``-o`` path is created by ``--apply``."""
        out = tmp_path / "output.csv"
        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
        assert result.exit_code == 0
        assert out.exists()

    def test_apply_default_output_name(self, tmp_csv):
        """Without ``-o`` the output lands next to the input as ``<stem>_deduplicated.csv``."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
        assert expected.exists()

    def test_apply_creates_removed_file(self, tmp_csv):
        """``--apply`` succeeds; a ``<stem>_removed.csv`` file is validated if present."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
        # The removed-rows file may or may not exist depending on whether
        # duplicates were found with default auto-detect on simple_df, so its
        # presence is not required -- but when it is there it must be a
        # regular file rather than some other filesystem entry.
        if removed.exists():
            assert removed.is_file()
class TestCliFuzzy:
    """Column-selection flags: ``--fuzzy``/``--threshold`` and ``--subset``."""

    def test_fuzzy_flag(self, tmp_csv):
        """``--fuzzy name --threshold 80`` is accepted."""
        argv = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_subset_flag(self, tmp_csv):
        """``--subset email`` is accepted."""
        argv = [str(tmp_csv), "--subset", "email"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_bad_column_error(self, tmp_csv):
        """An unknown ``--subset`` column fails with a 'not found' message."""
        argv = [str(tmp_csv), "--subset", "nonexistent_column"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
class TestCliConfig:
    """Round trip of ``--save-config`` followed by ``--config``."""

    def test_save_and_load_config(self, tmp_csv, tmp_path):
        """A config written by one run can be fed back into a later ``--apply`` run."""
        config_file = tmp_path / "my_config.json"
        source = str(tmp_csv)

        # Step 1: persist the options of a --subset run to disk.
        save_args = [source, "--subset", "email", "--save-config", str(config_file)]
        saved = runner.invoke(app, save_args)
        assert saved.exit_code == 0
        assert config_file.exists()

        # Step 2: reuse the saved file and apply the run.
        load_args = [source, "--config", str(config_file), "--apply"]
        loaded = runner.invoke(app, load_args)
        assert loaded.exit_code == 0
class TestCliSurvivor:
    """Accepted and rejected values for the ``--survivor`` option."""

    def test_survivor_last(self, tmp_csv):
        """``--survivor last`` is accepted."""
        argv = [str(tmp_csv), "--survivor", "last"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_survivor_most_complete(self, tmp_csv):
        """``--survivor most-complete`` is accepted."""
        argv = [str(tmp_csv), "--survivor", "most-complete"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_invalid_survivor(self, tmp_csv):
        """An unrecognised survivor value makes the CLI exit non-zero."""
        argv = [str(tmp_csv), "--survivor", "bogus"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
class TestCliMerge:
    """Smoke test for the ``--merge`` flag."""

    def test_merge_flag(self, tmp_csv):
        """``--merge`` combined with ``--apply`` exits cleanly."""
        argv = [str(tmp_csv), "--merge", "--apply"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
class TestCliSampleData:
    """Runs against the sample CSV, which is expected to report 50 input rows."""

    def test_sample_preview(self, sample_csv_path):
        """Preview reports the 50 input rows and a 'Removed:' line."""
        outcome = runner.invoke(app, [str(sample_csv_path)])
        assert outcome.exit_code == 0
        text = outcome.output
        assert "Rows in: 50" in text
        # The sample data should contain duplicates, so removals are reported.
        assert "Removed:" in text

    def test_sample_apply(self, sample_csv_path, tmp_path):
        """Applying to the sample data writes a file with fewer than 50 rows."""
        target = tmp_path / "deduped.csv"
        argv = [str(sample_csv_path), "--apply", "-o", str(target)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert target.exists()

        # pandas is only needed by this test, so it is imported at point of use.
        import pandas as pd

        frame = pd.read_csv(target, encoding="utf-8-sig")
        # Deduplication must have dropped at least one of the 50 rows.
        assert len(frame) < 50

    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
        """Fuzzy matching on ``customer_name`` with merge and apply writes the output."""
        target = tmp_path / "fuzzy_merged.csv"
        argv = [str(sample_csv_path)]
        argv += ["--fuzzy", "customer_name"]
        argv += ["--threshold", "80"]
        argv += ["--merge", "--apply"]
        argv += ["-o", str(target)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert target.exists()
class TestCliHelp:
    """Checks the ``--help`` output."""

    def test_help(self):
        """``--help`` exits cleanly and lists the ``--apply`` option."""
        outcome = runner.invoke(app, ["--help"])
        assert outcome.exit_code == 0
        assert "--apply" in outcome.output