"""End-to-end smoke tests. Round-trips through the CLI binaries with real fixture inputs to catch glue-code breakage that pure unit tests miss: argv parsing, file I/O, log configuration, exit codes, and the integration between the analyzer, the pre-parse repair, and pandas. These are intentionally lightweight — one happy path per CLI plus a couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and ``test_fixtures_sweep.py``. """ from __future__ import annotations import json import subprocess import sys from pathlib import Path import pandas as pd import pytest pytestmark = pytest.mark.e2e PROJECT_ROOT = Path(__file__).resolve().parent.parent CORPUS_KITCHEN_SINK = ( PROJECT_ROOT / "test-cases" / "text-cleaner-corpus" / "test_data" / "20_kitchen_sink.csv" ) def _run(*args: str, cwd: Path | None = None, **kwargs): return subprocess.run( [sys.executable, *args], capture_output=True, text=True, timeout=60, cwd=cwd or PROJECT_ROOT, **kwargs, ) # --------------------------------------------------------------------------- # cli_analyze — full round-trip # --------------------------------------------------------------------------- class TestAnalyzeCliE2E: def test_table_output_on_kitchen_sink(self): if not CORPUS_KITCHEN_SINK.exists(): pytest.skip("kitchen-sink fixture missing") proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK)) assert proc.returncode == 0, proc.stderr # Rich tables wrap; assert on stable substrings. assert "Clean Text" in proc.stdout assert "csv_bom_stripped" in proc.stdout or "smart_quotes" in proc.stdout def test_json_output_parses(self): if not CORPUS_KITCHEN_SINK.exists(): pytest.skip("kitchen-sink fixture missing") proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), "--json") assert proc.returncode == 0, proc.stderr data = json.loads(proc.stdout) assert isinstance(data, list) and len(data) > 0 for item in data: assert {"id", "severity", "tool", "count", "description"} <= set(item) # --------------------------------------------------------------------------- # cli_text_clean — full round-trip # --------------------------------------------------------------------------- class TestTextCleanCliE2E: def test_apply_writes_cleaned_file(self, tmp_path): # Build a small dirty CSV: NBSP padding + smart quotes. src = tmp_path / "dirty.csv" src.write_text( "id,name,note\n" "1, Alice ,“hello”\n" "2, Bob ,it’s fine\n", encoding="utf-8", ) out = tmp_path / "out.csv" proc = _run( "-m", "src.cli_text_clean", str(src), "--apply", "--output", str(out), ) assert proc.returncode == 0, proc.stderr assert out.exists(), "cleaned file was not written" cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig") # NBSP padding stripped assert cleaned.iloc[0]["name"] == "Alice" assert cleaned.iloc[1]["name"] == "Bob" # Smart quotes folded assert cleaned.iloc[0]["note"] == '"hello"' assert cleaned.iloc[1]["note"] == "it's fine" def test_preview_does_not_write(self, tmp_path): src = tmp_path / "input.csv" src.write_text("id,name\n1,Alice\n", encoding="utf-8") # Without --apply, no output file should appear. proc = _run("-m", "src.cli_text_clean", str(src)) assert proc.returncode == 0 # Default output path next to input — must not exist. default_out = src.with_name(src.stem + "_cleaned.csv") assert not default_out.exists() # --------------------------------------------------------------------------- # cli (dedup) — full round-trip # --------------------------------------------------------------------------- class TestDedupCliE2E: def test_apply_removes_duplicates(self, tmp_path): src = tmp_path / "dups.csv" src.write_text( "name,email\n" "Alice,alice@x.com\n" "Alice,alice@x.com\n" "Bob,bob@x.com\n", encoding="utf-8", ) out = tmp_path / "deduped.csv" proc = _run( "-m", "src.cli", str(src), "--apply", "--output", str(out), ) assert proc.returncode == 0, proc.stderr assert out.exists() result = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig") assert len(result) == 2 # Alice deduped, Bob unique # --------------------------------------------------------------------------- # run_tests.py self-test — sanity check the runner itself works # --------------------------------------------------------------------------- class TestRunTestsE2E: def test_tool_filter_runs_subset(self): proc = _run("run_tests.py", "--tool", "config", "-v") assert proc.returncode == 0, proc.stderr # Check we limited the run via -k. assert "config" in proc.stdout.lower() def test_unknown_tool_exits_2(self): proc = _run("run_tests.py", "--tool", "no_such_tool") assert proc.returncode == 2