Files
datatools-dev/tests/test_e2e.py
Michael db5ec084da docs+code: rename tool labels everywhere
Sweep follow-up to 93e43fc. Display labels now consistent across docs,
landing pages, CLI output, code comments, docstrings, and test prose.
Five parallel surfaces touched:

- docs (EN + ES): README, USER-GUIDE, CLI-REFERENCE, and 11 internal
  design/planning docs
- landing pages: index + bookkeeper/revops/shopify-pet
- src: CLI module docstrings, _TOOL_DISPLAY dicts in cli_analyze.py
  and gui/components/_legacy.py, core module headers, every tool
  page's module docstring
- tests: class/method/module docstrings and section-header comments
- test-cases READMEs

Page slugs (1_Deduplicator etc.), tool_id strings (01_deduplicator
etc.), Python class names (TestDeduplicatorWorkflow, FeatureFlag.*),
URL paths, anchor IDs, CSS classes, and asset filenames were left
intact since they're code identifiers / structural references.

All 2033 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 19:50:09 +00:00

144 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""End-to-end smoke tests.
Round-trips through the CLI binaries with real fixture inputs to catch
glue-code breakage that pure unit tests miss: argv parsing, file I/O, log
configuration, exit codes, and the integration between the analyzer, the
pre-parse repair, and pandas.
These are intentionally lightweight — one happy path per CLI plus a
couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and
``test_fixtures_sweep.py``.
"""
from __future__ import annotations
import json
import subprocess
import sys
from pathlib import Path
import pandas as pd
import pytest
# Module-level pytest mark: applies the ``e2e`` marker to every test in this file.
pytestmark = pytest.mark.e2e
# Directory that contains this file; the project root is its parent.
_HERE = Path(__file__).resolve().parent
PROJECT_ROOT = _HERE.parent

# Fixture file used by the analyzer tests; they skip when it does not exist.
CORPUS_KITCHEN_SINK = PROJECT_ROOT.joinpath(
    "test-cases",
    "text-cleaner-corpus",
    "test_data",
    "20_kitchen_sink.csv",
)
def _run(*args: str, cwd: Path | None = None, **kwargs):
return subprocess.run(
[sys.executable, *args],
capture_output=True, text=True, timeout=60,
cwd=cwd or PROJECT_ROOT,
**kwargs,
)
# ---------------------------------------------------------------------------
# cli_analyze — full round-trip
# ---------------------------------------------------------------------------
class TestAnalyzeCliE2E:
    """Round-trip ``src.cli_analyze`` against the kitchen-sink corpus file."""

    @staticmethod
    def _analyze(*extra_args: str):
        """Run the analyzer CLI on the fixture, skipping if it is absent."""
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")
        return _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), *extra_args)

    def test_table_output_on_kitchen_sink(self):
        """Default (table) output exits 0 and mentions the expected strings."""
        result = self._analyze()
        assert result.returncode == 0, result.stderr
        # Rich tables wrap; assert on stable substrings.
        rendered = result.stdout
        assert "Clean Text" in rendered
        assert any(
            marker in rendered for marker in ("csv_bom_stripped", "smart_quotes")
        )

    def test_json_output_parses(self):
        """``--json`` output is a non-empty JSON list of well-formed items."""
        result = self._analyze("--json")
        assert result.returncode == 0, result.stderr
        findings = json.loads(result.stdout)
        assert isinstance(findings, list) and len(findings) > 0
        # Every item must carry at least these keys.
        required_keys = {"id", "severity", "tool", "count", "description"}
        for finding in findings:
            assert required_keys <= set(finding)
# ---------------------------------------------------------------------------
# cli_text_clean — full round-trip
# ---------------------------------------------------------------------------
class TestTextCleanCliE2E:
    """Round-trip ``src.cli_text_clean`` on small CSVs written to ``tmp_path``."""

    def test_apply_writes_cleaned_file(self, tmp_path):
        """``--apply --output`` writes a file with padding and quotes cleaned."""
        # Build a small dirty CSV: NBSP padding + smart quotes. The special
        # characters are written as escapes so they stay visible in review
        # and cannot be silently normalised by an editor or copy/paste:
        #   \u00a0          NO-BREAK SPACE (padding around the names)
        #   \u201c / \u201d curly double quotes
        #   \u2019          curly apostrophe
        src = tmp_path / "dirty.csv"
        src.write_text(
            "id,name,note\n"
            "1,\u00a0Alice\u00a0,\u201chello\u201d\n"
            "2,\u00a0Bob\u00a0,it\u2019s fine\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        proc = _run(
            "-m", "src.cli_text_clean", str(src),
            "--apply", "--output", str(out),
        )
        assert proc.returncode == 0, proc.stderr
        assert out.exists(), "cleaned file was not written"
        # Read everything back as plain strings so values compare exactly.
        cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
        # NBSP padding stripped
        assert cleaned.iloc[0]["name"] == "Alice"
        assert cleaned.iloc[1]["name"] == "Bob"
        # Smart quotes folded
        assert cleaned.iloc[0]["note"] == '"hello"'
        assert cleaned.iloc[1]["note"] == "it's fine"

    def test_preview_does_not_write(self, tmp_path):
        """Without ``--apply`` the CLI exits 0 and leaves no default output file."""
        src = tmp_path / "input.csv"
        src.write_text("id,name\n1,Alice\n", encoding="utf-8")
        # Without --apply, no output file should appear.
        proc = _run("-m", "src.cli_text_clean", str(src))
        assert proc.returncode == 0, proc.stderr
        # Default output path next to input — must not exist.
        default_out = src.with_name(src.stem + "_cleaned.csv")
        assert not default_out.exists()
# ---------------------------------------------------------------------------
# cli (dedup) — full round-trip
# ---------------------------------------------------------------------------
class TestDedupCliE2E:
    """Round-trip ``src.cli`` (dedup) on a CSV containing one repeated row."""

    def test_apply_removes_duplicates(self, tmp_path):
        """``--apply --output`` writes a file with two rows left."""
        csv_lines = (
            "name,email\n",
            "Alice,alice@x.com\n",
            "Alice,alice@x.com\n",
            "Bob,bob@x.com\n",
        )
        input_csv = tmp_path / "dups.csv"
        input_csv.write_text("".join(csv_lines), encoding="utf-8")
        output_csv = tmp_path / "deduped.csv"

        cli_args = [
            "-m", "src.cli", str(input_csv),
            "--apply", "--output", str(output_csv),
        ]
        completed = _run(*cli_args)

        assert completed.returncode == 0, completed.stderr
        assert output_csv.exists()
        deduped = pd.read_csv(
            output_csv, dtype=str, keep_default_na=False, encoding="utf-8-sig"
        )
        assert len(deduped) == 2  # Alice deduped, Bob unique
# ---------------------------------------------------------------------------
# run_tests.py self-test — sanity check the runner itself works
# ---------------------------------------------------------------------------
class TestRunTestsE2E:
    """Sanity-check ``run_tests.py`` by invoking it as a script."""

    def test_tool_filter_runs_subset(self):
        """``--tool config -v`` exits 0 and mentions "config" in its output."""
        completed = _run("run_tests.py", "--tool", "config", "-v")
        assert completed.returncode == 0, completed.stderr
        # Check we limited the run via -k.
        lowered_output = completed.stdout.lower()
        assert "config" in lowered_output

    def test_unknown_tool_exits_2(self):
        """An unrecognised ``--tool`` value yields exit status 2."""
        exit_code = _run("run_tests.py", "--tool", "no_such_tool").returncode
        assert exit_code == 2