# datatools-dev/tests/test_e2e.py

"""End-to-end smoke tests.

Round-trips through the CLI binaries with real fixture inputs to catch
glue-code breakage that pure unit tests miss: argv parsing, file I/O, log
configuration, exit codes, and the integration between the analyzer, the
pre-parse repair, and pandas.

These are intentionally lightweight — one happy path per CLI plus a
couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and
``test_fixtures_sweep.py``.
"""

from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path

import pandas as pd
import pytest

# Module-level marker: pytest applies ``e2e`` to every test collected from
# this file, so the whole suite can be selected or deselected as a group.
pytestmark = pytest.mark.e2e

# Directory two levels above this file; used as the default working
# directory for every subprocess the tests spawn.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Corpus fixture consumed by the analyzer tests. Tests that need it skip
# themselves when the file is not present.
CORPUS_KITCHEN_SINK = PROJECT_ROOT.joinpath(
    "test-cases",
    "text-cleaner-corpus",
    "test_data",
    "20_kitchen_sink.csv",
)


def _run(*args: str, cwd: Path | None = None, **kwargs):
    """Run the current Python interpreter with *args* and capture its output.

    Args:
        *args: Arguments placed after ``sys.executable`` on the command line
            (e.g. ``"-m", "src.cli"`` or a script name).
        cwd: Working directory for the child process. Falls back to
            ``PROJECT_ROOT`` when omitted (or otherwise falsy).
        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout``/``stderr``
        captured as text.

    Raises:
        subprocess.TimeoutExpired: If the child runs longer than 60 seconds.
    """
    command = [sys.executable]
    command.extend(args)
    workdir = cwd or PROJECT_ROOT
    return subprocess.run(
        command,
        cwd=workdir,
        capture_output=True,
        text=True,
        timeout=60,
        **kwargs,
    )


# ---------------------------------------------------------------------------
# cli_analyze — full round-trip
# ---------------------------------------------------------------------------

class TestAnalyzeCliE2E:
    """Run ``src.cli_analyze`` as a subprocess against the kitchen-sink fixture."""

    def test_table_output_on_kitchen_sink(self):
        """Default output mode exits 0 and mentions the expected findings."""
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")

        completed = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK))
        assert completed.returncode == 0, completed.stderr

        rendered = completed.stdout
        # Rich may wrap table cells, so only short, stable substrings are
        # checked rather than whole rows.
        assert "Clean Text" in rendered
        expected_findings = ("csv_bom_stripped", "smart_quotes")
        assert any(finding in rendered for finding in expected_findings)

    def test_json_output_parses(self):
        """``--json`` emits a non-empty JSON list of finding objects."""
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")

        completed = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), "--json")
        assert completed.returncode == 0, completed.stderr

        payload = json.loads(completed.stdout)
        assert isinstance(payload, list)
        assert len(payload) > 0

        # Every finding must carry at least these keys; extras are allowed.
        required_keys = {"id", "severity", "tool", "count", "description"}
        for entry in payload:
            assert required_keys.issubset(entry)


# ---------------------------------------------------------------------------
# cli_text_clean — full round-trip
# ---------------------------------------------------------------------------

class TestTextCleanCliE2E:
    """Run ``src.cli_text_clean`` as a subprocess on small generated CSVs."""

    def test_apply_writes_cleaned_file(self, tmp_path):
        """``--apply --output`` writes a file with padding and quotes cleaned."""
        # Build a small dirty CSV: NBSP padding + smart quotes. The special
        # characters are spelled as escapes so they stay visible in review
        # and cannot be silently normalized by an editor or formatter:
        #   \u00a0 = no-break space, \u201c/\u201d = curly double quotes,
        #   \u2019 = curly apostrophe.
        src = tmp_path / "dirty.csv"
        src.write_text(
            "id,name,note\n"
            "1,\u00a0Alice\u00a0,\u201chello\u201d\n"
            "2,\u00a0\u00a0Bob\u00a0\u00a0,it\u2019s fine\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        proc = _run(
            "-m", "src.cli_text_clean", str(src),
            "--apply", "--output", str(out),
        )
        assert proc.returncode == 0, proc.stderr
        assert out.exists(), "cleaned file was not written"
        # dtype=str / keep_default_na=False keep cells as raw strings;
        # utf-8-sig tolerates a BOM on the written file.
        cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
        # NBSP padding stripped
        assert cleaned.iloc[0]["name"] == "Alice"
        assert cleaned.iloc[1]["name"] == "Bob"
        # Smart quotes folded
        assert cleaned.iloc[0]["note"] == '"hello"'
        assert cleaned.iloc[1]["note"] == "it's fine"

    def test_preview_does_not_write(self, tmp_path):
        """Without ``--apply`` the CLI succeeds but leaves no output file."""
        src = tmp_path / "input.csv"
        src.write_text("id,name\n1,Alice\n", encoding="utf-8")
        # Without --apply, no output file should appear.
        proc = _run("-m", "src.cli_text_clean", str(src))
        # Surface the child's stderr on failure, like the other CLI tests.
        assert proc.returncode == 0, proc.stderr
        # Default output path next to input — must not exist.
        default_out = src.with_name(src.stem + "_cleaned.csv")
        assert not default_out.exists()


# ---------------------------------------------------------------------------
# cli (dedup) — full round-trip
# ---------------------------------------------------------------------------

class TestDedupCliE2E:
    """Run ``src.cli`` (dedup) as a subprocess on a small generated CSV."""

    def test_apply_removes_duplicates(self, tmp_path):
        """``--apply --output`` collapses an exact duplicate row."""
        rows = (
            "name,email\n",
            "Alice,alice@x.com\n",
            "Alice,alice@x.com\n",
            "Bob,bob@x.com\n",
        )
        input_csv = tmp_path / "dups.csv"
        input_csv.write_text("".join(rows), encoding="utf-8")
        output_csv = tmp_path / "deduped.csv"

        completed = _run(
            "-m", "src.cli", str(input_csv),
            "--apply", "--output", str(output_csv),
        )
        assert completed.returncode == 0, completed.stderr
        assert output_csv.exists()

        frame = pd.read_csv(
            output_csv,
            dtype=str,
            keep_default_na=False,
            encoding="utf-8-sig",
        )
        # Two data rows expected: one Alice (duplicate removed) and Bob.
        assert frame.shape[0] == 2


# ---------------------------------------------------------------------------
# run_tests.py self-test — sanity check the runner itself works
# ---------------------------------------------------------------------------

class TestRunTestsE2E:
    """Sanity-check ``run_tests.py`` by invoking it as a script."""

    def test_tool_filter_runs_subset(self):
        """``--tool config -v`` exits 0 and mentions ``config`` in stdout."""
        proc = _run("run_tests.py", "--tool", "config", "-v")
        assert proc.returncode == 0, proc.stderr
        # Check we limited the run via -k.
        # NOTE(review): this only confirms "config" appears somewhere in
        # stdout (case-insensitive); if the runner forwards pytest's header,
        # a "configfile:" line could satisfy it on its own — consider
        # asserting on something more specific.
        assert "config" in proc.stdout.lower()

    def test_unknown_tool_exits_2(self):
        """An unrecognized ``--tool`` value makes the runner exit with 2."""
        proc = _run("run_tests.py", "--tool", "no_such_tool")
        # Include stderr so an unexpected exit code is diagnosable, matching
        # the other return-code assertions in this module.
        assert proc.returncode == 2, proc.stderr