Files
datatools-dev/tests/test_fixtures_sweep.py
Michael 696996c119 test(junk-corpus): pathological-input stress suite for the analyzer
Build a corpus of 35 deliberately-broken files (empty bytes, NUL
bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped
quotes, corrupt zip, etc.) and pin the analyzer's stability contract
against them.

Files land in ``test-cases/junk-corpus/test_data/``. The generator
``make_junk_corpus.py`` produces them deterministically (one random
sample uses ``secrets.token_bytes`` — committed bytes are stable
across regenerations because the byte stream is captured at commit
time). README documents the categories and how to add new shapes.

``tests/test_junk_corpus.py`` parametrizes over every file in the
corpus and asserts:

1. ``_run_analysis_on_upload`` never raises — exceptions must be
   caught and surfaced as a synthetic ``Finding`` with
   severity="error". This was the user-reported crash for
   13_non_latin_scripts.csv that the previous fix in ae9d4a2
   defensively wrapped; the corpus now stops the regression
   from re-landing on a different shape.
2. Every Finding in the result list is well-formed (string id,
   valid severity, non-empty description).
3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv,
   corrupt_xlsx.xlsx) MUST surface at least one error-level
   Finding — otherwise the GUI would render "no issues found"
   for a structurally broken file.
4. Error-level Finding descriptions are at least 20 chars so the
   UI banner gives the user something to act on.

Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py``
since that sweep is happy-path (round-trip the text cleaner) and
fights with files designed to break it. The contract is enforced
by the dedicated junk-corpus test, not the sweep.

Runtime: 12 s for the junk-corpus tests, 30 s for the full
project suite (was 19 s without these). 2118 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 21:35:22 +00:00

159 lines
5.9 KiB
Python

"""Automated sweep over every fixture in ``test-cases/``.
Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
next time pytest runs — no test code changes required. Each fixture goes
through four smoke tests:
1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
quotes, rogue delimiters) must not crash, and the repaired bytes must be
valid for ``pd.read_csv``.
2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
:class:`Finding` objects without raising.
3. **Text cleaner runs cleanly and preserves schema.** Default-config
``clean_dataframe`` must not change row count and must return the same
number of columns it started with.
4. **Text cleaner is idempotent.** Running default-config
``clean_dataframe`` on its own output must leave that output unchanged.
The sweep skips files inside ``text-cleaner-corpus/`` because that subdir
has its own dedicated test (``test_corpus.py``) with byte-exact expected
outputs, and files inside ``junk-corpus/`` because they are intentionally
pathological and are covered by ``tests/test_junk_corpus.py`` instead.
"""
from __future__ import annotations
import io
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import Finding, analyze
from src.core.io import detect_delimiter, detect_encoding, repair_bytes
from src.core.text_clean import clean_dataframe
# Root of the fixture tree: the ``test-cases`` directory that is a sibling of
# the directory containing this file.
TEST_CASES_DIR = Path(__file__).resolve().parents[1] / "test-cases"

# Top-level subdirectories of test-cases/ that the sweep must leave alone
# because dedicated tests own them. Sweeping them too would double-test or
# clash with byte-exact expected outputs. ``junk-corpus`` holds deliberately
# pathological files meant to break the cleaner/analyzer; its contract lives
# in ``tests/test_junk_corpus.py`` rather than in this happy-path sweep.
_EXCLUDED_SUBDIRS = {"junk-corpus", "text-cleaner-corpus"}

# Lower-cased file extensions the sweep knows how to load.
_SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xls", ".xlsx"}
def _discover_fixtures() -> list[Path]:
    """Collect every fixture file under ``TEST_CASES_DIR`` the sweep should run.

    Files sitting directly inside ``TEST_CASES_DIR`` are taken as-is. Each
    top-level subdirectory whose name is not in ``_EXCLUDED_SUBDIRS`` is
    searched recursively. Only files whose lower-cased suffix appears in
    ``_SUPPORTED_SUFFIXES`` are kept.

    Returns:
        Matching paths, ordered by sorted top-level entry and, within a
        subdirectory, by sorted path. Empty when ``TEST_CASES_DIR`` is not a
        directory.
    """
    if not TEST_CASES_DIR.is_dir():
        return []

    def _is_loadable(candidate: Path) -> bool:
        # A fixture is a regular file with an extension we can parse.
        return candidate.is_file() and candidate.suffix.lower() in _SUPPORTED_SUFFIXES

    found: list[Path] = []
    for entry in sorted(TEST_CASES_DIR.iterdir()):
        if not entry.is_dir():
            if _is_loadable(entry):
                found.append(entry)
        elif entry.name not in _EXCLUDED_SUBDIRS:
            # The exclusion check applies to top-level directory names only;
            # everything below a non-excluded directory is walked.
            found.extend(p for p in sorted(entry.rglob("*")) if _is_loadable(p))
    return found
# Fixture list, computed once at import time and shared by the module-level
# code and tests below (e.g. the parametrization of the sweep class).
_FIXTURES = _discover_fixtures()
def _fixture_id(path: Path) -> str:
    """Build the pytest id for *path*: its location relative to ``TEST_CASES_DIR``.

    Using the relative path rather than the bare filename keeps subdirectory
    names visible in test ids.
    """
    return str(path.relative_to(TEST_CASES_DIR))
# Tag every test in this module with the ``fixture_sweep`` marker.
#
# There is deliberately no module-wide ``skipif`` on an empty fixture list.
# An empty ``parametrize`` argument list already makes pytest skip the
# parametrized sweep tests (default ``empty_parameter_set_mark`` behaviour),
# whereas a module-level skip would also skip
# ``test_at_least_one_fixture_present`` — the one check that is meant to fail
# loudly when no fixtures are found under test-cases/.
pytestmark = [
    pytest.mark.fixture_sweep,
]
def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
    """Load *path* into an all-string DataFrame, repairing delimited text first.

    Excel workbooks (``.xlsx`` / ``.xls``) are read directly with
    ``pd.read_excel``. Every other file is treated as delimited text: its
    encoding and delimiter are detected, the raw bytes are passed through
    ``repair_bytes``, and the repaired bytes are parsed with ``pd.read_csv``.
    This is intended to mirror the loading pipeline used ahead of
    ``analyze()``.

    Args:
        path: Fixture file to load.

    Returns:
        ``(df, repair_result)``. ``repair_result`` is whatever
        ``repair_bytes`` returned, or ``None`` for Excel files, which get no
        byte-level repair.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl only reads the newer OOXML formats. For legacy ``.xls``
        # leave the engine unset so pandas selects its ``.xls`` reader instead
        # of failing inside openpyxl.
        engine = "openpyxl" if suffix == ".xlsx" else None
        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine=engine)
        return df, None

    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    # The repaired bytes are decoded as UTF-8 regardless of the detected
    # encoding — presumably ``repair_bytes`` transcodes; confirm against its
    # implementation. Malformed rows produce a warning rather than an error.
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8",
        delimiter=delim,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return df, repair
@pytest.mark.parametrize(
    "fixture",
    _FIXTURES,
    ids=list(map(_fixture_id, _FIXTURES)),
)
class TestFixtureSweep:
    """Smoke tests that every fixture in ``test-cases/`` must pass."""

    def test_repair_and_load(self, fixture: Path) -> None:
        """The fixture loads into a DataFrame that has at least one column."""
        frame, _repair = _read_with_repair(fixture)
        assert isinstance(frame, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
        column_count = len(frame.columns)
        assert column_count >= 1, f"{fixture.name}: zero columns after parse"

    def test_analyze_runs(self, fixture: Path) -> None:
        """``analyze()`` returns a list made up solely of ``Finding`` objects."""
        frame, repair = _read_with_repair(fixture)
        findings = analyze(frame, repair_result=repair)
        assert isinstance(findings, list)
        for item in findings:
            assert isinstance(item, Finding), (
                f"{fixture.name}: analyze() returned a non-Finding ({type(item)})"
            )

    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
        """Default-config cleaning keeps the row count and the column count."""
        frame, _repair = _read_with_repair(fixture)
        # Capture the shape before cleaning so the comparison is against the
        # input as loaded.
        rows_before, cols_before = len(frame), len(frame.columns)

        cleaned = clean_dataframe(frame).cleaned_df

        rows_after = len(cleaned)
        assert rows_after == rows_before, (
            f"{fixture.name}: row count changed ({rows_before} -> {rows_after})"
        )
        cols_after = len(cleaned.columns)
        assert cols_after == cols_before, (
            f"{fixture.name}: column count changed ({cols_before} -> {cols_after})"
        )

    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
        """Cleaning already-cleaned data yields an equal DataFrame."""
        frame, _repair = _read_with_repair(fixture)
        # Indexes are reset on both passes so only the cell contents and
        # columns take part in the comparison.
        first_pass = clean_dataframe(frame).cleaned_df.reset_index(drop=True)
        second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)
        assert first_pass.equals(second_pass), (
            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
        )
def test_at_least_one_fixture_present() -> None:
    """Guard against an empty sweep.

    Asserts that fixture discovery found at least one file under
    ``test-cases/``, so a checkout without fixtures is noticed. Relax this
    only if the fixtures are intentionally being moved elsewhere.
    """
    missing_fixtures_hint = (
        "No fixtures found under test-cases/. "
        "Drop a CSV or XLSX file into the directory and re-run."
    )
    assert _FIXTURES, missing_fixtures_hint