"""Automated sweep over every fixture in ``test-cases/``.

Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
next time pytest runs — no test code changes required.

Each fixture goes through four smoke tests:

1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
   quotes, rogue delimiters) must not crash, and produced bytes must be
   valid for ``pd.read_csv``.
2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
   :class:`Finding` objects without raising.
3. **Text cleaner runs cleanly and preserves schema.** Default-config
   ``clean_dataframe`` must not change row count and must return the same
   number of columns it started with.
4. **Text cleaner is idempotent.** Cleaning an already-cleaned frame must
   produce an equal frame.

The sweep skips files inside ``text-cleaner-corpus/`` because that subdir
has its own dedicated test (``test_corpus.py``) with byte-exact expected
outputs, and files inside ``junk-corpus/`` because those are covered by
``tests/test_junk_corpus.py``.
"""

from __future__ import annotations

import io
from pathlib import Path

import pandas as pd
import pytest

from src.core.analyze import Finding, analyze
from src.core.io import detect_delimiter, detect_encoding, repair_bytes
from src.core.text_clean import clean_dataframe

TEST_CASES_DIR = Path(__file__).resolve().parent.parent / "test-cases"

# Subdirectories in test-cases/ that are exercised by their own dedicated
# tests. The sweep ignores these so we don't double-test or fight expected
# byte-exact outputs. ``junk-corpus`` is intentionally pathological —
# files there are designed to break the cleaner/analyzer; the contract is
# enforced by ``tests/test_junk_corpus.py``, not this happy-path sweep.
_EXCLUDED_SUBDIRS = {"text-cleaner-corpus", "junk-corpus"}

# File suffixes we know how to load.
_SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xlsx", ".xls"}


def _discover_fixtures() -> list[Path]:
    """Return every fixture file under test-cases/ that the sweep should run.

    Supported files directly inside ``test-cases/`` are picked up, and every
    non-excluded immediate subdirectory is searched recursively. Exclusion is
    by the name of the top-level subdirectory only; a directory with an
    excluded name nested deeper inside an included subdirectory is still
    searched.

    Returns:
        Fixture paths in sorted order, or an empty list when
        ``test-cases/`` does not exist.
    """
    if not TEST_CASES_DIR.is_dir():
        return []

    out: list[Path] = []
    for entry in sorted(TEST_CASES_DIR.iterdir()):
        if entry.is_dir():
            if entry.name in _EXCLUDED_SUBDIRS:
                continue
            out.extend(
                sub
                for sub in sorted(entry.rglob("*"))
                if sub.is_file() and sub.suffix.lower() in _SUPPORTED_SUFFIXES
            )
        elif entry.is_file() and entry.suffix.lower() in _SUPPORTED_SUFFIXES:
            out.append(entry)
    return out


# Discovered once at import time so the list can feed ``parametrize``.
_FIXTURES = _discover_fixtures()


def _fixture_id(path: Path) -> str:
    """Pretty pytest id derived from the filename, keeping subdirs visible.

    Uses forward slashes on every platform so test ids are stable between
    Windows and POSIX runs.
    """
    return path.relative_to(TEST_CASES_DIR).as_posix()


# Skip the entire module gracefully when no fixtures are present, instead of
# emitting a "no tests collected" failure.
# NOTE(review): because this mark is module-wide it also skips
# ``test_at_least_one_fixture_present`` below whenever ``_FIXTURES`` is
# empty, so that guard can never actually fail. Decide which behaviour is
# wanted (skip vs. fail on a fixture-less checkout) and scope the skipif
# accordingly.
pytestmark = [
    pytest.mark.fixture_sweep,
    pytest.mark.skipif(
        not _FIXTURES,
        reason="no fixtures found under test-cases/ — drop a CSV/XLSX in to enable the sweep",
    ),
]


def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
    """Read *path* with the same robust pipeline analyze() uses.

    Excel files are loaded directly with ``pd.read_excel``. Everything else
    has its encoding and delimiter detected, is passed through
    ``repair_bytes``, and the repaired bytes are parsed with ``pd.read_csv``.
    All values are loaded as strings with NA inference disabled.

    Args:
        path: Fixture file to load.

    Returns:
        ``(df, repair_result)`` where repair_result is None for Excel.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl cannot read the legacy binary ``.xls`` format, so only
        # force it for ``.xlsx``; for ``.xls`` let pandas choose the engine.
        engine = "openpyxl" if suffix == ".xlsx" else None
        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine=engine)
        return df, None

    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    # The repaired bytes are read as UTF-8 regardless of the detected
    # encoding — presumably repair_bytes re-encodes; verify against src.core.io.
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8",
        delimiter=delim,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return df, repair


@pytest.mark.parametrize("fixture", _FIXTURES, ids=[_fixture_id(p) for p in _FIXTURES])
class TestFixtureSweep:
    """Smoke tests that every fixture in ``test-cases/`` must pass."""

    def test_repair_and_load(self, fixture: Path) -> None:
        """The fixture loads into a DataFrame with at least one column."""
        df, _ = _read_with_repair(fixture)
        assert isinstance(df, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
        assert len(df.columns) >= 1, f"{fixture.name}: zero columns after parse"

    def test_analyze_runs(self, fixture: Path) -> None:
        """``analyze()`` returns a list containing only ``Finding`` objects."""
        df, repair = _read_with_repair(fixture)
        findings = analyze(df, repair_result=repair)
        assert isinstance(findings, list)
        for finding in findings:
            assert isinstance(finding, Finding), (
                f"{fixture.name}: analyze() returned a non-Finding ({type(finding)})"
            )

    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
        """Default ``clean_dataframe`` keeps the row and column counts."""
        df, _ = _read_with_repair(fixture)
        before_rows = len(df)
        before_cols = len(df.columns)
        result = clean_dataframe(df)
        assert len(result.cleaned_df) == before_rows, (
            f"{fixture.name}: row count changed "
            f"({before_rows} -> {len(result.cleaned_df)})"
        )
        assert len(result.cleaned_df.columns) == before_cols, (
            f"{fixture.name}: column count changed "
            f"({before_cols} -> {len(result.cleaned_df.columns)})"
        )

    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
        """Cleaning an already-cleaned frame yields an equal frame."""
        df, _ = _read_with_repair(fixture)
        # Indexes are reset so only the cell contents are compared.
        once = clean_dataframe(df).cleaned_df.reset_index(drop=True)
        twice = clean_dataframe(once).cleaned_df.reset_index(drop=True)
        assert once.equals(twice), (
            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
        )


def test_at_least_one_fixture_present() -> None:
    """Smoke check: every project should ship at least one fixture so the
    sweep is not silently skipped on a clean checkout. Adjust the threshold
    only if intentionally moving fixtures elsewhere."""
    assert len(_FIXTURES) > 0, (
        "No fixtures found under test-cases/. "
        "Drop a CSV or XLSX file into the directory and re-run."
    )