datatools-dev/tests/test_fixtures_sweep.py

"""Automated sweep over every fixture in ``test-cases/``.

Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
next time pytest runs — no test code changes required. Each fixture goes
through four smoke tests:

1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
   quotes, rogue delimiters) must not crash, and produced bytes must be
   valid for ``pd.read_csv``.
2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
   :class:`Finding` objects without raising.
3. **Text cleaner runs cleanly and preserves schema.** Default-config
   ``clean_dataframe`` must not change row count and must return the same
   number of columns it started with.
4. **Text cleaner is idempotent.** Running default-config
   ``clean_dataframe`` on its own output must leave the data unchanged.

The sweep skips files inside ``text-cleaner-corpus/`` (covered by
``test_corpus.py`` with byte-exact expected outputs) and ``junk-corpus/``
(intentionally pathological inputs covered by ``test_junk_corpus.py``).
"""

from __future__ import annotations

import io
from pathlib import Path

import pandas as pd
import pytest

from src.core.analyze import Finding, analyze
from src.core.io import detect_delimiter, detect_encoding, repair_bytes
from src.core.text_clean import clean_dataframe


# Root of the fixture tree: the ``test-cases`` directory that sits beside the
# directory containing this file.
TEST_CASES_DIR = Path(__file__).resolve().parent.parent.joinpath("test-cases")

# Top-level subdirectories of test-cases/ that the sweep leaves alone because
# dedicated tests already cover them:
#   * ``text-cleaner-corpus`` has byte-exact expected outputs of its own.
#   * ``junk-corpus`` is intentionally pathological — its files are designed
#     to break the cleaner/analyzer, and that contract is enforced by
#     ``tests/test_junk_corpus.py`` rather than by this happy-path sweep.
_EXCLUDED_SUBDIRS = {
    "junk-corpus",
    "text-cleaner-corpus",
}

# Lower-cased file suffixes the sweep knows how to load.
_SUPPORTED_SUFFIXES = {
    ".csv",
    ".tsv",
    ".xls",
    ".xlsx",
}


def _discover_fixtures() -> list[Path]:
    """Collect every fixture file under test-cases/ that the sweep should run.

    Supported files sitting directly in test-cases/ are included. Each
    top-level subdirectory is searched recursively unless its name is listed
    in ``_EXCLUDED_SUBDIRS``. Suffix matching is case-insensitive.

    Returns:
        Fixture paths, with top-level entries visited in sorted order and the
        files of each subdirectory in sorted order. The list is empty when
        the test-cases/ directory does not exist.
    """

    def _is_fixture(candidate: Path) -> bool:
        # A fixture is a regular file whose suffix we know how to load.
        return candidate.is_file() and candidate.suffix.lower() in _SUPPORTED_SUFFIXES

    fixtures: list[Path] = []
    if not TEST_CASES_DIR.is_dir():
        return fixtures

    for top in sorted(TEST_CASES_DIR.iterdir()):
        if not top.is_dir():
            if _is_fixture(top):
                fixtures.append(top)
        elif top.name not in _EXCLUDED_SUBDIRS:
            # Only top-level directory names are checked against the
            # exclusion set; everything below an accepted directory is taken.
            fixtures.extend(p for p in sorted(top.rglob("*")) if _is_fixture(p))
    return fixtures


# Discovered once at import (collection) time; both the parametrization of the
# sweep class and the fixture-presence check read this list.
_FIXTURES = _discover_fixtures()


def _fixture_id(path: Path) -> str:
    """Build the pytest id for *path*: its location relative to test-cases/.

    Using the relative path rather than the bare filename keeps any
    subdirectory visible in the test id.
    """
    return str(path.relative_to(TEST_CASES_DIR))


# Module-wide marks. Anything listed here applies to EVERY test in this
# module, including ``test_at_least_one_fixture_present``. A module-level
# ``skipif(not _FIXTURES)`` would therefore skip that check in exactly the
# situation it exists to report, so no skip condition is attached here.
# With an empty fixture list the parametrized sweep tests are still skipped
# rather than failed: pytest skips tests whose parameter set is empty (its
# default ``empty_parameter_set_mark`` behaviour).
pytestmark = [
    pytest.mark.fixture_sweep,
]


def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
    """Load *path* into an all-string DataFrame, repairing delimited text first.

    Excel workbooks (``.xlsx`` / ``.xls``) are read directly with
    ``pd.read_excel``. Every other suffix is treated as delimited text: the
    encoding and delimiter are detected, the raw bytes are passed through
    ``repair_bytes``, and the repaired bytes are parsed with ``pd.read_csv``.
    In both cases values are kept as strings and empty cells are not
    converted to NaN.

    Args:
        path: Fixture file to load.

    Returns:
        ``(df, repair_result)`` where ``repair_result`` is the object
        returned by ``repair_bytes``, or ``None`` for Excel files.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl cannot open legacy ``.xls`` workbooks, so it is pinned only
        # for ``.xlsx``; ``engine=None`` lets pandas choose the reader that
        # matches the file format.
        engine = "openpyxl" if suffix == ".xlsx" else None
        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine=engine)
        return df, None

    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    # NOTE(review): the repaired bytes are decoded as UTF-8 regardless of the
    # detected source encoding — presumably repair_bytes transcodes; confirm.
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8", delimiter=delim,
        # Malformed rows produce a warning instead of aborting the load.
        dtype=str, keep_default_na=False, on_bad_lines="warn",
    )
    return df, repair


@pytest.mark.parametrize("fixture", _FIXTURES, ids=list(map(_fixture_id, _FIXTURES)))
class TestFixtureSweep:
    """Per-fixture smoke checks; every method runs once for each discovered file."""

    def test_repair_and_load(self, fixture: Path) -> None:
        """Loading must yield a DataFrame that has at least one column."""
        loaded, _repair = _read_with_repair(fixture)
        assert isinstance(loaded, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
        assert len(loaded.columns) >= 1, f"{fixture.name}: zero columns after parse"

    def test_analyze_runs(self, fixture: Path) -> None:
        """``analyze()`` must return a list made up solely of ``Finding`` objects."""
        frame, repair_result = _read_with_repair(fixture)
        results = analyze(frame, repair_result=repair_result)
        assert isinstance(results, list)
        for item in results:
            assert isinstance(item, Finding), (
                f"{fixture.name}: analyze() returned a non-Finding ({type(item)})"
            )

    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
        """Default-config cleaning must keep the row count and the column count."""
        frame, _repair = _read_with_repair(fixture)
        # Capture the dimensions before cleaning so they cannot be affected by it.
        before_rows, before_cols = frame.shape
        cleaned = clean_dataframe(frame).cleaned_df
        after_rows = len(cleaned)
        after_cols = len(cleaned.columns)
        assert after_rows == before_rows, (
            f"{fixture.name}: row count changed "
            f"({before_rows} -> {after_rows})"
        )
        assert after_cols == before_cols, (
            f"{fixture.name}: column count changed "
            f"({before_cols} -> {after_cols})"
        )

    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
        """Cleaning already-cleaned data must not change it again."""
        frame, _repair = _read_with_repair(fixture)
        # Indexes are reset on both passes so the comparison is on content only.
        first_pass = clean_dataframe(frame).cleaned_df.reset_index(drop=True)
        second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)
        assert first_pass.equals(second_pass), (
            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
        )


def test_at_least_one_fixture_present() -> None:
    """Fail when fixture discovery found nothing at all.

    The project is expected to ship at least one fixture so the sweep always
    has something to exercise. Relax this check only if fixtures are being
    moved elsewhere on purpose.
    """
    message = (
        "No fixtures found under test-cases/. "
        "Drop a CSV or XLSX file into the directory and re-run."
    )
    fixture_count = len(_FIXTURES)
    assert fixture_count > 0, message