datatools-dev/tests/test_corpus.py

"""Run every corpus fixture through the current text cleaner and report diffs.

This is an *acceptance* test against an external corpus shipped in
``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps
between the current implementation and the spec target in TEST-CASES.md.
The test fails on diff — that's the point. Each failure is informative.

Cases 12 and 14 produce multiple expected outputs depending on flags, so
each variant has its own test; case 18 (zero-byte file) and case 21
(XLSX-only) are verified separately (manual / smoke).
"""

from __future__ import annotations

import io
import subprocess
import sys
from pathlib import Path

import pandas as pd
import pytest

from src.core.text_clean import CleanOptions, clean_dataframe

# Corpus root sits two directory levels above this file; inputs and expected
# outputs live in sibling sub-directories beneath it.
CORPUS = Path(__file__).parent.parent.joinpath("test-cases", "text-cleaner-corpus")
TEST_DATA = CORPUS.joinpath("test_data")
EXPECTED = CORPUS.joinpath("expected")


# Fixtures for which one run with default options is compared against a
# single expected file of the same name. Cases 12, 14, 18 and 21 are absent
# here on purpose; they are exercised by dedicated tests.
DEFAULT_CASES = [
    # whitespace / punctuation / Unicode folds
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    # file-level encoding and line endings
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    # remaining single-output cases
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "19_headers_only",
    "20_kitchen_sink",
]


def _read_csv_strict(path: Path) -> pd.DataFrame:
    """Load a corpus CSV into a DataFrame whose cells are all strings.

    Only structural fixes needed to make the file parseable are applied
    before parsing: NUL bytes are dropped (case 06), CRLF and bare CR are
    rewritten to LF (cases 09/10), and rows are passed through the
    project's row-repair helper (needed for the unquoted currency value in
    case 17). Character-level folds such as smart quotes or NBSP are left
    untouched on purpose, so the cleaner's own behavior stays under test.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed frame, read with ``dtype=str`` and
        ``keep_default_na=False`` so empty cells stay empty strings.
    """
    payload = path.read_bytes()
    # Drop NULs first, then collapse line endings. CRLF must be handled
    # before bare CR, otherwise every CRLF would turn into two newlines.
    payload = payload.replace(b"\x00", b"")
    payload = payload.replace(b"\r\n", b"\n")
    payload = payload.replace(b"\r", b"\n")

    # Imported lazily, as in the original; the helper is project-private.
    from src.core.io import _repair_rows

    # "utf-8-sig" consumes a leading BOM if one is present.
    decoded = payload.decode("utf-8-sig")
    repaired, _, _ = _repair_rows(decoded, ",")

    buffer = io.StringIO(repaired)
    return pd.read_csv(buffer, dtype=str, keep_default_na=False)


# ---------------------------------------------------------------------------
# DataFrame-level diff (covers cell content; ignores file-level encoding/EOL)
# ---------------------------------------------------------------------------

@pytest.mark.parametrize("name", DEFAULT_CASES)
def test_corpus_dataframe_diff(name):
    """Run clean_dataframe on the input and diff against the expected DF.

    The comparison is done in three stages so each failure message points
    at one kind of divergence: header list, row count, then individual
    cells.

    Args:
        name: Fixture stem; ``<name>.csv`` is looked up in both the input
            and the expected directories.
    """
    inp_path = TEST_DATA / f"{name}.csv"
    exp_path = EXPECTED / f"{name}.csv"

    if inp_path.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty (file-level test)")

    df_in = _read_csv_strict(inp_path)
    df_expected = _read_csv_strict(exp_path)

    result = clean_dataframe(df_in)

    # Compare positionally: discard whatever index either frame carries.
    actual = result.cleaned_df.reset_index(drop=True)
    expected = df_expected.reset_index(drop=True)

    # Frame-level diff: equal columns, equal row count, equal cell content
    assert list(actual.columns) == list(expected.columns), (
        f"{name}: header mismatch.\n"
        f"  actual:   {list(actual.columns)!r}\n"
        f"  expected: {list(expected.columns)!r}"
    )

    # zip() below stops at the shorter column, so a dropped or extra row
    # would otherwise go unnoticed. Check the lengths explicitly first.
    assert len(actual) == len(expected), (
        f"{name}: row count mismatch.\n"
        f"  actual:   {len(actual)}\n"
        f"  expected: {len(expected)}"
    )

    diffs = []
    for col in expected.columns:
        for i, (a, e) in enumerate(zip(actual[col].tolist(), expected[col].tolist())):
            if a != e:
                diffs.append((i, col, repr(a), repr(e)))
    assert not diffs, (
        f"{name}: {len(diffs)} cell mismatch(es). First 5:\n"
        + "\n".join(f"  row {i} col {c}: actual={a} expected={e}"
                    for i, c, a, e in diffs[:5])
    )


# ---------------------------------------------------------------------------
# Idempotency property (every case)
# ---------------------------------------------------------------------------

@pytest.mark.parametrize("name", DEFAULT_CASES + ["12_case_variations", "14_mojibake"])
def test_corpus_idempotent(name):
    """Cleaning an already-cleaned frame must leave it unchanged.

    Args:
        name: Fixture stem; ``<name>.csv`` is read from the input directory.
    """
    source = TEST_DATA / f"{name}.csv"
    if not source.stat().st_size:
        pytest.skip(f"{name}: input is empty")

    first_pass = clean_dataframe(_read_csv_strict(source)).cleaned_df
    first_pass = first_pass.reset_index(drop=True)

    # Feed the cleaner its own output and compare the two results.
    second_pass = clean_dataframe(first_pass).cleaned_df
    second_pass = second_pass.reset_index(drop=True)

    assert first_pass.equals(second_pass), f"{name}: not idempotent"


# ---------------------------------------------------------------------------
# Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx)
# ---------------------------------------------------------------------------

class TestCaseVariations:
    """Case 12: --case email=lower and --case name=title variants."""

    # Shared stem of the case-12 input and of every expected-output variant.
    _STEM = "12_case_variations"

    def _frames(self, variant):
        """Return ``(input, expected)`` frames for one expected-output variant.

        ``variant`` is the suffix after the double underscore in the
        expected file name.
        """
        source = _read_csv_strict(TEST_DATA / f"{self._STEM}.csv")
        wanted = _read_csv_strict(EXPECTED / f"{self._STEM}__{variant}.csv")
        return source, wanted

    def test_default_is_identity_for_case(self):
        """With default options the cleaner must not alter letter case."""
        source, wanted = self._frames("default")
        got = clean_dataframe(source).cleaned_df.reset_index(drop=True)
        assert got.equals(wanted), (
            "12 default: cells differ (case mutated under default config)"
        )

    def test_email_lower(self):
        """Requesting ``lower`` for the ``email`` column matches its variant."""
        source, wanted = self._frames("email_lower")
        options = CleanOptions(case_columns={"email": "lower"})
        got = clean_dataframe(source, options).cleaned_df.reset_index(drop=True)
        assert got.equals(wanted), "12 email_lower variant differs"

    def test_name_title(self):
        """Requesting ``title`` for the ``name`` column matches its variant."""
        source, wanted = self._frames("name_title")
        options = CleanOptions(case_columns={"name": "title"})
        got = clean_dataframe(source, options).cleaned_df.reset_index(drop=True)
        assert got.equals(wanted), "12 name_title variant differs"


class TestMojibake:
    """Case 14: mojibake input, with and without the repair option."""

    def test_default_no_repair(self):
        """Default options must reproduce the no-repair expected file."""
        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        wanted = _read_csv_strict(EXPECTED / "14_mojibake__default.csv")
        cleaned = clean_dataframe(source).cleaned_df
        got = cleaned.reset_index(drop=True)
        assert got.equals(wanted), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
        """Placeholder for the --fix-mojibake variant; always reported as xfail."""
        # The repair is a Tier 2 feature the cleaner does not provide yet, so
        # there is nothing to run. Raise xfail imperatively.
        pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")


class TestEmptyFile:
    """Case 18: behavior on a zero-byte input file."""

    def test_empty_no_crash(self, tmp_path):
        """Case 18: zero-byte file should not crash."""
        # NOTE(review): the tmp_path fixture is requested but never used.
        empty_csv = TEST_DATA / "18_empty_file.csv"
        size_in_bytes = empty_csv.stat().st_size
        assert size_in_bytes == 0

        # The corpus wants the cleaner to cope with an empty file, but that
        # is not wired into core yet. For now this only pins what pandas
        # does with such a file: it raises EmptyDataError.
        with pytest.raises(pd.errors.EmptyDataError):
            pd.read_csv(empty_csv)


class TestXlsxPollution:
    """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet."""

    @pytest.fixture(scope="class")
    def workbook(self):
        """Yield the case-21 workbook, opened once for the whole class.

        The file is opened in a ``with`` block so the handle is closed when
        the class's tests finish, instead of being left to the garbage
        collector.
        """
        path = TEST_DATA / "21_excel_pollution.xlsx"
        with pd.ExcelFile(path, engine="openpyxl") as xls:
            yield xls

    def test_sheets_present(self, workbook):
        """The workbook must contain at least the four named sheets."""
        names = set(workbook.sheet_names)
        assert {"Customers", "Notes", "International", "ForceText"}.issubset(names)

    def test_each_sheet_runs_without_error(self, workbook):
        """Cleaning any sheet must succeed and keep the row count unchanged."""
        for sheet in workbook.sheet_names:
            df = pd.read_excel(
                workbook, sheet_name=sheet, dtype=str, keep_default_na=False,
            )
            result = clean_dataframe(df)
            assert result.cleaned_df.shape[0] == df.shape[0], (
                f"sheet {sheet}: row count changed"
            )

    def test_force_text_leading_zeros_preserved(self, workbook):
        """Digit-only ids in the first ForceText column keep every digit.

        The previous check only asserted that a value made of apostrophes
        and digits had no surrounding spaces, which can never be false.
        Each cleaned id is now compared with the digits read from the
        sheet, so numeric coercion or a dropped leading zero fails the test.
        """
        df = pd.read_excel(
            workbook, sheet_name="ForceText", dtype=str, keep_default_na=False,
        )
        # Snapshot the input before cleaning, in case the cleaner works in place.
        before_col = df.iloc[:, 0].tolist()
        result = clean_dataframe(df)
        after_col = result.cleaned_df.iloc[:, 0].tolist()

        assert len(after_col) == len(before_col), "sheet ForceText: row count changed"

        for row, (before, after) in enumerate(zip(before_col, after_col)):
            if not isinstance(before, str):
                continue
            # Ignore surrounding whitespace and a leading apostrophe (the
            # original check also stripped it) when picking out id values.
            digits = before.strip().lstrip("'")
            if not (digits.isascii() and digits.isdigit()):
                continue
            cleaned = after.strip().lstrip("'") if isinstance(after, str) else after
            assert cleaned == digits, (
                f"sheet ForceText row {row}: id changed from "
                f"{before!r} to {after!r}"
            )