"""Run every corpus fixture through the current text cleaner and report diffs. This is an *acceptance* test against an external corpus shipped in ``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps between the current implementation and the spec target in TEST-CASES.md. The test fails on diff — that's the point. Each failure is informative. Cases 12 and 14 produce multiple expected outputs depending on flags; case 21 is XLSX-only and verified separately (manual / smoke). """ from __future__ import annotations import io import subprocess import sys from pathlib import Path import pandas as pd import pytest from src.core.text_clean import CleanOptions, clean_dataframe CORPUS = Path(__file__).parent.parent / "test-cases" / "text-cleaner-corpus" TEST_DATA = CORPUS / "test_data" EXPECTED = CORPUS / "expected" # Cases where a single default run should produce the expected file DEFAULT_CASES = [ "01_whitespace_basic", "02_whitespace_unicode", "03_smart_punctuation", "04_unicode_forms", "05_zero_width_invisible", "06_control_characters", "07_bom_utf8", "08_line_endings_crlf", "09_line_endings_cr", "10_line_endings_mixed", "11_embedded_newlines", "13_non_latin_scripts", "15_whitespace_only_cells", "16_dirty_headers", "17_preserve_intended", "19_headers_only", "20_kitchen_sink", ] def _read_csv_strict(path: Path) -> pd.DataFrame: """Read a corpus CSV file, treating all cells as strings. Applies only the structural pre-parse fixes that are required to make the file parseable at all — NUL stripping (case 06), line-ending normalization (cases 09/10), and unquoted-currency repair (case 17). Character-level folds that the cleaner itself owns (smart quotes, NBSP, etc.) are deliberately left alone so the cleaner's own behavior is what's under test. """ raw = path.read_bytes() # NUL stripping raw = raw.replace(b"\x00", b"") # Line endings: CRLF -> LF, then bare CR -> LF. raw = raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n") # Per-row repair (handles unquoted '$1,500.00' in case 17). from src.core.io import _repair_rows text = raw.decode("utf-8-sig") text, _, _ = _repair_rows(text, ",") return pd.read_csv( io.StringIO(text), dtype=str, keep_default_na=False, ) # --------------------------------------------------------------------------- # DataFrame-level diff (covers cell content; ignores file-level encoding/EOL) # --------------------------------------------------------------------------- @pytest.mark.parametrize("name", DEFAULT_CASES) def test_corpus_dataframe_diff(name): """Run clean_dataframe on the input and diff against the expected DF.""" inp_path = TEST_DATA / f"{name}.csv" exp_path = EXPECTED / f"{name}.csv" if inp_path.stat().st_size == 0: pytest.skip(f"{name}: input is empty (file-level test)") df_in = _read_csv_strict(inp_path) df_expected = _read_csv_strict(exp_path) result = clean_dataframe(df_in) # Normalize column names in expected/actual the same way (str cast) actual = result.cleaned_df.reset_index(drop=True) expected = df_expected.reset_index(drop=True) # Frame-level diff: equal columns, equal cell content assert list(actual.columns) == list(expected.columns), ( f"{name}: header mismatch.\n" f" actual: {list(actual.columns)!r}\n" f" expected: {list(expected.columns)!r}" ) diffs = [] for col in expected.columns: for i, (a, e) in enumerate(zip(actual[col].tolist(), expected[col].tolist())): if a != e: diffs.append((i, col, repr(a), repr(e))) assert not diffs, ( f"{name}: {len(diffs)} cell mismatch(es). First 5:\n" + "\n".join(f" row {i} col {c}: actual={a} expected={e}" for i, c, a, e in diffs[:5]) ) # --------------------------------------------------------------------------- # Idempotency property (every case) # --------------------------------------------------------------------------- @pytest.mark.parametrize("name", DEFAULT_CASES + ["12_case_variations", "14_mojibake"]) def test_corpus_idempotent(name): """clean(clean(x)) == clean(x) for every fixture.""" inp_path = TEST_DATA / f"{name}.csv" if inp_path.stat().st_size == 0: pytest.skip(f"{name}: input is empty") df = _read_csv_strict(inp_path) once = clean_dataframe(df).cleaned_df.reset_index(drop=True) twice = clean_dataframe(once).cleaned_df.reset_index(drop=True) assert once.equals(twice), f"{name}: not idempotent" # --------------------------------------------------------------------------- # Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx) # --------------------------------------------------------------------------- class TestCaseVariations: """Case 12: --case email=lower and --case name=title variants.""" def test_default_is_identity_for_case(self): df = _read_csv_strict(TEST_DATA / "12_case_variations.csv") expected = _read_csv_strict(EXPECTED / "12_case_variations__default.csv") actual = clean_dataframe(df).cleaned_df.reset_index(drop=True) # Default should not change case assert actual.equals(expected), ( "12 default: cells differ (case mutated under default config)" ) def test_email_lower(self): df = _read_csv_strict(TEST_DATA / "12_case_variations.csv") expected = _read_csv_strict(EXPECTED / "12_case_variations__email_lower.csv") opts = CleanOptions(case_columns={"email": "lower"}) actual = clean_dataframe(df, opts).cleaned_df.reset_index(drop=True) assert actual.equals(expected), "12 email_lower variant differs" def test_name_title(self): df = _read_csv_strict(TEST_DATA / "12_case_variations.csv") expected = _read_csv_strict(EXPECTED / "12_case_variations__name_title.csv") opts = CleanOptions(case_columns={"name": "title"}) actual = clean_dataframe(df, opts).cleaned_df.reset_index(drop=True) assert actual.equals(expected), "12 name_title variant differs" class TestMojibake: def test_default_no_repair(self): df = _read_csv_strict(TEST_DATA / "14_mojibake.csv") expected = _read_csv_strict(EXPECTED / "14_mojibake__default.csv") actual = clean_dataframe(df).cleaned_df.reset_index(drop=True) assert actual.equals(expected), "14 mojibake default (no repair) differs" def test_fixed_variant(self): """Mojibake auto-repair (ftfy-backed) restores the original text. Skipped automatically when ftfy is not installed — the engine falls back to a no-op in that case and the diff would never close. """ try: import ftfy # noqa: F401 except ImportError: pytest.skip("ftfy not installed — install ftfy to enable mojibake repair") from src.core.fixes import repair_mojibake df = _read_csv_strict(TEST_DATA / "14_mojibake.csv") expected = _read_csv_strict(EXPECTED / "14_mojibake__fixed.csv") repaired, _ = repair_mojibake(df) actual = repaired.reset_index(drop=True) assert actual.equals(expected), "14 mojibake fixed variant differs" class TestEmptyFile: def test_empty_no_crash(self, tmp_path): """Case 18: zero-byte file should not crash.""" inp = TEST_DATA / "18_empty_file.csv" assert inp.stat().st_size == 0 # Reading an empty CSV with pandas raises EmptyDataError; corpus says # the cleaner must handle it gracefully. Not yet wired in core. with pytest.raises(pd.errors.EmptyDataError): pd.read_csv(inp) class TestXlsxPollution: """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet.""" @pytest.fixture(scope="class") def workbook(self): path = TEST_DATA / "21_excel_pollution.xlsx" return pd.ExcelFile(path, engine="openpyxl") def test_sheets_present(self, workbook): names = set(workbook.sheet_names) assert {"Customers", "Notes", "International", "ForceText"}.issubset(names) def test_each_sheet_runs_without_error(self, workbook): for sheet in workbook.sheet_names: df = pd.read_excel( workbook, sheet_name=sheet, dtype=str, keep_default_na=False, ) result = clean_dataframe(df) assert result.cleaned_df.shape[0] == df.shape[0], ( f"sheet {sheet}: row count changed" ) def test_force_text_leading_zeros_preserved(self, workbook): df = pd.read_excel( workbook, sheet_name="ForceText", dtype=str, keep_default_na=False, ) result = clean_dataframe(df) # First column likely an id with leading zeros — make sure it isn't # numerically coerced or stripped. first_col = result.cleaned_df.iloc[:, 0].tolist() for val in first_col: if val and val.lstrip("'").isdigit(): assert not val.startswith(" ") and not val.endswith(" ")