"""Run the analyzer + detector against the code-page test corpus. Fixtures live in ``test-cases/encodings-corpus/`` (synced from ``Business/DataTools/test-case-code-page-variations``). Each test runs against one fixture and uses the corpus manifest (``expected_detection.csv``) for ground truth. What's tested ------------- 1. ``analyze()`` does not crash on any fixture — every encoded file produces a Finding list (possibly empty), never an exception. 2. ``detect_encoding()`` returns one of the manifest's accepted answers, OR the manifest itself flagged the case as AMBIGUOUS / UNRELIABLE / REJECT / LOW_CONFIDENCE. 3. The decoded DataFrame matches the canonical reference content. Cases where the current implementation is known to fail (charset- normalizer label drift on byte-equivalent encodings, ``repair_bytes`` NUL-strip destroying UTF-16, the "lying BOM" pathological case) are marked ``xfail`` so they surface in the report as documented gaps. A future fix that makes the case pass will flip xfail to xpass and the test owner can drop the marker. """ from __future__ import annotations import csv import io from pathlib import Path import pandas as pd import pytest from src.core.analyze import analyze, _load_for_analysis from src.core.io import detect_encoding CORPUS = Path(__file__).parent.parent / "test-cases" / "encodings-corpus" MANIFEST = CORPUS / "expected_detection.csv" REFERENCE_DIR = CORPUS / "reference" # Known failures the analyzer does not yet handle correctly. Each entry # has a one-line reason — drop the entry once a fix lands. 
KNOWN_DETECTION_FAILURES = {
    "E03_western_basic_cp1252.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
    "E04_western_basic_latin1.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
    "E05_western_basic_latin9.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
    "E06_western_basic_macroman.csv": "returns mac_iceland (same family) instead of mac_roman",
    "E11_western_extended_cp1252.csv": "charset-normalizer returns cp1250 for cp1252 content",
    "E15_eastern_european_iso88592.csv": "charset-normalizer returns cp1258 for ISO-8859-2 content",
    "E18_cyrillic_koi8r.csv": "charset-normalizer returns shift_jis_2004 for KOI8-R content",
}

# Fixtures whose decoded content is known not to match the reference,
# keyed by fixture filename; the value is used as the xfail reason.
KNOWN_DECODE_FAILURES = {
    "E03_western_basic_cp1252.csv": "decoded as cp1250 — different mapping at 0xF1 (ñ vs ń)",
    "E04_western_basic_latin1.csv": "decoded as cp1250 — different mapping at 0xF1",
    "E05_western_basic_latin9.csv": "decoded as cp1250 — different mapping at 0xF1",
    "E10_western_extended_utf8.csv": "byte-level smart-quote fold rewrites U+201C/U+201D to ASCII before parse",
    "E11_western_extended_cp1252.csv": "wrong encoding + smart-quote fold",
    "E12_western_extended_utf16le.csv": "byte-level smart-quote fold rewrites U+201C/U+201D before parse",
    "E15_eastern_european_iso88592.csv": "wrong encoding (cp1258 != ISO-8859-2)",
    "E18_cyrillic_koi8r.csv": "wrong encoding (shift_jis_2004 != KOI8-R)",
    "E30_pathological_lying_bom.csv": "utf-8-sig fails on cp1252 body bytes; needs lying-BOM recovery",
}

# Filename suffix shared by every canonical reference file.
_REFERENCE_SUFFIX = ".utf8.txt"


def _normalize_encoding(name: str) -> str:
    """Return *name* lower-cased with hyphens and spaces turned into underscores.

    Used so that spelling variants such as ``UTF-8`` and ``utf_8`` compare
    equal. No codec alias resolution is performed.
    """
    return name.lower().replace("-", "_").replace(" ", "_")


def _load_manifest() -> list[dict]:
    """Read the corpus manifest into a list of row dicts.

    Returns:
        One dict per manifest row (as produced by ``csv.DictReader``), or an
        empty list when the manifest file does not exist.
    """
    if not MANIFEST.exists():
        return []
    # Explicit UTF-8 keeps decoding independent of the platform locale and
    # consistent with how the reference files are read; ``newline=""`` is
    # what the csv module requires so quoted embedded newlines survive.
    with MANIFEST.open(encoding="utf-8", newline="") as fh:
        return list(csv.DictReader(fh))


def _load_references() -> dict[str, str]:
    """Load every ``*.utf8.txt`` reference file as UTF-8 text.

    Returns:
        Mapping of reference id (the filename with the trailing
        ``.utf8.txt`` removed) to file content, or an empty dict when the
        reference directory does not exist.
    """
    if not REFERENCE_DIR.exists():
        return {}
    suffix_len = len(_REFERENCE_SUFFIX)
    return {
        # Slice off only the trailing suffix; a ``.utf8`` fragment elsewhere
        # in the name is part of the id and must be kept.
        p.name[:-suffix_len]: p.read_text(encoding="utf-8")
        for p in REFERENCE_DIR.glob("*" + _REFERENCE_SUFFIX)
    }


# Loaded once at import time because pytest needs them during collection
# to build the parametrized test lists.
MANIFEST_ENTRIES = _load_manifest()
REFERENCES = _load_references()
def _entry_id(entry: dict) -> str:
    """Return the fixture filename, used as the pytest parameter id."""
    return entry["filename"]


# ---------------------------------------------------------------------------
# 1. Analyzer never crashes
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("entry", MANIFEST_ENTRIES, ids=_entry_id)
def test_analyzer_does_not_crash(entry):
    """``analyze()`` must hand back a list for every fixture without raising."""
    fixture_path = CORPUS / entry["filename"]
    result = analyze(fixture_path, sample_rows=1000)
    # An empty list is fine; anything that is not a list is not.
    assert isinstance(result, list)


# ---------------------------------------------------------------------------
# 2. detect_encoding returns an acceptable answer
# ---------------------------------------------------------------------------


def _detection_marker(entry):
    """Return a non-strict ``xfail`` mark for known detection gaps, else ``()``."""
    reason = KNOWN_DETECTION_FAILURES.get(entry["filename"])
    if reason is None:
        return ()
    return pytest.mark.xfail(reason=reason, strict=False)


@pytest.mark.parametrize(
    "entry",
    [
        pytest.param(item, marks=_detection_marker(item), id=_entry_id(item))
        for item in MANIFEST_ENTRIES
    ],
)
def test_detect_encoding_accepted(entry):
    """The detected encoding must be one of the manifest's accepted names.

    The manifest column ``expected_detection`` is a ``|``-separated list of
    acceptable encodings. Rows carrying a fuzzy marker accept any answer;
    for those the detector is only required not to raise.
    """
    expectation = entry["expected_detection"]

    # Manifest fuzzy markers — any answer is acceptable.
    for fuzzy_marker in ("AMBIGUOUS", "UNRELIABLE", "REJECT", "LOW_CONFIDENCE"):
        if fuzzy_marker in expectation:
            # Call purely to prove it does not raise.
            detect_encoding(CORPUS / entry["filename"])
            return

    accepted = set()
    for candidate in expectation.split("|"):
        candidate = candidate.strip()
        if candidate:
            accepted.add(_normalize_encoding(candidate))

    detected = detect_encoding(CORPUS / entry["filename"])
    detected_n = _normalize_encoding(detected)
    assert detected_n in accepted, (
        f"{entry['filename']}: detected {detected!r} not in {sorted(accepted)}"
    )


# ---------------------------------------------------------------------------
# 3.
# Decoded content matches the canonical reference
# ---------------------------------------------------------------------------


def _decode_marker(entry):
    """Return a non-strict ``xfail`` mark for known decode gaps, else ``()``."""
    fname = entry["filename"]
    if fname in KNOWN_DECODE_FAILURES:
        return pytest.mark.xfail(
            reason=KNOWN_DECODE_FAILURES[fname],
            strict=False,
        )
    return ()


def _decodable_entries():
    """Skip pathological cases that have no canonical reference."""
    return [e for e in MANIFEST_ENTRIES if e["canonical_content_id"] in REFERENCES]


@pytest.mark.parametrize(
    "entry",
    [
        pytest.param(e, marks=_decode_marker(e), id=_entry_id(e))
        for e in _decodable_entries()
    ],
)
def test_decoded_matches_reference(entry):
    """Every reference data cell must equal the matching parsed cell.

    The reference text is parsed as CSV; its first row is treated as the
    header and skipped. Each remaining cell is compared, as a string,
    against the cell at the same row/column position of the loaded frame.
    The frame may contain more rows or columns than the reference, but
    never fewer.
    """
    df, _, _ = _load_for_analysis(CORPUS / entry["filename"], sample_rows=1000)
    ref_text = REFERENCES[entry["canonical_content_id"]]
    ref_rows = list(csv.reader(io.StringIO(ref_text)))
    if not ref_rows:
        pytest.skip("empty reference")

    # First row = headers in the reference; compare data rows to df rows.
    ref_data = ref_rows[1:]
    assert len(df) >= len(ref_data), (
        f"{entry['filename']}: parsed {len(df)} rows, reference has {len(ref_data)}"
    )

    n_cols = df.shape[1]
    for r, ref_row in enumerate(ref_data):
        # Guard the positional lookup below: without this a too-narrow frame
        # surfaces as a bare IndexError that names neither fixture nor row.
        assert len(ref_row) <= n_cols, (
            f"{entry['filename']}: row {r}: reference has {len(ref_row)} cells, "
            f"parsed frame has only {n_cols} columns"
        )
        for c, ref_cell in enumerate(ref_row):
            actual = str(df.iloc[r, c])
            assert actual == ref_cell, (
                f"{entry['filename']}: row {r} col {c}: "
                f"got {actual!r}, expected {ref_cell!r}"
            )