Files
datatools-dev/tests/test_corpus.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

235 lines
9.0 KiB
Python

"""Run every corpus fixture through the current text cleaner and report diffs.
This is an *acceptance* test against an external corpus shipped in
``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps
between the current implementation and the spec target in TEST-CASES.md.
The test fails on diff — that's the point. Each failure is informative.
Cases 12 and 14 produce multiple expected outputs depending on flags and
have dedicated test classes below; case 18 (zero-byte input) and case 21
(XLSX-only) are covered by separate smoke tests in this file.
"""
from __future__ import annotations
import io
import subprocess
import sys
from pathlib import Path
import pandas as pd
import pytest
from src.core.text_clean import CleanOptions, clean_dataframe
# Root of the external text-cleaner corpus, located two directory levels
# above this test file under ``test-cases/text-cleaner-corpus``.
CORPUS = Path(__file__).parent.parent / "test-cases" / "text-cleaner-corpus"
# Input fixtures that are fed to the cleaner.
TEST_DATA = CORPUS / "test_data"
# Expected outputs that cleaned results are compared against.
EXPECTED = CORPUS / "expected"
# Cases where a single default run should produce the expected file.
# Each entry is a file stem: ``<stem>.csv`` is looked up in both TEST_DATA
# and EXPECTED. Cases 12, 14, 18 and 21 are intentionally absent; they are
# exercised by the dedicated test classes further down in this module.
DEFAULT_CASES = [
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "19_headers_only",
    "20_kitchen_sink",
]
def _read_csv_strict(path: Path) -> pd.DataFrame:
    """Load a corpus CSV as a DataFrame in which every cell is a string.

    Only the structural repairs needed for the file to parse at all are
    performed here:

    * NUL bytes are removed (case 06),
    * CRLF and bare CR line endings become LF (cases 09/10),
    * rows are passed through the project's per-row repair, which handles
      the unquoted ``$1,500.00`` of case 17.

    Character-level folds that belong to the cleaner (smart quotes, NBSP,
    etc.) are deliberately not touched, so the cleaner's own behavior is
    what the tests observe.

    Args:
        path: Location of the CSV fixture.

    Returns:
        The parsed frame, read with ``dtype=str`` and
        ``keep_default_na=False`` so empty cells stay empty strings.
    """
    payload = path.read_bytes()
    # Order matters: CRLF must collapse before bare CR is rewritten,
    # otherwise every CRLF would turn into two line feeds.
    byte_fixes = (
        (b"\x00", b""),
        (b"\r\n", b"\n"),
        (b"\r", b"\n"),
    )
    for needle, substitute in byte_fixes:
        payload = payload.replace(needle, substitute)

    # Imported lazily, at the same point the original did.
    from src.core.io import _repair_rows

    # "utf-8-sig" drops a leading UTF-8 BOM if one is present.
    decoded = payload.decode("utf-8-sig")
    # _repair_rows returns a 3-tuple; only the repaired text is used here.
    repaired, _, _ = _repair_rows(decoded, ",")
    buffer = io.StringIO(repaired)
    return pd.read_csv(buffer, dtype=str, keep_default_na=False)
# ---------------------------------------------------------------------------
# DataFrame-level diff (covers cell content; ignores file-level encoding/EOL)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("name", DEFAULT_CASES)
def test_corpus_dataframe_diff(name):
    """Run clean_dataframe on the input and diff against the expected DF.

    The comparison is made in three stages so the failure message points at
    the coarsest thing that is wrong: header list, then row count, then
    individual cells (first five mismatches are reported).

    Args:
        name: Fixture stem; ``<name>.csv`` is read from both TEST_DATA and
            EXPECTED.
    """
    inp_path = TEST_DATA / f"{name}.csv"
    exp_path = EXPECTED / f"{name}.csv"
    if inp_path.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty (file-level test)")
    df_in = _read_csv_strict(inp_path)
    df_expected = _read_csv_strict(exp_path)
    result = clean_dataframe(df_in)
    # Give both frames a fresh 0..n-1 index so positions line up.
    actual = result.cleaned_df.reset_index(drop=True)
    expected = df_expected.reset_index(drop=True)
    # Frame-level diff: equal columns, equal row count, equal cell content
    assert list(actual.columns) == list(expected.columns), (
        f"{name}: header mismatch.\n"
        f" actual: {list(actual.columns)!r}\n"
        f" expected: {list(expected.columns)!r}"
    )
    # zip() in the cell loop stops at the shorter column, so without this
    # guard a result with dropped or extra rows would pass unnoticed.
    assert len(actual) == len(expected), (
        f"{name}: row count mismatch.\n"
        f" actual: {len(actual)}\n"
        f" expected: {len(expected)}"
    )
    diffs = [
        (i, col, repr(a), repr(e))
        for col in expected.columns
        for i, (a, e) in enumerate(
            zip(actual[col].tolist(), expected[col].tolist())
        )
        if a != e
    ]
    assert not diffs, (
        f"{name}: {len(diffs)} cell mismatch(es). First 5:\n"
        + "\n".join(f" row {i} col {c}: actual={a} expected={e}"
                    for i, c, a, e in diffs[:5])
    )
# ---------------------------------------------------------------------------
# Idempotency property (every case)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("name", DEFAULT_CASES + ["12_case_variations", "14_mojibake"])
def test_corpus_idempotent(name):
    """Cleaning already-cleaned data must not change it any further.

    Args:
        name: Fixture stem; ``<name>.csv`` is read from TEST_DATA.
    """
    fixture = TEST_DATA / f"{name}.csv"
    if not fixture.stat().st_size:
        pytest.skip(f"{name}: input is empty")

    def _one_pass(frame):
        # One default-option cleaning pass, re-indexed from zero.
        return clean_dataframe(frame).cleaned_df.reset_index(drop=True)

    first_pass = _one_pass(_read_csv_strict(fixture))
    second_pass = _one_pass(first_pass)
    assert first_pass.equals(second_pass), f"{name}: not idempotent"
# ---------------------------------------------------------------------------
# Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx)
# ---------------------------------------------------------------------------
class TestCaseVariations:
    """Case 12: --case email=lower and --case name=title variants."""

    @staticmethod
    def _matches(expected_file, *clean_args):
        """Clean the case-12 input and compare it to one expected variant.

        Args:
            expected_file: File name inside EXPECTED to compare against.
            *clean_args: Extra positional arguments forwarded to
                ``clean_dataframe`` (nothing for the default run, or a
                single ``CleanOptions``).

        Returns:
            True when the cleaned frame equals the expected frame.
        """
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        golden = _read_csv_strict(EXPECTED / expected_file)
        cleaned = clean_dataframe(source, *clean_args).cleaned_df
        return cleaned.reset_index(drop=True).equals(golden)

    def test_default_is_identity_for_case(self):
        """With default options the letter case of cells is left alone."""
        unchanged = self._matches("12_case_variations__default.csv")
        assert unchanged, (
            "12 default: cells differ (case mutated under default config)"
        )

    def test_email_lower(self):
        """Lower-casing is applied to the ``email`` column on request."""
        lower_email = CleanOptions(case_columns={"email": "lower"})
        same = self._matches("12_case_variations__email_lower.csv", lower_email)
        assert same, "12 email_lower variant differs"

    def test_name_title(self):
        """Title-casing is applied to the ``name`` column on request."""
        title_name = CleanOptions(case_columns={"name": "title"})
        same = self._matches("12_case_variations__name_title.csv", title_name)
        assert same, "12 name_title variant differs"
class TestMojibake:
    """Case 14: mojibake input, with and without explicit repair."""

    def test_default_no_repair(self):
        """A default cleaning run must match the ``__default`` expectation."""
        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        golden = _read_csv_strict(EXPECTED / "14_mojibake__default.csv")
        cleaned = clean_dataframe(source).cleaned_df
        cleaned = cleaned.reset_index(drop=True)
        assert cleaned.equals(golden), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
        """Explicit mojibake repair must reproduce the ``__fixed`` expectation.

        The test is skipped when ftfy cannot be imported. According to the
        original note on this test, the engine falls back to a no-op
        without ftfy, so the comparison could never succeed in that case.
        """
        try:
            import ftfy  # noqa: F401
        except ImportError:
            pytest.skip("ftfy not installed — install ftfy to enable mojibake repair")

        # Project import kept local to the test, after the ftfy gate.
        from src.core.fixes import repair_mojibake

        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        golden = _read_csv_strict(EXPECTED / "14_mojibake__fixed.csv")
        # repair_mojibake returns a pair; only the repaired frame is checked.
        fixed_frame, _ = repair_mojibake(source)
        fixed_frame = fixed_frame.reset_index(drop=True)
        assert fixed_frame.equals(golden), "14 mojibake fixed variant differs"
class TestEmptyFile:
    """Case 18: zero-byte input file."""

    def test_empty_no_crash(self, tmp_path):
        """Case 18: zero-byte file should not crash.

        What is actually pinned here is narrower than the title: the
        fixture really is zero bytes, and a plain ``pd.read_csv`` on it
        raises ``EmptyDataError``. The corpus expects the cleaner to handle
        this gracefully, but that is not yet wired into core, so the
        cleaner itself is not called.
        """
        empty_csv = TEST_DATA / "18_empty_file.csv"
        byte_count = empty_csv.stat().st_size
        assert byte_count == 0
        with pytest.raises(pd.errors.EmptyDataError):
            pd.read_csv(empty_csv)
class TestXlsxPollution:
    """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet."""

    @pytest.fixture(scope="class")
    def workbook(self):
        """Open the case-21 workbook once for the whole class.

        Yields:
            An open ``pd.ExcelFile``. It is closed when the class-scoped
            fixture is torn down, so the file handle is not leaked.
        """
        path = TEST_DATA / "21_excel_pollution.xlsx"
        with pd.ExcelFile(path, engine="openpyxl") as xlsx:
            yield xlsx

    def test_sheets_present(self, workbook):
        """The four sheets the other tests rely on must exist."""
        names = set(workbook.sheet_names)
        assert {"Customers", "Notes", "International", "ForceText"}.issubset(names)

    def test_each_sheet_runs_without_error(self, workbook):
        """Every sheet can be cleaned and keeps its row count."""
        for sheet in workbook.sheet_names:
            df = pd.read_excel(
                workbook, sheet_name=sheet, dtype=str, keep_default_na=False,
            )
            result = clean_dataframe(df)
            assert result.cleaned_df.shape[0] == df.shape[0], (
                f"sheet {sheet}: row count changed"
            )

    def test_force_text_leading_zeros_preserved(self, workbook):
        """Digit-only ids in the first ForceText column survive cleaning.

        Besides the original "no stray surrounding spaces" check, each
        all-digit input value is compared with its cleaned counterpart so
        that a dropped leading zero (or any other digit change) fails the
        test. Surrounding whitespace and a leading apostrophe are ignored
        on both sides of the comparison.
        """
        df = pd.read_excel(
            workbook, sheet_name="ForceText", dtype=str, keep_default_na=False,
        )
        # Snapshot the input before cleaning in case the frame is mutated.
        before = df.iloc[:, 0].tolist()
        result = clean_dataframe(df)
        # First column likely an id with leading zeros — make sure it isn't
        # numerically coerced or stripped.
        after = result.cleaned_df.iloc[:, 0].tolist()
        assert len(after) == len(before), "ForceText: row count changed"
        for row, (raw, val) in enumerate(zip(before, after)):
            if val and val.lstrip("'").isdigit():
                assert not val.startswith(" ") and not val.endswith(" ")
            digits = raw.strip().lstrip("'")
            # Restrict to ASCII digits so Unicode digit folding by the
            # cleaner cannot cause a false alarm.
            if digits.isascii() and digits.isdigit():
                assert val.strip().lstrip("'") == digits, (
                    f"ForceText row {row}: digits changed "
                    f"({raw!r} -> {val!r})"
                )