Files
datatools-dev/tests/test_format_standardize_corpus.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

574 lines
19 KiB
Python

"""Corpus-driven tests for ``src.core.format_standardize``.
Drives every row of the FORMATS test corpus
(``test-cases/format-cleaner-corpus/*.csv``) through the per-cell
standardizers and asserts the canonical output the corpus expects.
The corpus itself (``FORMATS-CASES.md`` in the same directory)
documents per-domain policy decisions; the per-case ``id`` strings
below (FD01, FP14, FA09, …) match its row keys exactly.
Values in the per-domain expected dicts take one of two forms:
- A literal string is the corpus's expected canonical output.
- ``PASSTHROUGH`` means "corpus accepts no transformation" — usually
empty, whitespace-only, or already-clean input.
Corpus rows that could not be closed without heavier machinery (Excel
serial parsing, Unix timestamps, non-English month dictionaries, IDN /
non-ASCII email validation) were previously marked ``xfail``. The
per-domain xfail maps in this module are now empty; they remain as the
hook for future gaps, and each entry must carry a one-line reason.
"""
from __future__ import annotations
import csv
from pathlib import Path
import pandas as pd
import pytest
from src.core.format_standardize import (
FieldType,
StandardizeOptions,
standardize_address,
standardize_currency,
standardize_dataframe,
standardize_date,
standardize_email,
standardize_name,
standardize_phone,
)
# Directory holding the corpus CSV fixtures: ``test-cases/format-cleaner-corpus``
# under the parent of the directory that contains this file.
CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus"
# Sentinel, compared by identity: the standardizer must return its input unchanged.
PASSTHROUGH = object()
def _load(filename: str) -> list[dict[str, str]]:
    """Read corpus fixture *filename* and return its rows as dicts.

    Each row is keyed by the CSV header, as produced by ``csv.DictReader``.

    The file is opened explicitly as UTF-8 rather than with the platform
    default encoding: the expected tables in this module contain non-ASCII
    text (Japanese, Korean, Cyrillic, umlauts), and the integration tests
    read the same directory through ``pd.read_csv``, whose default is also
    UTF-8. NOTE(review): assumes the corpus CSVs are UTF-8 without a BOM.

    ``newline=""`` leaves line-ending handling to the ``csv`` module.
    """
    with (CORPUS / filename).open(newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))
def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]):
    """Turn every row of corpus file *fixture* into a ``pytest.param``.

    Each param carries ``(row["input"], wanted)`` and uses the row's
    ``case_id`` as its test id. ``wanted`` comes from *expected*; case ids
    missing from it fall back to ``PASSTHROUGH``.

    Case ids present in *xfails* get a non-strict ``xfail`` mark with the
    mapped reason, so a row that starts passing shows up as xpass instead of
    breaking the suite.
    """

    def _one(row):
        case_id = row["case_id"]
        marks = (
            [pytest.mark.xfail(reason=xfails[case_id], strict=False)]
            if case_id in xfails
            else []
        )
        return pytest.param(
            row["input"],
            expected.get(case_id, PASSTHROUGH),
            id=case_id,
            marks=marks,
        )

    return [_one(row) for row in _load(fixture)]
def _assert(got: str, want: object, original: str) -> None:
    """Check one standardizer result against its corpus expectation.

    When *want* is the ``PASSTHROUGH`` sentinel, *got* must equal *original*
    (the untouched input); otherwise *got* must equal *want*.
    """
    if want is not PASSTHROUGH:
        assert got == want
        return
    assert got == original, f"expected pass-through, got {got!r}"
# ---------------------------------------------------------------------------
# Dates — 24_format_dates.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 24_format_dates.csv under the flags used by
# test_corpus_dates_mdy. Case ids not listed here default to PASSTHROUGH
# (see ``_params``).
_DATE_EXPECTED_MDY: dict[str, object] = {
    # iso baseline + datetime variants → ISO date
    "FD01": "2024-01-15",
    "FD02": "2024-01-15",
    "FD03": "2024-01-15",
    "FD04": "2024-01-15",
    "FD05": "2024-01-15",
    "FD06": "2024-01-15",
    # US M/D/Y variants
    "FD07": "2024-01-15",
    "FD08": "2024-01-15",
    "FD09": "2024-01-05",
    "FD10": "2024-05-30",
    # longform month names
    "FD16": "2024-01-15",
    "FD17": "2024-01-15",
    "FD18": "2024-01-15",
    "FD19": "2024-01-15",
    "FD20": "2024-01-15",  # weekday-prefixed
    "FD21": "2024-01-15",
    # FD11-FD15 — DMY-shaped EU dates in MDY default mode; the DMY
    # rerun (test_corpus_dates_dmy) covers the actual parse path. Under
    # MDY they pass through unchanged. Listed explicitly, although
    # PASSTHROUGH is already the default, so a future locale auto-detect
    # can replace these expectations with the correct ISO output.
    "FD11": PASSTHROUGH,
    "FD12": PASSTHROUGH,
    "FD13": PASSTHROUGH,
    "FD14": PASSTHROUGH,
    "FD15": PASSTHROUGH,
    # excel serial dates (numeric days since 1899-12-30)
    "FD22": "2024-01-15",
    "FD23": "2024-01-15",
    # unix timestamps (seconds, milliseconds)
    "FD24": "2024-01-15",
    "FD25": "2024-01-15",
    # partial precision — corpus preserves it
    "FD26": "2024-01",
    "FD27": "2024-01",  # text precision month
    "FD28": "2024-Q1",  # quarter
    "FD29": "2024",
    # 2-digit year cutoff (per docs: 1969 wins over 2069)
    "FD30": "1969-01-15",
    # leap day valid
    "FD31": "2024-02-29",
    # invalid dates → corpus expects error sentinel
    "FD32": "<error: invalid leap day>",
    "FD33": "<error: Excel 1900 leap year bug>",
    "FD34": "<error: invalid month>",
    "FD35": "<error: invalid day>",
    # buried-date extraction
    "FD36": "2024-01-15",
    "FD37": "2024-01-15",
    # garbage → pass through (corpus 0.3 boundary table)
    # FD38/39/40 → PASSTHROUGH default
    # locale-specific month names (en/fr/de via month_locales)
    "FD41": "2024-01-15",
    "FD42": "2024-01-15",
    # timezone — corpus 3.3 says fixed-offset only
    "FD43": "2024-01-15",
    "FD44": "2024-03-10",
    # already-clean idempotency
    "FD45": "2024-01-15",
}
# Case id -> reason for date rows expected to fail; currently none.
_DATE_XFAILS_MDY: dict[str, str] = {}
@pytest.mark.parametrize(
    "inp,want",
    _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY),
)
def test_corpus_dates_mdy(inp, want):
    """Run each dates-corpus row through ``standardize_date``.

    Uses the default date order, the "sentinel" error policy and the
    en/fr/de month locales.
    """
    locales = ["en", "fr", "de"]
    normalized, _ignored = standardize_date(
        inp,
        error_policy="sentinel",
        month_locales=locales,
    )
    _assert(normalized, want, inp)
# DMY locale rerun for the EU rows that need it.
_DATE_EXPECTED_DMY: dict[str, str] = {
    "FD11": "2024-01-15",
    "FD12": "2024-01-15",
    "FD13": "2024-01-15",
    "FD14": "2024-05-30",
    "FD15": "2024-01-15",
}


def _dmy_date_params():
    """Build one ``pytest.param`` per entry of ``_DATE_EXPECTED_DMY``.

    The dates corpus is read once and rows are selected by ``case_id``
    rather than by their position in the file, so reordering or inserting
    corpus rows cannot silently pair an input with the wrong expectation.
    A case id missing from the corpus raises ``KeyError`` at collection time.
    """
    inputs: dict[str, str] = {}
    for row in _load("24_format_dates.csv"):
        # Keep the first row seen for a given id.
        inputs.setdefault(row["case_id"], row["input"])
    return [
        pytest.param(inputs[cid], want, id=f"{cid}-dmy")
        for cid, want in _DATE_EXPECTED_DMY.items()
    ]


@pytest.mark.parametrize("inp,want", _dmy_date_params())
def test_corpus_dates_dmy(inp, want):
    """FD11-FD15 must parse to ISO dates when ``date_order="DMY"`` is set."""
    got, _ = standardize_date(inp, date_order="DMY")
    assert got == want
# ---------------------------------------------------------------------------
# Phones — 25_format_phones.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 25_format_phones.csv under
# error_policy="sentinel". Case ids not listed here (FP15, FP29) default to
# PASSTHROUGH (see ``_params``).
_PHONE_EXPECTED: dict[str, object] = {
    "FP01": "+15551234567",
    "FP02": "+15551234567",
    "FP03": "+15551234567",
    "FP04": "+15551234567",
    "FP05": "+15551234567",
    "FP06": "+15551234567",
    "FP07": "+15551234567",
    "FP08": "+15551234567",
    # extension variants
    "FP09": "+15551234567;ext=123",
    "FP10": "+15551234567;ext=123",
    "FP11": "+15551234567;ext=123",
    # vanity numbers
    "FP12": "+18003569377",
    "FP13": "+15552255669",
    # international (intl row FP15 needs --default-country=GB; covered separately)
    "FP14": "+442079460958",
    "FP16": "+493012345678",
    "FP17": "+33123456789",
    "FP18": "+81312345678",
    "FP19": "+61212345678",
    "FP20": "+15551234567",
    # placeholders/junk → corpus says error
    "FP21": "<error: insufficient digits>",
    "FP22": "<error: too many digits>",
    "FP23": "<error: placeholder number>",
    "FP24": "<error: placeholder number>",
    "FP25": "<error: multiple numbers in cell>",
    # NBSP / smart-quote contamination — defensive cleanup acceptable
    "FP26": "+15551234567",
    "FP27": "+15551234567",
    "FP28": "+15551234567",
    # FP29 empty → pass-through
    "FP30": "<error: not a phone number>",
    "FP31": "<error: smart-quote contamination>",
}
@pytest.mark.parametrize(
    "inp,want",
    _params("25_format_phones.csv", _PHONE_EXPECTED, {}),
)
def test_corpus_phones(inp, want):
    """Run each phones-corpus row through ``standardize_phone`` (sentinel errors)."""
    normalized, _ignored = standardize_phone(inp, error_policy="sentinel")
    _assert(normalized, want, inp)
def test_corpus_phones_uk_domestic_with_gb_region():
    """FP15: a UK trunk-prefixed number resolves once ``default_region="GB"``.

    Complements the main phones test, where FP15 is left at the
    pass-through default.
    """
    domestic = "020 7946 0958"
    normalized, _ignored = standardize_phone(domestic, default_region="GB")
    assert normalized == "+442079460958"
# ---------------------------------------------------------------------------
# Emails — 26_format_emails.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 26_format_emails.csv under
# error_policy="sentinel" with default options. Case ids not listed here
# default to PASSTHROUGH (see ``_params``).
_EMAIL_EXPECTED: dict[str, object] = {
    "FE01": "alice@example.com",
    "FE02": "alice@example.com",
    "FE03": "alice@example.com",
    "FE04": "alice@example.com",
    "FE05": "alice@example.com",
    "FE06": "alice@example.com",
    "FE07": "alice@example.com",
    "FE08": "alice@example.com",
    "FE09": "alice@example.com",
    "FE10": "a.l.i.c.e@gmail.com",  # default: don't touch dots
    "FE11": "alice+newsletter@gmail.com",  # default: don't touch +tag
    "FE12": "a.l.i.c.e+work@gmail.com",
    "FE13": "a.l.i.c.e@example.com",  # never touch non-Gmail
    "FE14": "alice+newsletter@example.com",
    # non-ASCII domain / local part
    "FE15": "alice@münchen.de",
    "FE16": "アリス@example.jp",
    "FE17": "alice@example.com",
    "FE18": "alice@example.com",
    "FE19": "alice@example.com",
    "FE20": "alice@example.com",
    "FE21": "alice@example.com",
    # malformed input → error sentinel
    "FE22": "<error: missing @>",
    "FE23": "<error: double @>",
    "FE24": "<error: multiple @>",
    "FE25": "<error: internal whitespace>",
    "FE26": "<error: no TLD>",
    "FE27": "<error: multiple emails>",
    "FE28": "<error: multiple emails>",
    # FE29 / FE30 empty / whitespace → PASSTHROUGH
    "FE31": "alice@example.com",
}
# Case id -> reason for email rows expected to fail; currently none.
_EMAIL_XFAILS: dict[str, str] = {}
@pytest.mark.parametrize(
    "inp,want",
    _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS),
)
def test_corpus_emails(inp, want):
    """Run each emails-corpus row through ``standardize_email`` (sentinel errors)."""
    normalized, _ignored = standardize_email(inp, error_policy="sentinel")
    _assert(normalized, want, inp)
# Expected output for the rows re-run with gmail_canonical=True. FE13 / FE14
# are non-Gmail addresses and must come back exactly as in the default run.
_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
    "FE10": "alice@gmail.com",
    "FE11": "alice@gmail.com",
    "FE12": "alice@gmail.com",
    "FE13": "a.l.i.c.e@example.com",  # negative test: don't touch non-Gmail
    "FE14": "alice+newsletter@example.com",  # negative test
}


def _gmail_canonical_params():
    """Build one ``pytest.param`` per entry of ``_EMAIL_GMAIL_CANONICAL``.

    The emails corpus is read once, not once per case id, and indexed by
    ``case_id``. The first row seen for an id wins. A case id missing from
    the corpus raises ``KeyError`` naming the id at collection time.
    """
    inputs: dict[str, str] = {}
    for row in _load("26_format_emails.csv"):
        inputs.setdefault(row["case_id"], row["input"])
    return [
        pytest.param(inputs[cid], want, id=f"{cid}-gmail-canonical")
        for cid, want in _EMAIL_GMAIL_CANONICAL.items()
    ]


@pytest.mark.parametrize("inp,want", _gmail_canonical_params())
def test_corpus_emails_gmail_canonical(inp, want):
    """Check the opt-in ``gmail_canonical=True`` output for FE10-FE14."""
    got, _ = standardize_email(inp, gmail_canonical=True)
    assert got == want
# ---------------------------------------------------------------------------
# Addresses — 27_format_addresses.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 27_format_addresses.csv with expand=False.
# Case ids not listed here (e.g. FA29, FA30) default to PASSTHROUGH
# (see ``_params``). Annotated ``dict[str, object]`` to match the
# ``expected`` parameter of ``_params`` and the sibling tables.
_ADDRESS_EXPECTED: dict[str, object] = {
    "FA01": "123 Main St, New York, NY 10001",
    "FA02": "123 Main St, New York, NY 10001",
    "FA03": "123 Main St, New York, NY 10001",
    "FA04": "123 Main St, New York, NY 10001",
    "FA05": "123 Main St, New York, NY 10001",
    "FA06": "456 Park Ave, New York, NY 10001",
    "FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
    "FA08": "123 Main St, New York, NY 10001",
    # directional prefixes
    "FA09": "123 N Main St, City, ST 12345",
    "FA10": "123 N Main St, City, ST 12345",
    "FA11": "123 NE Main St, City, ST 12345",
    # unit designators
    "FA12": "123 Main St, Apt 4B, City, ST 12345",
    "FA13": "123 Main St, # 4B, City, ST 12345",
    "FA14": "123 Main St, Ste 200, City, ST 12345",
    "FA15": "123 Main St, New York, NY 10001",
    "FA16": "123 Main St, New York, NY 10001",
    "FA17": "123 Main St, New York, NY 10001-1234",
    "FA18": "123 Main St, Boston, MA 02101",
    "FA19": "123 Main St, Apt 4B, New York, NY 10001",
    # PO boxes
    "FA20": "PO Box 123, City, ST 12345",
    "FA21": "PO Box 123, City, ST 12345",
    "FA22": "PO Box 123, City, ST 12345",
    # unusual house numbers
    "FA23": "123A Main St, City, ST 12345",
    "FA24": "123-1 Main St, City, ST 12345",
    "FA25": "123 1/2 Main St, City, ST 12345",
    # non-US addresses
    "FA26": "10 Downing Street, London, SW1A 2AA",
    "FA27": "1 Yonge St, Toronto, ON M5E 1W7",
    "FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
    "FA31": "123 Main St, New York, NY 10001",
}
@pytest.mark.parametrize(
    "inp,want",
    _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}),
)
def test_corpus_addresses(inp, want):
    """Run each addresses-corpus row through ``standardize_address(expand=False)``."""
    normalized, _ignored = standardize_address(inp, expand=False)
    _assert(normalized, want, inp)
# ---------------------------------------------------------------------------
# Names — 28_format_names.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 28_format_names.csv. Case ids not listed
# here default to PASSTHROUGH (see ``_params``).
_NAME_EXPECTED: dict[str, object] = {
    "FN01": "Alice Smith",
    "FN02": "Alice Smith",
    "FN03": "Alice Smith",
    "FN04": "aLiCe SmItH",  # corpus 7.3 conservative: preserve mixed
    # Mc / Mac / O' prefixes
    "FN05": "McDonald",
    "FN06": "McDonald",
    "FN07": "MacDonald",
    "FN08": "McTaggart",
    "FN09": "O'Connor",
    "FN10": "O'Connor",
    "FN11": "O'Brien",
    # hyphenated names
    "FN12": "Mary-Jane Smith",
    "FN13": "Smith-Jones",
    # lowercase particles
    "FN14": "von Trapp",
    "FN15": "Vincent van Gogh",
    "FN16": "Charles de Gaulle",
    "FN17": "Leonardo da Vinci",
    # titles and suffixes
    "FN18": "Mr John Smith",  # corpus 7.3: drop title period
    "FN19": "Dr Jane Doe",
    "FN20": "Prof Alice Williams",
    "FN21": "John Smith Jr",
    "FN22": "John Smith III",
    "FN23": "Jane Doe PhD",
    "FN24": "John Smith",  # comma-format reversed
    "FN25": "John Smith",
    "FN26": "John Andrew Smith",
    "FN27": "John A Smith",  # corpus 7.3: drop initial period
    "FN28": "J.K. Rowling",
    # non-Latin scripts
    "FN29": "김철수",
    "FN30": "田中太郎",
    "FN31": "Иван Иванов",
    "FN32": "Madonna",
    # FN33 / FN34 → PASSTHROUGH default
}
@pytest.mark.parametrize(
    "inp,want",
    _params("28_format_names.csv", _NAME_EXPECTED, {}),
)
def test_corpus_names(inp, want):
    """Run each names-corpus row through ``standardize_name``.

    Only the FN04 input is run with ``conservative=True``; every other row
    passes ``conservative=False``.
    """
    # The FN04 row is recognised by its literal input text.
    use_conservative = inp == "aLiCe SmItH"
    normalized, _ignored = standardize_name(inp, conservative=use_conservative)
    _assert(normalized, want, inp)
# ---------------------------------------------------------------------------
# Currencies — 29_format_currencies.csv
# ---------------------------------------------------------------------------
# Expected output per case id for 29_format_currencies.csv under
# error_policy="sentinel". Case ids not listed here default to PASSTHROUGH
# (see ``_params``).
_CURRENCY_EXPECTED: dict[str, object] = {
    "FC01": "1234.56",
    "FC02": "1234.56",
    "FC03": "1234.56",
    "FC04": "1234.56",
    "FC05": "1234.56",
    "FC06": "1234.56",
    "FC07": "1234.56",
    "FC08": "1234.56",
    "FC09": "1234.56",
    "FC10": "1234.56",
    "FC11": "1234.56",
    "FC12": "1234.56",
    "FC13": "1234",
    "FC14": "123456.78",
    # negative amounts
    "FC15": "-100",
    "FC16": "-100",
    "FC17": "-100",
    "FC18": "0",
    "FC19": "1500000",
    # not normalizable → error sentinel
    "FC20": "<error: percentage not currency>",
    "FC21": "<error: range not normalizable>",
    "FC22": "<error: word value>",
    "FC23": "<error: word value>",
    # FC24 empty → PASSTHROUGH
    "FC25": "1234.56",
    "FC26": "1234",
    "FC27": "<error: ambiguous separator, set --currency-locale>",
}
@pytest.mark.parametrize(
    "inp,want",
    _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}),
)
def test_corpus_currencies(inp, want):
    """Run each currencies-corpus row through ``standardize_currency`` (sentinel errors)."""
    normalized, _ignored = standardize_currency(inp, error_policy="sentinel")
    _assert(normalized, want, inp)
def test_corpus_currencies_eu_with_comma_decimal():
    """FC08 / FC10 style inputs also parse when ``decimal="comma"`` is forced."""
    for raw in ("€1.234,56", "1.234,56 EUR"):
        amount, _ignored = standardize_currency(raw, decimal="comma")
        assert amount == "1234.56"
# ---------------------------------------------------------------------------
# Integration — 30_format_integration.csv
# ---------------------------------------------------------------------------
def _integration_opts(**overrides) -> StandardizeOptions:
    """Return the ``StandardizeOptions`` used by the integration tests.

    The base options map the six integration columns to their field types,
    disable address expansion, leave ``currency_decimals`` unset and use the
    "passthrough" error policy for dates and phones.

    Args:
        **overrides: attribute names of ``StandardizeOptions`` and the values
            to assign on top of the base options.

    Raises:
        TypeError: if an override names an attribute the options object does
            not have. Without this check a misspelled name would be stored
            as a new attribute and the test would run with the base value.
    """
    base = StandardizeOptions(
        column_types={
            "name": FieldType.NAME,
            "email": FieldType.EMAIL,
            "phone": FieldType.PHONE,
            "date": FieldType.DATE,
            "amount": FieldType.CURRENCY,
            "address": FieldType.ADDRESS,
        },
        currency_decimals=None,
        address_expand=False,
        date_error_policy="passthrough",
        phone_error_policy="passthrough",
    )
    for key, value in overrides.items():
        if not hasattr(base, key):
            raise TypeError(f"unknown StandardizeOptions field: {key!r}")
        setattr(base, key, value)
    return base
def test_corpus_integration_pipeline_preserves_schema():
    """The pipeline must not add, drop or reorder columns, nor change row count."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv",
        dtype=str,
        keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df
    # Schema preservation (corpus § 0.2): same columns in the same order,
    # same number of rows.
    assert list(cleaned.columns) == list(source.columns)
    assert len(cleaned) == len(source)
def test_corpus_integration_FI01_messy_record():
    """Row 0 (FI01): a messy-but-cleanable record comes out fully canonical."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv",
        dtype=str,
        keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df
    first = cleaned.iloc[0]
    assert first["name"] == "Alice Smith"
    assert first["email"] == "alice@example.com"
    assert first["phone"] == "+15551234567"
    assert first["date"] == "2024-01-15"
    assert first["amount"] == "1234.56"
    assert first["address"] == "123 Main St, New York, NY 10001"
def test_corpus_integration_FI04_all_empty_passthrough():
    """Row 3 (FI04): every cell is empty and must stay empty."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv",
        dtype=str,
        keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df
    row = cleaned.iloc[3]
    checked_columns = ("name", "email", "phone", "date", "amount", "address")
    for col in checked_columns:
        assert row[col] == "", f"FI04.{col} expected empty, got {row[col]!r}"
def test_corpus_integration_FI05_idempotent_on_clean_input():
    """Row 4 (FI05): an already-clean record must round-trip unchanged."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv",
        dtype=str,
        keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df
    original = source.iloc[4]
    row = cleaned.iloc[4]
    checked_columns = ("name", "email", "phone", "date", "amount", "address")
    for col in checked_columns:
        assert row[col] == original[col], (
            f"FI05.{col} non-idempotent: {original[col]!r} -> {row[col]!r}"
        )
# ---------------------------------------------------------------------------
# Idempotency property
# ---------------------------------------------------------------------------
#
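# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
# § 1, "Idempotency requirement"). We exercise it across every corpus
# input using each standardizer's default flags (addresses additionally
# pass ``expand=False``). Note this differs from the per-domain tests
# above, several of which also set ``error_policy="sentinel"`` and, for
# dates, ``month_locales``.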
def _idempotency_runner(fn, fixture, **kwargs):
    """Apply *fn* twice to every input in corpus file *fixture*.

    *fn* is called as ``fn(value, **kwargs)`` and must return a 2-tuple whose
    first item is the standardized value. Returns a list of
    ``(case_id, input, once, twice)`` tuples for the rows where the second
    application changed the result of the first; empty when none did.
    """

    def _apply_twice(value):
        first, _ = fn(value, **kwargs)
        second, _ = fn(first, **kwargs)
        return first, second

    passes = ((record, *_apply_twice(record["input"])) for record in _load(fixture))
    return [
        (record["case_id"], record["input"], first, second)
        for record, first, second in passes
        if first != second
    ]
@pytest.mark.parametrize(
    "fn,fixture,kwargs",
    [
        (standardize_date, "24_format_dates.csv", {}),
        (standardize_phone, "25_format_phones.csv", {}),
        (standardize_address, "27_format_addresses.csv", {"expand": False}),
        (standardize_name, "28_format_names.csv", {}),
        (standardize_currency, "29_format_currencies.csv", {}),
        (standardize_email, "26_format_emails.csv", {}),
    ],
)
def test_corpus_idempotency(fn, fixture, kwargs):
    """Each standardizer must be a fixed point on its own output for every corpus row."""
    failures = _idempotency_runner(fn, fixture, **kwargs)
    # One line per offending row; empty when there are no failures.
    listing = "\n".join(
        f" {cid}: {inp!r} -> {once!r} -> {twice!r}"
        for cid, inp, once, twice in failures
    )
    assert not failures, f"non-idempotent transformations in {fixture}:\n" + listing