Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
574 lines
19 KiB
Python
574 lines
19 KiB
Python
"""Corpus-driven tests for ``src.core.format_standardize``.
|
|
|
|
Drives every row of the FORMATS test corpus
|
|
(``test-cases/format-cleaner-corpus/*.csv``) through the per-cell
|
|
standardizers and asserts the canonical output the corpus expects.
|
|
|
|
The corpus itself (``FORMATS-CASES.md`` in the same directory)
|
|
documents per-domain policy decisions; the per-case ``id`` strings
|
|
below (FD01, FP14, FA09, …) match its row keys exactly.
|
|
|
|
Two sentinels are used in the per-domain expected dicts:
|
|
|
|
- A literal string is the corpus's expected canonical output.
|
|
- ``PASSTHROUGH`` means "corpus accepts no transformation" — usually
|
|
empty, whitespace-only, or already-clean input.
|
|
|
|
Corpus rows can be marked ``xfail`` through the per-domain ``*_XFAILS``
dicts when closing them needs heavier machinery (Excel serial parsing,
Unix timestamps, non-English month dictionaries, IDN / non-ASCII email
validation). Each such marker carries a one-line reason. The xfail
dicts in this module are currently empty.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.format_standardize import (
|
|
FieldType,
|
|
StandardizeOptions,
|
|
standardize_address,
|
|
standardize_currency,
|
|
standardize_dataframe,
|
|
standardize_date,
|
|
standardize_email,
|
|
standardize_name,
|
|
standardize_phone,
|
|
)
|
|
|
|
# Directory holding the FORMATS corpus CSVs: "test-cases/format-cleaner-corpus"
# under the parent of the directory that contains this file.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "format-cleaner-corpus"
)

# Sentinel meaning "the standardizer must return its input unchanged".
# It is a bare object compared by identity, so no corpus string can equal it.
PASSTHROUGH = object()
|
|
|
|
|
|
def _load(filename: str) -> list[dict[str, str]]:
|
|
with (CORPUS / filename).open(newline="") as f:
|
|
return list(csv.DictReader(f))
|
|
|
|
|
|
def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]):
    """Turn every row of the corpus file *fixture* into a ``pytest.param``.

    Each param carries ``(row["input"], wanted_output)`` and uses the row's
    ``case_id`` as its test id.  The wanted output comes from *expected*;
    case ids missing from it default to ``PASSTHROUGH``.

    Case ids listed in *xfails* get a non-strict ``xfail`` mark with the
    mapped reason, so a row that starts passing shows up as xpass without
    turning the suite red.
    """

    def _build(row):
        case_id = row["case_id"]
        wanted = expected.get(case_id, PASSTHROUGH)
        if case_id in xfails:
            marks = [pytest.mark.xfail(reason=xfails[case_id], strict=False)]
        else:
            marks = []
        return pytest.param(row["input"], wanted, id=case_id, marks=marks)

    return [_build(row) for row in _load(fixture)]
|
|
|
|
|
|
def _assert(got: str, want: object, original: str) -> None:
|
|
if want is PASSTHROUGH:
|
|
assert got == original, f"expected pass-through, got {got!r}"
|
|
else:
|
|
assert got == want
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dates — 24_format_dates.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Expected output per dates-corpus case id for ``test_corpus_dates_mdy``.
# Case ids not listed here default to PASSTHROUGH (see ``_params``).
_DATE_EXPECTED_MDY: dict[str, object] = {
    # ISO baseline and datetime variants collapse to an ISO date.
    "FD01": "2024-01-15",
    "FD02": "2024-01-15",
    "FD03": "2024-01-15",
    "FD04": "2024-01-15",
    "FD05": "2024-01-15",
    "FD06": "2024-01-15",
    # US month/day/year spellings.
    "FD07": "2024-01-15",
    "FD08": "2024-01-15",
    "FD09": "2024-01-05",
    "FD10": "2024-05-30",
    # Long-form month names.
    "FD16": "2024-01-15",
    "FD17": "2024-01-15",
    "FD18": "2024-01-15",
    "FD19": "2024-01-15",
    "FD20": "2024-01-15",  # carries a weekday prefix
    "FD21": "2024-01-15",
    # FD11-FD15 are DMY-shaped EU dates.  In this default (MDY) run they
    # are expected to come back unchanged; the DMY rerun further down
    # exercises the real parse path.  They are spelled out, rather than
    # left to the PASSTHROUGH default, so a future locale auto-detect can
    # swap in the correct ISO output here.
    "FD11": PASSTHROUGH,
    "FD12": PASSTHROUGH,
    "FD13": PASSTHROUGH,
    "FD14": PASSTHROUGH,
    "FD15": PASSTHROUGH,
    # Excel serial dates (numeric days since 1899-12-30).
    "FD22": "2024-01-15",
    "FD23": "2024-01-15",
    # Unix timestamps (seconds, milliseconds).
    "FD24": "2024-01-15",
    "FD25": "2024-01-15",
    # Partial precision is preserved by the corpus.
    "FD26": "2024-01",
    "FD27": "2024-01",  # month precision given as text
    "FD28": "2024-Q1",  # quarter
    "FD29": "2024",
    # Two-digit-year cutoff (per docs: 1969 wins over 2069).
    "FD30": "1969-01-15",
    # A valid leap day.
    "FD31": "2024-02-29",
    # Invalid dates: the corpus expects an error sentinel.
    "FD32": "<error: invalid leap day>",
    "FD33": "<error: Excel 1900 leap year bug>",
    "FD34": "<error: invalid month>",
    "FD35": "<error: invalid day>",
    # Dates buried inside surrounding text.
    "FD36": "2024-01-15",
    "FD37": "2024-01-15",
    # Garbage passes through (corpus 0.3 boundary table):
    # FD38 / FD39 / FD40 rely on the PASSTHROUGH default.
    # Locale-specific month names (en/fr/de via month_locales).
    "FD41": "2024-01-15",
    "FD42": "2024-01-15",
    # Timezones: corpus 3.3 says fixed-offset only.
    "FD43": "2024-01-15",
    "FD44": "2024-03-10",
    # Already-clean input must be idempotent.
    "FD45": "2024-01-15",
}

# Case id -> xfail reason for the MDY run; nothing is expected to fail.
_DATE_XFAILS_MDY: dict[str, str] = {}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY),
)
def test_corpus_dates_mdy(inp, want):
    """Check every dates-corpus row against ``_DATE_EXPECTED_MDY``.

    ``standardize_date`` is called with the sentinel error policy and the
    en/fr/de month locales; no ``date_order`` is passed.
    """
    call_options = {
        "error_policy": "sentinel",
        "month_locales": ["en", "fr", "de"],
    }
    got, _ = standardize_date(inp, **call_options)
    _assert(got, want, inp)
|
|
|
|
|
|
# DMY locale rerun for the EU rows that need it.
|
|
_DATE_EXPECTED_DMY: dict[str, str] = {
|
|
"FD11": "2024-01-15",
|
|
"FD12": "2024-01-15",
|
|
"FD13": "2024-01-15",
|
|
"FD14": "2024-05-30",
|
|
"FD15": "2024-01-15",
|
|
}
|
|
|
|
|
|
def _dates_dmy_params():
    """Pair each ``_DATE_EXPECTED_DMY`` entry with its corpus input.

    Inputs are looked up by ``case_id`` instead of by row position, so
    reordering or inserting corpus rows cannot silently feed the wrong
    input to a case.  The CSV is parsed once rather than once per param.
    A case id missing from the corpus raises ``KeyError`` naming that id.
    """
    inputs = {
        row["case_id"]: row["input"] for row in _load("24_format_dates.csv")
    }
    return [
        pytest.param(inputs[cid], want, id=f"{cid}-dmy")
        for cid, want in _DATE_EXPECTED_DMY.items()
    ]


@pytest.mark.parametrize("inp,want", _dates_dmy_params())
def test_corpus_dates_dmy(inp, want):
    """Re-run the EU-shaped rows with ``date_order="DMY"``."""
    got, _ = standardize_date(inp, date_order="DMY")
    assert got == want
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Phones — 25_format_phones.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_PHONE_EXPECTED: dict[str, object] = {
|
|
"FP01": "+15551234567",
|
|
"FP02": "+15551234567",
|
|
"FP03": "+15551234567",
|
|
"FP04": "+15551234567",
|
|
"FP05": "+15551234567",
|
|
"FP06": "+15551234567",
|
|
"FP07": "+15551234567",
|
|
"FP08": "+15551234567",
|
|
"FP09": "+15551234567;ext=123",
|
|
"FP10": "+15551234567;ext=123",
|
|
"FP11": "+15551234567;ext=123",
|
|
# vanity numbers
|
|
"FP12": "+18003569377",
|
|
"FP13": "+15552255669",
|
|
# international (intl row FP15 needs --default-country=GB; covered separately)
|
|
"FP14": "+442079460958",
|
|
"FP16": "+493012345678",
|
|
"FP17": "+33123456789",
|
|
"FP18": "+81312345678",
|
|
"FP19": "+61212345678",
|
|
"FP20": "+15551234567",
|
|
# placeholders/junk → corpus says error
|
|
"FP21": "<error: insufficient digits>",
|
|
"FP22": "<error: too many digits>",
|
|
"FP23": "<error: placeholder number>",
|
|
"FP24": "<error: placeholder number>",
|
|
"FP25": "<error: multiple numbers in cell>",
|
|
# NBSP / smart-quote contamination — defensive cleanup acceptable
|
|
"FP26": "+15551234567",
|
|
"FP27": "+15551234567",
|
|
"FP28": "+15551234567",
|
|
# FP29 empty → pass-through
|
|
"FP30": "<error: not a phone number>",
|
|
"FP31": "<error: smart-quote contamination>",
|
|
}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("25_format_phones.csv", _PHONE_EXPECTED, {}),
)
def test_corpus_phones(inp, want):
    """Check every phones-corpus row under the sentinel error policy."""
    outcome = standardize_phone(inp, error_policy="sentinel")
    got, _ = outcome
    _assert(got, want, inp)
|
|
|
|
|
|
def test_corpus_phones_uk_domestic_with_gb_region():
    """FP15: a UK trunk-prefixed number resolves once the region is GB.

    "020 7946 0958" has no country code, so it only resolves with
    ``default_region="GB"``.  This exercises the cleaner's international
    path.
    """
    uk_domestic = "020 7946 0958"
    got, _ = standardize_phone(uk_domestic, default_region="GB")
    assert got == "+442079460958"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Emails — 26_format_emails.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_EMAIL_EXPECTED: dict[str, object] = {
|
|
"FE01": "alice@example.com",
|
|
"FE02": "alice@example.com",
|
|
"FE03": "alice@example.com",
|
|
"FE04": "alice@example.com",
|
|
"FE05": "alice@example.com",
|
|
"FE06": "alice@example.com",
|
|
"FE07": "alice@example.com",
|
|
"FE08": "alice@example.com",
|
|
"FE09": "alice@example.com",
|
|
"FE10": "a.l.i.c.e@gmail.com", # default: don't touch dots
|
|
"FE11": "alice+newsletter@gmail.com", # default: don't touch +tag
|
|
"FE12": "a.l.i.c.e+work@gmail.com",
|
|
"FE13": "a.l.i.c.e@example.com", # never touch non-Gmail
|
|
"FE14": "alice+newsletter@example.com",
|
|
"FE15": "alice@münchen.de",
|
|
"FE16": "アリス@example.jp",
|
|
"FE17": "alice@example.com",
|
|
"FE18": "alice@example.com",
|
|
"FE19": "alice@example.com",
|
|
"FE20": "alice@example.com",
|
|
"FE21": "alice@example.com",
|
|
"FE22": "<error: missing @>",
|
|
"FE23": "<error: double @>",
|
|
"FE24": "<error: multiple @>",
|
|
"FE25": "<error: internal whitespace>",
|
|
"FE26": "<error: no TLD>",
|
|
"FE27": "<error: multiple emails>",
|
|
"FE28": "<error: multiple emails>",
|
|
# FE29 / FE30 empty / whitespace → PASSTHROUGH
|
|
"FE31": "alice@example.com",
|
|
}
|
|
|
|
_EMAIL_XFAILS: dict[str, str] = {}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS),
)
def test_corpus_emails(inp, want):
    """Check every emails-corpus row under the sentinel error policy."""
    outcome = standardize_email(inp, error_policy="sentinel")
    got, _ = outcome
    _assert(got, want, inp)
|
|
|
|
|
|
_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
|
|
"FE10": "alice@gmail.com",
|
|
"FE11": "alice@gmail.com",
|
|
"FE12": "alice@gmail.com",
|
|
"FE13": "a.l.i.c.e@example.com", # negative test: don't touch non-Gmail
|
|
"FE14": "alice+newsletter@example.com", # negative test
|
|
}
|
|
|
|
|
|
def _gmail_canonical_params():
    """Pair each ``_EMAIL_GMAIL_CANONICAL`` entry with its corpus input.

    The emails CSV is parsed once and indexed by ``case_id``, instead of
    being re-read and scanned for every param.  A case id missing from the
    corpus raises ``KeyError`` naming that id, rather than a bare
    ``StopIteration`` at collection time.
    """
    inputs = {
        row["case_id"]: row["input"] for row in _load("26_format_emails.csv")
    }
    return [
        pytest.param(inputs[cid], want, id=f"{cid}-gmail-canonical")
        for cid, want in _EMAIL_GMAIL_CANONICAL.items()
    ]


@pytest.mark.parametrize("inp,want", _gmail_canonical_params())
def test_corpus_emails_gmail_canonical(inp, want):
    """Re-run the Gmail-related rows with ``gmail_canonical=True``."""
    got, _ = standardize_email(inp, gmail_canonical=True)
    assert got == want
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Addresses — 27_format_addresses.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_ADDRESS_EXPECTED: dict[str, str] = {
|
|
"FA01": "123 Main St, New York, NY 10001",
|
|
"FA02": "123 Main St, New York, NY 10001",
|
|
"FA03": "123 Main St, New York, NY 10001",
|
|
"FA04": "123 Main St, New York, NY 10001",
|
|
"FA05": "123 Main St, New York, NY 10001",
|
|
"FA06": "456 Park Ave, New York, NY 10001",
|
|
"FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
|
|
"FA08": "123 Main St, New York, NY 10001",
|
|
"FA09": "123 N Main St, City, ST 12345",
|
|
"FA10": "123 N Main St, City, ST 12345",
|
|
"FA11": "123 NE Main St, City, ST 12345",
|
|
"FA12": "123 Main St, Apt 4B, City, ST 12345",
|
|
"FA13": "123 Main St, # 4B, City, ST 12345",
|
|
"FA14": "123 Main St, Ste 200, City, ST 12345",
|
|
"FA15": "123 Main St, New York, NY 10001",
|
|
"FA16": "123 Main St, New York, NY 10001",
|
|
"FA17": "123 Main St, New York, NY 10001-1234",
|
|
"FA18": "123 Main St, Boston, MA 02101",
|
|
"FA19": "123 Main St, Apt 4B, New York, NY 10001",
|
|
"FA20": "PO Box 123, City, ST 12345",
|
|
"FA21": "PO Box 123, City, ST 12345",
|
|
"FA22": "PO Box 123, City, ST 12345",
|
|
"FA23": "123A Main St, City, ST 12345",
|
|
"FA24": "123-1 Main St, City, ST 12345",
|
|
"FA25": "123 1/2 Main St, City, ST 12345",
|
|
"FA26": "10 Downing Street, London, SW1A 2AA",
|
|
"FA27": "1 Yonge St, Toronto, ON M5E 1W7",
|
|
"FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
|
|
"FA31": "123 Main St, New York, NY 10001",
|
|
}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}),
)
def test_corpus_addresses(inp, want):
    """Check every addresses-corpus row with ``expand=False``."""
    outcome = standardize_address(inp, expand=False)
    got, _ = outcome
    _assert(got, want, inp)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Names — 28_format_names.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_NAME_EXPECTED: dict[str, object] = {
|
|
"FN01": "Alice Smith",
|
|
"FN02": "Alice Smith",
|
|
"FN03": "Alice Smith",
|
|
"FN04": "aLiCe SmItH", # corpus 7.3 conservative: preserve mixed
|
|
"FN05": "McDonald",
|
|
"FN06": "McDonald",
|
|
"FN07": "MacDonald",
|
|
"FN08": "McTaggart",
|
|
"FN09": "O'Connor",
|
|
"FN10": "O'Connor",
|
|
"FN11": "O'Brien",
|
|
"FN12": "Mary-Jane Smith",
|
|
"FN13": "Smith-Jones",
|
|
"FN14": "von Trapp",
|
|
"FN15": "Vincent van Gogh",
|
|
"FN16": "Charles de Gaulle",
|
|
"FN17": "Leonardo da Vinci",
|
|
"FN18": "Mr John Smith", # corpus 7.3: drop title period
|
|
"FN19": "Dr Jane Doe",
|
|
"FN20": "Prof Alice Williams",
|
|
"FN21": "John Smith Jr",
|
|
"FN22": "John Smith III",
|
|
"FN23": "Jane Doe PhD",
|
|
"FN24": "John Smith", # comma-format reversed
|
|
"FN25": "John Smith",
|
|
"FN26": "John Andrew Smith",
|
|
"FN27": "John A Smith", # corpus 7.3: drop initial period
|
|
"FN28": "J.K. Rowling",
|
|
"FN29": "김철수",
|
|
"FN30": "田中太郎",
|
|
"FN31": "Иван Иванов",
|
|
"FN32": "Madonna",
|
|
# FN33 / FN34 → PASSTHROUGH default
|
|
}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("28_format_names.csv", _NAME_EXPECTED, {}),
)
def test_corpus_names(inp, want):
    """Check every names-corpus row against ``_NAME_EXPECTED``.

    Only the FN04 input ("aLiCe SmItH") is run with ``conservative=True``;
    every other row uses ``conservative=False`` (the aggressive mode).
    """
    needs_conservative = inp == "aLiCe SmItH"
    outcome = standardize_name(inp, conservative=needs_conservative)
    got, _ = outcome
    _assert(got, want, inp)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Currencies — 29_format_currencies.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_CURRENCY_EXPECTED: dict[str, object] = {
|
|
"FC01": "1234.56",
|
|
"FC02": "1234.56",
|
|
"FC03": "1234.56",
|
|
"FC04": "1234.56",
|
|
"FC05": "1234.56",
|
|
"FC06": "1234.56",
|
|
"FC07": "1234.56",
|
|
"FC08": "1234.56",
|
|
"FC09": "1234.56",
|
|
"FC10": "1234.56",
|
|
"FC11": "1234.56",
|
|
"FC12": "1234.56",
|
|
"FC13": "1234",
|
|
"FC14": "123456.78",
|
|
"FC15": "-100",
|
|
"FC16": "-100",
|
|
"FC17": "-100",
|
|
"FC18": "0",
|
|
"FC19": "1500000",
|
|
"FC20": "<error: percentage not currency>",
|
|
"FC21": "<error: range not normalizable>",
|
|
"FC22": "<error: word value>",
|
|
"FC23": "<error: word value>",
|
|
# FC24 empty → PASSTHROUGH
|
|
"FC25": "1234.56",
|
|
"FC26": "1234",
|
|
"FC27": "<error: ambiguous separator, set --currency-locale>",
|
|
}
|
|
|
|
|
|
@pytest.mark.parametrize(
    "inp,want",
    _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}),
)
def test_corpus_currencies(inp, want):
    """Check every currencies-corpus row under the sentinel error policy."""
    outcome = standardize_currency(inp, error_policy="sentinel")
    got, _ = outcome
    _assert(got, want, inp)
|
|
|
|
|
|
def test_corpus_currencies_eu_with_comma_decimal():
    """EU comma-decimal amounts parse when ``decimal="comma"`` is given.

    The two literals are the symbol-prefixed and code-suffixed spellings;
    the original note ties them to corpus rows FC08 and FC10.
    """
    for raw in ("€1.234,56", "1.234,56 EUR"):
        got, _ = standardize_currency(raw, decimal="comma")
        assert got == "1234.56"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Integration — 30_format_integration.csv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _integration_opts(**overrides) -> StandardizeOptions:
    """Build the ``StandardizeOptions`` used by the integration tests.

    The base maps the six integration columns to their field types, sets
    ``currency_decimals=None`` and ``address_expand=False``, and uses the
    "passthrough" error policy for dates and phones.

    Each keyword in *overrides* replaces the same-named attribute on that
    base object.

    Raises:
        TypeError: if an override names an attribute the options object
            does not have.  A plain ``setattr`` would accept a misspelled
            name silently and the test would run with the base value.
    """
    base = StandardizeOptions(
        column_types={
            "name": FieldType.NAME,
            "email": FieldType.EMAIL,
            "phone": FieldType.PHONE,
            "date": FieldType.DATE,
            "amount": FieldType.CURRENCY,
            "address": FieldType.ADDRESS,
        },
        currency_decimals=None,
        address_expand=False,
        date_error_policy="passthrough",
        phone_error_policy="passthrough",
    )
    for key, value in overrides.items():
        if not hasattr(base, key):
            raise TypeError(
                f"_integration_opts() got an unknown option {key!r}"
            )
        setattr(base, key, value)
    return base
|
|
|
|
|
|
def test_corpus_integration_pipeline_preserves_schema():
    """The standardized frame keeps the input's columns and row count.

    Schema preservation (corpus § 0.2): no rows or columns are added and
    the column order stays intact.
    """
    fixture_path = CORPUS / "30_format_integration.csv"
    # Read every cell as text and keep empty cells as "" instead of NaN.
    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
    out = standardize_dataframe(df, _integration_opts()).standardized_df

    assert list(out.columns) == list(df.columns)
    assert len(out) == len(df)
|
|
|
|
|
|
def test_corpus_integration_FI01_messy_record():
    """Row 0 (FI01), a messy-but-cleanable record, is fully standardized."""
    fixture_path = CORPUS / "30_format_integration.csv"
    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
    row = cleaned.iloc[0]

    expected_cells = {
        "name": "Alice Smith",
        "email": "alice@example.com",
        "phone": "+15551234567",
        "date": "2024-01-15",
        "amount": "1234.56",
        "address": "123 Main St, New York, NY 10001",
    }
    # Checked in the same column order as the corpus record.
    for col, expected in expected_cells.items():
        assert row[col] == expected
|
|
|
|
|
|
def test_corpus_integration_FI04_all_empty_passthrough():
    """Row 3 (FI04), an all-empty record, comes back empty in every column."""
    fixture_path = CORPUS / "30_format_integration.csv"
    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
    row = cleaned.iloc[3]

    checked_columns = ("name", "email", "phone", "date", "amount", "address")
    for col in checked_columns:
        assert row[col] == "", f"FI04.{col} expected empty, got {row[col]!r}"
|
|
|
|
|
|
def test_corpus_integration_FI05_idempotent_on_clean_input():
    """Row 4 (FI05), an already-clean record, round-trips unchanged."""
    fixture_path = CORPUS / "30_format_integration.csv"
    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
    original = df.iloc[4]
    row = cleaned.iloc[4]

    checked_columns = ("name", "email", "phone", "date", "amount", "address")
    for col in checked_columns:
        assert row[col] == original[col], (
            f"FI05.{col} non-idempotent: {original[col]!r} -> {row[col]!r}"
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Idempotency property
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
# § 1, "Idempotency requirement"). We exercise it across every corpus
# input using each standardizer's default flags; the only explicit flag
# is ``expand=False`` for addresses, matching the per-domain address test.
|
|
|
|
def _idempotency_runner(fn, fixture, **kwargs):
|
|
failures = []
|
|
for row in _load(fixture):
|
|
once, _ = fn(row["input"], **kwargs)
|
|
twice, _ = fn(once, **kwargs)
|
|
if once != twice:
|
|
failures.append((row["case_id"], row["input"], once, twice))
|
|
return failures
|
|
|
|
|
|
@pytest.mark.parametrize(
    "fn,fixture,kwargs",
    [
        (standardize_date, "24_format_dates.csv", {}),
        (standardize_phone, "25_format_phones.csv", {}),
        (standardize_address, "27_format_addresses.csv", {"expand": False}),
        (standardize_name, "28_format_names.csv", {}),
        (standardize_currency, "29_format_currencies.csv", {}),
        (standardize_email, "26_format_emails.csv", {}),
    ],
)
def test_corpus_idempotency(fn, fixture, kwargs):
    """Each standardizer must be idempotent on every row of its corpus.

    On failure the message lists every offending row as
    ``case_id: input -> first pass -> second pass``.
    """
    failures = _idempotency_runner(fn, fixture, **kwargs)
    report = "\n".join(
        f" {cid}: {inp!r} -> {once!r} -> {twice!r}"
        for cid, inp, once, twice in failures
    )
    assert not failures, (
        f"non-idempotent transformations in {fixture}:\n" + report
    )
|