"""Corpus-driven tests for ``src.core.format_standardize``. Drives every row of the FORMATS test corpus (``test-cases/format-cleaner-corpus/*.csv``) through the per-cell standardizers and asserts the canonical output the corpus expects. The corpus itself (``FORMATS-CASES.md`` in the same directory) documents per-domain policy decisions; the per-case ``id`` strings below (FD01, FP14, FA09, …) match its row keys exactly. Two sentinels are used in the per-domain expected dicts: - A literal string is the corpus's expected canonical output. - ``PASSTHROUGH`` means "corpus accepts no transformation" — usually empty, whitespace-only, or already-clean input. A handful of corpus rows are still ``xfail`` because closing them needs heavier machinery (Excel serial parsing, Unix timestamps, non-English month dictionaries, IDN / non-ASCII email validation). Each such marker carries a one-line reason. """ from __future__ import annotations import csv from pathlib import Path import pandas as pd import pytest from src.core.format_standardize import ( FieldType, StandardizeOptions, standardize_address, standardize_currency, standardize_dataframe, standardize_date, standardize_email, standardize_name, standardize_phone, ) CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus" PASSTHROUGH = object() # sentinel: assert the function returned input unchanged def _load(filename: str) -> list[dict[str, str]]: with (CORPUS / filename).open(newline="") as f: return list(csv.DictReader(f)) def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]): """Build pytest.param entries for every row in *fixture*. Rows in *xfails* are wrapped in a non-strict xfail with the given reason, so improvements that close the gap surface as xpass and the suite stays green either way. 
""" rows = _load(fixture) out = [] for row in rows: cid = row["case_id"] want = expected.get(cid, PASSTHROUGH) marks = [] if cid in xfails: marks.append(pytest.mark.xfail(reason=xfails[cid], strict=False)) out.append(pytest.param(row["input"], want, id=cid, marks=marks)) return out def _assert(got: str, want: object, original: str) -> None: if want is PASSTHROUGH: assert got == original, f"expected pass-through, got {got!r}" else: assert got == want # --------------------------------------------------------------------------- # Dates — 24_format_dates.csv # --------------------------------------------------------------------------- _DATE_EXPECTED_MDY: dict[str, object] = { # iso baseline + datetime variants → ISO date "FD01": "2024-01-15", "FD02": "2024-01-15", "FD03": "2024-01-15", "FD04": "2024-01-15", "FD05": "2024-01-15", "FD06": "2024-01-15", # US M/D/Y variants "FD07": "2024-01-15", "FD08": "2024-01-15", "FD09": "2024-01-05", "FD10": "2024-05-30", # longform month names "FD16": "2024-01-15", "FD17": "2024-01-15", "FD18": "2024-01-15", "FD19": "2024-01-15", "FD20": "2024-01-15", # weekday-prefixed "FD21": "2024-01-15", # FD11-FD15 — DMY-shaped EU dates in MDY default mode; the DMY # rerun below covers the actual parse path. Under MDY they pass # through unchanged. (Listed explicitly so a future MDY-aware # locale auto-detect can replace these expectations with the # correct ISO output.) 
"FD11": PASSTHROUGH, "FD12": PASSTHROUGH, "FD13": PASSTHROUGH, "FD14": PASSTHROUGH, "FD15": PASSTHROUGH, # excel serial dates (numeric days since 1899-12-30) "FD22": "2024-01-15", "FD23": "2024-01-15", # unix timestamps (seconds, milliseconds) "FD24": "2024-01-15", "FD25": "2024-01-15", # partial precision — corpus preserves it "FD26": "2024-01", "FD27": "2024-01", # text precision month "FD28": "2024-Q1", # quarter "FD29": "2024", # 2-digit year cutoff (per docs: 1969 wins over 2069) "FD30": "1969-01-15", # leap day valid "FD31": "2024-02-29", # invalid dates → corpus expects error sentinel "FD32": "", "FD33": "", "FD34": "", "FD35": "", # buried-date extraction "FD36": "2024-01-15", "FD37": "2024-01-15", # garbage → pass through (corpus 0.3 boundary table) # FD38/39/40 → PASSTHROUGH default # locale-specific month names (en/fr/de via month_locales) "FD41": "2024-01-15", "FD42": "2024-01-15", # timezone — corpus 3.3 says fixed-offset only "FD43": "2024-01-15", "FD44": "2024-03-10", # already-clean idempotency "FD45": "2024-01-15", } _DATE_XFAILS_MDY: dict[str, str] = {} @pytest.mark.parametrize( "inp,want", _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY), ) def test_corpus_dates_mdy(inp, want): got, _ = standardize_date( inp, error_policy="sentinel", month_locales=["en", "fr", "de"], ) _assert(got, want, inp) # DMY locale rerun for the EU rows that need it. 
_DATE_EXPECTED_DMY: dict[str, str] = {
    "FD11": "2024-01-15",
    "FD12": "2024-01-15",
    "FD13": "2024-01-15",
    "FD14": "2024-05-30",
    "FD15": "2024-01-15",
}


def _inputs_by_id(fixture: str) -> dict[str, str]:
    """Map ``case_id`` → ``input`` for every row of *fixture*.

    Looking rows up by id (rather than by position in the CSV) keeps the
    targeted reruns below correct if corpus rows are reordered or new rows
    are inserted. The first occurrence of an id wins; an unknown id fails
    loudly with ``KeyError`` at collection time.
    """
    index: dict[str, str] = {}
    for row in _load(fixture):
        index.setdefault(row["case_id"], row["input"])
    return index


_DATE_INPUTS = _inputs_by_id("24_format_dates.csv")


@pytest.mark.parametrize(
    "inp,want",
    [
        pytest.param(_DATE_INPUTS[cid], want, id=f"{cid}-dmy")
        for cid, want in _DATE_EXPECTED_DMY.items()
    ],
)
def test_corpus_dates_dmy(inp, want):
    got, _ = standardize_date(inp, date_order="DMY")
    assert got == want


# ---------------------------------------------------------------------------
# Phones — 25_format_phones.csv
# ---------------------------------------------------------------------------

_PHONE_EXPECTED: dict[str, object] = {
    "FP01": "+15551234567",
    "FP02": "+15551234567",
    "FP03": "+15551234567",
    "FP04": "+15551234567",
    "FP05": "+15551234567",
    "FP06": "+15551234567",
    "FP07": "+15551234567",
    "FP08": "+15551234567",
    "FP09": "+15551234567;ext=123",
    "FP10": "+15551234567;ext=123",
    "FP11": "+15551234567;ext=123",
    # vanity numbers
    "FP12": "+18003569377",
    "FP13": "+15552255669",
    # international (intl row FP15 needs --default-country=GB; covered separately)
    "FP14": "+442079460958",
    "FP16": "+493012345678",
    "FP17": "+33123456789",
    "FP18": "+81312345678",
    "FP19": "+61212345678",
    "FP20": "+15551234567",
    # placeholders/junk → corpus says error
    "FP21": "",
    "FP22": "",
    "FP23": "",
    "FP24": "",
    "FP25": "",
    # NBSP / smart-quote contamination — defensive cleanup acceptable
    "FP26": "+15551234567",
    "FP27": "+15551234567",
    "FP28": "+15551234567",
    # FP29 empty → pass-through
    "FP30": "",
    "FP31": "",
}


@pytest.mark.parametrize(
    "inp,want",
    _params("25_format_phones.csv", _PHONE_EXPECTED, {}),
)
def test_corpus_phones(inp, want):
    got, _ = standardize_phone(inp, error_policy="sentinel")
    _assert(got, want, inp)


def test_corpus_phones_uk_domestic_with_gb_region():
    # FP15 — UK trunk-prefixed "020 7946 0958" only resolves with
    # default_region="GB". Verifies the cleaner's intl path works.
    got, _ = standardize_phone("020 7946 0958", default_region="GB")
    assert got == "+442079460958"


# ---------------------------------------------------------------------------
# Emails — 26_format_emails.csv
# ---------------------------------------------------------------------------

_EMAIL_EXPECTED: dict[str, object] = {
    "FE01": "alice@example.com",
    "FE02": "alice@example.com",
    "FE03": "alice@example.com",
    "FE04": "alice@example.com",
    "FE05": "alice@example.com",
    "FE06": "alice@example.com",
    "FE07": "alice@example.com",
    "FE08": "alice@example.com",
    "FE09": "alice@example.com",
    "FE10": "a.l.i.c.e@gmail.com",  # default: don't touch dots
    "FE11": "alice+newsletter@gmail.com",  # default: don't touch +tag
    "FE12": "a.l.i.c.e+work@gmail.com",
    "FE13": "a.l.i.c.e@example.com",  # never touch non-Gmail
    "FE14": "alice+newsletter@example.com",
    "FE15": "alice@münchen.de",
    "FE16": "アリス@example.jp",
    "FE17": "alice@example.com",
    "FE18": "alice@example.com",
    "FE19": "alice@example.com",
    "FE20": "alice@example.com",
    "FE21": "alice@example.com",
    "FE22": "",
    "FE23": "",
    "FE24": "",
    "FE25": "",
    "FE26": "",
    "FE27": "",
    "FE28": "",
    # FE29 / FE30 empty / whitespace → PASSTHROUGH
    "FE31": "alice@example.com",
}

_EMAIL_XFAILS: dict[str, str] = {}


@pytest.mark.parametrize(
    "inp,want",
    _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS),
)
def test_corpus_emails(inp, want):
    got, _ = standardize_email(inp, error_policy="sentinel")
    _assert(got, want, inp)


_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
    "FE10": "alice@gmail.com",
    "FE11": "alice@gmail.com",
    "FE12": "alice@gmail.com",
    "FE13": "a.l.i.c.e@example.com",  # negative test: don't touch non-Gmail
    "FE14": "alice+newsletter@example.com",  # negative test
}

# Read the email corpus once instead of once per parametrized case.
_EMAIL_INPUTS = _inputs_by_id("26_format_emails.csv")


@pytest.mark.parametrize(
    "inp,want",
    [
        pytest.param(_EMAIL_INPUTS[cid], want, id=f"{cid}-gmail-canonical")
        for cid, want in _EMAIL_GMAIL_CANONICAL.items()
    ],
)
def test_corpus_emails_gmail_canonical(inp, want):
    got, _ = standardize_email(inp, gmail_canonical=True)
    assert got == want


# ---------------------------------------------------------------------------
# Addresses — 27_format_addresses.csv
# ---------------------------------------------------------------------------

_ADDRESS_EXPECTED: dict[str, str] = {
    "FA01": "123 Main St, New York, NY 10001",
    "FA02": "123 Main St, New York, NY 10001",
    "FA03": "123 Main St, New York, NY 10001",
    "FA04": "123 Main St, New York, NY 10001",
    "FA05": "123 Main St, New York, NY 10001",
    "FA06": "456 Park Ave, New York, NY 10001",
    "FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
    "FA08": "123 Main St, New York, NY 10001",
    "FA09": "123 N Main St, City, ST 12345",
    "FA10": "123 N Main St, City, ST 12345",
    "FA11": "123 NE Main St, City, ST 12345",
    "FA12": "123 Main St, Apt 4B, City, ST 12345",
    "FA13": "123 Main St, # 4B, City, ST 12345",
    "FA14": "123 Main St, Ste 200, City, ST 12345",
    "FA15": "123 Main St, New York, NY 10001",
    "FA16": "123 Main St, New York, NY 10001",
    "FA17": "123 Main St, New York, NY 10001-1234",
    "FA18": "123 Main St, Boston, MA 02101",
    "FA19": "123 Main St, Apt 4B, New York, NY 10001",
    "FA20": "PO Box 123, City, ST 12345",
    "FA21": "PO Box 123, City, ST 12345",
    "FA22": "PO Box 123, City, ST 12345",
    "FA23": "123A Main St, City, ST 12345",
    "FA24": "123-1 Main St, City, ST 12345",
    "FA25": "123 1/2 Main St, City, ST 12345",
    "FA26": "10 Downing Street, London, SW1A 2AA",
    "FA27": "1 Yonge St, Toronto, ON M5E 1W7",
    "FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
    "FA31": "123 Main St, New York, NY 10001",
}


@pytest.mark.parametrize(
    "inp,want",
    _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}),
)
def test_corpus_addresses(inp, want):
    got, _ = standardize_address(inp, expand=False)
    _assert(got, want, inp)


# ---------------------------------------------------------------------------
# Names — 28_format_names.csv
# ---------------------------------------------------------------------------

_NAME_EXPECTED: dict[str, object] = {
    "FN01": "Alice Smith",
    "FN02": "Alice Smith",
    "FN03": "Alice Smith",
    "FN04": "aLiCe SmItH",  # corpus 7.3 conservative: preserve mixed
    "FN05": "McDonald",
    "FN06": "McDonald",
    "FN07": "MacDonald",
    "FN08": "McTaggart",
    "FN09": "O'Connor",
    "FN10": "O'Connor",
    "FN11": "O'Brien",
    "FN12": "Mary-Jane Smith",
    "FN13": "Smith-Jones",
    "FN14": "von Trapp",
    "FN15": "Vincent van Gogh",
    "FN16": "Charles de Gaulle",
    "FN17": "Leonardo da Vinci",
    "FN18": "Mr John Smith",  # corpus 7.3: drop title period
    "FN19": "Dr Jane Doe",
    "FN20": "Prof Alice Williams",
    "FN21": "John Smith Jr",
    "FN22": "John Smith III",
    "FN23": "Jane Doe PhD",
    "FN24": "John Smith",  # comma-format reversed
    "FN25": "John Smith",
    "FN26": "John Andrew Smith",
    "FN27": "John A Smith",  # corpus 7.3: drop initial period
    "FN28": "J.K. Rowling",
    "FN29": "김철수",
    "FN30": "田中太郎",
    "FN31": "Иван Иванов",
    "FN32": "Madonna",
    # FN33 / FN34 → PASSTHROUGH default
}


@pytest.mark.parametrize(
    "inp,want",
    _params("28_format_names.csv", _NAME_EXPECTED, {}),
)
def test_corpus_names(inp, want):
    # FN04 needs conservative=True; the rest use default (aggressive).
    # The row is recognised by its input text because only (input, want)
    # are parametrized here, not the case id.
    conservative = inp == "aLiCe SmItH"
    got, _ = standardize_name(inp, conservative=conservative)
    _assert(got, want, inp)


# ---------------------------------------------------------------------------
# Currencies — 29_format_currencies.csv
# ---------------------------------------------------------------------------

_CURRENCY_EXPECTED: dict[str, object] = {
    "FC01": "1234.56",
    "FC02": "1234.56",
    "FC03": "1234.56",
    "FC04": "1234.56",
    "FC05": "1234.56",
    "FC06": "1234.56",
    "FC07": "1234.56",
    "FC08": "1234.56",
    "FC09": "1234.56",
    "FC10": "1234.56",
    "FC11": "1234.56",
    "FC12": "1234.56",
    "FC13": "1234",
    "FC14": "123456.78",
    "FC15": "-100",
    "FC16": "-100",
    "FC17": "-100",
    "FC18": "0",
    "FC19": "1500000",
    "FC20": "",
    "FC21": "",
    "FC22": "",
    "FC23": "",
    # FC24 empty → PASSTHROUGH
    "FC25": "1234.56",
    "FC26": "1234",
    "FC27": "",
}


@pytest.mark.parametrize(
    "inp,want",
    _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}),
)
def test_corpus_currencies(inp, want):
    got, _ = standardize_currency(inp, error_policy="sentinel")
    _assert(got, want, inp)


def test_corpus_currencies_eu_with_comma_decimal():
    # FC08, FC10 also parse correctly under decimal="comma".
    got, _ = standardize_currency("€1.234,56", decimal="comma")
    assert got == "1234.56"
    got, _ = standardize_currency("1.234,56 EUR", decimal="comma")
    assert got == "1234.56"


# ---------------------------------------------------------------------------
# Integration — 30_format_integration.csv
# ---------------------------------------------------------------------------

_INTEGRATION_COLUMNS = ("name", "email", "phone", "date", "amount", "address")


def _integration_opts(**overrides) -> StandardizeOptions:
    """Standardize options matching corpus defaults for the integration row."""
    base = StandardizeOptions(
        column_types={
            "name": FieldType.NAME,
            "email": FieldType.EMAIL,
            "phone": FieldType.PHONE,
            "date": FieldType.DATE,
            "amount": FieldType.CURRENCY,
            "address": FieldType.ADDRESS,
        },
        currency_decimals=None,
        address_expand=False,
        date_error_policy="passthrough",
        phone_error_policy="passthrough",
    )
    for k, v in overrides.items():
        setattr(base, k, v)
    return base


def _run_integration() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the integration fixture and standardize it with corpus defaults.

    Returns ``(original_df, standardized_df)``. Every cell is read as a
    string and empty cells stay ``""`` (``keep_default_na=False``) so the
    tests can compare raw text.
    """
    df = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False
    )
    result = standardize_dataframe(df, _integration_opts())
    return df, result.standardized_df


def test_corpus_integration_pipeline_preserves_schema():
    df, out = _run_integration()
    # Schema preservation (corpus § 0.2): no rows or columns added,
    # column order intact.
    assert list(out.columns) == list(df.columns)
    assert len(out) == len(df)


def test_corpus_integration_FI01_messy_record():
    # Row 0 = FI01: standard messy-but-cleanable record.
    _, out = _run_integration()
    row = out.iloc[0]
    assert row["name"] == "Alice Smith"
    assert row["email"] == "alice@example.com"
    assert row["phone"] == "+15551234567"
    assert row["date"] == "2024-01-15"
    assert row["amount"] == "1234.56"
    assert row["address"] == "123 Main St, New York, NY 10001"


def test_corpus_integration_FI04_all_empty_passthrough():
    # Row 3 = FI04: all empty cells, must pass through without errors.
    _, out = _run_integration()
    row = out.iloc[3]
    for col in _INTEGRATION_COLUMNS:
        assert row[col] == "", f"FI04.{col} expected empty, got {row[col]!r}"


def test_corpus_integration_FI05_idempotent_on_clean_input():
    # Row 4 = FI05: already-clean record. Every column should round-trip
    # unchanged.
    df, out = _run_integration()
    row = out.iloc[4]
    original = df.iloc[4]
    for col in _INTEGRATION_COLUMNS:
        assert row[col] == original[col], (
            f"FI05.{col} non-idempotent: {original[col]!r} -> {row[col]!r}"
        )


# ---------------------------------------------------------------------------
# Idempotency property
# ---------------------------------------------------------------------------
#
# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
# § 1, "Idempotency requirement"). We exercise it across every corpus
# input.
#
# NOTE(review): the parametrization below passes only ``expand=False``
# (addresses); the other standardizers run with their default options,
# not the ``error_policy="sentinel"`` / ``month_locales`` flags the
# per-domain tests use — confirm which flag set is intended.
def _idempotency_runner(fn, fixture, **kwargs):
    """Collect corpus rows of *fixture* for which ``fn(fn(x)) != fn(x)``.

    *fn* is called as ``fn(value, **kwargs)`` and must return a 2-tuple
    whose first item is the standardized text. Returns a list of
    ``(case_id, input, first_pass, second_pass)`` tuples, empty when every
    row is stable under a second application.
    """
    offenders = []
    for record in _load(fixture):
        raw = record["input"]
        first_pass, _ = fn(raw, **kwargs)
        second_pass, _ = fn(first_pass, **kwargs)
        if first_pass == second_pass:
            continue
        offenders.append((record["case_id"], raw, first_pass, second_pass))
    return offenders


_IDEMPOTENCY_CASES = [
    (standardize_date, "24_format_dates.csv", {}),
    (standardize_phone, "25_format_phones.csv", {}),
    (standardize_address, "27_format_addresses.csv", {"expand": False}),
    (standardize_name, "28_format_names.csv", {}),
    (standardize_currency, "29_format_currencies.csv", {}),
    (standardize_email, "26_format_emails.csv", {}),
]


@pytest.mark.parametrize("fn,fixture,kwargs", _IDEMPOTENCY_CASES)
def test_corpus_idempotency(fn, fixture, kwargs):
    """Fail with a per-row report if any corpus input is not idempotent."""
    failures = _idempotency_runner(fn, fixture, **kwargs)
    report = "\n".join(
        f" {cid}: {inp!r} -> {once!r} -> {twice!r}"
        for cid, inp, once, twice in failures
    )
    assert not failures, f"non-idempotent transformations in {fixture}:\n" + report