datatools-dev/tests/test_format_standardize_corpus.py

"""Corpus-driven tests for ``src.core.format_standardize``.

Drives every row of the FORMATS test corpus
(``test-cases/format-cleaner-corpus/*.csv``) through the per-cell
standardizers and asserts the canonical output the corpus expects.

The corpus itself (``FORMATS-CASES.md`` in the same directory)
documents per-domain policy decisions; the per-case ``id`` strings
below (FD01, FP14, FA09, …) match its row keys exactly.

Two sentinels are used in the per-domain expected dicts:

- A literal string is the corpus's expected canonical output.
- ``PASSTHROUGH`` means "corpus accepts no transformation" — usually
  empty, whitespace-only, or already-clean input.

Rows can be marked ``xfail`` (via the xfail mappings passed to
``_params``) when closing them needs heavier machinery; each such
marker carries a one-line reason. Those mappings are currently all
empty, so every corpus row — including the Excel serial, Unix
timestamp, non-English month and IDN / non-ASCII email rows — is
expected to pass.
"""

from __future__ import annotations

import csv
from pathlib import Path

import pandas as pd
import pytest

from src.core.format_standardize import (
    FieldType,
    StandardizeOptions,
    standardize_address,
    standardize_currency,
    standardize_dataframe,
    standardize_date,
    standardize_email,
    standardize_name,
    standardize_phone,
)

# Directory holding the corpus CSV fixtures: ``test-cases/format-cleaner-corpus``
# under the parent of this file's directory (``parents[1]`` of the resolved path).
CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus"

# Expected-value marker compared by identity in ``_assert``; it is also the
# default ``_params`` uses for rows that have no explicit expectation.
PASSTHROUGH = object()  # sentinel: assert the function returned input unchanged


def _load(filename: str) -> list[dict[str, str]]:
    """Read one corpus CSV and return its rows as dicts keyed by header.

    The file is decoded as UTF-8 explicitly. The corpus contains
    non-ASCII text (IDN e-mail domains, CJK and Cyrillic names), so
    relying on the platform default encoding would make the suite fail
    or mis-decode on hosts whose locale is not UTF-8.

    Args:
        filename: File name relative to the ``CORPUS`` directory.

    Returns:
        One ``dict`` per data row, in file order.
    """
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation recommends for file objects passed to a reader.
    with (CORPUS / filename).open(newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))


def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]):
    """Turn every row of *fixture* into a ``pytest.param``.

    Each param carries ``(input, expected)`` and uses the row's
    ``case_id`` as its test id. A row without an entry in *expected*
    gets ``PASSTHROUGH``. A row listed in *xfails* is marked as a
    non-strict xfail with the mapped reason, so a fix that closes the
    gap shows up as xpass without turning the suite red.
    """

    def _to_param(record):
        case_id = record["case_id"]
        target = expected.get(case_id, PASSTHROUGH)
        if case_id in xfails:
            case_marks = [pytest.mark.xfail(reason=xfails[case_id], strict=False)]
        else:
            case_marks = []
        return pytest.param(record["input"], target, id=case_id, marks=case_marks)

    return [_to_param(record) for record in _load(fixture)]


def _assert(got: str, want: object, original: str) -> None:
    """Check one standardizer result against its corpus expectation.

    When *want* is the ``PASSTHROUGH`` sentinel, the result must equal
    the untouched *original* input; otherwise it must equal *want*.
    """
    if want is not PASSTHROUGH:
        assert got == want
        return
    assert got == original, f"expected pass-through, got {got!r}"


# ---------------------------------------------------------------------------
# Dates — 24_format_dates.csv
# ---------------------------------------------------------------------------

# Expected canonical output per corpus ``case_id`` for the default (MDY)
# date run. Values are either a literal expected string or PASSTHROUGH;
# rows of the fixture not listed here default to PASSTHROUGH in ``_params``.
_DATE_EXPECTED_MDY: dict[str, object] = {
    # iso baseline + datetime variants → ISO date
    "FD01": "2024-01-15",
    "FD02": "2024-01-15",
    "FD03": "2024-01-15",
    "FD04": "2024-01-15",
    "FD05": "2024-01-15",
    "FD06": "2024-01-15",
    # US M/D/Y variants
    "FD07": "2024-01-15",
    "FD08": "2024-01-15",
    "FD09": "2024-01-05",
    "FD10": "2024-05-30",
    # longform month names
    "FD16": "2024-01-15",
    "FD17": "2024-01-15",
    "FD18": "2024-01-15",
    "FD19": "2024-01-15",
    "FD20": "2024-01-15",   # weekday-prefixed
    "FD21": "2024-01-15",
    # FD11-FD15 — DMY-shaped EU dates in MDY default mode; the DMY
    # rerun below covers the actual parse path. Under MDY they pass
    # through unchanged. (Listed explicitly so a future MDY-aware
    # locale auto-detect can replace these expectations with the
    # correct ISO output.)
    "FD11": PASSTHROUGH,
    "FD12": PASSTHROUGH,
    "FD13": PASSTHROUGH,
    "FD14": PASSTHROUGH,
    "FD15": PASSTHROUGH,
    # excel serial dates (numeric days since 1899-12-30)
    "FD22": "2024-01-15",
    "FD23": "2024-01-15",
    # unix timestamps (seconds, milliseconds)
    "FD24": "2024-01-15",
    "FD25": "2024-01-15",
    # partial precision — corpus preserves it
    "FD26": "2024-01",
    "FD27": "2024-01",       # text precision month
    "FD28": "2024-Q1",       # quarter
    "FD29": "2024",
    # 2-digit year cutoff (per docs: 1969 wins over 2069)
    "FD30": "1969-01-15",
    # leap day valid
    "FD31": "2024-02-29",
    # invalid dates → corpus expects error sentinel
    "FD32": "<error: invalid leap day>",
    "FD33": "<error: Excel 1900 leap year bug>",
    "FD34": "<error: invalid month>",
    "FD35": "<error: invalid day>",
    # buried-date extraction
    "FD36": "2024-01-15",
    "FD37": "2024-01-15",
    # garbage → pass through (corpus 0.3 boundary table)
    # FD38/39/40 → PASSTHROUGH default
    # locale-specific month names (en/fr/de via month_locales)
    "FD41": "2024-01-15",
    "FD42": "2024-01-15",
    # timezone — corpus 3.3 says fixed-offset only
    "FD43": "2024-01-15",
    "FD44": "2024-03-10",
    # already-clean idempotency
    "FD45": "2024-01-15",
}

# case_id -> xfail reason for the MDY date run; empty, so no date row is
# currently marked as an expected failure.
_DATE_XFAILS_MDY: dict[str, str] = {}


@pytest.mark.parametrize(
    "inp,want", _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY)
)
def test_corpus_dates_mdy(inp, want):
    """Check each date corpus row under the default date order.

    Errors are requested as sentinel strings and the en/fr/de month-name
    locales are enabled.
    """
    date_kwargs = {"error_policy": "sentinel", "month_locales": ["en", "fr", "de"]}
    canonical, _ = standardize_date(inp, **date_kwargs)
    _assert(canonical, want, inp)


# DMY locale rerun for the EU rows that need it.
# Maps case_id -> expected ISO date when the same corpus inputs are parsed
# with ``date_order="DMY"`` (see ``test_corpus_dates_dmy``).
_DATE_EXPECTED_DMY: dict[str, str] = {
    "FD11": "2024-01-15",
    "FD12": "2024-01-15",
    "FD13": "2024-01-15",
    "FD14": "2024-05-30",
    "FD15": "2024-01-15",
}


@pytest.mark.parametrize(
    "inp,want",
    [
        pytest.param(inputs_by_id[cid], want, id=f"{cid}-dmy")
        # Single-element outer loop: read and index the corpus once, by
        # case_id, instead of re-reading the CSV for every case and
        # trusting that row position N-1 holds case FD<N>.
        for inputs_by_id in [
            {r["case_id"]: r["input"] for r in _load("24_format_dates.csv")}
        ]
        for cid, want in _DATE_EXPECTED_DMY.items()
    ],
)
def test_corpus_dates_dmy(inp, want):
    """Re-run the rows listed in ``_DATE_EXPECTED_DMY`` with ``date_order="DMY"``.

    Inputs are looked up by ``case_id``, so reordering or inserting corpus
    rows cannot silently pair an expectation with the wrong input. A case
    id missing from the corpus raises ``KeyError`` at collection time.
    """
    got, _ = standardize_date(inp, date_order="DMY")
    assert got == want


# ---------------------------------------------------------------------------
# Phones — 25_format_phones.csv
# ---------------------------------------------------------------------------

# Expected output per corpus ``case_id`` for the phone run. Rows not listed
# here (e.g. FP15, FP29) default to PASSTHROUGH in ``_params``.
_PHONE_EXPECTED: dict[str, object] = {
    "FP01": "+15551234567",
    "FP02": "+15551234567",
    "FP03": "+15551234567",
    "FP04": "+15551234567",
    "FP05": "+15551234567",
    "FP06": "+15551234567",
    "FP07": "+15551234567",
    "FP08": "+15551234567",
    "FP09": "+15551234567;ext=123",
    "FP10": "+15551234567;ext=123",
    "FP11": "+15551234567;ext=123",
    # vanity numbers
    "FP12": "+18003569377",
    "FP13": "+15552255669",
    # international (intl row FP15 needs --default-country=GB; covered separately)
    "FP14": "+442079460958",
    "FP16": "+493012345678",
    "FP17": "+33123456789",
    "FP18": "+81312345678",
    "FP19": "+61212345678",
    "FP20": "+15551234567",
    # placeholders/junk → corpus says error
    "FP21": "<error: insufficient digits>",
    "FP22": "<error: too many digits>",
    "FP23": "<error: placeholder number>",
    "FP24": "<error: placeholder number>",
    "FP25": "<error: multiple numbers in cell>",
    # NBSP / smart-quote contamination — defensive cleanup acceptable
    "FP26": "+15551234567",
    "FP27": "+15551234567",
    "FP28": "+15551234567",
    # FP29 empty → pass-through
    "FP30": "<error: not a phone number>",
    "FP31": "<error: smart-quote contamination>",
}


@pytest.mark.parametrize("inp,want", _params("25_format_phones.csv", _PHONE_EXPECTED, {}))
def test_corpus_phones(inp, want):
    """Check each phone corpus row, with errors requested as sentinel strings."""
    normalized, _ = standardize_phone(inp, error_policy="sentinel")
    _assert(normalized, want, inp)


def test_corpus_phones_uk_domestic_with_gb_region():
    """FP15: a trunk-prefixed UK number resolves once the default region is GB."""
    # The domestic form "020 7946 0958" carries no country code, so this
    # exercises the cleaner's international path via default_region.
    uk_domestic = "020 7946 0958"
    normalized, _ = standardize_phone(uk_domestic, default_region="GB")
    assert normalized == "+442079460958"


# ---------------------------------------------------------------------------
# Emails — 26_format_emails.csv
# ---------------------------------------------------------------------------

# Expected output per corpus ``case_id`` for the default email run. Rows
# not listed here (FE29 / FE30) default to PASSTHROUGH in ``_params``.
_EMAIL_EXPECTED: dict[str, object] = {
    "FE01": "alice@example.com",
    "FE02": "alice@example.com",
    "FE03": "alice@example.com",
    "FE04": "alice@example.com",
    "FE05": "alice@example.com",
    "FE06": "alice@example.com",
    "FE07": "alice@example.com",
    "FE08": "alice@example.com",
    "FE09": "alice@example.com",
    "FE10": "a.l.i.c.e@gmail.com",            # default: don't touch dots
    "FE11": "alice+newsletter@gmail.com",     # default: don't touch +tag
    "FE12": "a.l.i.c.e+work@gmail.com",
    "FE13": "a.l.i.c.e@example.com",          # never touch non-Gmail
    "FE14": "alice+newsletter@example.com",
    "FE15": "alice@münchen.de",
    "FE16": "アリス@example.jp",
    "FE17": "alice@example.com",
    "FE18": "alice@example.com",
    "FE19": "alice@example.com",
    "FE20": "alice@example.com",
    "FE21": "alice@example.com",
    "FE22": "<error: missing @>",
    "FE23": "<error: double @>",
    "FE24": "<error: multiple @>",
    "FE25": "<error: internal whitespace>",
    "FE26": "<error: no TLD>",
    "FE27": "<error: multiple emails>",
    "FE28": "<error: multiple emails>",
    # FE29 / FE30 empty / whitespace → PASSTHROUGH
    "FE31": "alice@example.com",
}

# case_id -> xfail reason for the email run; empty, so no email row is
# currently marked as an expected failure.
_EMAIL_XFAILS: dict[str, str] = {}


@pytest.mark.parametrize(
    "inp,want", _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS)
)
def test_corpus_emails(inp, want):
    """Check each email corpus row, with errors requested as sentinel strings."""
    cleaned, _ = standardize_email(inp, error_policy="sentinel")
    _assert(cleaned, want, inp)


# Expected output per case_id when ``standardize_email`` is called with
# ``gmail_canonical=True`` (see ``test_corpus_emails_gmail_canonical``).
# FE10-FE12 expect the gmail.com local part reduced to "alice"; FE13 / FE14
# keep the same expectation as in ``_EMAIL_EXPECTED``.
_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
    "FE10": "alice@gmail.com",
    "FE11": "alice@gmail.com",
    "FE12": "alice@gmail.com",
    "FE13": "a.l.i.c.e@example.com",      # negative test: don't touch non-Gmail
    "FE14": "alice+newsletter@example.com",  # negative test
}


@pytest.mark.parametrize("inp,want", [
    pytest.param(inputs_by_id[cid], want, id=f"{cid}-gmail-canonical")
    # Single-element outer loop: read and index the corpus once instead of
    # re-reading the CSV for every case. A case id missing from the corpus
    # now raises KeyError naming it rather than a bare StopIteration.
    for inputs_by_id in [
        {r["case_id"]: r["input"] for r in _load("26_format_emails.csv")}
    ]
    for cid, want in _EMAIL_GMAIL_CANONICAL.items()
])
def test_corpus_emails_gmail_canonical(inp, want):
    """Re-run the rows in ``_EMAIL_GMAIL_CANONICAL`` with ``gmail_canonical=True``."""
    got, _ = standardize_email(inp, gmail_canonical=True)
    assert got == want


# ---------------------------------------------------------------------------
# Addresses — 27_format_addresses.csv
# ---------------------------------------------------------------------------

# Expected output per corpus ``case_id`` for the address run (called with
# ``expand=False``). Rows not listed here (e.g. FA29, FA30) default to
# PASSTHROUGH in ``_params``.
_ADDRESS_EXPECTED: dict[str, str] = {
    "FA01": "123 Main St, New York, NY 10001",
    "FA02": "123 Main St, New York, NY 10001",
    "FA03": "123 Main St, New York, NY 10001",
    "FA04": "123 Main St, New York, NY 10001",
    "FA05": "123 Main St, New York, NY 10001",
    "FA06": "456 Park Ave, New York, NY 10001",
    "FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
    "FA08": "123 Main St, New York, NY 10001",
    "FA09": "123 N Main St, City, ST 12345",
    "FA10": "123 N Main St, City, ST 12345",
    "FA11": "123 NE Main St, City, ST 12345",
    "FA12": "123 Main St, Apt 4B, City, ST 12345",
    "FA13": "123 Main St, # 4B, City, ST 12345",
    "FA14": "123 Main St, Ste 200, City, ST 12345",
    "FA15": "123 Main St, New York, NY 10001",
    "FA16": "123 Main St, New York, NY 10001",
    "FA17": "123 Main St, New York, NY 10001-1234",
    "FA18": "123 Main St, Boston, MA 02101",
    "FA19": "123 Main St, Apt 4B, New York, NY 10001",
    "FA20": "PO Box 123, City, ST 12345",
    "FA21": "PO Box 123, City, ST 12345",
    "FA22": "PO Box 123, City, ST 12345",
    "FA23": "123A Main St, City, ST 12345",
    "FA24": "123-1 Main St, City, ST 12345",
    "FA25": "123 1/2 Main St, City, ST 12345",
    "FA26": "10 Downing Street, London, SW1A 2AA",
    "FA27": "1 Yonge St, Toronto, ON M5E 1W7",
    "FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
    "FA31": "123 Main St, New York, NY 10001",
}


@pytest.mark.parametrize("inp,want", _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}))
def test_corpus_addresses(inp, want):
    """Check each address corpus row with ``expand=False``."""
    formatted, _ = standardize_address(inp, expand=False)
    _assert(formatted, want, inp)


# ---------------------------------------------------------------------------
# Names — 28_format_names.csv
# ---------------------------------------------------------------------------

# Expected output per corpus ``case_id`` for the name run. Rows not listed
# here (FN33 / FN34) default to PASSTHROUGH in ``_params``.
_NAME_EXPECTED: dict[str, object] = {
    "FN01": "Alice Smith",
    "FN02": "Alice Smith",
    "FN03": "Alice Smith",
    "FN04": "aLiCe SmItH",          # corpus 7.3 conservative: preserve mixed
    "FN05": "McDonald",
    "FN06": "McDonald",
    "FN07": "MacDonald",
    "FN08": "McTaggart",
    "FN09": "O'Connor",
    "FN10": "O'Connor",
    "FN11": "O'Brien",
    "FN12": "Mary-Jane Smith",
    "FN13": "Smith-Jones",
    "FN14": "von Trapp",
    "FN15": "Vincent van Gogh",
    "FN16": "Charles de Gaulle",
    "FN17": "Leonardo da Vinci",
    "FN18": "Mr John Smith",        # corpus 7.3: drop title period
    "FN19": "Dr Jane Doe",
    "FN20": "Prof Alice Williams",
    "FN21": "John Smith Jr",
    "FN22": "John Smith III",
    "FN23": "Jane Doe PhD",
    "FN24": "John Smith",           # comma-format reversed
    "FN25": "John Smith",
    "FN26": "John Andrew Smith",
    "FN27": "John A Smith",         # corpus 7.3: drop initial period
    "FN28": "J.K. Rowling",
    "FN29": "김철수",
    "FN30": "田中太郎",
    "FN31": "Иван Иванов",
    "FN32": "Madonna",
    # FN33 / FN34 → PASSTHROUGH default
}


@pytest.mark.parametrize("inp,want", _params("28_format_names.csv", _NAME_EXPECTED, {}))
def test_corpus_names(inp, want):
    """Check each name corpus row; only FN04 runs in conservative mode."""
    # FN04 needs conservative=True; every other row uses conservative=False.
    # NOTE(review): the switch keys on the raw input text rather than the
    # case id, so it assumes FN04's corpus input is exactly "aLiCe SmItH"
    # — confirm against the CSV.
    cased, _ = standardize_name(inp, conservative=(inp == "aLiCe SmItH"))
    _assert(cased, want, inp)


# ---------------------------------------------------------------------------
# Currencies — 29_format_currencies.csv
# ---------------------------------------------------------------------------

# Expected output per corpus ``case_id`` for the currency run. Rows not
# listed here (FC24) default to PASSTHROUGH in ``_params``.
_CURRENCY_EXPECTED: dict[str, object] = {
    "FC01": "1234.56",
    "FC02": "1234.56",
    "FC03": "1234.56",
    "FC04": "1234.56",
    "FC05": "1234.56",
    "FC06": "1234.56",
    "FC07": "1234.56",
    "FC08": "1234.56",
    "FC09": "1234.56",
    "FC10": "1234.56",
    "FC11": "1234.56",
    "FC12": "1234.56",
    "FC13": "1234",
    "FC14": "123456.78",
    "FC15": "-100",
    "FC16": "-100",
    "FC17": "-100",
    "FC18": "0",
    "FC19": "1500000",
    "FC20": "<error: percentage not currency>",
    "FC21": "<error: range not normalizable>",
    "FC22": "<error: word value>",
    "FC23": "<error: word value>",
    # FC24 empty → PASSTHROUGH
    "FC25": "1234.56",
    "FC26": "1234",
    "FC27": "<error: ambiguous separator, set --currency-locale>",
}


@pytest.mark.parametrize("inp,want", _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}))
def test_corpus_currencies(inp, want):
    """Check each currency corpus row, with errors requested as sentinel strings."""
    amount, _ = standardize_currency(inp, error_policy="sentinel")
    _assert(amount, want, inp)


def test_corpus_currencies_eu_with_comma_decimal():
    """EU-formatted amounts parse to the same value under ``decimal="comma"``."""
    # FC08, FC10 also parse correctly under decimal="comma".
    for eu_amount in ("€1.234,56", "1.234,56 EUR"):
        parsed, _ = standardize_currency(eu_amount, decimal="comma")
        assert parsed == "1234.56"


# ---------------------------------------------------------------------------
# Integration — 30_format_integration.csv
# ---------------------------------------------------------------------------

def _integration_opts(**overrides) -> StandardizeOptions:
    """Standardize options matching corpus defaults for the integration row.

    Args:
        **overrides: Attribute values to replace on the returned options
            object. Each name must already exist on the object.

    Returns:
        A fresh ``StandardizeOptions`` with the six integration columns
        typed and the overrides applied.

    Raises:
        TypeError: If an override names an attribute the options object
            does not have. A plain ``setattr`` would otherwise accept a
            misspelled name and attach it as a new attribute.
    """
    base = StandardizeOptions(
        column_types={
            "name":    FieldType.NAME,
            "email":   FieldType.EMAIL,
            "phone":   FieldType.PHONE,
            "date":    FieldType.DATE,
            "amount":  FieldType.CURRENCY,
            "address": FieldType.ADDRESS,
        },
        currency_decimals=None,
        address_expand=False,
        date_error_policy="passthrough",
        phone_error_policy="passthrough",
    )
    for k, v in overrides.items():
        if not hasattr(base, k):
            raise TypeError(f"unknown StandardizeOptions override: {k!r}")
        setattr(base, k, v)
    return base


def test_corpus_integration_pipeline_preserves_schema():
    """The pipeline must hand back the same columns, in order, and the same row count."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df

    # Schema preservation (corpus § 0.2): column order intact, and no
    # rows or columns added.
    assert list(cleaned.columns) == list(source.columns)
    assert len(cleaned) == len(source)


def test_corpus_integration_FI01_messy_record():
    """Row 0 (FI01), a messy but cleanable record, comes out fully canonical."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned_row = standardize_dataframe(source, _integration_opts()).standardized_df.iloc[0]
    expected_cells = {
        "name": "Alice Smith",
        "email": "alice@example.com",
        "phone": "+15551234567",
        "date": "2024-01-15",
        "amount": "1234.56",
        "address": "123 Main St, New York, NY 10001",
    }
    # Columns are checked in the order listed; the first mismatch fails the test.
    for column, expected_value in expected_cells.items():
        assert cleaned_row[column] == expected_value


def test_corpus_integration_FI04_all_empty_passthrough():
    """Row 3 (FI04) is all empty cells and must come back all empty."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned_row = standardize_dataframe(source, _integration_opts()).standardized_df.iloc[3]
    for col in ("name", "email", "phone", "date", "amount", "address"):
        cell = cleaned_row[col]
        assert cell == "", f"FI04.{col} expected empty, got {cell!r}"


def test_corpus_integration_FI05_idempotent_on_clean_input():
    """Row 4 (FI05) is already clean; every column must round-trip unchanged."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned_row = standardize_dataframe(source, _integration_opts()).standardized_df.iloc[4]
    source_row = source.iloc[4]
    for col in ("name", "email", "phone", "date", "amount", "address"):
        before, after = source_row[col], cleaned_row[col]
        assert after == before, f"FI05.{col} non-idempotent: {before!r} -> {after!r}"


# ---------------------------------------------------------------------------
# Idempotency property
# ---------------------------------------------------------------------------
#
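# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
# § 1, "Idempotency requirement"). We exercise it across every corpus
# input using each standardizer's default options (addresses additionally
# pass ``expand=False``). Note that this is not the sentinel / locale flag
# set the per-domain tests above use.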

def _idempotency_runner(fn, fixture, **kwargs):
    """Apply *fn* twice to every input in *fixture* and collect disagreements.

    *fn* is called as ``fn(value, **kwargs)`` and must return a pair whose
    first element is the standardized value. Returns a list of
    ``(case_id, input, first_pass, second_pass)`` tuples for the rows where
    the second pass changed the first pass's output.
    """

    def _apply(value):
        result, _ = fn(value, **kwargs)
        return result

    mismatches = []
    for record in _load(fixture):
        first_pass = _apply(record["input"])
        second_pass = _apply(first_pass)
        if first_pass != second_pass:
            mismatches.append(
                (record["case_id"], record["input"], first_pass, second_pass)
            )
    return mismatches


@pytest.mark.parametrize("fn,fixture,kwargs", [
    (standardize_date,     "24_format_dates.csv",     {}),
    (standardize_phone,    "25_format_phones.csv",    {}),
    (standardize_address,  "27_format_addresses.csv", {"expand": False}),
    (standardize_name,     "28_format_names.csv",     {}),
    (standardize_currency, "29_format_currencies.csv",{}),
    (standardize_email,    "26_format_emails.csv",    {}),
])
def test_corpus_idempotency(fn, fixture, kwargs):
    """Each standardizer must be a fixed point on its own output for every row."""
    offenders = _idempotency_runner(fn, fixture, **kwargs)
    # One line per offending row: case id, input, first pass, second pass.
    report = "\n".join(
        f"  {cid}: {inp!r} -> {once!r} -> {twice!r}"
        for cid, inp, once, twice in offenders
    )
    assert not offenders, f"non-idempotent transformations in {fixture}:\n" + report