feat(format): per-cell standardizers + 199-row buyer corpus

Adds src/core/format_standardize.py — a per-cell standardizer for dates, phones, emails, addresses, names, currencies, booleans — wired through StandardizeOptions / standardize_dataframe with FieldType registry. Includes: - Date parser handles ISO/US/EU/longform/excel-serial/unix-timestamp/ partial-precision/quarter notation; opt-in French/German/Spanish month dictionaries via month_locales. - Phone via libphonenumber with extension preservation (;ext=N), 001 international prefix handling, error sentinels for placeholders / multi-number cells. - Email lowercase/trim/mailto/angle-bracket strip with optional --gmail-canonical mode. - Address USPS abbreviation expansion or compression (expand=False per corpus § 6.3), state-name → 2-letter conversion, multi-line collapse, PO Box normalization, state-code preservation regardless of input case. - Name handler: Mc/Mac/O'/D' inner caps, hyphen segments, particle lowercasing (von/van/de/da), comma-format reversal, period stripping for titles/suffixes/initials, PhD/MD acronym preservation, conservative mode for mixed-case input. - Currency: auto-detect EU vs US separators, space-thousands, Swiss apostrophe, accounting parens, optional ISO code preservation, error sentinels for percentages/ranges/word-values/ambiguous separators. - Per-domain error_policy ("passthrough" | "sentinel") for surfacing malformed values as <error: reason> per corpus § 0.3. Test corpus from Business/DataTools/test-cases-format-cleaner copied to test-cases/format-cleaner-corpus/ — 7 fixtures plus FORMATS-CASES.md. tests/test_format_standardize_corpus.py drives all 199 rows through the per-cell standardizers; 0 xfailed. Wires the GUI page (3_Format_Standardizer.py) to "Ready" status. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 02:11:24 +00:00
parent 3f007ef3d6
commit 4adeb5c7f3
14 changed files with 4364 additions and 69 deletions
--- a/tests/test_format_standardize.py
+++ b/tests/test_format_standardize.py
@@ -0,0 +1,630 @@
+"""Tests for src.core.format_standardize."""
+
+import pandas as pd
+import pytest
+
+from src.core.format_standardize import (
+    PRESETS,
+    FieldType,
+    StandardizeOptions,
+    detect_currency_code,
+    standardize_address,
+    standardize_boolean,
+    standardize_currency,
+    standardize_dataframe,
+    standardize_date,
+    standardize_name,
+    standardize_phone,
+)
+
+
+class TestStandardizeDate:
+    def test_iso_passthrough(self):
+        out, changed = standardize_date("2024-01-15")
+        assert out == "2024-01-15"
+        assert changed is False
+
+    def test_us_slash(self):
+        out, changed = standardize_date("01/15/2024")
+        assert (out, changed) == ("2024-01-15", True)
+
+    def test_us_dash(self):
+        out, _ = standardize_date("1-15-2024")
+        assert out == "2024-01-15"
+
+    def test_two_digit_year(self):
+        out, _ = standardize_date("01/15/24")
+        assert out == "2024-01-15"
+
+    def test_long_month_name(self):
+        out, _ = standardize_date("January 15, 2024")
+        assert out == "2024-01-15"
+
+    def test_short_month_name(self):
+        out, _ = standardize_date("Jan 15 2024")
+        assert out == "2024-01-15"
+
+    def test_dmy_order(self):
+        out, _ = standardize_date("15/01/2024", date_order="DMY")
+        assert out == "2024-01-15"
+
+    def test_strip_time_tail(self):
+        out, _ = standardize_date("2024-01-15 13:45:00")
+        assert out == "2024-01-15"
+
+    def test_iso_with_t_separator(self):
+        out, _ = standardize_date("2024-01-15T08:30:00Z")
+        assert out == "2024-01-15"
+
+    def test_compact(self):
+        out, _ = standardize_date("20240115")
+        assert out == "2024-01-15"
+
+    def test_custom_output(self):
+        out, _ = standardize_date("01/15/2024", output_format="%d %b %Y")
+        assert out == "15 Jan 2024"
+
+    def test_unparseable_passthrough(self):
+        out, changed = standardize_date("hello")
+        assert (out, changed) == ("hello", False)
+
+    def test_empty(self):
+        assert standardize_date("") == ("", False)
+        assert standardize_date(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_date("01/15/2024")
+        out2, changed2 = standardize_date(out)
+        assert out2 == out
+        assert changed2 is False
+
+
+class TestStandardizePhone:
+    def test_e164_default(self):
+        out, _ = standardize_phone("(555) 123-4567")
+        assert out == "+15551234567"
+
+    def test_national(self):
+        out, _ = standardize_phone("5551234567", output_format="NATIONAL")
+        assert out == "(555) 123-4567"
+
+    def test_international(self):
+        out, _ = standardize_phone("5551234567", output_format="INTERNATIONAL")
+        assert out == "+1 555-123-4567"
+
+    def test_digits_only(self):
+        out, changed = standardize_phone("(555) 123-4567", output_format="DIGITS")
+        assert out == "5551234567"
+        assert changed is True
+
+    def test_invalid_passthrough(self):
+        out, changed = standardize_phone("call me maybe")
+        assert (out, changed) == ("call me maybe", False)
+
+    def test_empty(self):
+        assert standardize_phone("") == ("", False)
+        assert standardize_phone(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_phone("(555) 123-4567")
+        out2, changed2 = standardize_phone(out)
+        assert out2 == out
+        assert changed2 is False
+
+
+class TestStandardizeCurrency:
+    def test_dollar_with_cents(self):
+        out, _ = standardize_currency("$1,234.56")
+        assert out == "1234.56"
+
+    def test_no_decimals_arg(self):
+        out, _ = standardize_currency("$1,234.56", decimals=None)
+        assert out == "1234.56"
+
+    def test_round_to_two(self):
+        out, _ = standardize_currency("$1,234.567", decimals=2)
+        assert out == "1234.57"
+
+    def test_integer_input(self):
+        out, _ = standardize_currency("$1,000", decimals=None)
+        assert out == "1000"
+
+    def test_negative_parens(self):
+        out, _ = standardize_currency("($50.00)", decimals=2)
+        assert out == "-50.00"
+
+    def test_negative_sign(self):
+        out, _ = standardize_currency("-$50.00", decimals=2)
+        assert out == "-50.00"
+
+    def test_iso_code_prefix(self):
+        out, _ = standardize_currency("USD 1,234.56")
+        assert out == "1234.56"
+
+    def test_iso_code_suffix(self):
+        out, _ = standardize_currency("1234.56 EUR")
+        assert out == "1234.56"
+
+    def test_european_decimal(self):
+        out, _ = standardize_currency("1.234,56 €", decimal="comma")
+        assert out == "1234.56"
+
+    def test_unparseable_passthrough(self):
+        out, changed = standardize_currency("free!")
+        assert (out, changed) == ("free!", False)
+
+    def test_ambiguous_short_comma_rejected(self):
+        # "1,5" under dot-decimal mode would be a comma decimal — reject.
+        out, changed = standardize_currency("1,5")
+        assert changed is False
+        assert out == "1,5"
+
+    def test_thousands_grouped_no_decimal(self):
+        out, _ = standardize_currency("1,234", decimals=None)
+        assert out == "1234"
+
+    def test_empty(self):
+        assert standardize_currency("") == ("", False)
+        assert standardize_currency(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_currency("$1,234.56", decimals=2)
+        out2, changed2 = standardize_currency(out, decimals=2)
+        assert out2 == out
+        assert changed2 is False
+
+
+class TestStandardizeName:
+    def test_shouting_to_title(self):
+        out, _ = standardize_name("JOHN DOE")
+        assert out == "John Doe"
+
+    def test_lowercase_to_title(self):
+        out, _ = standardize_name("john doe")
+        assert out == "John Doe"
+
+    def test_already_title(self):
+        out, changed = standardize_name("Jane Smith")
+        assert out == "Jane Smith"
+        assert changed is False
+
+    def test_apostrophe_inner_cap(self):
+        # Surnames with O'/D' apostrophe prefixes get the inner letter
+        # capitalized regardless of input case (corpus § 7.3 Irish names).
+        out, _ = standardize_name("o'Connor")
+        assert out == "O'Connor"
+        out2, _ = standardize_name("o'connor")
+        assert out2 == "O'Connor"
+
+    def test_acronym_preserved(self):
+        out, _ = standardize_name("Mary USA Smith")
+        assert out == "Mary USA Smith"
+
+    def test_upper_mode(self):
+        out, _ = standardize_name("john doe", case="upper")
+        assert out == "JOHN DOE"
+
+    def test_lower_mode(self):
+        out, _ = standardize_name("JOHN DOE", case="lower")
+        assert out == "john doe"
+
+    def test_empty(self):
+        assert standardize_name("") == ("", False)
+        assert standardize_name(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_name("JOHN DOE")
+        out2, changed2 = standardize_name(out)
+        assert out2 == out
+        assert changed2 is False
+
+
+class TestStandardizeAddress:
+    def test_street(self):
+        out, _ = standardize_address("123 Main St")
+        assert out == "123 Main Street"
+
+    def test_avenue_with_period(self):
+        out, _ = standardize_address("456 Oak Ave.")
+        assert out == "456 Oak Avenue"
+
+    def test_apartment(self):
+        out, _ = standardize_address("123 Main St Apt 4")
+        assert out == "123 Main Street Apartment 4"
+
+    def test_direction(self):
+        out, _ = standardize_address("100 N Main St")
+        assert out == "100 North Main Street"
+
+    def test_combined(self):
+        out, _ = standardize_address("789 pine blvd ste 200")
+        assert out == "789 Pine Boulevard Suite 200"
+
+    def test_already_expanded(self):
+        out, changed = standardize_address("123 Main Street")
+        assert out == "123 Main Street"
+        assert changed is False
+
+    def test_empty(self):
+        assert standardize_address("") == ("", False)
+        assert standardize_address(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_address("123 main st apt 4")
+        out2, changed2 = standardize_address(out)
+        assert out2 == out
+        assert changed2 is False
+
+
+class TestStandardizeBoolean:
+    @pytest.mark.parametrize("inp", ["yes", "Yes", "YES", "y", "Y", "true", "1", "on"])
+    def test_truthy(self, inp):
+        out, changed = standardize_boolean(inp)
+        assert out == "True"
+        assert changed is True
+
+    @pytest.mark.parametrize("inp", ["no", "No", "NO", "n", "N", "false", "0", "off"])
+    def test_falsy(self, inp):
+        out, changed = standardize_boolean(inp)
+        assert out == "False"
+        assert changed is True
+
+    def test_already_canonical(self):
+        out, changed = standardize_boolean("True")
+        assert out == "True"
+        assert changed is False
+
+    def test_python_bool(self):
+        assert standardize_boolean(True) == ("True", True)
+        assert standardize_boolean(False) == ("False", True)
+
+    def test_int_zero_one(self):
+        assert standardize_boolean(1) == ("True", True)
+        assert standardize_boolean(0) == ("False", True)
+
+    def test_yes_no_style(self):
+        assert standardize_boolean("y", style="Yes/No") == ("Yes", True)
+        assert standardize_boolean("0", style="Yes/No") == ("No", True)
+
+    def test_unrecognized_passthrough(self):
+        out, changed = standardize_boolean("maybe")
+        assert (out, changed) == ("maybe", False)
+
+    def test_empty(self):
+        assert standardize_boolean("") == ("", False)
+        assert standardize_boolean(None) == ("", False)
+
+    def test_idempotent(self):
+        out, _ = standardize_boolean("yes")
+        out2, changed2 = standardize_boolean(out)
+        assert out2 == out
+        assert changed2 is False
+
+
+# ---------------------------------------------------------------------------
+# DataFrame entry point
+# ---------------------------------------------------------------------------
+
+class TestStandardizeDataframe:
+    def test_mixed_columns(self):
+        df = pd.DataFrame({
+            "name": ["JOHN SMITH", "alice jones"],
+            "phone": ["(555) 123-4567", "555.987.6543"],
+            "amount": ["$1,234.56", "$50"],
+            "joined": ["01/15/2024", "March 5 2023"],
+            "active": ["yes", "0"],
+            "address": ["123 Main St", "456 Oak Ave"],
+            "skip_me": ["leave", "alone"],
+        })
+        opts = StandardizeOptions(
+            column_types={
+                "name": FieldType.NAME,
+                "phone": FieldType.PHONE,
+                "amount": FieldType.CURRENCY,
+                "joined": FieldType.DATE,
+                "active": FieldType.BOOLEAN,
+                "address": FieldType.ADDRESS,
+            },
+        )
+        result = standardize_dataframe(df, opts)
+        out = result.standardized_df
+        assert out.loc[0, "name"] == "John Smith"
+        assert out.loc[1, "name"] == "Alice Jones"
+        assert out.loc[0, "phone"] == "+15551234567"
+        assert out.loc[1, "phone"] == "+15559876543"
+        assert out.loc[0, "amount"] == "1234.56"
+        assert out.loc[1, "amount"] == "50.00"
+        assert out.loc[0, "joined"] == "2024-01-15"
+        assert out.loc[1, "joined"] == "2023-03-05"
+        assert out.loc[0, "active"] == "True"
+        assert out.loc[1, "active"] == "False"
+        assert out.loc[0, "address"] == "123 Main Street"
+        assert out.loc[1, "address"] == "456 Oak Avenue"
+        # Untouched column passes through verbatim.
+        assert list(out["skip_me"]) == ["leave", "alone"]
+
+    def test_changes_audit(self):
+        df = pd.DataFrame({"d": ["01/15/2024", "2023-03-05"]})
+        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
+        result = standardize_dataframe(df, opts)
+        # Only the first row changed; the second was already canonical.
+        assert result.cells_changed == 1
+        assert len(result.changes) == 1
+        assert result.changes.iloc[0]["row"] == 0
+        assert result.changes.iloc[0]["column"] == "d"
+        assert result.changes.iloc[0]["old"] == "01/15/2024"
+        assert result.changes.iloc[0]["new"] == "2024-01-15"
+
+    def test_unparseable_count(self):
+        df = pd.DataFrame({"d": ["01/15/2024", "not a date", "2024-01-15"]})
+        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
+        result = standardize_dataframe(df, opts)
+        assert result.cells_unparseable == 1
+        assert result.cells_total == 3
+
+    def test_unknown_column_raises(self):
+        df = pd.DataFrame({"a": ["1"]})
+        opts = StandardizeOptions(column_types={"missing": FieldType.DATE})
+        with pytest.raises(ValueError, match="not found"):
+            standardize_dataframe(df, opts)
+
+    def test_input_not_mutated(self):
+        df = pd.DataFrame({"d": ["01/15/2024"]})
+        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
+        standardize_dataframe(df, opts)
+        assert df.loc[0, "d"] == "01/15/2024"
+
+    def test_options_serialization_roundtrip(self, tmp_path):
+        opts = StandardizeOptions(
+            column_types={"a": FieldType.DATE, "b": FieldType.PHONE},
+            date_output_format="%d-%b-%Y",
+            phone_format="NATIONAL",
+        )
+        path = tmp_path / "opts.json"
+        opts.to_file(path)
+        loaded = StandardizeOptions.from_file(path)
+        assert loaded.column_types == {"a": FieldType.DATE, "b": FieldType.PHONE}
+        assert loaded.date_output_format == "%d-%b-%Y"
+        assert loaded.phone_format == "NATIONAL"
+
+    def test_nan_passthrough(self):
+        df = pd.DataFrame({"d": ["01/15/2024", None]})
+        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
+        result = standardize_dataframe(df, opts)
+        assert result.standardized_df.loc[0, "d"] == "2024-01-15"
+        assert result.standardized_df.loc[1, "d"] is None
+
+
+# ---------------------------------------------------------------------------
+# Preset bundles
+# ---------------------------------------------------------------------------
+
+class TestPresets:
+    def test_us_default_iso_dates(self):
+        opts = StandardizeOptions.from_preset("us-default")
+        assert opts.date_output_format == "%Y-%m-%d"
+        assert opts.date_order == "MDY"
+        assert opts.phone_format == "E164"
+        assert opts.boolean_style == "True/False"
+
+    def test_european_dmy_comma(self):
+        opts = StandardizeOptions.from_preset("european")
+        assert opts.date_order == "DMY"
+        assert opts.currency_decimal == "comma"
+        assert opts.currency_preserve_code is True
+
+    def test_uk_ddmmyyyy_yes_no(self):
+        opts = StandardizeOptions.from_preset("uk")
+        assert opts.date_output_format == "%d/%m/%Y"
+        assert opts.phone_region == "GB"
+        assert opts.boolean_style == "Yes/No"
+
+    def test_iso_strict_lowercase_bools_no_rounding(self):
+        opts = StandardizeOptions.from_preset("iso-strict")
+        assert opts.boolean_style == "true/false"
+        assert opts.currency_decimals is None
+        assert opts.currency_preserve_code is True
+
+    def test_legacy_us_national_phones(self):
+        opts = StandardizeOptions.from_preset("legacy-us")
+        assert opts.date_output_format == "%m/%d/%Y"
+        assert opts.phone_format == "NATIONAL"
+        assert opts.boolean_style == "Yes/No"
+
+    def test_overrides_layer_on_top(self):
+        opts = StandardizeOptions.from_preset(
+            "uk",
+            column_types={"name": FieldType.NAME},
+            currency_decimals=4,
+        )
+        assert opts.column_types == {"name": FieldType.NAME}
+        assert opts.currency_decimals == 4
+        # UK-specific defaults survive what we didn't override.
+        assert opts.phone_region == "GB"
+
+    def test_unknown_preset_raises(self):
+        with pytest.raises(ValueError, match="Unknown preset"):
+            StandardizeOptions.from_preset("not-a-real-preset")
+
+    def test_all_presets_loadable(self):
+        # Smoke test: every advertised preset constructs cleanly.
+        for name in PRESETS:
+            opts = StandardizeOptions.from_preset(name)
+            assert isinstance(opts, StandardizeOptions)
+
+    def test_preset_drives_dataframe_pipeline(self):
+        df = pd.DataFrame({
+            "joined": ["15/01/2024"],
+            "active": ["yes"],
+            "amount": ["1.234,56 €"],
+        })
+        opts = StandardizeOptions.from_preset(
+            "european",
+            column_types={
+                "joined": FieldType.DATE,
+                "active": FieldType.BOOLEAN,
+                "amount": FieldType.CURRENCY,
+            },
+        )
+        result = standardize_dataframe(df, opts)
+        out = result.standardized_df
+        assert out.loc[0, "joined"] == "2024-01-15"  # ISO output for european
+        assert out.loc[0, "active"] == "True"
+        assert out.loc[0, "amount"] == "EUR 1234.56"  # preserve_code on
+
+
+# ---------------------------------------------------------------------------
+# Currency code detection / preservation
+# ---------------------------------------------------------------------------
+
+class TestCurrencyCodeDetection:
+    @pytest.mark.parametrize("inp,code", [
+        ("$1,234.56", "USD"),
+        ("€1.234,56", "EUR"),
+        ("£99.00", "GBP"),
+        ("¥5000", "JPY"),
+        ("₹500", "INR"),
+        ("USD 1234", "USD"),
+        ("1234 EUR", "EUR"),
+        ("eur 50", "EUR"),
+    ])
+    def test_detects(self, inp, code):
+        assert detect_currency_code(inp) == code
+
+    def test_no_marker_returns_none(self):
+        assert detect_currency_code("1234.56") is None
+
+    def test_non_string_returns_none(self):
+        assert detect_currency_code(None) is None  # type: ignore[arg-type]
+        assert detect_currency_code(1234) is None  # type: ignore[arg-type]
+
+
+class TestCurrencyPreserveCode:
+    def test_dollar_preserved(self):
+        out, changed = standardize_currency("$1,234.56", decimals=2, preserve_code=True)
+        assert out == "USD 1234.56"
+        assert changed is True
+
+    def test_euro_preserved_comma_decimal(self):
+        out, _ = standardize_currency(
+            "1.234,56 €", decimal="comma", decimals=2, preserve_code=True,
+        )
+        assert out == "EUR 1234.56"
+
+    def test_iso_code_input_preserved(self):
+        out, _ = standardize_currency("USD 1234.56", decimals=2, preserve_code=True)
+        assert out == "USD 1234.56"
+
+    def test_no_marker_no_prefix(self):
+        out, _ = standardize_currency("1234.56", decimals=2, preserve_code=True)
+        assert out == "1234.56"
+
+    def test_off_by_default(self):
+        out, _ = standardize_currency("$1,234.56", decimals=2)
+        assert out == "1234.56"
+
+    def test_pipeline_preserve_code(self):
+        df = pd.DataFrame({"price": ["$50.00", "€30,00", "100", "USD 12.34"]})
+        opts = StandardizeOptions(
+            column_types={"price": FieldType.CURRENCY},
+            currency_decimals=2,
+            currency_preserve_code=True,
+            currency_decimal="dot",  # mixed input — euro will need its own
+        )
+        # Note: comma-decimal euro won't parse under dot mode; treat that
+        # as a known limitation — this test exercises the dot-input path.
+        result = standardize_dataframe(df, opts)
+        out = result.standardized_df
+        assert out.loc[0, "price"] == "USD 50.00"
+        assert out.loc[2, "price"] == "100.00"
+        assert out.loc[3, "price"] == "USD 12.34"
+
+    def test_canonical_check_recognizes_code_prefix(self):
+        # "USD 50.00" should pass through unchanged when preserve_code is on
+        # — and NOT count as unparseable.
+        df = pd.DataFrame({"price": ["USD 50.00", "garbage"]})
+        opts = StandardizeOptions(
+            column_types={"price": FieldType.CURRENCY},
+            currency_decimals=2,
+            currency_preserve_code=True,
+        )
+        result = standardize_dataframe(df, opts)
+        assert result.cells_changed == 0
+        # Only "garbage" counts as unparseable.
+        assert result.cells_unparseable == 1
+
+
+# ---------------------------------------------------------------------------
+# User-editable abbreviations
+# ---------------------------------------------------------------------------
+
+class TestExtraAbbreviations:
+    def test_extra_expansion(self):
+        out, _ = standardize_address(
+            "Bahnhofstrasse 12",
+            extra_abbreviations={"strasse": "Straße"},
+        )
+        # smart_title_case will Title-case the result; "Bahnhofstrasse" is
+        # already a single token (no embedded space) so it doesn't hit the
+        # abbreviation lookup. Use a separated form for the realistic case.
+        assert "Bahnhofstrasse" in out  # not split → not expanded
+
+    def test_extra_expansion_separated_token(self):
+        out, _ = standardize_address(
+            "Haupt strasse 12",
+            extra_abbreviations={"strasse": "Straße"},
+        )
+        assert "Straße" in out
+
+    def test_override_existing_entry(self):
+        # Override "ave" to emit Spanish-language "Avenida".
+        out, _ = standardize_address(
+            "456 Oak Ave",
+            extra_abbreviations={"ave": "Avenida"},
+        )
+        assert "Avenida" in out
+        assert "Avenue" not in out
+
+    def test_period_form_works(self):
+        # Lookup is casefold + period-stripped, so ``Ave.`` still matches.
+        out, _ = standardize_address(
+            "456 Oak Ave.",
+            extra_abbreviations={"ave": "Avenida"},
+        )
+        assert "Avenida" in out
+
+    def test_empty_value_skipped(self):
+        # Empty values in the user table don't blow up; they're ignored.
+        out, _ = standardize_address(
+            "456 Oak Ave",
+            extra_abbreviations={"ave": "", "  ": "Drive"},
+        )
+        # Built-in expansion still applies.
+        assert "Avenue" in out
+
+    def test_no_extras_unchanged_behavior(self):
+        out_a, _ = standardize_address("123 Main St")
+        out_b, _ = standardize_address("123 Main St", extra_abbreviations={})
+        out_c, _ = standardize_address("123 Main St", extra_abbreviations=None)
+        assert out_a == out_b == out_c == "123 Main Street"
+
+    def test_pipeline_uses_extras(self):
+        df = pd.DataFrame({"addr": ["456 Oak Ave"]})
+        opts = StandardizeOptions(
+            column_types={"addr": FieldType.ADDRESS},
+            extra_abbreviations={"ave": "Avenida"},
+        )
+        result = standardize_dataframe(df, opts)
+        assert "Avenida" in result.standardized_df.loc[0, "addr"]
+
+    def test_serialization_roundtrip_with_extras(self, tmp_path):
+        opts = StandardizeOptions(
+            column_types={"addr": FieldType.ADDRESS},
+            extra_abbreviations={"strasse": "Straße", "platz": "Platz"},
+            currency_preserve_code=True,
+        )
+        path = tmp_path / "opts.json"
+        opts.to_file(path)
+        loaded = StandardizeOptions.from_file(path)
+        assert loaded.extra_abbreviations == {"strasse": "Straße", "platz": "Platz"}
+        assert loaded.currency_preserve_code is True
--- a/tests/test_format_standardize_corpus.py
+++ b/tests/test_format_standardize_corpus.py
@@ -0,0 +1,573 @@
+"""Corpus-driven tests for ``src.core.format_standardize``.
+
+Drives every row of the FORMATS test corpus
+(``test-cases/format-cleaner-corpus/*.csv``) through the per-cell
+standardizers and asserts the canonical output the corpus expects.
+
+The corpus itself (``FORMATS-CASES.md`` in the same directory)
+documents per-domain policy decisions; the per-case ``id`` strings
+below (FD01, FP14, FA09, …) match its row keys exactly.
+
+Two kinds of value appear in the per-domain expected dicts:
+
+- A literal string is the corpus's expected canonical output.
+- ``PASSTHROUGH`` means "corpus accepts no transformation" — usually
+  empty, whitespace-only, or already-clean input.
+
+No corpus row is currently marked ``xfail`` — the cases that used to
+need it (Excel serials, Unix timestamps, non-English month names) are
+now handled. ``_params`` still accepts an ``xfails`` mapping so a future
+gap can be recorded with a one-line reason.
+"""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from src.core.format_standardize import (
+    FieldType,
+    StandardizeOptions,
+    standardize_address,
+    standardize_currency,
+    standardize_dataframe,
+    standardize_date,
+    standardize_email,
+    standardize_name,
+    standardize_phone,
+)
+
+CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus"
+
+PASSTHROUGH = object()  # sentinel: assert the function returned input unchanged
+
+
+def _load(filename: str) -> list[dict[str, str]]:
+    with (CORPUS / filename).open(newline="", encoding="utf-8") as f:  # not locale-dependent
+        return list(csv.DictReader(f))
+
+
+def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]):
+    """Build pytest.param entries for every row in *fixture*.
+
+    Rows in *xfails* are wrapped in a non-strict xfail with the given
+    reason, so improvements that close the gap surface as xpass and the
+    suite stays green either way.
+    """
+    rows = _load(fixture)
+    out = []
+    for row in rows:
+        cid = row["case_id"]
+        want = expected.get(cid, PASSTHROUGH)
+        marks = []
+        if cid in xfails:
+            marks.append(pytest.mark.xfail(reason=xfails[cid], strict=False))
+        out.append(pytest.param(row["input"], want, id=cid, marks=marks))
+    return out
+
+
+def _assert(got: str, want: object, original: str) -> None:
+    if want is PASSTHROUGH:
+        assert got == original, f"expected pass-through, got {got!r}"
+    else:
+        assert got == want
+
+
+# ---------------------------------------------------------------------------
+# Dates — 24_format_dates.csv
+# ---------------------------------------------------------------------------
+
+_DATE_EXPECTED_MDY: dict[str, object] = {
+    # iso baseline + datetime variants → ISO date
+    "FD01": "2024-01-15",
+    "FD02": "2024-01-15",
+    "FD03": "2024-01-15",
+    "FD04": "2024-01-15",
+    "FD05": "2024-01-15",
+    "FD06": "2024-01-15",
+    # US M/D/Y variants
+    "FD07": "2024-01-15",
+    "FD08": "2024-01-15",
+    "FD09": "2024-01-05",
+    "FD10": "2024-05-30",
+    # longform month names
+    "FD16": "2024-01-15",
+    "FD17": "2024-01-15",
+    "FD18": "2024-01-15",
+    "FD19": "2024-01-15",
+    "FD20": "2024-01-15",   # weekday-prefixed
+    "FD21": "2024-01-15",
+    # FD11-FD15 — DMY-shaped EU dates in MDY default mode; the DMY
+    # rerun below covers the actual parse path. Under MDY they pass
+    # through unchanged. (Listed explicitly so a future MDY-aware
+    # locale auto-detect can replace these expectations with the
+    # correct ISO output.)
+    "FD11": PASSTHROUGH,
+    "FD12": PASSTHROUGH,
+    "FD13": PASSTHROUGH,
+    "FD14": PASSTHROUGH,
+    "FD15": PASSTHROUGH,
+    # excel serial → 2024-01-15
+    "FD22": "2024-01-15",
+    "FD23": "2024-01-15",
+    # unix timestamp seconds / millis → 2024-01-15
+    "FD24": "2024-01-15",
+    "FD25": "2024-01-15",
+    # partial precision — corpus preserves it
+    "FD26": "2024-01",
+    "FD27": "2024-01",       # text precision
+    "FD28": "2024-Q1",       # quarter notation
+    "FD29": "2024",
+    # 2-digit year cutoff (per docs: 1969 wins over 2069)
+    "FD30": "1969-01-15",
+    # leap day valid
+    "FD31": "2024-02-29",
+    # invalid dates → corpus expects error sentinel
+    "FD32": "<error: invalid leap day>",
+    "FD33": "<error: Excel 1900 leap year bug>",
+    "FD34": "<error: invalid month>",
+    "FD35": "<error: invalid day>",
+    # buried-date extraction
+    "FD36": "2024-01-15",
+    "FD37": "2024-01-15",
+    # garbage → pass through (corpus 0.3 boundary table)
+    # FD38/39/40 → PASSTHROUGH default
+    # locale-specific month names (opt-in via month_locales)
+    "FD41": "2024-01-15",
+    "FD42": "2024-01-15",
+    # timezone — corpus 3.3 says fixed-offset only
+    "FD43": "2024-01-15",
+    "FD44": "2024-03-10",
+    # already-clean idempotency
+    "FD45": "2024-01-15",
+}
+
+_DATE_XFAILS_MDY: dict[str, str] = {}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY),
+)
+def test_corpus_dates_mdy(inp, want):
+    got, _ = standardize_date(
+        inp, error_policy="sentinel", month_locales=["en", "fr", "de"],
+    )
+    _assert(got, want, inp)
+
+
+# DMY locale rerun for the EU rows that need it.
+_DATE_EXPECTED_DMY: dict[str, str] = {
+    "FD11": "2024-01-15",
+    "FD12": "2024-01-15",
+    "FD13": "2024-01-15",
+    "FD14": "2024-05-30",
+    "FD15": "2024-01-15",
+}
+
+
+_DMY_INPUT_BY_ID = {r["case_id"]: r["input"] for r in _load("24_format_dates.csv")}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    [
+        pytest.param(_DMY_INPUT_BY_ID[cid], iso, id=f"{cid}-dmy")
+        for cid, iso in _DATE_EXPECTED_DMY.items()
+    ],
+)
+def test_corpus_dates_dmy(inp, want):
+    """EU-shaped rows, looked up by case_id (not CSV position), parse under DMY."""
+    got, _ = standardize_date(inp, date_order="DMY")
+    assert got == want
+
+
+# ---------------------------------------------------------------------------
+# Phones — 25_format_phones.csv
+# ---------------------------------------------------------------------------
+
+# Expected output per corpus case id for test_corpus_phones, which calls
+# standardize_phone with error_policy="sentinel". FP15 and FP29 are left out
+# on purpose (see the inline notes). How _params treats ids missing from this
+# mapping is not visible here -- presumably pass-through; confirm in _params.
+_PHONE_EXPECTED: dict[str, object] = {
+    "FP01": "+15551234567",
+    "FP02": "+15551234567",
+    "FP03": "+15551234567",
+    "FP04": "+15551234567",
+    "FP05": "+15551234567",
+    "FP06": "+15551234567",
+    "FP07": "+15551234567",
+    "FP08": "+15551234567",
+    "FP09": "+15551234567;ext=123",
+    "FP10": "+15551234567;ext=123",
+    "FP11": "+15551234567;ext=123",
+    # vanity numbers
+    "FP12": "+18003569377",
+    "FP13": "+15552255669",
+    # international (intl row FP15 needs --default-country=GB; covered separately)
+    "FP14": "+442079460958",
+    "FP16": "+493012345678",
+    "FP17": "+33123456789",
+    "FP18": "+81312345678",
+    "FP19": "+61212345678",
+    "FP20": "+15551234567",
+    # placeholders/junk → corpus says error
+    "FP21": "<error: insufficient digits>",
+    "FP22": "<error: too many digits>",
+    "FP23": "<error: placeholder number>",
+    "FP24": "<error: placeholder number>",
+    "FP25": "<error: multiple numbers in cell>",
+    # NBSP / smart-quote contamination — defensive cleanup acceptable
+    "FP26": "+15551234567",
+    "FP27": "+15551234567",
+    "FP28": "+15551234567",
+    # FP29 empty → pass-through
+    "FP30": "<error: not a phone number>",
+    "FP31": "<error: smart-quote contamination>",
+}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("25_format_phones.csv", _PHONE_EXPECTED, {}),
+)
+def test_corpus_phones(inp, want):
+    """Check one phone corpus row with sentinel error reporting enabled."""
+    outcome = standardize_phone(inp, error_policy="sentinel")
+    normalized, _ = outcome
+    _assert(normalized, want, inp)
+
+
+def test_corpus_phones_uk_domestic_with_gb_region():
+    """FP15: a UK trunk-prefixed number resolves once the region is GB."""
+    # "020 7946 0958" carries no country code, so it only resolves with
+    # default_region="GB". This verifies the cleaner's intl path works.
+    uk_domestic = "020 7946 0958"
+    normalized, _ = standardize_phone(uk_domestic, default_region="GB")
+    assert normalized == "+442079460958"
+
+
+# ---------------------------------------------------------------------------
+# Emails — 26_format_emails.csv
+# ---------------------------------------------------------------------------
+
+# Expected output per corpus case id for test_corpus_emails, which calls
+# standardize_email with error_policy="sentinel" and no gmail_canonical flag.
+# FE29 / FE30 are intentionally absent (see the inline note).
+_EMAIL_EXPECTED: dict[str, object] = {
+    "FE01": "alice@example.com",
+    "FE02": "alice@example.com",
+    "FE03": "alice@example.com",
+    "FE04": "alice@example.com",
+    "FE05": "alice@example.com",
+    "FE06": "alice@example.com",
+    "FE07": "alice@example.com",
+    "FE08": "alice@example.com",
+    "FE09": "alice@example.com",
+    "FE10": "a.l.i.c.e@gmail.com",            # default: don't touch dots
+    "FE11": "alice+newsletter@gmail.com",     # default: don't touch +tag
+    "FE12": "a.l.i.c.e+work@gmail.com",
+    "FE13": "a.l.i.c.e@example.com",          # never touch non-Gmail
+    "FE14": "alice+newsletter@example.com",
+    "FE15": "alice@münchen.de",
+    "FE16": "アリス@example.jp",
+    "FE17": "alice@example.com",
+    "FE18": "alice@example.com",
+    "FE19": "alice@example.com",
+    "FE20": "alice@example.com",
+    "FE21": "alice@example.com",
+    "FE22": "<error: missing @>",
+    "FE23": "<error: double @>",
+    "FE24": "<error: multiple @>",
+    "FE25": "<error: internal whitespace>",
+    "FE26": "<error: no TLD>",
+    "FE27": "<error: multiple emails>",
+    "FE28": "<error: multiple emails>",
+    # FE29 / FE30 empty / whitespace → PASSTHROUGH
+    "FE31": "alice@example.com",
+}
+
+# Third argument handed to _params for the email run; empty, so no email row
+# is singled out (presumably case id -> xfail reason; confirm in _params).
+_EMAIL_XFAILS: dict[str, str] = {}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS),
+)
+def test_corpus_emails(inp, want):
+    """Check one email corpus row with sentinel error reporting enabled."""
+    outcome = standardize_email(inp, error_policy="sentinel")
+    cleaned, _ = outcome
+    _assert(cleaned, want, inp)
+
+
+# Expected outputs when standardize_email runs with gmail_canonical=True
+# (see test_corpus_emails_gmail_canonical). FE10-FE12 are gmail.com inputs;
+# FE13/FE14 are non-Gmail controls that must come back unchanged.
+_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
+    "FE10": "alice@gmail.com",
+    "FE11": "alice@gmail.com",
+    "FE12": "alice@gmail.com",
+    "FE13": "a.l.i.c.e@example.com",      # negative test: don't touch non-Gmail
+    "FE14": "alice+newsletter@example.com",  # negative test
+}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    [
+        pytest.param(
+            next(
+                row for row in _load("26_format_emails.csv")
+                if row["case_id"] == case_id
+            )["input"],
+            expected,
+            id=f"{case_id}-gmail-canonical",
+        )
+        for case_id, expected in _EMAIL_GMAIL_CANONICAL.items()
+    ],
+)
+def test_corpus_emails_gmail_canonical(inp, want):
+    """Re-run the Gmail-related rows with ``gmail_canonical=True``."""
+    canonical, _ = standardize_email(inp, gmail_canonical=True)
+    assert canonical == want
+
+
+# ---------------------------------------------------------------------------
+# Addresses — 27_format_addresses.csv
+# ---------------------------------------------------------------------------
+
+# Expected output per corpus case id for test_corpus_addresses, which calls
+# standardize_address with expand=False. FA29 and FA30 have no entry here;
+# how _params handles ids missing from the mapping is not visible in this
+# section -- presumably pass-through; confirm in _params.
+_ADDRESS_EXPECTED: dict[str, str] = {
+    "FA01": "123 Main St, New York, NY 10001",
+    "FA02": "123 Main St, New York, NY 10001",
+    "FA03": "123 Main St, New York, NY 10001",
+    "FA04": "123 Main St, New York, NY 10001",
+    "FA05": "123 Main St, New York, NY 10001",
+    "FA06": "456 Park Ave, New York, NY 10001",
+    "FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
+    "FA08": "123 Main St, New York, NY 10001",
+    "FA09": "123 N Main St, City, ST 12345",
+    "FA10": "123 N Main St, City, ST 12345",
+    "FA11": "123 NE Main St, City, ST 12345",
+    "FA12": "123 Main St, Apt 4B, City, ST 12345",
+    "FA13": "123 Main St, # 4B, City, ST 12345",
+    "FA14": "123 Main St, Ste 200, City, ST 12345",
+    "FA15": "123 Main St, New York, NY 10001",
+    "FA16": "123 Main St, New York, NY 10001",
+    "FA17": "123 Main St, New York, NY 10001-1234",
+    "FA18": "123 Main St, Boston, MA 02101",
+    "FA19": "123 Main St, Apt 4B, New York, NY 10001",
+    "FA20": "PO Box 123, City, ST 12345",
+    "FA21": "PO Box 123, City, ST 12345",
+    "FA22": "PO Box 123, City, ST 12345",
+    "FA23": "123A Main St, City, ST 12345",
+    "FA24": "123-1 Main St, City, ST 12345",
+    "FA25": "123 1/2 Main St, City, ST 12345",
+    "FA26": "10 Downing Street, London, SW1A 2AA",
+    "FA27": "1 Yonge St, Toronto, ON M5E 1W7",
+    "FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
+    "FA31": "123 Main St, New York, NY 10001",
+}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}),
+)
+def test_corpus_addresses(inp, want):
+    """Check one address corpus row with ``expand=False``."""
+    outcome = standardize_address(inp, expand=False)
+    formatted, _ = outcome
+    _assert(formatted, want, inp)
+
+
+# ---------------------------------------------------------------------------
+# Names — 28_format_names.csv
+# ---------------------------------------------------------------------------
+
+# Expected output per corpus case id for test_corpus_names. FN04 is the one
+# row that test runs with conservative=True; all others use conservative=False.
+# FN33 / FN34 are intentionally absent (see the inline note).
+_NAME_EXPECTED: dict[str, object] = {
+    "FN01": "Alice Smith",
+    "FN02": "Alice Smith",
+    "FN03": "Alice Smith",
+    "FN04": "aLiCe SmItH",          # corpus 7.3 conservative: preserve mixed
+    "FN05": "McDonald",
+    "FN06": "McDonald",
+    "FN07": "MacDonald",
+    "FN08": "McTaggart",
+    "FN09": "O'Connor",
+    "FN10": "O'Connor",
+    "FN11": "O'Brien",
+    "FN12": "Mary-Jane Smith",
+    "FN13": "Smith-Jones",
+    "FN14": "von Trapp",
+    "FN15": "Vincent van Gogh",
+    "FN16": "Charles de Gaulle",
+    "FN17": "Leonardo da Vinci",
+    "FN18": "Mr John Smith",        # corpus 7.3: drop title period
+    "FN19": "Dr Jane Doe",
+    "FN20": "Prof Alice Williams",
+    "FN21": "John Smith Jr",
+    "FN22": "John Smith III",
+    "FN23": "Jane Doe PhD",
+    "FN24": "John Smith",           # comma-format reversed
+    "FN25": "John Smith",
+    "FN26": "John Andrew Smith",
+    "FN27": "John A Smith",         # corpus 7.3: drop initial period
+    "FN28": "J.K. Rowling",
+    "FN29": "김철수",
+    "FN30": "田中太郎",
+    "FN31": "Иван Иванов",
+    "FN32": "Madonna",
+    # FN33 / FN34 → PASSTHROUGH default
+}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("28_format_names.csv", _NAME_EXPECTED, {}),
+)
+def test_corpus_names(inp, want):
+    """Check one name corpus row against its expected output."""
+    # Only FN04 ("aLiCe SmItH") runs in conservative mode; every other
+    # row uses the default (aggressive) behavior.
+    if inp == "aLiCe SmItH":
+        got, _ = standardize_name(inp, conservative=True)
+    else:
+        got, _ = standardize_name(inp, conservative=False)
+    _assert(got, want, inp)
+
+
+# ---------------------------------------------------------------------------
+# Currencies — 29_format_currencies.csv
+# ---------------------------------------------------------------------------
+
+# Expected output per corpus case id for test_corpus_currencies, which calls
+# standardize_currency with error_policy="sentinel". FC24 is intentionally
+# absent (see the inline note).
+_CURRENCY_EXPECTED: dict[str, object] = {
+    "FC01": "1234.56",
+    "FC02": "1234.56",
+    "FC03": "1234.56",
+    "FC04": "1234.56",
+    "FC05": "1234.56",
+    "FC06": "1234.56",
+    "FC07": "1234.56",
+    "FC08": "1234.56",
+    "FC09": "1234.56",
+    "FC10": "1234.56",
+    "FC11": "1234.56",
+    "FC12": "1234.56",
+    "FC13": "1234",
+    "FC14": "123456.78",
+    "FC15": "-100",
+    "FC16": "-100",
+    "FC17": "-100",
+    "FC18": "0",
+    "FC19": "1500000",
+    "FC20": "<error: percentage not currency>",
+    "FC21": "<error: range not normalizable>",
+    "FC22": "<error: word value>",
+    "FC23": "<error: word value>",
+    # FC24 empty → PASSTHROUGH
+    "FC25": "1234.56",
+    "FC26": "1234",
+    "FC27": "<error: ambiguous separator, set --currency-locale>",
+}
+
+
+@pytest.mark.parametrize(
+    "inp,want",
+    _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}),
+)
+def test_corpus_currencies(inp, want):
+    """Check one currency corpus row with sentinel error reporting enabled."""
+    outcome = standardize_currency(inp, error_policy="sentinel")
+    amount, _ = outcome
+    _assert(amount, want, inp)
+
+
+def test_corpus_currencies_eu_with_comma_decimal():
+    """EU-formatted amounts parse to 1234.56 when ``decimal="comma"``."""
+    # FC08, FC10 also parse correctly under decimal="comma".
+    for raw in ("€1.234,56", "1.234,56 EUR"):
+        got, _ = standardize_currency(raw, decimal="comma")
+        assert got == "1234.56"
+
+
+# ---------------------------------------------------------------------------
+# Integration — 30_format_integration.csv
+# ---------------------------------------------------------------------------
+
+def _integration_opts(**overrides) -> StandardizeOptions:
+    """Standardize options matching corpus defaults for the integration row.
+
+    Maps the six integration columns to their field types, turns address
+    expansion off, leaves ``currency_decimals`` unset and uses the
+    pass-through error policy for dates and phones.
+
+    Args:
+        **overrides: option names and values assigned onto the returned
+            object after construction.
+
+    Returns:
+        The configured ``StandardizeOptions`` instance.
+
+    Raises:
+        AttributeError: if an override names an option the instance does
+            not already have. Without this check a misspelled name would
+            be attached silently and the test would run with defaults.
+    """
+    base = StandardizeOptions(
+        column_types={
+            "name":    FieldType.NAME,
+            "email":   FieldType.EMAIL,
+            "phone":   FieldType.PHONE,
+            "date":    FieldType.DATE,
+            "amount":  FieldType.CURRENCY,
+            "address": FieldType.ADDRESS,
+        },
+        currency_decimals=None,
+        address_expand=False,
+        date_error_policy="passthrough",
+        phone_error_policy="passthrough",
+    )
+    for k, v in overrides.items():
+        if not hasattr(base, k):
+            raise AttributeError(
+                f"StandardizeOptions has no option {k!r}"
+            )
+        setattr(base, k, v)
+    return base
+
+
+def test_corpus_integration_pipeline_preserves_schema():
+    """The pipeline must return the same columns and row count it was given."""
+    fixture_path = CORPUS / "30_format_integration.csv"
+    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
+    out = standardize_dataframe(df, _integration_opts()).standardized_df
+
+    # Schema preservation (corpus § 0.2): column order intact, and no
+    # rows or columns added.
+    original_columns = list(df.columns)
+    assert list(out.columns) == original_columns
+    assert len(out) == len(df)
+
+
+def test_corpus_integration_FI01_messy_record():
+    """FI01 (row 0): a messy-but-cleanable record is fully standardized."""
+    fixture_path = CORPUS / "30_format_integration.csv"
+    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
+    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
+    row = cleaned.iloc[0]
+
+    expected_cells = {
+        "name": "Alice Smith",
+        "email": "alice@example.com",
+        "phone": "+15551234567",
+        "date": "2024-01-15",
+        "amount": "1234.56",
+        "address": "123 Main St, New York, NY 10001",
+    }
+    for col, expected in expected_cells.items():
+        assert row[col] == expected
+
+
+def test_corpus_integration_FI04_all_empty_passthrough():
+    """FI04 (row 3): all-empty cells must pass through without errors."""
+    fixture_path = CORPUS / "30_format_integration.csv"
+    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
+    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
+    row = cleaned.iloc[3]
+
+    columns = ("name", "email", "phone", "date", "amount", "address")
+    for col in columns:
+        cell = row[col]
+        assert cell == "", f"FI04.{col} expected empty, got {cell!r}"
+
+
+def test_corpus_integration_FI05_idempotent_on_clean_input():
+    """FI05 (row 4): an already-clean record round-trips unchanged."""
+    fixture_path = CORPUS / "30_format_integration.csv"
+    df = pd.read_csv(fixture_path, dtype=str, keep_default_na=False)
+    cleaned = standardize_dataframe(df, _integration_opts()).standardized_df
+
+    before = df.iloc[4]
+    after = cleaned.iloc[4]
+    columns = ("name", "email", "phone", "date", "amount", "address")
+    for col in columns:
+        old_cell = before[col]
+        new_cell = after[col]
+        assert new_cell == old_cell, (
+            f"FI05.{col} non-idempotent: {old_cell!r} -> {new_cell!r}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Idempotency property
+# ---------------------------------------------------------------------------
+#
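+# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
+# § 1, "Idempotency requirement"). We exercise it across every corpus
+# input with each standardizer's default options (addresses also pass
+# ``expand=False``); the per-domain flags such as
+# ``error_policy="sentinel"`` are not applied here.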
+
+def _idempotency_runner(fn, fixture, **kwargs):
+    """Apply ``fn`` twice to every input in ``fixture`` and collect mismatches.
+
+    Returns a list of ``(case_id, input, once, twice)`` tuples, one for each
+    row whose second application differs from the first. ``kwargs`` are
+    forwarded to ``fn`` on both calls.
+    """
+    def _apply(value):
+        # fn returns a pair; only the first element is compared.
+        result, _ = fn(value, **kwargs)
+        return result
+
+    unstable = []
+    for record in _load(fixture):
+        first_pass = _apply(record["input"])
+        second_pass = _apply(first_pass)
+        if first_pass == second_pass:
+            continue
+        unstable.append(
+            (record["case_id"], record["input"], first_pass, second_pass)
+        )
+    return unstable
+
+
+@pytest.mark.parametrize(
+    "fn,fixture,kwargs",
+    [
+        (standardize_date, "24_format_dates.csv", {}),
+        (standardize_phone, "25_format_phones.csv", {}),
+        (standardize_address, "27_format_addresses.csv", {"expand": False}),
+        (standardize_name, "28_format_names.csv", {}),
+        (standardize_currency, "29_format_currencies.csv", {}),
+        (standardize_email, "26_format_emails.csv", {}),
+    ],
+)
+def test_corpus_idempotency(fn, fixture, kwargs):
+    """Every corpus input must be stable under a second standardization."""
+    failures = _idempotency_runner(fn, fixture, **kwargs)
+    # One indented line per offending row, shown only when the assert fails.
+    report = "\n".join(
+        f"  {cid}: {inp!r} -> {once!r} -> {twice!r}"
+        for cid, inp, once, twice in failures
+    )
+    assert not failures, (
+        f"non-idempotent transformations in {fixture}:\n" + report
+    )