datatools-dev/src/core/format_standardize.py

"""Format standardization for tabular data.

Per-cell standardizers turn messy free-form values into a single canonical
representation: dates → ISO ``YYYY-MM-DD``, phones → E.164 (or other
formats from ``phonenumbers``), currency → bare numeric strings, names →
``Title Case``, addresses → expanded USPS forms (``St.`` → ``Street``),
booleans → ``True``/``False``.

Each per-cell function is ``str -> tuple[str, bool]`` — returning
``(new_value, changed)`` so the DataFrame-level pipeline can audit which
cells were rewritten and which it left alone (unparseable input passes
through). All standardizers handle ``None``/empty gracefully and are
idempotent (applying twice yields the same result as once).

The DataFrame entry point :func:`standardize_dataframe` mirrors
:func:`src.core.text_clean.clean_dataframe` in shape: per-column type
assignments drive the pipeline, the input DataFrame is not mutated, and
a :class:`StandardizeResult` carries both the rewritten frame and a
row-by-row change audit.
"""

from __future__ import annotations

import json
import re

from loguru import logger
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Iterable, Literal, Optional

import pandas as pd
import phonenumbers

from .text_clean import smart_title_case


# ---------------------------------------------------------------------------
# Field-type registry
# ---------------------------------------------------------------------------

class FieldType(str, Enum):
    """The kinds of values the standardizer knows how to canonicalize.

    The ``str`` mix-in makes each member compare equal to its lowercase
    string value (``FieldType.DATE == "date"``).
    """

    DATE = "date"          # see standardize_date
    PHONE = "phone"        # see standardize_phone
    CURRENCY = "currency"  # see standardize_currency
    NAME = "name"
    ADDRESS = "address"
    BOOLEAN = "boolean"
    EMAIL = "email"        # NOTE(review): not listed in the module docstring — confirm handling


# Shared error-policy helper used by every per-domain standardizer.
# Returns ``(<error: reason>, changed)`` under the ``"sentinel"`` policy
# and ``(value, False)`` under ``"passthrough"`` so unparseable input
# survives unchanged.
def _err_or_passthrough(
    reason: str, value: str, policy: str,
) -> tuple[str, bool]:
    if policy == "sentinel":
        sentinel = f"<error: {reason}>"
        return sentinel, sentinel != value
    return value, False


# ---------------------------------------------------------------------------
# Date
# ---------------------------------------------------------------------------

# Candidate ``strptime`` formats, tried top to bottom by ``_try_parse_date``;
# the first format that parses wins. Order matters: longer / more-specific
# formats first, and two-digit-year formats sit below their four-digit
# counterparts. The compact ``%Y%m%d`` form is tried last.
#
# Month-first list: used for every ``date_order`` other than ``"DMY"``.
# Note it has no dotted two-digit-year entry (``%m.%d.%y``), unlike the
# day-first list below.
_DATE_FORMATS_MDY = [
    "%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d",
    "%m/%d/%Y", "%m-%d-%Y", "%m.%d.%Y",
    "%m/%d/%y", "%m-%d-%y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%d %B %Y", "%d %b %Y",
    "%d-%b-%Y", "%d-%b-%y",
    "%Y%m%d",
]

# Day-first list: used when ``date_order == "DMY"``. Year-first (ISO-shaped)
# formats are shared with the month-first list, so they parse identically
# under either ordering.
_DATE_FORMATS_DMY = [
    "%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d",
    "%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y",
    "%d/%m/%y", "%d-%m-%y", "%d.%m.%y",
    "%d %B %Y", "%d %b %Y",
    "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
    "%d-%b-%Y", "%d-%b-%y",
    "%Y%m%d",
]

# Weekday-prefixed long form: ``Monday, January 15, 2024``.
# Matches a three-letter English weekday stem plus an optional suffix,
# an optional comma, and at least one whitespace character. The suffix
# alternatives are not tied to a particular stem, so the pattern is
# slightly looser than the seven real weekday names.
_WEEKDAY_PREFIX_RE = re.compile(
    r"^(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day|sday|nesday|rsday|urday)?\s*,?\s+",
    re.IGNORECASE,
)

# Strip a trailing time component (``2024-01-15 13:45:00`` etc.) before
# format-matching the date portion. The time must be introduced by
# whitespace or ``T`` and sit at the very end of the string; seconds,
# fractional seconds, an AM/PM marker and a numeric offset / ``Z`` /
# 2-4 letter uppercase zone name are all optional.
_TIME_TAIL_RE = re.compile(r"[\sT]\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:\s*[AaPp][Mm])?(?:\s*[+-]\d{2}:?\d{2}|\s*Z|\s*[A-Z]{2,4})?$")

# Buried date: a strict YYYY-MM-DD substring inside other text, used
# only when the whole string fails strptime first.
_BURIED_ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")

# Excel serial date range — Jan 1 1970 to Dec 31 2099 (inclusive); serial
# 73051 would be Jan 1 2100. Excel 1900 leap year bug: serials >= 60 are
# off by one because Excel pretends 1900-02-29 exists. Rather than
# subtracting a day per value, the epoch below is set one day early
# (1899-12-30 instead of 1899-12-31), which is correct for every serial
# in the supported range.
_EXCEL_SERIAL_MIN = 25569.0   # Jan 1 1970
_EXCEL_SERIAL_MAX = 73050.0   # Dec 31 2099
_EXCEL_EPOCH = datetime(1899, 12, 30)  # accounts for the leap-year bug

# Unix timestamp ranges — covers Jan 1 2000 to Jan 1 2100 in seconds and
# milliseconds. Narrow enough that we don't false-positive on other ints.
# The two ranges do not overlap (the millisecond range starts well above
# the seconds maximum), so a value can only ever match one of them.
_UNIX_S_MIN = 946684800        # 2000-01-01 00:00:00 UTC
_UNIX_S_MAX = 4102444800       # 2100-01-01 00:00:00 UTC
_UNIX_MS_MIN = _UNIX_S_MIN * 1000
_UNIX_MS_MAX = _UNIX_S_MAX * 1000

# Year-month text (``January 2024`` / ``Jan 2024``) → ``YYYY-MM``.
# Both tables are lowercase and in calendar order, so ``index(word) + 1``
# is the month number. ``"may"`` appears in both tables.
_MONTH_NAMES_EN = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]
_MONTH_ABBR_EN = ["jan", "feb", "mar", "apr", "may", "jun",
                  "jul", "aug", "sep", "oct", "nov", "dec"]
# Whole-string match: a month word, whitespace, then exactly four digits.
# Group 1 is the month word (any case), group 2 the year.
_YEAR_MONTH_TEXT_RE = re.compile(
    rf"^\s*({'|'.join(_MONTH_NAMES_EN + _MONTH_ABBR_EN)})\s+(\d{{4}})\s*$",
    re.IGNORECASE,
)

# Quarter notation: ``Q1 2024`` → ``2024-Q1``.
# Group 1 is the quarter digit (1-4), group 2 the four-digit year.
_QUARTER_RE = re.compile(r"^\s*Q([1-4])\s+(\d{4})\s*$", re.IGNORECASE)

# Localized month names → English. Substituted before strptime so the
# regular ``%B``/``%b`` formats catch them. Includes both full and
# abbreviated forms where conventional.
#
# Keys are lowercase (matching is case-insensitive); accented names are
# usually paired with an unaccented spelling. Within each locale the full
# names are listed before the abbreviations, and substitution follows
# that insertion order. Spanish and Russian list full names only.
# A key repeated inside one literal (``"mei"`` in ``nl``) collapses to a
# single entry.
_MONTH_LOCALES: dict[str, dict[str, str]] = {
    "fr": {
        "janvier": "January", "février": "February", "fevrier": "February",
        "mars": "March", "avril": "April", "mai": "May", "juin": "June",
        "juillet": "July", "août": "August", "aout": "August",
        "septembre": "September", "octobre": "October",
        "novembre": "November", "décembre": "December", "decembre": "December",
        "janv": "Jan", "févr": "Feb", "fevr": "Feb", "avr": "Apr",
        "juil": "Jul", "sept": "Sep", "oct": "Oct", "nov": "Nov",
        "déc": "Dec", "dec": "Dec",
    },
    "de": {
        "januar": "January", "februar": "February", "märz": "March",
        "marz": "March", "april": "April", "mai": "May", "juni": "June",
        "juli": "July", "august": "August", "september": "September",
        "oktober": "October", "november": "November", "dezember": "December",
        "jan": "Jan", "feb": "Feb", "mär": "Mar", "mar": "Mar",
        "apr": "Apr", "jun": "Jun", "jul": "Jul", "aug": "Aug",
        "sep": "Sep", "okt": "Oct", "nov": "Nov", "dez": "Dec",
    },
    "es": {
        "enero": "January", "febrero": "February", "marzo": "March",
        "abril": "April", "mayo": "May", "junio": "June", "julio": "July",
        "agosto": "August", "septiembre": "September", "setiembre": "September",
        "octubre": "October", "noviembre": "November", "diciembre": "December",
    },
    "pt": {
        "janeiro": "January", "fevereiro": "February", "março": "March",
        "marco": "March", "abril": "April", "maio": "May", "junho": "June",
        "julho": "July", "agosto": "August", "setembro": "September",
        "outubro": "October", "novembro": "November", "dezembro": "December",
        "jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr",
        "mai": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug",
        "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec",
    },
    "it": {
        "gennaio": "January", "febbraio": "February", "marzo": "March",
        "aprile": "April", "maggio": "May", "giugno": "June",
        "luglio": "July", "agosto": "August", "settembre": "September",
        "ottobre": "October", "novembre": "November", "dicembre": "December",
        "gen": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr",
        "mag": "May", "giu": "Jun", "lug": "Jul", "ago": "Aug",
        "set": "Sep", "ott": "Oct", "nov": "Nov", "dic": "Dec",
    },
    "nl": {
        "januari": "January", "februari": "February", "maart": "March",
        "april": "April", "mei": "May", "juni": "June", "juli": "July",
        "augustus": "August", "september": "September", "oktober": "October",
        "november": "November", "december": "December",
        "jan": "Jan", "feb": "Feb", "mrt": "Mar", "apr": "Apr",
        "mei": "May", "jun": "Jun", "jul": "Jul", "aug": "Aug",
        "sep": "Sep", "okt": "Oct", "nov": "Nov", "dec": "Dec",
    },
    "ru": {
        # Genitive forms (the case used after a day number)
        "января": "January", "февраля": "February", "марта": "March",
        "апреля": "April", "мая": "May", "июня": "June", "июля": "July",
        "августа": "August", "сентября": "September", "октября": "October",
        "ноября": "November", "декабря": "December",
        # Nominative forms (less common in dates but possible)
        "январь": "January", "февраль": "February", "март": "March",
        "апрель": "April", "май": "May", "июнь": "June", "июль": "July",
        "август": "August", "сентябрь": "September", "октябрь": "October",
        "ноябрь": "November", "декабрь": "December",
    },
}

# Localized weekday prefix removal — same idea as month substitution.
# Each locale's set lists full + abbreviated forms (lowercase) that
# should be stripped from the start of a date string before format
# matching. English is in ``_WEEKDAY_PREFIX_RE`` already.
_WEEKDAY_LOCALES: dict[str, list[str]] = {
    "fr": ["lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi",
           "dimanche", "lun", "mar", "mer", "jeu", "ven", "sam", "dim"],
    "de": ["montag", "dienstag", "mittwoch", "donnerstag", "freitag",
           "samstag", "sonntag", "mo", "di", "mi", "do", "fr", "sa", "so"],
    "es": ["lunes", "martes", "miércoles", "miercoles", "jueves",
           "viernes", "sábado", "sabado", "domingo"],
    "it": ["lunedì", "lunedi", "martedì", "martedi", "mercoledì",
           "mercoledi", "giovedì", "giovedi", "venerdì", "venerdi",
           "sabato", "domenica"],
    "pt": ["segunda-feira", "segunda", "terça-feira", "terca-feira",
           "terça", "terca", "quarta-feira", "quarta", "quinta-feira",
           "quinta", "sexta-feira", "sexta", "sábado", "sabado", "domingo"],
    "nl": ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag",
           "zaterdag", "zondag",
           "ma", "di", "wo", "do", "vr", "za", "zo"],
    "ru": ["понедельник", "вторник", "среда", "четверг", "пятница",
           "суббота", "воскресенье",
           "пн", "вт", "ср", "чт", "пт", "сб", "вс"],
}


def _build_weekday_patterns() -> dict[str, "re.Pattern[str]"]:
    """One regex per locale matching any leading weekday + optional comma."""
    out = {}
    for loc, words in _WEEKDAY_LOCALES.items():
        # Sort longest first so ``segunda-feira`` wins over ``segunda``.
        alt = "|".join(re.escape(w) for w in sorted(words, key=len, reverse=True))
        out[loc] = re.compile(rf"^(?:{alt})\s*,?\s+", re.IGNORECASE)
    return out


_WEEKDAY_LOCALE_PATTERNS = _build_weekday_patterns()


# Named timezone → fixed UTC offset. Resolves common abbreviations so
# ``2024-01-15 10:30:00 EST`` produces a date instead of falling through
# unparseably. Per FORMATS-CASES.md § 3.3, these are *fixed* offsets —
# DST-aware handling is out of scope (it would need a real timezone
# database such as the standard-library ``zoneinfo`` module).
#
# Keys are matched case-sensitively by ``_NAMED_TZ_RE``. Several
# abbreviations are ambiguous in the wild (``CST``, ``IST``, ``BST``,
# ``AST``); the table commits to one reading for each.
# NOTE(review): ``CST_CN`` is a table-only label for China Standard Time
# and is unlikely to appear verbatim in input — confirm it is reachable.
_NAMED_TZ_OFFSETS: dict[str, str] = {
    # Universal
    "UTC": "+00:00", "GMT": "+00:00", "Z": "+00:00",
    # Americas
    "EST": "-05:00", "EDT": "-04:00",
    "CST": "-06:00", "CDT": "-05:00",
    "MST": "-07:00", "MDT": "-06:00",
    "PST": "-08:00", "PDT": "-07:00",
    "AST": "-04:00", "AKST": "-09:00", "HST": "-10:00",
    "BRT": "-03:00", "ART": "-03:00",
    # Europe
    "BST": "+01:00", "CET": "+01:00", "CEST": "+02:00",
    "EET": "+02:00", "EEST": "+03:00", "WET": "+00:00", "WEST": "+01:00",
    "MSK": "+03:00",
    # Asia / Pacific
    "IST": "+05:30",
    "PKT": "+05:00", "BDT": "+06:00",
    "ICT": "+07:00", "WIB": "+07:00",
    "CST_CN": "+08:00", "HKT": "+08:00", "SGT": "+08:00", "PHT": "+08:00",
    "JST": "+09:00", "KST": "+09:00",
    "AEST": "+10:00", "AEDT": "+11:00", "NZST": "+12:00",
}


def _build_month_locale_patterns() -> dict[str, list[tuple["re.Pattern[str]", str]]]:
    """Precompile per-locale (pattern, replacement) lists once at import.

    The previous loop compiled every pattern for every input cell — at
    millions of rows that's a measurable hot spot.

    Each pattern matches one localized month word case-insensitively and
    only when it is not glued to another letter on either side. The
    letter test is Unicode-aware: an ASCII-only guard would let a short
    form match inside a longer accented or Cyrillic word (for example a
    nominative Russian name inside its genitive form), corrupting text
    that is not a month at all.

    Returns a mapping of locale code to its list of
    ``(compiled_pattern, english_replacement)`` pairs, in the same order
    as the locale's table (full names before abbreviations).
    """
    # ``[^\W\d_]`` is "a word character that is neither a digit nor an
    # underscore", i.e. any Unicode letter. It is a single-character
    # class, so it is valid inside a lookbehind.
    letter = r"[^\W\d_]"
    out: dict[str, list[tuple[re.Pattern[str], str]]] = {}
    for loc, table in _MONTH_LOCALES.items():
        out[loc] = [
            (
                re.compile(
                    rf"(?<!{letter}){re.escape(foreign)}(?!{letter})",
                    re.IGNORECASE,
                ),
                english,
            )
            for foreign, english in table.items()
        ]
    return out


_MONTH_LOCALE_PATTERNS = _build_month_locale_patterns()


def _apply_month_locale(s: str, locales: list[str]) -> str:
    """Translate localized month names in *s* into English.

    ``"en"`` is accepted and is a no-op. Every other entry must be a key
    of ``_MONTH_LOCALES``; otherwise a ``ValueError`` naming the offending
    entries is raised, so typos such as ``"FR"`` or ``"french"`` surface
    immediately instead of being skipped silently.
    """
    unrecognized = [
        loc for loc in locales
        if not (loc == "en" or loc in _MONTH_LOCALES)
    ]
    if unrecognized:
        raise ValueError(
            f"Unknown month locale(s): {unrecognized}. "
            f"Available: {sorted(_MONTH_LOCALES) + ['en']}"
        )

    translated = s
    for loc in locales:
        # ``"en"`` has no entry in the pattern table, so it contributes
        # nothing here.
        for pattern, replacement in _MONTH_LOCALE_PATTERNS.get(loc, ()):
            translated = pattern.sub(replacement, translated)
    return translated


def _try_excel_serial(s: str, output_format: str) -> Optional[str]:
    """Interpret *s* as an Excel-1900 serial date.

    Returns the date rendered with *output_format*, or ``None`` when *s*
    is not a number or lies outside
    ``[_EXCEL_SERIAL_MIN, _EXCEL_SERIAL_MAX]``. Any fractional part
    (time of day) is discarded.
    """
    try:
        serial = float(s)
    except ValueError:
        return None

    within_range = _EXCEL_SERIAL_MIN <= serial <= _EXCEL_SERIAL_MAX
    if not within_range:
        return None

    whole_days = int(serial)
    # ``_EXCEL_EPOCH`` is 1899-12-30, which already absorbs Excel's
    # phantom 1900-02-29 for every serial >= 60. Smaller serials would
    # need a 1899-12-31 epoch, but they are far below the supported range.
    try:
        moment = _EXCEL_EPOCH + timedelta(days=whole_days)
        return moment.strftime(output_format)
    except (OverflowError, ValueError):
        return None


def _try_unix_timestamp(s: str, output_format: str) -> Optional[str]:
    """Unix seconds / milliseconds → formatted date, or None.

    *s* must be an integer literal. Values inside
    ``[_UNIX_S_MIN, _UNIX_S_MAX]`` are read as seconds, values inside
    ``[_UNIX_MS_MIN, _UNIX_MS_MAX]`` as milliseconds (truncated to whole
    seconds); anything else yields ``None``. The timestamp is interpreted
    in UTC and rendered with *output_format*.
    """
    try:
        n = int(s)
    except ValueError:
        return None
    if _UNIX_S_MIN <= n <= _UNIX_S_MAX:
        seconds = n
    elif _UNIX_MS_MIN <= n <= _UNIX_MS_MAX:
        seconds = n // 1000
    else:
        return None
    try:
        # Plain epoch arithmetic yields the same naive-UTC datetime as
        # ``datetime.utcfromtimestamp`` without relying on that call,
        # which is deprecated since Python 3.12, and without touching the
        # platform's C time functions.
        moment = datetime(1970, 1, 1) + timedelta(seconds=seconds)
        return moment.strftime(output_format)
    except (OverflowError, ValueError, OSError):
        return None


DateOrder = Literal["MDY", "DMY"]
DateErrorPolicy = Literal["passthrough", "sentinel"]


def _year_month_from_text(s: str) -> Optional[str]:
    """Return ``YYYY-MM`` when *s* is an English month word plus a 4-digit year."""
    ym = _YEAR_MONTH_TEXT_RE.match(s)
    if ym is None:
        return None
    month_word = ym.group(1).lower()
    # Full names are checked first so ``may`` resolves the same way it
    # always has. A case-insensitive match can, in rare cases, capture a
    # spelling whose lowercase form is in neither table (e.g. a dotted
    # capital ``İ``); treat that as "not a year-month" instead of letting
    # ``list.index`` raise.
    for table in (_MONTH_NAMES_EN, _MONTH_ABBR_EN):
        if month_word in table:
            return f"{ym.group(2)}-{table.index(month_word) + 1:02d}"
    return None


def standardize_date(
    value: Optional[str],
    *,
    output_format: str = "%Y-%m-%d",
    date_order: DateOrder = "MDY",
    error_policy: DateErrorPolicy = "passthrough",
    month_locales: Optional[list[str]] = None,
    two_digit_year_cutoff: int = 69,
) -> tuple[str, bool]:
    """Parse *value* as a date and return it formatted per *output_format*.

    ``date_order`` disambiguates ``01/02/2024``: ``"MDY"`` reads it as
    Jan 2, ``"DMY"`` as Feb 1. ISO-shaped inputs (``YYYY-MM-DD``) are
    unambiguous and parse the same way under either setting.

    With ``error_policy="passthrough"`` (default) unparseable input
    passes through unchanged. With ``"sentinel"`` the cleaner emits
    ``<error: <reason>>`` for invalid dates per corpus § 0.3. Only
    explicit-but-invalid ``YYYY-M-D`` shapes are flagged; other
    unparseable text passes through under either policy.

    ``month_locales`` enables non-English month names. Pass any subset
    of ``["en", "fr", "de", "es", "pt", "it", "nl", "ru"]`` to recognize
    those locales' month + weekday names in addition to English.
    Defaults to English-only. Localized year-month text
    (``janvier 2024`` with ``"fr"`` enabled) is recognized as well.

    ``two_digit_year_cutoff`` controls the pivot for 2-digit years:
    years ``00..cutoff`` map to 2000-2099, ``cutoff+1..99`` map to
    1900-1999. Default 69 (Python's stdlib default). Override to ~25
    for birth-year columns where most subjects were born ≤ 1999.

    Recognizes Excel-1900 serial dates (``45306`` → ``2024-01-15``),
    Unix timestamps in seconds and milliseconds, year-month text
    (``January 2024`` → ``2024-01``), and quarter notation (``Q1 2024``
    → ``2024-Q1``) in addition to the standard date formats.

    Returns ``(new_value, changed)``. ``None`` / empty input yields
    ``("", False)``; whitespace-only input is returned untouched.

    Raises ``ValueError`` when ``month_locales`` contains an unknown
    locale code.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def _err(reason: str) -> tuple[str, bool]:
        return _err_or_passthrough(reason, value, error_policy)

    # Excel serial dates and Unix timestamps don't survive the weekday-
    # prefix / time-tail strips, so try them first. They short-circuit
    # for pure-numeric inputs.
    if re.match(r"^-?\d+(?:\.\d+)?$", s):
        excel = _try_excel_serial(s, output_format)
        if excel is not None:
            return excel, excel != value
        unix = _try_unix_timestamp(s, output_format)
        if unix is not None:
            return unix, unix != value

    # Year-month text (``January 2024``) → ``YYYY-MM`` (precision-preserving).
    year_month = _year_month_from_text(s)
    if year_month is not None:
        return year_month, year_month != value

    # Quarter notation (``Q1 2024``) → ``YYYY-Q1``.
    q = _QUARTER_RE.match(s)
    if q:
        out = f"{q.group(2)}-Q{q.group(1)}"
        return out, out != value

    # CJK separator normalization: Japanese ``2024年01月15日`` → ``2024-01-15``,
    # Korean ``2024.01.15`` is already covered by the dot format. Also fold
    # fullwidth digits (０-９) to ASCII so any of the parsers can read them.
    s = _normalize_cjk_date_chars(s)

    # Substitute localized month names with English before format-match.
    if month_locales:
        s = _apply_month_locale(s, month_locales)
        # The English-only check above cannot see ``janvier 2024``; now
        # that the month word has been translated, give year-month text
        # a second chance so localized input keeps its precision too.
        year_month = _year_month_from_text(s)
        if year_month is not None:
            return year_month, year_month != value
        # Strip localized weekday prefixes for any enabled locale BEFORE
        # the day-period strip — otherwise ``Montag, 15. Januar 2024``
        # never reaches the digit-leading shape the period strip expects.
        for loc in month_locales:
            pat = _WEEKDAY_LOCALE_PATTERNS.get(loc)
            if pat is not None:
                s = pat.sub("", s).strip()
        # German DMY uses ``15.`` for the day; strip the trailing period
        # so ``15. Januar 2024`` parses as ``15 January 2024``.
        s = re.sub(r"^(\d{1,2})\.\s+", r"\1 ", s)

    # Strip a leading weekday prefix (``Monday, January 15, 2024``).
    s = _WEEKDAY_PREFIX_RE.sub("", s).strip()
    # Resolve named timezones (EST/PST/JST/…) to fixed offsets, then
    # drop the trailing time portion before format-matching.
    s = _resolve_named_tz(s)
    s = _TIME_TAIL_RE.sub("", s).strip()

    # ISO 8601 extended formats — week date + ordinal date — and
    # RFC 2822 mail-header form.
    iso_extended = _try_iso_extended(s, output_format)
    if iso_extended is not None:
        return iso_extended, iso_extended != value
    rfc = _try_rfc2822(s, output_format)
    if rfc is not None:
        return rfc, rfc != value

    parsed = _try_parse_date(s, date_order, two_digit_year_cutoff)
    if parsed is not None:
        out = parsed.strftime(output_format)
        return out, out != value

    # Buried-date extraction: try a strict ISO substring (``Date: 2024-01-15``,
    # ``2024-01-15 (verified)``). Searches the original *value*, not the
    # normalized working copy.
    m = _BURIED_ISO_DATE_RE.search(value)
    if m:
        try:
            parsed = datetime.strptime(m.group(1), "%Y-%m-%d")
            out = parsed.strftime(output_format)
            return out, out != value
        except ValueError:
            pass

    # Detect explicit-but-invalid date shapes — give the user a clearer
    # error than silent passthrough. Other shapes (partial precision,
    # unknown text) pass through unchanged regardless of error policy.
    iso_shape = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", s)
    if iso_shape:
        y, mo, d = int(iso_shape[1]), int(iso_shape[2]), int(iso_shape[3])
        if y == 1900 and mo == 2 and d == 29:
            return _err("Excel 1900 leap year bug")
        if mo > 12 or mo < 1:
            return _err("invalid month")
        if d > 31 or d < 1:
            return _err("invalid day")
        if mo == 2:
            leap = y % 4 == 0 and (y % 100 != 0 or y % 400 == 0)
            if d > (29 if leap else 28):
                return _err("invalid leap day" if d == 29 else "invalid day")
        if mo in {4, 6, 9, 11} and d > 30:
            return _err("invalid day")

    return value, False


def _try_parse_date(
    s: str, date_order: DateOrder, two_digit_year_cutoff: int = 69,
) -> Optional[datetime]:
    """Try each known ``strptime`` format in order and return the first hit.

    ``date_order == "DMY"`` selects the day-first format list; any other
    value selects the month-first list. Returns ``None`` when no format
    matches.

    For two-digit-year formats the century is re-pivoted when
    *two_digit_year_cutoff* differs from 69: years ``00..cutoff`` land in
    the 2000s, the rest in the 1900s.
    """
    formats = _DATE_FORMATS_DMY if date_order == "DMY" else _DATE_FORMATS_MDY
    for fmt in formats:
        # strptime's ``%m`` and ``%d`` accept a single digit, so without
        # this guard the compact form would read a 6-digit ``202412``
        # (December 2024) as 2024-01-02. Only a full 8-digit string is a
        # real ``YYYYMMDD`` date.
        if fmt == "%Y%m%d" and not (len(s) == 8 and s.isdigit()):
            continue
        try:
            parsed = datetime.strptime(s, fmt)
        except ValueError:
            continue
        # Re-pivot 2-digit years if the user changed the cutoff. strptime
        # uses Python's stdlib default of 69; for cutoff != 69 we may need
        # to roll the century forward or back.
        if "%y" in fmt and two_digit_year_cutoff != 69:
            year_2 = parsed.year % 100
            century = 2000 if year_2 <= two_digit_year_cutoff else 1900
            try:
                parsed = parsed.replace(year=century + year_2)
            except ValueError:
                # Feb 29 that exists in strptime's century but not in the
                # re-pivoted one (2000 vs 1900): not a valid date under
                # this cutoff, so keep looking instead of crashing.
                continue
        return parsed
    return None


_FULLWIDTH_DIGITS = str.maketrans("０１２３４５６７８９", "0123456789")
_CJK_DATE_MARKERS = str.maketrans({"年": "-", "月": "-", "日": "", "．": ".", "／": "/"})


def _normalize_cjk_date_chars(s: str) -> str:
    """Fold East Asian date markers + fullwidth digits to ASCII equivalents.

    ``2024年01月15日`` → ``2024-01-15``; fullwidth ``２０２４／０１／１５``
    → ``2024/01/15``. Idempotent on ASCII input.
    """
    if not any(c > "\x7f" for c in s):
        return s
    s = s.translate(_FULLWIDTH_DIGITS).translate(_CJK_DATE_MARKERS)
    # ``2024年01月15日`` becomes ``2024-01-15-`` with our trailing-day
    # mapping; strip any trailing dash artifact.
    return s.rstrip("-").strip()


# Whitespace followed by one of the known zone abbreviations, ending on a
# word boundary. Longer abbreviations come first in the alternation so
# they are preferred over shorter ones sharing a prefix. Matching is
# case-sensitive and is not anchored to the end of the string.
_NAMED_TZ_RE = re.compile(
    r"\s+({})\b".format(
        "|".join(
            map(re.escape, sorted(_NAMED_TZ_OFFSETS, key=len, reverse=True))
        )
    )
)


def _resolve_named_tz(s: str) -> str:
    """Swap a named timezone for its fixed UTC offset.

    ``2024-01-15 10:30:00 EST`` → ``2024-01-15 10:30:00-05:00``: the
    whitespace in front of the abbreviation is consumed along with it.
    Offsets are fixed rather than DST-aware (FORMATS-CASES.md § 3.3);
    the abbreviation → offset table is ``_NAMED_TZ_OFFSETS``.
    """
    return _NAMED_TZ_RE.sub(
        lambda hit: _NAMED_TZ_OFFSETS[hit.group(1)],
        s,
    )


_ISO_WEEK_RE = re.compile(r"^(\d{4})-W(\d{2})-(\d)$")
_ISO_ORDINAL_RE = re.compile(r"^(\d{4})-(\d{3})$")


def _try_iso_extended(s: str, output_format: str) -> Optional[str]:
    """Parse ISO 8601 week date or ordinal date, return formatted string."""
    m = _ISO_WEEK_RE.match(s)
    if m:
        try:
            parsed = datetime.fromisocalendar(
                int(m.group(1)), int(m.group(2)), int(m.group(3)),
            )
            return parsed.strftime(output_format)
        except ValueError:
            return None
    m = _ISO_ORDINAL_RE.match(s)
    if m:
        year, day = int(m.group(1)), int(m.group(2))
        if 1 <= day <= 366:
            try:
                parsed = datetime(year, 1, 1) + timedelta(days=day - 1)
                if parsed.year == year:
                    return parsed.strftime(output_format)
            except ValueError:
                return None
    return None


# RFC 2822 mail-header form: ``Wed, 15 Jan 2024 10:30:00 GMT``.
_RFC2822_FORMATS = [
    "%a, %d %b %Y %H:%M:%S",       # without TZ
    "%a, %d %b %Y %H:%M:%S %Z",    # with named TZ (already resolved upstream)
    "%a, %d %b %Y %H:%M:%S %z",    # with offset
    "%d %b %Y %H:%M:%S",
]


def _try_rfc2822(s: str, output_format: str) -> Optional[str]:
    """Parse RFC 2822 mail-header date format."""
    for fmt in _RFC2822_FORMATS:
        try:
            parsed = datetime.strptime(s, fmt)
        except ValueError:
            continue
        try:
            return parsed.strftime(output_format)
        except ValueError:
            return None
    return None


# ---------------------------------------------------------------------------
# Phone
# ---------------------------------------------------------------------------

# Output styles accepted by ``standardize_phone`` and the error policies
# it understands.
PhoneFormat = Literal["E164", "INTERNATIONAL", "NATIONAL", "DIGITS"]
PhoneErrorPolicy = Literal["passthrough", "sentinel"]

# Maps the public format names onto ``phonenumbers`` format constants.
# ``"DIGITS"`` has no entry: ``standardize_phone`` handles it before this
# table is consulted.
_PHONE_FORMAT_MAP = {
    "E164": phonenumbers.PhoneNumberFormat.E164,
    "INTERNATIONAL": phonenumbers.PhoneNumberFormat.INTERNATIONAL,
    "NATIONAL": phonenumbers.PhoneNumberFormat.NATIONAL,
}

# Placeholder sequences that look like phone numbers but are CRM
# sentinels for "no phone" — repeated single digit at NANP length.
# Allows an optional leading ``+`` and/or ``1`` and any mix of spaces,
# dots, parentheses and hyphens between the ten identical digits.
_PHONE_PLACEHOLDER_RE = re.compile(r"^\+?1?[\s.()-]*([0-9])(?:[\s.()-]*\1){9}$")
# Multi-number cells split by ``/``, ``;``, ``,`` or `` and ``.
_PHONE_MULTI_SPLIT_RE = re.compile(r"\s*(?:/|;|,| and )\s*")


def standardize_phone(
    value: Optional[str],
    *,
    output_format: PhoneFormat = "E164",
    default_region: str = "US",
    error_policy: PhoneErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Parse with ``phonenumbers``, return in the requested format.

    Default is ``passthrough`` for unparseable input; pass
    ``error_policy="sentinel"`` to emit ``<error: <reason>>`` for
    placeholder runs (000-000-0000), multi-number cells, and contaminated
    inputs (corpus § 4.3).

    Extensions are preserved as a ``;ext=N`` suffix (RFC 3966 syntax)
    when the format is E.164. Other output formats use libphonenumber's
    native rendering, which already includes extensions.

    A leading ``001`` followed by a space or hyphen is stripped before
    parsing, so ``001 555 123 4567`` is parsed as the national number
    ``555 123 4567`` in *default_region*.
    NOTE(review): an earlier version of this docstring said the prefix is
    normalized to ``+``; the code strips it instead — confirm which
    behavior is intended for non-NANP default regions.

    ``DIGITS`` strips every non-digit character without going through
    ``phonenumbers``.

    Returns ``(new_value, changed)``. ``None`` / empty input yields
    ``("", False)``; whitespace-only input is returned untouched.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    # Shorthand: apply the error policy against the ORIGINAL cell value.
    _err = lambda reason: _err_or_passthrough(reason, value, error_policy)

    # DIGITS mode bypasses every check below; a cell with no digits at
    # all is left as-is.
    if output_format == "DIGITS":
        digits = re.sub(r"\D", "", s)
        return (digits, digits != value) if digits else (value, False)

    # Multi-number per cell — error before we silently parse only the
    # first number. ``5551234567 / 5559876543`` both parse independently.
    # Only flagged when EVERY part looks like a phone on its own.
    if _PHONE_MULTI_SPLIT_RE.search(s):
        parts = [p for p in _PHONE_MULTI_SPLIT_RE.split(s) if p.strip()]
        if len(parts) >= 2 and all(
            _looks_like_phone(p, default_region) for p in parts
        ):
            return _err("multiple numbers in cell")

    # Smart-quote contamination — unparseable detritus interleaved with
    # digits. Strip and re-test, but flag when error_policy is sentinel.
    # Each smart quote is removed together with any lowercase letters
    # that directly follow it.
    if any(c in s for c in "‘’“”"):
        cleaned = re.sub(r"[‘’“”][a-z]*", "", s).strip()
        if cleaned != s:
            if error_policy == "sentinel":
                return _err("smart-quote contamination")
            s = cleaned

    # 001 international access prefix (US-style for "dial out") — strip
    # entirely; the remaining digits are a regular national number that
    # the region default can resolve.
    if re.match(r"^001[\s\-]", s):
        s = s[3:].lstrip(" -")

    # Placeholder all-same-digit runs.
    if _PHONE_PLACEHOLDER_RE.match(s):
        return _err("placeholder number")

    fmt = _PHONE_FORMAT_MAP[output_format]
    try:
        parsed = phonenumbers.parse(s, default_region)
    except phonenumbers.NumberParseException:
        # Anything that can't be parsed becomes a sentinel under the
        # sentinel policy; passthrough returns the original. Both digit-
        # and-formatting failures and pure non-numeric ("TBD"-style) cells
        # land here.
        return _err("not a phone number")

    if not phonenumbers.is_possible_number(parsed):
        # Distinguish "too many digits" from generic invalidity for
        # NANP-shaped inputs. Inputs that look like local-only NANP
        # numbers (7 digits) get a specific "insufficient digits" tag.
        raw_digits = re.sub(r"\D", "", s)
        if len(raw_digits) > 11 and default_region in {"US", "CA"}:
            return _err("too many digits")
        if 0 < len(raw_digits) < 10 and default_region in {"US", "CA"}:
            return _err("insufficient digits")
        # Note: this branch ignores error_policy on purpose — the cell is
        # returned unchanged even under "sentinel".
        return value, False  # genuinely unparseable elsewhere — passthrough

    # Extra-digit detection: NANP (region US/CA, country code 1) only
    # accepts 10 digits (or 11 with leading 1). Excess digits in input
    # like "1-555-123-4567-extra-99" parse out as more digits and we
    # error rather than silently truncate.
    # The ``+ 4`` slack tolerates a few extra digits in the raw input
    # (presumably a short extension — TODO confirm the intent).
    raw_digits = re.sub(r"\D", "", s)
    parsed_digits = re.sub(r"\D", "", phonenumbers.format_number(
        parsed, phonenumbers.PhoneNumberFormat.E164,
    ))
    if len(raw_digits) > len(parsed_digits) + 4:
        return _err("too many digits")

    # NANP minimum-length check — phonenumbers.is_possible_number is
    # permissive; corpus § 4.3 wants insufficient-digits flagged.
    if parsed.country_code == 1 and len(str(parsed.national_number)) < 10:
        return _err("insufficient digits")

    out = phonenumbers.format_number(parsed, fmt)

    # Append extension as RFC 3966 ;ext= suffix on E.164 output (other
    # formats already include the extension natively).
    if output_format == "E164" and parsed.extension:
        out = f"{out};ext={parsed.extension}"

    return out, out != value


def _looks_like_phone(s: str, region: str) -> bool:
    """Report whether *s* parses as a possible phone number for *region*.

    A string that ``phonenumbers`` cannot parse at all counts as "no".
    """
    try:
        candidate = phonenumbers.parse(s, region)
    except phonenumbers.NumberParseException:
        return False
    else:
        return phonenumbers.is_possible_number(candidate)


# ---------------------------------------------------------------------------
# Currency
# ---------------------------------------------------------------------------

# Symbol → ISO 4217 mapping. Used both for stripping currency markers
# before number parsing AND for the optional ``preserve_code`` mode that
# re-emits the detected code as a prefix on the standardized output.
_SYMBOL_TO_ISO: dict[str, str] = {
    "$": "USD",   # ambiguous w/ CAD/AUD/MXN — caller can override via input code
    "€": "EUR",
    "£": "GBP",
    "¥": "JPY",   # ambiguous w/ CNY — same caveat
    "₹": "INR",
    "₩": "KRW",
    "₽": "RUB",
    "₪": "ILS",
    "₺": "TRY",
    "¢": "USD",   # cents — coerce to USD for the code; value is still numeric
    # International additions:
    "฿": "THB",   # Thai Baht
    "₫": "VND",   # Vietnamese Dong
    "₮": "MNT",   # Mongolian Tugrik
    "₴": "UAH",   # Ukrainian Hryvnia
    "₦": "NGN",   # Nigerian Naira
    "₱": "PHP",   # Philippine Peso
    "₲": "PYG",   # Paraguayan Guarani
    "﷼": "SAR",   # ambiguous Saudi/Omani/Iranian; pick the most common
    "₨": "PKR",   # Pakistani Rupee (and historical Sri Lankan)
    "₵": "GHS",   # Ghanaian Cedi
}
_CURRENCY_SYMBOLS = "".join(_SYMBOL_TO_ISO)
# ISO 4217 codes — the long tail of currencies in active use. Order
# matters for the regex alternation: a 3-letter ISO code is unambiguous,
# but ``R$`` (Brazil) and ``kr`` (DKK/NOK/SEK) are 1-2 char prefixes
# that need to lose to a 3-letter code if both appear.
_CURRENCY_CODES_LIST = [
    "USD", "EUR", "GBP", "JPY", "CNY", "CAD", "AUD", "CHF", "INR", "KRW",
    "RUB", "MXN", "BRL", "ILS", "TRY", "ZAR", "SEK", "NOK", "DKK", "PLN",
    "HKD", "SGD", "NZD",
    # Major non-G10 economies:
    "SAR", "AED", "QAR", "KWD", "BHD", "OMR",   # Gulf
    "ARS", "CLP", "COP", "PEN", "UYU",          # Latin America
    "EGP", "MAD", "TND", "NGN", "GHS", "KES", "ZAR", "TZS", "UGX",  # Africa
    "IDR", "MYR", "PHP", "THB", "VND", "TWD",   # SE Asia
    "PKR", "BDT", "LKR", "NPR",                 # South Asia
    "HUF", "CZK", "RON", "BGN", "HRK", "ISK",   # Europe-other
    "UAH", "KZT", "GEL", "AMD", "AZN",          # Eastern Europe / Caucasus
]
_CURRENCY_CODES = "|".join(_CURRENCY_CODES_LIST)
_CURRENCY_DETECT_RE = re.compile(
    rf"(?P<code>{_CURRENCY_CODES})|(?P<sym>[{_CURRENCY_SYMBOLS}])",
    re.IGNORECASE,
)
_CURRENCY_TRIM_RE = re.compile(
    rf"^[\s{_CURRENCY_SYMBOLS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOLS}]*"
    rf"|[\s{_CURRENCY_SYMBOLS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOLS}]*$",
    re.IGNORECASE,
)
_PARENS_NEGATIVE_RE = re.compile(r"^\s*\(\s*(.+?)\s*\)\s*$")


CurrencyDecimal = Literal["dot", "comma", "auto"]


# Multi-character symbol prefixes that aren't captured by the
# single-codepoint ``_CURRENCY_SYMBOLS`` table. Order matters: the
# detector checks these prefixes BEFORE the single-symbol regex, so
# ``R$`` resolves to BRL even though ``$`` alone would map to USD.
_PREFIX_TO_ISO: dict[str, str] = {
    "r$":  "BRL",   # Brazilian Real
    "kr":  "SEK",   # ambiguous Nordic — picks SEK as most common; see tests
    "zł":  "PLN",   # Polish Złoty
    "лв":  "BGN",   # Bulgarian Lev
    "₽":   "RUB",   # already in symbol table; kept for parity
    "rs.": "INR",   # rupees — covers IN/PK informal usage
    "rs":  "INR",
}


def detect_currency_code(value: str) -> Optional[str]:
    """Return the ISO 4217 code implied by *value*, or None.

    Resolution order (first hit wins):

    1. A *standalone* ISO code anywhere in the string — i.e. a code that
       is not glued to other letters (``USD 1234``, ``1234usd``,
       ``$100 CAD`` → ``CAD``, ``kr 100 NOK`` → ``NOK``). An explicit
       code is unambiguous, so it beats every symbol or prefix.
    2. A multi-char prefix from ``_PREFIX_TO_ISO`` at the start of the
       value (``R$``, ``zł``, ``kr``), so Brazilian / Polish / Nordic
       data isn't silently bucketed as USD.
    3. The leftmost match of the generic detector regex: either a code
       embedded in a longer word (``100 euros`` → ``EUR``) or a single
       symbol mapped through ``_SYMBOL_TO_ISO`` (``$1234`` → ``USD``).

    Symbol mapping is best-effort: ``$`` is ambiguous between
    USD/CAD/AUD/MXN — the caller is expected to constrain that via input
    data discipline.

    Returns None for non-string input and when nothing matches.
    """
    if not isinstance(value, str):
        return None

    # Step 1: explicit code not attached to neighbouring letters. The
    # letter check keeps ordinary words that merely contain a code
    # (``pending`` ⊃ ``PEN``) from outranking a real code or symbol.
    for found in _CURRENCY_DETECT_RE.finditer(value):
        explicit = found.group("code")
        if not explicit:
            continue
        start, end = found.span()
        glued_left = start > 0 and value[start - 1].isalpha()
        glued_right = end < len(value) and value[end].isalpha()
        if not (glued_left or glued_right):
            return explicit.upper()

    # Step 2: multi-character prefixes, compared case-insensitively.
    head = value.lstrip().lower()
    for prefix, code in _PREFIX_TO_ISO.items():
        if head.startswith(prefix):
            # Make sure the next char (if any) isn't a letter — avoid
            # matching ``rsa`` as ``rs``-then-``a``.
            tail = head[len(prefix):]
            if not tail or not tail[0].isalpha():
                return code

    # Step 3: leftmost generic match (embedded code or single symbol).
    m = _CURRENCY_DETECT_RE.search(value)
    if m is None:
        return None
    if m.group("code"):
        return m.group("code").upper()
    sym = m.group("sym")
    return _SYMBOL_TO_ISO.get(sym)


CurrencyErrorPolicy = Literal["passthrough", "sentinel"]


def standardize_currency(
    value: Optional[str],
    *,
    decimal: CurrencyDecimal = "dot",
    decimals: Optional[int] = None,
    preserve_code: bool = False,
    error_policy: CurrencyErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Strip currency symbols/grouping separators, return a bare number string.

    ``decimal="dot"``: ``$1,234.56`` → ``1234.56`` (US/UK convention).
    ``decimal="comma"``: ``1.234,56 €`` → ``1234.56`` (EU convention).
    ``decimal="auto"``: same as ``dot`` but a single trailing comma
    whose tail is NOT exactly 3 digits is read as a decimal separator
    (``850,50`` → ``850.50``, ``R$ 1,5`` → ``1.5``). Use this for
    mixed-locale international files. Length-3 tails (``1,234``) stay
    ambiguous regardless of mode.

    The ``dot`` and ``auto`` modes auto-detect the EU shape when both
    ``.`` and ``,`` are present and the comma sits after the dot (so
    ``€1.234,56`` parses correctly even under the dot-default mode).
    Space-thousands and Swiss apostrophe-thousands are also recognized.

    The output always uses a dot as the decimal separator since that is
    the form pandas/Python parse natively.

    Accounting-style negatives (``($50.00)``) become ``-50.00``.

    With ``error_policy="passthrough"`` (default) unparseable input
    passes through unchanged. With ``error_policy="sentinel"`` the
    cleaner emits ``<error: <reason>>`` for percentages, ranges, word
    values, ambiguous separators, and other non-currency content per
    corpus § 8.3. Text that ``float()`` would accept but that is not a
    finite amount (``nan``, ``inf``, ``infinity``, or a magnitude that
    overflows to infinity such as ``1e400``) is handled as a word value
    rather than crashing.

    When *decimals* is given, the result is rounded to that many places.

    When *preserve_code* is True, an ISO 4217 code is detected from the
    input (``USD 1234`` or ``$1234``) and re-emitted as a space-separated
    prefix on the standardized number (``USD 1234.56``).

    Returns ``(new_value, changed)`` where *changed* is True when the
    emitted string differs from the original *value*.
    """
    # Function-scope import: ``math`` is not among the module-level imports.
    import math

    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def _err(reason: str) -> tuple[str, bool]:
        # All failure exits go through the shared policy helper so
        # passthrough / sentinel behaviour stays uniform.
        return _err_or_passthrough(reason, value, error_policy)

    if "%" in s:
        return _err("percentage not currency")
    # Range like "$50-$100" or "50–100" — distinguished from a single
    # signed number by either two currency symbols, or a digit-then-
    # dash-then-digit with the dash NOT being the leading sign.
    sym_count = sum(1 for c in s if c in "$£€¥₹")
    if sym_count >= 2 and re.search(r"\d\s*[-–—]\s*[$£€¥₹]", s):
        return _err("range not normalizable")
    if (
        sym_count == 0
        and re.search(r"\d\s*[-–—]\s*\d", s)
        and not re.match(r"^[+-]?\d", s.strip())
    ):
        return _err("range not normalizable")

    # Detect the code before any stripping so prefixes/symbols are still
    # present in the text being inspected.
    code = detect_currency_code(s) if preserve_code else None

    # Strip any multi-char currency prefix (``R$``, ``kr``, ``zł``)
    # before the symbol-table regex — these aren't single codepoints
    # so the table-driven trim would otherwise leave them in place.
    head = s.lstrip().lower()
    for prefix in _PREFIX_TO_ISO:
        if head.startswith(prefix):
            tail_start = len(prefix)
            if tail_start < len(head) and head[tail_start].isalpha():
                continue
            # Strip the matched prefix from the original (preserve case
            # of any trailing content).
            stripped_lead = s[: len(s) - len(head)]
            s = stripped_lead + s.lstrip()[len(prefix):]
            s = s.lstrip()
            break

    negative = False
    m = _PARENS_NEGATIVE_RE.match(s)
    if m:
        negative = True
        s = m.group(1)

    s = _CURRENCY_TRIM_RE.sub("", s).strip()
    if not s:
        return _err("empty after symbol strip")

    if s.startswith(("+", "-")):
        sign, rest = s[0], s[1:]
        if sign == "-":
            # ``(-50)`` is a double negative, hence the toggle.
            negative = not negative
        # A symbol may sit between the sign and the digits (``-$50``).
        rest = _CURRENCY_TRIM_RE.sub("", rest).strip()
    else:
        rest = s

    # Swiss apostrophe-thousands → drop apostrophes used as group sep.
    if "'" in rest:
        rest = rest.replace("'", "")

    # Space- or NBSP-thousands → drop spaces between digit groups
    # (``1 234,56`` → ``1234,56``). Track whether we saw such a
    # separator so we can disambiguate the comma below.
    had_space_thousands = bool(re.search(r"\d[ \xa0]\d", rest))
    rest = re.sub(r"(?<=\d)[ \xa0](?=\d)", "", rest)

    has_dot = "." in rest
    has_comma = "," in rest

    if decimal == "comma":
        # EU explicit: dots are thousands, comma is decimal.
        rest = rest.replace(".", "").replace(",", ".")
    else:
        if has_dot and has_comma:
            # Both present — the rightmost separator is the decimal.
            if rest.rfind(",") > rest.rfind("."):
                # EU: 1.234,56
                rest = rest.replace(".", "").replace(",", ".")
            else:
                # US: 1,234.56
                rest = rest.replace(",", "")
        elif has_comma and not has_dot:
            # ``1,234`` (no dot) is thousands-grouped US; ``1,5`` is
            # ambiguous. But a leading space-thousand separator (``1 234,56``)
            # is unambiguously EU — treat the comma as decimal.
            if had_space_thousands:
                rest = rest.replace(",", ".")
            elif decimal == "auto":
                # International auto-detection: a single comma whose
                # tail is NOT exactly 3 digits is far more likely to be
                # an EU/BRL decimal (``850,50``, ``1,5``) than a
                # malformed US thousands group. Length-3 tails stay
                # ambiguous and require an explicit locale.
                after = rest.rsplit(",", 1)[1]
                if rest.count(",") > 1:
                    rest = rest.replace(",", "")
                elif len(after) == 3:
                    return _err("ambiguous separator, set --currency-locale")
                else:
                    rest = rest.replace(",", ".")
            else:
                after = rest.rsplit(",", 1)[1]
                if len(after) != 3:
                    return _err("ambiguous separator, set --currency-locale")
                rest = rest.replace(",", "")
        elif has_dot and not has_comma:
            # Scientific notation (``1.5e6``) is not ambiguous — the tail
            # after the dot contains a non-digit. Skip the EU-thousands
            # check in that case.
            after = rest.rsplit(".", 1)[1]
            tail_is_pure_digits = after.isdigit()
            if (
                tail_is_pure_digits
                and len(after) == 3
                and len(rest.split(".")[0]) <= 3
                and rest.count(".") == 1
            ):
                return _err("ambiguous separator, set --currency-locale")

    try:
        num = float(rest)
    except ValueError:
        return _err("word value")
    # ``float`` happily parses ``nan`` / ``inf`` / ``infinity`` and turns
    # huge exponents into infinity; ``int(num)`` below would then raise
    # ValueError / OverflowError. Neither is a monetary amount.
    if not math.isfinite(num):
        return _err("word value")

    if negative:
        num = -num

    if decimals is not None:
        out = f"{num:.{decimals}f}"
    elif num == int(num) and "." not in rest:
        # Integral input written without a decimal point stays integral.
        out = str(int(num))
    else:
        out = f"{num:g}" if abs(num) >= 1e16 else format(num, "f").rstrip("0").rstrip(".")
        if not out or out in ("-", ""):
            out = "0"

    if code is not None:
        out = f"{code} {out}"

    return out, out != value


# ---------------------------------------------------------------------------
# Name
# ---------------------------------------------------------------------------

NameCase = Literal["title", "upper", "lower"]

# Particles in surnames that conventionally stay lowercase in natural
# reading order. Covers the major Indo-European traditions plus
# Arabic/Hebrew patronymic markers.
_NAME_PARTICLES: set[str] = {
    # Germanic / Dutch / French / Italian
    "von", "van", "de", "da", "del", "della", "di", "du", "der",
    "den", "ter", "ten", "le", "la", "los", "las", "el",
    # Spanish / Portuguese
    "dos", "das", "do", "y",
    # Arabic patronymic / nisba
    "bin", "ibn", "bint", "abu", "abd", "al", "el-", "al-",
    # Hebrew
    "ben", "bat", "ha", "ha-",
    # Slavic transliterated (rare in Western forms)
    "z", "ze",
}

# Acronyms / honorifics that keep their conventional casing rather than
# being title-cased (``PhD``, ``MD``, ``Esq``). Includes international
# academic credentials.
_NAME_ACRONYMS: dict[str, str] = {
    # English
    "phd": "PhD", "md": "MD", "esq": "Esq", "ma": "MA", "ba": "BA",
    "bs": "BS", "ms": "MS", "dds": "DDS", "dvm": "DVM", "jd": "JD",
    "rn": "RN", "cpa": "CPA", "ceo": "CEO", "cto": "CTO", "cfo": "CFO",
    # German / Austrian academic
    "dipl": "Dipl", "ing": "Ing", "mag": "Mag", "habil": "Habil",
    "drmed": "Dr.med.", "drphil": "Dr.phil.", "drrernat": "Dr.rer.nat.",
    "msc": "MSc", "bsc": "BSc",
    # International degrees
    "llb": "LLB", "llm": "LLM",
}

# Roman numeral suffixes (``II``, ``IV``, ``XIII``) — the token is uppercased
# before matching, so lowercase input (``iii``) is emitted in uppercase too.
_NAME_ROMAN_RE = re.compile(r"^[IVX]+$")

# Titles. Most languages strip the trailing period (``Mr.`` → ``Mr``);
# the dispatcher in _standardize_name_token does the strip.
_NAME_TITLES: set[str] = {
    # English
    "mr", "mrs", "ms", "miss", "dr", "prof", "sr", "jr", "sir", "madam",
    "rev", "hon",
    # German
    "herr", "frau", "fr", "hr",
    # French
    "m", "mme", "mlle", "mr",
    # Spanish
    "sr", "sra", "srta", "don", "doña", "dona",
    # Italian
    "sig", "sigra", "dott", "dottoressa",
    # Portuguese
    "snr", "snra",
}

# East Asian honorific suffixes — appended after the family name with a
# hyphen and emitted in lowercase. Only Latin transliterations are listed
# here; the set contains no Japanese or Korean script forms.
_EAST_ASIAN_HONORIFICS: set[str] = {
    "san", "sama", "kun", "chan", "sensei", "senpai", "kohai", "dono",
    "shi", "tan", "chin",
    # Korean
    "ssi", "nim",
}

# Suffixes that take a trailing period in their short form (``Jr.``).
_NAME_SUFFIXES: set[str] = {"jr", "sr", "esq"}


def _cap_segment(seg: str) -> str:
    """Capitalize a single word/segment, leaving the rest lowercase."""
    if not seg:
        return seg
    return seg[0].upper() + seg[1:].lower()


def _standardize_name_token(tok: str, *, position: str, all_shouting: bool = False) -> str:
    """Standardize one space-separated token.

    *position* is one of ``"first"``, ``"middle"``, ``"last"`` and
    drives particle / capitalization rules. *all_shouting* is True when
    every token in the surrounding name is uppercase — in that case,
    don't preserve any single token as an acronym.

    Trailing ``,`` / ``;`` / ``:`` characters are detached before the
    rules run and re-attached to whatever the matching rule returns.
    Rules are tried in order and the first match wins:

    1. leading title (first position only),
    2. Roman numeral (never in first position),
    3. known credential acronym,
    4. all-caps acronym in the middle of a mixed-case name,
    5. title, 6. suffix (last position), 7. particle (not last),
    8. single initial, 9. multi-initial, 10. hyphenated segments,
    11. ``Mc`` / ``Mac`` / ``O'`` / ``D'`` inner caps,
    12. plain capitalization.
    """
    if not tok:
        return tok

    # Trailing punctuation gets stripped and re-attached.
    suffix_punct = ""
    while tok and tok[-1] in ",;:":
        suffix_punct = tok[-1] + suffix_punct
        tok = tok[:-1]
    if not tok:
        return suffix_punct

    lowered = tok.lower()
    bare = lowered.rstrip(".")

    # Leading title wins over the acronym table: ``ms`` is both a title
    # (``Ms.``) and a degree (``MS``). At the very start of a name it is
    # the title; without this check ``Ms. Jane Doe`` came out as
    # ``MS Jane Doe`` and the ``ms`` title entry was unreachable.
    if position == "first" and bare in _NAME_TITLES:
        return _cap_segment(bare) + suffix_punct

    # Roman numerals (II, III, IV, …). These are generational suffixes,
    # so they never lead a name — skipping the first slot keeps given
    # names such as ``Xi`` or ``Vi`` from being shouted as ``XI`` / ``VI``.
    # Known limitation: the same names in a later slot are still
    # uppercased.
    if position != "first" and _NAME_ROMAN_RE.match(tok.upper()):
        return tok.upper() + suffix_punct

    # Known acronym (PhD, MD, …)
    if bare in _NAME_ACRONYMS:
        return _NAME_ACRONYMS[bare] + suffix_punct

    # All-caps token of length >= 2 with no lowercase letters and at
    # least one alpha — treat as an acronym in the middle of a name
    # (``Mary USA Smith``, ``John IBM Doe``). Doesn't fire for single
    # initials (``A.``), and doesn't fire when the whole name is
    # shouting (``DR JANE DOE`` shouldn't preserve JANE as an acronym
    # — the whole thing is just the user's caps lock key).
    if (
        position == "middle"
        and not all_shouting
        and len(bare) >= 2
        and tok.isupper()
        and any(c.isalpha() for c in tok)
        and bare not in _NAME_TITLES
        and bare not in _NAME_SUFFIXES
        and bare not in _NAME_PARTICLES
    ):
        return tok + suffix_punct

    # Title (Mr, Dr, Prof) — strip trailing period
    if bare in _NAME_TITLES:
        return _cap_segment(bare) + suffix_punct

    # Suffix (Jr, Sr) — strip trailing period
    if bare in _NAME_SUFFIXES and position == "last":
        return _cap_segment(bare) + suffix_punct

    # Particle (von, van, de, …) — stay lowercase except as final token
    # of the name (the surname slot — ``van Gogh`` last is ``Gogh``,
    # but standalone ``Van`` would be a first name).
    if bare in _NAME_PARTICLES and position != "last":
        return bare + suffix_punct

    # Single-letter initial like ``A`` or ``A.`` → strip trailing
    # period, uppercase. (Check before multi-initial so ``A.`` doesn't
    # fall into the multi-initial branch and keep its period.)
    if len(bare) == 1 and bare.isalpha():
        return bare.upper() + suffix_punct

    # Multi-initial token like ``j.k.`` or ``J.K.`` → uppercase letters,
    # keep internal periods.
    if "." in tok and all(
        seg == "" or (len(seg) == 1 and seg.isalpha()) for seg in tok.split(".")
    ):
        return tok.upper() + suffix_punct

    # Hyphenated segment — capitalize each piece. Special cases:
    #   - East Asian honorific suffix (``Tanaka-san``) stays lowercase.
    #   - Arabic transliterated prefix (``al-Rashid``, ``el-Sayed``)
    #     keeps the prefix lowercase per Arabic naming convention.
    if "-" in tok:
        parts = tok.split("-")
        out_parts = []
        for j, p in enumerate(parts):
            if j > 0 and p.lower() in _EAST_ASIAN_HONORIFICS:
                out_parts.append(p.lower())
            elif j == 0 and p.lower() in {"al", "el", "an", "ad"}:
                out_parts.append(p.lower())
            else:
                out_parts.append(_cap_segment(p))
        return "-".join(out_parts) + suffix_punct

    # Mc / Mac prefix — inner cap.
    if lowered.startswith("mc") and len(lowered) > 2:
        return "Mc" + _cap_segment(tok[2:]) + suffix_punct
    if lowered.startswith("mac") and len(lowered) > 3:
        # Known limitation: applied unconditionally, so a dictionary
        # word such as ``machine`` becomes ``MacHine``. Real surnames
        # are far more common as inputs to a name standardizer.
        return "Mac" + _cap_segment(tok[3:]) + suffix_punct

    # O' prefix — inner cap.
    if lowered.startswith("o'") and len(lowered) > 2:
        return "O'" + _cap_segment(tok[2:]) + suffix_punct

    # D' prefix — inner cap (D'Angelo, D'Arcy).
    if lowered.startswith("d'") and len(lowered) > 2:
        return "D'" + _cap_segment(tok[2:]) + suffix_punct

    return _cap_segment(tok) + suffix_punct


def _is_non_latin_script(s: str) -> bool:
    """Heuristic: true when the string contains non-Latin cased letters."""
    for c in s:
        if c.isalpha():
            cp = ord(c)
            # Latin range up to Latin Extended-B (covers Latin + accents).
            if cp <= 0x024F:
                return False
    # No Latin alpha characters at all → treat as non-Latin.
    return any(c.isalpha() for c in s)


def _is_postnominal_tail(part: str) -> bool:
    """True when every token of *part* is a suffix, credential, or Roman numeral."""
    pieces = part.split()
    if not pieces:
        return False
    for piece in pieces:
        bare = piece.strip(",;:").lower().rstrip(".")
        if bare in _NAME_SUFFIXES or bare in _NAME_ACRONYMS:
            continue
        if _NAME_ROMAN_RE.match(bare.upper()):
            continue
        return False
    return True


def standardize_name(
    value: Optional[str],
    *,
    case: NameCase = "title",
    conservative: bool = False,
    reverse_comma_format: bool = True,
    family_first: bool = False,
) -> tuple[str, bool]:
    """Apply name-friendly casing with prefix / particle / suffix awareness.

    ``"title"`` (default) handles:
      * Mc / Mac inner caps (``mcdonald`` → ``McDonald``).
      * O'/D' inner caps (``o'connor`` → ``O'Connor``).
      * Hyphenated segments (``mary-jane`` → ``Mary-Jane``).
      * Particles stay lowercase mid-name (``van Gogh``, ``de Gaulle``,
        ``bin Salman``, ``ben Avraham``).
      * East Asian honorific suffixes (``Tanaka-san``, ``Lee-ssi``)
        preserved lowercase after the hyphen.
      * Title / suffix periods stripped (``Mr.`` → ``Mr``, ``Jr.`` → ``Jr``).
      * Roman numeral suffixes preserved (``III``).
      * PhD / MD / Esq style acronyms preserved.
      * Multi-initial tokens uppercased (``j.k.`` → ``J.K.``).
      * Non-Latin scripts (Korean, Japanese, Cyrillic) pass through.

    ``conservative=True`` preserves mixed-case input verbatim per the
    corpus § 7.3 ``--name-conservative=on`` policy.

    ``reverse_comma_format`` flips ``Last, First`` to ``First Last``
    (default per corpus § 7.3). The flip is skipped when everything
    after the first comma is a post-nominal — a suffix, a known
    credential, or a Roman numeral — so ``John Smith, Jr.`` becomes
    ``John Smith, Jr`` and ``jane doe, phd`` becomes ``Jane Doe, PhD``
    instead of moving the post-nominal to the front.

    ``family_first=True`` skips comma reversal — appropriate for East
    Asian columns where the family name comes first natively
    (``Kim Min-jae``, ``田中 太郎``). Token-level rules, including title
    handling, are applied unchanged. Set this per-column when you know
    the cultural convention.

    ``"upper"`` / ``"lower"`` are simple case conversions.

    Returns ``(new_value, changed)``. Falsy or non-string input is
    returned as ``(value or "", False)``; whitespace-only input is
    returned unchanged.

    Raises:
        ValueError: if *case* is not ``"title"``, ``"upper"`` or
            ``"lower"`` (only checked for non-empty input).
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    if case == "upper":
        out = s.upper()
        return out, out != value
    if case == "lower":
        out = s.lower()
        return out, out != value
    if case != "title":
        raise ValueError(f"Unknown name case: {case}")

    # Non-Latin scripts pass through unchanged — no case to apply.
    if _is_non_latin_script(s):
        return value, False

    # Conservative mode: only normalize all-caps or all-lowercase input.
    if conservative:
        cased = [c for c in s if c.isalpha()]
        if cased and any(c.isupper() for c in cased) and any(c.islower() for c in cased):
            return value, False

    # Comma-format reversal: "Smith, John Andrew" → "John Andrew Smith".
    # Skipped under family_first because East Asian conventions write
    # the family name first natively — reversing would corrupt them.
    if reverse_comma_format and not family_first and "," in s:
        parts = [p.strip() for p in s.split(",", 1)]
        if len(parts) == 2 and parts[0] and parts[1]:
            if _is_postnominal_tail(parts[1]):
                # "John Smith, Jr." is already in natural order; only
                # normalize the spacing after the comma so the tail is
                # tokenized on its own ("smith,jr" → "smith, jr").
                s = f"{parts[0]}, {parts[1]}"
            else:
                s = f"{parts[1]} {parts[0]}"

    tokens = s.split(" ")
    n = len(tokens)
    cased = [c for c in s if c.isalpha()]
    all_shouting = bool(cased) and not any(c.islower() for c in cased)
    out_tokens: list[str] = []
    for i, tok in enumerate(tokens):
        if not tok:
            # Empty tokens come from runs of spaces; keeping them
            # preserves the original internal spacing on re-join.
            out_tokens.append(tok)
            continue
        position = "first" if i == 0 else ("last" if i == n - 1 else "middle")
        out_tokens.append(_standardize_name_token(
            tok, position=position, all_shouting=all_shouting,
        ))

    out = " ".join(out_tokens)
    return out, out != value


# ---------------------------------------------------------------------------
# Address
# ---------------------------------------------------------------------------

# Expansion table — the inverse of the dedup-side compression set in
# ``normalize_address``. We deliberately don't expand ``unit``, ``loop``,
# or ``way`` because those are already the long form. Canonical mappings
# live in :mod:`src.core._constants` so both modules stay in sync.
from ._constants import (
    USPS_EXPANSIONS as _ADDRESS_EXPANSIONS,
    USPS_COMPRESSIONS as _ADDRESS_COMPRESSIONS,
    US_STATE_CODES as _US_STATE_CODES_SHARED,
    US_STATE_NAMES as _US_STATE_NAMES_SHARED,
    CA_PROVINCE_CODES, CA_PROVINCE_NAMES,
    AU_STATE_CODES, AU_STATE_NAMES,
    DE_STATE_CODES, DE_STATE_NAMES,
    POSTAL_PATTERNS,
    INTL_PO_BOX_PATTERNS,
)

# Short tokens that look like directions but only mean a direction at the
# start or end of an address — never in the middle of a street name. This
# avoids mangling ``123 N Main St`` (legit) vs. ``123 N. Main`` (legit) but
# also keeping us from rewriting ``Tower N`` → ``Tower North`` mid-line if
# it's part of a building name.
_DIRECTION_TOKENS = {"n", "s", "e", "w", "ne", "nw", "se", "sw"}

_TOKEN_RE = re.compile(r"\w+|[^\w\s]+|\s+")

# Aliases over the shared constants — kept for the local module-level
# reads that already reference these names.
_US_STATE_CODES = _US_STATE_CODES_SHARED
_US_STATE_NAMES = _US_STATE_NAMES_SHARED

# Per-country (full-name, code, postal-pattern) tables. Each yields a
# precompiled regex matching ``, <state name> <postal>``. Sorted
# longest-first so multi-word names win over their prefixes.
def _build_state_patterns(
    name_to_code: dict[str, str], postal_pattern: str,
) -> list[tuple[re.Pattern[str], str]]:
    return [
        (
            re.compile(
                rf"(,\s*){re.escape(full)}(\s+{postal_pattern})",
                re.IGNORECASE,
            ),
            code,
        )
        for full, code in sorted(name_to_code.items(), key=lambda kv: -len(kv[0]))
    ]


_STATE_NAME_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    _US_STATE_NAMES, r"\d{5}(?:-\d{4})?",
)
_CA_PROVINCE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    CA_PROVINCE_NAMES, r"[A-Z]\d[A-Z]\s*\d[A-Z]\d",
)
_AU_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    AU_STATE_NAMES, r"\d{4}",
)
_DE_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    DE_STATE_NAMES, r"\d{5}",
)

# PO Box variants normalize to a single canonical form. Combines the
# English pattern with the international locale variants registered in
# _constants.INTL_PO_BOX_PATTERNS.
_PO_BOX_RE = re.compile(
    r"\b(?:" + "|".join(INTL_PO_BOX_PATTERNS.values()) + r")\b",
    re.IGNORECASE,
)

# Country-shape postal patterns (precompiled). Used to detect which
# country-specific normalization to apply (state-code preservation,
# street-suffix dictionary, etc.).
_POSTAL_REGEXES: dict[str, re.Pattern[str]] = {
    cc: re.compile(pat) for cc, pat in POSTAL_PATTERNS.items()
}
# Back-compat aliases for sites that already reference these names.
_US_ZIP_TAIL_RE = _POSTAL_REGEXES["us"]
_CANADA_POSTAL_RE = _POSTAL_REGEXES["ca"]
_UK_POSTCODE_RE = _POSTAL_REGEXES["uk"]

# Combined state-code set: US + Canada + Australia + Germany. The
# state-code-position check preserves any of these when found in the
# slot between a comma and the postal code.
_INTL_STATE_CODES: frozenset[str] = (
    _US_STATE_CODES_SHARED | CA_PROVINCE_CODES | AU_STATE_CODES | DE_STATE_CODES
)


def _is_state_code_position(tokens: list[str], idx: int) -> bool:
    """Heuristic: ``tokens[idx]`` sits in a state-code slot.

    A state code typically appears as ``…, XX 12345`` — preceded (modulo
    whitespace) by a comma and followed by a 5-digit ZIP. We allow some
    flexibility: a trailing position after a comma also counts even
    without a ZIP.
    """
    # Look back for a comma (skipping whitespace).
    j = idx - 1
    while j >= 0 and tokens[j].isspace():
        j -= 1
    if j < 0 or tokens[j] != ",":
        return False
    # Look ahead for a postal-shaped token. Accepts US ZIP (5 digits +
    # optional +4), Australian (4 digits), Canadian first half (single
    # letter + digit + letter), and the start of a UK outward code.
    j = idx + 1
    while j < len(tokens) and tokens[j].isspace():
        j += 1
    if j >= len(tokens):
        return True  # tail of line, after a comma — accept
    nxt = tokens[j]
    return bool(re.match(
        r"\d{4,5}(?:-\d{4})?$|^[A-Z]\d[A-Z]$|^[A-Z]{1,2}\d",
        nxt, re.IGNORECASE,
    ))


def standardize_address(
    value: Optional[str],
    *,
    extra_abbreviations: Optional[dict[str, str]] = None,
    expand: bool = True,
    state_to_code: bool = True,
    collapse_multiline: bool = True,
    trim_trailing_comma: bool = True,
    normalize_po_box: bool = True,
) -> tuple[str, bool]:
    """Standardize a US-style address.

    By default expands USPS abbreviations (``St`` → ``Street``) and
    title-cases the result. With ``expand=False`` the inverse direction
    is used (``Street`` → ``St``), which matches the corpus default of
    USPS abbreviated form as canonical (FORMATS-CASES.md § 6.3).

    Other policy knobs:
      * ``extra_abbreviations`` — caller-supplied ``{abbreviation:
        replacement}`` pairs layered over the built-in table. Keys are
        casefolded and stripped of trailing periods; entries whose key or
        value is not a non-blank string are ignored.
      * ``state_to_code`` — convert spelled-out state names to 2-letter
        postal codes (``New York`` (state) → ``NY``).
      * ``collapse_multiline`` — replace embedded newlines with ``, ``
        so ``123 Main St\\nApt 4B`` becomes ``123 Main St, Apt 4B``.
      * ``trim_trailing_comma`` — drop a sole trailing comma left by
        loose CSV exports.
      * ``normalize_po_box`` — fold ``P.O. Box`` / ``Post Office Box``
        / ``po box`` variants to canonical ``PO Box``.

    State codes are preserved verbatim regardless of the surrounding
    case (``ny`` in all-lowercase input becomes ``NY``, not ``Ny``).

    Returns ``(new_value, changed)`` where *changed* is True when the
    result differs from the original *value*. Falsy or non-string input
    is returned as ``(value or "", False)``; whitespace-only input is
    returned unchanged.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    if not value.strip():
        return value, False

    s = value
    # If the whole input is shouting (every cased letter uppercase),
    # lowercase it before any token replacement so the title-case pass
    # produces ``Main St`` rather than seeing a mix of ``MAIN`` and
    # already-replaced ``St`` and giving up on the all-caps tokens.
    cased = [c for c in s if c.isalpha()]
    if cased and not any(c.islower() for c in cased):
        s = s.lower()
    if collapse_multiline and "\n" in s:
        # Each line becomes a comma-joined segment — but skip empty lines
        # and dedupe a comma the user already had at the line break.
        parts = [p.strip().rstrip(",").strip() for p in s.splitlines()]
        s = ", ".join(p for p in parts if p)

    if normalize_po_box:
        s = _PO_BOX_RE.sub("PO Box", s)

    # Country-shape detection, based on which postal-code regex matches.
    is_us_shaped = bool(_US_ZIP_TAIL_RE.search(s))
    is_ca_shaped = bool(_CANADA_POSTAL_RE.search(s))
    is_uk_shaped = bool(_UK_POSTCODE_RE.search(s))
    # German postal codes are 5 digits, the same shape as a US ZIP, so
    # the US postal regex is reused as the "has a 5-digit postal code"
    # gate here. The address counts as DE only when, in addition, a
    # German state name or code follows a comma and precedes 5 digits.
    is_de_shaped = (
        is_us_shaped and any(
            re.search(rf",\s*{re.escape(name)}\s+\d{{5}}", s, re.IGNORECASE)
            or re.search(rf",\s*{re.escape(code)}\s+\d{{5}}", s, re.IGNORECASE)
            for name, code in DE_STATE_NAMES.items()
        )
    )
    # AU detection: 4-digit postal at tail AND a known AU state code or
    # full-name substring is present somewhere in the address.
    _au_state_words = "|".join(
        list(AU_STATE_CODES) + [re.escape(n) for n in AU_STATE_NAMES]
    )
    is_au_shaped = bool(
        re.search(r"\b\d{4}\b\s*$", s.rstrip(","))
        and re.search(rf"\b(?:{_au_state_words})\b", s, re.IGNORECASE)
    )

    if state_to_code:
        # State-name → code conversion. Each country's pattern only
        # fires when its own postal-code shape is detected, so US
        # "New York" before "NY 10001" is left alone (it's a city), and
        # Canadian "Ontario" before "M5E 1W7" becomes "ON".
        if is_us_shaped:
            for pat, code in _STATE_NAME_PATTERNS:
                s = pat.sub(rf"\g<1>{code}\g<2>", s)
        if is_ca_shaped:
            for pat, code in _CA_PROVINCE_PATTERNS:
                s = pat.sub(rf"\g<1>{code}\g<2>", s)
        if is_au_shaped:
            for pat, code in _AU_STATE_PATTERNS:
                s = pat.sub(rf"\g<1>{code}\g<2>", s)
        if is_de_shaped:
            for pat, code in _DE_STATE_PATTERNS:
                s = pat.sub(rf"\g<1>{code}\g<2>", s)

    if not expand:
        # Compression is applied only when a US ZIP or a Canadian postal
        # code is detected; any other address gets an empty table, so
        # its street words keep their original spelling —
        # ``Downing Street`` stays ``Downing Street``, not ``Downing St``.
        abbrev_table = (
            {k: v for k, v in _ADDRESS_COMPRESSIONS.items()}
            if is_us_shaped or _CANADA_POSTAL_RE.search(s)
            else {}
        )
    else:
        abbrev_table = dict(_ADDRESS_EXPANSIONS)

    if extra_abbreviations:
        # Work on a copy, then overlay the caller's entries; later keys
        # override built-in ones with the same normalized key.
        abbrev_table = {**abbrev_table}
        for k, v in extra_abbreviations.items():
            if isinstance(k, str) and isinstance(v, str) and k.strip() and v.strip():
                abbrev_table[k.casefold().rstrip(".").strip()] = v.strip()

    expansion_values = set(abbrev_table.values())
    # Canonical USPS abbreviation forms (``St``, ``Ave``, …) — used to
    # strip a trailing period when the abbreviation is already canonical
    # in compression mode (``St.`` → ``St``).
    canonical_abbrevs = set(_ADDRESS_COMPRESSIONS.values()) | set(
        _ADDRESS_EXPANSIONS
    )

    # Split into word / punctuation-run / whitespace-run tokens so the
    # address can be rebuilt with its original separators.
    tokens = _TOKEN_RE.findall(s)

    out_tokens: list[str] = []
    for i, tok in enumerate(tokens):
        if not tok or not tok[0].isalnum():
            # Punctuation / whitespace passes through verbatim — but if
            # it begins with a period and the previous output token is a
            # known USPS abbreviation, strip the leading period (``St.``
            # → ``St``, ``St.,`` → ``St,``).
            if (
                tok.startswith(".")
                and out_tokens
                and (out_tokens[-1] in expansion_values
                     or out_tokens[-1] in canonical_abbrevs)
            ):
                tok = tok[1:]
                if not tok:
                    continue
            out_tokens.append(tok)
            continue

        # Lookup key for the abbreviation table and the uppercase form
        # used for the state-code check.
        key = tok.casefold().rstrip(".")
        upper_form = tok.upper().rstrip(".")

        # State code preservation: if this token is a known state code
        # in a state-code position, preserve it as uppercase regardless
        # of input case or abbreviation table collisions.
        if upper_form in _INTL_STATE_CODES and _is_state_code_position(tokens, i):
            out_tokens.append(upper_form)
            continue

        expansion = abbrev_table.get(key)
        if expansion is not None:
            out_tokens.append(expansion)
        else:
            out_tokens.append(tok)

    rebuilt = "".join(out_tokens)
    titled = smart_title_case(rebuilt)

    # Re-apply state-code preservation post title-case (smart_title_case
    # may have lowercased an all-lowercase token before we could fix it).
    titled = _restore_state_codes(titled)

    if trim_trailing_comma:
        titled = titled.rstrip()
        if titled.endswith(","):
            titled = titled[:-1].rstrip()

    return titled, titled != value


# Matches ``, XX`` where ``XX`` is any two ASCII letters, followed either by
# a ZIP / ZIP+4 or by the end of the string. Group 1 is the comma plus
# spacing, group 2 the two letters, group 3 the trailing ZIP (or whitespace).
_STATE_CODE_AFTER_COMMA_RE = re.compile(
    r"(,\s*)([A-Za-z]{2})(\s+\d{5}(?:-\d{4})?|\s*$)"
)


def _restore_state_codes(s: str) -> str:
    """Upper-case two-letter state codes that directly follow a comma.

    Only candidates found in ``_INTL_STATE_CODES`` are rewritten; any other
    two-letter token in the same position is left exactly as it was.
    """
    def _upcase_if_state(match: re.Match) -> str:
        lead, letters, tail = match.groups()
        code = letters.upper()
        if code not in _INTL_STATE_CODES:
            # Not a known code — hand back the matched text untouched.
            return match.group(0)
        return lead + code + tail

    return _STATE_CODE_AFTER_COMMA_RE.sub(_upcase_if_state, s)


# ---------------------------------------------------------------------------
# Email
# ---------------------------------------------------------------------------
#
# 03's email cleaner is the public surface for normalization (see
# FORMATS-CASES.md § 0.1 — duplicates the matching logic the dedup
# tier-1 spec uses internally, so callers don't have to run dedup just
# to lowercase a list of emails).

EmailErrorPolicy = Literal["passthrough", "sentinel"]

# Strict-enough RFC 5322-ish regex: local@domain.tld, allowing IDN.
_EMAIL_RE = re.compile(
    r"^(?P<local>[^\s@<>\"]+)@(?P<domain>[^\s@<>\"]+\.[^\s@<>\".]+)$"
)
# Display-name extraction: ``"Alice" <alice@example.com>`` or
# ``Alice Smith <alice@example.com>``.
_EMAIL_ANGLE_RE = re.compile(r"<([^<>]+)>")
_MAILTO_PREFIX_RE = re.compile(r"^mailto:", re.IGNORECASE)
# Smart-quote wrapping the whole address.
_EMAIL_SMARTQUOTE_RE = re.compile(r"^[“”‘’]+|[“”‘’]+$")
# Bidirectional control characters used in homograph / spoofing attacks
# against email addresses (``alice<U+202E>@example.com`` displays as
# ``alice@elpmaxe.com`` to RTL-aware renderers). Strip on every parse.
# The class is written with explicit escapes rather than the raw invisible
# characters so the pattern survives editors, formatters and copy/paste:
#   U+202A-U+202E  LRE, RLE, PDF, LRO, RLO
#   U+2066-U+2069  LRI, RLI, FSI, PDI
#   U+200E, U+200F LRM, RLM
_EMAIL_BIDI_RE = re.compile(r"[\u202a-\u202e\u2066-\u2069\u200e\u200f]")
# Multi-email cell separator.
_EMAIL_MULTI_RE = re.compile(r"[,;]\s*\S+@\S+\.\S+")


def standardize_email(
    value: Optional[str],
    *,
    gmail_canonical: bool = False,
    error_policy: EmailErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Lowercase + trim + strip mailto/display-name wrappers.

    Default behavior preserves Gmail dots and ``+tag`` segments — that's
    a Gmail provider policy, not a generic email standard. Set
    ``gmail_canonical=True`` to strip dots and ``+`` tags from the local
    part for ``@gmail.com`` addresses only (corpus § 5.3).

    Multiple addresses in a single cell, missing/duplicate ``@``,
    internal whitespace, and TLD-less inputs are surfaced as
    ``<error: <reason>>`` when ``error_policy="sentinel"``.

    Args:
        value: Raw cell content. Falsy or non-string input is returned
            as ``(value or "", False)`` without further processing.
        gmail_canonical: Collapse dots and drop the ``+tag`` suffix of the
            local part when the domain is exactly ``gmail.com``.
        error_policy: Forwarded to ``_err_or_passthrough`` for every
            rejected input.

    Returns:
        ``(new_value, changed)`` where ``changed`` is ``new_value != value``
        on the success path. Whitespace-only input is returned unchanged.
        Any address that fails the final shape regex is reported with the
        reason ``"no TLD"``.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def _err(reason: str) -> tuple[str, bool]:
        # Every rejection goes through the shared policy helper with the
        # ORIGINAL cell value, not the partially cleaned one.
        return _err_or_passthrough(reason, value, error_policy)

    # Multi-email cell — error before we silently pick one.
    if _EMAIL_MULTI_RE.search(s) and not s.startswith("<"):
        # If splitting on ;/, yields multiple email-shaped tokens, error.
        parts = re.split(r"[,;]\s*", s)
        email_parts = [p for p in parts if "@" in p and "." in p.split("@")[-1]]
        if len(email_parts) >= 2:
            return _err("multiple emails")

    # Smart-quote wrappers (``“alice@example.com”``).
    s = _EMAIL_SMARTQUOTE_RE.sub("", s).strip()
    # Strip BIDI / RTL override controls — these are a homograph attack
    # vector and have no legitimate use inside an email address.
    s = _EMAIL_BIDI_RE.sub("", s)

    # Display-name with angle brackets — extract the address.
    m = _EMAIL_ANGLE_RE.search(s)
    if m:
        s = m.group(1).strip()

    # mailto: prefix.
    s = _MAILTO_PREFIX_RE.sub("", s).strip()

    # Trailing punctuation contamination (``alice@example.com,`` etc.).
    s = s.rstrip(",;:.)”’")

    # Internal whitespace check (``alice @ example.com``).
    if re.search(r"\s", s):
        return _err("internal whitespace")

    # Lowercase the whole thing — both local part and domain are
    # case-insensitive in practice (RFC 5321 says local can be
    # case-sensitive but no real provider treats it that way).
    s = s.lower()

    # Validate shape.
    if "@" not in s:
        return _err("missing @")
    if s.count("@") >= 2:
        # ``alice@@example.com`` is double-@, ``alice@example@com`` is
        # multi-@; both error.
        return _err("double @" if "@@" in s else "multiple @")
    m = _EMAIL_RE.match(s)
    if not m:
        return _err("no TLD")

    local = m.group("local")
    domain = m.group("domain")

    if gmail_canonical and domain == "gmail.com":
        # Dots are removed first, then everything from the first ``+`` on.
        local = local.replace(".", "").split("+", 1)[0]
        s = f"{local}@{domain}"

    return s, s != value


# ---------------------------------------------------------------------------
# Boolean
# ---------------------------------------------------------------------------

# Case-folded tokens accepted as truthy / falsy string input.
_TRUE_TOKENS = {"true", "t", "yes", "y", "1", "on"}
_FALSE_TOKENS = {"false", "f", "no", "n", "0", "off"}

BoolStyle = Literal["True/False", "true/false", "Yes/No", "Y/N", "1/0"]

# style name -> (truthy rendering, falsy rendering)
_BOOL_OUTPUT: dict[BoolStyle, tuple[str, str]] = {
    "True/False": ("True", "False"),
    "true/false": ("true", "false"),
    "Yes/No": ("Yes", "No"),
    "Y/N": ("Y", "N"),
    "1/0": ("1", "0"),
}


def standardize_boolean(
    value: Any,
    *,
    style: BoolStyle = "True/False",
) -> tuple[str, bool]:
    """Map common truthy/falsy strings (and Python bools) to a canonical pair.

    Recognized truthy: ``true t yes y 1 on``. Recognized falsy:
    ``false f no n 0 off``. Comparison is case-insensitive after trim.
    Unrecognized input passes through unchanged.

    Args:
        value: Cell content. Strings are matched against the token sets;
            Python ``bool`` and numeric ``0``/``1`` are mapped directly.
        style: Key into ``_BOOL_OUTPUT`` selecting the output spelling.

    Returns:
        ``(new_value, changed)``. Missing input (``None``, float NaN,
        ``pd.NA``) yields ``("", False)``; other unrecognized non-string
        input yields ``(str(value), False)``.

    Raises:
        KeyError: If ``style`` is not a key of ``_BOOL_OUTPUT``.
    """
    true_out, false_out = _BOOL_OUTPUT[style]

    if isinstance(value, bool):
        out = true_out if value else false_out
        return out, True

    # ``pd.NA`` must be caught here: ``pd.NA == 0`` evaluates to ``pd.NA``
    # and using that in an ``if`` raises TypeError, so letting it reach the
    # numeric comparison below would crash on nullable columns.
    if (
        value is None
        or value is pd.NA
        or (isinstance(value, float) and pd.isna(value))
    ):
        return "", False

    if not isinstance(value, str):
        # Numeric 0/1 → False/True; anything else is unrecognized.
        if value == 0:
            return false_out, True
        if value == 1:
            return true_out, True
        return str(value), False

    s = value.strip().casefold()
    if not s:
        return value, False
    if s in _TRUE_TOKENS:
        return true_out, true_out != value
    if s in _FALSE_TOKENS:
        return false_out, false_out != value
    return value, False


# ---------------------------------------------------------------------------
# Options / result dataclasses
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# Preset bundles
# ---------------------------------------------------------------------------
#
# A preset is a flat dict of ``StandardizeOptions`` field defaults — the
# subset that varies between locales / standards. ``column_types`` and
# ``extra_abbreviations`` are caller-supplied and never carried by a
# preset.
#
# Standards backing each preset:
#   us-default  ISO 8601 dates · ITU-T E.164 phones (US) · ISO 4217 minor
#               unit (2dp) · USPS Pub. 28 address expansion · "True/False"
#   european    ISO 8601 dates with DMY for ambiguous input · international-
#               format phones (DE region) · ISO 4217 code preserved, comma
#               decimal input · "True/False"
#   uk          DD/MM/YYYY display · international-format phones (GB region)
#               · ISO 4217 dot · "Yes/No" booleans (common in UK gov forms)
#   iso-strict  ISO 8601 dates · E.164 · ISO 4217 code preserved, no rounding
#               · "true/false" lowercase (JSON canonical) · Title names
#   legacy-us   MM/DD/YYYY display · National-format phones · 2dp currency
#               · "Yes/No" — for downstream systems that haven't moved off
#               local conventions yet.

# Field values shared by most presets; each entry in ``PRESETS`` below is
# this baseline with only the differing fields overridden.
_PRESET_BASELINE: dict[str, Any] = {
    "date_output_format": "%Y-%m-%d",
    "date_order": "MDY",
    "phone_format": "E164",
    "phone_region": "US",
    "currency_decimal": "dot",
    "currency_decimals": 2,
    "currency_preserve_code": False,
    "name_case": "title",
    "boolean_style": "True/False",
}

# preset name -> ``StandardizeOptions`` field defaults for that preset.
PRESETS: dict[str, dict[str, Any]] = {
    "us-default": {**_PRESET_BASELINE},
    "european": {
        **_PRESET_BASELINE,
        "date_order": "DMY",
        "phone_format": "INTERNATIONAL",
        "phone_region": "DE",
        "currency_decimal": "comma",
        "currency_preserve_code": True,
    },
    "uk": {
        **_PRESET_BASELINE,
        "date_output_format": "%d/%m/%Y",
        "date_order": "DMY",
        "phone_format": "INTERNATIONAL",
        "phone_region": "GB",
        "boolean_style": "Yes/No",
    },
    "iso-strict": {
        **_PRESET_BASELINE,
        "currency_decimals": None,
        "currency_preserve_code": True,
        "boolean_style": "true/false",
    },
    "legacy-us": {
        **_PRESET_BASELINE,
        "date_output_format": "%m/%d/%Y",
        "phone_format": "NATIONAL",
        "boolean_style": "Yes/No",
    },
}


@dataclass
class StandardizeOptions:
    """Configuration for :func:`standardize_dataframe`.

    The standardizer is column-typed: the user (or auto-detection layer
    above) assigns each column a :class:`FieldType`, and the per-cell
    function for that type runs over the column. Columns absent from
    ``column_types`` pass through untouched.

    Instances can be built directly, from a named preset
    (:meth:`from_preset`), from a plain dict (:meth:`from_dict`) or from a
    JSON file (:meth:`from_file`), and written back with :meth:`to_dict` /
    :meth:`to_file`.
    """

    # column name -> field type (string or FieldType enum value)
    column_types: dict[str, FieldType] = field(default_factory=dict)

    # Date formatting
    date_output_format: str = "%Y-%m-%d"
    date_order: DateOrder = "MDY"

    # Phone formatting
    phone_format: PhoneFormat = "E164"
    phone_region: str = "US"

    # Currency formatting
    currency_decimal: CurrencyDecimal = "dot"
    currency_decimals: Optional[int] = 2
    # When True, an ISO 4217 code detected in the input is re-emitted as a
    # space-separated prefix on the standardized number.
    currency_preserve_code: bool = False

    # Name casing
    name_case: NameCase = "title"

    # Boolean style
    boolean_style: BoolStyle = "True/False"

    # Email policy
    email_gmail_canonical: bool = False
    email_error_policy: EmailErrorPolicy = "passthrough"

    # Address policy (corpus § 6.3 — abbreviated form is canonical, but
    # the existing tests/baseline assume expand-by-default; new callers
    # opt into compression by setting expand=False).
    address_expand: bool = True
    address_state_to_code: bool = True
    address_collapse_multiline: bool = True
    address_trim_trailing_comma: bool = True
    address_normalize_po_box: bool = True

    # Per-domain error sentinels — when "sentinel", emit ``<error: …>``
    # for unparseable / out-of-domain values. Default ``passthrough``
    # preserves the input unchanged.
    date_error_policy: DateErrorPolicy = "passthrough"
    phone_error_policy: PhoneErrorPolicy = "passthrough"
    currency_error_policy: CurrencyErrorPolicy = "passthrough"

    # Date locale handling — extra month-name dictionaries beyond English.
    date_month_locales: Optional[list[str]] = None

    # Name policy
    name_conservative: bool = False
    name_reverse_comma_format: bool = True
    name_family_first: bool = False  # set per-column for East Asian data

    # User overrides for the address abbreviation table. Merged on top of
    # the built-in USPS Pub. 28 list at runtime; values flow through
    # verbatim into Title Case rendering.
    extra_abbreviations: dict[str, str] = field(default_factory=dict)

    # ----- Scale knobs for large international files -----
    # Per-row country/region overrides. When set, each phone or address
    # row's region is read from the named column (an ISO-3166 alpha-2 code:
    # "US", "GB", "JP", "FR", …). Falls back to ``phone_region`` /
    # global default when the column is missing or the cell is blank.
    phone_country_column: Optional[str] = None
    address_country_column: Optional[str] = None

    # Audit cap. The change table can grow to tens of millions of rows on
    # a 1 GB input — capping protects memory and keeps the audit usable.
    # ``cells_changed`` still counts every modification; only the per-row
    # ``changes`` DataFrame is truncated. Set to None for unbounded.
    audit_max_rows: Optional[int] = 10_000

    # Value-level LRU cache size per standardizer. Repeated phone numbers
    # (call-list duplicates), repeated currencies, repeated boolean
    # tokens — all dominate at scale. A 256k-entry cache absorbs most
    # real-world cardinalities without ballooning memory.
    cache_size: int = 262_144

    @classmethod
    def from_preset(cls, name: str, **overrides: Any) -> StandardizeOptions:
        """Build options from a named preset, with optional field overrides.

        Example: ``StandardizeOptions.from_preset("uk", column_types={...})``
        starts from UK defaults and layers ``column_types`` on top.

        Raises:
            ValueError: If *name* is not a key of ``PRESETS``.
        """
        if name not in PRESETS:
            raise ValueError(
                f"Unknown preset '{name}'. "
                f"Available: {', '.join(sorted(PRESETS))}."
            )
        # Copy so overrides never leak back into the shared preset table.
        base = dict(PRESETS[name])
        base.update(overrides)
        return cls(**base)

    @classmethod
    def from_dict(cls, data: dict) -> StandardizeOptions:
        """Build options from a plain mapping (typically parsed JSON).

        Keys that are not dataclass fields are ignored. ``column_types``
        values are coerced to :class:`FieldType`, and the enum-like string
        fields are validated up front.

        Raises:
            ConfigError: If *data* or its ``column_types`` entry is not a
                mapping, if a column's field type is unknown, or if one of
                the validated string fields holds an unsupported value.
        """
        from collections.abc import Mapping

        from .errors import ConfigError

        # A JSON file whose top level is a list/string/number would
        # otherwise die with a bare AttributeError on ``.items()``.
        if not isinstance(data, Mapping):
            raise ConfigError(
                "StandardizeOptions config must be a mapping, got "
                f"{type(data).__name__}",
                operation="StandardizeOptions.from_dict",
                suggestion=(
                    "The top-level value must be a JSON object / dict of "
                    "option names to values."
                ),
            )

        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        column_types = kwargs.get("column_types") or {}
        if not isinstance(column_types, Mapping):
            raise ConfigError(
                "column_types must be a mapping of column name to field "
                f"type, got {type(column_types).__name__}",
                operation="StandardizeOptions.from_dict",
                suggestion=(
                    f"Valid field types: {sorted(t.value for t in FieldType)}"
                ),
            )
        resolved: dict[str, FieldType] = {}
        for col, raw in column_types.items():
            try:
                resolved[col] = (
                    FieldType(raw) if not isinstance(raw, FieldType) else raw
                )
            except ValueError as e:
                valid = sorted(t.value for t in FieldType)
                raise ConfigError(
                    f"Invalid field type {raw!r} for column {col!r}",
                    column=col,
                    operation="StandardizeOptions.from_dict",
                    cause=e,
                    suggestion=f"Valid field types: {valid}",
                ) from e
        kwargs["column_types"] = resolved
        # Surface enum-string mismatches early — bad date_order ("xyz")
        # would otherwise crash deep inside standardize_date.
        for field_name, valid in (
            ("date_order", {"MDY", "DMY"}),
            ("phone_format", set(_PHONE_FORMAT_MAP) | {"DIGITS"}),
            ("currency_decimal", {"dot", "comma", "auto"}),
            ("name_case", {"title", "upper", "lower"}),
            ("boolean_style", set(_BOOL_OUTPUT)),
            ("date_error_policy", {"passthrough", "sentinel"}),
            ("phone_error_policy", {"passthrough", "sentinel"}),
            ("currency_error_policy", {"passthrough", "sentinel"}),
            ("email_error_policy", {"passthrough", "sentinel"}),
        ):
            value = kwargs.get(field_name)
            if value is None:
                continue
            try:
                is_valid = value in valid
            except TypeError:
                # Unhashable value (e.g. a JSON list) — never a valid choice.
                is_valid = False
            if not is_valid:
                raise ConfigError(
                    f"Invalid {field_name}={value!r}",
                    operation="StandardizeOptions.from_dict",
                    suggestion=f"Valid values: {sorted(valid)}",
                )
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return the options as a plain dict with enum values flattened.

        ``column_types`` entries that are :class:`FieldType` members are
        replaced by their string ``.value`` so the result is JSON-friendly.
        """
        d = asdict(self)
        d["column_types"] = {c: t.value if isinstance(t, FieldType) else t
                             for c, t in self.column_types.items()}
        return d

    def to_file(self, path: str | Path) -> Path:
        """Serialize the options as indented JSON to *path* (UTF-8).

        Returns:
            The destination as a :class:`~pathlib.Path`.

        Raises:
            ConfigError: If the options cannot be encoded as JSON.
            Exception: Whatever ``wrap_file_write`` builds from an
                ``OSError`` raised while writing.
        """
        from .errors import ConfigError, wrap_file_write
        out = Path(path)
        try:
            payload = json.dumps(self.to_dict(), indent=2)
        except TypeError as e:
            raise ConfigError(
                "Could not serialize StandardizeOptions to JSON",
                operation="StandardizeOptions.to_file",
                cause=e,
                suggestion=(
                    "extra_abbreviations or column_types likely contains a "
                    "non-string/non-enum value. Inspect with .to_dict() and "
                    "remove the offending entry."
                ),
            ) from e
        try:
            # Explicit encoding: never depend on the platform locale.
            out.write_text(payload, encoding="utf-8")
        except OSError as e:  # PermissionError is already an OSError
            raise wrap_file_write(out, "StandardizeOptions.to_file", e) from e
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> StandardizeOptions:
        """Load options from a UTF-8 JSON file written by :meth:`to_file`.

        Raises:
            ConfigError: If the file is not valid UTF-8, is not valid JSON,
                or fails the checks in :meth:`from_dict`.
            Exception: Whatever ``wrap_file_read`` builds from an
                ``OSError`` raised while reading.
        """
        from .errors import ConfigError, wrap_file_read
        path = Path(path)
        try:
            # JSON interchange is UTF-8; reading with the locale default
            # would mangle non-ASCII abbreviations on some platforms.
            text = path.read_text(encoding="utf-8")
        except OSError as e:
            raise wrap_file_read(path, "StandardizeOptions.from_file", e) from e
        except UnicodeDecodeError as e:
            raise ConfigError(
                "StandardizeOptions config is not valid UTF-8",
                path=path,
                operation="StandardizeOptions.from_file",
                cause=e,
                suggestion="Re-save the file as UTF-8 encoded JSON.",
            ) from e
        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            raise ConfigError(
                "Invalid JSON in StandardizeOptions config",
                path=path,
                operation="StandardizeOptions.from_file",
                cause=e,
                suggestion=(
                    f"JSON parser failed at line {e.lineno}, column {e.colno}. "
                    "Validate the file with `python -m json.tool < file.json`."
                ),
            ) from e
        return cls.from_dict(data)


@dataclass
class StandardizeResult:
    """Output of :func:`standardize_dataframe`.

    Carries the rewritten frame together with the change audit and the
    summary counters for the run.
    """

    # The standardized frame.
    standardized_df: pd.DataFrame
    # Per-cell audit table. May hold fewer rows than ``cells_changed`` when
    # ``StandardizeOptions.audit_max_rows`` truncates it.
    changes: pd.DataFrame                # cols: row, column, field_type, old, new
    # Counts every modification, regardless of the audit cap.
    cells_changed: int
    cells_unparseable: int               # rows where a typed column held junk
    # NOTE(review): presumably the number of cells visited in typed
    # columns — confirm against standardize_dataframe.
    cells_total: int
    # NOTE(review): presumably the names of the columns that had a field
    # type assigned and were run — confirm against standardize_dataframe.
    columns_processed: list[str]


# ---------------------------------------------------------------------------
# Per-cell dispatch
# ---------------------------------------------------------------------------

def _apply_field_type(
    value: Any,
    field_type: FieldType,
    options: StandardizeOptions,
) -> tuple[Any, bool, bool]:
    """Standardize a single cell according to its column's *field_type*.

    The result is ``(new_value, changed, parsed)``. ``parsed`` is False
    only when a non-empty value in a date / phone / currency / boolean
    column came back unchanged and is not already in canonical shape —
    that is what feeds the "junk in a typed column" count.
    """
    # Missing cells are returned as-is and never count as junk.
    is_missing = value is None or (isinstance(value, float) and pd.isna(value))
    if is_missing:
        return value, False, True

    if not isinstance(value, str):
        # Booleans have a richer accept set for non-strings, so they go to
        # the boolean standardizer directly; everything else is
        # stringified first.
        if field_type == FieldType.BOOLEAN:
            result, was_changed = standardize_boolean(
                value, style=options.boolean_style
            )
            return result, was_changed, True
        value = str(value)

    # Blank / whitespace-only strings pass through untouched.
    if value.strip() == "":
        return value, False, True

    if field_type == FieldType.DATE:
        outcome = standardize_date(
            value,
            output_format=options.date_output_format,
            date_order=options.date_order,
            error_policy=options.date_error_policy,
            month_locales=options.date_month_locales,
        )
    elif field_type == FieldType.PHONE:
        outcome = standardize_phone(
            value,
            output_format=options.phone_format,
            default_region=options.phone_region,
            error_policy=options.phone_error_policy,
        )
    elif field_type == FieldType.CURRENCY:
        outcome = standardize_currency(
            value,
            decimal=options.currency_decimal,
            decimals=options.currency_decimals,
            preserve_code=options.currency_preserve_code,
            error_policy=options.currency_error_policy,
        )
    elif field_type == FieldType.NAME:
        outcome = standardize_name(
            value,
            case=options.name_case,
            conservative=options.name_conservative,
            reverse_comma_format=options.name_reverse_comma_format,
            family_first=options.name_family_first,
        )
    elif field_type == FieldType.ADDRESS:
        outcome = standardize_address(
            value,
            extra_abbreviations=options.extra_abbreviations or None,
            expand=options.address_expand,
            state_to_code=options.address_state_to_code,
            collapse_multiline=options.address_collapse_multiline,
            trim_trailing_comma=options.address_trim_trailing_comma,
            normalize_po_box=options.address_normalize_po_box,
        )
    elif field_type == FieldType.EMAIL:
        outcome = standardize_email(
            value,
            gmail_canonical=options.email_gmail_canonical,
            error_policy=options.email_error_policy,
        )
    elif field_type == FieldType.BOOLEAN:
        outcome = standardize_boolean(value, style=options.boolean_style)
    else:
        # Unreachable for well-formed input — _resolve_column_types
        # would have rejected the bad enum at the entry point. Hitting
        # this means an internal invariant was broken, not user error.
        raise AssertionError(
            f"Unhandled FieldType in dispatcher: {field_type!r}. "
            "This indicates a code bug — a new FieldType was added to "
            "the enum without a matching branch here."
        )

    result, was_changed = outcome

    # An unchanged, non-empty cell is either already canonical or could
    # not be parsed. Names and addresses always succeed (any string is a
    # valid name), so only the types with a real parsing step are checked.
    if was_changed:
        return result, was_changed, True
    if field_type not in {
        FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN,
    }:
        return result, was_changed, True
    return result, was_changed, _is_already_canonical(value, field_type, options)


def _is_already_canonical(
    value: str,
    field_type: FieldType,
    options: StandardizeOptions,
) -> bool:
    """Report whether *value* already has the canonical output shape.

    This separates "unchanged because it was already canonical" (a
    successful pass) from "unchanged because it could not be parsed" (a
    junk cell to flag). Field types without a check here always yield True.
    """
    if field_type == FieldType.DATE:
        # Canonical iff the text parses with the configured output format.
        try:
            datetime.strptime(value.strip(), options.date_output_format)
        except ValueError:
            return False
        return True

    if field_type == FieldType.PHONE:
        trimmed = value.strip()
        if options.phone_format == "DIGITS":
            return trimmed.isdigit() and len(trimmed) >= 7
        try:
            number = phonenumbers.parse(value, options.phone_region)
        except phonenumbers.NumberParseException:
            return False
        if not phonenumbers.is_possible_number(number):
            return False
        # Canonical iff re-rendering the parsed number reproduces the text.
        rendered = phonenumbers.format_number(
            number, _PHONE_FORMAT_MAP[options.phone_format]
        )
        return rendered == trimmed

    if field_type == FieldType.CURRENCY:
        # A pure number (optional sign, at most one decimal point) counts
        # as canonical. With ``preserve_code`` on, ``ISO 1234.56`` is also
        # accepted so rows already in the preserved-code shape are not
        # flagged.
        number_pattern = r"-?\d+(?:\.\d+)?"
        trimmed = value.strip()
        if not options.currency_preserve_code:
            return re.fullmatch(number_pattern, trimmed) is not None
        coded_pattern = (
            rf"(?:{_CURRENCY_CODES})\s+{number_pattern}|{number_pattern}"
        )
        return re.fullmatch(coded_pattern, trimmed, re.IGNORECASE) is not None

    if field_type == FieldType.BOOLEAN:
        # Canonical iff it is exactly one of the two output spellings.
        truthy, falsy = _BOOL_OUTPUT[options.boolean_style]
        return value.strip() in (truthy, falsy)

    return True


# ---------------------------------------------------------------------------
# DataFrame entry point
# ---------------------------------------------------------------------------

def _resolve_column_types(
    options: StandardizeOptions,
    df_columns: Iterable[str],
) -> dict[str, FieldType]:
    """Validate column references and coerce string types to enum values.

    Args:
        options: Supplies ``column_types`` (column name -> field type).
        df_columns: Column names present in the input frame.

    Returns:
        ``column_types`` restricted to existing columns, with every value
        converted to a :class:`FieldType` member.

    Raises:
        InputValidationError: If ``column_types`` names a column that is
            not in *df_columns*.
        ValueError: If a field-type string is not a valid
            :class:`FieldType` value (raised by the enum constructor).
    """
    # Materialize once: *df_columns* is only promised to be an Iterable,
    # and it is needed both for membership tests and for the error message.
    available = list(df_columns)
    cols = set(available)
    resolved: dict[str, FieldType] = {}
    missing: list[str] = []
    for col, ft in options.column_types.items():
        if col not in cols:
            missing.append(col)
            continue
        resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft)
    if missing:
        from .errors import InputValidationError
        raise InputValidationError(
            f"Columns referenced by column_types not found in input: {missing}",
            operation="standardize_dataframe",
            suggestion=(
                f"Available columns: {available}. "
                "Check for typos and for header rows that didn't get parsed."
            ),
        )
    return resolved


def _build_cached_dispatcher(
    field_type: FieldType,
    options: StandardizeOptions,
):
    """Return a per-value standardizer wrapped in an LRU cache.

    The returned callable has the shape ``fn(value, region=None)`` and
    yields ``(new_value, changed, parsed)``. The cache key is the raw cell
    value plus the optional region argument; only the phone dispatcher
    actually uses the region (falling back to ``options.phone_region``).
    Repeated values are O(1) lookups — critical at 1 GB scale where the
    same number appears thousands of times.

    The dispatcher captures the relevant subset of ``options`` so the
    cache key stays small (we don't want to serialize the whole
    options dataclass into every cache entry).

    Caches are created with ``typed=True``: ``1``, ``1.0`` and ``True``
    compare and hash equal but stringify differently, so an untyped cache
    would hand one of them the result computed for another.

    A non-positive ``options.cache_size`` is passed to ``lru_cache`` as
    ``maxsize=None``, i.e. an unbounded cache. Name dispatchers are not
    cached at all.
    """
    from functools import lru_cache

    cache_size = options.cache_size if options.cache_size > 0 else None
    # One decorator factory; only a single branch below ever applies it.
    cached = lru_cache(maxsize=cache_size, typed=True)

    if field_type == FieldType.DATE:
        out_fmt = options.date_output_format
        date_order = options.date_order
        date_err = options.date_error_policy
        # Tuple (not list) so the captured value is immutable.
        locales = (
            tuple(options.date_month_locales) if options.date_month_locales else None
        )

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.DATE, options,
                _date_args=(out_fmt, date_order, date_err, locales),
            )
        return fn

    if field_type == FieldType.PHONE:
        out_fmt = options.phone_format
        err = options.phone_error_policy
        default_region = options.phone_region

        @cached
        def fn(value: Any, region: Optional[str] = None):
            # A blank per-row region falls back to the global default.
            r = region or default_region
            return _apply_field_type_for(
                value, FieldType.PHONE, options,
                _phone_args=(out_fmt, r, err),
            )
        return fn

    if field_type == FieldType.CURRENCY:
        decimal = options.currency_decimal
        decimals = options.currency_decimals
        preserve = options.currency_preserve_code
        err = options.currency_error_policy

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.CURRENCY, options,
                _currency_args=(decimal, decimals, preserve, err),
            )
        return fn

    if field_type == FieldType.BOOLEAN:
        style = options.boolean_style

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.BOOLEAN, options,
                _boolean_args=(style,),
            )
        return fn

    if field_type == FieldType.EMAIL:
        gmail = options.email_gmail_canonical
        err = options.email_error_policy

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.EMAIL, options,
                _email_args=(gmail, err),
            )
        return fn

    # Names are usually unique per row, so no cache wraps them; they still
    # go through ``_apply_field_type`` for parity.
    if field_type == FieldType.NAME:
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.NAME, options)
        return fn

    if field_type == FieldType.ADDRESS:
        # Addresses can be cached too — long lists of repeated office
        # addresses or warehouse locations are common in commerce data.
        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.ADDRESS, options)
        return fn

    # Fallback (shouldn't happen — every FieldType is covered above).
    return lambda value, _region=None: _apply_field_type(value, field_type, options)


def _apply_field_type_for(
    value: Any,
    field_type: FieldType,
    options: StandardizeOptions,
    *,
    _date_args=None,
    _phone_args=None,
    _currency_args=None,
    _boolean_args=None,
    _email_args=None,
) -> tuple[Any, bool, bool]:
    """Cacheable dispatcher: same shape as :func:`_apply_field_type` but
    accepts pre-extracted scalar argument tuples so the LRU cache key is
    just ``(value, region)`` instead of the full options object.

    Each ``_*_args`` tuple, when given, replaces the matching fields read
    from *options*; when omitted the values are taken from *options*.
    Field types without a tuple (name, address) are delegated to
    :func:`_apply_field_type`.

    Returns:
        ``(new_value, changed, parsed)``. For phones the already-canonical
        check uses the same region the number was parsed with, so an
        unchanged national-format number from a per-row region is not
        miscounted as unparseable.
    """
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return value, False, True
    if not isinstance(value, str):
        if field_type == FieldType.BOOLEAN:
            style = (_boolean_args or (options.boolean_style,))[0]
            new, changed = standardize_boolean(value, style=style)
            return new, changed, True
        value = str(value)

    if not value.strip():
        return value, False, True

    # Region actually used to parse a phone; stays None for other types.
    phone_region_used: Optional[str] = None

    if field_type == FieldType.DATE:
        out_fmt, date_order, err, locales = _date_args or (
            options.date_output_format, options.date_order,
            options.date_error_policy,
            tuple(options.date_month_locales) if options.date_month_locales else None,
        )
        new, changed = standardize_date(
            value,
            output_format=out_fmt,
            date_order=date_order,
            error_policy=err,
            month_locales=list(locales) if locales else None,
        )
    elif field_type == FieldType.PHONE:
        out_fmt, region, err = _phone_args or (
            options.phone_format, options.phone_region, options.phone_error_policy,
        )
        phone_region_used = region
        new, changed = standardize_phone(
            value, output_format=out_fmt, default_region=region, error_policy=err,
        )
    elif field_type == FieldType.CURRENCY:
        decimal, decimals, preserve, err = _currency_args or (
            options.currency_decimal, options.currency_decimals,
            options.currency_preserve_code, options.currency_error_policy,
        )
        new, changed = standardize_currency(
            value,
            decimal=decimal,
            decimals=decimals,
            preserve_code=preserve,
            error_policy=err,
        )
    elif field_type == FieldType.BOOLEAN:
        style = (_boolean_args or (options.boolean_style,))[0]
        new, changed = standardize_boolean(value, style=style)
    elif field_type == FieldType.EMAIL:
        gmail, err = _email_args or (
            options.email_gmail_canonical, options.email_error_policy,
        )
        new, changed = standardize_email(
            value, gmail_canonical=gmail, error_policy=err,
        )
    else:
        return _apply_field_type(value, field_type, options)

    parsed = True
    if not changed and field_type in {
        FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN,
    }:
        check_options = options
        if (
            field_type == FieldType.PHONE
            and phone_region_used != options.phone_region
        ):
            # The canonical check reads the region from the options object,
            # so hand it a copy carrying the per-row region that
            # standardize_phone just used.
            from dataclasses import replace
            check_options = replace(options, phone_region=phone_region_used)
        parsed = _is_already_canonical(value, field_type, check_options)

    return new, changed, parsed


def standardize_dataframe(
    df: pd.DataFrame,
    options: Optional[StandardizeOptions] = None,
) -> StandardizeResult:
    """Apply per-column standardizers across *df*.

    Columns absent from ``options.column_types`` pass through unchanged.
    The input DataFrame is not mutated; all rewriting happens on a copy.

    Pipeline placement (recommended, not enforced)
    ----------------------------------------------
    Run *after* the text cleaner (smart-quote / NBSP / zero-width
    pollution breaks phone, currency, and date parsers) and *before*
    the missing-value handler (numeric imputation expects canonical
    types) and the deduplicator (canonical phone E.164 / lowercase
    email enables cross-format duplicate matching). See
    ``src.core.pipeline.SOFT_DEPENDENCIES``.

    Performance characteristics
    ---------------------------
    Per-cell standardizers are wrapped in a cache (sized by
    ``options.cache_size``) so repeated values — common in real
    international data, where the same office phone or vendor address
    appears thousands of times — short-circuit. The dispatch loop is a
    plain Python loop over the column's values, materialized once per
    column with ``Series.tolist()``; it is *not* a vectorized pandas
    operation.

    For inputs larger than will fit comfortably in RAM, prefer
    :func:`standardize_file` which streams chunks from disk.

    Parameters
    ----------
    df
        Frame to standardize. Validated with ``ensure_dataframe``.
    options
        Per-column type assignments and per-type settings. Defaults to
        ``StandardizeOptions()`` when omitted.

    Returns
    -------
    StandardizeResult
        The rewritten copy, the change audit (``row`` is the 0-based
        *position* of the cell within *df*, not its index label), and
        the changed / unparseable / total cell counters. The audit
        holds at most ``options.audit_max_rows`` records (unbounded
        when that is ``None``); ``cells_changed`` is never capped.

    Raises
    ------
    InputValidationError
        If ``options.phone_country_column`` or
        ``options.address_country_column`` is set but is not a column
        of *df*.
    """
    from .errors import ensure_dataframe
    ensure_dataframe(df, function="standardize_dataframe")
    options = options or StandardizeOptions()
    out = df.copy()
    column_types = _resolve_column_types(options, out.columns)

    cells_changed = 0
    cells_unparseable = 0
    cells_total = 0
    audit_cap = options.audit_max_rows
    # ``None`` means "no cap"; infinity lets the budget check below stay
    # a single comparison.
    audit_room = float("inf") if audit_cap is None else audit_cap
    audit_records: list[dict[str, Any]] = []

    # Per-row region columns must exist in the frame when set. The phone
    # column is checked first so the reported error is the same one the
    # caller would have seen before.
    for option_name, region_col in (
        ("phone_country_column", options.phone_country_column),
        ("address_country_column", options.address_country_column),
    ):
        if region_col and region_col not in out.columns:
            from .errors import InputValidationError
            raise InputValidationError(
                f"{option_name}={region_col!r} not in input columns",
                operation="standardize_dataframe",
                suggestion=f"Available: {list(out.columns)}",
            )

    for col, field_type in column_types.items():
        # Materialize the column once; the same list feeds both the
        # dispatch pass and the audit pass.
        originals = out[col].tolist()
        cells_total += len(originals)
        dispatcher = _build_cached_dispatcher(field_type, options)

        # Per-row region lookup. Phones and addresses are the two types
        # that benefit from country context; everything else ignores the
        # second argument.
        region_col: Optional[str] = None
        if field_type == FieldType.PHONE and options.phone_country_column:
            region_col = options.phone_country_column
        elif field_type == FieldType.ADDRESS and options.address_country_column:
            region_col = options.address_country_column

        if region_col is None:
            triples = [dispatcher(v) for v in originals]
        else:
            triples = [
                dispatcher(v, _normalize_region(r))
                for v, r in zip(originals, out[region_col].tolist())
            ]

        new_values: list[Any] = []
        for position, (old, (new, changed, parsed)) in enumerate(
            zip(originals, triples)
        ):
            new_values.append(new)
            if changed:
                # Count every change, but only record it while the audit
                # budget lasts.
                cells_changed += 1
                if audit_room > 0:
                    audit_records.append({
                        "row": position,
                        "column": col,
                        "field_type": field_type.value,
                        "old": old,
                        "new": new,
                    })
                    audit_room -= 1
            if not parsed:
                cells_unparseable += 1
        out[col] = new_values

    changes_df = pd.DataFrame(
        audit_records,
        columns=["row", "column", "field_type", "old", "new"],
    )

    # Surface a warning when more than 10% of typed cells failed to
    # parse — usually means the user mis-typed a column (text marked
    # as DATE) or the data is genuinely garbage. Without this, a
    # quietly-broken pipeline shows zero changes and silently lets bad
    # data flow downstream.
    if cells_total > 0 and cells_unparseable / cells_total > 0.1:
        logger.warning(
            "standardize_dataframe: {}/{} cells ({}%) in typed columns were "
            "unparseable — check column_types for mismatches with the data.",
            cells_unparseable,
            cells_total,
            int(100 * cells_unparseable / cells_total),
        )

    # Only log the cap message when it would surprise the caller —
    # cap=0 is the streaming-path's deliberate "audit budget exhausted"
    # signal and shouldn't generate noise per chunk.
    if audit_cap and audit_cap > 0 and cells_changed > audit_cap:
        logger.info(
            "standardize_dataframe: audit capped at {} rows "
            "(cells_changed={}); raise audit_max_rows or set to None for full audit.",
            audit_cap, cells_changed,
        )

    return StandardizeResult(
        standardized_df=out,
        changes=changes_df,
        cells_changed=cells_changed,
        cells_unparseable=cells_unparseable,
        cells_total=cells_total,
        columns_processed=list(column_types.keys()),
    )


# ---------------------------------------------------------------------------
# Per-row region helpers
# ---------------------------------------------------------------------------

# Common country-name → ISO-3166 alpha-2 mappings. The phonenumbers
# library wants the alpha-2 code, but real spreadsheets carry full names
# ("United Kingdom", "Japan", "Brazil") and colloquial abbreviations
# ("UK", "UAE"). Keys MUST be lowercase: ``_normalize_region`` looks
# values up with ``str.lower()``. Add new entries lazily as users bring
# in data — the table is a soft mapping, and a miss makes
# ``_normalize_region`` return ``None`` so the caller can fall back to
# its global default region.
_COUNTRY_NAME_TO_ISO2: dict[str, str] = {
    "united states": "US", "usa": "US", "u.s.": "US", "u.s.a.": "US",
    "united kingdom": "GB", "uk": "GB", "great britain": "GB", "england": "GB",
    "canada": "CA",
    "mexico": "MX",
    "france": "FR",
    "germany": "DE", "deutschland": "DE",
    "italy": "IT", "italia": "IT",
    "spain": "ES", "españa": "ES",
    "portugal": "PT",
    "netherlands": "NL", "holland": "NL",
    "belgium": "BE",
    "switzerland": "CH", "schweiz": "CH",
    "austria": "AT", "österreich": "AT",
    "ireland": "IE",
    "sweden": "SE", "norway": "NO", "denmark": "DK", "finland": "FI",
    "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "hungary": "HU",
    "russia": "RU", "ukraine": "UA",
    "japan": "JP", "中国": "CN", "china": "CN", "south korea": "KR", "korea": "KR",
    "india": "IN", "indonesia": "ID", "thailand": "TH", "vietnam": "VN",
    "philippines": "PH", "malaysia": "MY", "singapore": "SG",
    "australia": "AU", "new zealand": "NZ",
    "brazil": "BR", "brasil": "BR",
    "argentina": "AR", "chile": "CL", "colombia": "CO", "peru": "PE",
    "south africa": "ZA",
    "uae": "AE", "united arab emirates": "AE",
    "saudi arabia": "SA",
    "egypt": "EG",
    "israel": "IL",
    "turkey": "TR", "türkiye": "TR",
}


# A handful of common ISO-3166 alpha-3 codes and their alpha-2
# equivalents. Kept at module level so it is built once rather than on
# every per-row call.
_ALPHA3_TO_ALPHA2: dict[str, str] = {
    "USA": "US", "GBR": "GB", "CAN": "CA", "MEX": "MX", "DEU": "DE",
    "FRA": "FR", "ITA": "IT", "ESP": "ES", "JPN": "JP", "CHN": "CN",
    "KOR": "KR", "BRA": "BR", "AUS": "AU", "IND": "IN", "RUS": "RU",
}


def _normalize_region(value: Any) -> Optional[str]:
    """Normalise a region cell to an ISO-3166 alpha-2 code.

    Resolution order, after stripping surrounding whitespace:

    1. The ``_COUNTRY_NAME_TO_ISO2`` alias table (case-insensitive).
       This runs first so that colloquial two-letter aliases such as
       ``UK`` map to their real ISO code (``GB``) and two-character
       non-Latin names such as ``中国`` are not mistaken for codes.
    2. Any two-letter ASCII string, upper-cased and passed through
       as-is. It is *not* validated against the ISO registry.
    3. A three-letter ASCII string found in ``_ALPHA3_TO_ALPHA2``.

    Parameters
    ----------
    value
        Raw cell value. Non-string values are converted with ``str()``.

    Returns
    -------
    Optional[str]
        The alpha-2 code, or ``None`` when *value* is ``None``, a float
        NaN, blank, or unrecognized — letting the dispatcher use the
        global default region.
    """
    if value is None:
        return None
    if isinstance(value, float) and pd.isna(value):
        return None
    text = value if isinstance(value, str) else str(value)
    text = text.strip()
    if not text:
        return None

    # Names and aliases win over the raw-code shortcuts below; otherwise
    # "UK" would be returned verbatim even though its ISO code is "GB".
    alias_hit = _COUNTRY_NAME_TO_ISO2.get(text.lower())
    if alias_hit is not None:
        return alias_hit

    code = text.upper()
    # ``str.isalpha`` is true for any Unicode letter (CJK included), so
    # also require ASCII before treating the value as an ISO code.
    if code.isascii() and code.isalpha():
        if len(code) == 2:
            return code
        if len(code) == 3:
            return _ALPHA3_TO_ALPHA2.get(code)
    return None


# ---------------------------------------------------------------------------
# Streaming entry point — for inputs that don't fit in memory
# ---------------------------------------------------------------------------

@dataclass
class StreamingStandardizeResult:
    """Summary returned by :func:`standardize_file`.

    Mirrors :class:`StandardizeResult` but without the in-memory
    DataFrame — the standardized output is written incrementally to
    ``output_path``. The ``changes`` audit is also written
    incrementally to ``audit_path`` and capped at
    ``options.audit_max_rows`` total rows across all chunks.
    """

    # Destination the standardized rows were written to.
    output_path: Path
    # Location of the change-audit CSV; ``None`` when no audit rows were
    # written during the run (the audit file is only created on the
    # first recorded change).
    audit_path: Optional[Path]
    # Total number of data rows read from the input, summed over chunks.
    rows_processed: int
    # Number of chunks that were read and standardized.
    chunks_processed: int
    # Per-chunk ``cells_changed`` counters summed over the whole file
    # (not limited by the audit cap).
    cells_changed: int
    # Per-chunk ``cells_unparseable`` counters summed over the whole file.
    cells_unparseable: int
    # Per-chunk ``cells_total`` counters summed over the whole file.
    cells_total: int
    # Typed columns that were standardized, as reported by the first
    # chunk that reported any.
    columns_processed: list[str]


def standardize_file(
    input_path: str | Path,
    output_path: str | Path,
    options: Optional[StandardizeOptions] = None,
    *,
    chunk_size: int = 50_000,
    audit_path: Optional[str | Path] = None,
    progress_callback: Optional[Any] = None,
    encoding: str = "utf-8",
    delimiter: str = ",",
) -> StreamingStandardizeResult:
    """Standardize a CSV/TSV file in chunks, writing output incrementally.

    For inputs too large to materialize in memory, this entry point
    streams ``chunk_size`` rows at a time through
    :func:`standardize_dataframe` and writes each chunk to *output_path*
    as it completes. Memory stays bounded by the chunk size regardless
    of input file size. All columns are read as strings
    (``dtype=str, keep_default_na=False``), so empty cells stay empty
    strings rather than becoming NaN.

    The audit is written to *audit_path* (default
    ``{output_path.stem}_changes.csv`` next to *output_path*).
    ``options.audit_max_rows`` is a single budget shared by the whole
    file, not a per-chunk allowance: once that many change records have
    been written, later chunks are standardized with auditing disabled.
    Pass ``audit_max_rows=None`` for a full audit (bounded only by
    disk). The audit file is only created when the first change is
    recorded, and its ``row`` column holds 0-based positions relative
    to the whole input file, not to the chunk.

    Performance for a 1 GB CSV with ~10 M rows on a typical workstation
    (rough figures):
        - chunk_size=50_000 → ~50 MB peak DataFrame footprint
        - phone-only standardization: ~3-6 minutes (cache-warm)
        - mixed phone + currency + address: ~8-15 minutes
        - first chunk is the cold-cache slowest; later chunks ride the LRU.

    Parameters
    ----------
    input_path
        CSV or TSV path. Excel inputs aren't streamed — load with
        :func:`read_file` and use :func:`standardize_dataframe`.
    output_path
        Where to write the standardized CSV. Existing files are
        overwritten. Must not be the input file itself.
    options
        Standardization settings; defaults to ``StandardizeOptions()``.
    chunk_size
        Rows per chunk. Default 50,000 ≈ 50 MB resident for typical
        widths. Higher → less I/O overhead, more peak memory.
    audit_path
        Where to write the change audit. Must differ from both the
        input and the output file.
    progress_callback
        Optional ``callable(rows_processed, chunks_processed)``
        called once per chunk. Exceptions it raises are logged at
        debug level and otherwise ignored.
    encoding
        Text encoding used to read the input and to write both outputs.
    delimiter
        Field separator used for the input and the standardized output.
        The audit file is always written with pandas' default comma.

    Returns
    -------
    StreamingStandardizeResult
        Paths plus row / chunk / cell counters accumulated over the run.
        ``audit_path`` is ``None`` when no audit rows were written.

    Raises
    ------
    FileAccessError
        If *input_path* does not exist.
    InputValidationError
        If the output or audit path resolves to the input file, or the
        audit path resolves to the output file. Streaming in place
        would truncate the file while it is still being read.
    """
    from .errors import wrap_file_read, wrap_file_write
    options = options or StandardizeOptions()
    inp = Path(input_path)
    out = Path(output_path)
    if not inp.exists():
        from .errors import FileAccessError
        raise FileAccessError(
            f"Input file not found: {inp}",
            path=inp, operation="standardize_file",
        )

    audit_p = Path(audit_path) if audit_path else out.with_name(
        f"{out.stem}_changes.csv"
    )

    # The first chunk is written with mode="w" while the reader still
    # holds the input open, so any aliasing between the three paths
    # destroys data. This includes the default audit name colliding with
    # the input (input ``x_changes.csv``, output ``x.csv``).
    inp_real = inp.resolve()
    out_real = out.resolve()
    audit_real = audit_p.resolve()
    if out_real == inp_real or audit_real in (inp_real, out_real):
        from .errors import InputValidationError
        raise InputValidationError(
            "standardize_file: input, output and audit paths must be "
            f"distinct files (input={inp}, output={out}, audit={audit_p})",
            operation="standardize_file",
            suggestion="Write the output and the audit to paths other than the input.",
        )

    rows_processed = 0
    chunks_processed = 0
    cells_changed = 0
    cells_unparseable = 0
    cells_total = 0
    columns_processed: list[str] = []
    # Remaining audit budget for the whole file; infinity means uncapped.
    audit_room = (
        options.audit_max_rows if options.audit_max_rows is not None
        else float("inf")
    )

    out.parent.mkdir(parents=True, exist_ok=True)
    audit_p.parent.mkdir(parents=True, exist_ok=True)

    # Track whether each target has received its header-bearing first
    # write; later writes append without a header.
    out_writer_open = False
    audit_writer_open = False

    try:
        reader = pd.read_csv(
            inp, chunksize=chunk_size, encoding=encoding,
            sep=delimiter, dtype=str, keep_default_na=False,
        )
    except OSError as e:  # FileNotFoundError is a subclass of OSError
        raise wrap_file_read(inp, "standardize_file", e) from e

    try:
        for chunk in reader:
            # Audit rows come back as positions within the chunk;
            # remember how many rows preceded it so they can be shifted
            # to positions within the full input file.
            chunk_offset = rows_processed
            chunk_options = options
            # Local audit cap per chunk: never exceed the global budget.
            if options.audit_max_rows is not None and audit_room <= 0:
                # Disable audit for this chunk by setting cap=0; the
                # standardizer skips appending records once room == 0.
                chunk_options = _replace_options(options, audit_max_rows=0)

            result = standardize_dataframe(chunk, chunk_options)
            cells_changed += result.cells_changed
            cells_unparseable += result.cells_unparseable
            cells_total += result.cells_total
            if not columns_processed:
                columns_processed = list(result.columns_processed)

            # Write the standardized chunk
            try:
                if not out_writer_open:
                    result.standardized_df.to_csv(
                        out, mode="w", index=False, encoding=encoding,
                        sep=delimiter,
                    )
                    out_writer_open = True
                else:
                    result.standardized_df.to_csv(
                        out, mode="a", index=False, header=False,
                        encoding=encoding, sep=delimiter,
                    )
            except OSError as e:
                raise wrap_file_write(out, "standardize_file", e) from e

            # Write the audit (re-numbering rows to absolute file positions).
            if not result.changes.empty and audit_room > 0:
                # ``audit_room`` is float('inf') when the user wants an
                # unbounded audit; ``iloc[:inf]`` is invalid, so take the
                # whole frame in that case.
                if audit_room == float("inf"):
                    cap_changes = result.changes.copy()
                else:
                    cap_changes = result.changes.iloc[: int(audit_room)].copy()
                cap_changes["row"] = cap_changes["row"] + chunk_offset
                try:
                    if not audit_writer_open:
                        cap_changes.to_csv(
                            audit_p, mode="w", index=False, encoding=encoding,
                        )
                        audit_writer_open = True
                    else:
                        cap_changes.to_csv(
                            audit_p, mode="a", index=False, header=False,
                            encoding=encoding,
                        )
                except OSError as e:
                    raise wrap_file_write(audit_p, "standardize_file", e) from e
                audit_room -= len(cap_changes)

            rows_processed += len(chunk)
            chunks_processed += 1
            if progress_callback:
                try:
                    progress_callback(rows_processed, chunks_processed)
                except Exception:
                    # Progress callbacks are advisory — don't kill the run.
                    logger.opt(exception=True).debug(
                        "progress_callback raised; ignoring"
                    )
    finally:
        # Ensure the iterator is closed (closes the underlying file).
        if hasattr(reader, "close"):
            reader.close()

    return StreamingStandardizeResult(
        output_path=out,
        audit_path=audit_p if audit_writer_open else None,
        rows_processed=rows_processed,
        chunks_processed=chunks_processed,
        cells_changed=cells_changed,
        cells_unparseable=cells_unparseable,
        cells_total=cells_total,
        columns_processed=columns_processed,
    )


def _replace_options(options: StandardizeOptions, **kwargs: Any) -> StandardizeOptions:
    """Return a shallow copy of *options* with selected fields overridden.

    Each keyword argument names a field to override in the copy; fields
    not named keep the value they have on *options*. The streaming path
    relies on this to shrink the audit budget for a chunk while leaving
    the caller's options object untouched.
    """
    import dataclasses

    clone = dataclasses.replace(options, **kwargs)
    return clone