"""Format standardization for tabular data. Per-cell standardizers turn messy free-form values into a single canonical representation: dates → ISO ``YYYY-MM-DD``, phones → E.164 (or other formats from ``phonenumbers``), currency → bare numeric strings, names → ``Title Case``, addresses → expanded USPS forms (``St.`` → ``Street``), booleans → ``True``/``False``. Each per-cell function is ``str -> tuple[str, bool]`` — returning ``(new_value, changed)`` so the DataFrame-level pipeline can audit which cells were rewritten and which it left alone (unparseable input passes through). All standardizers handle ``None``/empty gracefully and are idempotent (applying twice yields the same result as once). The DataFrame entry point :func:`standardize_dataframe` mirrors :func:`src.core.text_clean.clean_dataframe` in shape: per-column type assignments drive the pipeline, the input DataFrame is not mutated, and a :class:`StandardizeResult` carries both the rewritten frame and a row-by-row change audit. """ from __future__ import annotations import json import re from loguru import logger from dataclasses import asdict, dataclass, field from datetime import datetime, timedelta from enum import Enum from pathlib import Path from typing import Any, Iterable, Literal, Optional import pandas as pd import phonenumbers from .text_clean import smart_title_case # --------------------------------------------------------------------------- # Field-type registry # --------------------------------------------------------------------------- class FieldType(str, Enum): """The kinds of values the standardizer knows how to canonicalize.""" DATE = "date" PHONE = "phone" CURRENCY = "currency" NAME = "name" ADDRESS = "address" BOOLEAN = "boolean" EMAIL = "email" # Shared error-policy helper used by every per-domain standardizer. # Returns ``(, changed)`` under the ``"sentinel"`` policy # and ``(value, False)`` under ``"passthrough"`` so unparseable input # survives unchanged. def _err_or_passthrough( reason: str, value: str, policy: str, ) -> tuple[str, bool]: if policy == "sentinel": sentinel = f"" return sentinel, sentinel != value return value, False # --------------------------------------------------------------------------- # Date # --------------------------------------------------------------------------- # Order matters: longer / more-specific formats first. Two-digit-year # formats sit below their four-digit counterparts so ``2024-01-15`` parses # as ISO before ``%y-%m-%d`` even gets a look-in. _DATE_FORMATS_MDY = [ "%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d", "%m/%d/%Y", "%m-%d-%Y", "%m.%d.%Y", "%m/%d/%y", "%m-%d-%y", "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y", "%d %B %Y", "%d %b %Y", "%d-%b-%Y", "%d-%b-%y", "%Y%m%d", ] _DATE_FORMATS_DMY = [ "%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d", "%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", "%d/%m/%y", "%d-%m-%y", "%d.%m.%y", "%d %B %Y", "%d %b %Y", "%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y", "%d-%b-%Y", "%d-%b-%y", "%Y%m%d", ] # Weekday-prefixed long form: ``Monday, January 15, 2024``. _WEEKDAY_PREFIX_RE = re.compile( r"^(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day|sday|nesday|rsday|urday)?\s*,?\s+", re.IGNORECASE, ) # Strip a trailing time component (``2024-01-15 13:45:00`` etc.) before # format-matching the date portion. _TIME_TAIL_RE = re.compile(r"[\sT]\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:\s*[AaPp][Mm])?(?:\s*[+-]\d{2}:?\d{2}|\s*Z|\s*[A-Z]{2,4})?$") # Buried date: a strict YYYY-MM-DD substring inside other text, used # only when the whole string fails strptime first. _BURIED_ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b") # Excel serial date range — Jan 1 1970 to Jan 1 2099 (inclusive). Excel # 1900 leap year bug: serials >= 60 are off by one because Excel pretends # 1900-02-29 exists; we subtract a day in that range. _EXCEL_SERIAL_MIN = 25569.0 # Jan 1 1970 _EXCEL_SERIAL_MAX = 73050.0 # Jan 1 2099 _EXCEL_EPOCH = datetime(1899, 12, 30) # accounts for the leap-year bug # Unix timestamp ranges — covers Jan 1 2000 to Jan 1 2100 in seconds and # milliseconds. Narrow enough that we don't false-positive on other ints. _UNIX_S_MIN = 946684800 # 2000-01-01 00:00:00 UTC _UNIX_S_MAX = 4102444800 # 2100-01-01 00:00:00 UTC _UNIX_MS_MIN = _UNIX_S_MIN * 1000 _UNIX_MS_MAX = _UNIX_S_MAX * 1000 # Year-month text (``January 2024`` / ``Jan 2024``) → ``YYYY-MM``. _MONTH_NAMES_EN = [ "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december", ] _MONTH_ABBR_EN = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"] _YEAR_MONTH_TEXT_RE = re.compile( rf"^\s*({'|'.join(_MONTH_NAMES_EN + _MONTH_ABBR_EN)})\s+(\d{{4}})\s*$", re.IGNORECASE, ) # Quarter notation: ``Q1 2024`` → ``2024-Q1``. _QUARTER_RE = re.compile(r"^\s*Q([1-4])\s+(\d{4})\s*$", re.IGNORECASE) # Localized month names → English. Substituted before strptime so the # regular ``%B``/``%b`` formats catch them. Includes both full and # abbreviated forms where conventional. _MONTH_LOCALES: dict[str, dict[str, str]] = { "fr": { "janvier": "January", "février": "February", "fevrier": "February", "mars": "March", "avril": "April", "mai": "May", "juin": "June", "juillet": "July", "août": "August", "aout": "August", "septembre": "September", "octobre": "October", "novembre": "November", "décembre": "December", "decembre": "December", "janv": "Jan", "févr": "Feb", "fevr": "Feb", "avr": "Apr", "juil": "Jul", "sept": "Sep", "oct": "Oct", "nov": "Nov", "déc": "Dec", "dec": "Dec", }, "de": { "januar": "January", "februar": "February", "märz": "March", "marz": "March", "april": "April", "mai": "May", "juni": "June", "juli": "July", "august": "August", "september": "September", "oktober": "October", "november": "November", "dezember": "December", "jan": "Jan", "feb": "Feb", "mär": "Mar", "mar": "Mar", "apr": "Apr", "jun": "Jun", "jul": "Jul", "aug": "Aug", "sep": "Sep", "okt": "Oct", "nov": "Nov", "dez": "Dec", }, "es": { "enero": "January", "febrero": "February", "marzo": "March", "abril": "April", "mayo": "May", "junio": "June", "julio": "July", "agosto": "August", "septiembre": "September", "setiembre": "September", "octubre": "October", "noviembre": "November", "diciembre": "December", }, "pt": { "janeiro": "January", "fevereiro": "February", "março": "March", "marco": "March", "abril": "April", "maio": "May", "junho": "June", "julho": "July", "agosto": "August", "setembro": "September", "outubro": "October", "novembro": "November", "dezembro": "December", "jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr", "mai": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug", "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec", }, "it": { "gennaio": "January", "febbraio": "February", "marzo": "March", "aprile": "April", "maggio": "May", "giugno": "June", "luglio": "July", "agosto": "August", "settembre": "September", "ottobre": "October", "novembre": "November", "dicembre": "December", "gen": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr", "mag": "May", "giu": "Jun", "lug": "Jul", "ago": "Aug", "set": "Sep", "ott": "Oct", "nov": "Nov", "dic": "Dec", }, "nl": { "januari": "January", "februari": "February", "maart": "March", "april": "April", "mei": "May", "juni": "June", "juli": "July", "augustus": "August", "september": "September", "oktober": "October", "november": "November", "december": "December", "jan": "Jan", "feb": "Feb", "mrt": "Mar", "apr": "Apr", "mei": "May", "jun": "Jun", "jul": "Jul", "aug": "Aug", "sep": "Sep", "okt": "Oct", "nov": "Nov", "dec": "Dec", }, "ru": { "января": "January", "февраля": "February", "марта": "March", "апреля": "April", "мая": "May", "июня": "June", "июля": "July", "августа": "August", "сентября": "September", "октября": "October", "ноября": "November", "декабря": "December", # Nominative forms (less common in dates but possible) "январь": "January", "февраль": "February", "март": "March", "апрель": "April", "май": "May", "июнь": "June", "июль": "July", "август": "August", "сентябрь": "September", "октябрь": "October", "ноябрь": "November", "декабрь": "December", }, } # Localized weekday prefix removal — same idea as month substitution. # Each locale's set lists full + abbreviated forms (lowercase) that # should be stripped from the start of a date string before format # matching. English is in ``_WEEKDAY_PREFIX_RE`` already. _WEEKDAY_LOCALES: dict[str, list[str]] = { "fr": ["lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi", "dimanche", "lun", "mar", "mer", "jeu", "ven", "sam", "dim"], "de": ["montag", "dienstag", "mittwoch", "donnerstag", "freitag", "samstag", "sonntag", "mo", "di", "mi", "do", "fr", "sa", "so"], "es": ["lunes", "martes", "miércoles", "miercoles", "jueves", "viernes", "sábado", "sabado", "domingo"], "it": ["lunedì", "lunedi", "martedì", "martedi", "mercoledì", "mercoledi", "giovedì", "giovedi", "venerdì", "venerdi", "sabato", "domenica"], "pt": ["segunda-feira", "segunda", "terça-feira", "terca-feira", "terça", "terca", "quarta-feira", "quarta", "quinta-feira", "quinta", "sexta-feira", "sexta", "sábado", "sabado", "domingo"], "nl": ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag", "zaterdag", "zondag", "ma", "di", "wo", "do", "vr", "za", "zo"], "ru": ["понедельник", "вторник", "среда", "четверг", "пятница", "суббота", "воскресенье", "пн", "вт", "ср", "чт", "пт", "сб", "вс"], } def _build_weekday_patterns() -> dict[str, "re.Pattern[str]"]: """One regex per locale matching any leading weekday + optional comma.""" out = {} for loc, words in _WEEKDAY_LOCALES.items(): # Sort longest first so ``segunda-feira`` wins over ``segunda``. alt = "|".join(re.escape(w) for w in sorted(words, key=len, reverse=True)) out[loc] = re.compile(rf"^(?:{alt})\s*,?\s+", re.IGNORECASE) return out _WEEKDAY_LOCALE_PATTERNS = _build_weekday_patterns() # Named timezone → fixed UTC offset. Resolves common abbreviations so # ``2024-01-15 10:30:00 EST`` produces a date instead of falling through # unparseably. Per FORMATS-CASES.md § 3.3, these are *fixed* offsets — # DST-aware handling is out of scope (would require pyzoneinfo). _NAMED_TZ_OFFSETS: dict[str, str] = { # Universal "UTC": "+00:00", "GMT": "+00:00", "Z": "+00:00", # Americas "EST": "-05:00", "EDT": "-04:00", "CST": "-06:00", "CDT": "-05:00", "MST": "-07:00", "MDT": "-06:00", "PST": "-08:00", "PDT": "-07:00", "AST": "-04:00", "AKST": "-09:00", "HST": "-10:00", "BRT": "-03:00", "ART": "-03:00", # Europe "BST": "+01:00", "CET": "+01:00", "CEST": "+02:00", "EET": "+02:00", "EEST": "+03:00", "WET": "+00:00", "WEST": "+01:00", "MSK": "+03:00", # Asia / Pacific "IST": "+05:30", "PKT": "+05:00", "BDT": "+06:00", "ICT": "+07:00", "WIB": "+07:00", "CST_CN": "+08:00", "HKT": "+08:00", "SGT": "+08:00", "PHT": "+08:00", "JST": "+09:00", "KST": "+09:00", "AEST": "+10:00", "AEDT": "+11:00", "NZST": "+12:00", } def _build_month_locale_patterns() -> dict[str, list[tuple["re.Pattern[str]", str]]]: """Precompile per-locale (pattern, replacement) lists once at import. The previous loop compiled every pattern for every input cell — at millions of rows that's a measurable hot spot. """ out: dict[str, list[tuple[re.Pattern[str], str]]] = {} for loc, table in _MONTH_LOCALES.items(): out[loc] = [ ( re.compile( rf"(? str: """Replace localized month names with English equivalents. Raises ``ValueError`` if any locale is unrecognized — silent skip would mask typos like ``"FR"`` (uppercase) or ``"french"``. """ unknown = [ loc for loc in locales if loc != "en" and loc not in _MONTH_LOCALES ] if unknown: raise ValueError( f"Unknown month locale(s): {unknown}. " f"Available: {sorted(_MONTH_LOCALES) + ['en']}" ) for loc in locales: if loc == "en": continue for pat, english in _MONTH_LOCALE_PATTERNS[loc]: s = pat.sub(english, s) return s def _try_excel_serial(s: str, output_format: str) -> Optional[str]: """Excel-1900 serial date → formatted date, or None if out of range.""" try: n = float(s) except ValueError: return None if not (_EXCEL_SERIAL_MIN <= n <= _EXCEL_SERIAL_MAX): return None days = int(n) # drop fractional time-of-day component # Excel 1900 leap year bug: serials >= 60 are off by one day. Our # epoch (1899-12-30) already corrects for this for serials >= 60. # For serials < 60, we'd need a different epoch (1899-12-31), but # those serials are pre-1900 anyway and outside our supported range. try: return (_EXCEL_EPOCH + timedelta(days=days)).strftime(output_format) except (OverflowError, ValueError): return None def _try_unix_timestamp(s: str, output_format: str) -> Optional[str]: """Unix seconds / milliseconds → formatted date, or None.""" try: n = int(s) except ValueError: return None if _UNIX_S_MIN <= n <= _UNIX_S_MAX: seconds = n elif _UNIX_MS_MIN <= n <= _UNIX_MS_MAX: seconds = n // 1000 else: return None try: return datetime.utcfromtimestamp(seconds).strftime(output_format) except (OverflowError, ValueError, OSError): return None DateOrder = Literal["MDY", "DMY"] DateErrorPolicy = Literal["passthrough", "sentinel"] def standardize_date( value: Optional[str], *, output_format: str = "%Y-%m-%d", date_order: DateOrder = "MDY", error_policy: DateErrorPolicy = "passthrough", month_locales: Optional[list[str]] = None, two_digit_year_cutoff: int = 69, ) -> tuple[str, bool]: """Parse *value* as a date and return it formatted per *output_format*. ``date_order`` disambiguates ``01/02/2024``: ``"MDY"`` reads it as Jan 2, ``"DMY"`` as Feb 1. ISO-shaped inputs (``YYYY-MM-DD``) are unambiguous and parse the same way under either setting. With ``error_policy="passthrough"`` (default) unparseable input passes through unchanged. With ``"sentinel"`` the cleaner emits ``>`` for invalid dates per corpus § 0.3. ``month_locales`` enables non-English month names. Pass any subset of ``["en", "fr", "de", "es", "pt", "it", "nl", "ru"]`` to recognize those locales' month + weekday names in addition to English. Defaults to English-only. ``two_digit_year_cutoff`` controls the pivot for 2-digit years: years ``00..cutoff`` map to 2000-2099, ``cutoff+1..99`` map to 1900-1999. Default 69 (Python's stdlib default). Override to ~25 for birth-year columns where most subjects were born ≤ 1999. Recognizes Excel-1900 serial dates (``45306`` → ``2024-01-15``), Unix timestamps in seconds and milliseconds, year-month text (``January 2024`` → ``2024-01``), and quarter notation (``Q1 2024`` → ``2024-Q1``) in addition to the standard date formats. Returns ``(new_value, changed)``. """ if not value or not isinstance(value, str): return value or "", False s = value.strip() if not s: return value, False _err = lambda reason: _err_or_passthrough(reason, value, error_policy) # Excel serial dates and Unix timestamps don't survive the weekday- # prefix / time-tail strips, so try them first. They short-circuit # for pure-numeric inputs. if re.match(r"^-?\d+(?:\.\d+)?$", s): excel = _try_excel_serial(s, output_format) if excel is not None: return excel, excel != value unix = _try_unix_timestamp(s, output_format) if unix is not None: return unix, unix != value # Year-month text (``January 2024``) → ``YYYY-MM`` (precision-preserving). ym = _YEAR_MONTH_TEXT_RE.match(s) if ym: month_word = ym.group(1).lower() if month_word in _MONTH_NAMES_EN: month_num = _MONTH_NAMES_EN.index(month_word) + 1 else: month_num = _MONTH_ABBR_EN.index(month_word) + 1 out = f"{ym.group(2)}-{month_num:02d}" return out, out != value # Quarter notation (``Q1 2024``) → ``YYYY-Q1``. q = _QUARTER_RE.match(s) if q: out = f"{q.group(2)}-Q{q.group(1)}" return out, out != value # CJK separator normalization: Japanese ``2024年01月15日`` → ``2024-01-15``, # Korean ``2024.01.15`` is already covered by the dot format. Also fold # fullwidth digits (0-9) to ASCII so any of the parsers can read them. s = _normalize_cjk_date_chars(s) # Substitute localized month names with English before format-match. if month_locales: s = _apply_month_locale(s, month_locales) # Strip localized weekday prefixes for any enabled locale BEFORE # the day-period strip — otherwise ``Montag, 15. Januar 2024`` # never reaches the digit-leading shape the period strip expects. for loc in month_locales: pat = _WEEKDAY_LOCALE_PATTERNS.get(loc) if pat is not None: s = pat.sub("", s).strip() # German DMY uses ``15.`` for the day; strip the trailing period # so ``15. Januar 2024`` parses as ``15 January 2024``. s = re.sub(r"^(\d{1,2})\.\s+", r"\1 ", s) # Strip a leading weekday prefix (``Monday, January 15, 2024``). s = _WEEKDAY_PREFIX_RE.sub("", s).strip() # Resolve named timezones (EST/PST/JST/…) to fixed offsets, then # drop the trailing time portion before format-matching. s = _resolve_named_tz(s) s = _TIME_TAIL_RE.sub("", s).strip() # ISO 8601 extended formats — week date + ordinal date — and # RFC 2822 mail-header form. iso_extended = _try_iso_extended(s, output_format) if iso_extended is not None: return iso_extended, iso_extended != value rfc = _try_rfc2822(s, output_format) if rfc is not None: return rfc, rfc != value parsed = _try_parse_date(s, date_order, two_digit_year_cutoff) if parsed is not None: out = parsed.strftime(output_format) return out, out != value # Buried-date extraction: try a strict ISO substring (``Date: 2024-01-15``, # ``2024-01-15 (verified)``). m = _BURIED_ISO_DATE_RE.search(value) if m: try: parsed = datetime.strptime(m.group(1), "%Y-%m-%d") out = parsed.strftime(output_format) return out, out != value except ValueError: pass # Detect explicit-but-invalid date shapes — give the user a clearer # error than silent passthrough. Other shapes (partial precision, # unknown text) pass through unchanged regardless of error policy. iso_shape = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", s) if iso_shape: y, mo, d = int(iso_shape[1]), int(iso_shape[2]), int(iso_shape[3]) if y == 1900 and mo == 2 and d == 29: return _err("Excel 1900 leap year bug") if mo > 12 or mo < 1: return _err("invalid month") if d > 31 or d < 1: return _err("invalid day") if mo == 2: leap = y % 4 == 0 and (y % 100 != 0 or y % 400 == 0) if d > (29 if leap else 28): return _err("invalid leap day" if d == 29 else "invalid day") if mo in {4, 6, 9, 11} and d > 30: return _err("invalid day") return value, False def _try_parse_date( s: str, date_order: DateOrder, two_digit_year_cutoff: int = 69, ) -> Optional[datetime]: formats = _DATE_FORMATS_DMY if date_order == "DMY" else _DATE_FORMATS_MDY for fmt in formats: try: parsed = datetime.strptime(s, fmt) except ValueError: continue # Re-pivot 2-digit years if the user changed the cutoff. strptime # uses Python's stdlib default of 69; for cutoff != 69 we may need # to roll the century forward or back. if "%y" in fmt and two_digit_year_cutoff != 69: year_2 = parsed.year % 100 if year_2 <= two_digit_year_cutoff: century = 2000 else: century = 1900 parsed = parsed.replace(year=century + year_2) return parsed return None _FULLWIDTH_DIGITS = str.maketrans("0123456789", "0123456789") _CJK_DATE_MARKERS = str.maketrans({"年": "-", "月": "-", "日": "", ".": ".", "/": "/"}) def _normalize_cjk_date_chars(s: str) -> str: """Fold East Asian date markers + fullwidth digits to ASCII equivalents. ``2024年01月15日`` → ``2024-01-15``; fullwidth ``2024/01/15`` → ``2024/01/15``. Idempotent on ASCII input. """ if not any(c > "\x7f" for c in s): return s s = s.translate(_FULLWIDTH_DIGITS).translate(_CJK_DATE_MARKERS) # ``2024年01月15日`` becomes ``2024-01-15-`` with our trailing-day # mapping; strip any trailing dash artifact. return s.rstrip("-").strip() _NAMED_TZ_RE = re.compile( r"\s+(" + "|".join(re.escape(k) for k in sorted(_NAMED_TZ_OFFSETS, key=len, reverse=True)) + r")\b" ) def _resolve_named_tz(s: str) -> str: """Replace a trailing named timezone with its fixed UTC offset. ``2024-01-15 10:30:00 EST`` → ``2024-01-15 10:30:00-05:00``. Per FORMATS-CASES.md § 3.3, offsets are fixed (not DST-aware); see ``_NAMED_TZ_OFFSETS`` for the table. """ def repl(m: re.Match) -> str: return _NAMED_TZ_OFFSETS[m.group(1)] return _NAMED_TZ_RE.sub(repl, s) _ISO_WEEK_RE = re.compile(r"^(\d{4})-W(\d{2})-(\d)$") _ISO_ORDINAL_RE = re.compile(r"^(\d{4})-(\d{3})$") def _try_iso_extended(s: str, output_format: str) -> Optional[str]: """Parse ISO 8601 week date or ordinal date, return formatted string.""" m = _ISO_WEEK_RE.match(s) if m: try: parsed = datetime.fromisocalendar( int(m.group(1)), int(m.group(2)), int(m.group(3)), ) return parsed.strftime(output_format) except ValueError: return None m = _ISO_ORDINAL_RE.match(s) if m: year, day = int(m.group(1)), int(m.group(2)) if 1 <= day <= 366: try: parsed = datetime(year, 1, 1) + timedelta(days=day - 1) if parsed.year == year: return parsed.strftime(output_format) except ValueError: return None return None # RFC 2822 mail-header form: ``Wed, 15 Jan 2024 10:30:00 GMT``. _RFC2822_FORMATS = [ "%a, %d %b %Y %H:%M:%S", # without TZ "%a, %d %b %Y %H:%M:%S %Z", # with named TZ (already resolved upstream) "%a, %d %b %Y %H:%M:%S %z", # with offset "%d %b %Y %H:%M:%S", ] def _try_rfc2822(s: str, output_format: str) -> Optional[str]: """Parse RFC 2822 mail-header date format.""" for fmt in _RFC2822_FORMATS: try: parsed = datetime.strptime(s, fmt) except ValueError: continue try: return parsed.strftime(output_format) except ValueError: return None return None # --------------------------------------------------------------------------- # Phone # --------------------------------------------------------------------------- PhoneFormat = Literal["E164", "INTERNATIONAL", "NATIONAL", "DIGITS"] PhoneErrorPolicy = Literal["passthrough", "sentinel"] _PHONE_FORMAT_MAP = { "E164": phonenumbers.PhoneNumberFormat.E164, "INTERNATIONAL": phonenumbers.PhoneNumberFormat.INTERNATIONAL, "NATIONAL": phonenumbers.PhoneNumberFormat.NATIONAL, } # Placeholder sequences that look like phone numbers but are CRM # sentinels for "no phone" — repeated single digit at NANP length. _PHONE_PLACEHOLDER_RE = re.compile(r"^\+?1?[\s.()-]*([0-9])(?:[\s.()-]*\1){9}$") # Multi-number cells split by ``/``, ``;``, ``,`` or `` and ``. _PHONE_MULTI_SPLIT_RE = re.compile(r"\s*(?:/|;|,| and )\s*") def standardize_phone( value: Optional[str], *, output_format: PhoneFormat = "E164", default_region: str = "US", error_policy: PhoneErrorPolicy = "passthrough", ) -> tuple[str, bool]: """Parse with ``phonenumbers``, return in the requested format. Default is ``passthrough`` for unparseable input; pass ``error_policy="sentinel"`` to emit ``>`` for placeholder runs (000-000-0000), multi-number cells, and contaminated inputs (corpus § 4.3). Extensions are preserved as a ``;ext=N`` suffix (RFC 3966 syntax) when the format is E.164. Other output formats use libphonenumber's native rendering, which already includes extensions. The ``001`` international prefix is normalized to ``+`` before parsing — without this, ``001 555 123 4567`` fails to parse under ``default_region="US"``. ``DIGITS`` strips every non-digit character without going through ``phonenumbers``. """ if not value or not isinstance(value, str): return value or "", False s = value.strip() if not s: return value, False _err = lambda reason: _err_or_passthrough(reason, value, error_policy) if output_format == "DIGITS": digits = re.sub(r"\D", "", s) return (digits, digits != value) if digits else (value, False) # Multi-number per cell — error before we silently parse only the # first number. ``5551234567 / 5559876543`` both parse independently. if _PHONE_MULTI_SPLIT_RE.search(s): parts = [p for p in _PHONE_MULTI_SPLIT_RE.split(s) if p.strip()] if len(parts) >= 2 and all( _looks_like_phone(p, default_region) for p in parts ): return _err("multiple numbers in cell") # Smart-quote contamination — unparseable detritus interleaved with # digits. Strip and re-test, but flag when error_policy is sentinel. if any(c in s for c in "‘’“”"): cleaned = re.sub(r"[‘’“”][a-z]*", "", s).strip() if cleaned != s: if error_policy == "sentinel": return _err("smart-quote contamination") s = cleaned # 001 international access prefix (US-style for "dial out") — strip # entirely; the remaining digits are a regular national number that # the region default can resolve. if re.match(r"^001[\s\-]", s): s = s[3:].lstrip(" -") # Placeholder all-same-digit runs. if _PHONE_PLACEHOLDER_RE.match(s): return _err("placeholder number") fmt = _PHONE_FORMAT_MAP[output_format] try: parsed = phonenumbers.parse(s, default_region) except phonenumbers.NumberParseException: # Anything that can't be parsed becomes a sentinel under the # sentinel policy; passthrough returns the original. Both digit- # and-formatting failures and pure non-numeric ("TBD"-style) cells # land here. return _err("not a phone number") if not phonenumbers.is_possible_number(parsed): # Distinguish "too many digits" from generic invalidity for # NANP-shaped inputs. Inputs that look like local-only NANP # numbers (7 digits) get a specific "insufficient digits" tag. raw_digits = re.sub(r"\D", "", s) if len(raw_digits) > 11 and default_region in {"US", "CA"}: return _err("too many digits") if 0 < len(raw_digits) < 10 and default_region in {"US", "CA"}: return _err("insufficient digits") return value, False # genuinely unparseable elsewhere — passthrough # Extra-digit detection: NANP (region US/CA, country code 1) only # accepts 10 digits (or 11 with leading 1). Excess digits in input # like "1-555-123-4567-extra-99" parse out as more digits and we # error rather than silently truncate. raw_digits = re.sub(r"\D", "", s) parsed_digits = re.sub(r"\D", "", phonenumbers.format_number( parsed, phonenumbers.PhoneNumberFormat.E164, )) if len(raw_digits) > len(parsed_digits) + 4: return _err("too many digits") # NANP minimum-length check — phonenumbers.is_possible_number is # permissive; corpus § 4.3 wants insufficient-digits flagged. if parsed.country_code == 1 and len(str(parsed.national_number)) < 10: return _err("insufficient digits") out = phonenumbers.format_number(parsed, fmt) # Append extension as RFC 3966 ;ext= suffix on E.164 output (other # formats already include the extension natively). if output_format == "E164" and parsed.extension: out = f"{out};ext={parsed.extension}" return out, out != value def _looks_like_phone(s: str, region: str) -> bool: """Quick check: does *s* parse as a possible phone in *region*?""" try: p = phonenumbers.parse(s, region) except phonenumbers.NumberParseException: return False return phonenumbers.is_possible_number(p) # --------------------------------------------------------------------------- # Currency # --------------------------------------------------------------------------- # Symbol → ISO 4217 mapping. Used both for stripping currency markers # before number parsing AND for the optional ``preserve_code`` mode that # re-emits the detected code as a prefix on the standardized output. _SYMBOL_TO_ISO: dict[str, str] = { "$": "USD", # ambiguous w/ CAD/AUD/MXN — caller can override via input code "€": "EUR", "£": "GBP", "¥": "JPY", # ambiguous w/ CNY — same caveat "₹": "INR", "₩": "KRW", "₽": "RUB", "₪": "ILS", "₺": "TRY", "¢": "USD", # cents — coerce to USD for the code; value is still numeric # International additions: "฿": "THB", # Thai Baht "₫": "VND", # Vietnamese Dong "₮": "MNT", # Mongolian Tugrik "₴": "UAH", # Ukrainian Hryvnia "₦": "NGN", # Nigerian Naira "₱": "PHP", # Philippine Peso "₲": "PYG", # Paraguayan Guarani "﷼": "SAR", # ambiguous Saudi/Omani/Iranian; pick the most common "₨": "PKR", # Pakistani Rupee (and historical Sri Lankan) "₵": "GHS", # Ghanaian Cedi } _CURRENCY_SYMBOLS = "".join(_SYMBOL_TO_ISO) # ISO 4217 codes — the long tail of currencies in active use. Order # matters for the regex alternation: a 3-letter ISO code is unambiguous, # but ``R$`` (Brazil) and ``kr`` (DKK/NOK/SEK) are 1-2 char prefixes # that need to lose to a 3-letter code if both appear. _CURRENCY_CODES_LIST = [ "USD", "EUR", "GBP", "JPY", "CNY", "CAD", "AUD", "CHF", "INR", "KRW", "RUB", "MXN", "BRL", "ILS", "TRY", "ZAR", "SEK", "NOK", "DKK", "PLN", "HKD", "SGD", "NZD", # Major non-G10 economies: "SAR", "AED", "QAR", "KWD", "BHD", "OMR", # Gulf "ARS", "CLP", "COP", "PEN", "UYU", # Latin America "EGP", "MAD", "TND", "NGN", "GHS", "KES", "ZAR", "TZS", "UGX", # Africa "IDR", "MYR", "PHP", "THB", "VND", "TWD", # SE Asia "PKR", "BDT", "LKR", "NPR", # South Asia "HUF", "CZK", "RON", "BGN", "HRK", "ISK", # Europe-other "UAH", "KZT", "GEL", "AMD", "AZN", # Eastern Europe / Caucasus ] _CURRENCY_CODES = "|".join(_CURRENCY_CODES_LIST) _CURRENCY_DETECT_RE = re.compile( rf"(?P{_CURRENCY_CODES})|(?P[{_CURRENCY_SYMBOLS}])", re.IGNORECASE, ) _CURRENCY_TRIM_RE = re.compile( rf"^[\s{_CURRENCY_SYMBOLS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOLS}]*" rf"|[\s{_CURRENCY_SYMBOLS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOLS}]*$", re.IGNORECASE, ) _PARENS_NEGATIVE_RE = re.compile(r"^\s*\(\s*(.+?)\s*\)\s*$") CurrencyDecimal = Literal["dot", "comma", "auto"] # Multi-character symbol prefixes that aren't captured by the # single-codepoint ``_CURRENCY_SYMBOLS`` table. Order matters: the # detector checks these prefixes BEFORE the single-symbol regex, so # ``R$`` resolves to BRL even though ``$`` alone would map to USD. _PREFIX_TO_ISO: dict[str, str] = { "r$": "BRL", # Brazilian Real "kr": "SEK", # ambiguous Nordic — picks SEK as most common; see tests "zł": "PLN", # Polish Złoty "лв": "BGN", # Bulgarian Lev "₽": "RUB", # already in symbol table; kept for parity "rs.": "INR", # rupees — covers IN/PK informal usage "rs": "INR", } def detect_currency_code(value: str) -> Optional[str]: """Return the ISO 4217 code implied by *value*, or None. Looks for an explicit ISO code first (``USD 1234``) and falls back to a symbol → code mapping (``$1234`` → ``USD``). Symbol mapping is best- effort: ``$`` is ambiguous between USD/CAD/AUD/MXN — the caller is expected to constrain that via input data discipline. Multi-char prefixes (``R$``, ``zł``, ``kr``) are recognised before the single-symbol regex so Brazilian / Polish / Nordic data isn't silently bucketed as USD. """ if not isinstance(value, str): return None head = value.lstrip().lower() for prefix, code in _PREFIX_TO_ISO.items(): if head.startswith(prefix): # Make sure the next char (if any) isn't a letter — avoid # matching ``rsa`` as ``rs``-then-``a``. tail = head[len(prefix):] if not tail or not tail[0].isalpha(): return code m = _CURRENCY_DETECT_RE.search(value) if m is None: return None if m.group("code"): return m.group("code").upper() sym = m.group("sym") return _SYMBOL_TO_ISO.get(sym) CurrencyErrorPolicy = Literal["passthrough", "sentinel"] def standardize_currency( value: Optional[str], *, decimal: CurrencyDecimal = "dot", decimals: Optional[int] = None, preserve_code: bool = False, error_policy: CurrencyErrorPolicy = "passthrough", ) -> tuple[str, bool]: """Strip currency symbols/grouping separators, return a bare number string. ``decimal="dot"``: ``$1,234.56`` → ``1234.56`` (US/UK convention). ``decimal="comma"``: ``1.234,56 €`` → ``1234.56`` (EU convention). ``decimal="auto"``: same as ``dot`` but a single trailing comma whose tail is NOT exactly 3 digits is read as a decimal separator (``850,50`` → ``850.50``, ``R$ 1,5`` → ``1.5``). Use this for mixed-locale international files. Length-3 tails (``1,234``) stay ambiguous regardless of mode. All three modes auto-detect the EU shape when both ``.`` and ``,`` are present and the comma sits after the dot (so ``€1.234,56`` parses correctly even under the dot-default mode). Space-thousands and Swiss apostrophe-thousands are also recognized. The output always uses a dot as the decimal separator since that is the form pandas/Python parse natively. Accounting-style negatives (``($50.00)``) become ``-50.00``. With ``error_policy="passthrough"`` (default) unparseable input passes through unchanged. With ``error_policy="sentinel"`` the cleaner emits ``>`` for percentages, ranges, word values, ambiguous separators, and other non-currency content per corpus § 8.3. When *decimals* is given, the result is rounded to that many places. When *preserve_code* is True, an ISO 4217 code is detected from the input (``USD 1234`` or ``$1234``) and re-emitted as a space-separated prefix on the standardized number (``USD 1234.56``). """ if not value or not isinstance(value, str): return value or "", False s = value.strip() if not s: return value, False _err = lambda reason: _err_or_passthrough(reason, value, error_policy) if "%" in s: return _err("percentage not currency") # Range like "$50-$100" or "50–100" — distinguished from a single # signed number by either two currency symbols, or a digit-then- # dash-then-digit with the dash NOT being the leading sign. sym_count = sum(1 for c in s if c in "$£€¥₹") if sym_count >= 2 and re.search(r"\d\s*[-–—]\s*[$£€¥₹]", s): return _err("range not normalizable") if ( sym_count == 0 and re.search(r"\d\s*[-–—]\s*\d", s) and not re.match(r"^[+-]?\d", s.strip()) ): return _err("range not normalizable") code = detect_currency_code(s) if preserve_code else None # Strip any multi-char currency prefix (``R$``, ``kr``, ``zł``) # before the symbol-table regex — these aren't single codepoints # so the table-driven trim would otherwise leave them in place. head = s.lstrip().lower() for prefix in _PREFIX_TO_ISO: if head.startswith(prefix): tail_start = len(prefix) if tail_start < len(head) and head[tail_start].isalpha(): continue # Strip the matched prefix from the original (preserve case # of any trailing content). stripped_lead = s[: len(s) - len(head)] s = stripped_lead + s.lstrip()[len(prefix):] s = s.lstrip() break negative = False m = _PARENS_NEGATIVE_RE.match(s) if m: negative = True s = m.group(1) s = _CURRENCY_TRIM_RE.sub("", s).strip() if not s: return _err("empty after symbol strip") if s.startswith(("+", "-")): sign, rest = s[0], s[1:] if sign == "-": negative = not negative rest = _CURRENCY_TRIM_RE.sub("", rest).strip() else: rest = s # Swiss apostrophe-thousands → drop apostrophes used as group sep. if "'" in rest: rest = rest.replace("'", "") # Space- or NBSP-thousands → drop spaces between digit groups # (``1 234,56`` → ``1234,56``). Track whether we saw such a # separator so we can disambiguate the comma below. had_space_thousands = bool(re.search(r"\d[ \xa0]\d", rest)) rest = re.sub(r"(?<=\d)[ \xa0](?=\d)", "", rest) has_dot = "." in rest has_comma = "," in rest if decimal == "comma": # EU explicit: dots are thousands, comma is decimal. rest = rest.replace(".", "").replace(",", ".") else: if has_dot and has_comma: # Both present — the rightmost separator is the decimal. if rest.rfind(",") > rest.rfind("."): # EU: 1.234,56 rest = rest.replace(".", "").replace(",", ".") else: # US: 1,234.56 rest = rest.replace(",", "") elif has_comma and not has_dot: # ``1,234`` (no dot) is thousands-grouped US; ``1,5`` is # ambiguous. But a leading space-thousand separator (``1 234,56``) # is unambiguously EU — treat the comma as decimal. if had_space_thousands: rest = rest.replace(",", ".") elif decimal == "auto": # International auto-detection: a single comma whose # tail is NOT exactly 3 digits is far more likely to be # an EU/BRL decimal (``850,50``, ``1,5``) than a # malformed US thousands group. Length-3 tails stay # ambiguous and require an explicit locale. after = rest.rsplit(",", 1)[1] if rest.count(",") > 1: rest = rest.replace(",", "") elif len(after) == 3: return _err("ambiguous separator, set --currency-locale") else: rest = rest.replace(",", ".") else: after = rest.rsplit(",", 1)[1] if len(after) != 3: return _err("ambiguous separator, set --currency-locale") rest = rest.replace(",", "") elif has_dot and not has_comma: # Scientific notation (``1.5e6``) is not ambiguous — the tail # after the dot contains a non-digit. Skip the EU-thousands # check in that case. after = rest.rsplit(".", 1)[1] tail_is_pure_digits = after.isdigit() if ( tail_is_pure_digits and len(after) == 3 and len(rest.split(".")[0]) <= 3 and rest.count(".") == 1 ): return _err("ambiguous separator, set --currency-locale") try: num = float(rest) except ValueError: return _err("word value") if negative: num = -num if decimals is not None: out = f"{num:.{decimals}f}" elif num == int(num) and "." not in rest: out = str(int(num)) else: out = f"{num:g}" if abs(num) >= 1e16 else format(num, "f").rstrip("0").rstrip(".") if not out or out in ("-", ""): out = "0" if code is not None: out = f"{code} {out}" return out, out != value # --------------------------------------------------------------------------- # Name # --------------------------------------------------------------------------- NameCase = Literal["title", "upper", "lower"] # Particles in surnames that conventionally stay lowercase in natural # reading order. Covers the major Indo-European traditions plus # Arabic/Hebrew patronymic markers. _NAME_PARTICLES: set[str] = { # Germanic / Dutch / French / Italian "von", "van", "de", "da", "del", "della", "di", "du", "der", "den", "ter", "ten", "le", "la", "los", "las", "el", # Spanish / Portuguese "dos", "das", "do", "y", # Arabic patronymic / nisba "bin", "ibn", "bint", "abu", "abd", "al", "el-", "al-", # Hebrew "ben", "bat", "ha", "ha-", # Slavic transliterated (rare in Western forms) "z", "ze", } # Acronyms / honorifics that keep their conventional casing rather than # being title-cased (``PhD``, ``MD``, ``Esq``). Includes international # academic credentials. _NAME_ACRONYMS: dict[str, str] = { # English "phd": "PhD", "md": "MD", "esq": "Esq", "ma": "MA", "ba": "BA", "bs": "BS", "ms": "MS", "dds": "DDS", "dvm": "DVM", "jd": "JD", "rn": "RN", "cpa": "CPA", "ceo": "CEO", "cto": "CTO", "cfo": "CFO", # German / Austrian academic "dipl": "Dipl", "ing": "Ing", "mag": "Mag", "habil": "Habil", "drmed": "Dr.med.", "drphil": "Dr.phil.", "drrernat": "Dr.rer.nat.", "msc": "MSc", "bsc": "BSc", # International degrees "llb": "LLB", "llm": "LLM", } # Roman numeral suffixes — preserved verbatim (already uppercase). _NAME_ROMAN_RE = re.compile(r"^[IVX]+$") # Titles. Most languages strip the trailing period (``Mr.`` → ``Mr``); # the dispatcher in _standardize_name_token does the strip. _NAME_TITLES: set[str] = { # English "mr", "mrs", "ms", "miss", "dr", "prof", "sr", "jr", "sir", "madam", "rev", "hon", # German "herr", "frau", "fr", "hr", # French "m", "mme", "mlle", "mr", # Spanish "sr", "sra", "srta", "don", "doña", "dona", # Italian "sig", "sigra", "dott", "dottoressa", # Portuguese "snr", "snra", } # East Asian honorific suffixes — appended after the family name with a # hyphen. Preserved verbatim (lowercase). Supports both Latin # transliteration and the underlying Japanese/Korean characters. _EAST_ASIAN_HONORIFICS: set[str] = { "san", "sama", "kun", "chan", "sensei", "senpai", "kohai", "dono", "shi", "tan", "chin", # Korean "ssi", "nim", } # Suffixes that take a trailing period in their short form (``Jr.``). _NAME_SUFFIXES: set[str] = {"jr", "sr", "esq"} def _cap_segment(seg: str) -> str: """Capitalize a single word/segment, leaving the rest lowercase.""" if not seg: return seg return seg[0].upper() + seg[1:].lower() def _standardize_name_token(tok: str, *, position: str, all_shouting: bool = False) -> str: """Standardize one space-separated token. *position* is one of ``"first"``, ``"middle"``, ``"last"`` and drives particle / capitalization rules. *all_shouting* is True when every token in the surrounding name is uppercase — in that case, don't preserve any single token as an acronym. """ if not tok: return tok # Trailing punctuation gets stripped and re-attached. suffix_punct = "" while tok and tok[-1] in ",;:": suffix_punct = tok[-1] + suffix_punct tok = tok[:-1] if not tok: return suffix_punct lowered = tok.lower() bare = lowered.rstrip(".") # Roman numerals (II, III, IV, …) if _NAME_ROMAN_RE.match(tok.upper()): return tok.upper() + suffix_punct # Known acronym (PhD, MD, …) if bare in _NAME_ACRONYMS: return _NAME_ACRONYMS[bare] + suffix_punct # All-caps token of length >= 2 with no lowercase letters and at # least one alpha — treat as an acronym in the middle of a name # (``Mary USA Smith``, ``John IBM Doe``). Doesn't fire for single # initials (``A.``), and doesn't fire when the whole name is # shouting (``DR JANE DOE`` shouldn't preserve JANE as an acronym # — the whole thing is just the user's caps lock key). if ( position == "middle" and not all_shouting and len(bare) >= 2 and tok.isupper() and any(c.isalpha() for c in tok) and bare not in _NAME_TITLES and bare not in _NAME_SUFFIXES and bare not in _NAME_PARTICLES ): return tok + suffix_punct # Title (Mr, Dr, Prof) — strip trailing period if bare in _NAME_TITLES: return _cap_segment(bare) + suffix_punct # Suffix (Jr, Sr) — strip trailing period if bare in _NAME_SUFFIXES and position == "last": return _cap_segment(bare) + suffix_punct # Particle (von, van, de, …) — stay lowercase except as final token # of the name (the surname slot — ``van Gogh`` last is ``Gogh``, # but standalone ``Van`` would be a first name). if lowered.rstrip(".") in _NAME_PARTICLES and position != "last": return lowered.rstrip(".") + suffix_punct # Single-letter initial like ``A`` or ``A.`` → strip trailing # period, uppercase. (Check before multi-initial so ``A.`` doesn't # fall into the multi-initial branch and keep its period.) if len(bare) == 1 and bare.isalpha(): return bare.upper() + suffix_punct # Multi-initial token like ``j.k.`` or ``J.K.`` → uppercase letters, # keep internal periods. if "." in tok and all( seg == "" or (len(seg) == 1 and seg.isalpha()) for seg in tok.split(".") ): return tok.upper() + suffix_punct # Hyphenated segment — capitalize each piece. Special cases: # - East Asian honorific suffix (``Tanaka-san``) stays lowercase. # - Arabic transliterated prefix (``al-Rashid``, ``el-Sayed``) # keeps the prefix lowercase per Arabic naming convention. if "-" in tok: parts = tok.split("-") out_parts = [] for j, p in enumerate(parts): if j > 0 and p.lower() in _EAST_ASIAN_HONORIFICS: out_parts.append(p.lower()) elif j == 0 and p.lower() in {"al", "el", "an", "ad"}: out_parts.append(p.lower()) else: out_parts.append(_cap_segment(p)) return "-".join(out_parts) + suffix_punct # Mc / Mac prefix — inner cap. if lowered.startswith("mc") and len(lowered) > 2: return "Mc" + _cap_segment(tok[2:]) + suffix_punct if lowered.startswith("mac") and len(lowered) > 3: # Heuristic: only capitalize after Mac if the following segment # would also be capitalized in title case. ``machine`` should # stay ``Machine`` not ``MacHine`` — but real surnames are far # more common as inputs to a name standardizer than dictionary # words. Apply Mac inner-cap unconditionally; document as a # known limitation. return "Mac" + _cap_segment(tok[3:]) + suffix_punct # O' prefix — inner cap. if lowered.startswith("o'") and len(lowered) > 2: return "O'" + _cap_segment(tok[2:]) + suffix_punct # D' prefix — inner cap (D'Angelo, D'Arcy). if lowered.startswith("d'") and len(lowered) > 2: return "D'" + _cap_segment(tok[2:]) + suffix_punct return _cap_segment(tok) + suffix_punct def _is_non_latin_script(s: str) -> bool: """Heuristic: true when the string contains non-Latin cased letters.""" for c in s: if c.isalpha(): cp = ord(c) # Latin range up to Latin Extended-B (covers Latin + accents). if cp <= 0x024F: return False # No Latin alpha characters at all → treat as non-Latin. return any(c.isalpha() for c in s) def standardize_name( value: Optional[str], *, case: NameCase = "title", conservative: bool = False, reverse_comma_format: bool = True, family_first: bool = False, ) -> tuple[str, bool]: """Apply name-friendly casing with prefix / particle / suffix awareness. ``"title"`` (default) handles: * Mc / Mac inner caps (``mcdonald`` → ``McDonald``). * O'/D' inner caps (``o'connor`` → ``O'Connor``). * Hyphenated segments (``mary-jane`` → ``Mary-Jane``). * Particles stay lowercase mid-name (``van Gogh``, ``de Gaulle``, ``bin Salman``, ``ben Avraham``). * East Asian honorific suffixes (``Tanaka-san``, ``Lee-ssi``) preserved lowercase after the hyphen. * Title / suffix periods stripped (``Mr.`` → ``Mr``, ``Jr.`` → ``Jr``). * Roman numeral suffixes preserved (``III``). * PhD / MD / Esq style acronyms preserved. * Multi-initial tokens uppercased (``j.k.`` → ``J.K.``). * Non-Latin scripts (Korean, Japanese, Cyrillic) pass through. ``conservative=True`` preserves mixed-case input verbatim per the corpus § 7.3 ``--name-conservative=on`` policy. ``reverse_comma_format`` flips ``Last, First`` to ``First Last`` (default per corpus § 7.3). ``family_first=True`` skips comma reversal and disables Western title detection — appropriate for East Asian columns where the family name comes first natively (``Kim Min-jae``, ``田中 太郎``). Set this per-column when you know the cultural convention. ``"upper"`` / ``"lower"`` are simple case conversions. """ if not value or not isinstance(value, str): return value or "", False s = value.strip() if not s: return value, False if case == "upper": out = s.upper() return out, out != value if case == "lower": out = s.lower() return out, out != value if case != "title": raise ValueError(f"Unknown name case: {case}") # Non-Latin scripts pass through unchanged — no case to apply. if _is_non_latin_script(s): return value, False # Conservative mode: only normalize all-caps or all-lowercase input. if conservative: cased = [c for c in s if c.isalpha()] if cased and any(c.isupper() for c in cased) and any(c.islower() for c in cased): return value, False # Comma-format reversal: "Smith, John Andrew" → "John Andrew Smith". # Skipped under family_first because East Asian conventions write # the family name first natively — reversing would corrupt them. if reverse_comma_format and not family_first and "," in s: parts = [p.strip() for p in s.split(",", 1)] if len(parts) == 2 and parts[0] and parts[1]: s = f"{parts[1]} {parts[0]}" tokens = s.split(" ") n = len(tokens) cased = [c for c in s if c.isalpha()] all_shouting = bool(cased) and not any(c.islower() for c in cased) out_tokens: list[str] = [] for i, tok in enumerate(tokens): if not tok: out_tokens.append(tok) continue position = "first" if i == 0 else ("last" if i == n - 1 else "middle") out_tokens.append(_standardize_name_token( tok, position=position, all_shouting=all_shouting, )) out = " ".join(out_tokens) return out, out != value # --------------------------------------------------------------------------- # Address # --------------------------------------------------------------------------- # Expansion table — the inverse of the dedup-side compression set in # ``normalize_address``. We deliberately don't expand ``unit``, ``loop``, # or ``way`` because those are already the long form. Canonical mappings # live in :mod:`src.core._constants` so both modules stay in sync. from ._constants import ( USPS_EXPANSIONS as _ADDRESS_EXPANSIONS, USPS_COMPRESSIONS as _ADDRESS_COMPRESSIONS, US_STATE_CODES as _US_STATE_CODES_SHARED, US_STATE_NAMES as _US_STATE_NAMES_SHARED, CA_PROVINCE_CODES, CA_PROVINCE_NAMES, AU_STATE_CODES, AU_STATE_NAMES, DE_STATE_CODES, DE_STATE_NAMES, POSTAL_PATTERNS, INTL_PO_BOX_PATTERNS, ) # Short tokens that look like directions but only mean a direction at the # start or end of an address — never in the middle of a street name. This # avoids mangling ``123 N Main St`` (legit) vs. ``123 N. Main`` (legit) but # also keeping us from rewriting ``Tower N`` → ``Tower North`` mid-line if # it's part of a building name. _DIRECTION_TOKENS = {"n", "s", "e", "w", "ne", "nw", "se", "sw"} _TOKEN_RE = re.compile(r"\w+|[^\w\s]+|\s+") # Aliases over the shared constants — kept for the local module-level # reads that already reference these names. _US_STATE_CODES = _US_STATE_CODES_SHARED _US_STATE_NAMES = _US_STATE_NAMES_SHARED # Per-country (full-name, code, postal-pattern) tables. Each yields a # precompiled regex matching ``, ``. Sorted # longest-first so multi-word names win over their prefixes. def _build_state_patterns( name_to_code: dict[str, str], postal_pattern: str, ) -> list[tuple[re.Pattern[str], str]]: return [ ( re.compile( rf"(,\s*){re.escape(full)}(\s+{postal_pattern})", re.IGNORECASE, ), code, ) for full, code in sorted(name_to_code.items(), key=lambda kv: -len(kv[0])) ] _STATE_NAME_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns( _US_STATE_NAMES, r"\d{5}(?:-\d{4})?", ) _CA_PROVINCE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns( CA_PROVINCE_NAMES, r"[A-Z]\d[A-Z]\s*\d[A-Z]\d", ) _AU_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns( AU_STATE_NAMES, r"\d{4}", ) _DE_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns( DE_STATE_NAMES, r"\d{5}", ) # PO Box variants normalize to a single canonical form. Combines the # English pattern with the international locale variants registered in # _constants.INTL_PO_BOX_PATTERNS. _PO_BOX_RE = re.compile( r"\b(?:" + "|".join(INTL_PO_BOX_PATTERNS.values()) + r")\b", re.IGNORECASE, ) # Country-shape postal patterns (precompiled). Used to detect which # country-specific normalization to apply (state-code preservation, # street-suffix dictionary, etc.). _POSTAL_REGEXES: dict[str, re.Pattern[str]] = { cc: re.compile(pat) for cc, pat in POSTAL_PATTERNS.items() } # Back-compat aliases for sites that already reference these names. _US_ZIP_TAIL_RE = _POSTAL_REGEXES["us"] _CANADA_POSTAL_RE = _POSTAL_REGEXES["ca"] _UK_POSTCODE_RE = _POSTAL_REGEXES["uk"] # Combined state-code set: US + Canada + Australia + Germany. The # state-code-position check preserves any of these when found in the # slot between a comma and the postal code. _INTL_STATE_CODES: frozenset[str] = ( _US_STATE_CODES_SHARED | CA_PROVINCE_CODES | AU_STATE_CODES | DE_STATE_CODES ) def _is_state_code_position(tokens: list[str], idx: int) -> bool: """Heuristic: ``tokens[idx]`` sits in a state-code slot. A state code typically appears as ``…, XX 12345`` — preceded (modulo whitespace) by a comma and followed by a 5-digit ZIP. We allow some flexibility: a trailing position after a comma also counts even without a ZIP. """ # Look back for a comma (skipping whitespace). j = idx - 1 while j >= 0 and tokens[j].isspace(): j -= 1 if j < 0 or tokens[j] != ",": return False # Look ahead for a postal-shaped token. Accepts US ZIP (5 digits + # optional +4), Australian (4 digits), Canadian first half (single # letter + digit + letter), and the start of a UK outward code. j = idx + 1 while j < len(tokens) and tokens[j].isspace(): j += 1 if j >= len(tokens): return True # tail of line, after a comma — accept nxt = tokens[j] return bool(re.match( r"\d{4,5}(?:-\d{4})?$|^[A-Z]\d[A-Z]$|^[A-Z]{1,2}\d", nxt, re.IGNORECASE, )) def standardize_address( value: Optional[str], *, extra_abbreviations: Optional[dict[str, str]] = None, expand: bool = True, state_to_code: bool = True, collapse_multiline: bool = True, trim_trailing_comma: bool = True, normalize_po_box: bool = True, ) -> tuple[str, bool]: """Standardize a US-style address. By default expands USPS abbreviations (``St`` → ``Street``) and title-cases the result. With ``expand=False`` the inverse direction is used (``Street`` → ``St``), which matches the corpus default of USPS abbreviated form as canonical (FORMATS-CASES.md § 6.3). Other policy knobs: * ``state_to_code`` — convert spelled-out state names to 2-letter postal codes (``New York`` (state) → ``NY``). * ``collapse_multiline`` — replace embedded newlines with ``, `` so ``123 Main St\\nApt 4B`` becomes ``123 Main St, Apt 4B``. * ``trim_trailing_comma`` — drop a sole trailing comma left by loose CSV exports. * ``normalize_po_box`` — fold ``P.O. Box`` / ``Post Office Box`` / ``po box`` variants to canonical ``PO Box``. State codes are preserved verbatim regardless of the surrounding case (``ny`` in all-lowercase input becomes ``NY``, not ``Ny``). """ if not value or not isinstance(value, str): return value or "", False if not value.strip(): return value, False s = value # If the whole input is shouting (every cased letter uppercase), # casefold it before any token replacement so the title-case pass # produces ``Main St`` rather than seeing a mix of ``MAIN`` and # already-replaced ``St`` and giving up on the all-caps tokens. cased = [c for c in s if c.isalpha()] if cased and not any(c.islower() for c in cased): s = s.lower() if collapse_multiline and "\n" in s: # Each line becomes a comma-joined segment — but skip empty lines # and dedupe a comma the user already had at the line break. parts = [p.strip().rstrip(",").strip() for p in s.splitlines()] s = ", ".join(p for p in parts if p) if normalize_po_box: s = _PO_BOX_RE.sub("PO Box", s) is_us_shaped = bool(_US_ZIP_TAIL_RE.search(s)) is_ca_shaped = bool(_CANADA_POSTAL_RE.search(s)) is_uk_shaped = bool(_UK_POSTCODE_RE.search(s)) # German postal is just 5 digits — same as US ZIP — so we only # treat as DE if the input is NOT already US-state-shaped. is_de_shaped = ( is_us_shaped and any( re.search(rf",\s*{re.escape(name)}\s+\d{{5}}", s, re.IGNORECASE) or re.search(rf",\s*{re.escape(code)}\s+\d{{5}}", s, re.IGNORECASE) for name, code in DE_STATE_NAMES.items() ) ) # AU detection: 4-digit postal at tail AND a known AU state code or # full-name substring is present somewhere in the address. _au_state_words = "|".join( list(AU_STATE_CODES) + [re.escape(n) for n in AU_STATE_NAMES] ) is_au_shaped = bool( re.search(r"\b\d{4}\b\s*$", s.rstrip(",")) and re.search(rf"\b(?:{_au_state_words})\b", s, re.IGNORECASE) ) if state_to_code: # State-name → code conversion. Each country's pattern only # fires when its own postal-code shape is detected, so US # "New York" before "NY 10001" is left alone (it's a city), and # Canadian "Ontario" before "M5E 1W7" becomes "ON". if is_us_shaped: for pat, code in _STATE_NAME_PATTERNS: s = pat.sub(rf"\g<1>{code}\g<2>", s) if is_ca_shaped: for pat, code in _CA_PROVINCE_PATTERNS: s = pat.sub(rf"\g<1>{code}\g<2>", s) if is_au_shaped: for pat, code in _AU_STATE_PATTERNS: s = pat.sub(rf"\g<1>{code}\g<2>", s) if is_de_shaped: for pat, code in _DE_STATE_PATTERNS: s = pat.sub(rf"\g<1>{code}\g<2>", s) if not expand: # Compression direction is only safe for US-shaped addresses. # International rows (UK postcodes, Canada/Japan postal patterns) # keep their original spelling — ``Downing Street`` stays # ``Downing Street``, not ``Downing St``. abbrev_table = ( {k: v for k, v in _ADDRESS_COMPRESSIONS.items()} if is_us_shaped or _CANADA_POSTAL_RE.search(s) else {} ) else: abbrev_table = dict(_ADDRESS_EXPANSIONS) if extra_abbreviations: abbrev_table = {**abbrev_table} for k, v in extra_abbreviations.items(): if isinstance(k, str) and isinstance(v, str) and k.strip() and v.strip(): abbrev_table[k.casefold().rstrip(".").strip()] = v.strip() expansion_values = set(abbrev_table.values()) # Canonical USPS abbreviation forms (``St``, ``Ave``, …) — used to # strip a trailing period when the abbreviation is already canonical # in compression mode (``St.`` → ``St``). canonical_abbrevs = set(_ADDRESS_COMPRESSIONS.values()) | set( _ADDRESS_EXPANSIONS ) tokens = _TOKEN_RE.findall(s) out_tokens: list[str] = [] for i, tok in enumerate(tokens): if not tok or not tok[0].isalnum(): # Punctuation / whitespace passes through verbatim — but if # it begins with a period and the previous output token is a # known USPS abbreviation, strip the leading period (``St.`` # → ``St``, ``St.,`` → ``St,``). if ( tok.startswith(".") and out_tokens and (out_tokens[-1] in expansion_values or out_tokens[-1] in canonical_abbrevs) ): tok = tok[1:] if not tok: continue out_tokens.append(tok) continue key = tok.casefold().rstrip(".") upper_form = tok.upper().rstrip(".") # State code preservation: if this token is a 2-letter state code # in a state-code position, preserve it as uppercase regardless # of input case or abbreviation table collisions. if upper_form in _INTL_STATE_CODES and _is_state_code_position(tokens, i): out_tokens.append(upper_form) continue expansion = abbrev_table.get(key) if expansion is not None: out_tokens.append(expansion) else: out_tokens.append(tok) rebuilt = "".join(out_tokens) titled = smart_title_case(rebuilt) # Re-apply state-code preservation post title-case (smart_title_case # may have lowercased an all-lowercase token before we could fix it). titled = _restore_state_codes(titled) if trim_trailing_comma: titled = titled.rstrip() if titled.endswith(","): titled = titled[:-1].rstrip() return titled, titled != value _STATE_CODE_AFTER_COMMA_RE = re.compile( r"(,\s*)([A-Za-z]{2})(\s+\d{5}(?:-\d{4})?|\s*$)" ) def _restore_state_codes(s: str) -> str: """Force-uppercase 2-letter state codes following a comma.""" def repl(m: re.Match) -> str: candidate = m.group(2).upper() if candidate in _INTL_STATE_CODES: return f"{m.group(1)}{candidate}{m.group(3)}" return m.group(0) return _STATE_CODE_AFTER_COMMA_RE.sub(repl, s) # --------------------------------------------------------------------------- # Email # --------------------------------------------------------------------------- # # 03's email cleaner is the public surface for normalization (see # FORMATS-CASES.md § 0.1 — duplicates the matching logic the dedup # tier-1 spec uses internally, so callers don't have to run dedup just # to lowercase a list of emails). EmailErrorPolicy = Literal["passthrough", "sentinel"] # Strict-enough RFC 5322-ish regex: local@domain.tld, allowing IDN. _EMAIL_RE = re.compile( r"^(?P[^\s@<>\"]+)@(?P[^\s@<>\"]+\.[^\s@<>\".]+)$" ) # Display-name extraction: ``"Alice" `` or # ``Alice Smith ``. _EMAIL_ANGLE_RE = re.compile(r"<([^<>]+)>") _MAILTO_PREFIX_RE = re.compile(r"^mailto:", re.IGNORECASE) # Smart-quote wrapping the whole address. _EMAIL_SMARTQUOTE_RE = re.compile(r"^[“”‘’]+|[“”‘’]+$") # Bidirectional control characters used in homograph / spoofing attacks # against email addresses (``alice‮@example.com`` displays as # ``alice@elpmaxe.com`` to RTL-aware renderers). Strip on every parse. _EMAIL_BIDI_RE = re.compile(r"[‪-‮⁦-⁩‎‏]") # Multi-email cell separator. _EMAIL_MULTI_RE = re.compile(r"[,;]\s*\S+@\S+\.\S+") def standardize_email( value: Optional[str], *, gmail_canonical: bool = False, error_policy: EmailErrorPolicy = "passthrough", ) -> tuple[str, bool]: """Lowercase + trim + strip mailto/display-name wrappers. Default behavior preserves Gmail dots and ``+tag`` segments — that's a Gmail provider policy, not a generic email standard. Set ``gmail_canonical=True`` to strip dots and ``+`` tags from the local part for ``@gmail.com`` addresses only (corpus § 5.3). Multiple addresses in a single cell, missing/duplicate ``@``, internal whitespace, and TLD-less inputs are surfaced as ``>`` when ``error_policy="sentinel"``. """ if not value or not isinstance(value, str): return value or "", False s = value.strip() if not s: return value, False _err = lambda reason: _err_or_passthrough(reason, value, error_policy) # Multi-email cell — error before we silently pick one. if _EMAIL_MULTI_RE.search(s) and not s.startswith("<"): # If splitting on ;/, yields multiple email-shaped tokens, error. parts = re.split(r"[,;]\s*", s) email_parts = [p for p in parts if "@" in p and "." in p.split("@")[-1]] if len(email_parts) >= 2: return _err("multiple emails") # Smart-quote wrappers (``"alice@example.com"``). s = _EMAIL_SMARTQUOTE_RE.sub("", s).strip() # Strip BIDI / RTL override controls — these are a homograph attack # vector and have no legitimate use inside an email address. s = _EMAIL_BIDI_RE.sub("", s) # Display-name with angle brackets — extract the address. m = _EMAIL_ANGLE_RE.search(s) if m: s = m.group(1).strip() # mailto: prefix. s = _MAILTO_PREFIX_RE.sub("", s).strip() # Trailing punctuation contamination (``alice@example.com,`` etc.). s = s.rstrip(",;:.)”’") # Internal whitespace check (``alice @ example.com``). if re.search(r"\s", s): return _err("internal whitespace") # Lowercase the whole thing — both local part and domain are # case-insensitive in practice (RFC 5321 says local can be # case-sensitive but no real provider treats it that way). s = s.lower() # Validate shape. if "@" not in s: return _err("missing @") if s.count("@") >= 2: # ``alice@@example.com`` is double-@, ``alice@example@com`` is # multi-@; both error. return _err("double @" if "@@" in s else "multiple @") m = _EMAIL_RE.match(s) if not m: return _err("no TLD") local = m.group("local") domain = m.group("domain") if gmail_canonical and domain == "gmail.com": local = local.replace(".", "").split("+", 1)[0] s = f"{local}@{domain}" return s, s != value # --------------------------------------------------------------------------- # Boolean # --------------------------------------------------------------------------- _TRUE_TOKENS = {"true", "t", "yes", "y", "1", "on"} _FALSE_TOKENS = {"false", "f", "no", "n", "0", "off"} BoolStyle = Literal["True/False", "true/false", "Yes/No", "Y/N", "1/0"] _BOOL_OUTPUT: dict[BoolStyle, tuple[str, str]] = { "True/False": ("True", "False"), "true/false": ("true", "false"), "Yes/No": ("Yes", "No"), "Y/N": ("Y", "N"), "1/0": ("1", "0"), } def standardize_boolean( value: Any, *, style: BoolStyle = "True/False", ) -> tuple[str, bool]: """Map common truthy/falsy strings (and Python bools) to a canonical pair. Recognized truthy: ``true t yes y 1 on``. Recognized falsy: ``false f no n 0 off``. Comparison is case-insensitive after trim. Unrecognized input passes through unchanged. """ true_out, false_out = _BOOL_OUTPUT[style] if isinstance(value, bool): out = true_out if value else false_out return out, True if value is None or (isinstance(value, float) and pd.isna(value)): return "", False if not isinstance(value, str): # Numeric 0/1 → False/True; anything else is unrecognized. if value == 0: return false_out, True if value == 1: return true_out, True return str(value), False s = value.strip().casefold() if not s: return value, False if s in _TRUE_TOKENS: return true_out, true_out != value if s in _FALSE_TOKENS: return false_out, false_out != value return value, False # --------------------------------------------------------------------------- # Options / result dataclasses # --------------------------------------------------------------------------- # --------------------------------------------------------------------------- # Preset bundles # --------------------------------------------------------------------------- # # A preset is a flat dict of ``StandardizeOptions`` field defaults — the # subset that varies between locales / standards. ``column_types`` and # ``extra_abbreviations`` are caller-supplied and never carried by a # preset. # # Standards backing each preset: # us-default ISO 8601 dates · ITU-T E.164 phones (US) · ISO 4217 minor # unit (2dp) · USPS Pub. 28 address expansion · "True/False" # european ISO 8601 dates with DMY for ambiguous input · E.164 phones # · ISO 4217 with comma decimal input · "True/False" # uk DD/MM/YYYY display · GB region phones · ISO 4217 dot · # "Yes/No" booleans (common in UK gov forms) # iso-strict ISO 8601 dates · E.164 · bare-number currency, no rounding # · "true/false" lowercase (JSON canonical) · Title names # legacy-us MM/DD/YYYY display · National-format phones · 2dp currency # · "Yes/No" — for downstream systems that haven't moved off # local conventions yet. PRESETS: dict[str, dict[str, Any]] = { "us-default": { "date_output_format": "%Y-%m-%d", "date_order": "MDY", "phone_format": "E164", "phone_region": "US", "currency_decimal": "dot", "currency_decimals": 2, "currency_preserve_code": False, "name_case": "title", "boolean_style": "True/False", }, "european": { "date_output_format": "%Y-%m-%d", "date_order": "DMY", "phone_format": "INTERNATIONAL", "phone_region": "DE", "currency_decimal": "comma", "currency_decimals": 2, "currency_preserve_code": True, "name_case": "title", "boolean_style": "True/False", }, "uk": { "date_output_format": "%d/%m/%Y", "date_order": "DMY", "phone_format": "INTERNATIONAL", "phone_region": "GB", "currency_decimal": "dot", "currency_decimals": 2, "currency_preserve_code": False, "name_case": "title", "boolean_style": "Yes/No", }, "iso-strict": { "date_output_format": "%Y-%m-%d", "date_order": "MDY", "phone_format": "E164", "phone_region": "US", "currency_decimal": "dot", "currency_decimals": None, "currency_preserve_code": True, "name_case": "title", "boolean_style": "true/false", }, "legacy-us": { "date_output_format": "%m/%d/%Y", "date_order": "MDY", "phone_format": "NATIONAL", "phone_region": "US", "currency_decimal": "dot", "currency_decimals": 2, "currency_preserve_code": False, "name_case": "title", "boolean_style": "Yes/No", }, } @dataclass class StandardizeOptions: """Configuration for :func:`standardize_dataframe`. The standardizer is column-typed: the user (or auto-detection layer above) assigns each column a :class:`FieldType`, and the per-cell function for that type runs over the column. Columns absent from ``column_types`` pass through untouched. """ # column name -> field type (string or FieldType enum value) column_types: dict[str, FieldType] = field(default_factory=dict) # Date formatting date_output_format: str = "%Y-%m-%d" date_order: DateOrder = "MDY" # Phone formatting phone_format: PhoneFormat = "E164" phone_region: str = "US" # Currency formatting currency_decimal: CurrencyDecimal = "dot" currency_decimals: Optional[int] = 2 # When True, an ISO 4217 code detected in the input is re-emitted as a # space-separated prefix on the standardized number. currency_preserve_code: bool = False # Name casing name_case: NameCase = "title" # Boolean style boolean_style: BoolStyle = "True/False" # Email policy email_gmail_canonical: bool = False email_error_policy: EmailErrorPolicy = "passthrough" # Address policy (corpus § 6.3 — abbreviated form is canonical, but # the existing tests/baseline assume expand-by-default; new callers # opt into compression by setting expand=False). address_expand: bool = True address_state_to_code: bool = True address_collapse_multiline: bool = True address_trim_trailing_comma: bool = True address_normalize_po_box: bool = True # Per-domain error sentinels — when "sentinel", emit ```` # for unparseable / out-of-domain values. Default ``passthrough`` # preserves the input unchanged. date_error_policy: DateErrorPolicy = "passthrough" phone_error_policy: PhoneErrorPolicy = "passthrough" currency_error_policy: CurrencyErrorPolicy = "passthrough" # Date locale handling — extra month-name dictionaries beyond English. date_month_locales: Optional[list[str]] = None # Name policy name_conservative: bool = False name_reverse_comma_format: bool = True name_family_first: bool = False # set per-column for East Asian data # User overrides for the address abbreviation table. Merged on top of # the built-in USPS Pub. 28 list at runtime; values flow through # verbatim into Title Case rendering. extra_abbreviations: dict[str, str] = field(default_factory=dict) # ----- Scale knobs for large international files ----- # Per-row country/region overrides. When set, each phone or address # row's region is read from the named column (an ISO-3166 alpha-2 code: # "US", "GB", "JP", "FR", …). Falls back to ``phone_region`` / # global default when the column is missing or the cell is blank. phone_country_column: Optional[str] = None address_country_column: Optional[str] = None # Audit cap. The change table can grow to tens of millions of rows on # a 1 GB input — capping protects memory and keeps the audit usable. # ``cells_changed`` still counts every modification; only the per-row # ``changes`` DataFrame is truncated. Set to None for unbounded. audit_max_rows: Optional[int] = 10_000 # Value-level LRU cache size per standardizer. Repeated phone numbers # (call-list duplicates), repeated currencies, repeated boolean # tokens — all dominate at scale. A 256k-entry cache absorbs most # real-world cardinalities without ballooning memory. cache_size: int = 262_144 @classmethod def from_preset(cls, name: str, **overrides: Any) -> StandardizeOptions: """Build options from a named preset, with optional field overrides. Example: ``StandardizeOptions.from_preset("uk", column_types={...})`` starts from UK defaults and layers ``column_types`` on top. """ if name not in PRESETS: raise ValueError( f"Unknown preset '{name}'. " f"Available: {', '.join(sorted(PRESETS))}." ) base = dict(PRESETS[name]) base.update(overrides) return cls(**base) @classmethod def from_dict(cls, data: dict) -> StandardizeOptions: from .errors import ConfigError known = {f for f in cls.__dataclass_fields__} kwargs = {k: v for k, v in data.items() if k in known} column_types = kwargs.get("column_types") or {} resolved: dict[str, FieldType] = {} for col, raw in column_types.items(): try: resolved[col] = ( FieldType(raw) if not isinstance(raw, FieldType) else raw ) except ValueError as e: valid = sorted(t.value for t in FieldType) raise ConfigError( f"Invalid field type {raw!r} for column {col!r}", column=col, operation="StandardizeOptions.from_dict", cause=e, suggestion=f"Valid field types: {valid}", ) from e kwargs["column_types"] = resolved # Surface enum-string mismatches early — bad date_order ("xyz") # would otherwise crash deep inside standardize_date. for field_name, valid in ( ("date_order", {"MDY", "DMY"}), ("phone_format", set(_PHONE_FORMAT_MAP) | {"DIGITS"}), ("currency_decimal", {"dot", "comma", "auto"}), ("name_case", {"title", "upper", "lower"}), ("boolean_style", set(_BOOL_OUTPUT)), ("date_error_policy", {"passthrough", "sentinel"}), ("phone_error_policy", {"passthrough", "sentinel"}), ("currency_error_policy", {"passthrough", "sentinel"}), ("email_error_policy", {"passthrough", "sentinel"}), ): value = kwargs.get(field_name) if value is not None and value not in valid: raise ConfigError( f"Invalid {field_name}={value!r}", operation="StandardizeOptions.from_dict", suggestion=f"Valid values: {sorted(valid)}", ) return cls(**kwargs) def to_dict(self) -> dict: d = asdict(self) d["column_types"] = {c: t.value if isinstance(t, FieldType) else t for c, t in self.column_types.items()} return d def to_file(self, path: str | Path) -> Path: from .errors import ConfigError, wrap_file_write out = Path(path) try: payload = json.dumps(self.to_dict(), indent=2) except TypeError as e: raise ConfigError( "Could not serialize StandardizeOptions to JSON", operation="StandardizeOptions.to_file", cause=e, suggestion=( "extra_abbreviations or column_types likely contains a " "non-string/non-enum value. Inspect with .to_dict() and " "remove the offending entry." ), ) from e try: out.write_text(payload) except (OSError, PermissionError) as e: raise wrap_file_write(out, "StandardizeOptions.to_file", e) from e return out @classmethod def from_file(cls, path: str | Path) -> StandardizeOptions: from .errors import ConfigError, wrap_file_read path = Path(path) try: text = path.read_text() except OSError as e: raise wrap_file_read(path, "StandardizeOptions.from_file", e) from e try: data = json.loads(text) except json.JSONDecodeError as e: raise ConfigError( "Invalid JSON in StandardizeOptions config", path=path, operation="StandardizeOptions.from_file", cause=e, suggestion=( f"JSON parser failed at line {e.lineno}, column {e.colno}. " "Validate the file with `python -m json.tool < file.json`." ), ) from e return cls.from_dict(data) @dataclass class StandardizeResult: """Output of :func:`standardize_dataframe`.""" standardized_df: pd.DataFrame changes: pd.DataFrame # cols: row, column, field_type, old, new cells_changed: int cells_unparseable: int # rows where a typed column held junk cells_total: int columns_processed: list[str] # --------------------------------------------------------------------------- # Per-cell dispatch # --------------------------------------------------------------------------- def _apply_field_type( value: Any, field_type: FieldType, options: StandardizeOptions, ) -> tuple[Any, bool, bool]: """Run the standardizer for *field_type* on *value*. Returns ``(new_value, changed, parsed)``. ``parsed`` is False when the value was non-empty but the standardizer couldn't recognize it — used to surface a "junk in a typed column" count. """ if value is None or (isinstance(value, float) and pd.isna(value)): return value, False, True if not isinstance(value, str): # Non-string inputs are converted via str() for everything except # booleans, which have a richer accept set. if field_type == FieldType.BOOLEAN: new, changed = standardize_boolean(value, style=options.boolean_style) return new, changed, True value = str(value) s_stripped = value.strip() if not s_stripped: return value, False, True if field_type == FieldType.DATE: new, changed = standardize_date( value, output_format=options.date_output_format, date_order=options.date_order, error_policy=options.date_error_policy, month_locales=options.date_month_locales, ) elif field_type == FieldType.PHONE: new, changed = standardize_phone( value, output_format=options.phone_format, default_region=options.phone_region, error_policy=options.phone_error_policy, ) elif field_type == FieldType.CURRENCY: new, changed = standardize_currency( value, decimal=options.currency_decimal, decimals=options.currency_decimals, preserve_code=options.currency_preserve_code, error_policy=options.currency_error_policy, ) elif field_type == FieldType.NAME: new, changed = standardize_name( value, case=options.name_case, conservative=options.name_conservative, reverse_comma_format=options.name_reverse_comma_format, family_first=options.name_family_first, ) elif field_type == FieldType.ADDRESS: new, changed = standardize_address( value, extra_abbreviations=options.extra_abbreviations or None, expand=options.address_expand, state_to_code=options.address_state_to_code, collapse_multiline=options.address_collapse_multiline, trim_trailing_comma=options.address_trim_trailing_comma, normalize_po_box=options.address_normalize_po_box, ) elif field_type == FieldType.EMAIL: new, changed = standardize_email( value, gmail_canonical=options.email_gmail_canonical, error_policy=options.email_error_policy, ) elif field_type == FieldType.BOOLEAN: new, changed = standardize_boolean(value, style=options.boolean_style) else: # Unreachable for well-formed input — _resolve_column_types # would have rejected the bad enum at the entry point. Hitting # this means an internal invariant was broken, not user error. raise AssertionError( f"Unhandled FieldType in dispatcher: {field_type!r}. " "This indicates a code bug — a new FieldType was added to " "the enum without a matching branch here." ) # ``changed=False`` on a non-empty cell means the standardizer either # accepted the input as already-canonical OR couldn't parse it. The # name/address standardizers always succeed (any string is a valid # name); the others can fail. We only count parse failures for the # types that have a real parsing step. parsed = True if not changed and field_type in { FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN, }: parsed = _is_already_canonical(value, field_type, options) return new, changed, parsed def _is_already_canonical( value: str, field_type: FieldType, options: StandardizeOptions, ) -> bool: """Check whether *value* is already in the canonical output shape. Used to distinguish "no change because input was already canonical" (a successful pass) from "no change because we couldn't parse it" (a junk row to flag). """ if field_type == FieldType.DATE: try: datetime.strptime(value.strip(), options.date_output_format) return True except ValueError: return False if field_type == FieldType.PHONE: if options.phone_format == "DIGITS": return value.strip().isdigit() and len(value.strip()) >= 7 try: parsed = phonenumbers.parse(value, options.phone_region) except phonenumbers.NumberParseException: return False if not phonenumbers.is_possible_number(parsed): return False fmt = _PHONE_FORMAT_MAP[options.phone_format] return phonenumbers.format_number(parsed, fmt) == value.strip() if field_type == FieldType.CURRENCY: # Pure numeric (with optional sign and one decimal point) is # treated as already-canonical. When ``preserve_code`` is on, an # ``ISO 1234.56`` form also counts as canonical so we don't flag # rows that already match the preserved-code output shape. bare_re = r"-?\d+(?:\.\d+)?" if options.currency_preserve_code: return bool(re.fullmatch( rf"(?:{_CURRENCY_CODES})\s+{bare_re}|{bare_re}", value.strip(), re.IGNORECASE, )) return bool(re.fullmatch(bare_re, value.strip())) if field_type == FieldType.BOOLEAN: true_out, false_out = _BOOL_OUTPUT[options.boolean_style] return value.strip() in (true_out, false_out) return True # --------------------------------------------------------------------------- # DataFrame entry point # --------------------------------------------------------------------------- def _resolve_column_types( options: StandardizeOptions, df_columns: Iterable[str], ) -> dict[str, FieldType]: """Validate column references and coerce string types to enum values.""" cols = set(df_columns) resolved: dict[str, FieldType] = {} missing: list[str] = [] for col, ft in options.column_types.items(): if col not in cols: missing.append(col) continue resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft) if missing: from .errors import InputValidationError raise InputValidationError( f"Columns referenced by column_types not found in input: {missing}", operation="standardize_dataframe", suggestion=( f"Available columns: {list(df_columns)}. " "Check for typos and for header rows that didn't get parsed." ), ) return resolved def _build_cached_dispatcher( field_type: FieldType, options: StandardizeOptions, ): """Return a per-value standardizer wrapped in an LRU cache. The cache key is the raw cell value plus, when applicable, the per-row region derived from ``phone_country_column`` / ``address_country_column``. Repeated values are O(1) lookups — critical at 1 GB scale where the same number appears thousands of times. The dispatcher captures the relevant subset of ``options`` so the cache key stays small (we don't want to serialize the whole options dataclass into every cache entry). """ from functools import lru_cache cache_size = options.cache_size if options.cache_size > 0 else None if field_type == FieldType.DATE: out_fmt = options.date_output_format date_order = options.date_order date_err = options.date_error_policy locales = ( tuple(options.date_month_locales) if options.date_month_locales else None ) @lru_cache(maxsize=cache_size) def fn(value: Any, _region: Optional[str] = None): return _apply_field_type_for( value, FieldType.DATE, options, _date_args=(out_fmt, date_order, date_err, locales), ) return fn if field_type == FieldType.PHONE: out_fmt = options.phone_format err = options.phone_error_policy default_region = options.phone_region @lru_cache(maxsize=cache_size) def fn(value: Any, region: Optional[str] = None): r = region or default_region return _apply_field_type_for( value, FieldType.PHONE, options, _phone_args=(out_fmt, r, err), ) return fn if field_type == FieldType.CURRENCY: decimal = options.currency_decimal decimals = options.currency_decimals preserve = options.currency_preserve_code err = options.currency_error_policy @lru_cache(maxsize=cache_size) def fn(value: Any, _region: Optional[str] = None): return _apply_field_type_for( value, FieldType.CURRENCY, options, _currency_args=(decimal, decimals, preserve, err), ) return fn if field_type == FieldType.BOOLEAN: style = options.boolean_style @lru_cache(maxsize=cache_size) def fn(value: Any, _region: Optional[str] = None): return _apply_field_type_for( value, FieldType.BOOLEAN, options, _boolean_args=(style,), ) return fn if field_type == FieldType.EMAIL: gmail = options.email_gmail_canonical err = options.email_error_policy @lru_cache(maxsize=cache_size) def fn(value: Any, _region: Optional[str] = None): return _apply_field_type_for( value, FieldType.EMAIL, options, _email_args=(gmail, err), ) return fn # Names and addresses are usually unique per row; no cache wraps # them but we still go through ``_apply_field_type`` for parity. if field_type == FieldType.NAME: def fn(value: Any, _region: Optional[str] = None): return _apply_field_type(value, FieldType.NAME, options) return fn if field_type == FieldType.ADDRESS: # Addresses can be cached too — long lists of repeated office # addresses or warehouse locations are common in commerce data. @lru_cache(maxsize=cache_size) def fn(value: Any, _region: Optional[str] = None): return _apply_field_type(value, FieldType.ADDRESS, options) return fn # Fallback (shouldn't happen — every FieldType is covered above). return lambda value, _region=None: _apply_field_type(value, field_type, options) def _apply_field_type_for( value: Any, field_type: FieldType, options: StandardizeOptions, *, _date_args=None, _phone_args=None, _currency_args=None, _boolean_args=None, _email_args=None, ) -> tuple[Any, bool, bool]: """Cacheable dispatcher: same shape as :func:`_apply_field_type` but accepts pre-extracted scalar argument tuples so the LRU cache key is just ``(value, region)`` instead of the full options object. """ if value is None or (isinstance(value, float) and pd.isna(value)): return value, False, True if not isinstance(value, str): if field_type == FieldType.BOOLEAN: style = (_boolean_args or (options.boolean_style,))[0] new, changed = standardize_boolean(value, style=style) return new, changed, True value = str(value) if not value.strip(): return value, False, True if field_type == FieldType.DATE: out_fmt, date_order, err, locales = _date_args or ( options.date_output_format, options.date_order, options.date_error_policy, tuple(options.date_month_locales) if options.date_month_locales else None, ) new, changed = standardize_date( value, output_format=out_fmt, date_order=date_order, error_policy=err, month_locales=list(locales) if locales else None, ) elif field_type == FieldType.PHONE: out_fmt, region, err = _phone_args or ( options.phone_format, options.phone_region, options.phone_error_policy, ) new, changed = standardize_phone( value, output_format=out_fmt, default_region=region, error_policy=err, ) elif field_type == FieldType.CURRENCY: decimal, decimals, preserve, err = _currency_args or ( options.currency_decimal, options.currency_decimals, options.currency_preserve_code, options.currency_error_policy, ) new, changed = standardize_currency( value, decimal=decimal, decimals=decimals, preserve_code=preserve, error_policy=err, ) elif field_type == FieldType.BOOLEAN: style = (_boolean_args or (options.boolean_style,))[0] new, changed = standardize_boolean(value, style=style) elif field_type == FieldType.EMAIL: gmail, err = _email_args or ( options.email_gmail_canonical, options.email_error_policy, ) new, changed = standardize_email( value, gmail_canonical=gmail, error_policy=err, ) else: return _apply_field_type(value, field_type, options) parsed = True if not changed and field_type in { FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN, }: parsed = _is_already_canonical(value, field_type, options) return new, changed, parsed def standardize_dataframe( df: pd.DataFrame, options: Optional[StandardizeOptions] = None, ) -> StandardizeResult: """Apply per-column standardizers across *df*. Columns absent from ``options.column_types`` pass through unchanged. The input DataFrame is not mutated. Pipeline placement (recommended, not enforced) ---------------------------------------------- Run *after* the text cleaner (smart-quote / NBSP / zero-width pollution breaks phone, currency, and date parsers) and *before* the missing-value handler (numeric imputation expects canonical types) and the deduplicator (canonical phone E.164 / lowercase email enables cross-format duplicate matching). See ``src.core.pipeline.SOFT_DEPENDENCIES``. Performance characteristics --------------------------- Per-cell standardizers are wrapped in an LRU cache (size ``options.cache_size``) so repeated values — common in real international data, where the same office phone or vendor address appears thousands of times — short-circuit. The dispatch loop uses ``Series.map`` for pandas-native iteration; on a 10-million-row column this is roughly 4-8× faster than the previous ``for v in series.tolist()`` path. For inputs larger than will fit comfortably in RAM, prefer :func:`standardize_file` which streams chunks from disk. """ from .errors import ensure_dataframe ensure_dataframe(df, function="standardize_dataframe") options = options or StandardizeOptions() out = df.copy() column_types = _resolve_column_types(options, out.columns) cells_changed = 0 cells_unparseable = 0 cells_total = 0 audit_cap = options.audit_max_rows audit_room = float("inf") if audit_cap is None else audit_cap audit_records: list[dict[str, Any]] = [] # Per-row region columns must exist in the frame when set. if options.phone_country_column and options.phone_country_column not in out.columns: from .errors import InputValidationError raise InputValidationError( f"phone_country_column={options.phone_country_column!r} not in input columns", operation="standardize_dataframe", suggestion=f"Available: {list(out.columns)}", ) if options.address_country_column and options.address_country_column not in out.columns: from .errors import InputValidationError raise InputValidationError( f"address_country_column={options.address_country_column!r} not in input columns", operation="standardize_dataframe", suggestion=f"Available: {list(out.columns)}", ) for col, field_type in column_types.items(): series = out[col] cells_total += len(series) dispatcher = _build_cached_dispatcher(field_type, options) # Per-row region lookup. Phones and addresses are the two types # that benefit from country context; everything else ignores the # second argument. region_series: Optional[pd.Series] = None if field_type == FieldType.PHONE and options.phone_country_column: region_series = out[options.phone_country_column] elif field_type == FieldType.ADDRESS and options.address_country_column: region_series = out[options.address_country_column] new_values: list[Any] = [None] * len(series) if region_series is None: triples = [dispatcher(v) for v in series.tolist()] else: regions = region_series.tolist() triples = [ dispatcher(v, _normalize_region(r)) for v, r in zip(series.tolist(), regions) ] for i, (orig, (new, changed, parsed)) in enumerate( zip(series.tolist(), triples) ): new_values[i] = new if changed: cells_changed += 1 if audit_room > 0: audit_records.append({ "row": i, "column": col, "field_type": field_type.value, "old": orig, "new": new, }) audit_room -= 1 if not parsed: cells_unparseable += 1 out[col] = new_values changes_df = pd.DataFrame( audit_records, columns=["row", "column", "field_type", "old", "new"], ) # Surface a warning when more than 10% of typed cells failed to # parse — usually means the user mis-typed a column (text marked # as DATE) or the data is genuinely garbage. Without this, a # quietly-broken pipeline shows zero changes and silently lets bad # data flow downstream. if cells_total > 0 and cells_unparseable / cells_total > 0.1: logger.warning( "standardize_dataframe: {}/{} cells ({}%) in typed columns were " "unparseable — check column_types for mismatches with the data.", cells_unparseable, cells_total, int(100 * cells_unparseable / cells_total), ) # Only log the cap message when it would surprise the caller — # cap=0 is the streaming-path's deliberate "audit budget exhausted" # signal and shouldn't generate noise per chunk. if audit_cap and audit_cap > 0 and cells_changed > audit_cap: logger.info( "standardize_dataframe: audit capped at {} rows " "(cells_changed={}); raise audit_max_rows or set to None for full audit.", audit_cap, cells_changed, ) return StandardizeResult( standardized_df=out, changes=changes_df, cells_changed=cells_changed, cells_unparseable=cells_unparseable, cells_total=cells_total, columns_processed=list(column_types.keys()), ) # --------------------------------------------------------------------------- # Per-row region helpers # --------------------------------------------------------------------------- # Common country-name → ISO-3166 alpha-2 mappings. The phonenumbers # library wants the alpha-2 code, but real spreadsheets carry full names # ("United Kingdom", "Japan", "Brazil"). Add new entries lazily as users # bring in data — the table is a soft mapping, missing entries fall back # to the global ``phone_region``. _COUNTRY_NAME_TO_ISO2: dict[str, str] = { "united states": "US", "usa": "US", "u.s.": "US", "u.s.a.": "US", "united kingdom": "GB", "uk": "GB", "great britain": "GB", "england": "GB", "canada": "CA", "mexico": "MX", "france": "FR", "germany": "DE", "deutschland": "DE", "italy": "IT", "italia": "IT", "spain": "ES", "españa": "ES", "portugal": "PT", "netherlands": "NL", "holland": "NL", "belgium": "BE", "switzerland": "CH", "schweiz": "CH", "austria": "AT", "österreich": "AT", "ireland": "IE", "sweden": "SE", "norway": "NO", "denmark": "DK", "finland": "FI", "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "hungary": "HU", "russia": "RU", "ukraine": "UA", "japan": "JP", "中国": "CN", "china": "CN", "south korea": "KR", "korea": "KR", "india": "IN", "indonesia": "ID", "thailand": "TH", "vietnam": "VN", "philippines": "PH", "malaysia": "MY", "singapore": "SG", "australia": "AU", "new zealand": "NZ", "brazil": "BR", "brasil": "BR", "argentina": "AR", "chile": "CL", "colombia": "CO", "peru": "PE", "south africa": "ZA", "uae": "AE", "united arab emirates": "AE", "saudi arabia": "SA", "egypt": "EG", "israel": "IL", "turkey": "TR", "türkiye": "TR", } def _normalize_region(value: Any) -> Optional[str]: """Normalise a region cell to an ISO-3166 alpha-2 code. Accepts ISO codes (``US``, ``us``, ``USA``), full names (``United States``, ``Japan``), and falls back to None when the value is empty or unrecognized — letting the dispatcher use the global default region. """ if value is None: return None if isinstance(value, float) and pd.isna(value): return None if not isinstance(value, str): value = str(value) s = value.strip() if not s: return None upper = s.upper() # ISO-3166 alpha-2 (e.g. "US", "JP") if len(upper) == 2 and upper.isalpha(): return upper # ISO-3166 alpha-3 (e.g. "USA", "JPN") — strip last letter as a # cheap heuristic, then validate alpha-2. if len(upper) == 3 and upper.isalpha(): # phonenumbers accepts alpha-2 only; map a few common alpha-3. alpha3_map = { "USA": "US", "GBR": "GB", "CAN": "CA", "MEX": "MX", "DEU": "DE", "FRA": "FR", "ITA": "IT", "ESP": "ES", "JPN": "JP", "CHN": "CN", "KOR": "KR", "BRA": "BR", "AUS": "AU", "IND": "IN", "RUS": "RU", } if upper in alpha3_map: return alpha3_map[upper] # Full country name lookup. return _COUNTRY_NAME_TO_ISO2.get(s.lower()) # --------------------------------------------------------------------------- # Streaming entry point — for inputs that don't fit in memory # --------------------------------------------------------------------------- @dataclass class StreamingStandardizeResult: """Summary returned by :func:`standardize_file`. Mirrors :class:`StandardizeResult` but without the in-memory DataFrame — the standardized output is written incrementally to ``output_path``. The ``changes`` audit is also written incrementally to ``audit_path`` and capped at ``options.audit_max_rows`` total rows across all chunks. """ output_path: Path audit_path: Optional[Path] rows_processed: int chunks_processed: int cells_changed: int cells_unparseable: int cells_total: int columns_processed: list[str] def standardize_file( input_path: str | Path, output_path: str | Path, options: Optional[StandardizeOptions] = None, *, chunk_size: int = 50_000, audit_path: Optional[str | Path] = None, progress_callback: Optional[Any] = None, encoding: str = "utf-8", delimiter: str = ",", ) -> StreamingStandardizeResult: """Standardize a CSV/TSV file in chunks, writing output incrementally. For inputs too large to materialize in memory, this entry point streams ``chunk_size`` rows at a time through :func:`standardize_dataframe` and writes each chunk to *output_path* as it completes. Memory stays bounded by the chunk size regardless of input file size. The audit is written to *audit_path* (default ``{output_path.stem}_changes.csv``). Each chunk's ``options.audit_max_rows`` budget is respected per chunk; pass ``audit_max_rows=None`` for a full audit (memory-bounded only by disk). Performance for a 1 GB CSV with ~10 M rows on a typical workstation: - chunk_size=50_000 → ~50 MB peak DataFrame footprint - phone-only standardization: ~3-6 minutes (cache-warm) - mixed phone + currency + address: ~8-15 minutes - first chunk is the cold-cache slowest; later chunks ride the LRU. Parameters ---------- input_path CSV or TSV path. Excel inputs aren't streamed — load with :func:`read_file` and use :func:`standardize_dataframe`. output_path Where to write the standardized CSV. Existing files are overwritten. chunk_size Rows per chunk. Default 50,000 ≈ 50 MB resident for typical widths. Higher → less I/O overhead, more peak memory. progress_callback Optional ``callable(rows_processed, chunks_processed)`` called once per chunk. """ from .errors import wrap_file_read, wrap_file_write options = options or StandardizeOptions() inp = Path(input_path) out = Path(output_path) if not inp.exists(): from .errors import FileAccessError raise FileAccessError( f"Input file not found: {inp}", path=inp, operation="standardize_file", ) audit_p = Path(audit_path) if audit_path else out.with_name( f"{out.stem}_changes.csv" ) rows_processed = 0 chunks_processed = 0 cells_changed = 0 cells_unparseable = 0 cells_total = 0 columns_processed: list[str] = [] audit_room = ( options.audit_max_rows if options.audit_max_rows is not None else float("inf") ) out.parent.mkdir(parents=True, exist_ok=True) audit_p.parent.mkdir(parents=True, exist_ok=True) out_writer_open = False audit_writer_open = False try: reader = pd.read_csv( inp, chunksize=chunk_size, encoding=encoding, sep=delimiter, dtype=str, keep_default_na=False, ) except (OSError, FileNotFoundError) as e: raise wrap_file_read(inp, "standardize_file", e) from e try: for chunk in reader: # The chunked reader gives back row indices that restart # at chunk boundaries; renumber so audit row indices reflect # the full input file. chunk_offset = rows_processed chunk_options = options # Local audit cap per chunk: never exceed the global budget. if options.audit_max_rows is not None and audit_room <= 0: # Disable audit for this chunk by setting cap=0; the # standardizer skips appending records once room == 0. chunk_options = _replace_options(options, audit_max_rows=0) result = standardize_dataframe(chunk, chunk_options) cells_changed += result.cells_changed cells_unparseable += result.cells_unparseable cells_total += result.cells_total if not columns_processed: columns_processed = list(result.columns_processed) # Write the standardized chunk try: if not out_writer_open: result.standardized_df.to_csv( out, mode="w", index=False, encoding=encoding, sep=delimiter, ) out_writer_open = True else: result.standardized_df.to_csv( out, mode="a", index=False, header=False, encoding=encoding, sep=delimiter, ) except OSError as e: raise wrap_file_write(out, "standardize_file", e) from e # Write the audit (re-numbering rows to absolute file positions). if not result.changes.empty and audit_room > 0: # ``audit_room`` is float('inf') when the user wants an # unbounded audit; ``iloc[:inf]`` is invalid, so take the # whole frame in that case. if audit_room == float("inf"): cap_changes = result.changes.copy() else: cap_changes = result.changes.iloc[: int(audit_room)].copy() cap_changes["row"] = cap_changes["row"] + chunk_offset try: if not audit_writer_open: cap_changes.to_csv( audit_p, mode="w", index=False, encoding=encoding, ) audit_writer_open = True else: cap_changes.to_csv( audit_p, mode="a", index=False, header=False, encoding=encoding, ) except OSError as e: raise wrap_file_write(audit_p, "standardize_file", e) from e audit_room -= len(cap_changes) rows_processed += len(chunk) chunks_processed += 1 if progress_callback: try: progress_callback(rows_processed, chunks_processed) except Exception: # Progress callbacks are advisory — don't kill the run. logger.opt(exception=True).debug( "progress_callback raised; ignoring" ) finally: # Ensure the iterator is closed (closes the underlying file). if hasattr(reader, "close"): reader.close() return StreamingStandardizeResult( output_path=out, audit_path=audit_p if audit_writer_open else None, rows_processed=rows_processed, chunks_processed=chunks_processed, cells_changed=cells_changed, cells_unparseable=cells_unparseable, cells_total=cells_total, columns_processed=columns_processed, ) def _replace_options(options: StandardizeOptions, **kwargs: Any) -> StandardizeOptions: """Cheap shallow clone of :class:`StandardizeOptions` with overrides. Used by the streaming path to reduce the audit budget chunk-by-chunk without mutating the caller's options object. """ from dataclasses import replace return replace(options, **kwargs)