Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2912 lines
112 KiB
Python
2912 lines
112 KiB
Python
"""Format standardization for tabular data.
|
||
|
||
Per-cell standardizers turn messy free-form values into a single canonical
|
||
representation: dates → ISO ``YYYY-MM-DD``, phones → E.164 (or other
|
||
formats from ``phonenumbers``), currency → bare numeric strings, names →
|
||
``Title Case``, addresses → expanded USPS forms (``St.`` → ``Street``),
|
||
booleans → ``True``/``False``.
|
||
|
||
Each per-cell function is ``str -> tuple[str, bool]`` — returning
|
||
``(new_value, changed)`` so the DataFrame-level pipeline can audit which
|
||
cells were rewritten and which it left alone (unparseable input passes
|
||
through). All standardizers handle ``None``/empty gracefully and are
|
||
idempotent (applying twice yields the same result as once).
|
||
|
||
The DataFrame entry point :func:`standardize_dataframe` mirrors
|
||
:func:`src.core.text_clean.clean_dataframe` in shape: per-column type
|
||
assignments drive the pipeline, the input DataFrame is not mutated, and
|
||
a :class:`StandardizeResult` carries both the rewritten frame and a
|
||
row-by-row change audit.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
|
||
from loguru import logger
|
||
from dataclasses import asdict, dataclass, field
|
||
from datetime import datetime, timedelta
|
||
from enum import Enum
|
||
from pathlib import Path
|
||
from typing import Any, Iterable, Literal, Optional
|
||
|
||
import pandas as pd
|
||
import phonenumbers
|
||
|
||
from .text_clean import smart_title_case
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Field-type registry
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class FieldType(str, Enum):
    """Closed set of value kinds the standardizer can canonicalize.

    The ``str`` mixin makes every member compare equal to its plain
    string value, so ``FieldType.DATE == "date"`` holds.
    """

    # Calendar dates.
    DATE = "date"
    # Telephone numbers.
    PHONE = "phone"
    # Monetary amounts.
    CURRENCY = "currency"
    # Personal names.
    NAME = "name"
    # Postal addresses.
    ADDRESS = "address"
    # Truthy / falsy flags.
    BOOLEAN = "boolean"
    # E-mail addresses.
    EMAIL = "email"
|
||
|
||
|
||
# Shared error-policy helper used by every per-domain standardizer.
|
||
# Returns ``(<error: reason>, changed)`` under the ``"sentinel"`` policy
|
||
# and ``(value, False)`` under ``"passthrough"`` so unparseable input
|
||
# survives unchanged.
|
||
def _err_or_passthrough(
|
||
reason: str, value: str, policy: str,
|
||
) -> tuple[str, bool]:
|
||
if policy == "sentinel":
|
||
sentinel = f"<error: {reason}>"
|
||
return sentinel, sentinel != value
|
||
return value, False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Date
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Order matters: longer / more-specific formats first. Two-digit-year
|
||
# formats sit below their four-digit counterparts so ``2024-01-15`` parses
|
||
# as ISO before ``%y-%m-%d`` even gets a look-in.
|
||
_DATE_FORMATS_MDY = [
|
||
"%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d",
|
||
"%m/%d/%Y", "%m-%d-%Y", "%m.%d.%Y",
|
||
"%m/%d/%y", "%m-%d-%y",
|
||
"%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
|
||
"%d %B %Y", "%d %b %Y",
|
||
"%d-%b-%Y", "%d-%b-%y",
|
||
"%Y%m%d",
|
||
]
|
||
|
||
_DATE_FORMATS_DMY = [
|
||
"%Y-%m-%d", "%Y/%m/%d", "%Y.%m.%d",
|
||
"%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y",
|
||
"%d/%m/%y", "%d-%m-%y", "%d.%m.%y",
|
||
"%d %B %Y", "%d %b %Y",
|
||
"%B %d, %Y", "%b %d, %Y", "%B %d %Y", "%b %d %Y",
|
||
"%d-%b-%Y", "%d-%b-%y",
|
||
"%Y%m%d",
|
||
]
|
||
|
||
# Weekday-prefixed long form: ``Monday, January 15, 2024``.
|
||
_WEEKDAY_PREFIX_RE = re.compile(
|
||
r"^(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)(?:day|sday|nesday|rsday|urday)?\s*,?\s+",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
# Strip a trailing time component (``2024-01-15 13:45:00`` etc.) before
|
||
# format-matching the date portion.
|
||
_TIME_TAIL_RE = re.compile(r"[\sT]\d{1,2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:\s*[AaPp][Mm])?(?:\s*[+-]\d{2}:?\d{2}|\s*Z|\s*[A-Z]{2,4})?$")
|
||
|
||
# Buried date: a strict YYYY-MM-DD substring inside other text, used
|
||
# only when the whole string fails strptime first.
|
||
_BURIED_ISO_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
|
||
|
||
# Excel serial date range — Jan 1 1970 to Jan 1 2099 (inclusive). Excel
|
||
# 1900 leap year bug: serials >= 60 are off by one because Excel pretends
|
||
# 1900-02-29 exists; we subtract a day in that range.
|
||
_EXCEL_SERIAL_MIN = 25569.0 # Jan 1 1970
|
||
_EXCEL_SERIAL_MAX = 73050.0 # Jan 1 2099
|
||
_EXCEL_EPOCH = datetime(1899, 12, 30) # accounts for the leap-year bug
|
||
|
||
# Unix timestamp ranges — covers Jan 1 2000 to Jan 1 2100 in seconds and
|
||
# milliseconds. Narrow enough that we don't false-positive on other ints.
|
||
_UNIX_S_MIN = 946684800 # 2000-01-01 00:00:00 UTC
|
||
_UNIX_S_MAX = 4102444800 # 2100-01-01 00:00:00 UTC
|
||
_UNIX_MS_MIN = _UNIX_S_MIN * 1000
|
||
_UNIX_MS_MAX = _UNIX_S_MAX * 1000
|
||
|
||
# Year-month text (``January 2024`` / ``Jan 2024``) → ``YYYY-MM``.
|
||
_MONTH_NAMES_EN = [
|
||
"january", "february", "march", "april", "may", "june",
|
||
"july", "august", "september", "october", "november", "december",
|
||
]
|
||
_MONTH_ABBR_EN = ["jan", "feb", "mar", "apr", "may", "jun",
|
||
"jul", "aug", "sep", "oct", "nov", "dec"]
|
||
_YEAR_MONTH_TEXT_RE = re.compile(
|
||
rf"^\s*({'|'.join(_MONTH_NAMES_EN + _MONTH_ABBR_EN)})\s+(\d{{4}})\s*$",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
# Quarter notation: ``Q1 2024`` → ``2024-Q1``.
|
||
_QUARTER_RE = re.compile(r"^\s*Q([1-4])\s+(\d{4})\s*$", re.IGNORECASE)
|
||
|
||
# Localized month names → English. Substituted before strptime so the
|
||
# regular ``%B``/``%b`` formats catch them. Includes both full and
|
||
# abbreviated forms where conventional.
|
||
_MONTH_LOCALES: dict[str, dict[str, str]] = {
|
||
"fr": {
|
||
"janvier": "January", "février": "February", "fevrier": "February",
|
||
"mars": "March", "avril": "April", "mai": "May", "juin": "June",
|
||
"juillet": "July", "août": "August", "aout": "August",
|
||
"septembre": "September", "octobre": "October",
|
||
"novembre": "November", "décembre": "December", "decembre": "December",
|
||
"janv": "Jan", "févr": "Feb", "fevr": "Feb", "avr": "Apr",
|
||
"juil": "Jul", "sept": "Sep", "oct": "Oct", "nov": "Nov",
|
||
"déc": "Dec", "dec": "Dec",
|
||
},
|
||
"de": {
|
||
"januar": "January", "februar": "February", "märz": "March",
|
||
"marz": "March", "april": "April", "mai": "May", "juni": "June",
|
||
"juli": "July", "august": "August", "september": "September",
|
||
"oktober": "October", "november": "November", "dezember": "December",
|
||
"jan": "Jan", "feb": "Feb", "mär": "Mar", "mar": "Mar",
|
||
"apr": "Apr", "jun": "Jun", "jul": "Jul", "aug": "Aug",
|
||
"sep": "Sep", "okt": "Oct", "nov": "Nov", "dez": "Dec",
|
||
},
|
||
"es": {
|
||
"enero": "January", "febrero": "February", "marzo": "March",
|
||
"abril": "April", "mayo": "May", "junio": "June", "julio": "July",
|
||
"agosto": "August", "septiembre": "September", "setiembre": "September",
|
||
"octubre": "October", "noviembre": "November", "diciembre": "December",
|
||
},
|
||
"pt": {
|
||
"janeiro": "January", "fevereiro": "February", "março": "March",
|
||
"marco": "March", "abril": "April", "maio": "May", "junho": "June",
|
||
"julho": "July", "agosto": "August", "setembro": "September",
|
||
"outubro": "October", "novembro": "November", "dezembro": "December",
|
||
"jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr",
|
||
"mai": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug",
|
||
"set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec",
|
||
},
|
||
"it": {
|
||
"gennaio": "January", "febbraio": "February", "marzo": "March",
|
||
"aprile": "April", "maggio": "May", "giugno": "June",
|
||
"luglio": "July", "agosto": "August", "settembre": "September",
|
||
"ottobre": "October", "novembre": "November", "dicembre": "December",
|
||
"gen": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr",
|
||
"mag": "May", "giu": "Jun", "lug": "Jul", "ago": "Aug",
|
||
"set": "Sep", "ott": "Oct", "nov": "Nov", "dic": "Dec",
|
||
},
|
||
"nl": {
|
||
"januari": "January", "februari": "February", "maart": "March",
|
||
"april": "April", "mei": "May", "juni": "June", "juli": "July",
|
||
"augustus": "August", "september": "September", "oktober": "October",
|
||
"november": "November", "december": "December",
|
||
"jan": "Jan", "feb": "Feb", "mrt": "Mar", "apr": "Apr",
|
||
"mei": "May", "jun": "Jun", "jul": "Jul", "aug": "Aug",
|
||
"sep": "Sep", "okt": "Oct", "nov": "Nov", "dec": "Dec",
|
||
},
|
||
"ru": {
|
||
"января": "January", "февраля": "February", "марта": "March",
|
||
"апреля": "April", "мая": "May", "июня": "June", "июля": "July",
|
||
"августа": "August", "сентября": "September", "октября": "October",
|
||
"ноября": "November", "декабря": "December",
|
||
# Nominative forms (less common in dates but possible)
|
||
"январь": "January", "февраль": "February", "март": "March",
|
||
"апрель": "April", "май": "May", "июнь": "June", "июль": "July",
|
||
"август": "August", "сентябрь": "September", "октябрь": "October",
|
||
"ноябрь": "November", "декабрь": "December",
|
||
},
|
||
}
|
||
|
||
# Localized weekday prefix removal — same idea as month substitution.
|
||
# Each locale's set lists full + abbreviated forms (lowercase) that
|
||
# should be stripped from the start of a date string before format
|
||
# matching. English is in ``_WEEKDAY_PREFIX_RE`` already.
|
||
_WEEKDAY_LOCALES: dict[str, list[str]] = {
|
||
"fr": ["lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi",
|
||
"dimanche", "lun", "mar", "mer", "jeu", "ven", "sam", "dim"],
|
||
"de": ["montag", "dienstag", "mittwoch", "donnerstag", "freitag",
|
||
"samstag", "sonntag", "mo", "di", "mi", "do", "fr", "sa", "so"],
|
||
"es": ["lunes", "martes", "miércoles", "miercoles", "jueves",
|
||
"viernes", "sábado", "sabado", "domingo"],
|
||
"it": ["lunedì", "lunedi", "martedì", "martedi", "mercoledì",
|
||
"mercoledi", "giovedì", "giovedi", "venerdì", "venerdi",
|
||
"sabato", "domenica"],
|
||
"pt": ["segunda-feira", "segunda", "terça-feira", "terca-feira",
|
||
"terça", "terca", "quarta-feira", "quarta", "quinta-feira",
|
||
"quinta", "sexta-feira", "sexta", "sábado", "sabado", "domingo"],
|
||
"nl": ["maandag", "dinsdag", "woensdag", "donderdag", "vrijdag",
|
||
"zaterdag", "zondag",
|
||
"ma", "di", "wo", "do", "vr", "za", "zo"],
|
||
"ru": ["понедельник", "вторник", "среда", "четверг", "пятница",
|
||
"суббота", "воскресенье",
|
||
"пн", "вт", "ср", "чт", "пт", "сб", "вс"],
|
||
}
|
||
|
||
|
||
def _build_weekday_patterns() -> dict[str, "re.Pattern[str]"]:
    """Compile one leading-weekday regex per locale in ``_WEEKDAY_LOCALES``.

    Each pattern is anchored at the start of the string, is
    case-insensitive, and consumes the weekday word, an optional comma and
    the whitespace that follows it.
    """

    def _compile(words: list[str]) -> "re.Pattern[str]":
        # Longest alternatives first so ``segunda-feira`` beats ``segunda``.
        by_length = sorted(words, key=len, reverse=True)
        alternation = "|".join(map(re.escape, by_length))
        return re.compile(rf"^(?:{alternation})\s*,?\s+", re.IGNORECASE)

    return {
        locale: _compile(words) for locale, words in _WEEKDAY_LOCALES.items()
    }


_WEEKDAY_LOCALE_PATTERNS = _build_weekday_patterns()
|
||
|
||
|
||
# Named timezone → fixed UTC offset. Resolves common abbreviations so
|
||
# ``2024-01-15 10:30:00 EST`` produces a date instead of falling through
|
||
# unparseably. Per FORMATS-CASES.md § 3.3, these are *fixed* offsets —
|
||
# DST-aware handling is out of scope (would require pyzoneinfo).
|
||
_NAMED_TZ_OFFSETS: dict[str, str] = {
|
||
# Universal
|
||
"UTC": "+00:00", "GMT": "+00:00", "Z": "+00:00",
|
||
# Americas
|
||
"EST": "-05:00", "EDT": "-04:00",
|
||
"CST": "-06:00", "CDT": "-05:00",
|
||
"MST": "-07:00", "MDT": "-06:00",
|
||
"PST": "-08:00", "PDT": "-07:00",
|
||
"AST": "-04:00", "AKST": "-09:00", "HST": "-10:00",
|
||
"BRT": "-03:00", "ART": "-03:00",
|
||
# Europe
|
||
"BST": "+01:00", "CET": "+01:00", "CEST": "+02:00",
|
||
"EET": "+02:00", "EEST": "+03:00", "WET": "+00:00", "WEST": "+01:00",
|
||
"MSK": "+03:00",
|
||
# Asia / Pacific
|
||
"IST": "+05:30",
|
||
"PKT": "+05:00", "BDT": "+06:00",
|
||
"ICT": "+07:00", "WIB": "+07:00",
|
||
"CST_CN": "+08:00", "HKT": "+08:00", "SGT": "+08:00", "PHT": "+08:00",
|
||
"JST": "+09:00", "KST": "+09:00",
|
||
"AEST": "+10:00", "AEDT": "+11:00", "NZST": "+12:00",
|
||
}
|
||
|
||
|
||
def _build_month_locale_patterns() -> dict[str, list[tuple["re.Pattern[str]", str]]]:
    """Compile the ``_MONTH_LOCALES`` tables into (pattern, English) pairs.

    Done once at import time so that no regex has to be compiled while
    individual cells are being processed. Every pattern is
    case-insensitive and refuses to match when the foreign word is
    directly adjacent to an ASCII letter; pairs keep the table's
    insertion order.
    """
    compiled: dict[str, list[tuple[re.Pattern[str], str]]] = {}
    for locale, names in _MONTH_LOCALES.items():
        pairs = []
        for foreign, english in names.items():
            word = re.escape(foreign)
            pattern = re.compile(
                rf"(?<![A-Za-z]){word}(?![A-Za-z])", re.IGNORECASE,
            )
            pairs.append((pattern, english))
        compiled[locale] = pairs
    return compiled


_MONTH_LOCALE_PATTERNS = _build_month_locale_patterns()
|
||
|
||
|
||
def _apply_month_locale(s: str, locales: list[str]) -> str:
    """Rewrite localized month names in *s* as their English equivalents.

    ``"en"`` entries are accepted and ignored. Substitutions run locale by
    locale, in the order given, using the precompiled
    ``_MONTH_LOCALE_PATTERNS``.

    Raises:
        ValueError: if any requested locale is neither ``"en"`` nor a key
            of ``_MONTH_LOCALES``. Failing loudly surfaces typos such as
            ``"FR"`` or ``"french"`` instead of silently skipping them.
    """
    unknown = [
        loc for loc in locales if not (loc == "en" or loc in _MONTH_LOCALES)
    ]
    if unknown:
        raise ValueError(
            f"Unknown month locale(s): {unknown}. "
            f"Available: {sorted(_MONTH_LOCALES) + ['en']}"
        )

    for loc in locales:
        if loc != "en":
            for pattern, replacement in _MONTH_LOCALE_PATTERNS[loc]:
                s = pattern.sub(replacement, s)
    return s
|
||
|
||
|
||
def _try_excel_serial(s: str, output_format: str) -> Optional[str]:
    """Interpret *s* as an Excel-1900 serial date and format it.

    Returns ``None`` when *s* is not a number, lies outside
    ``[_EXCEL_SERIAL_MIN, _EXCEL_SERIAL_MAX]``, or cannot be formatted.
    Any fractional (time-of-day) part of the serial is discarded.
    """
    try:
        serial = float(s)
    except ValueError:
        return None

    inside_window = _EXCEL_SERIAL_MIN <= serial <= _EXCEL_SERIAL_MAX
    if not inside_window:
        return None

    # ``_EXCEL_EPOCH`` (1899-12-30) already compensates for Excel's
    # phantom 1900-02-29 for every serial >= 60, and the supported window
    # starts far above that, so no further adjustment is needed here.
    try:
        moment = _EXCEL_EPOCH + timedelta(days=int(serial))
        return moment.strftime(output_format)
    except (OverflowError, ValueError):
        return None
|
||
|
||
|
||
def _try_unix_timestamp(s: str, output_format: str) -> Optional[str]:
    """Interpret *s* as a Unix timestamp and format it as a UTC date.

    Integers inside ``[_UNIX_S_MIN, _UNIX_S_MAX]`` are read as seconds;
    integers inside ``[_UNIX_MS_MIN, _UNIX_MS_MAX]`` are read as
    milliseconds (sub-second precision is dropped). Anything else,
    including non-integer text, yields ``None``.
    """
    try:
        n = int(s)
    except ValueError:
        return None

    if _UNIX_S_MIN <= n <= _UNIX_S_MAX:
        seconds = n
    elif _UNIX_MS_MIN <= n <= _UNIX_MS_MAX:
        seconds = n // 1000
    else:
        return None

    try:
        # Plain epoch arithmetic gives the same naive-UTC datetime that
        # ``datetime.utcfromtimestamp`` did, without relying on that
        # deprecated (Python 3.12+) and platform-dependent call.
        moment = datetime(1970, 1, 1) + timedelta(seconds=seconds)
        return moment.strftime(output_format)
    except (OverflowError, ValueError):
        return None
|
||
|
||
|
||
DateOrder = Literal["MDY", "DMY"]
DateErrorPolicy = Literal["passthrough", "sentinel"]


def standardize_date(
    value: Optional[str],
    *,
    output_format: str = "%Y-%m-%d",
    date_order: DateOrder = "MDY",
    error_policy: DateErrorPolicy = "passthrough",
    month_locales: Optional[list[str]] = None,
    two_digit_year_cutoff: int = 69,
) -> tuple[str, bool]:
    """Standardize one date cell and report whether it changed.

    Recognition is attempted in this order, first hit wins:

    1. pure numbers: Excel-1900 serials (``45306`` → ``2024-01-15``), then
       Unix timestamps in seconds or milliseconds;
    2. year-month text (``January 2024`` → ``2024-01``) and quarter
       notation (``Q1 2024`` → ``2024-Q1``); these keep their reduced
       precision and ignore *output_format*;
    3. after normalization (CJK markers, localized month and weekday
       names, English weekday prefix, named timezone, trailing time):
       ISO 8601 week/ordinal dates, RFC 2822 dates, then the ordered
       ``strptime`` format list selected by *date_order*;
    4. a strict ``YYYY-MM-DD`` buried anywhere in the original text.

    Args:
        value: Raw cell. Falsy or non-string input is returned as
            ``value or ""`` with ``changed=False``.
        output_format: ``strftime`` format for full-precision results.
        date_order: ``"MDY"`` reads ``01/02/2024`` as Jan 2, ``"DMY"`` as
            Feb 1. Year-first inputs parse identically under both.
        error_policy: ``"passthrough"`` (default) never emits errors.
            ``"sentinel"`` emits ``<error: <reason>>`` for inputs shaped
            like ``YYYY-M-D`` whose month/day is impossible (corpus
            § 0.3). Any other unparseable text passes through unchanged
            under both policies.
        month_locales: Locales (subset of ``en``, ``fr``, ``de``, ``es``,
            ``pt``, ``it``, ``nl``, ``ru``) whose month and weekday names
            should be understood in addition to English. ``None`` or
            empty means English only.
        two_digit_year_cutoff: Pivot for two-digit years; ``00..cutoff``
            map to 20xx and the rest to 19xx. Defaults to 69.

    Returns:
        ``(new_value, changed)`` where ``changed`` is ``new_value != value``.

    Raises:
        ValueError: if *month_locales* contains an unknown locale.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def fail(reason: str) -> tuple[str, bool]:
        return _err_or_passthrough(reason, value, error_policy)

    # Serials and timestamps would not survive the prefix / time-tail
    # stripping further down, so pure-numeric input is handled first.
    if re.match(r"^-?\d+(?:\.\d+)?$", s):
        for convert in (_try_excel_serial, _try_unix_timestamp):
            converted = convert(s, output_format)
            if converted is not None:
                return converted, converted != value

    # Year-month text keeps month precision: ``January 2024`` → ``2024-01``.
    year_month = _YEAR_MONTH_TEXT_RE.match(s)
    if year_month:
        word = year_month.group(1).lower()
        table = _MONTH_NAMES_EN if word in _MONTH_NAMES_EN else _MONTH_ABBR_EN
        result = f"{year_month.group(2)}-{table.index(word) + 1:02d}"
        return result, result != value

    # Quarter notation: ``Q1 2024`` → ``2024-Q1``.
    quarter = _QUARTER_RE.match(s)
    if quarter:
        result = f"{quarter.group(2)}-Q{quarter.group(1)}"
        return result, result != value

    # Fold East Asian date markers (``2024年01月15日``) and fullwidth
    # digits (U+FF10..U+FF19) to ASCII so the parsers below can read them.
    s = _normalize_cjk_date_chars(s)

    if month_locales:
        # Localized month names become English before format matching.
        s = _apply_month_locale(s, month_locales)
        # Localized weekday prefixes go next, BEFORE the day-period strip;
        # otherwise ``Montag, 15. Januar 2024`` never reaches the
        # digit-leading shape that strip expects.
        for loc in month_locales:
            weekday_pattern = _WEEKDAY_LOCALE_PATTERNS.get(loc)
            if weekday_pattern is None:
                continue
            s = weekday_pattern.sub("", s).strip()
        # German day-first dates write the day as ``15.``; drop the period
        # so ``15. Januar 2024`` reads as ``15 January 2024``.
        s = re.sub(r"^(\d{1,2})\.\s+", r"\1 ", s)

    # English weekday prefix (``Monday, January 15, 2024``).
    s = _WEEKDAY_PREFIX_RE.sub("", s).strip()
    # Named timezones become fixed offsets, then the whole trailing time
    # portion is removed.
    s = _resolve_named_tz(s)
    s = _TIME_TAIL_RE.sub("", s).strip()

    # ISO 8601 week / ordinal dates, then the RFC 2822 mail-header form.
    for convert in (_try_iso_extended, _try_rfc2822):
        converted = convert(s, output_format)
        if converted is not None:
            return converted, converted != value

    parsed = _try_parse_date(s, date_order, two_digit_year_cutoff)
    if parsed is not None:
        result = parsed.strftime(output_format)
        return result, result != value

    # Buried date: a strict ISO substring of the ORIGINAL text, e.g.
    # ``Date: 2024-01-15`` or ``2024-01-15 (verified)``.
    buried = _BURIED_ISO_DATE_RE.search(value)
    if buried:
        try:
            parsed = datetime.strptime(buried.group(1), "%Y-%m-%d")
            result = parsed.strftime(output_format)
            return result, result != value
        except ValueError:
            pass

    # Explicit-but-impossible dates get a specific reason instead of a
    # silent passthrough. Every other shape (partial precision, free
    # text) is returned unchanged whatever the error policy.
    shaped = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})$", s)
    if shaped:
        year, month, day = int(shaped[1]), int(shaped[2]), int(shaped[3])
        if (year, month, day) == (1900, 2, 29):
            return fail("Excel 1900 leap year bug")
        if not 1 <= month <= 12:
            return fail("invalid month")
        if not 1 <= day <= 31:
            return fail("invalid day")
        if month == 2:
            is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
            february_days = 29 if is_leap else 28
            if day > february_days:
                return fail("invalid leap day" if day == 29 else "invalid day")
        if month in {4, 6, 9, 11} and day > 30:
            return fail("invalid day")

    return value, False
|
||
|
||
|
||
def _try_parse_date(
    s: str, date_order: DateOrder, two_digit_year_cutoff: int = 69,
) -> Optional[datetime]:
    """Return the first successful ``strptime`` parse of *s*, else ``None``.

    Formats come from ``_DATE_FORMATS_DMY`` when *date_order* is ``"DMY"``
    and from ``_DATE_FORMATS_MDY`` otherwise, tried in list order.
    """
    candidates = _DATE_FORMATS_MDY if date_order != "DMY" else _DATE_FORMATS_DMY
    # strptime pivots two-digit years at 69; only a different cutoff
    # requires moving the century afterwards.
    needs_repivot = two_digit_year_cutoff != 69

    for fmt in candidates:
        try:
            parsed = datetime.strptime(s, fmt)
        except ValueError:
            continue
        if needs_repivot and "%y" in fmt:
            short_year = parsed.year % 100
            century = 2000 if short_year <= two_digit_year_cutoff else 1900
            parsed = parsed.replace(year=century + short_year)
        return parsed
    return None
|
||
|
||
|
||
_FULLWIDTH_DIGITS = str.maketrans("0123456789", "0123456789")
|
||
_CJK_DATE_MARKERS = str.maketrans({"年": "-", "月": "-", "日": "", ".": ".", "/": "/"})
|
||
|
||
|
||
def _normalize_cjk_date_chars(s: str) -> str:
|
||
"""Fold East Asian date markers + fullwidth digits to ASCII equivalents.
|
||
|
||
``2024年01月15日`` → ``2024-01-15``; fullwidth ``2024/01/15``
|
||
→ ``2024/01/15``. Idempotent on ASCII input.
|
||
"""
|
||
if not any(c > "\x7f" for c in s):
|
||
return s
|
||
s = s.translate(_FULLWIDTH_DIGITS).translate(_CJK_DATE_MARKERS)
|
||
# ``2024年01月15日`` becomes ``2024-01-15-`` with our trailing-day
|
||
# mapping; strip any trailing dash artifact.
|
||
return s.rstrip("-").strip()
|
||
|
||
|
||
# Alternation of every known abbreviation, longest first so that e.g.
# ``CEST`` is preferred over ``CET``. A match needs leading whitespace
# and a word boundary after the abbreviation; it is case-sensitive.
_NAMED_TZ_RE = re.compile(
    r"\s+("
    + "|".join(map(re.escape, sorted(_NAMED_TZ_OFFSETS, key=len, reverse=True)))
    + r")\b"
)


def _resolve_named_tz(s: str) -> str:
    """Swap named timezone abbreviations in *s* for fixed UTC offsets.

    ``2024-01-15 10:30:00 EST`` → ``2024-01-15 10:30:00-05:00``; the
    whitespace before the abbreviation is consumed along with it. The
    pattern is not anchored to the end of the string, so every
    whitespace-preceded abbreviation is replaced. Offsets are fixed, not
    DST-aware (FORMATS-CASES.md § 3.3); ``_NAMED_TZ_OFFSETS`` holds the
    table.
    """
    return _NAMED_TZ_RE.sub(lambda hit: _NAMED_TZ_OFFSETS[hit.group(1)], s)
|
||
|
||
|
||
_ISO_WEEK_RE = re.compile(r"^(\d{4})-W(\d{2})-(\d)$")
|
||
_ISO_ORDINAL_RE = re.compile(r"^(\d{4})-(\d{3})$")
|
||
|
||
|
||
def _try_iso_extended(s: str, output_format: str) -> Optional[str]:
|
||
"""Parse ISO 8601 week date or ordinal date, return formatted string."""
|
||
m = _ISO_WEEK_RE.match(s)
|
||
if m:
|
||
try:
|
||
parsed = datetime.fromisocalendar(
|
||
int(m.group(1)), int(m.group(2)), int(m.group(3)),
|
||
)
|
||
return parsed.strftime(output_format)
|
||
except ValueError:
|
||
return None
|
||
m = _ISO_ORDINAL_RE.match(s)
|
||
if m:
|
||
year, day = int(m.group(1)), int(m.group(2))
|
||
if 1 <= day <= 366:
|
||
try:
|
||
parsed = datetime(year, 1, 1) + timedelta(days=day - 1)
|
||
if parsed.year == year:
|
||
return parsed.strftime(output_format)
|
||
except ValueError:
|
||
return None
|
||
return None
|
||
|
||
|
||
# RFC 2822 mail-header form: ``Wed, 15 Jan 2024 10:30:00 GMT``.
|
||
_RFC2822_FORMATS = [
|
||
"%a, %d %b %Y %H:%M:%S", # without TZ
|
||
"%a, %d %b %Y %H:%M:%S %Z", # with named TZ (already resolved upstream)
|
||
"%a, %d %b %Y %H:%M:%S %z", # with offset
|
||
"%d %b %Y %H:%M:%S",
|
||
]
|
||
|
||
|
||
def _try_rfc2822(s: str, output_format: str) -> Optional[str]:
|
||
"""Parse RFC 2822 mail-header date format."""
|
||
for fmt in _RFC2822_FORMATS:
|
||
try:
|
||
parsed = datetime.strptime(s, fmt)
|
||
except ValueError:
|
||
continue
|
||
try:
|
||
return parsed.strftime(output_format)
|
||
except ValueError:
|
||
return None
|
||
return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Phone
|
||
# ---------------------------------------------------------------------------
|
||
|
||
PhoneFormat = Literal["E164", "INTERNATIONAL", "NATIONAL", "DIGITS"]
PhoneErrorPolicy = Literal["passthrough", "sentinel"]

# ``DIGITS`` is deliberately absent: it never goes through ``phonenumbers``.
_PHONE_FORMAT_MAP = {
    "E164": phonenumbers.PhoneNumberFormat.E164,
    "INTERNATIONAL": phonenumbers.PhoneNumberFormat.INTERNATIONAL,
    "NATIONAL": phonenumbers.PhoneNumberFormat.NATIONAL,
}

# CRM-style "no phone" placeholders: one digit repeated ten times (NANP
# length), optionally behind ``+`` / ``1`` and with separators between.
_PHONE_PLACEHOLDER_RE = re.compile(r"^\+?1?[\s.()-]*([0-9])(?:[\s.()-]*\1){9}$")
# Separators between several numbers in one cell: ``/``, ``;``, ``,``
# or the word `` and ``.
_PHONE_MULTI_SPLIT_RE = re.compile(r"\s*(?:/|;|,| and )\s*")


def standardize_phone(
    value: Optional[str],
    *,
    output_format: PhoneFormat = "E164",
    default_region: str = "US",
    error_policy: PhoneErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Standardize one phone cell and report whether it changed.

    Args:
        value: Raw cell. Falsy or non-string input is returned as
            ``value or ""`` with ``changed=False``.
        output_format: ``"DIGITS"`` keeps only the digit characters and
            bypasses ``phonenumbers`` entirely. The other formats render
            the parsed number through ``phonenumbers.format_number``.
        default_region: Region used to interpret numbers that carry no
            country code.
        error_policy: ``"passthrough"`` (default) returns problem input
            unchanged. ``"sentinel"`` returns ``<error: <reason>>`` for
            multi-number cells, smart-quote contamination, placeholder
            runs (``000-000-0000``), unparseable text, and wrong digit
            counts (corpus § 4.3).

    Notes:
        * A leading ``001`` access prefix followed by a space or hyphen is
          stripped before parsing; the remaining digits are parsed as a
          number in *default_region*.
        * On ``E164`` output an extension is appended as ``;ext=N``
          (RFC 3966 syntax). The other formats are left as
          ``phonenumbers`` renders them.
        * A number that parses but is "not possible" and does not fall
          under the US/CA digit-count rules passes through unchanged even
          under the sentinel policy.

    Returns:
        ``(new_value, changed)``.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def fail(reason: str) -> tuple[str, bool]:
        return _err_or_passthrough(reason, value, error_policy)

    if output_format == "DIGITS":
        digits_only = re.sub(r"\D", "", s)
        if not digits_only:
            return value, False
        return digits_only, digits_only != value

    # Several numbers in one cell: flag it rather than quietly keep only
    # the first. Applies only when every piece is itself a possible number
    # (``5551234567 / 5559876543``).
    if _PHONE_MULTI_SPLIT_RE.search(s):
        pieces = [piece for piece in _PHONE_MULTI_SPLIT_RE.split(s) if piece.strip()]
        if len(pieces) >= 2 and all(
            _looks_like_phone(piece, default_region) for piece in pieces
        ):
            return fail("multiple numbers in cell")

    # Smart quotes (plus any lowercase letters glued to them) are noise.
    # Sentinel policy reports them; passthrough strips them and carries on.
    if any(quote in s for quote in "‘’“”"):
        without_quotes = re.sub(r"[‘’“”][a-z]*", "", s).strip()
        if without_quotes != s:
            if error_policy == "sentinel":
                return fail("smart-quote contamination")
            s = without_quotes

    # ``001`` dial-out prefix: remove it altogether so what is left is an
    # ordinary national number the default region can resolve.
    if re.match(r"^001[\s\-]", s):
        s = s[3:].lstrip(" -")

    # Same digit repeated across the whole number.
    if _PHONE_PLACEHOLDER_RE.match(s):
        return fail("placeholder number")

    render_as = _PHONE_FORMAT_MAP[output_format]
    try:
        parsed = phonenumbers.parse(s, default_region)
    except phonenumbers.NumberParseException:
        # Covers both badly formatted digits and plain non-numeric text
        # ("TBD"-style cells).
        return fail("not a phone number")

    if not phonenumbers.is_possible_number(parsed):
        # For US/CA defaults, turn the digit count into a specific reason.
        digit_count = len(re.sub(r"\D", "", s))
        nanp_default = default_region in {"US", "CA"}
        if nanp_default and digit_count > 11:
            return fail("too many digits")
        if nanp_default and 0 < digit_count < 10:
            return fail("insufficient digits")
        # Not possible, and no specific rule applies: pass through.
        return value, False

    # Surplus digits: input such as ``1-555-123-4567-extra-99`` carries
    # noticeably more digits than the parsed number; report that instead
    # of silently truncating.
    input_digits = re.sub(r"\D", "", s)
    e164_digits = re.sub(
        r"\D",
        "",
        phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164),
    )
    if len(input_digits) > len(e164_digits) + 4:
        return fail("too many digits")

    # ``is_possible_number`` is lenient; corpus § 4.3 wants short
    # country-code-1 numbers flagged explicitly.
    if parsed.country_code == 1 and len(str(parsed.national_number)) < 10:
        return fail("insufficient digits")

    formatted = phonenumbers.format_number(parsed, render_as)
    # E.164 has no extension syntax of its own, so add an RFC 3966 suffix.
    if output_format == "E164" and parsed.extension:
        formatted = f"{formatted};ext={parsed.extension}"

    return formatted, formatted != value
|
||
|
||
|
||
def _looks_like_phone(s: str, region: str) -> bool:
    """Report whether *s* parses as a possible phone number for *region*.

    Returns ``False`` when ``phonenumbers.parse`` raises
    ``NumberParseException``; otherwise returns the verdict of
    ``phonenumbers.is_possible_number`` on the parsed number.
    """
    try:
        candidate = phonenumbers.parse(s, region)
    except phonenumbers.NumberParseException:
        return False
    else:
        return phonenumbers.is_possible_number(candidate)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Currency
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Symbol → ISO 4217 mapping. Used both for stripping currency markers
|
||
# before number parsing AND for the optional ``preserve_code`` mode that
|
||
# re-emits the detected code as a prefix on the standardized output.
|
||
_SYMBOL_TO_ISO: dict[str, str] = {
    "$": "USD",  # ambiguous w/ CAD/AUD/MXN — caller can override via input code
    "€": "EUR",
    "£": "GBP",
    "¥": "JPY",  # ambiguous w/ CNY — same caveat
    "₹": "INR",
    "₩": "KRW",
    "₽": "RUB",
    "₪": "ILS",
    "₺": "TRY",
    "¢": "USD",  # cents — coerce to USD for the code; value is still numeric
    # International additions:
    "฿": "THB",  # Thai Baht
    "₫": "VND",  # Vietnamese Dong
    "₮": "MNT",  # Mongolian Tugrik
    "₴": "UAH",  # Ukrainian Hryvnia
    "₦": "NGN",  # Nigerian Naira
    "₱": "PHP",  # Philippine Peso
    "₲": "PYG",  # Paraguayan Guarani
    "﷼": "SAR",  # ambiguous Saudi/Omani/Iranian; pick the most common
    "₨": "PKR",  # Pakistani Rupee (and historical Sri Lankan)
    "₵": "GHS",  # Ghanaian Cedi
}

# Every single-codepoint symbol, concatenated in table order. Left
# unescaped so the value itself is unchanged for any non-regex reader.
_CURRENCY_SYMBOLS = "".join(_SYMBOL_TO_ISO)

# Regex-safe spelling of the same characters for use inside ``[...]``.
# Escaping keeps the character classes below valid even if a symbol that
# is special inside a class (``]``, ``^``, ``-``, ``\``) is added later.
_CURRENCY_SYMBOL_CLASS = re.escape(_CURRENCY_SYMBOLS)

# ISO 4217 codes — the long tail of currencies in active use. Each code
# is listed exactly once; the list is joined into a regex alternation.
_CURRENCY_CODES_LIST: list[str] = [
    "USD", "EUR", "GBP", "JPY", "CNY", "CAD", "AUD", "CHF", "INR", "KRW",
    "RUB", "MXN", "BRL", "ILS", "TRY", "ZAR", "SEK", "NOK", "DKK", "PLN",
    "HKD", "SGD", "NZD",
    # Major non-G10 economies:
    "SAR", "AED", "QAR", "KWD", "BHD", "OMR",  # Gulf
    "ARS", "CLP", "COP", "PEN", "UYU",  # Latin America
    "EGP", "MAD", "TND", "NGN", "GHS", "KES", "TZS", "UGX",  # Africa
    "IDR", "MYR", "PHP", "THB", "VND", "TWD",  # SE Asia
    "PKR", "BDT", "LKR", "NPR",  # South Asia
    "HUF", "CZK", "RON", "BGN", "HRK", "ISK",  # Europe-other
    "UAH", "KZT", "GEL", "AMD", "AZN",  # Eastern Europe / Caucasus
]
_CURRENCY_CODES = "|".join(_CURRENCY_CODES_LIST)

# Finds the first ISO code (group ``code``) or single symbol (group
# ``sym``) anywhere in a string, case-insensitively.
_CURRENCY_DETECT_RE = re.compile(
    rf"(?P<code>{_CURRENCY_CODES})|(?P<sym>[{_CURRENCY_SYMBOL_CLASS}])",
    re.IGNORECASE,
)

# Matches a leading or a trailing run of whitespace / symbols with at
# most one ISO code embedded in it, so ``sub("", ...)`` peels currency
# markers off both ends of a value.
_CURRENCY_TRIM_RE = re.compile(
    rf"^[\s{_CURRENCY_SYMBOL_CLASS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOL_CLASS}]*"
    rf"|[\s{_CURRENCY_SYMBOL_CLASS}]*(?:{_CURRENCY_CODES})?[\s{_CURRENCY_SYMBOL_CLASS}]*$",
    re.IGNORECASE,
)

# Accounting-style negative: the whole value wrapped in parentheses.
_PARENS_NEGATIVE_RE = re.compile(r"^\s*\(\s*(.+?)\s*\)\s*$")


CurrencyDecimal = Literal["dot", "comma", "auto"]


# Multi-character symbol prefixes that aren't captured by the
# single-codepoint ``_CURRENCY_SYMBOLS`` table. Keys are lowercase because
# callers compare them against a lowercased value. Iteration order
# matters: ``rs.`` must be tried before ``rs``.
_PREFIX_TO_ISO: dict[str, str] = {
    "r$": "BRL",  # Brazilian Real
    "kr": "SEK",  # ambiguous Nordic — picks SEK as most common; see tests
    "zł": "PLN",  # Polish Złoty
    "лв": "BGN",  # Bulgarian Lev
    "₽": "RUB",  # already in symbol table; kept for parity
    "rs.": "INR",  # rupees — covers IN/PK informal usage
    "rs": "INR",
}
|
||
|
||
|
||
def detect_currency_code(value: str) -> Optional[str]:
    """Return the ISO 4217 code implied by *value*, or ``None``.

    Resolution order:

    1. A multi-character prefix from ``_PREFIX_TO_ISO`` (``R$``, ``kr``,
       ``zł`` …) at the start of the value, compared case-insensitively
       and only accepted when the character right after it is not a
       letter (so ``rsa`` is not read as ``rs`` + ``a``).
    2. The first match of ``_CURRENCY_DETECT_RE`` anywhere in the value:
       an explicit ISO code wins and is returned uppercased; otherwise a
       single symbol is looked up in ``_SYMBOL_TO_ISO``.

    Symbol mapping is best-effort: ``$`` maps to ``USD`` even though it is
    shared with CAD/AUD/MXN. Non-string input returns ``None``.
    """
    if not isinstance(value, str):
        return None

    lowered_head = value.lstrip().lower()
    for marker, iso in _PREFIX_TO_ISO.items():
        if not lowered_head.startswith(marker):
            continue
        # One-character peek past the prefix; empty at end of string.
        following = lowered_head[len(marker):len(marker) + 1]
        if not following.isalpha():
            return iso

    found = _CURRENCY_DETECT_RE.search(value)
    if found is None:
        return None
    explicit = found.group("code")
    if explicit:
        return explicit.upper()
    return _SYMBOL_TO_ISO.get(found.group("sym"))
|
||
|
||
|
||
CurrencyErrorPolicy = Literal["passthrough", "sentinel"]


def standardize_currency(
    value: Optional[str],
    *,
    decimal: CurrencyDecimal = "dot",
    decimals: Optional[int] = None,
    preserve_code: bool = False,
    error_policy: CurrencyErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Strip currency symbols/grouping separators, return a bare number string.

    Returns ``(text, changed)`` where *changed* is ``text != value``.
    Empty, whitespace-only and non-string input is returned as-is with
    ``changed=False``.

    Separator handling:

    * ``decimal="comma"``: every ``.`` is dropped and ``,`` becomes the
      decimal point (``1.234,56 €`` → ``1234.56``).
    * ``decimal="dot"`` / ``"auto"`` with both ``.`` and ``,`` present:
      the rightmost separator is the decimal point, so ``€1.234,56`` and
      ``$1,234.56`` both give ``1234.56``.
    * Comma only, ``"dot"``: a 3-character tail is thousands grouping
      (``1,234`` → ``1234``); any other tail length is reported as an
      ambiguous separator.
    * Comma only, ``"auto"``: several commas are thousands grouping; a
      single comma with a 3-character tail is reported as ambiguous; any
      other single comma is a decimal point (``1,5`` → ``1.5``).
    * Comma only after space/NBSP digit grouping (``1 234,56``): the comma
      is a decimal point in both modes.
    * Dot only: a single dot with a 3-digit tail and at most 3 characters
      before it (``1.234``) is reported as ambiguous.

    Apostrophes are dropped (Swiss grouping) and a single space or NBSP
    between digits is removed before the checks above.

    Output always uses ``.`` as the decimal separator. Without *decimals*
    trailing zeros are trimmed (``($50.00)`` → ``-50``); with *decimals*
    the number is formatted to exactly that many places.

    Accounting-style parentheses and a leading ``-`` each flip the sign.

    Values containing ``%``, ranges (``$50-$100``), text that ``float()``
    rejects, and non-finite results (``nan``, ``inf``, overflowing
    exponents) are handed to ``_err_or_passthrough(reason, value,
    error_policy)``; per the policy names this either passes the input
    through or emits an error sentinel.

    When *preserve_code* is True, the code found by
    ``detect_currency_code`` (if any) is prepended, space-separated
    (``USD 1234.56``).
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def _err(reason: str) -> tuple[str, bool]:
        return _err_or_passthrough(reason, value, error_policy)

    if "%" in s:
        return _err("percentage not currency")

    # Range like "$50-$100": two currency symbols with a dash between a
    # digit and a symbol. The symbol-free form only fires when the value
    # does not itself start with an (optionally signed) digit.
    sym_count = sum(1 for c in s if c in "$£€¥₹")
    if sym_count >= 2 and re.search(r"\d\s*[-–—]\s*[$£€¥₹]", s):
        return _err("range not normalizable")
    if (
        sym_count == 0
        and re.search(r"\d\s*[-–—]\s*\d", s)
        and not re.match(r"^[+-]?\d", s.strip())
    ):
        return _err("range not normalizable")

    code = detect_currency_code(s) if preserve_code else None

    # Strip any multi-char currency prefix (``R$``, ``kr``, ``zł``) before
    # the symbol-table regex — these aren't single codepoints so the
    # table-driven trim would otherwise leave them in place. ``s`` is
    # already stripped, so the prefix sits at index 0; slicing ``s``
    # directly avoids deriving offsets from the lowercased copy, whose
    # length can differ from the original for some Unicode letters.
    head = s.lower()
    for prefix in _PREFIX_TO_ISO:
        if not head.startswith(prefix):
            continue
        if head[len(prefix):len(prefix) + 1].isalpha():
            continue  # e.g. ``krw`` is not ``kr`` followed by ``w``
        s = s[len(prefix):].lstrip()
        break

    negative = False
    m = _PARENS_NEGATIVE_RE.match(s)
    if m:
        negative = True
        s = m.group(1)

    s = _CURRENCY_TRIM_RE.sub("", s).strip()
    if not s:
        return _err("empty after symbol strip")

    if s.startswith(("+", "-")):
        sign, rest = s[0], s[1:]
        if sign == "-":
            negative = not negative
        # A symbol may follow the sign (``-$50``); trim again.
        rest = _CURRENCY_TRIM_RE.sub("", rest).strip()
    else:
        rest = s

    # Swiss apostrophe-thousands → drop apostrophes used as group sep.
    rest = rest.replace("'", "")

    # Space- or NBSP-thousands → drop spaces between digit groups
    # (``1 234,56`` → ``1234,56``). Remember that we saw one so the comma
    # can be disambiguated below.
    had_space_thousands = bool(re.search(r"\d[ \xa0]\d", rest))
    rest = re.sub(r"(?<=\d)[ \xa0](?=\d)", "", rest)

    has_dot = "." in rest
    has_comma = "," in rest

    if decimal == "comma":
        # EU explicit: dots are thousands, comma is decimal.
        rest = rest.replace(".", "").replace(",", ".")
    elif has_dot and has_comma:
        # Both present — the rightmost separator is the decimal.
        if rest.rfind(",") > rest.rfind("."):
            rest = rest.replace(".", "").replace(",", ".")  # EU: 1.234,56
        else:
            rest = rest.replace(",", "")  # US: 1,234.56
    elif has_comma:
        if had_space_thousands:
            # ``1 234,56`` is unambiguously EU — comma is the decimal.
            rest = rest.replace(",", ".")
        elif decimal == "auto":
            after = rest.rsplit(",", 1)[1]
            if rest.count(",") > 1:
                rest = rest.replace(",", "")
            elif len(after) == 3:
                return _err("ambiguous separator, set --currency-locale")
            else:
                rest = rest.replace(",", ".")
        else:
            after = rest.rsplit(",", 1)[1]
            if len(after) != 3:
                return _err("ambiguous separator, set --currency-locale")
            rest = rest.replace(",", "")
    elif has_dot:
        # Scientific notation (``1.5e6``) has a non-digit in the tail, so
        # it never trips the EU-thousands check.
        after = rest.rsplit(".", 1)[1]
        if (
            after.isdigit()
            and len(after) == 3
            and len(rest.split(".")[0]) <= 3
            and rest.count(".") == 1
        ):
            return _err("ambiguous separator, set --currency-locale")

    try:
        num = float(rest)
    except ValueError:
        return _err("word value")

    # float() also accepts "nan", "inf", "infinity" and silently overflows
    # huge exponents to inf. None of those is a currency amount, and
    # int(num) below would raise on them, so route them through the error
    # policy instead of crashing. (``num != num`` is the NaN test.)
    if num != num or num in (float("inf"), float("-inf")):
        return _err("word value")

    if negative:
        num = -num

    if decimals is not None:
        out = f"{num:.{decimals}f}"
    elif num == int(num) and "." not in rest:
        out = str(int(num))
    else:
        out = f"{num:g}" if abs(num) >= 1e16 else format(num, "f").rstrip("0").rstrip(".")
        if not out or out in ("-", ""):
            out = "0"

    if code is not None:
        out = f"{code} {out}"

    return out, out != value
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Name
|
||
# ---------------------------------------------------------------------------
|
||
|
||
NameCase = Literal["title", "upper", "lower"]

# Surname particles that conventionally stay lowercase in natural reading
# order, grouped by tradition. Entries are lowercase; lookups strip a
# trailing period first.
_NAME_PARTICLES: set[str] = {
    # Germanic / Dutch / French / Italian
    "von", "van", "de", "da", "del", "della", "di", "du", "der",
    "den", "ter", "ten", "le", "la", "los", "las", "el",
    # Spanish / Portuguese
    "dos", "das", "do", "y",
    # Arabic patronymic / nisba
    "bin", "ibn", "bint", "abu", "abd", "al", "el-", "al-",
    # Hebrew
    "ben", "bat", "ha", "ha-",
    # Slavic transliterated (rare in Western forms)
    "z", "ze",
}

# Credentials / role acronyms mapped from their lowercase, period-free key
# to the conventional casing to emit (``phd`` → ``PhD``).
_NAME_ACRONYMS: dict[str, str] = {
    # English
    "phd": "PhD",
    "md": "MD",
    "esq": "Esq",
    "ma": "MA",
    "ba": "BA",
    "bs": "BS",
    "ms": "MS",
    "dds": "DDS",
    "dvm": "DVM",
    "jd": "JD",
    "rn": "RN",
    "cpa": "CPA",
    "ceo": "CEO",
    "cto": "CTO",
    "cfo": "CFO",
    # German / Austrian academic
    "dipl": "Dipl",
    "ing": "Ing",
    "mag": "Mag",
    "habil": "Habil",
    "drmed": "Dr.med.",
    "drphil": "Dr.phil.",
    "drrernat": "Dr.rer.nat.",
    "msc": "MSc",
    "bsc": "BSc",
    # International degrees
    "llb": "LLB",
    "llm": "LLM",
}

# Whole-token match for strings made only of the letters I, V and X.
_NAME_ROMAN_RE = re.compile(r"^[IVX]+$")

# Courtesy titles, lowercase and without the trailing period. A token that
# appears in several languages (``mr``, ``sr``) is listed once.
_NAME_TITLES: set[str] = {
    # English
    "mr", "mrs", "ms", "miss", "dr", "prof", "sr", "jr", "sir", "madam",
    "rev", "hon",
    # German
    "herr", "frau", "fr", "hr",
    # French (``mr`` already listed above)
    "m", "mme", "mlle",
    # Spanish (``sr`` already listed above)
    "sra", "srta", "don", "doña", "dona",
    # Italian
    "sig", "sigra", "dott", "dottoressa",
    # Portuguese
    "snr", "snra",
}

# East Asian honorifics in Latin transliteration, as written after a
# hyphen following the name (``Tanaka-san``).
_EAST_ASIAN_HONORIFICS: set[str] = {
    # Japanese
    "san", "sama", "kun", "chan", "sensei", "senpai", "kohai", "dono",
    "shi", "tan", "chin",
    # Korean
    "ssi", "nim",
}

# Generational / professional suffixes, lowercase and period-free.
_NAME_SUFFIXES: set[str] = {"jr", "sr", "esq"}
|
||
|
||
|
||
def _cap_segment(seg: str) -> str:
    """Uppercase the first character of *seg* and lowercase the remainder.

    An empty string comes back empty.
    """
    # Slicing (rather than indexing) makes the empty string a no-op.
    return seg[:1].upper() + seg[1:].lower()
|
||
|
||
|
||
def _standardize_name_token(tok: str, *, position: str, all_shouting: bool = False) -> str:
    """Standardize one space-separated token of a personal name.

    *position* is ``"first"``, ``"middle"`` or ``"last"`` and drives the
    particle / suffix / acronym rules. *all_shouting* is True when every
    letter of the surrounding name is uppercase; in that case an all-caps
    middle token is not preserved as an acronym.

    Trailing ``,``, ``;`` and ``:`` are peeled off first and re-attached
    to whatever the rules below produce. Rules are tried in this order and
    the first match wins:

    1. Token made only of I/V/X (any case) → uppercased (Roman numeral).
    2. Known acronym (``phd`` → ``PhD``) — except that a *leading* token
       which is also a known title is left for rule 4, so ``Ms.`` at the
       start of a name becomes ``Ms`` rather than the degree ``MS``.
    3. All-caps middle token of two or more characters (not a title,
       suffix or particle, and the name is not all-caps) → kept verbatim.
    4. Title → capitalized, trailing period dropped.
    5. Suffix in last position → capitalized, trailing period dropped.
    6. Particle not in last position → lowercased, trailing period dropped.
    7. Single-letter initial → uppercased, period dropped.
    8. Dotted multi-initial (``j.k.``) → uppercased, periods kept.
    9. Hyphenated token → each piece capitalized, except a known East
       Asian honorific after the first piece and an ``al``/``el``/``an``/
       ``ad`` first piece, which stay lowercase.
    10. ``Mc`` / ``Mac`` / ``O'`` / ``D'`` prefixes → inner capital.
        (``Mac`` is applied unconditionally; ``machine`` → ``MacHine`` is
        a known limitation.)
    11. Anything else → first letter upper, rest lower.
    """
    if not tok:
        return tok

    # Trailing punctuation gets stripped and re-attached.
    suffix_punct = ""
    while tok and tok[-1] in ",;:":
        suffix_punct = tok[-1] + suffix_punct
        tok = tok[:-1]
    if not tok:
        return suffix_punct

    lowered = tok.lower()
    bare = lowered.rstrip(".")

    # Roman numerals (II, III, IV, …)
    if _NAME_ROMAN_RE.match(tok.upper()):
        return tok.upper() + suffix_punct

    # Known acronym (PhD, MD, …). ``ms`` is both a degree and a courtesy
    # title; at the head of a name it can only be the title, so defer to
    # the title rule there instead of emitting ``MS``.
    leading_title = position == "first" and bare in _NAME_TITLES
    if bare in _NAME_ACRONYMS and not leading_title:
        return _NAME_ACRONYMS[bare] + suffix_punct

    # All-caps middle token (``Mary USA Smith``) — kept as an acronym
    # unless the whole name is shouting.
    if (
        position == "middle"
        and not all_shouting
        and len(bare) >= 2
        and tok.isupper()
        and any(c.isalpha() for c in tok)
        and bare not in _NAME_TITLES
        and bare not in _NAME_SUFFIXES
        and bare not in _NAME_PARTICLES
    ):
        return tok + suffix_punct

    # Title (Mr, Dr, Prof) — strip trailing period
    if bare in _NAME_TITLES:
        return _cap_segment(bare) + suffix_punct

    # Suffix (Jr, Sr) — strip trailing period
    if bare in _NAME_SUFFIXES and position == "last":
        return _cap_segment(bare) + suffix_punct

    # Particle (von, van, de, …) — lowercase unless it is the final token.
    if bare in _NAME_PARTICLES and position != "last":
        return bare + suffix_punct

    # Single-letter initial like ``A`` or ``A.`` — checked before the
    # multi-initial rule so ``A.`` loses its period.
    if len(bare) == 1 and bare.isalpha():
        return bare.upper() + suffix_punct

    # Multi-initial token like ``j.k.`` → uppercase, keep the periods.
    if "." in tok and all(
        seg == "" or (len(seg) == 1 and seg.isalpha()) for seg in tok.split(".")
    ):
        return tok.upper() + suffix_punct

    # Hyphenated token — capitalize each piece, with two exceptions.
    if "-" in tok:
        out_parts = []
        for j, p in enumerate(tok.split("-")):
            if j > 0 and p.lower() in _EAST_ASIAN_HONORIFICS:
                out_parts.append(p.lower())  # ``Tanaka-san``
            elif j == 0 and p.lower() in {"al", "el", "an", "ad"}:
                out_parts.append(p.lower())  # ``al-Rashid``
            else:
                out_parts.append(_cap_segment(p))
        return "-".join(out_parts) + suffix_punct

    # Mc / Mac prefix — inner cap.
    if lowered.startswith("mc") and len(lowered) > 2:
        return "Mc" + _cap_segment(tok[2:]) + suffix_punct
    if lowered.startswith("mac") and len(lowered) > 3:
        return "Mac" + _cap_segment(tok[3:]) + suffix_punct

    # O' prefix — inner cap.
    if lowered.startswith("o'") and len(lowered) > 2:
        return "O'" + _cap_segment(tok[2:]) + suffix_punct

    # D' prefix — inner cap (D'Angelo, D'Arcy).
    if lowered.startswith("d'") and len(lowered) > 2:
        return "D'" + _cap_segment(tok[2:]) + suffix_punct

    return _cap_segment(tok) + suffix_punct
|
||
|
||
|
||
def _is_non_latin_script(s: str) -> bool:
    """Return True when *s* has letters and none of them is Latin.

    "Latin" here means a code point at or below U+024F (through Latin
    Extended-B, which covers accented Latin letters). A string with no
    alphabetic characters at all, or with at least one Latin letter,
    yields False.
    """
    letters = [ch for ch in s if ch.isalpha()]
    if not letters:
        return False
    return all(ord(ch) > 0x024F for ch in letters)
|
||
|
||
|
||
def standardize_name(
    value: Optional[str],
    *,
    case: NameCase = "title",
    conservative: bool = False,
    reverse_comma_format: bool = True,
    family_first: bool = False,
) -> tuple[str, bool]:
    """Apply name-friendly casing with prefix / particle / suffix awareness.

    Returns ``(text, changed)`` where *changed* is ``text != value``.
    Empty, whitespace-only and non-string input is returned as-is with
    ``changed=False``.

    ``case="upper"`` / ``"lower"`` are plain case conversions of the
    stripped value. Any value other than ``"title"``, ``"upper"`` or
    ``"lower"`` raises ``ValueError``.

    ``case="title"`` (default):

    * Values whose letters are all non-Latin (see
      ``_is_non_latin_script``) pass through unchanged.
    * ``conservative=True`` passes mixed-case input (at least one upper
      and one lower letter) through unchanged, so only all-caps or
      all-lowercase values are rewritten.
    * ``reverse_comma_format`` flips ``Last, First`` to ``First Last``
      by splitting on the first comma, when both sides are non-empty.
      ``family_first=True`` suppresses this reversal; that is the only
      effect the flag has inside this function.
    * The value is split on single spaces and every non-empty token goes
      through ``_standardize_name_token`` with its position (``first`` for
      index 0, ``last`` for the final index, ``middle`` otherwise) and a
      flag saying whether the whole name is uppercase. Empty tokens from
      repeated spaces are kept so spacing is preserved.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    if case in ("upper", "lower"):
        converted = s.upper() if case == "upper" else s.lower()
        return converted, converted != value
    if case != "title":
        raise ValueError(f"Unknown name case: {case}")

    # Non-Latin scripts pass through unchanged — no case to apply.
    if _is_non_latin_script(s):
        return value, False

    # Conservative mode: leave anything that already mixes cases alone.
    if conservative:
        letters = [ch for ch in s if ch.isalpha()]
        has_upper = any(ch.isupper() for ch in letters)
        has_lower = any(ch.islower() for ch in letters)
        if has_upper and has_lower:
            return value, False

    # "Smith, John Andrew" → "John Andrew Smith".
    if reverse_comma_format and not family_first and "," in s:
        family, given = (piece.strip() for piece in s.split(",", 1))
        if family and given:
            s = f"{given} {family}"

    tokens = s.split(" ")
    final_index = len(tokens) - 1
    letters = [ch for ch in s if ch.isalpha()]
    all_shouting = bool(letters) and not any(ch.islower() for ch in letters)

    def _slot(index: int) -> str:
        # Index 0 is always "first", even for a single-token name.
        if index == 0:
            return "first"
        return "last" if index == final_index else "middle"

    out = " ".join(
        _standardize_name_token(tok, position=_slot(i), all_shouting=all_shouting)
        if tok
        else tok
        for i, tok in enumerate(tokens)
    )
    return out, out != value
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Address
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Expansion table — the inverse of the dedup-side compression set in
|
||
# ``normalize_address``. We deliberately don't expand ``unit``, ``loop``,
|
||
# or ``way`` because those are already the long form. Canonical mappings
|
||
# live in :mod:`src.core._constants` so both modules stay in sync.
|
||
from ._constants import (
|
||
USPS_EXPANSIONS as _ADDRESS_EXPANSIONS,
|
||
USPS_COMPRESSIONS as _ADDRESS_COMPRESSIONS,
|
||
US_STATE_CODES as _US_STATE_CODES_SHARED,
|
||
US_STATE_NAMES as _US_STATE_NAMES_SHARED,
|
||
CA_PROVINCE_CODES, CA_PROVINCE_NAMES,
|
||
AU_STATE_CODES, AU_STATE_NAMES,
|
||
DE_STATE_CODES, DE_STATE_NAMES,
|
||
POSTAL_PATTERNS,
|
||
INTL_PO_BOX_PATTERNS,
|
||
)
|
||
|
||
# Short tokens that look like directions but only mean a direction at the
|
||
# start or end of an address — never in the middle of a street name. This
|
||
# avoids mangling ``123 N Main St`` (legit) vs. ``123 N. Main`` (legit) but
|
||
# also keeping us from rewriting ``Tower N`` → ``Tower North`` mid-line if
|
||
# it's part of a building name.
|
||
_DIRECTION_TOKENS = set("n s e w ne nw se sw".split())

# Address tokenizer: a run of word characters, a run of punctuation, or a
# run of whitespace. Joining the tokens back together reproduces the input.
_TOKEN_RE = re.compile(r"\w+|[^\w\s]+|\s+")

# Local aliases over the shared constants, kept for the module-level
# reads that already reference these names.
_US_STATE_CODES = _US_STATE_CODES_SHARED
_US_STATE_NAMES = _US_STATE_NAMES_SHARED
|
||
|
||
# Per-country (full-name, code, postal-pattern) tables. Each yields a
|
||
# precompiled regex matching ``, <state name> <postal>``. Sorted
|
||
# longest-first so multi-word names win over their prefixes.
|
||
def _build_state_patterns(
    name_to_code: dict[str, str], postal_pattern: str,
) -> list[tuple[re.Pattern[str], str]]:
    """Compile one ``, <state name> <postal>`` regex per entry.

    Each returned pair is ``(pattern, code)``. The pattern is
    case-insensitive, captures the leading comma + whitespace as group 1
    and the whitespace + postal code as group 2, and matches the escaped
    full name in between — so ``pattern.sub(r"\\g<1>CODE\\g<2>", text)``
    swaps the name for its code. *postal_pattern* is inserted as regex
    source, unescaped.

    Pairs are ordered longest name first so multi-word names win over
    their prefixes; names of equal length keep their mapping order.
    """
    longest_first = sorted(
        name_to_code.items(), key=lambda item: len(item[0]), reverse=True,
    )
    compiled: list[tuple[re.Pattern[str], str]] = []
    for full_name, code in longest_first:
        regex = re.compile(
            rf"(,\s*){re.escape(full_name)}(\s+{postal_pattern})",
            re.IGNORECASE,
        )
        compiled.append((regex, code))
    return compiled
|
||
|
||
|
||
# Per-country ``, <full name> <postal>`` substitution tables. The second
# argument is the postal-code shape that must follow the state name.
_STATE_NAME_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    name_to_code=_US_STATE_NAMES,
    postal_pattern=r"\d{5}(?:-\d{4})?",
)
_CA_PROVINCE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    name_to_code=CA_PROVINCE_NAMES,
    postal_pattern=r"[A-Z]\d[A-Z]\s*\d[A-Z]\d",
)
_AU_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    name_to_code=AU_STATE_NAMES,
    postal_pattern=r"\d{4}",
)
_DE_STATE_PATTERNS: list[tuple[re.Pattern[str], str]] = _build_state_patterns(
    name_to_code=DE_STATE_NAMES,
    postal_pattern=r"\d{5}",
)

# One case-insensitive alternation over every PO Box spelling registered
# in INTL_PO_BOX_PATTERNS, wrapped in word boundaries.
_PO_BOX_RE = re.compile(
    "".join((r"\b(?:", "|".join(INTL_PO_BOX_PATTERNS.values()), r")\b")),
    re.IGNORECASE,
)

# Country key → precompiled postal-code regex, one per POSTAL_PATTERNS
# entry.
_POSTAL_REGEXES: dict[str, re.Pattern[str]] = {
    country: re.compile(shape) for country, shape in POSTAL_PATTERNS.items()
}
# Back-compat aliases for sites that already reference these names.
_US_ZIP_TAIL_RE = _POSTAL_REGEXES["us"]
_CANADA_POSTAL_RE = _POSTAL_REGEXES["ca"]
_UK_POSTCODE_RE = _POSTAL_REGEXES["uk"]

# Union of the US, Canadian, Australian and German state/province codes.
_INTL_STATE_CODES: frozenset[str] = (
    _US_STATE_CODES_SHARED | CA_PROVINCE_CODES | AU_STATE_CODES | DE_STATE_CODES
)
|
||
|
||
|
||
def _is_state_code_position(tokens: list[str], idx: int) -> bool:
    """Heuristic: does ``tokens[idx]`` sit in a state-code slot?

    The slot is ``…, XX <postal>``: the nearest non-whitespace token
    before *idx* must be exactly ``","``, and the nearest non-whitespace
    token after it must look like a postal code — 4–5 digits (optionally
    ``-NNNN``), or one or two letters followed by a digit. Being the last
    non-whitespace token after a comma also counts.
    """
    preceding = next(
        (tok for tok in reversed(tokens[:idx]) if not tok.isspace()), None,
    )
    if preceding != ",":
        return False

    following = next(
        (tok for tok in tokens[idx + 1:] if not tok.isspace()), None,
    )
    if following is None:
        return True  # tail of line, after a comma — accept

    postal_like = re.match(
        r"\d{4,5}(?:-\d{4})?$|^[A-Z]\d[A-Z]$|^[A-Z]{1,2}\d",
        following, re.IGNORECASE,
    )
    return postal_like is not None
|
||
|
||
|
||
def standardize_address(
    value: Optional[str],
    *,
    extra_abbreviations: Optional[dict[str, str]] = None,
    expand: bool = True,
    state_to_code: bool = True,
    collapse_multiline: bool = True,
    trim_trailing_comma: bool = True,
    normalize_po_box: bool = True,
) -> tuple[str, bool]:
    """Standardize a postal address string.

    Returns ``(text, changed)`` where *changed* is ``text != value``.
    Empty, whitespace-only and non-string input is returned as-is with
    ``changed=False``.

    Steps, in order:

    1. If every letter is uppercase, the whole value is lowercased so the
       later title-casing sees uniform input.
    2. ``collapse_multiline`` — lines are trimmed (including one trailing
       comma each), empty lines dropped, and the rest joined with ``, ``.
    3. ``normalize_po_box`` — every ``_PO_BOX_RE`` match becomes
       ``PO Box``.
    4. ``state_to_code`` — spelled-out state/province names directly
       before a postal code are replaced by their code, per country, only
       when that country's postal shape was detected.
    5. Token replacement — ``expand=True`` uses the expansion table
       (``St`` → ``Street``); ``expand=False`` uses the compression table
       and only for US- or Canada-shaped addresses. Valid string pairs in
       *extra_abbreviations* are layered on top (keys casefolded, trailing
       period dropped). A period right after a recognized abbreviation or
       expansion is removed. A token that is a known state code sitting in
       a state-code slot is emitted uppercase and never looked up.
    6. The rebuilt text goes through ``smart_title_case`` and then
       ``_restore_state_codes``.
    7. ``trim_trailing_comma`` — trailing whitespace and one trailing
       comma are dropped.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    if not value.strip():
        return value, False

    text = value

    # Shouting input is lowercased up front; see step 1.
    letters = [ch for ch in text if ch.isalpha()]
    if letters and not any(ch.islower() for ch in letters):
        text = text.lower()

    if collapse_multiline and "\n" in text:
        segments = (line.strip().rstrip(",").strip() for line in text.splitlines())
        text = ", ".join(segment for segment in segments if segment)

    if normalize_po_box:
        text = _PO_BOX_RE.sub("PO Box", text)

    # Country-shape detection from the postal code.
    us_shaped = bool(_US_ZIP_TAIL_RE.search(text))
    ca_shaped = bool(_CANADA_POSTAL_RE.search(text))
    # German postal codes share the US 5-digit shape, so DE additionally
    # requires a German state name or code right before 5 digits.
    de_shaped = us_shaped and any(
        re.search(rf",\s*{re.escape(name)}\s+\d{{5}}", text, re.IGNORECASE)
        or re.search(rf",\s*{re.escape(code)}\s+\d{{5}}", text, re.IGNORECASE)
        for name, code in DE_STATE_NAMES.items()
    )
    # AU: 4-digit postal at the tail AND an AU state code or full name
    # anywhere in the address.
    au_words = "|".join(
        list(AU_STATE_CODES) + [re.escape(name) for name in AU_STATE_NAMES]
    )
    au_shaped = bool(
        re.search(r"\b\d{4}\b\s*$", text.rstrip(","))
        and re.search(rf"\b(?:{au_words})\b", text, re.IGNORECASE)
    )

    if state_to_code:
        per_country = (
            (us_shaped, _STATE_NAME_PATTERNS),
            (ca_shaped, _CA_PROVINCE_PATTERNS),
            (au_shaped, _AU_STATE_PATTERNS),
            (de_shaped, _DE_STATE_PATTERNS),
        )
        for detected, patterns in per_country:
            if not detected:
                continue
            for pat, code in patterns:
                text = pat.sub(rf"\g<1>{code}\g<2>", text)

    # Pick the replacement table. Compression is only applied to US- or
    # Canada-shaped rows; other international rows keep their spelling.
    if expand:
        abbrev_table = dict(_ADDRESS_EXPANSIONS)
    elif us_shaped or _CANADA_POSTAL_RE.search(text):
        abbrev_table = dict(_ADDRESS_COMPRESSIONS)
    else:
        abbrev_table = {}

    if extra_abbreviations:
        for raw_key, raw_val in extra_abbreviations.items():
            usable = (
                isinstance(raw_key, str)
                and isinstance(raw_val, str)
                and raw_key.strip()
                and raw_val.strip()
            )
            if usable:
                abbrev_table[raw_key.casefold().rstrip(".").strip()] = raw_val.strip()

    # Tokens after which a period is considered part of the abbreviation
    # and removed: anything the active table can emit, every compression
    # output, and every expansion key.
    period_hosts = (
        set(abbrev_table.values())
        | set(_ADDRESS_COMPRESSIONS.values())
        | set(_ADDRESS_EXPANSIONS)
    )

    tokens = _TOKEN_RE.findall(text)
    pieces: list[str] = []
    for i, tok in enumerate(tokens):
        if not tok or not tok[0].isalnum():
            # Punctuation / whitespace passes through verbatim, minus a
            # leading period that follows an abbreviation (``St.`` → ``St``,
            # ``St.,`` → ``St,``).
            if tok.startswith(".") and pieces and pieces[-1] in period_hosts:
                tok = tok[1:]
                if not tok:
                    continue
            pieces.append(tok)
            continue

        key = tok.casefold().rstrip(".")
        upper_form = tok.upper().rstrip(".")

        # State codes in a state-code slot win over the abbreviation table.
        if upper_form in _INTL_STATE_CODES and _is_state_code_position(tokens, i):
            pieces.append(upper_form)
            continue

        replacement = abbrev_table.get(key)
        if replacement is not None:
            pieces.append(replacement)
        else:
            pieces.append(tok)

    titled = smart_title_case("".join(pieces))

    # Title-casing may have re-cased a state code; force it back.
    titled = _restore_state_codes(titled)

    if trim_trailing_comma:
        titled = titled.rstrip()
        if titled.endswith(","):
            titled = titled[:-1].rstrip()

    return titled, titled != value
|
||
|
||
|
||
# ``, XX`` followed by a US ZIP (optionally ZIP+4) or by the end of the
# string. Group 1 is the comma + whitespace, group 2 the two letters,
# group 3 whatever follows.
_STATE_CODE_AFTER_COMMA_RE = re.compile(
    r"(,\s*)([A-Za-z]{2})(\s+\d{5}(?:-\d{4})?|\s*$)"
)


def _restore_state_codes(s: str) -> str:
    """Force-uppercase known 2-letter state codes that follow a comma.

    Only two-letter candidates found in ``_INTL_STATE_CODES`` are changed;
    any other match of ``_STATE_CODE_AFTER_COMMA_RE`` is left as it was.
    """
    def _upper_if_known(match: re.Match) -> str:
        lead, letters, tail = match.groups()
        shouted = letters.upper()
        if shouted not in _INTL_STATE_CODES:
            return match.group(0)
        return f"{lead}{shouted}{tail}"

    return _STATE_CODE_AFTER_COMMA_RE.sub(_upper_if_known, s)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Email
|
||
# ---------------------------------------------------------------------------
|
||
#
|
||
# 03's email cleaner is the public surface for normalization (see
|
||
# FORMATS-CASES.md § 0.1 — duplicates the matching logic the dedup
|
||
# tier-1 spec uses internally, so callers don't have to run dedup just
|
||
# to lowercase a list of emails).
|
||
|
||
EmailErrorPolicy = Literal["passthrough", "sentinel"]

# Strict-enough RFC 5322-ish regex: local@domain.tld, allowing IDN.
_EMAIL_RE = re.compile(
    r"^(?P<local>[^\s@<>\"]+)@(?P<domain>[^\s@<>\"]+\.[^\s@<>\".]+)$"
)
# Display-name extraction: ``"Alice" <alice@example.com>`` or
# ``Alice Smith <alice@example.com>``.
_EMAIL_ANGLE_RE = re.compile(r"<([^<>]+)>")
_MAILTO_PREFIX_RE = re.compile(r"^mailto:", re.IGNORECASE)
# Smart-quote wrapping the whole address.
_EMAIL_SMARTQUOTE_RE = re.compile(r"^[“”‘’]+|[“”‘’]+$")
# Bidirectional control characters used in homograph / spoofing attacks
# against email addresses (an embedded RIGHT-TO-LEFT OVERRIDE makes RTL-aware
# renderers display the following characters reversed). Strip on every parse.
#
# The class is spelled with explicit escapes on purpose: invisible literal
# characters do not survive copy/paste or editor normalization, and once they
# are lost the pattern degrades to ``[--]``, which strips hyphens instead.
# Covered: ALM (U+061C), LRM/RLM (U+200E/F), LRE..RLO (U+202A-202E) and
# LRI..PDI (U+2066-2069).
_EMAIL_BIDI_RE = re.compile(
    r"[\u061c\u200e\u200f\u202a-\u202e\u2066-\u2069]"
)
# Multi-email cell separator.
_EMAIL_MULTI_RE = re.compile(r"[,;]\s*\S+@\S+\.\S+")


def standardize_email(
    value: Optional[str],
    *,
    gmail_canonical: bool = False,
    error_policy: EmailErrorPolicy = "passthrough",
) -> tuple[str, bool]:
    """Lowercase + trim + strip mailto/display-name wrappers.

    Default behavior preserves Gmail dots and ``+tag`` segments — that's
    a Gmail provider policy, not a generic email standard. Set
    ``gmail_canonical=True`` to strip dots and ``+`` tags from the local
    part for ``@gmail.com`` addresses only (corpus § 5.3). If stripping
    would leave an empty local part (``+tag@gmail.com``), the address is
    left un-canonicalized rather than emitted as ``@gmail.com``.

    Multiple addresses in a single cell, missing/duplicate ``@``,
    internal whitespace, and TLD-less inputs are surfaced as
    ``<error: <reason>>`` when ``error_policy="sentinel"``.

    Returns ``(result, changed)`` where ``changed`` is True when the
    result differs from the input. Falsy or non-string input is returned
    as ``value or ""`` with ``changed=False``; whitespace-only input is
    returned untouched.
    """
    if not value or not isinstance(value, str):
        return value or "", False
    s = value.strip()
    if not s:
        return value, False

    def _err(reason: str) -> tuple[str, bool]:
        # Delegates to the shared policy helper so every error path agrees
        # on how ``error_policy`` is applied to the original value.
        return _err_or_passthrough(reason, value, error_policy)

    # Multi-email cell — error before we silently pick one.
    if _EMAIL_MULTI_RE.search(s) and not s.startswith("<"):
        # If splitting on ;/, yields multiple email-shaped tokens, error.
        parts = re.split(r"[,;]\s*", s)
        email_parts = [p for p in parts if "@" in p and "." in p.split("@")[-1]]
        if len(email_parts) >= 2:
            return _err("multiple emails")

    # Smart-quote wrappers around the whole address.
    s = _EMAIL_SMARTQUOTE_RE.sub("", s).strip()
    # Strip BIDI / RTL override controls — these are a homograph attack
    # vector and have no legitimate use inside an email address.
    s = _EMAIL_BIDI_RE.sub("", s)

    # Display-name with angle brackets — extract the address.
    m = _EMAIL_ANGLE_RE.search(s)
    if m:
        s = m.group(1).strip()

    # mailto: prefix.
    s = _MAILTO_PREFIX_RE.sub("", s).strip()

    # Trailing punctuation contamination (``alice@example.com,`` etc.).
    s = s.rstrip(",;:.)”’")

    # Internal whitespace check (``alice @ example.com``).
    if re.search(r"\s", s):
        return _err("internal whitespace")

    # Lowercase the whole thing — both local part and domain are
    # case-insensitive in practice (RFC 5321 says local can be
    # case-sensitive but no real provider treats it that way).
    s = s.lower()

    # Validate shape.
    if "@" not in s:
        return _err("missing @")
    if s.count("@") >= 2:
        # ``alice@@example.com`` is double-@, ``alice@example@com`` is
        # multi-@; both error.
        return _err("double @" if "@@" in s else "multiple @")
    m = _EMAIL_RE.match(s)
    if not m:
        return _err("no TLD")

    local = m.group("local")
    domain = m.group("domain")

    if gmail_canonical and domain == "gmail.com":
        canonical_local = local.replace(".", "").split("+", 1)[0]
        # Never emit an address with an empty local part: a local part made
        # only of dots and/or a ``+tag`` has no canonical mailbox name.
        if canonical_local:
            s = f"{canonical_local}@{domain}"

    return s, s != value
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Boolean
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Accepted spellings, compared after ``strip().casefold()``.
_TRUE_TOKENS = {"true", "t", "yes", "y", "1", "on"}
_FALSE_TOKENS = {"false", "f", "no", "n", "0", "off"}

BoolStyle = Literal["True/False", "true/false", "Yes/No", "Y/N", "1/0"]

# Output style -> (text emitted for true, text emitted for false).
_BOOL_OUTPUT: dict[BoolStyle, tuple[str, str]] = {
    "True/False": ("True", "False"),
    "true/false": ("true", "false"),
    "Yes/No": ("Yes", "No"),
    "Y/N": ("Y", "N"),
    "1/0": ("1", "0"),
}


def standardize_boolean(
    value: Any,
    *,
    style: BoolStyle = "True/False",
) -> tuple[str, bool]:
    """Map common truthy/falsy strings (and Python bools) to a canonical pair.

    Recognized truthy: ``true t yes y 1 on``. Recognized falsy:
    ``false f no n 0 off``. Comparison is case-insensitive after trim.
    Unrecognized input passes through unchanged.

    Returns ``(text, changed)``:

    * ``bool`` input always reports ``changed=True``.
    * Missing input (``None``, float NaN, ``pd.NA``, ``pd.NaT``) yields
      ``("", False)``.
    * Other non-string input equal to 0 or 1 is mapped and reported as
      changed; anything else comes back as ``str(value)`` with
      ``changed=False``.
    * String input reports ``changed`` only when the emitted text differs
      from the original string.

    Raises ``KeyError`` when *style* is not a key of ``_BOOL_OUTPUT``.
    """
    true_out, false_out = _BOOL_OUTPUT[style]

    if isinstance(value, bool):
        return (true_out if value else false_out), True

    # ``pd.NA`` / ``pd.NaT`` appear in nullable and datetime columns.
    # ``pd.NA == 0`` is itself NA and cannot be used in an ``if``, so the
    # missing-value sentinels must be caught before the numeric comparison.
    if (
        value is None
        or value is pd.NA
        or value is pd.NaT
        or (isinstance(value, float) and pd.isna(value))
    ):
        return "", False

    if not isinstance(value, str):
        # Numeric 0/1 → False/True; anything else is unrecognized.
        try:
            if value == 0:
                return false_out, True
            if value == 1:
                return true_out, True
        except (TypeError, ValueError):
            # Objects whose ``==`` has no single truth value (e.g. arrays)
            # are unrecognized input, not a reason to abort the whole run.
            pass
        return str(value), False

    s = value.strip().casefold()
    if not s:
        return value, False
    if s in _TRUE_TOKENS:
        return true_out, true_out != value
    if s in _FALSE_TOKENS:
        return false_out, false_out != value
    return value, False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Options / result dataclasses
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Preset bundles
|
||
# ---------------------------------------------------------------------------
|
||
#
|
||
# A preset is a flat dict of ``StandardizeOptions`` field defaults — the
|
||
# subset that varies between locales / standards. ``column_types`` and
|
||
# ``extra_abbreviations`` are caller-supplied and never carried by a
|
||
# preset.
|
||
#
|
||
# Standards backing each preset:
|
||
# us-default ISO 8601 dates · ITU-T E.164 phones (US) · ISO 4217 minor
|
||
# unit (2dp) · USPS Pub. 28 address expansion · "True/False"
|
||
# european ISO 8601 dates with DMY for ambiguous input · INTERNATIONAL-format phones (default region DE)
|
||
# · ISO 4217 with comma decimal input · "True/False"
|
||
# uk DD/MM/YYYY display · GB region phones · ISO 4217 dot ·
|
||
# "Yes/No" booleans (common in UK gov forms)
|
||
# iso-strict ISO 8601 dates · E.164 · currency with no rounding, ISO 4217 code preserved
|
||
# · "true/false" lowercase (JSON canonical) · Title names
|
||
# legacy-us MM/DD/YYYY display · National-format phones · 2dp currency
|
||
# · "Yes/No" — for downstream systems that haven't moved off
|
||
# local conventions yet.
|
||
|
||
# Field values shared by most presets; each preset below lists only the
# fields where it departs from this baseline. Key order is preserved because
# ``{**base, key: value}`` overrides in place.
_PRESET_BASELINE: dict[str, Any] = {
    "date_output_format": "%Y-%m-%d",
    "date_order": "MDY",
    "phone_format": "E164",
    "phone_region": "US",
    "currency_decimal": "dot",
    "currency_decimals": 2,
    "currency_preserve_code": False,
    "name_case": "title",
    "boolean_style": "True/False",
}

# Preset name -> flat dict of ``StandardizeOptions`` field values.
PRESETS: dict[str, dict[str, Any]] = {
    "us-default": dict(_PRESET_BASELINE),
    "european": {
        **_PRESET_BASELINE,
        "date_order": "DMY",
        "phone_format": "INTERNATIONAL",
        "phone_region": "DE",
        "currency_decimal": "comma",
        "currency_preserve_code": True,
    },
    "uk": {
        **_PRESET_BASELINE,
        "date_output_format": "%d/%m/%Y",
        "date_order": "DMY",
        "phone_format": "INTERNATIONAL",
        "phone_region": "GB",
        "boolean_style": "Yes/No",
    },
    "iso-strict": {
        **_PRESET_BASELINE,
        "currency_decimals": None,
        "currency_preserve_code": True,
        "boolean_style": "true/false",
    },
    "legacy-us": {
        **_PRESET_BASELINE,
        "date_output_format": "%m/%d/%Y",
        "phone_format": "NATIONAL",
        "boolean_style": "Yes/No",
    },
}
|
||
|
||
|
||
@dataclass
class StandardizeOptions:
    """Configuration for :func:`standardize_dataframe`.

    The standardizer is column-typed: the user (or auto-detection layer
    above) assigns each column a :class:`FieldType`, and the per-cell
    function for that type runs over the column. Columns absent from
    ``column_types`` pass through untouched.

    Instances can be built directly, from a named preset
    (:meth:`from_preset`), from a plain dict (:meth:`from_dict`) or from a
    JSON file (:meth:`from_file`), and serialized back with
    :meth:`to_dict` / :meth:`to_file`.
    """

    # column name -> field type (string or FieldType enum value)
    column_types: dict[str, FieldType] = field(default_factory=dict)

    # Date formatting
    date_output_format: str = "%Y-%m-%d"
    date_order: DateOrder = "MDY"

    # Phone formatting
    phone_format: PhoneFormat = "E164"
    phone_region: str = "US"

    # Currency formatting
    currency_decimal: CurrencyDecimal = "dot"
    currency_decimals: Optional[int] = 2
    # When True, an ISO 4217 code detected in the input is re-emitted as a
    # space-separated prefix on the standardized number.
    currency_preserve_code: bool = False

    # Name casing
    name_case: NameCase = "title"

    # Boolean style
    boolean_style: BoolStyle = "True/False"

    # Email policy
    email_gmail_canonical: bool = False
    email_error_policy: EmailErrorPolicy = "passthrough"

    # Address policy (corpus § 6.3 — abbreviated form is canonical, but
    # the existing tests/baseline assume expand-by-default; new callers
    # opt into compression by setting expand=False).
    address_expand: bool = True
    address_state_to_code: bool = True
    address_collapse_multiline: bool = True
    address_trim_trailing_comma: bool = True
    address_normalize_po_box: bool = True

    # Per-domain error sentinels — when "sentinel", emit ``<error: …>``
    # for unparseable / out-of-domain values. Default ``passthrough``
    # preserves the input unchanged.
    date_error_policy: DateErrorPolicy = "passthrough"
    phone_error_policy: PhoneErrorPolicy = "passthrough"
    currency_error_policy: CurrencyErrorPolicy = "passthrough"

    # Date locale handling — extra month-name dictionaries beyond English.
    date_month_locales: Optional[list[str]] = None

    # Name policy
    name_conservative: bool = False
    name_reverse_comma_format: bool = True
    name_family_first: bool = False  # set per-column for East Asian data

    # User overrides for the address abbreviation table. Merged on top of
    # the built-in USPS Pub. 28 list at runtime; values flow through
    # verbatim into Title Case rendering.
    extra_abbreviations: dict[str, str] = field(default_factory=dict)

    # ----- Scale knobs for large international files -----
    # Per-row country/region overrides. When set, each phone or address
    # row's region is read from the named column (an ISO-3166 alpha-2 code:
    # "US", "GB", "JP", "FR", …). Falls back to ``phone_region`` /
    # global default when the column is missing or the cell is blank.
    phone_country_column: Optional[str] = None
    address_country_column: Optional[str] = None

    # Audit cap. The change table can grow to tens of millions of rows on
    # a 1 GB input — capping protects memory and keeps the audit usable.
    # ``cells_changed`` still counts every modification; only the per-row
    # ``changes`` DataFrame is truncated. Set to None for unbounded.
    audit_max_rows: Optional[int] = 10_000

    # Value-level LRU cache size per standardizer. Repeated phone numbers
    # (call-list duplicates), repeated currencies, repeated boolean
    # tokens — all dominate at scale. A 256k-entry cache absorbs most
    # real-world cardinalities without ballooning memory.
    cache_size: int = 262_144

    @classmethod
    def from_preset(cls, name: str, **overrides: Any) -> StandardizeOptions:
        """Build options from a named preset, with optional field overrides.

        Example: ``StandardizeOptions.from_preset("uk", column_types={...})``
        starts from UK defaults and layers ``column_types`` on top.

        Raises ``ValueError`` when *name* is not a key of ``PRESETS``.
        """
        if name not in PRESETS:
            raise ValueError(
                f"Unknown preset '{name}'. "
                f"Available: {', '.join(sorted(PRESETS))}."
            )
        # Copy so the overrides never leak back into the shared preset.
        base = dict(PRESETS[name])
        base.update(overrides)
        return cls(**base)

    @classmethod
    def from_dict(cls, data: dict) -> StandardizeOptions:
        """Build options from a plain mapping (e.g. parsed JSON).

        Keys that are not dataclass fields are ignored. ``column_types``
        values are coerced to :class:`FieldType`, and the enum-like string
        fields are validated up front.

        Raises ``ConfigError`` when *data* (or its ``column_types``) is not
        a dict, when a column's field type is not a valid ``FieldType``, or
        when an enum-like field holds a value outside its allowed set.
        """
        from .errors import ConfigError

        # A JSON file whose root is a list/string/number parses fine but is
        # not a config; report that instead of failing on ``.items()``.
        if not isinstance(data, dict):
            raise ConfigError(
                "StandardizeOptions config must be a mapping, "
                f"got {type(data).__name__}",
                operation="StandardizeOptions.from_dict",
                suggestion="Put the settings inside a top-level {...} object.",
            )

        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}

        column_types = kwargs.get("column_types") or {}
        if not isinstance(column_types, dict):
            raise ConfigError(
                "column_types must map column names to field types, "
                f"got {type(column_types).__name__}",
                operation="StandardizeOptions.from_dict",
                suggestion='Example: {"column_types": {"phone": "phone"}}',
            )
        resolved: dict[str, FieldType] = {}
        for col, raw in column_types.items():
            try:
                resolved[col] = (
                    FieldType(raw) if not isinstance(raw, FieldType) else raw
                )
            except ValueError as e:
                valid = sorted(t.value for t in FieldType)
                raise ConfigError(
                    f"Invalid field type {raw!r} for column {col!r}",
                    column=col,
                    operation="StandardizeOptions.from_dict",
                    cause=e,
                    suggestion=f"Valid field types: {valid}",
                ) from e
        kwargs["column_types"] = resolved

        # Surface enum-string mismatches early — bad date_order ("xyz")
        # would otherwise crash deep inside standardize_date.
        for field_name, valid in (
            ("date_order", {"MDY", "DMY"}),
            ("phone_format", set(_PHONE_FORMAT_MAP) | {"DIGITS"}),
            ("currency_decimal", {"dot", "comma", "auto"}),
            ("name_case", {"title", "upper", "lower"}),
            ("boolean_style", set(_BOOL_OUTPUT)),
            ("date_error_policy", {"passthrough", "sentinel"}),
            ("phone_error_policy", {"passthrough", "sentinel"}),
            ("currency_error_policy", {"passthrough", "sentinel"}),
            ("email_error_policy", {"passthrough", "sentinel"}),
        ):
            value = kwargs.get(field_name)
            if value is None:
                continue
            try:
                is_valid = value in valid
            except TypeError:
                # Unhashable values (a list or dict) can never be valid.
                is_valid = False
            if not is_valid:
                raise ConfigError(
                    f"Invalid {field_name}={value!r}",
                    operation="StandardizeOptions.from_dict",
                    suggestion=f"Valid values: {sorted(valid)}",
                )
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return a plain-dict copy with ``FieldType`` values flattened."""
        d = asdict(self)
        d["column_types"] = {
            c: t.value if isinstance(t, FieldType) else t
            for c, t in self.column_types.items()
        }
        return d

    def to_file(self, path: str | Path) -> Path:
        """Serialize the options as indented JSON and write them to *path*.

        The file is always written as UTF-8 so a saved config reads back
        identically regardless of the platform's locale encoding.

        Returns the written path. Raises ``ConfigError`` when the options
        are not JSON-serializable, and the error built by
        ``wrap_file_write`` when the write itself fails.
        """
        from .errors import ConfigError, wrap_file_write

        out = Path(path)
        try:
            payload = json.dumps(self.to_dict(), indent=2)
        except TypeError as e:
            raise ConfigError(
                "Could not serialize StandardizeOptions to JSON",
                operation="StandardizeOptions.to_file",
                cause=e,
                suggestion=(
                    "extra_abbreviations or column_types likely contains a "
                    "non-string/non-enum value. Inspect with .to_dict() and "
                    "remove the offending entry."
                ),
            ) from e
        try:
            out.write_text(payload, encoding="utf-8")
        except OSError as e:  # PermissionError is already an OSError
            raise wrap_file_write(out, "StandardizeOptions.to_file", e) from e
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> StandardizeOptions:
        """Load options from a JSON file at *path*.

        The file is decoded as UTF-8 (a leading BOM is tolerated), so
        hand-edited configs with non-ASCII column names load the same way
        on every platform.

        Raises the error built by ``wrap_file_read`` when the file cannot
        be read, and ``ConfigError`` when it is not valid UTF-8, not valid
        JSON, or fails :meth:`from_dict` validation.
        """
        from .errors import ConfigError, wrap_file_read

        path = Path(path)
        try:
            # ``utf-8-sig`` strips an optional BOM, which ``json.loads``
            # would otherwise reject.
            text = path.read_text(encoding="utf-8-sig")
        except OSError as e:
            raise wrap_file_read(path, "StandardizeOptions.from_file", e) from e
        except UnicodeDecodeError as e:
            raise ConfigError(
                "StandardizeOptions config is not valid UTF-8",
                path=path,
                operation="StandardizeOptions.from_file",
                cause=e,
                suggestion="Re-save the file with UTF-8 encoding.",
            ) from e
        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            raise ConfigError(
                "Invalid JSON in StandardizeOptions config",
                path=path,
                operation="StandardizeOptions.from_file",
                cause=e,
                suggestion=(
                    f"JSON parser failed at line {e.lineno}, column {e.colno}. "
                    "Validate the file with `python -m json.tool < file.json`."
                ),
            ) from e
        return cls.from_dict(data)
|
||
|
||
|
||
@dataclass
class StandardizeResult:
    """Output of :func:`standardize_dataframe`."""

    # The standardized frame.
    standardized_df: pd.DataFrame
    # Audit table; cols: row, column, field_type, old, new
    changes: pd.DataFrame
    # Number of cells whose value was modified.
    cells_changed: int
    # rows where a typed column held junk
    cells_unparseable: int
    # Number of cells visited across the processed columns.
    cells_total: int
    # Names of the columns that were processed.
    columns_processed: list[str]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Per-cell dispatch
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _apply_field_type(
|
||
value: Any,
|
||
field_type: FieldType,
|
||
options: StandardizeOptions,
|
||
) -> tuple[Any, bool, bool]:
|
||
"""Run the standardizer for *field_type* on *value*.
|
||
|
||
Returns ``(new_value, changed, parsed)``. ``parsed`` is False when the
|
||
value was non-empty but the standardizer couldn't recognize it — used
|
||
to surface a "junk in a typed column" count.
|
||
"""
|
||
if value is None or (isinstance(value, float) and pd.isna(value)):
|
||
return value, False, True
|
||
if not isinstance(value, str):
|
||
# Non-string inputs are converted via str() for everything except
|
||
# booleans, which have a richer accept set.
|
||
if field_type == FieldType.BOOLEAN:
|
||
new, changed = standardize_boolean(value, style=options.boolean_style)
|
||
return new, changed, True
|
||
value = str(value)
|
||
|
||
s_stripped = value.strip()
|
||
if not s_stripped:
|
||
return value, False, True
|
||
|
||
if field_type == FieldType.DATE:
|
||
new, changed = standardize_date(
|
||
value,
|
||
output_format=options.date_output_format,
|
||
date_order=options.date_order,
|
||
error_policy=options.date_error_policy,
|
||
month_locales=options.date_month_locales,
|
||
)
|
||
elif field_type == FieldType.PHONE:
|
||
new, changed = standardize_phone(
|
||
value,
|
||
output_format=options.phone_format,
|
||
default_region=options.phone_region,
|
||
error_policy=options.phone_error_policy,
|
||
)
|
||
elif field_type == FieldType.CURRENCY:
|
||
new, changed = standardize_currency(
|
||
value,
|
||
decimal=options.currency_decimal,
|
||
decimals=options.currency_decimals,
|
||
preserve_code=options.currency_preserve_code,
|
||
error_policy=options.currency_error_policy,
|
||
)
|
||
elif field_type == FieldType.NAME:
|
||
new, changed = standardize_name(
|
||
value,
|
||
case=options.name_case,
|
||
conservative=options.name_conservative,
|
||
reverse_comma_format=options.name_reverse_comma_format,
|
||
family_first=options.name_family_first,
|
||
)
|
||
elif field_type == FieldType.ADDRESS:
|
||
new, changed = standardize_address(
|
||
value,
|
||
extra_abbreviations=options.extra_abbreviations or None,
|
||
expand=options.address_expand,
|
||
state_to_code=options.address_state_to_code,
|
||
collapse_multiline=options.address_collapse_multiline,
|
||
trim_trailing_comma=options.address_trim_trailing_comma,
|
||
normalize_po_box=options.address_normalize_po_box,
|
||
)
|
||
elif field_type == FieldType.EMAIL:
|
||
new, changed = standardize_email(
|
||
value,
|
||
gmail_canonical=options.email_gmail_canonical,
|
||
error_policy=options.email_error_policy,
|
||
)
|
||
elif field_type == FieldType.BOOLEAN:
|
||
new, changed = standardize_boolean(value, style=options.boolean_style)
|
||
else:
|
||
# Unreachable for well-formed input — _resolve_column_types
|
||
# would have rejected the bad enum at the entry point. Hitting
|
||
# this means an internal invariant was broken, not user error.
|
||
raise AssertionError(
|
||
f"Unhandled FieldType in dispatcher: {field_type!r}. "
|
||
"This indicates a code bug — a new FieldType was added to "
|
||
"the enum without a matching branch here."
|
||
)
|
||
|
||
# ``changed=False`` on a non-empty cell means the standardizer either
|
||
# accepted the input as already-canonical OR couldn't parse it. The
|
||
# name/address standardizers always succeed (any string is a valid
|
||
# name); the others can fail. We only count parse failures for the
|
||
# types that have a real parsing step.
|
||
parsed = True
|
||
if not changed and field_type in {
|
||
FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN,
|
||
}:
|
||
parsed = _is_already_canonical(value, field_type, options)
|
||
|
||
return new, changed, parsed
|
||
|
||
|
||
def _is_already_canonical(
    value: str,
    field_type: FieldType,
    options: StandardizeOptions,
) -> bool:
    """Report whether *value* already has the configured output shape.

    Callers use this on cells the standardizer left unchanged, to tell
    "already canonical" (a successful pass) apart from "could not be
    parsed" (a junk row to flag). Field types without a check here are
    always reported as canonical.
    """
    if field_type == FieldType.DATE:
        # Canonical means it parses with the configured output format.
        try:
            datetime.strptime(value.strip(), options.date_output_format)
        except ValueError:
            return False
        return True

    if field_type == FieldType.PHONE:
        if options.phone_format == "DIGITS":
            digits = value.strip()
            return digits.isdigit() and len(digits) >= 7
        try:
            number = phonenumbers.parse(value, options.phone_region)
        except phonenumbers.NumberParseException:
            return False
        if not phonenumbers.is_possible_number(number):
            return False
        # Canonical means re-formatting reproduces the trimmed input.
        target_format = _PHONE_FORMAT_MAP[options.phone_format]
        rendered = phonenumbers.format_number(number, target_format)
        return rendered == value.strip()

    if field_type == FieldType.CURRENCY:
        # Pure numeric (with optional sign and one decimal point) is
        # treated as already-canonical. When ``preserve_code`` is on, an
        # ``ISO 1234.56`` form also counts as canonical so we don't flag
        # rows that already match the preserved-code output shape.
        bare_re = r"-?\d+(?:\.\d+)?"
        if not options.currency_preserve_code:
            return re.fullmatch(bare_re, value.strip()) is not None
        with_code_re = rf"(?:{_CURRENCY_CODES})\s+{bare_re}|{bare_re}"
        match = re.fullmatch(with_code_re, value.strip(), re.IGNORECASE)
        return match is not None

    if field_type == FieldType.BOOLEAN:
        # Canonical means it is exactly one of the two configured outputs.
        return value.strip() in _BOOL_OUTPUT[options.boolean_style]

    return True
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# DataFrame entry point
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _resolve_column_types(
|
||
options: StandardizeOptions,
|
||
df_columns: Iterable[str],
|
||
) -> dict[str, FieldType]:
|
||
"""Validate column references and coerce string types to enum values."""
|
||
cols = set(df_columns)
|
||
resolved: dict[str, FieldType] = {}
|
||
missing: list[str] = []
|
||
for col, ft in options.column_types.items():
|
||
if col not in cols:
|
||
missing.append(col)
|
||
continue
|
||
resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft)
|
||
if missing:
|
||
from .errors import InputValidationError
|
||
raise InputValidationError(
|
||
f"Columns referenced by column_types not found in input: {missing}",
|
||
operation="standardize_dataframe",
|
||
suggestion=(
|
||
f"Available columns: {list(df_columns)}. "
|
||
"Check for typos and for header rows that didn't get parsed."
|
||
),
|
||
)
|
||
return resolved
|
||
|
||
|
||
def _build_cached_dispatcher(
    field_type: FieldType,
    options: StandardizeOptions,
):
    """Return a per-value standardizer wrapped in an LRU cache.

    The returned callable has the signature ``fn(value, region=None)`` and
    returns the same ``(new_value, changed, parsed)`` triple as
    :func:`_apply_field_type`.

    The cache key is the raw cell value plus, when applicable, the
    per-row region derived from ``phone_country_column`` /
    ``address_country_column``. Repeated values are O(1) lookups —
    critical at 1 GB scale where the same number appears thousands
    of times. Only the PHONE dispatcher actually uses the region.

    The dispatcher captures the relevant subset of ``options`` so the
    cache key stays small (we don't want to serialize the whole
    options dataclass into every cache entry).

    Cache sizing follows ``options.cache_size``: a positive value bounds the
    cache, zero or a negative value disables caching, and ``None`` requests
    an unbounded cache. NAME dispatchers are never cached.
    """
    from functools import lru_cache

    # ``lru_cache(maxsize=None)`` means *unbounded*, not "off", so a
    # non-positive size must map to 0 — otherwise "disable the cache" would
    # silently become "cache every distinct value forever".
    configured_size = options.cache_size
    maxsize = None if configured_size is None else max(int(configured_size), 0)
    # ``typed=True`` keeps 1, 1.0 and True in separate entries: they hash
    # alike but stringify differently, so they can standardize differently.
    cached = lru_cache(maxsize=maxsize, typed=True)

    if field_type == FieldType.DATE:
        date_args = (
            options.date_output_format,
            options.date_order,
            options.date_error_policy,
            tuple(options.date_month_locales) if options.date_month_locales else None,
        )

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.DATE, options,
                _date_args=date_args,
            )
        return fn

    if field_type == FieldType.PHONE:
        out_fmt = options.phone_format
        err = options.phone_error_policy
        default_region = options.phone_region

        @cached
        def fn(value: Any, region: Optional[str] = None):
            # A blank per-row region falls back to the global default.
            r = region or default_region
            return _apply_field_type_for(
                value, FieldType.PHONE, options,
                _phone_args=(out_fmt, r, err),
            )
        return fn

    if field_type == FieldType.CURRENCY:
        currency_args = (
            options.currency_decimal,
            options.currency_decimals,
            options.currency_preserve_code,
            options.currency_error_policy,
        )

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.CURRENCY, options,
                _currency_args=currency_args,
            )
        return fn

    if field_type == FieldType.BOOLEAN:
        boolean_args = (options.boolean_style,)

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.BOOLEAN, options,
                _boolean_args=boolean_args,
            )
        return fn

    if field_type == FieldType.EMAIL:
        email_args = (options.email_gmail_canonical, options.email_error_policy)

        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.EMAIL, options,
                _email_args=email_args,
            )
        return fn

    # Names are usually unique per row; no cache wraps them but we still
    # go through ``_apply_field_type`` for parity.
    if field_type == FieldType.NAME:
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.NAME, options)
        return fn

    if field_type == FieldType.ADDRESS:
        # Addresses can be cached too — long lists of repeated office
        # addresses or warehouse locations are common in commerce data.
        @cached
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.ADDRESS, options)
        return fn

    # Fallback (shouldn't happen — every FieldType is covered above).
    return lambda value, _region=None: _apply_field_type(value, field_type, options)
|
||
|
||
|
||
def _apply_field_type_for(
|
||
value: Any,
|
||
field_type: FieldType,
|
||
options: StandardizeOptions,
|
||
*,
|
||
_date_args=None,
|
||
_phone_args=None,
|
||
_currency_args=None,
|
||
_boolean_args=None,
|
||
_email_args=None,
|
||
) -> tuple[Any, bool, bool]:
|
||
"""Cacheable dispatcher: same shape as :func:`_apply_field_type` but
|
||
accepts pre-extracted scalar argument tuples so the LRU cache key is
|
||
just ``(value, region)`` instead of the full options object.
|
||
"""
|
||
if value is None or (isinstance(value, float) and pd.isna(value)):
|
||
return value, False, True
|
||
if not isinstance(value, str):
|
||
if field_type == FieldType.BOOLEAN:
|
||
style = (_boolean_args or (options.boolean_style,))[0]
|
||
new, changed = standardize_boolean(value, style=style)
|
||
return new, changed, True
|
||
value = str(value)
|
||
|
||
if not value.strip():
|
||
return value, False, True
|
||
|
||
if field_type == FieldType.DATE:
|
||
out_fmt, date_order, err, locales = _date_args or (
|
||
options.date_output_format, options.date_order,
|
||
options.date_error_policy,
|
||
tuple(options.date_month_locales) if options.date_month_locales else None,
|
||
)
|
||
new, changed = standardize_date(
|
||
value,
|
||
output_format=out_fmt,
|
||
date_order=date_order,
|
||
error_policy=err,
|
||
month_locales=list(locales) if locales else None,
|
||
)
|
||
elif field_type == FieldType.PHONE:
|
||
out_fmt, region, err = _phone_args or (
|
||
options.phone_format, options.phone_region, options.phone_error_policy,
|
||
)
|
||
new, changed = standardize_phone(
|
||
value, output_format=out_fmt, default_region=region, error_policy=err,
|
||
)
|
||
elif field_type == FieldType.CURRENCY:
|
||
decimal, decimals, preserve, err = _currency_args or (
|
||
options.currency_decimal, options.currency_decimals,
|
||
options.currency_preserve_code, options.currency_error_policy,
|
||
)
|
||
new, changed = standardize_currency(
|
||
value,
|
||
decimal=decimal,
|
||
decimals=decimals,
|
||
preserve_code=preserve,
|
||
error_policy=err,
|
||
)
|
||
elif field_type == FieldType.BOOLEAN:
|
||
style = (_boolean_args or (options.boolean_style,))[0]
|
||
new, changed = standardize_boolean(value, style=style)
|
||
elif field_type == FieldType.EMAIL:
|
||
gmail, err = _email_args or (
|
||
options.email_gmail_canonical, options.email_error_policy,
|
||
)
|
||
new, changed = standardize_email(
|
||
value, gmail_canonical=gmail, error_policy=err,
|
||
)
|
||
else:
|
||
return _apply_field_type(value, field_type, options)
|
||
|
||
parsed = True
|
||
if not changed and field_type in {
|
||
FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN,
|
||
}:
|
||
parsed = _is_already_canonical(value, field_type, options)
|
||
|
||
return new, changed, parsed
|
||
|
||
|
||
def standardize_dataframe(
    df: pd.DataFrame,
    options: Optional[StandardizeOptions] = None,
) -> StandardizeResult:
    """Apply per-column standardizers across *df*.

    Columns absent from ``options.column_types`` pass through unchanged.
    The input DataFrame is not mutated; all work happens on a copy.

    Pipeline placement (recommended, not enforced)
    ----------------------------------------------
    Run *after* the text cleaner (smart-quote / NBSP / zero-width
    pollution breaks phone, currency, and date parsers) and *before*
    the missing-value handler (numeric imputation expects canonical
    types) and the deduplicator (canonical phone E.164 / lowercase
    email enables cross-format duplicate matching). See
    ``src.core.pipeline.SOFT_DEPENDENCIES``.

    Performance characteristics
    ---------------------------
    Each typed column is materialised exactly once with
    ``Series.tolist()`` and every cell is passed through the dispatcher
    returned by ``_build_cached_dispatcher``. That dispatcher is
    expected to memoise per-value results (cache sized by
    ``options.cache_size``), so repeated values — the same office phone
    or vendor address appearing thousands of times — short-circuit.

    For inputs larger than will fit comfortably in RAM, prefer
    :func:`standardize_file` which streams chunks from disk.

    Parameters
    ----------
    df
        Frame to standardize.
    options
        Standardization settings; a default ``StandardizeOptions()`` is
        used when omitted.

    Returns
    -------
    StandardizeResult
        The standardized copy, the audit frame (columns ``row``,
        ``column``, ``field_type``, ``old``, ``new``; ``row`` is the
        zero-based position within *df*, not the index label), and the
        changed / unparseable / total cell counters. At most
        ``options.audit_max_rows`` audit rows are recorded (``None`` =
        unlimited); the counters are never capped.

    Raises
    ------
    InputValidationError
        When ``options.phone_country_column`` or
        ``options.address_country_column`` is set but is not a column
        of *df*.
    """
    from .errors import ensure_dataframe

    ensure_dataframe(df, function="standardize_dataframe")
    options = options or StandardizeOptions()
    out = df.copy()
    column_types = _resolve_column_types(options, out.columns)

    # Per-row region columns must exist in the frame when set.
    for option_name, configured in (
        ("phone_country_column", options.phone_country_column),
        ("address_country_column", options.address_country_column),
    ):
        if configured and configured not in out.columns:
            from .errors import InputValidationError

            raise InputValidationError(
                f"{option_name}={configured!r} not in input columns",
                operation="standardize_dataframe",
                suggestion=f"Available: {list(out.columns)}",
            )

    cells_changed = 0
    cells_unparseable = 0
    cells_total = 0
    audit_cap = options.audit_max_rows
    # ``None`` means "record every change"; inf keeps the > 0 test uniform.
    audit_room = float("inf") if audit_cap is None else audit_cap
    audit_records: list[dict[str, Any]] = []

    for col, field_type in column_types.items():
        # Materialise the column once; the dispatch pass and the audit
        # pass below both read from this list.
        values = out[col].tolist()
        cells_total += len(values)
        dispatcher = _build_cached_dispatcher(field_type, options)

        # Per-row region lookup. Phones and addresses are the two types
        # that take country context; every other type is dispatched
        # with the value alone.
        region_column: Optional[str] = None
        if field_type == FieldType.PHONE and options.phone_country_column:
            region_column = options.phone_country_column
        elif field_type == FieldType.ADDRESS and options.address_country_column:
            region_column = options.address_country_column

        if region_column is None:
            outcomes = [dispatcher(v) for v in values]
        else:
            outcomes = [
                dispatcher(v, _normalize_region(r))
                for v, r in zip(values, out[region_column].tolist())
            ]

        new_values: list[Any] = []
        for row, (orig, (new, changed, parsed)) in enumerate(
            zip(values, outcomes)
        ):
            new_values.append(new)
            if changed:
                cells_changed += 1
                if audit_room > 0:
                    audit_records.append({
                        "row": row,
                        "column": col,
                        "field_type": field_type.value,
                        "old": orig,
                        "new": new,
                    })
                    audit_room -= 1
            if not parsed:
                cells_unparseable += 1
        out[col] = new_values

    changes_df = pd.DataFrame(
        audit_records,
        columns=["row", "column", "field_type", "old", "new"],
    )

    # Surface a warning when more than 10% of typed cells failed to
    # parse — usually means the user mis-typed a column (text marked
    # as DATE) or the data is genuinely garbage. Without this, a
    # quietly-broken pipeline shows zero changes and silently lets bad
    # data flow downstream.
    if cells_total > 0 and cells_unparseable / cells_total > 0.1:
        logger.warning(
            "standardize_dataframe: {}/{} cells ({}%) in typed columns were "
            "unparseable — check column_types for mismatches with the data.",
            cells_unparseable,
            cells_total,
            int(100 * cells_unparseable / cells_total),
        )

    # Only log the cap message when it would surprise the caller —
    # cap=0 is the streaming-path's deliberate "audit budget exhausted"
    # signal and shouldn't generate noise per chunk.
    if audit_cap and audit_cap > 0 and cells_changed > audit_cap:
        logger.info(
            "standardize_dataframe: audit capped at {} rows "
            "(cells_changed={}); raise audit_max_rows or set to None for full audit.",
            audit_cap, cells_changed,
        )

    return StandardizeResult(
        standardized_df=out,
        changes=changes_df,
        cells_changed=cells_changed,
        cells_unparseable=cells_unparseable,
        cells_total=cells_total,
        columns_processed=list(column_types.keys()),
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Per-row region helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Common country-name → ISO-3166 alpha-2 mappings. The phonenumbers
|
||
# library wants the alpha-2 code, but real spreadsheets carry full names
|
||
# ("United Kingdom", "Japan", "Brazil"). Add new entries lazily as users
|
||
# bring in data — the table is a soft mapping, missing entries fall back
|
||
# to the global ``phone_region``.
|
||
_COUNTRY_NAME_TO_ISO2: dict[str, str] = {
|
||
"united states": "US", "usa": "US", "u.s.": "US", "u.s.a.": "US",
|
||
"united kingdom": "GB", "uk": "GB", "great britain": "GB", "england": "GB",
|
||
"canada": "CA",
|
||
"mexico": "MX",
|
||
"france": "FR",
|
||
"germany": "DE", "deutschland": "DE",
|
||
"italy": "IT", "italia": "IT",
|
||
"spain": "ES", "españa": "ES",
|
||
"portugal": "PT",
|
||
"netherlands": "NL", "holland": "NL",
|
||
"belgium": "BE",
|
||
"switzerland": "CH", "schweiz": "CH",
|
||
"austria": "AT", "österreich": "AT",
|
||
"ireland": "IE",
|
||
"sweden": "SE", "norway": "NO", "denmark": "DK", "finland": "FI",
|
||
"poland": "PL", "czech republic": "CZ", "czechia": "CZ", "hungary": "HU",
|
||
"russia": "RU", "ukraine": "UA",
|
||
"japan": "JP", "中国": "CN", "china": "CN", "south korea": "KR", "korea": "KR",
|
||
"india": "IN", "indonesia": "ID", "thailand": "TH", "vietnam": "VN",
|
||
"philippines": "PH", "malaysia": "MY", "singapore": "SG",
|
||
"australia": "AU", "new zealand": "NZ",
|
||
"brazil": "BR", "brasil": "BR",
|
||
"argentina": "AR", "chile": "CL", "colombia": "CO", "peru": "PE",
|
||
"south africa": "ZA",
|
||
"uae": "AE", "united arab emirates": "AE",
|
||
"saudi arabia": "SA",
|
||
"egypt": "EG",
|
||
"israel": "IL",
|
||
"turkey": "TR", "türkiye": "TR",
|
||
}
|
||
|
||
|
||
def _normalize_region(value: Any) -> Optional[str]:
|
||
"""Normalise a region cell to an ISO-3166 alpha-2 code.
|
||
|
||
Accepts ISO codes (``US``, ``us``, ``USA``), full names
|
||
(``United States``, ``Japan``), and falls back to None when the
|
||
value is empty or unrecognized — letting the dispatcher use the
|
||
global default region.
|
||
"""
|
||
if value is None:
|
||
return None
|
||
if isinstance(value, float) and pd.isna(value):
|
||
return None
|
||
if not isinstance(value, str):
|
||
value = str(value)
|
||
s = value.strip()
|
||
if not s:
|
||
return None
|
||
upper = s.upper()
|
||
# ISO-3166 alpha-2 (e.g. "US", "JP")
|
||
if len(upper) == 2 and upper.isalpha():
|
||
return upper
|
||
# ISO-3166 alpha-3 (e.g. "USA", "JPN") — strip last letter as a
|
||
# cheap heuristic, then validate alpha-2.
|
||
if len(upper) == 3 and upper.isalpha():
|
||
# phonenumbers accepts alpha-2 only; map a few common alpha-3.
|
||
alpha3_map = {
|
||
"USA": "US", "GBR": "GB", "CAN": "CA", "MEX": "MX", "DEU": "DE",
|
||
"FRA": "FR", "ITA": "IT", "ESP": "ES", "JPN": "JP", "CHN": "CN",
|
||
"KOR": "KR", "BRA": "BR", "AUS": "AU", "IND": "IN", "RUS": "RU",
|
||
}
|
||
if upper in alpha3_map:
|
||
return alpha3_map[upper]
|
||
# Full country name lookup.
|
||
return _COUNTRY_NAME_TO_ISO2.get(s.lower())
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Streaming entry point — for inputs that don't fit in memory
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class StreamingStandardizeResult:
    """Outcome of a chunked :func:`standardize_file` run.

    This is the streaming counterpart of :class:`StandardizeResult`:
    it carries the same counters but no DataFrame, because the
    standardized rows go to ``output_path`` chunk by chunk. The
    ``changes`` audit is likewise appended to ``audit_path`` as chunks
    finish, and is limited to ``options.audit_max_rows`` rows in total
    across all chunks.
    """

    # Where the standardized CSV was written.
    output_path: Path
    # Where the audit CSV was written; None when no audit row was written.
    audit_path: Optional[Path]
    # Number of input rows read, summed over all chunks.
    rows_processed: int
    # Number of chunks that were read and written.
    chunks_processed: int
    # Counters accumulated from each chunk's standardization result.
    cells_changed: int
    cells_unparseable: int
    cells_total: int
    # Names of the columns that were standardized.
    columns_processed: list[str]
|
||
|
||
|
||
def standardize_file(
    input_path: str | Path,
    output_path: str | Path,
    options: Optional[StandardizeOptions] = None,
    *,
    chunk_size: int = 50_000,
    audit_path: Optional[str | Path] = None,
    progress_callback: Optional[Any] = None,
    encoding: str = "utf-8",
    delimiter: str = ",",
) -> StreamingStandardizeResult:
    """Standardize a CSV/TSV file in chunks, writing output incrementally.

    For inputs too large to materialize in memory, this entry point
    streams ``chunk_size`` rows at a time through
    :func:`standardize_dataframe` and writes each chunk to *output_path*
    as it completes. Memory stays bounded by the chunk size regardless
    of input file size.

    The audit is written to *audit_path* (default
    ``{output_path.stem}_changes.csv`` next to the output) and is always
    comma-delimited. ``options.audit_max_rows`` is a budget for the
    *whole file*, not per chunk: each chunk is handed only the budget
    that is still left, and once it is spent the remaining chunks run
    with an audit cap of 0. Pass ``audit_max_rows=None`` for a full
    audit (bounded only by disk). The change counters in the result are
    never capped.

    Performance for a 1 GB CSV with ~10 M rows on a typical workstation:
      - chunk_size=50_000 → ~50 MB peak DataFrame footprint
      - phone-only standardization: ~3-6 minutes (cache-warm)
      - mixed phone + currency + address: ~8-15 minutes
      - first chunk is the cold-cache slowest; later chunks ride the LRU.

    Parameters
    ----------
    input_path
        CSV or TSV path. Excel inputs aren't streamed — load with
        :func:`read_file` and use :func:`standardize_dataframe`.
        All columns are read as strings with no NA inference.
    output_path
        Where to write the standardized CSV. Existing files are
        overwritten. Use a different file than *input_path*: the output
        is opened for writing while the input is still being read.
    options
        Standardization settings; defaults to ``StandardizeOptions()``.
        The caller's object is never mutated.
    chunk_size
        Rows per chunk. Default 50,000 ≈ 50 MB resident for typical
        widths. Higher → less I/O overhead, more peak memory.
    audit_path
        Optional explicit location for the audit CSV.
    progress_callback
        Optional ``callable(rows_processed, chunks_processed)``
        called once per chunk. Exceptions it raises are logged at
        debug level and ignored.
    encoding
        Text encoding used for the input, the output and the audit.
    delimiter
        Field separator of the input and of the standardized output.

    Returns
    -------
    StreamingStandardizeResult
        Paths and accumulated counters. ``audit_path`` is ``None`` when
        no audit row was written.

    Raises
    ------
    FileAccessError
        When *input_path* does not exist.
    Exception
        ``OSError`` raised while opening the input or writing either
        output is re-raised via ``wrap_file_read`` / ``wrap_file_write``.
    """
    from .errors import wrap_file_read, wrap_file_write

    options = options or StandardizeOptions()
    inp = Path(input_path)
    out = Path(output_path)
    if not inp.exists():
        from .errors import FileAccessError

        raise FileAccessError(
            f"Input file not found: {inp}",
            path=inp, operation="standardize_file",
        )

    audit_p = Path(audit_path) if audit_path else out.with_name(
        f"{out.stem}_changes.csv"
    )

    rows_processed = 0
    chunks_processed = 0
    cells_changed = 0
    cells_unparseable = 0
    cells_total = 0
    columns_processed: list[str] = []
    # Audit rows that may still be written; None means unlimited.
    audit_budget: Optional[int] = (
        None if options.audit_max_rows is None
        else int(options.audit_max_rows)
    )

    out.parent.mkdir(parents=True, exist_ok=True)
    audit_p.parent.mkdir(parents=True, exist_ok=True)

    out_started = False
    audit_started = False

    try:
        reader = pd.read_csv(
            inp, chunksize=chunk_size, encoding=encoding,
            sep=delimiter, dtype=str, keep_default_na=False,
        )
    except OSError as e:  # FileNotFoundError is an OSError subclass
        raise wrap_file_read(inp, "standardize_file", e) from e

    try:
        for chunk in reader:
            # Audit rows coming back from standardize_dataframe are
            # positions within the chunk; this offset turns them into
            # positions within the whole input file.
            chunk_offset = rows_processed

            # Hand the chunk only the audit budget that is left, so it
            # never builds records that could not be written anyway.
            # A cap of 0 switches auditing off for the chunk.
            if audit_budget is None:
                chunk_options = options
            else:
                chunk_cap = max(audit_budget, 0)
                if chunk_cap == options.audit_max_rows:
                    chunk_options = options
                else:
                    chunk_options = _replace_options(
                        options, audit_max_rows=chunk_cap
                    )

            result = standardize_dataframe(chunk, chunk_options)
            cells_changed += result.cells_changed
            cells_unparseable += result.cells_unparseable
            cells_total += result.cells_total
            if not columns_processed:
                columns_processed = list(result.columns_processed)

            # Write the standardized chunk: the first write truncates
            # and emits the header, later writes append data rows only.
            try:
                result.standardized_df.to_csv(
                    out,
                    mode="a" if out_started else "w",
                    header=not out_started,
                    index=False,
                    encoding=encoding,
                    sep=delimiter,
                )
            except OSError as e:
                raise wrap_file_write(out, "standardize_file", e) from e
            out_started = True

            # Write the audit (re-numbering rows to absolute file positions).
            if not result.changes.empty and (
                audit_budget is None or audit_budget > 0
            ):
                chunk_audit = (
                    result.changes if audit_budget is None
                    else result.changes.iloc[:audit_budget]
                ).copy()
                chunk_audit["row"] = chunk_audit["row"] + chunk_offset
                try:
                    chunk_audit.to_csv(
                        audit_p,
                        mode="a" if audit_started else "w",
                        header=not audit_started,
                        index=False,
                        encoding=encoding,
                    )
                except OSError as e:
                    raise wrap_file_write(audit_p, "standardize_file", e) from e
                audit_started = True
                if audit_budget is not None:
                    audit_budget -= len(chunk_audit)

            rows_processed += len(chunk)
            chunks_processed += 1
            if progress_callback:
                try:
                    progress_callback(rows_processed, chunks_processed)
                except Exception:
                    # Progress callbacks are advisory — don't kill the run.
                    logger.opt(exception=True).debug(
                        "progress_callback raised; ignoring"
                    )
    finally:
        # Ensure the iterator is closed (closes the underlying file).
        if hasattr(reader, "close"):
            reader.close()

    return StreamingStandardizeResult(
        output_path=out,
        audit_path=audit_p if audit_started else None,
        rows_processed=rows_processed,
        chunks_processed=chunks_processed,
        cells_changed=cells_changed,
        cells_unparseable=cells_unparseable,
        cells_total=cells_total,
        columns_processed=columns_processed,
    )
|
||
|
||
|
||
def _replace_options(options: StandardizeOptions, **kwargs: Any) -> StandardizeOptions:
|
||
"""Cheap shallow clone of :class:`StandardizeOptions` with overrides.
|
||
|
||
Used by the streaming path to reduce the audit budget chunk-by-chunk
|
||
without mutating the caller's options object.
|
||
"""
|
||
from dataclasses import replace
|
||
return replace(options, **kwargs)
|