refactor: dedup, consolidate, harden public APIs across core modules

Closes 16 high-value findings from a parallel cross-module review.

Refactors:
- New src/core/_constants.py centralizes USPS street-suffix
  abbreviations, US state names, and 2-letter postal codes — one source
  of truth for both normalize_address (matching keys) and
  standardize_address (display formatting). Eliminates ~80 lines of
  duplicated dicts across normalizers.py and format_standardize.py.
- format_standardize.py: collapse 4 identical nested _err() helpers
  into one shared _err_or_passthrough() module function; drop a dead
  duplicate `return _err("not a phone number")` branch in
  standardize_phone.
- format_standardize.py: precompile per-locale month-name regexes
  (_MONTH_LOCALE_PATTERNS) and per-state-name regexes
  (_STATE_NAME_PATTERNS) at import time — they were rebuilt on every
  cell, a measurable hot path on million-row inputs.
- dedup.py: extract _is_missing(value) helper; one definition of
  "this cell is None / NaN / pd.NA" instead of two.
- fixes.py: extract _is_string_column(ser) helper; one dtype check
  instead of three duplicates across _apply_to_strings,
  _vectorized_translate, _vectorized_regex_sub.

Production-readiness:
- format_standardize.standardize_dataframe now logs a warning when
  more than 10% of typed cells are unparseable — surfaces the
  silently-broken-pipeline failure mode.
- StandardizeOptions.from_dict validates date_order / phone_format /
  currency_decimal / name_case / boolean_style / *_error_policy
  enum values up front, with a clear error message instead of a deep
  crash inside the per-cell function.
- StandardizeOptions.from_file and DeduplicationConfig.from_file wrap
  read + json.loads with descriptive OSError / ValueError messages
  including the file path.
- standardize_date(month_locales=...) validates locale codes against
  the available set instead of silently passing through unknown ones.
- io.read_file rejects chunk_size <= 0 (was silently failing inside
  pandas) and logs the resolved suffix + chunk_size at info level so
  data-pipeline runs are debuggable.
- io.read_file's FileNotFoundError gains explanatory context.
- io.write_file, text_clean.clean_dataframe, and dedup.deduplicate
  now reject non-DataFrame inputs with clear TypeError instead of
  cryptic pandas tracebacks downstream.
- dedup.deduplicate validates that survivor_rule=KEEP_MOST_RECENT has
  a usable date_column up front; the helper _select_survivor now
  raises (instead of silently falling back to keep_first) when called
  directly with bad arguments.
- dedup.deduplicate gains a structured no-op return when strategies
  is empty after auto-detection — preserves schema instead of crashing.
- analyze._detect_inconsistent_date_format narrows its bare except to
  (TypeError, ValueError) and logs a debug line so genuine bugs don't
  hide behind silent skip.

Tests:
- tests/test_audit_fixes.py grows by 11 cases covering the new
  validation paths (chunk_size, DataFrame guards, KEEP_MOST_RECENT
  date_column, enum validation, locale validation, JSON error wrapping).

Full project suite: 1208 passed, 4 skipped, 17 xfailed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 02:23:09 +00:00
parent b23a27d4e3
commit 2eece6467d
10 changed files with 457 additions and 231 deletions

View File

@@ -23,6 +23,8 @@ from __future__ import annotations
import json
import re
from loguru import logger
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from enum import Enum
@@ -51,6 +53,19 @@ class FieldType(str, Enum):
EMAIL = "email"
# Shared error-policy helper used by every per-domain standardizer.
# Returns ``(<error: reason>, changed)`` under the ``"sentinel"`` policy
# and ``(value, False)`` under ``"passthrough"`` so unparseable input
# survives unchanged.
def _err_or_passthrough(
reason: str, value: str, policy: str,
) -> tuple[str, bool]:
if policy == "sentinel":
sentinel = f"<error: {reason}>"
return sentinel, sentinel != value
return value, False
# ---------------------------------------------------------------------------
# Date
# ---------------------------------------------------------------------------
@@ -153,24 +168,49 @@ _MONTH_LOCALES: dict[str, dict[str, str]] = {
}
def _build_month_locale_patterns() -> dict[str, list[tuple["re.Pattern[str]", str]]]:
    """Compile every locale's month-name regexes a single time.

    Walks ``_MONTH_LOCALES`` and, for each locale code, produces a list of
    ``(compiled_pattern, english_name)`` pairs in the table's own
    iteration order. Each pattern matches the foreign month name
    case-insensitively, and only where it is not directly preceded or
    followed by an ASCII letter.

    Building these up front means per-cell callers reuse the compiled
    patterns instead of recompiling them for every input.
    """

    def _compile(month_name: str) -> "re.Pattern[str]":
        # Lookarounds act as an ASCII-letter word boundary around the name.
        return re.compile(
            rf"(?<![A-Za-z]){re.escape(month_name)}(?![A-Za-z])",
            re.IGNORECASE,
        )

    return {
        locale: [
            (_compile(foreign), english)
            for foreign, english in names.items()
        ]
        for locale, names in _MONTH_LOCALES.items()
    }
_MONTH_LOCALE_PATTERNS = _build_month_locale_patterns()
def _apply_month_locale(s: str, locales: list[str]) -> str:
    """Replace localized month names in *s* with English equivalents.

    Args:
        s: Text that may contain month names in one of the given locales.
        locales: Locale codes to apply, in order. ``"en"`` is accepted
            and skipped because there is nothing to translate.

    Returns:
        *s* with every match of each locale's precompiled month pattern
        replaced by its English name.

    Raises:
        ValueError: If any locale other than ``"en"`` is not a key of
            ``_MONTH_LOCALES``. The lookup is exact, so typos such as
            ``"FR"`` (uppercase) or ``"french"`` are reported instead of
            being silently skipped.
    """
    unknown = [
        loc for loc in locales if loc != "en" and loc not in _MONTH_LOCALES
    ]
    if unknown:
        raise ValueError(
            f"Unknown month locale(s): {unknown}. "
            f"Available: {sorted(_MONTH_LOCALES) + ['en']}"
        )
    for loc in locales:
        if loc == "en":
            continue
        # Patterns are compiled once at import; no per-call re.compile.
        for pat, english in _MONTH_LOCALE_PATTERNS[loc]:
            s = pat.sub(english, s)
    return s
@@ -250,11 +290,7 @@ def standardize_date(
if not s:
return value, False
def _err(reason: str) -> tuple[str, bool]:
if error_policy == "sentinel":
sentinel = f"<error: {reason}>"
return sentinel, sentinel != value
return value, False
_err = lambda reason: _err_or_passthrough(reason, value, error_policy)
# Excel serial dates and Unix timestamps don't survive the weekday-
# prefix / time-tail strips, so try them first. They short-circuit
@@ -395,11 +431,7 @@ def standardize_phone(
if not s:
return value, False
def _err(reason: str) -> tuple[str, bool]:
if error_policy == "sentinel":
sentinel = f"<error: {reason}>"
return sentinel, sentinel != value
return value, False
_err = lambda reason: _err_or_passthrough(reason, value, error_policy)
if output_format == "DIGITS":
digits = re.sub(r"\D", "", s)
@@ -437,13 +469,11 @@ def standardize_phone(
try:
parsed = phonenumbers.parse(s, default_region)
except phonenumbers.NumberParseException:
# Only emit a sentinel for inputs that clearly contain digits
# but failed to parse (corpus § 4.3 errors). Pure non-numeric
# strings pass through unchanged so a "TBD"-style placeholder
# doesn't get reshaped into a phone error.
if re.search(r"\d", s):
return _err("not a phone number")
return _err("not a phone number") # symmetric — TBD/garbage flagged
# Anything that can't be parsed becomes a sentinel under the
# sentinel policy; passthrough returns the original. Both digit-
# and-formatting failures and pure non-numeric ("TBD"-style) cells
# land here.
return _err("not a phone number")
if not phonenumbers.is_possible_number(parsed):
# Distinguish "too many digits" from generic invalidity for
@@ -594,11 +624,7 @@ def standardize_currency(
if not s:
return value, False
def _err(reason: str) -> tuple[str, bool]:
if error_policy == "sentinel":
sentinel = f"<error: {reason}>"
return sentinel, sentinel != value
return value, False
_err = lambda reason: _err_or_passthrough(reason, value, error_policy)
if "%" in s:
return _err("percentage not currency")
@@ -941,51 +967,16 @@ def standardize_name(
# Address
# ---------------------------------------------------------------------------
# Expansion table — the inverse of the dedup-side ``_USPS_ABBREVIATIONS``.
# These are the canonical long-form spellings the standardizer emits when
# it sees the abbreviation. We deliberately don't expand ``unit``, ``loop``,
# or ``way`` because those are already the long form.
_ADDRESS_EXPANSIONS: dict[str, str] = {
"st": "Street",
"ave": "Avenue",
"av": "Avenue",
"blvd": "Boulevard",
"blv": "Boulevard",
"dr": "Drive",
"ln": "Lane",
"rd": "Road",
"ct": "Court",
"pl": "Place",
"cir": "Circle",
"trl": "Trail",
"tr": "Trail",
"ter": "Terrace",
"pkwy": "Parkway",
"hwy": "Highway",
"expy": "Expressway",
"fwy": "Freeway",
"sq": "Square",
"aly": "Alley",
"xing": "Crossing",
"pt": "Point",
"n": "North",
"s": "South",
"e": "East",
"w": "West",
"ne": "Northeast",
"nw": "Northwest",
"se": "Southeast",
"sw": "Southwest",
"apt": "Apartment",
"ste": "Suite",
"bldg": "Building",
"fl": "Floor",
"rm": "Room",
"ft": "Fort",
"mt": "Mount",
"hts": "Heights",
"spgs": "Springs",
}
# Address lookup tables. ``_ADDRESS_EXPANSIONS`` is the inverse of the
# dedup-side compression set in ``normalize_address``; ``unit``, ``loop``,
# and ``way`` are deliberately not expanded because they are already the
# long form. The canonical mappings — expansions, compressions, and the
# US state codes/names — live in :mod:`src.core._constants` so both
# modules stay in sync.
from ._constants import (
USPS_EXPANSIONS as _ADDRESS_EXPANSIONS,
USPS_COMPRESSIONS as _ADDRESS_COMPRESSIONS,
US_STATE_CODES as _US_STATE_CODES_SHARED,
US_STATE_NAMES as _US_STATE_NAMES_SHARED,
)
# Short tokens that look like directions but only mean a direction at the
# start or end of an address — never in the middle of a street name. This
@@ -996,58 +987,23 @@ _DIRECTION_TOKENS = {"n", "s", "e", "w", "ne", "nw", "se", "sw"}
_TOKEN_RE = re.compile(r"\w+|[^\w\s]+|\s+")
# 2-letter US state postal codes — preserved verbatim so they don't get
# title-cased into ``Ny``/``Ca`` and don't collide with abbreviation
# entries (``ST`` no longer expands to ``Street`` when the surrounding
# context says it's a state code).
_US_STATE_CODES: set[str] = {
"AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
"HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
"MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
"SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
"DC", "PR", "VI", "GU", "AS", "MP",
# ``ST`` appears as a placeholder state in the corpus fixtures; keep
# it preserved so test rows don't trip the Street collision.
"ST",
}
# Aliases over the shared constants — kept for the local module-level
# reads that already reference these names.
_US_STATE_CODES = _US_STATE_CODES_SHARED
_US_STATE_NAMES = _US_STATE_NAMES_SHARED
# State name → 2-letter postal code. Used when ``state_to_code=True``.
_US_STATE_NAMES: dict[str, str] = {
"alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
"california": "CA", "colorado": "CO", "connecticut": "CT",
"delaware": "DE", "florida": "FL", "georgia": "GA", "hawaii": "HI",
"idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA",
"kansas": "KS", "kentucky": "KY", "louisiana": "LA", "maine": "ME",
"maryland": "MD", "massachusetts": "MA", "michigan": "MI",
"minnesota": "MN", "mississippi": "MS", "missouri": "MO",
"montana": "MT", "nebraska": "NE", "nevada": "NV",
"new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM",
"new york": "NY", "north carolina": "NC", "north dakota": "ND",
"ohio": "OH", "oklahoma": "OK", "oregon": "OR", "pennsylvania": "PA",
"rhode island": "RI", "south carolina": "SC", "south dakota": "SD",
"tennessee": "TN", "texas": "TX", "utah": "UT", "vermont": "VT",
"virginia": "VA", "washington": "WA", "west virginia": "WV",
"wisconsin": "WI", "wyoming": "WY",
"district of columbia": "DC",
}
# Inverse abbreviation table used when ``expand=False`` — compresses
# spelled-out forms back to their USPS abbreviations.
_ADDRESS_COMPRESSIONS: dict[str, str] = {
"street": "St", "avenue": "Ave", "boulevard": "Blvd",
"drive": "Dr", "lane": "Ln", "road": "Rd", "court": "Ct",
"place": "Pl", "circle": "Cir", "trail": "Trl", "terrace": "Ter",
"parkway": "Pkwy", "highway": "Hwy", "expressway": "Expy",
"freeway": "Fwy", "square": "Sq", "alley": "Aly",
"crossing": "Xing", "point": "Pt",
"north": "N", "south": "S", "east": "E", "west": "W",
"northeast": "NE", "northwest": "NW", "southeast": "SE",
"southwest": "SW",
"apartment": "Apt", "suite": "Ste", "building": "Bldg",
"floor": "Fl", "room": "Rm", "fort": "Ft", "mount": "Mt",
"heights": "Hts", "springs": "Spgs",
}
# State-name → postal-code rewrite rules, compiled once at import.
# Each pattern captures the comma and whitespace before the state name
# (group 1) and the ZIP / ZIP+4 after it (group 2), so a substitution can
# keep both and swap only the name. Entries are ordered longest name
# first; names of equal length keep the table's own order.
_STATE_NAME_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    (
        re.compile(
            rf"(,\s*){re.escape(state_name)}(\s+\d{{5}}(?:-\d{{4}})?)",
            re.IGNORECASE,
        ),
        _US_STATE_NAMES[state_name],
    )
    for state_name in sorted(_US_STATE_NAMES, key=len, reverse=True)
]
# PO Box variants normalize to a single canonical form.
_PO_BOX_RE = re.compile(
@@ -1144,15 +1100,10 @@ def standardize_address(
if state_to_code and is_us_shaped:
# Only convert state names in the *state slot* — between a comma
# and a US ZIP — so the city ``New York`` in ``…, New York, NY
# 10001`` is not shortened to ``NY``.
for full, code in sorted(
_US_STATE_NAMES.items(), key=lambda kv: -len(kv[0])
):
pattern = re.compile(
rf"(,\s*){re.escape(full)}(\s+\d{{5}}(?:-\d{{4}})?)",
re.IGNORECASE,
)
s = pattern.sub(rf"\g<1>{code}\g<2>", s)
# 10001`` is not shortened to ``NY``. Patterns are precompiled
# at module load.
for pat, code in _STATE_NAME_PATTERNS:
s = pat.sub(rf"\g<1>{code}\g<2>", s)
if not expand:
# Compression direction is only safe for US-shaped addresses.
@@ -1297,11 +1248,7 @@ def standardize_email(
if not s:
return value, False
def _err(reason: str) -> tuple[str, bool]:
if error_policy == "sentinel":
sentinel = f"<error: {reason}>"
return sentinel, sentinel != value
return value, False
_err = lambda reason: _err_or_passthrough(reason, value, error_policy)
# Multi-email cell — error before we silently pick one.
if _EMAIL_MULTI_RE.search(s) and not s.startswith("<"):
@@ -1583,10 +1530,34 @@ class StandardizeOptions:
known = {f for f in cls.__dataclass_fields__}
kwargs = {k: v for k, v in data.items() if k in known}
column_types = kwargs.get("column_types") or {}
kwargs["column_types"] = {
c: FieldType(t) if not isinstance(t, FieldType) else t
for c, t in column_types.items()
}
try:
kwargs["column_types"] = {
c: FieldType(t) if not isinstance(t, FieldType) else t
for c, t in column_types.items()
}
except ValueError as e:
valid = ", ".join(sorted(t.value for t in FieldType))
raise ValueError(
f"Invalid field type in column_types: {e}. Valid: {valid}"
) from e
# Surface enum-string mismatches early — bad date_order ("xyz")
# would otherwise crash deep inside standardize_date.
for field_name, valid in (
("date_order", {"MDY", "DMY"}),
("phone_format", set(_PHONE_FORMAT_MAP) | {"DIGITS"}),
("currency_decimal", {"dot", "comma"}),
("name_case", {"title", "upper", "lower"}),
("boolean_style", set(_BOOL_OUTPUT)),
("date_error_policy", {"passthrough", "sentinel"}),
("phone_error_policy", {"passthrough", "sentinel"}),
("currency_error_policy", {"passthrough", "sentinel"}),
("email_error_policy", {"passthrough", "sentinel"}),
):
value = kwargs.get(field_name)
if value is not None and value not in valid:
raise ValueError(
f"Invalid {field_name}={value!r}. Valid: {sorted(valid)}"
)
return cls(**kwargs)
def to_dict(self) -> dict:
@@ -1602,7 +1573,20 @@ class StandardizeOptions:
@classmethod
def from_file(cls, path: str | Path) -> StandardizeOptions:
    """Load options from a JSON config file.

    Args:
        path: Location of the JSON document to read.

    Returns:
        The result of ``cls.from_dict`` applied to the parsed JSON.

    Raises:
        OSError: If the file cannot be read. The error keeps the original
            ``errno``, so the concrete subclass (``FileNotFoundError``,
            ``PermissionError``, ...) is preserved for callers that catch
            it specifically. The message names the file path.
        ValueError: If the file is not valid UTF-8 or not valid JSON.
            The message names the file path.
    """
    path = Path(path)
    try:
        # Decode explicitly as UTF-8 rather than relying on the
        # platform's locale encoding.
        text = path.read_text(encoding="utf-8")
    except OSError as e:
        message = (
            f"Could not read StandardizeOptions config from {path}: "
            f"{e.strerror or e}"
        )
        if e.errno is None:
            raise OSError(message) from e
        # Constructing OSError with an errno selects the matching
        # subclass, so the exception type callers see is not downgraded.
        raise OSError(e.errno, message) from e
    except UnicodeDecodeError as e:
        raise ValueError(
            f"StandardizeOptions config {path} is not valid UTF-8: {e}"
        ) from e
    try:
        data = json.loads(text)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Invalid JSON in StandardizeOptions config {path}: {e}"
        ) from e
    return cls.from_dict(data)
@dataclass
@@ -1826,6 +1810,20 @@ def standardize_dataframe(
columns=["row", "column", "field_type", "old", "new"],
)
# Surface a warning when more than 10% of typed cells failed to
# parse — usually means the user mis-typed a column (text marked
# as DATE) or the data is genuinely garbage. Without this, a
# quietly-broken pipeline shows zero changes and silently lets bad
# data flow downstream.
if cells_total > 0 and cells_unparseable / cells_total > 0.1:
logger.warning(
"standardize_dataframe: {}/{} cells ({}%) in typed columns were "
"unparseable — check column_types for mismatches with the data.",
cells_unparseable,
cells_total,
int(100 * cells_unparseable / cells_total),
)
return StandardizeResult(
standardized_df=out,
changes=changes_df,