"""Registry of fix algorithms keyed by ``fix_action`` id. Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming the algorithm that resolves it. The normalize layer dispatches on that id into this registry. Each fix function takes a DataFrame plus an optional ``payload`` dict (for fixes that need user-supplied parameters, e.g. the custom null-sentinel list) and returns ``(new_df, n_cells_changed)``. Fixes here operate on the DataFrame after the byte-level pre-parse repair has already run (BOM, NUL, line endings, smart-quote bytes, unquoted delimiters). Anything in this layer is reversible from the audit log; a lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and requires explicit user opt-in via the review page. """ from __future__ import annotations import re import unicodedata from typing import Any, Callable, Optional import pandas as pd from .text_clean import ( _SMART_TRANS, _ZERO_WIDTH_RE, _CONTROL_RE, _WHITESPACE_RUN_RE, _looks_structured, strip_bom, normalize_line_endings as _norm_le_str, ) # The package __init__ re-exports the analyze() function under the name # `analyze`, which shadows the submodule attribute. Reach the module via # sys.modules to get its private constants and FIX_* identifiers. import sys as _sys import src.core.analyze # noqa: F401 (registers the submodule) _a = _sys.modules["src.core.analyze"] # NBSP / Unicode-whitespace -> ASCII space. Mirrors the analyzer's # detection set (analyze._NBSP_LIKE_CHARS) so what the detector flags is # exactly what this fix replaces. 
_NBSP_TRANS = str.maketrans({c: " " for c in _a._NBSP_LIKE_CHARS})

# Signature shared by every registered fix:
# ``(df, payload) -> (new_df, n_cells_changed)``.
FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]

# fix_action id -> implementation; filled in at import time by ``@register``.
_REGISTRY: dict[str, FixFn] = {}


def register(action_id: str) -> Callable[[FixFn], FixFn]:
    """Return a decorator that files the decorated fix under *action_id*.

    The function itself is returned unchanged, so it stays directly
    callable. Registering the same id twice overwrites the earlier entry.
    """

    def deco(fn: FixFn) -> FixFn:
        _REGISTRY[action_id] = fn
        return fn

    return deco


def get_fix(action_id: str) -> Optional[FixFn]:
    """Return the fix registered for *action_id*, or ``None`` if unknown."""
    return _REGISTRY.get(action_id)


def available_actions() -> list[str]:
    """Return all registered action ids, sorted."""
    return sorted(_REGISTRY)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _map_str_values(
    series: pd.Series,
    fn: Callable[[str], str],
) -> tuple[Optional[pd.Series], int]:
    """Apply *fn* to every ``str`` value of *series*.

    Returns ``(new_series, n_changed)``. ``new_series`` is ``None`` when no
    value changed, so callers can leave the original column untouched.
    Non-string values (NaN, None, numbers, ...) are passed through as-is.
    """
    values = []
    changed = 0
    for v in series:
        if isinstance(v, str):
            nv = fn(v)
            if nv != v:
                changed += 1
            values.append(nv)
        else:
            values.append(v)
    if not changed:
        return None, 0
    # Rebuild with the original dtype and index. Assigning a bare list to a
    # DataFrame column makes pandas re-infer the dtype, which can turn None
    # into NaN or change object/string columns into another dtype.
    return pd.Series(values, index=series.index, dtype=series.dtype), changed


def _map_headers(columns: pd.Index, fn: Callable[[str], str]) -> tuple[list, int]:
    """Apply *fn* to every ``str`` header; return ``(new_headers, n_changed)``.

    Non-string labels are kept unchanged.
    """
    new_headers = []
    changed = 0
    for h in columns:
        if isinstance(h, str):
            nh = fn(h)
            if nh != h:
                changed += 1
            new_headers.append(nh)
        else:
            new_headers.append(h)
    return new_headers, changed


def _apply_to_strings(
    df: pd.DataFrame,
    fn: Callable[[str], str],
    *,
    include_headers: bool = False,
) -> tuple[pd.DataFrame, int]:
    """Apply *fn* to every string cell. Returns (new_df, cells_changed).

    Only object- and string-dtype columns are visited; within them only
    ``str`` values are passed to *fn*. The input frame is never mutated.

    Headers are left alone unless *include_headers* is true. By default the
    dedicated header-cleaning fix owns that scope so the gate's audit log
    records header changes separately. When headers are included, each
    changed header adds one to the returned count.
    """
    out = df.copy()
    changed = 0
    for col in out.columns:
        column = out[col]
        # Duplicate labels make ``out[col]`` a DataFrame; there is no single
        # column to rewrite, so such labels are skipped.
        if not isinstance(column, pd.Series):
            continue
        if not (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        ):
            continue
        new_column, n = _map_str_values(column, fn)
        if new_column is not None:
            # Write back only columns that really changed, so untouched
            # columns keep their exact values and dtype.
            out[col] = new_column
            changed += n
    if include_headers:
        new_headers, n_headers = _map_headers(out.columns, fn)
        changed += n_headers
        out.columns = new_headers
    return out, changed


# ---------------------------------------------------------------------------
# High-confidence fixes
# ---------------------------------------------------------------------------


@register(_a.FIX_TRIM_WHITESPACE)
def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Strip leading/trailing whitespace; collapse internal runs in text cells.

    Numeric/date/phone-shaped cells get only outer trim — internal spacing
    in those is often semantic (`1 234`, `(555) 123-4567`).

    *payload* is accepted for interface uniformity and ignored.
    """

    def fix(s: str) -> str:
        trimmed = s.strip()
        # Empty and structured-looking values only get the outer trim.
        if not trimmed or _looks_structured(trimmed):
            return trimmed
        return _WHITESPACE_RUN_RE.sub(" ", trimmed)

    return _apply_to_strings(df, fix)


@register(_a.FIX_STRIP_NBSP)
def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Replace NBSP and other Unicode spaces with ASCII space.

    *payload* is accepted for interface uniformity and ignored.
    """

    def fix(s: str) -> str:
        return s.translate(_NBSP_TRANS)

    return _apply_to_strings(df, fix)


@register(_a.FIX_STRIP_ZERO_WIDTH)
def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Remove zero-width and invisible characters from cells.

    *payload* is accepted for interface uniformity and ignored.
    """

    def fix(s: str) -> str:
        return _ZERO_WIDTH_RE.sub("", s)

    return _apply_to_strings(df, fix)


@register(_a.FIX_FOLD_SMART_PUNCT)
def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """ASCII-fy curly quotes, em/en dashes, ellipsis, primes.

    *payload* is accepted for interface uniformity and ignored.
    """

    def fix(s: str) -> str:
        return s.translate(_SMART_TRANS)

    return _apply_to_strings(df, fix)


@register(_a.FIX_CLEAN_HEADERS)
def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Apply the same per-cell hygiene to column headers.

    Fixes the df['Email'] vs df['Email '] class of bug.

    Only ``str`` headers are cleaned; other labels are kept unchanged. The
    returned count is the number of headers that changed. *payload* is
    accepted for interface uniformity and ignored.
    """

    def fix(s: str) -> str:
        s = strip_bom(s)
        s = s.translate(_NBSP_TRANS)
        s = _ZERO_WIDTH_RE.sub("", s)
        s = s.translate(_SMART_TRANS)
        s = _CONTROL_RE.sub("", s)
        return s.strip()

    out = df.copy()
    new_headers, changed = _map_headers(out.columns, fix)
    out.columns = new_headers
    return out, changed


@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Normalize CRLF / bare CR inside cells to LF.

    File-level line endings are handled by ``repair_bytes`` before parsing;
    this fix covers embedded multi-line cells (case 11 in the corpus).

    *payload* is accepted for interface uniformity and ignored.
    """
    return _apply_to_strings(df, _norm_le_str)


# ---------------------------------------------------------------------------
# Already-applied fixes (no-op at this layer; kept so the audit log is
# uniform and the gate can reason about them)
# ---------------------------------------------------------------------------


@register(_a.FIX_STRIP_BOM)
def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """BOM is stripped during read by repair_bytes; nothing to do here.

    Returns the input frame itself (not a copy) and a change count of 0.
    """
    return df, 0


@register(_a.FIX_STRIP_NUL)
def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """NUL is stripped during read by repair_bytes.

    Returns the input frame itself (not a copy) and a change count of 0.
    """
    return df, 0


@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Byte-level smart-quote fold runs in repair_bytes.

    Returns the input frame itself (not a copy) and a change count of 0.
    """
    return df, 0


@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Per-row delimiter repair runs in repair_bytes.

    Returns the input frame itself (not a copy) and a change count of 0.
    """
    return df, 0


# ---------------------------------------------------------------------------
# Medium-confidence fixes (require user confirmation in the review flow)
# ---------------------------------------------------------------------------


@register(_a.FIX_LOWERCASE_EMAIL)
def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Lowercase values in the column named in *payload['column']*.

    Defaults to lowercasing every column whose name matches the email
    heuristic if no payload is given.

    A requested column that does not exist is ignored. Only ``str`` values
    are lowercased; the returned count is the number of cells that changed.
    """
    out = df.copy()
    payload = payload or {}
    target_cols: list[str]
    if "column" in payload:
        target_cols = [payload["column"]]
    else:
        target_cols = [
            c for c in out.columns
            if isinstance(c, str) and _a._EMAIL_LIKE_COL.search(c)
        ]
    changed = 0
    for col in target_cols:
        if col not in out.columns:
            continue
        column = out[col]
        # Duplicate labels make ``out[col]`` a DataFrame; iterating it would
        # yield column labels rather than cells, so skip instead of
        # corrupting data or raising on the write-back.
        if not isinstance(column, pd.Series):
            continue
        new_column, n = _map_str_values(column, str.lower)
        if new_column is not None:
            out[col] = new_column
            changed += n
    return out, changed


@register(_a.FIX_REPLACE_NULL_SENTINELS)
def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Replace user-approved null-like sentinel strings with empty string.

    Payload: ``{"sentinels": ["N/A", "n/a", "nan", ...]}``. Defaults to
    the analyzer's built-in set when no payload is given. Comparison is
    case-insensitive, whitespace-trimmed.

    A single string is accepted as shorthand for a one-element list.
    Non-string entries are ignored, since only string cells are compared.
    """
    payload = payload or {}
    sentinels = payload.get("sentinels")
    if sentinels is None:
        sentinels = list(_a._NULL_LIKE)
    elif isinstance(sentinels, str):
        # A bare string would otherwise be iterated character by character,
        # turning "N/A" into the sentinels "n", "/" and "a".
        sentinels = [sentinels]
    sentinel_set = {s.strip().lower() for s in sentinels if isinstance(s, str)}

    def fix(s: str) -> str:
        return "" if s.strip().lower() in sentinel_set else s

    return _apply_to_strings(df, fix)


# ---------------------------------------------------------------------------
# Low-confidence fixes (off by default; user-only)
# ---------------------------------------------------------------------------


@register(_a.FIX_REPAIR_MOJIBAKE)
def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Heuristic UTF-8-as-cp1252 mojibake repair via ftfy when available.

    Falls back to a no-op (returning ``(df, 0)``) when ftfy is not
    installed; the review page surfaces that as "library missing — install
    ftfy to enable" so we never silently corrupt data with a hand-rolled
    heuristic.

    *payload* is accepted for interface uniformity and ignored.
    """
    try:
        # Optional dependency, imported lazily so the module loads without it.
        import ftfy  # type: ignore
    except ImportError:
        return df, 0

    def fix(s: str) -> str:
        return ftfy.fix_text(s)

    return _apply_to_strings(df, fix)