feat(gate): CSV-normalization gate with confidence-tiered findings

Adds a Review & Normalize page that sits between upload and every tool page. The analyzer now tags each finding with confidence (high/medium/low) and a fix_action; the gate auto-applies high-confidence fixes, surfaces medium/low ones for user review, and blocks tool pages on error-level findings until resolved or waived. Core (src/core/): - analyze.py: Finding gains confidence, fix_action, pre_applied; new detectors for encoding_uncertain, encoding_decode_failed; new top-level encoding_override parameter. - fixes.py: registry of fix algorithms keyed by fix_action id. - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and the NormalizationResult / Decision dataclasses the gate consumes. - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption) and normalizes line endings (fixes bare-CR parser crash); empty file handled gracefully instead of EmptyDataError traceback. GUI (src/gui/): - pages/0_Review.py: gate page with per-finding decision controls, encoding override picker (16 codepages + custom), and Advanced output options (encoding, delimiter, line terminator) on the download. - components.py: require_normalization_gate() helper. - pages/1-9: gate guard wired on every tool page. Test corpora: - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference UTF-8 files + manifest, synced from Business/DataTools. - test-cases/text-cleaner-corpus/test_data/17: synced malformed input (unquoted $1,500.00) for the unquoted-delimiter detector. Tests (94 new): - test_normalize.py (48): finding fields, fix registry, auto_fix scope, decision paths, gate idempotency, output-options helper. - test_encodings_corpus.py (90, 16 xfailed): parametric detection + decode + analyzer-no-crash sweep against the manifest. - test_analyze.py: encoding override + encoding_uncertain detectors. - test_corpus.py: pre-parse repair in the strict reader. 
run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate; encodings corpus added to --fixtures category. Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema, gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds the analyzer JSON schema with the new fields; README links to all of it. Suite: 765 passed, 17 xfailed (was 458 passed). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00
parent e9c490ae1b
commit 82d7fef21e
68 changed files with 2883 additions and 34 deletions
--- a/src/core/fixes.py
+++ b/src/core/fixes.py
@@ -0,0 +1,296 @@
+"""Registry of fix algorithms keyed by ``fix_action`` id.
+
+Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming
+the algorithm that resolves it. The normalize layer dispatches on that id
+into this registry. Each fix function takes a DataFrame plus an optional
+``payload`` dict (for fixes that need user-supplied parameters, e.g. the
+custom null-sentinel list) and returns ``(new_df, n_cells_changed)``.
+
+Fixes here operate on the DataFrame after the byte-level pre-parse repair
+has already run (BOM, NUL, line endings, smart-quote bytes, unquoted
+delimiters). Anything in this layer is reversible from the audit log; a
+lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and
+requires explicit user opt-in via the review page.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any, Callable, Optional
+
+import pandas as pd
+
+from .text_clean import (
+    _SMART_TRANS,
+    _ZERO_WIDTH_RE,
+    _CONTROL_RE,
+    _WHITESPACE_RUN_RE,
+    _looks_structured,
+    strip_bom,
+    normalize_line_endings as _norm_le_str,
+)
+# The package __init__ re-exports the analyze() function under the name
+# `analyze`, which shadows the submodule attribute. Reach the module via
+# sys.modules to get its private constants and FIX_* identifiers.
+import sys as _sys
+import src.core.analyze  # noqa: F401  (registers the submodule)
+_a = _sys.modules["src.core.analyze"]
+
+# NBSP / Unicode-whitespace -> ASCII space. Mirrors the analyzer's
+# detection set (analyze._NBSP_LIKE_CHARS) so what the detector flags is
+# exactly what this fix replaces.
+_NBSP_TRANS = str.maketrans({c: " " for c in _a._NBSP_LIKE_CHARS})
+
+
+# Signature shared by every fix in this module: takes the DataFrame and an
+# optional payload dict, returns ``(new_df, n_cells_changed)``.
+FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]
+
+# Maps a fix_action id to its fix function. Filled in by the @register
+# decorator and read through get_fix() / available_actions().
+_REGISTRY: dict[str, FixFn] = {}
+
+
+def register(action_id: str) -> Callable[[FixFn], FixFn]:
+    """Build a decorator that files a fix function under *action_id*.
+
+    The decorated function is handed back untouched, so it stays callable
+    under its own name. Registering a second function under the same id
+    replaces the earlier entry.
+    """
+    def _bind(fix_fn: FixFn) -> FixFn:
+        _REGISTRY.update({action_id: fix_fn})
+        return fix_fn
+
+    return _bind
+
+
+def get_fix(action_id: str) -> Optional[FixFn]:
+    """Look up the fix function registered for *action_id*.
+
+    Returns ``None`` when nothing has been registered under that id.
+    """
+    try:
+        return _REGISTRY[action_id]
+    except KeyError:
+        return None
+
+
+def available_actions() -> list[str]:
+    """List every registered ``fix_action`` id in ascending sorted order."""
+    action_ids = list(_REGISTRY)
+    action_ids.sort()
+    return action_ids
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _apply_to_strings(
+    df: pd.DataFrame, fn: Callable[[str], str], *, include_headers: bool = False,
+) -> tuple[pd.DataFrame, int]:
+    """Apply *fn* to every string cell of a copy of *df*.
+
+    Only object- or string-dtype columns are visited, and within them only
+    values that are ``str`` instances are passed to *fn*; everything else
+    (NaN, None, numbers, ...) is carried over untouched. A column is written
+    back only when at least one of its cells changed, and it keeps its
+    original dtype, so untouched columns are never re-inferred by pandas.
+
+    Args:
+        df: Input frame; it is copied, never mutated.
+        fn: ``str -> str`` transform applied cell by cell.
+        include_headers: When true, *fn* is also applied to string column
+            labels and every changed label adds one to the count. Off by
+            default so header edits stay with the dedicated header-cleaning
+            fix and are recorded separately from cell edits.
+
+    Returns:
+        ``(new_df, n_changed)`` where ``n_changed`` counts the cells (plus
+        headers, when requested) whose value differs after *fn*.
+    """
+    out = df.copy()
+    changed = 0
+    for col in out.columns:
+        # NOTE(review): assumes column labels are unique — with duplicate
+        # labels ``out[col]`` is a DataFrame, not a Series. Confirm upstream.
+        series = out[col]
+        if not (
+            pd.api.types.is_object_dtype(series)
+            or pd.api.types.is_string_dtype(series)
+        ):
+            continue
+        new_values = [fn(v) if isinstance(v, str) else v for v in series]
+        col_changed = sum(
+            1 for old, new in zip(series, new_values)
+            if isinstance(old, str) and new != old
+        )
+        if not col_changed:
+            # Leave the copied column alone: reassigning an unchanged list
+            # would let pandas re-infer (and possibly change) its dtype.
+            continue
+        changed += col_changed
+        # Reuse the frame's own index so the assignment is purely positional.
+        out[col] = pd.Series(new_values, index=out.index, dtype=series.dtype)
+    if include_headers:
+        new_headers = [fn(h) if isinstance(h, str) else h for h in out.columns]
+        changed += sum(
+            1 for old, new in zip(out.columns, new_headers)
+            if isinstance(old, str) and new != old
+        )
+        out.columns = new_headers
+    return out, changed
+
+
+# ---------------------------------------------------------------------------
+# High-confidence fixes
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_TRIM_WHITESPACE)
+def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Trim both ends of every string cell and squeeze inner whitespace runs.
+
+    Cells that ``_looks_structured`` accepts (numeric/date/phone-shaped
+    values such as `1 234` or `(555) 123-4567`) are only trimmed at the
+    ends, because their internal spacing is often meaningful. Cells that
+    are empty after trimming are returned as the empty string. *payload*
+    is accepted for signature uniformity and ignored.
+    """
+    def _tidy(cell: str) -> str:
+        core = cell.strip()
+        # Short-circuit keeps the structure check off empty strings.
+        collapse = bool(core) and not _looks_structured(core)
+        return _WHITESPACE_RUN_RE.sub(" ", core) if collapse else core
+
+    return _apply_to_strings(df, _tidy)
+
+
+@register(_a.FIX_STRIP_NBSP)
+def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Swap NBSP-like Unicode spaces in string cells for a plain ASCII space.
+
+    The affected characters are exactly those in ``_NBSP_TRANS``. *payload*
+    is accepted for signature uniformity and ignored.
+    """
+    return _apply_to_strings(df, lambda cell: cell.translate(_NBSP_TRANS))
+
+
+@register(_a.FIX_STRIP_ZERO_WIDTH)
+def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Delete zero-width / invisible characters from every string cell.
+
+    The character set is whatever ``_ZERO_WIDTH_RE`` matches. *payload* is
+    accepted for signature uniformity and ignored.
+    """
+    return _apply_to_strings(df, lambda cell: _ZERO_WIDTH_RE.sub("", cell))
+
+
+@register(_a.FIX_FOLD_SMART_PUNCT)
+def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Fold typographic punctuation in string cells down to ASCII.
+
+    Covers curly quotes, em/en dashes, ellipsis and primes via the
+    ``_SMART_TRANS`` translation table. *payload* is accepted for
+    signature uniformity and ignored.
+    """
+    return _apply_to_strings(df, lambda cell: cell.translate(_SMART_TRANS))
+
+
+@register(_a.FIX_CLEAN_HEADERS)
+def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Run the per-cell hygiene steps over the column headers only.
+
+    Each string header goes through, in order: BOM strip, NBSP-like ->
+    space, zero-width removal, smart-punctuation fold, control-character
+    removal, and an outer trim. Non-string headers are kept as they are.
+    This is what removes the ``df['Email']`` vs ``df['Email ']`` class of
+    bug. Cell values are not touched.
+
+    Returns ``(new_df, n_headers_changed)``; *df* itself is not mutated.
+    *payload* is accepted for signature uniformity and ignored.
+    """
+    def _scrub(label: str) -> str:
+        label = strip_bom(label).translate(_NBSP_TRANS)
+        label = _ZERO_WIDTH_RE.sub("", label).translate(_SMART_TRANS)
+        return _CONTROL_RE.sub("", label).strip()
+
+    out = df.copy()
+    cleaned = [_scrub(h) if isinstance(h, str) else h for h in out.columns]
+    # Count only string headers; non-string labels are passed through as-is.
+    changed = sum(
+        1 for before, after in zip(out.columns, cleaned)
+        if isinstance(before, str) and after != before
+    )
+    out.columns = cleaned
+    return out, changed
+
+
+@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
+def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Convert CRLF and bare CR found inside string cells to LF.
+
+    Line endings of the file as a whole are already dealt with by
+    ``repair_bytes`` ahead of parsing, so this only matters for embedded
+    multi-line cells (case 11 in the corpus). *payload* is accepted for
+    signature uniformity and ignored.
+    """
+    fixed, n_changed = _apply_to_strings(df, _norm_le_str)
+    return fixed, n_changed
+
+
+# ---------------------------------------------------------------------------
+# Already-applied fixes (no-op at this layer; kept so the audit log is
+# uniform and the gate can reason about them)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_STRIP_BOM)
+def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Placeholder for BOM stripping, which repair_bytes performs at read time.
+
+    Returns the very same *df* object (no copy) with a change count of 0.
+    """
+    return (df, 0)
+
+
+@register(_a.FIX_STRIP_NUL)
+def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Placeholder for NUL stripping, which repair_bytes performs at read time.
+
+    Returns the very same *df* object (no copy) with a change count of 0.
+    """
+    return (df, 0)
+
+
+@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
+def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Placeholder for the byte-level smart-quote fold done in repair_bytes.
+
+    Returns the very same *df* object (no copy) with a change count of 0.
+    """
+    return (df, 0)
+
+
+@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
+def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Placeholder for the per-row delimiter repair done in repair_bytes.
+
+    Returns the very same *df* object (no copy) with a change count of 0.
+    """
+    return (df, 0)
+
+
+# ---------------------------------------------------------------------------
+# Medium-confidence fixes (require user confirmation in the review flow)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_LOWERCASE_EMAIL)
+def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Lowercase the string values of the email column(s).
+
+    With ``payload['column']`` present, only that column is processed; a
+    name that is not in the frame is skipped silently. Without it, every
+    column whose string name matches the analyzer's ``_EMAIL_LIKE_COL``
+    pattern is processed. Non-string cells are left as they are.
+
+    Returns ``(new_df, n_cells_changed)``; *df* itself is not mutated.
+    """
+    payload = payload or {}
+    out = df.copy()
+    if "column" in payload:
+        wanted = [payload["column"]]
+    else:
+        wanted = [
+            name for name in out.columns
+            if isinstance(name, str) and _a._EMAIL_LIKE_COL.search(name)
+        ]
+    changed = 0
+    for name in wanted:
+        if name not in out.columns:
+            continue
+        lowered = [v.lower() if isinstance(v, str) else v for v in out[name]]
+        changed += sum(
+            1 for before, after in zip(out[name], lowered)
+            if isinstance(before, str) and after != before
+        )
+        out[name] = lowered
+    return out, changed
+
+
+@register(_a.FIX_REPLACE_NULL_SENTINELS)
+def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Replace user-approved null-like sentinel strings with empty string.
+
+    Payload: ``{"sentinels": ["N/A", "n/a", "nan", ...]}``. A single bare
+    string (``{"sentinels": "N/A"}``) is treated as one sentinel rather
+    than being split into characters, and non-string entries are ignored
+    because only string cells are ever compared. When the key is missing
+    or ``None`` the analyzer's built-in ``_NULL_LIKE`` set is used.
+    Comparison is case-insensitive and whitespace-trimmed on both sides.
+
+    Returns ``(new_df, n_cells_changed)``; *df* itself is not mutated.
+    """
+    payload = payload or {}
+    sentinels = payload.get("sentinels")
+    if sentinels is None:
+        sentinels = list(_a._NULL_LIKE)
+    elif isinstance(sentinels, str):
+        # Iterating a str yields characters; wrap it so "N/A" stays whole.
+        sentinels = [sentinels]
+    sentinel_set = {s.strip().lower() for s in sentinels if isinstance(s, str)}
+
+    def fix(s: str) -> str:
+        return "" if s.strip().lower() in sentinel_set else s
+
+    return _apply_to_strings(df, fix)
+
+
+# ---------------------------------------------------------------------------
+# Low-confidence fixes (off by default; user-only)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_REPAIR_MOJIBAKE)
+def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Repair UTF-8-read-as-cp1252 mojibake in string cells using ftfy.
+
+    ftfy is an optional dependency imported on demand. When it is not
+    installed the function returns ``(df, 0)`` — the same frame object,
+    zero changes — instead of falling back to a hand-rolled heuristic
+    that could silently corrupt data (the review page is expected to tell
+    the user that the library is missing). *payload* is ignored.
+    """
+    try:
+        import ftfy  # type: ignore
+    except ImportError:
+        # Optional dependency absent: hand the input back untouched.
+        return df, 0
+    return _apply_to_strings(df, lambda cell: ftfy.fix_text(cell))