Files
datatools-dev/src/core/fixes.py
Michael 82d7fef21e feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.

Core (src/core/):
  - analyze.py: Finding gains confidence, fix_action, pre_applied; new
    detectors for encoding_uncertain, encoding_decode_failed; new top-
    level encoding_override parameter.
  - fixes.py: registry of fix algorithms keyed by fix_action id.
  - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
    the NormalizationResult / Decision dataclasses the gate consumes.
  - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
    transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
    and normalizes line endings (fixes bare-CR parser crash); empty file
    handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
  - pages/0_Review.py: gate page with per-finding decision controls,
    encoding override picker (16 codepages + custom), and Advanced output
    options (encoding, delimiter, line terminator) on the download.
  - components.py: require_normalization_gate() helper.
  - pages/1-9: gate guard wired on every tool page.

Test corpora:
  - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
    UTF-8 files + manifest, synced from Business/DataTools.
  - test-cases/text-cleaner-corpus/test_data/17: synced malformed input
    (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
  - test_normalize.py (48): finding fields, fix registry, auto_fix scope,
    decision paths, gate idempotency, output-options helper.
  - test_encodings_corpus.py (90, 16 xfailed): parametric detection +
    decode + analyzer-no-crash sweep against the manifest.
  - test_analyze.py: encoding override + encoding_uncertain detectors.
  - test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00

297 lines
10 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Registry of fix algorithms keyed by ``fix_action`` id.
Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming
the algorithm that resolves it. The normalize layer dispatches on that id
into this registry. Each fix function takes a DataFrame plus an optional
``payload`` dict (for fixes that need user-supplied parameters, e.g. the
custom null-sentinel list) and returns ``(new_df, n_cells_changed)``.
Fixes here operate on the DataFrame after the byte-level pre-parse repair
has already run (BOM, NUL, line endings, smart-quote bytes, unquoted
delimiters). Anything in this layer is reversible from the audit log; a
lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and
requires explicit user opt-in via the review page.
"""
from __future__ import annotations
import re
import unicodedata
from typing import Any, Callable, Optional
import pandas as pd
from .text_clean import (
_SMART_TRANS,
_ZERO_WIDTH_RE,
_CONTROL_RE,
_WHITESPACE_RUN_RE,
_looks_structured,
strip_bom,
normalize_line_endings as _norm_le_str,
)
# The package __init__ re-exports the analyze() function under the name
# `analyze`, which shadows the submodule attribute. Reach the module via
# sys.modules to get its private constants and FIX_* identifiers.
import sys as _sys
# Imported only for its side effect: once this statement has run, the
# submodule is guaranteed to be present in sys.modules for the lookup below.
import src.core.analyze # noqa: F401 (registers the submodule)
# `_a` is the analyze *module* object, not the re-exported function; the rest
# of this file reads the FIX_* ids and private constants through it.
_a = _sys.modules["src.core.analyze"]
# Translation table that turns every NBSP-like / Unicode-whitespace character
# into a plain ASCII space. It is built from the analyzer's own detection set
# (analyze._NBSP_LIKE_CHARS), so the fix replaces exactly the characters the
# detector flags.
_NBSP_TRANS = str.maketrans(dict.fromkeys(_a._NBSP_LIKE_CHARS, " "))
# Signature shared by every fix: (frame, optional payload) -> (frame, cells changed).
FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]

# fix_action id -> implementing function; filled in by the @register decorator.
_REGISTRY: dict[str, FixFn] = {}


def register(action_id: str) -> Callable[[FixFn], FixFn]:
    """Build a decorator that files a fix function under *action_id*.

    The decorated function is handed back unchanged, so it stays callable
    under its own name. Registering a second function under an id that is
    already present replaces the earlier entry.
    """

    def _store(fix_fn: FixFn) -> FixFn:
        _REGISTRY.update({action_id: fix_fn})
        return fix_fn

    return _store
def get_fix(action_id: str) -> Optional[FixFn]:
    """Look up the fix registered under *action_id*.

    Returns the registered function, or ``None`` when no fix has been
    registered under that id.
    """
    try:
        return _REGISTRY[action_id]
    except KeyError:
        return None
def available_actions() -> list[str]:
    """Return every registered ``fix_action`` id, sorted ascending."""
    action_ids = list(_REGISTRY)
    action_ids.sort()
    return action_ids
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _apply_to_strings(
df: pd.DataFrame, fn: Callable[[str], str], *, include_headers: bool = False,
) -> tuple[pd.DataFrame, int]:
"""Apply *fn* to every string cell. Returns (new_df, cells_changed).
Headers are not touched here — the dedicated header-cleaning fix owns
that scope so the gate's audit log records header changes separately.
"""
out = df.copy()
changed = 0
for col in out.columns:
if not pd.api.types.is_object_dtype(out[col]) and not pd.api.types.is_string_dtype(out[col]):
continue
new_col = []
for v in out[col]:
if isinstance(v, str):
nv = fn(v)
if nv != v:
changed += 1
new_col.append(nv)
else:
new_col.append(v)
out[col] = new_col
if include_headers:
new_headers = []
for h in out.columns:
if isinstance(h, str):
nh = fn(h)
if nh != h:
changed += 1
new_headers.append(nh)
else:
new_headers.append(h)
out.columns = new_headers
return out, changed
# ---------------------------------------------------------------------------
# High-confidence fixes
# ---------------------------------------------------------------------------
@register(_a.FIX_TRIM_WHITESPACE)
def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Trim outer whitespace from every cell and squeeze inner runs in text.

    Each string cell is stripped first. A cell that is empty after stripping,
    or that ``_looks_structured`` recognises (numeric/date/phone-shaped
    values, where inner spacing is often meaningful: `1 234`,
    `(555) 123-4567`), keeps its inner spacing. Every other cell has each
    match of ``_WHITESPACE_RUN_RE`` replaced by a single space.

    *payload* is accepted for interface uniformity and not used.
    Returns ``(new_df, n_cells_changed)``.
    """

    def _tidy(cell: str) -> str:
        stripped = cell.strip()
        if stripped and not _looks_structured(stripped):
            return _WHITESPACE_RUN_RE.sub(" ", stripped)
        return stripped

    return _apply_to_strings(df, _tidy)
@register(_a.FIX_STRIP_NBSP)
def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Swap NBSP-like characters in every string cell for an ASCII space.

    The set of characters replaced is whatever ``_NBSP_TRANS`` maps.
    *payload* is not used. Returns ``(new_df, n_cells_changed)``.
    """
    return _apply_to_strings(df, lambda text: text.translate(_NBSP_TRANS))
@register(_a.FIX_STRIP_ZERO_WIDTH)
def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Delete every ``_ZERO_WIDTH_RE`` match (zero-width / invisible
    characters) from each string cell.

    *payload* is not used. Returns ``(new_df, n_cells_changed)``.
    """

    def _drop_invisible(cell: str) -> str:
        cleaned = _ZERO_WIDTH_RE.sub("", cell)
        return cleaned

    return _apply_to_strings(df, _drop_invisible)
@register(_a.FIX_FOLD_SMART_PUNCT)
def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Fold typographic punctuation in every string cell to ASCII.

    Each cell is translated through ``_SMART_TRANS`` (the shared table for
    curly quotes, em/en dashes, ellipsis and primes). *payload* is not used.
    Returns ``(new_df, n_cells_changed)``.
    """

    def _to_ascii_punct(cell: str) -> str:
        folded = cell.translate(_SMART_TRANS)
        return folded

    return _apply_to_strings(df, _to_ascii_punct)
@register(_a.FIX_CLEAN_HEADERS)
def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Run the cell-level hygiene steps over the column labels.

    Guards against the df['Email'] vs df['Email '] class of bug. String
    labels go through, in order: BOM strip, NBSP-like -> space, zero-width
    removal, smart-punctuation fold, control-character removal, and an outer
    ``strip()``. Non-string labels are kept as they are.

    *payload* is not used. Returns ``(new_df, n_labels_changed)``; cell
    values are not touched and the input frame is not modified.
    """

    def _scrub(label: str) -> str:
        label = strip_bom(label)
        label = label.translate(_NBSP_TRANS)
        label = _ZERO_WIDTH_RE.sub("", label)
        label = label.translate(_SMART_TRANS)
        label = _CONTROL_RE.sub("", label)
        return label.strip()

    out = df.copy()
    original_labels = list(out.columns)
    cleaned_labels = [
        _scrub(label) if isinstance(label, str) else label
        for label in original_labels
    ]
    # Only string labels can count as changed; the isinstance guard also
    # keeps NaN labels (which never compare equal to themselves) out of it.
    changed = sum(
        1
        for before, after in zip(original_labels, cleaned_labels)
        if isinstance(before, str) and after != before
    )
    out.columns = cleaned_labels
    return out, changed
@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Convert CRLF / bare CR inside cell values to LF.

    ``repair_bytes`` already normalizes file-level line endings before
    parsing, so this fix targets line breaks embedded in multi-line cells
    (case 11 in the corpus). Each string cell is passed through the
    text-clean module's ``normalize_line_endings`` helper.

    *payload* is not used. Returns ``(new_df, n_cells_changed)``.
    """
    normalized, n_changed = _apply_to_strings(df, _norm_le_str)
    return normalized, n_changed
# ---------------------------------------------------------------------------
# Already-applied fixes (no-op at this layer; kept so the audit log is
# uniform and the gate can reason about them)
# ---------------------------------------------------------------------------
@register(_a.FIX_STRIP_BOM)
def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for BOM stripping, which repair_bytes performs at read time.

    Returns the frame it was given (same object, no copy) with a change
    count of 0.
    """
    return (df, 0)
@register(_a.FIX_STRIP_NUL)
def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for NUL stripping, which repair_bytes performs at read time.

    Returns the frame it was given (same object, no copy) with a change
    count of 0.
    """
    return (df, 0)
@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for the byte-level smart-quote fold done in repair_bytes.

    Returns the frame it was given (same object, no copy) with a change
    count of 0.
    """
    return (df, 0)
@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for the per-row delimiter repair done in repair_bytes.

    Returns the frame it was given (same object, no copy) with a change
    count of 0.
    """
    return (df, 0)
# ---------------------------------------------------------------------------
# Medium-confidence fixes (require user confirmation in the review flow)
# ---------------------------------------------------------------------------
@register(_a.FIX_LOWERCASE_EMAIL)
def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Lowercase the string cells of email columns.

    Payload: ``{"column": <label>}`` restricts the fix to the column(s)
    carrying that label; a label that is absent from the frame is ignored.
    Without a ``column`` key, every string-labelled column whose name
    matches the analyzer's ``_EMAIL_LIKE_COL`` pattern is processed.

    Columns are addressed by position, so a frame with a repeated label
    (e.g. after header cleaning merges "Email" and "Email ") has each
    matching column lowercased on its own. Looking such a column up by
    label would return a DataFrame and break the cell loop. A column is
    written back only when at least one cell changed. Non-string cells are
    passed through untouched.

    Returns ``(new_df, n_cells_changed)``; the input frame is not modified.
    """
    out = df.copy()
    payload = payload or {}
    if "column" in payload:
        requested = payload["column"]
        positions = [
            pos for pos, label in enumerate(out.columns) if label == requested
        ]
    else:
        positions = [
            pos for pos, label in enumerate(out.columns)
            if isinstance(label, str) and _a._EMAIL_LIKE_COL.search(label)
        ]
    changed = 0
    for pos in positions:
        series = out.iloc[:, pos]
        lowered = [v.lower() if isinstance(v, str) else v for v in series]
        # The isinstance guard keeps NaN cells (never equal to themselves)
        # from being counted as changes.
        col_changed = sum(
            1
            for before, after in zip(series, lowered)
            if isinstance(before, str) and after != before
        )
        if col_changed:
            # Positional replacement; safe when labels are duplicated.
            out.isetitem(pos, lowered)
            changed += col_changed
    return out, changed
@register(_a.FIX_REPLACE_NULL_SENTINELS)
def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Replace user-approved null-like sentinel strings with empty string.

    Payload: ``{"sentinels": ["N/A", "n/a", "nan", ...]}``. When the key is
    missing or ``None``, the analyzer's built-in set (``_NULL_LIKE``) is
    used. A bare string is treated as a single sentinel rather than being
    split into its characters, and non-string entries are ignored.

    Comparison is case-insensitive and whitespace-trimmed on both sides: a
    cell is blanked when its stripped, lowercased form equals a stripped,
    lowercased sentinel.

    Returns ``(new_df, n_cells_changed)``.
    """
    payload = payload or {}
    sentinels = payload.get("sentinels")
    if sentinels is None:
        sentinels = list(_a._NULL_LIKE)
    elif isinstance(sentinels, str):
        # Iterating a str yields characters; "N/A" must not become
        # the three sentinels "n", "/" and "a".
        sentinels = [sentinels]
    sentinel_set = {s.strip().lower() for s in sentinels if isinstance(s, str)}

    def fix(s: str) -> str:
        return "" if s.strip().lower() in sentinel_set else s

    return _apply_to_strings(df, fix)
# ---------------------------------------------------------------------------
# Low-confidence fixes (off by default; user-only)
# ---------------------------------------------------------------------------
@register(_a.FIX_REPAIR_MOJIBAKE)
def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Repair UTF-8-as-cp1252 mojibake by running cells through ftfy.

    ftfy is imported lazily. When it is not installed the function does
    nothing and returns ``(df, 0)`` with the original frame object. The
    review page surfaces that as "library missing — install ftfy to enable",
    so data is never silently corrupted by a hand-rolled heuristic.
    Otherwise every string cell is passed to ``ftfy.fix_text``.

    *payload* is not used. Returns ``(new_df, n_cells_changed)``.
    """
    try:
        import ftfy  # type: ignore
    except ImportError:
        # Optional dependency missing: leave the data exactly as it is.
        return df, 0
    else:
        return _apply_to_strings(df, ftfy.fix_text)