datatools-dev/src/core/fixes.py

"""Registry of fix algorithms keyed by ``fix_action`` id.

Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming
the algorithm that resolves it. The normalize layer dispatches on that id
into this registry. Each fix function takes a DataFrame plus an optional
``payload`` dict (for fixes that need user-supplied parameters, e.g. the
custom null-sentinel list) and returns ``(new_df, n_cells_changed)``.

Fixes here operate on the DataFrame after the byte-level pre-parse repair
has already run (BOM, NUL, line endings, smart-quote bytes, unquoted
delimiters). Anything in this layer is reversible from the audit log; a
lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and
requires explicit user opt-in via the review page.
"""

from __future__ import annotations

import re
import unicodedata
from typing import Any, Callable, Optional

import pandas as pd

from .text_clean import (
    _SMART_TRANS,
    _ZERO_WIDTH_RE,
    _CONTROL_RE,
    _WHITESPACE_RUN_RE,
    _looks_structured,
    strip_bom,
    normalize_line_endings as _norm_le_str,
)
# The package __init__ re-exports the analyze() function under the name
# `analyze`, which shadows the submodule attribute. Reach the module via
# sys.modules to get its private constants and FIX_* identifiers.
import sys as _sys
import src.core.analyze  # noqa: F401  (registers the submodule)
_a = _sys.modules["src.core.analyze"]

# NBSP / Unicode-whitespace -> ASCII space. Mirrors the analyzer's
# detection set (analyze._NBSP_LIKE_CHARS) so what the detector flags is
# exactly what this fix replaces. Built once at import time; each listed
# character maps to a single ASCII space when passed to str.translate.
_NBSP_TRANS = str.maketrans({c: " " for c in _a._NBSP_LIKE_CHARS})


# Signature every registered fix implements:
# (df, optional payload dict) -> (new_df, n_cells_changed).
FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]

# Maps a fix_action id to its fix function. Populated by the ``register``
# decorator as the decorated definitions in this module execute.
_REGISTRY: dict[str, FixFn] = {}


def register(action_id: str) -> Callable[[FixFn], FixFn]:
    """Return a decorator that files a fix function under *action_id*.

    The decorated function is stored in the module registry and handed
    back unchanged, so it stays callable under its own name. Registering
    the same id again replaces the earlier entry.
    """
    def _bind(fix_fn: FixFn) -> FixFn:
        _REGISTRY.update({action_id: fix_fn})
        return fix_fn

    return _bind


def get_fix(action_id: str) -> Optional[FixFn]:
    """Look up the fix registered for *action_id*; ``None`` when unknown."""
    try:
        return _REGISTRY[action_id]
    except KeyError:
        return None


def available_actions() -> list[str]:
    """Return every registered ``fix_action`` id in sorted order."""
    action_ids = list(_REGISTRY.keys())
    action_ids.sort()
    return action_ids


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _apply_to_strings(
    df: pd.DataFrame, fn: Callable[[str], str], *, include_headers: bool = False,
) -> tuple[pd.DataFrame, int]:
    """Apply *fn* to every string cell. Returns (new_df, cells_changed).

    Headers are not touched here — the dedicated header-cleaning fix owns
    that scope so the gate's audit log records header changes separately.
    """
    out = df.copy()
    changed = 0
    for col in out.columns:
        if not pd.api.types.is_object_dtype(out[col]) and not pd.api.types.is_string_dtype(out[col]):
            continue
        new_col = []
        for v in out[col]:
            if isinstance(v, str):
                nv = fn(v)
                if nv != v:
                    changed += 1
                new_col.append(nv)
            else:
                new_col.append(v)
        out[col] = new_col
    if include_headers:
        new_headers = []
        for h in out.columns:
            if isinstance(h, str):
                nh = fn(h)
                if nh != h:
                    changed += 1
                new_headers.append(nh)
            else:
                new_headers.append(h)
        out.columns = new_headers
    return out, changed


# ---------------------------------------------------------------------------
# High-confidence fixes
# ---------------------------------------------------------------------------

@register(_a.FIX_TRIM_WHITESPACE)
def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Trim outer whitespace on every string cell and squeeze inner runs.

    Cells that are empty after trimming, or that ``_looks_structured``
    flags (numeric/date/phone-shaped values such as `1 234` or
    `(555) 123-4567`), only get the outer trim because their internal
    spacing is often semantic. *payload* is accepted for registry
    uniformity and not used.
    """
    def _trim(cell: str) -> str:
        core = cell.strip()
        if core and not _looks_structured(core):
            # Free text: collapse each internal whitespace run to one space.
            return _WHITESPACE_RUN_RE.sub(" ", core)
        return core

    return _apply_to_strings(df, _trim)


@register(_a.FIX_STRIP_NBSP)
def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Swap every character in the NBSP translation table for an ASCII space.

    *payload* is accepted for registry uniformity and not used.
    """
    return _apply_to_strings(df, lambda cell: cell.translate(_NBSP_TRANS))


@register(_a.FIX_STRIP_ZERO_WIDTH)
def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Delete every match of the zero-width/invisible pattern from string cells.

    *payload* is accepted for registry uniformity and not used.
    """
    return _apply_to_strings(df, lambda cell: _ZERO_WIDTH_RE.sub("", cell))


@register(_a.FIX_FOLD_SMART_PUNCT)
def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Fold typographic punctuation (curly quotes, dashes, ellipsis, primes)
    in string cells via the shared smart-punctuation translation table.

    *payload* is accepted for registry uniformity and not used.
    """
    return _apply_to_strings(df, lambda cell: cell.translate(_SMART_TRANS))


@register(_a.FIX_CLEAN_HEADERS)
def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Run the per-cell hygiene steps over the column headers.

    Each string header goes through, in order: BOM strip, NBSP-like ->
    space, zero-width removal, smart-punctuation fold, control-character
    removal, outer trim. This addresses the df['Email'] vs df['Email ']
    class of bug. Non-string headers are kept as they are.

    Returns a copy of *df* with the cleaned headers and the number of
    headers that changed. *payload* is not used.
    """
    def _scrub(label: str) -> str:
        label = _ZERO_WIDTH_RE.sub("", strip_bom(label).translate(_NBSP_TRANS))
        label = _CONTROL_RE.sub("", label.translate(_SMART_TRANS))
        return label.strip()

    result = df.copy()
    cleaned = [_scrub(h) if isinstance(h, str) else h for h in result.columns]
    # Only string headers can count as changed; the isinstance guard keeps
    # self-unequal labels such as NaN from being miscounted.
    n_changed = sum(
        1
        for before, after in zip(result.columns, cleaned)
        if isinstance(before, str) and after != before
    )
    result.columns = cleaned
    return result, n_changed


@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Rewrite CRLF / bare CR inside string cells as LF.

    Only embedded multi-line cells are in scope here (case 11 in the
    corpus); file-level line endings are already handled by
    ``repair_bytes`` before parsing. *payload* is not used.
    """
    fixed_df, n_changed = _apply_to_strings(df, _norm_le_str)
    return fixed_df, n_changed


# ---------------------------------------------------------------------------
# Already-applied fixes (no-op at this layer; kept so the audit log is
# uniform and the gate can reason about them)
# ---------------------------------------------------------------------------

@register(_a.FIX_STRIP_BOM)
def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder: repair_bytes already strips the BOM while reading.

    Hands back *df* itself (not a copy) with a change count of zero.
    """
    cells_changed = 0
    return df, cells_changed


@register(_a.FIX_STRIP_NUL)
def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder: repair_bytes already strips NUL while reading.

    Hands back *df* itself (not a copy) with a change count of zero.
    """
    cells_changed = 0
    return df, cells_changed


@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder: the byte-level smart-quote fold runs in repair_bytes.

    Hands back *df* itself (not a copy) with a change count of zero.
    """
    cells_changed = 0
    return df, cells_changed


@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder: per-row delimiter repair runs in repair_bytes.

    Hands back *df* itself (not a copy) with a change count of zero.
    """
    cells_changed = 0
    return df, cells_changed


# ---------------------------------------------------------------------------
# Medium-confidence fixes (require user confirmation in the review flow)
# ---------------------------------------------------------------------------

@register(_a.FIX_LOWERCASE_EMAIL)
def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Lowercase string values in the targeted email column(s).

    Args:
        df: Input frame. It is copied first and never mutated.
        payload: Optional ``{"column": <label>}``. When given, only columns
            whose label equals it are processed; an unknown label is a
            no-op. Without it, every column whose string label matches the
            analyzer's email-name heuristic is processed.

    Returns:
        ``(new_df, changed)`` where *changed* counts the string cells whose
        value differs after lowercasing. Non-string cells pass through.
    """
    out = df.copy()
    payload = payload or {}
    labels = out.columns
    if "column" in payload:
        wanted = payload["column"]
        positions = [i for i, label in enumerate(labels) if label == wanted]
    else:
        positions = [
            i for i, label in enumerate(labels)
            if isinstance(label, str) and _a._EMAIL_LIKE_COL.search(label)
        ]
    # Work by position: with duplicated labels ``out[label]`` is a DataFrame,
    # and iterating it yields column labels instead of cell values, which
    # either raised on assignment or overwrote data with lowercased labels.
    # A temporary RangeIndex gives each column a unique key.
    out.columns = pd.RangeIndex(len(labels))
    changed = 0
    for pos in positions:
        new_values = []
        for value in out[pos]:
            if isinstance(value, str):
                lowered = value.lower()
                if lowered != value:
                    changed += 1
                new_values.append(lowered)
            else:
                new_values.append(value)
        out[pos] = new_values
    out.columns = labels
    return out, changed


@register(_a.FIX_REPLACE_NULL_SENTINELS)
def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Replace user-approved null-like sentinel strings with empty string.

    Args:
        df: Input frame, passed to ``_apply_to_strings``.
        payload: Optional ``{"sentinels": ["N/A", "n/a", "nan", ...]}``.
            When absent (or ``None``), the analyzer's built-in set is used.
            A bare string is treated as a single sentinel, and non-string
            entries are ignored.

    Returns:
        ``(new_df, changed)`` as produced by ``_apply_to_strings``.

    Comparison is case-insensitive, whitespace-trimmed.
    """
    payload = payload or {}
    sentinels = payload.get("sentinels")
    if sentinels is None:
        sentinels = _a._NULL_LIKE
    elif isinstance(sentinels, str):
        # A bare string would otherwise be iterated character by character,
        # turning "N/A" into the three sentinels "n", "/" and "a".
        sentinels = [sentinels]
    # Only string cells are ever compared, so a non-string entry (e.g. a
    # JSON null in the payload) can never match; skip it instead of
    # crashing on ``.strip()``.
    sentinel_set = {s.strip().lower() for s in sentinels if isinstance(s, str)}

    def fix(s: str) -> str:
        return "" if s.strip().lower() in sentinel_set else s

    return _apply_to_strings(df, fix)


# ---------------------------------------------------------------------------
# Low-confidence fixes (off by default; user-only)
# ---------------------------------------------------------------------------

@register(_a.FIX_REPAIR_MOJIBAKE)
def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Run ftfy's text fixer over every string cell, if ftfy is installed.

    Intended as the heuristic UTF-8-as-cp1252 mojibake repair. Without
    ftfy this is a no-op that returns ``(df, 0)`` with *df* itself; per
    the original design note the review page reports that as "library
    missing — install ftfy to enable", so data is never silently corrupted
    by a hand-rolled heuristic. *payload* is not used.
    """
    try:
        import ftfy  # type: ignore
    except ImportError:
        return df, 0

    # NOTE(review): ftfy.fix_text runs ftfy's full default pipeline, which
    # may do more than encoding repair — confirm that scope is intended
    # for a fix labelled "mojibake".
    return _apply_to_strings(df, lambda cell: ftfy.fix_text(cell))