feat(gate): CSV-normalization gate with confidence-tiered findings

Adds a Review & Normalize page that sits between upload and every tool page. The analyzer now tags each finding with confidence (high/medium/low) and a fix_action; the gate auto-applies high-confidence fixes, surfaces medium/low ones for user review, and blocks tool pages on error-level findings until resolved or waived.

Core (src/core/):
- analyze.py: Finding gains confidence, fix_action, pre_applied; new detectors for encoding_uncertain, encoding_decode_failed; new top-level encoding_override parameter.
- fixes.py: registry of fix algorithms keyed by fix_action id.
- normalize.py: auto_fix(), apply_decisions(), is_normalized(), and the NormalizationResult / Decision dataclasses the gate consumes.
- io.py: detect_encoding tries strict UTF-8 first; repair_bytes now transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption) and normalizes line endings (fixes bare-CR parser crash); empty file handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
- pages/0_Review.py: gate page with per-finding decision controls, encoding override picker (16 codepages + custom), and Advanced output options (encoding, delimiter, line terminator) on the download.
- components.py: require_normalization_gate() helper.
- pages/1-9: gate guard wired on every tool page.

Test corpora:
- test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference UTF-8 files + manifest, synced from Business/DataTools.
- test-cases/text-cleaner-corpus/test_data/17: synced malformed input (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
- test_normalize.py (48): finding fields, fix registry, auto_fix scope, decision paths, gate idempotency, output-options helper.
- test_encodings_corpus.py (90, 16 xfailed): parametric detection + decode + analyzer-no-crash sweep against the manifest.
- test_analyze.py: encoding override + encoding_uncertain detectors.
- test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate; encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema, gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00
parent e9c490ae1b
commit 82d7fef21e
68 changed files with 2883 additions and 34 deletions
--- a/src/core/analyze.py
+++ b/src/core/analyze.py
@@ -25,6 +25,7 @@ from pandas.api import types as pdtypes
 from .io import RepairResult, repair_bytes, detect_encoding, detect_delimiter

 Severity = Literal["info", "warn", "error"]
+Confidence = Literal["high", "medium", "low"]


 # Tool identifiers — match the 0N_<name> convention used by the script set.
@@ -35,6 +36,29 @@ TOOL_DEDUPLICATOR = "01_deduplicator"
 TOOL_FORMAT_STANDARDIZER = "03_format_standardizer"


+# Stable fix-action ids. These name the algorithm that resolves a finding;
+# the normalize layer dispatches on this id. Keep in sync with fixes.py.
+FIX_TRIM_WHITESPACE = "trim_whitespace"
+FIX_STRIP_NBSP = "strip_nbsp_unicode_whitespace"
+FIX_STRIP_ZERO_WIDTH = "strip_zero_width"
+FIX_FOLD_SMART_PUNCT = "fold_smart_punctuation"
+FIX_CLEAN_HEADERS = "clean_headers"
+FIX_NORMALIZE_LINE_ENDINGS = "normalize_line_endings"
+FIX_STRIP_BOM = "strip_bom"
+FIX_STRIP_NUL = "strip_nul"
+FIX_FOLD_SMART_QUOTES_BYTE = "fold_smart_quotes_byte"
+FIX_REPAIR_UNQUOTED_DELIM = "repair_unquoted_delimiters"
+FIX_LOWERCASE_EMAIL = "lowercase_email_column"
+FIX_REPLACE_NULL_SENTINELS = "replace_null_sentinels"
+FIX_REPAIR_MOJIBAKE = "repair_mojibake"
+FIX_NONE = ""  # informational — nothing to apply
+
+# Replacement character (U+FFFD) inserted when a decoder gave up on a byte.
+# Anything more than a tiny ratio of it in the loaded text is a strong
+# signal that the encoding was wrong.
+_REPLACEMENT_CHAR = "\ufffd"
+
+
@dataclass
 class Finding:
    """One issue the analyzer surfaced.
@@ -47,6 +71,16 @@ class Finding:
    severity
        ``"info"`` (FYI), ``"warn"`` (likely needs cleanup),
        ``"error"`` (will block downstream work).
+    confidence
+        ``"high"`` — round-trip-safe algorithmic fix, eligible for auto-fix.
+        ``"medium"`` — right call in the common case but has known
+        false-positive shapes; user should preview before applying.
+        ``"low"`` — heuristic; the wrong call corrupts data; opt-in only.
+        Independent of severity: a ``warn`` finding can be high-confidence
+        (NBSP strip) and an ``info`` finding can be low-confidence (mojibake).
+    fix_action
+        Stable id naming the algorithm that resolves this finding. Empty
+        string for informational findings with no associated fix.
    tool
        Tool id that can address the finding, or empty string for purely
        informational findings.
@@ -69,6 +103,13 @@ class Finding:
    description: str
    column: Optional[str] = None
    samples: list[tuple[int, str, str]] = field(default_factory=list)
+    confidence: Confidence = "high"
+    fix_action: str = FIX_NONE
+    # True when the fix already ran during the pre-parse repair pass
+    # (e.g. BOM strip, byte-level smart-quote fold). The gate treats these
+    # as already-resolved; the review page still surfaces them so the
+    # user can see what was auto-applied during read.
+    pre_applied: bool = False


 # ---------------------------------------------------------------------------
@@ -139,6 +180,8 @@ def _detect_smart_punctuation(df: pd.DataFrame) -> list[Finding]:
            f"regex patterns."
        ),
        samples=sample_rows,
+        confidence="high",
+        fix_action=FIX_FOLD_SMART_PUNCT,
    )]


@@ -172,6 +215,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
                f"join keys."
            ),
            samples=nbsp_samples,
+            confidence="high",
+            fix_action=FIX_STRIP_NBSP,
        ))
    if zw_cells:
        findings.append(Finding(
@@ -184,6 +229,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
                f"characters (ZWSP, ZWJ, soft hyphen, BOM, bidi marks)."
            ),
            samples=zw_samples,
+            confidence="high",
+            fix_action=FIX_STRIP_ZERO_WIDTH,
        ))
    # Headers carry the same risks; flag separately so the user sees that
    # df["Email"] vs df["Email"] is the issue.
@@ -208,6 +255,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
                f"df['col'] lookups."
            ),
            samples=[(0, h, h) for h in bad_headers[:5]],
+            confidence="high",
+            fix_action=FIX_CLEAN_HEADERS,
        ))
    return findings

@@ -235,6 +284,8 @@ def _detect_whitespace_padding(df: pd.DataFrame) -> list[Finding]:
            f"multi-space internal runs. Common cause of failed joins."
        ),
        samples=samples,
+        confidence="high",
+        fix_action=FIX_TRIM_WHITESPACE,
    )]


@@ -264,6 +315,8 @@ def _detect_null_like_sentinels(df: pd.DataFrame) -> list[Finding]:
            f"counts as missing in the missing-value handler."
        ),
        samples=samples,
+        confidence="medium",
+        fix_action=FIX_REPLACE_NULL_SENTINELS,
    )]


@@ -290,6 +343,8 @@ def _detect_mojibake(df: pd.DataFrame) -> list[Finding]:
            f"patterns (Ã©, â€™, etc.). Auto-repair is opt-in (Tier 2)."
        ),
        samples=samples,
+        confidence="low",
+        fix_action=FIX_REPAIR_MOJIBAKE,
    )]


@@ -316,6 +371,8 @@ def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
                ),
                column=col,
                samples=samples,
+                confidence="medium",
+                fix_action=FIX_LOWERCASE_EMAIL,
            ))
    return findings

@@ -362,6 +419,8 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
            f"Run the deduplicator to merge or remove."
        ),
        samples=samples,
+        confidence="medium",
+        fix_action=FIX_NONE,  # routed to dedup tool, not auto-fixed here
    )]


@@ -397,23 +456,60 @@ def _detect_leading_zero_ids(df: pd.DataFrame) -> list[Finding]:
                ),
                column=str(col),
                samples=samples,
+                confidence="low",
+                fix_action=FIX_NONE,  # informational only
            ))
    return findings


+def _count_row_terminators(raw: bytes) -> tuple[int, int, int]:
+    """Count CRLF / LF / CR sequences that act as *row* terminators.
+
+    Walks the bytes tracking quoted-region state so that line breaks
+    inside multi-line quoted cells (e.g. an address column) are not
+    counted. Without this, files that legitimately have CRLF at row
+    boundaries plus LF inside quoted cells get false-positive
+    ``mixed_line_endings`` findings.
+    """
+    n_crlf = n_lf = n_cr = 0
+    in_quotes = False
+    i = 0
+    n = len(raw)
+    while i < n:
+        b = raw[i]
+        if b == 0x22:  # ASCII double quote — toggles quoted region.
+            # Doubled quote inside a quoted cell is an escape, not an exit.
+            if in_quotes and i + 1 < n and raw[i + 1] == 0x22:
+                i += 2
+                continue
+            in_quotes = not in_quotes
+            i += 1
+            continue
+        if not in_quotes:
+            if b == 0x0D:  # CR
+                if i + 1 < n and raw[i + 1] == 0x0A:
+                    n_crlf += 1
+                    i += 2
+                    continue
+                n_cr += 1
+            elif b == 0x0A:  # LF
+                n_lf += 1
+        i += 1
+    return n_crlf, n_lf, n_cr
+
+
 def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
-    """Flag files that mix CRLF, LF, and bare CR line terminators.
+    """Flag files that mix CRLF, LF, and bare CR row terminators.

    Mixed endings are a classic disaster pattern after multi-source concat
-    (Windows + macOS + Linux exports stitched together). Operates on raw
+    (Windows + macOS + Linux exports stitched together). Counts only the
+    terminators that act as row separators, so embedded newlines inside
+    quoted multi-line cells don't create false positives. Operates on raw
    bytes only — DataFrame-mode :func:`analyze` skips this detector.
    """
    if not raw:
        return []
-    n_crlf = raw.count(b"\r\n")
-    # Count standalone \r and \n (not part of \r\n) by subtracting overlaps.
-    n_lf = raw.count(b"\n") - n_crlf
-    n_cr = raw.count(b"\r") - n_crlf
+    n_crlf, n_lf, n_cr = _count_row_terminators(raw)
    kinds_present = sum(1 for n in (n_crlf, n_lf, n_cr) if n > 0)
    if kinds_present <= 1:
        return []
@@ -434,6 +530,53 @@ def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
            f"({', '.join(breakdown)}). Naive splits on one style produce "
            f"ghost rows or merged lines. Run the text cleaner to normalize."
        ),
+        confidence="high",
+        fix_action=FIX_NORMALIZE_LINE_ENDINGS,
+    )]
+
+
+def _detect_encoding_uncertainty(df: pd.DataFrame) -> list[Finding]:
+    """Flag DataFrames whose loaded text contains U+FFFD replacement chars.
+
+    The replacement character is what Python's decoder substitutes for
+    bytes it could not interpret under ``errors="replace"``. Any non-zero
+    count is a strong signal that the encoding picked by the loader was
+    wrong for at least part of the file — classic lying-BOM, mixed-encoding,
+    or wrong-codepage symptom. The user has to pick: re-upload with an
+    explicit encoding, or accept the loss.
+    """
+    affected_cells = 0
+    sample_rows: list[tuple[int, str, str]] = []
+    bad_headers: list[str] = []
+    for col in df.columns:
+        if isinstance(col, str) and _REPLACEMENT_CHAR in col:
+            bad_headers.append(col)
+        for row_idx, val in enumerate(df[col].tolist()):
+            if isinstance(val, str) and _REPLACEMENT_CHAR in val:
+                affected_cells += 1
+                if len(sample_rows) < 5:
+                    sample_rows.append((row_idx, str(col), val))
+    if not affected_cells and not bad_headers:
+        return []
+    location = []
+    if affected_cells:
+        location.append(f"{affected_cells} cell(s)")
+    if bad_headers:
+        location.append(f"{len(bad_headers)} header(s)")
+    return [Finding(
+        id="encoding_uncertain",
+        severity="error",
+        tool="",
+        count=affected_cells + len(bad_headers),
+        description=(
+            f"{' and '.join(location)} contain U+FFFD replacement characters, "
+            f"which means the file's encoding could not be decoded cleanly. "
+            f"Re-upload with an explicit encoding (e.g. cp1252, latin-1) "
+            f"or fix the source. Continuing risks silent data loss."
+        ),
+        samples=sample_rows,
+        confidence="low",
+        fix_action=FIX_NONE,
    )]


@@ -455,6 +598,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
            tool=TOOL_TEXT_CLEANER,
            count=1,
            description="UTF-8 BOM at file start was removed before parsing.",
+            confidence="high",
+            fix_action=FIX_STRIP_BOM,
+            pre_applied=True,
        ))
    if "strip_nul" in summary:
        nul_action = next(a for a in repair.actions if a.kind == "strip_nul")
@@ -467,6 +613,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
                f"Embedded NUL bytes in the file were stripped before "
                f"parsing ({nul_action.detail})."
            ),
+            confidence="high",
+            fix_action=FIX_STRIP_NUL,
+            pre_applied=True,
        ))
    if "fold_smart_quote" in summary:
        action = next(a for a in repair.actions if a.kind == "fold_smart_quote")
@@ -479,6 +628,55 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
                f"Smart double quotes were folded to ASCII before parsing "
                f"({action.detail})."
            ),
+            confidence="high",
+            fix_action=FIX_FOLD_SMART_QUOTES_BYTE,
+            pre_applied=True,
+        ))
+    if "normalize_line_endings" in summary:
+        action = next(a for a in repair.actions if a.kind == "normalize_line_endings")
+        findings.append(Finding(
+            id="csv_line_endings_normalized",
+            severity="info",
+            tool=TOOL_TEXT_CLEANER,
+            count=1,
+            description=(
+                f"Line endings were normalized to LF before parsing "
+                f"({action.detail})."
+            ),
+            confidence="high",
+            fix_action=FIX_NORMALIZE_LINE_ENDINGS,
+            pre_applied=True,
+        ))
+    if "transcode_to_utf8" in summary:
+        action = next(a for a in repair.actions if a.kind == "transcode_to_utf8")
+        findings.append(Finding(
+            id="csv_transcoded_to_utf8",
+            severity="info",
+            tool="",
+            count=1,
+            description=(
+                f"File was transcoded from a wide encoding to UTF-8 before "
+                f"parsing ({action.detail})."
+            ),
+            confidence="high",
+            fix_action=FIX_NONE,
+            pre_applied=True,
+        ))
+    if "decode_replaced" in summary:
+        action = next(a for a in repair.actions if a.kind == "decode_replaced")
+        findings.append(Finding(
+            id="encoding_decode_failed",
+            severity="error",
+            tool="",
+            count=1,
+            description=(
+                f"Some bytes could not be decoded under the detected "
+                f"encoding ({action.detail}). Replacement characters "
+                f"(U+FFFD) were inserted; the file likely uses a different "
+                f"encoding or mixes encodings. Re-upload with --encoding."
+            ),
+            confidence="low",
+            fix_action=FIX_NONE,
        ))
    if "quote_unquoted_delim" in summary:
        n = summary["quote_unquoted_delim"]
@@ -491,6 +689,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
                f"{n} row(s) had a delimiter inside an unquoted field "
                f"(e.g. '$1,500.00') and were merged during pre-parse repair."
            ),
+            confidence="medium",
+            fix_action=FIX_REPAIR_UNQUOTED_DELIM,
+            pre_applied=True,
        ))
    if repair.unrepairable_lines:
        n = len(repair.unrepairable_lines)
@@ -504,6 +705,8 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
                f"left as-is. Inspect lines: "
                f"{repair.unrepairable_lines[:10]}"
            ),
+            confidence="low",
+            fix_action=FIX_NONE,
        ))
    return findings

@@ -517,6 +720,7 @@ def analyze(
    *,
    sample_rows: int = 1000,
    repair_result: Optional[RepairResult] = None,
+    encoding_override: Optional[str] = None,
 ) -> list[Finding]:
    """Run all detectors against *source* and return a list of findings.

@@ -533,11 +737,17 @@ def analyze(
        Optional :class:`RepairResult` from a prior pre-parse pass; used
        to synthesize ``csv_*`` findings so the user sees what the parser
        quietly fixed.
+    encoding_override
+        When set, skip charset detection and decode with this encoding
+        instead. Used by the Review page to let the user correct
+        misdetections (cp1250-vs-cp1252 ambiguity, KOI8-R surfacing as
+        Shift_JIS, etc.). Only applies when *source* is a path.
    """
    raw_for_byte_scan: Optional[bytes] = None
    if isinstance(source, (str, Path)):
        df, internal_repair, raw_for_byte_scan = _load_for_analysis(
            Path(source), sample_rows=sample_rows,
+            encoding_override=encoding_override,
        )
        # Caller-supplied repair_result wins over the internally produced one,
        # since the caller may have used non-default repair flags.
@@ -547,10 +757,36 @@ def analyze(
        df = source.head(sample_rows).copy() if len(source) > sample_rows else source.copy()

    findings: list[Finding] = []
+    if raw_for_byte_scan is not None and not raw_for_byte_scan.strip():
+        findings.append(Finding(
+            id="empty_input",
+            severity="error",
+            tool="",
+            count=0,
+            description="Input file is empty (zero bytes or whitespace only).",
+            confidence="low",
+            fix_action=FIX_NONE,
+        ))
+        return findings
+    if df.empty and df.columns.empty and raw_for_byte_scan is not None:
+        # Non-empty bytes but the parser couldn't extract a header row.
+        findings.append(Finding(
+            id="empty_input",
+            severity="error",
+            tool="",
+            count=0,
+            description=(
+                "Input file has no parseable rows or columns "
+                "(only line endings, BOM, or whitespace)."
+            ),
+            confidence="low",
+            fix_action=FIX_NONE,
+        ))
    if repair_result is not None:
        findings.extend(_findings_from_repair(repair_result))
    if raw_for_byte_scan is not None:
        findings.extend(_detect_mixed_line_endings(raw_for_byte_scan))
+    findings.extend(_detect_encoding_uncertainty(df))
    findings.extend(_detect_smart_punctuation(df))
    findings.extend(_detect_invisible_chars(df))
    findings.extend(_detect_whitespace_padding(df))
@@ -563,7 +799,7 @@ def analyze(


 def _load_for_analysis(
-    path: Path, *, sample_rows: int,
+    path: Path, *, sample_rows: int, encoding_override: Optional[str] = None,
 ) -> tuple[pd.DataFrame, Optional[RepairResult], Optional[bytes]]:
    """Read just enough of *path* to scan, with the same robust pre-parse
    repair the tool pages will use.
@@ -571,6 +807,12 @@ def _load_for_analysis(
    Returns ``(df, repair_result, raw_bytes)``. The repair result and raw
    bytes are *None* for Excel files since the byte-level repair step
    (BOM/NUL/smart-quote folding) and line-ending scan are CSV-specific.
+    An empty CSV returns an empty DataFrame plus the (empty) raw bytes;
+    the caller synthesizes an ``empty_input`` finding from that.
+
+    When *encoding_override* is set, it replaces the detected encoding
+    entirely — the user has explicitly told us what the file is. The
+    delimiter is still detected (it's separate from encoding choice).
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
@@ -579,17 +821,24 @@ def _load_for_analysis(
            nrows=sample_rows,
        )
        return df, None, None
-    enc = detect_encoding(path)
-    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
+    if not raw.strip():
+        return pd.DataFrame(), None, raw
+    enc = encoding_override or detect_encoding(path)
+    delim = detect_delimiter(path, enc)
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    import io as _io
-    df = pd.read_csv(
-        _io.BytesIO(repair.repaired_bytes),
-        encoding="utf-8", delimiter=delim,
-        dtype=str, keep_default_na=False, on_bad_lines="warn",
-        nrows=sample_rows,
-    )
+    try:
+        df = pd.read_csv(
+            _io.BytesIO(repair.repaired_bytes),
+            encoding="utf-8", delimiter=delim,
+            dtype=str, keep_default_na=False, on_bad_lines="warn",
+            nrows=sample_rows,
+        )
+    except pd.errors.EmptyDataError:
+        # File is non-empty bytes but had no parseable columns (e.g. only
+        # whitespace, only a BOM, only line endings). Treat as empty.
+        return pd.DataFrame(), repair, raw
    return df, repair, raw


@@ -598,6 +847,9 @@ def to_dict(finding: Finding) -> dict[str, Any]:
    return {
        "id": finding.id,
        "severity": finding.severity,
+        "confidence": finding.confidence,
+        "fix_action": finding.fix_action,
+        "pre_applied": finding.pre_applied,
        "tool": finding.tool,
        "count": finding.count,
        "description": finding.description,
--- a/src/core/fixes.py
+++ b/src/core/fixes.py
@@ -0,0 +1,296 @@
+"""Registry of fix algorithms keyed by ``fix_action`` id.
+
+Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming
+the algorithm that resolves it. The normalize layer dispatches on that id
+into this registry. Each fix function takes a DataFrame plus an optional
+``payload`` dict (for fixes that need user-supplied parameters, e.g. the
+custom null-sentinel list) and returns ``(new_df, n_cells_changed)``.
+
+Fixes here operate on the DataFrame after the byte-level pre-parse repair
+has already run (BOM, NUL, line endings, smart-quote bytes, unquoted
+delimiters). Anything in this layer is reversible from the audit log; a
+lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and
+requires explicit user opt-in via the review page.
+"""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from typing import Any, Callable, Optional
+
+import pandas as pd
+
+from .text_clean import (
+    _SMART_TRANS,
+    _ZERO_WIDTH_RE,
+    _CONTROL_RE,
+    _WHITESPACE_RUN_RE,
+    _looks_structured,
+    strip_bom,
+    normalize_line_endings as _norm_le_str,
+)
+# The package __init__ re-exports the analyze() function under the name
+# `analyze`, which shadows the submodule attribute. Reach the module via
+# sys.modules to get its private constants and FIX_* identifiers.
+import sys as _sys
+import src.core.analyze  # noqa: F401  (registers the submodule)
+_a = _sys.modules["src.core.analyze"]
+
+# NBSP / Unicode-whitespace -> ASCII space. Mirrors the analyzer's
+# detection set (analyze._NBSP_LIKE_CHARS) so what the detector flags is
+# exactly what this fix replaces.
+_NBSP_TRANS = str.maketrans({c: " " for c in _a._NBSP_LIKE_CHARS})
+
+
+FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]
+
+_REGISTRY: dict[str, FixFn] = {}
+
+
+def register(action_id: str) -> Callable[[FixFn], FixFn]:
+    def deco(fn: FixFn) -> FixFn:
+        _REGISTRY[action_id] = fn
+        return fn
+    return deco
+
+
+def get_fix(action_id: str) -> Optional[FixFn]:
+    return _REGISTRY.get(action_id)
+
+
+def available_actions() -> list[str]:
+    return sorted(_REGISTRY)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _apply_to_strings(
+    df: pd.DataFrame, fn: Callable[[str], str], *, include_headers: bool = False,
+) -> tuple[pd.DataFrame, int]:
+    """Apply *fn* to every string cell. Returns (new_df, cells_changed).
+
+    Headers are touched only when *include_headers* is true; by default the
+    header-cleaning fix owns them so the audit log records them separately.
+    """
+    out = df.copy()
+    changed = 0
+    for col in out.columns:
+        if not pd.api.types.is_object_dtype(out[col]) and not pd.api.types.is_string_dtype(out[col]):
+            continue
+        new_col = []
+        for v in out[col]:
+            if isinstance(v, str):
+                nv = fn(v)
+                if nv != v:
+                    changed += 1
+                new_col.append(nv)
+            else:
+                new_col.append(v)
+        out[col] = new_col
+    if include_headers:
+        new_headers = []
+        for h in out.columns:
+            if isinstance(h, str):
+                nh = fn(h)
+                if nh != h:
+                    changed += 1
+                new_headers.append(nh)
+            else:
+                new_headers.append(h)
+        out.columns = new_headers
+    return out, changed
+
+
+# ---------------------------------------------------------------------------
+# High-confidence fixes
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_TRIM_WHITESPACE)
+def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Strip leading/trailing whitespace; collapse internal runs in text cells.
+
+    Numeric/date/phone-shaped cells get only outer trim — internal spacing
+    in those is often semantic (`1 234`, `(555) 123-4567`).
+    """
+    def fix(s: str) -> str:
+        trimmed = s.strip()
+        if not trimmed or _looks_structured(trimmed):
+            return trimmed
+        return _WHITESPACE_RUN_RE.sub(" ", trimmed)
+    return _apply_to_strings(df, fix)
+
+
+@register(_a.FIX_STRIP_NBSP)
+def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Replace NBSP and other Unicode spaces with ASCII space."""
+    def fix(s: str) -> str:
+        return s.translate(_NBSP_TRANS)
+    return _apply_to_strings(df, fix)
+
+
+@register(_a.FIX_STRIP_ZERO_WIDTH)
+def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Remove zero-width and invisible characters from cells."""
+    def fix(s: str) -> str:
+        return _ZERO_WIDTH_RE.sub("", s)
+    return _apply_to_strings(df, fix)
+
+
+@register(_a.FIX_FOLD_SMART_PUNCT)
+def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """ASCII-fy curly quotes, em/en dashes, ellipsis, primes."""
+    def fix(s: str) -> str:
+        return s.translate(_SMART_TRANS)
+    return _apply_to_strings(df, fix)
+
+
+@register(_a.FIX_CLEAN_HEADERS)
+def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Apply the same per-cell hygiene to column headers.
+
+    Fixes the df['Email'] vs df['Email '] class of bug.
+    """
+    def fix(s: str) -> str:
+        s = strip_bom(s)
+        s = s.translate(_NBSP_TRANS)
+        s = _ZERO_WIDTH_RE.sub("", s)
+        s = s.translate(_SMART_TRANS)
+        s = _CONTROL_RE.sub("", s)
+        return s.strip()
+    out = df.copy()
+    new_headers = []
+    changed = 0
+    for h in out.columns:
+        if isinstance(h, str):
+            nh = fix(h)
+            if nh != h:
+                changed += 1
+            new_headers.append(nh)
+        else:
+            new_headers.append(h)
+    out.columns = new_headers
+    return out, changed
+
+
+@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
+def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Normalize CRLF / bare CR inside cells to LF.
+
+    File-level line endings are handled by ``repair_bytes`` before parsing;
+    this fix covers embedded multi-line cells (case 11 in the corpus).
+    """
+    return _apply_to_strings(df, _norm_le_str)
+
+
+# ---------------------------------------------------------------------------
+# Already-applied fixes (no-op at this layer; kept so the audit log is
+# uniform and the gate can reason about them)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_STRIP_BOM)
+def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """BOM is stripped during read by repair_bytes; nothing to do here."""
+    return df, 0
+
+
+@register(_a.FIX_STRIP_NUL)
+def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """NUL is stripped during read by repair_bytes."""
+    return df, 0
+
+
+@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
+def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Byte-level smart-quote fold runs in repair_bytes."""
+    return df, 0
+
+
+@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
+def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Per-row delimiter repair runs in repair_bytes."""
+    return df, 0
+
+
+# ---------------------------------------------------------------------------
+# Medium-confidence fixes (require user confirmation in the review flow)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_LOWERCASE_EMAIL)
+def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Lowercase values in the column named in *payload['column']*.
+
+    Defaults to lowercasing every column whose name matches the email
+    heuristic if no payload is given.
+    """
+    out = df.copy()
+    payload = payload or {}
+    target_cols: list[str]
+    if "column" in payload:
+        target_cols = [payload["column"]]
+    else:
+        target_cols = [
+            c for c in out.columns
+            if isinstance(c, str) and _a._EMAIL_LIKE_COL.search(c)
+        ]
+    changed = 0
+    for col in target_cols:
+        if col not in out.columns:
+            continue
+        new_col = []
+        for v in out[col]:
+            if isinstance(v, str):
+                nv = v.lower()
+                if nv != v:
+                    changed += 1
+                new_col.append(nv)
+            else:
+                new_col.append(v)
+        out[col] = new_col
+    return out, changed
+
+
+@register(_a.FIX_REPLACE_NULL_SENTINELS)
+def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Replace user-approved null-like sentinel strings with empty string.
+
+    Payload: ``{"sentinels": ["N/A", "n/a", "nan", ...]}``. Defaults to
+    the analyzer's built-in set when no payload is given. Comparison is
+    case-insensitive, whitespace-trimmed.
+    """
+    payload = payload or {}
+    sentinels = payload.get("sentinels")
+    if sentinels is None:
+        sentinels = list(_a._NULL_LIKE)
+    sentinel_set = {s.strip().lower() for s in sentinels}
+
+    def fix(s: str) -> str:
+        return "" if s.strip().lower() in sentinel_set else s
+
+    return _apply_to_strings(df, fix)
+
+
+# ---------------------------------------------------------------------------
+# Low-confidence fixes (off by default; user-only)
+# ---------------------------------------------------------------------------
+
+@register(_a.FIX_REPAIR_MOJIBAKE)
+def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
+    """Heuristic UTF-8-as-cp1252 mojibake repair via ftfy when available.
+
+    Falls back to a no-op (returning ``(df, 0)``) when ftfy is not
+    installed; the review page surfaces that as "library missing — install
+    ftfy to enable" so we never silently corrupt data with a hand-rolled
+    heuristic.
+    """
+    try:
+        import ftfy  # type: ignore
+    except ImportError:
+        return df, 0
+
+    def fix(s: str) -> str:
+        return ftfy.fix_text(s)
+
+    return _apply_to_strings(df, fix)
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -34,6 +34,16 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
        return "utf-16"

+    # Strict UTF-8 wins. charset_normalizer fingerprints small files
+    # dominated by short non-ASCII sequences (e.g. zero-width chars at
+    # U+200B-class) as mac_latin2 / cp1250 / similar — but if the bytes
+    # decode cleanly as UTF-8, that's the right answer regardless.
+    try:
+        raw.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
    result = from_bytes(raw).best()
    if result is None:
        return "utf-8"
@@ -416,6 +426,7 @@ def repair_bytes(
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
+    normalize_line_endings: bool = True,
 ) -> RepairResult:
    """Pre-parse repair on a raw delimited file.

@@ -423,8 +434,11 @@ def repair_bytes(

    1. Strip a leading UTF-8 BOM.
    2. Strip embedded NUL bytes (the C parser truncates fields at NUL).
-    3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
-    4. Per-row repair when one rogue delimiter is embedded in a field that
+    3. Normalize line endings (CRLF and bare CR to LF). Bare CR confuses
+       the C parser ("new-line character seen in unquoted field"); the
+       text-cleaner contract also calls for LF inside multi-line cells.
+    4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
+    5. Per-row repair when one rogue delimiter is embedded in a field that
       looks like currency or thousands-grouped digits — quote that field.

    Single curly quotes and other punctuation are deferred to the cell-level
@@ -434,12 +448,41 @@ def repair_bytes(
    unrepairable: list[int] = []
    data = raw

+    # If the input is a UTF-16 / UTF-32 byte stream, transcode it to UTF-8
+    # up front. UTF-16 ASCII codepoints carry NUL as half of every 16-bit
+    # unit, so the byte-level NUL-strip below would shred the file. Doing
+    # the transcode here means the rest of the repair pipeline operates
+    # on UTF-8 bytes regardless of the source encoding.
+    enc_norm = encoding.lower().replace("-", "_") if encoding else ""
+    is_wide = enc_norm.startswith(("utf_16", "utf_32"))
+    # UTF-16 LE without a BOM that survives detection lands here too.
+    if is_wide:
+        try:
+            decoded = data.decode(encoding)
+        except (UnicodeDecodeError, LookupError):
+            decoded = data.decode("utf-8", errors="replace")
+            actions.append(RepairAction(
+                kind="decode_replaced", line=None,
+                detail=f"decode errors under {encoding}; replaced with U+FFFD",
+            ))
+        # Strip a leading UTF-16 BOM (decoded as U+FEFF) if present.
+        if decoded and decoded[0] == "":
+            decoded = decoded[1:]
+        data = decoded.encode("utf-8")
+        actions.append(RepairAction(
+            kind="transcode_to_utf8", line=None,
+            detail=f"transcoded {encoding} -> utf-8 ({len(raw)}B -> {len(data)}B)",
+        ))
+        encoding = "utf-8"  # downstream steps now operate on UTF-8
+
    # 1. BOM
    if data.startswith(b"\xef\xbb\xbf"):
        data = data[3:]
        actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))

-    # 2. NUL
+    # 2. NUL — only meaningful for single-byte / UTF-8 encodings. We've
+    # already transcoded UTF-16/32 to UTF-8 above, so NUL here is genuine
+    # corruption (truncated C strings, half-binary exports), not encoding.
    if strip_nul and b"\x00" in data:
        before = data.count(b"\x00")
        data = data.replace(b"\x00", b"")
@@ -448,6 +491,26 @@ def repair_bytes(
            detail=f"removed {before} NUL byte(s)",
        ))

+    # 3. Line endings: CRLF and bare CR -> LF. CRLF first so we don't
+    # double-substitute. Done at the byte layer so it survives through
+    # any subsequent decode failure.
+    if normalize_line_endings and (b"\r" in data):
+        n_crlf = data.count(b"\r\n")
+        data = data.replace(b"\r\n", b"\n")
+        n_cr = data.count(b"\r")
+        if n_cr:
+            data = data.replace(b"\r", b"\n")
+        if n_crlf or n_cr:
+            parts = []
+            if n_crlf:
+                parts.append(f"{n_crlf} CRLF")
+            if n_cr:
+                parts.append(f"{n_cr} bare CR")
+            actions.append(RepairAction(
+                kind="normalize_line_endings", line=None,
+                detail=f"normalized {', '.join(parts)} to LF",
+            ))
+
    # Decode for character-level work.
    try:
        text = data.decode(encoding)
--- a/src/core/normalize.py
+++ b/src/core/normalize.py
@@ -0,0 +1,249 @@
+"""CSV-normalization gate.
+
+A file enters the tool pages only after passing the gate. The gate has
+two paths:
+
+1. **Auto-fix** — apply every algorithm flagged ``confidence="high"``.
+2. **Review** — show the user a preview of medium/low-confidence findings
+   and accept an explicit per-finding decision before applying.
+
+The gate produces a :class:`NormalizationResult` containing the cleaned
+DataFrame, the bytes representation, and a structured audit log of every
+fix that ran. Tool pages are guarded by :func:`is_normalized` against
+the result and the original list of findings.
+"""
+
+from __future__ import annotations
+
+import io
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal, Optional
+
+import pandas as pd
+
+from .analyze import Finding, analyze
+from .fixes import get_fix
+
+
+DecisionAction = Literal["auto", "skip", "modified"]
+
+
+@dataclass
+class Decision:
+    """One user-recorded choice for a finding.
+
+    Attributes
+    ----------
+    finding_id
+        The :class:`Finding` id this decision applies to.
+    action
+        ``"auto"`` to run the registered fix as-is, ``"skip"`` to leave
+        it alone (the gate logs it as waived), ``"modified"`` to run the
+        fix with a custom payload (e.g. user-edited null sentinel list).
+    payload
+        Optional dict passed as the fix function's ``payload`` argument.
+        Meant for ``"modified"`` (not enforced); ignored for ``"skip"``.
+    """
+
+    finding_id: str
+    action: DecisionAction
+    payload: Optional[dict] = None
+
+
+@dataclass
+class FixApplied:
+    """One audit-log entry: a fix that ran, or one already applied during read."""
+
+    finding_id: str  # id of the Finding this entry was recorded for
+    fix_action: str  # fix_action id copied from that Finding
+    cells_changed: int  # count returned by the fix; 0 for pre-applied findings
+    decision: DecisionAction  # the decision's action; "auto" for pre-applied findings
+
+
+@dataclass
+class NormalizationResult:
+    """Output of a gate pass.
+
+    Attributes
+    ----------
+    cleaned_df
+        DataFrame after every applied fix. The downstream tool pages
+        consume this directly.
+    cleaned_bytes
+        UTF-8 encoded CSV of *cleaned_df* — the canonical artifact for
+        round-tripping into another tool that re-parses.
+    applied
+        Audit log: fixes that ran plus one entry per pre-applied finding.
+    skipped_findings
+        Findings the user explicitly waived (decision = ``"skip"``).
+    pending_findings
+        Findings still requiring a user decision before the gate is
+        considered passed. Empty on a successful gate pass.
+    blocking_findings
+        Severity=error findings for which no decision was supplied.
+        Non-empty means the gate is blocked and the file cannot enter
+        tool pages.
+    """
+
+    cleaned_df: pd.DataFrame
+    cleaned_bytes: bytes
+    applied: list[FixApplied] = field(default_factory=list)
+    skipped_findings: list[Finding] = field(default_factory=list)
+    pending_findings: list[Finding] = field(default_factory=list)
+    blocking_findings: list[Finding] = field(default_factory=list)
+
+    @property
+    def passed(self) -> bool:
+        return not self.pending_findings and not self.blocking_findings  # both lists must be empty
+
+
+def _df_to_bytes(df: pd.DataFrame) -> bytes:
+    # With no target buffer, to_csv returns the CSV text itself.
+    csv_text = df.to_csv(index=False, lineterminator="\n")
+    return csv_text.encode("utf-8")
+
+
+def _is_actionable(f: Finding) -> bool:
+    """Report whether the gate still has work to do for *f*.
+
+    A finding is actionable only when it was not already handled during
+    the read pass (``pre_applied``), names a ``fix_action``, and that
+    action resolves to a registered fix. Anything else is left for the
+    caller to classify.
+    """
+    if f.pre_applied or not f.fix_action:
+        return False
+    registered_fix = get_fix(f.fix_action)
+    return registered_fix is not None
+
+
+def auto_fix(
+    df: pd.DataFrame, findings: list[Finding],
+) -> NormalizationResult:
+    """Run the gate's automatic path over *findings*.
+
+    An ``"auto"`` decision is created for each actionable finding whose
+    confidence is ``"high"``. Every other finding gets no decision and is
+    left for ``apply_decisions`` to classify, so the returned result may
+    still have pending or blocking findings and not count as passed.
+    """
+    high_confidence: list[Decision] = []
+    for finding in findings:
+        if _is_actionable(finding) and finding.confidence == "high":
+            high_confidence.append(Decision(finding_id=finding.id, action="auto"))
+    return apply_decisions(df, findings, high_confidence)
+
+
+def apply_decisions(
+    df: pd.DataFrame, findings: list[Finding], decisions: list[Decision],
+) -> NormalizationResult:
+    """Apply *decisions* to *df* in finding order.
+
+    Findings with no matching decision are categorized:
+
+    * ``severity=error`` -> ``blocking_findings``
+    * Otherwise, if actionable -> ``pending_findings``; else ignored
+
+    Pre-applied findings are recorded once in the audit log with
+    ``cells_changed=0`` so callers can render "what was already done."
+    """
+    decision_by_id = {d.finding_id: d for d in decisions}  # a later decision for the same id wins
+
+    out = df.copy()  # fixes are run against a copy of the input frame
+    applied: list[FixApplied] = []
+    skipped: list[Finding] = []
+    pending: list[Finding] = []
+    blocking: list[Finding] = []
+
+    for f in findings:
+        if f.pre_applied:
+            applied.append(FixApplied(
+                finding_id=f.id,
+                fix_action=f.fix_action,
+                cells_changed=0,
+                decision="auto",
+            ))
+            continue
+
+        decision = decision_by_id.get(f.id)
+        if decision is None:
+            if f.severity == "error":
+                blocking.append(f)
+            elif _is_actionable(f):
+                pending.append(f)
+            # else: informational with no fix; ignore.
+            continue
+
+        if decision.action == "skip":
+            skipped.append(f)  # waived: this also keeps an error finding out of `blocking`
+            continue
+
+        fix_fn = get_fix(f.fix_action)
+        if fix_fn is None:
+            # Decision references a fix we don't have; treat as pending.
+            pending.append(f)
+            continue
+
+        payload = decision.payload
+        # Per-column fixes (lowercase_email) can carry the column from
+        # the finding when the user didn't override it.
+        if f.column and (payload is None or "column" not in payload):
+            payload = {**(payload or {}), "column": f.column}  # new dict; decision.payload is not mutated
+
+        out, changed = fix_fn(out, payload)  # each fix receives the output of the previous one
+        applied.append(FixApplied(
+            finding_id=f.id,
+            fix_action=f.fix_action,
+            cells_changed=changed,
+            decision=decision.action,
+        ))
+
+    return NormalizationResult(
+        cleaned_df=out,
+        cleaned_bytes=_df_to_bytes(out),
+        applied=applied,
+        skipped_findings=skipped,
+        pending_findings=pending,
+        blocking_findings=blocking,
+    )
+
+
+def is_normalized(
+    findings: list[Finding], result: Optional[NormalizationResult],
+) -> bool:
+    """Strict gate check: has *result* really cleared the gate?
+
+    Returns ``True`` only when all of the following hold:
+
+    * *result* is not ``None``;
+    * ``result.passed`` is true, i.e. nothing is pending or blocking;
+    * running ``analyze`` again on ``result.cleaned_df`` reports no
+      high-confidence finding that is still actionable.
+
+    *findings* is part of the signature but is not read here. Reading
+    ``result.passed`` on its own is the cheap variant of this check; the
+    re-analysis of the cleaned frame is the contract the tool pages
+    rely on.
+    """
+    if result is None or not result.passed:
+        return False
+    # A high-confidence detector that still fires on the cleaned frame
+    # means the frame is not normalized yet.
+    still_firing = (
+        f for f in analyze(result.cleaned_df)
+        if f.confidence == "high" and _is_actionable(f)
+    )
+    return not any(still_firing)
+
+
+def gate_summary(result: NormalizationResult) -> dict:
+    """Condense *result* into a flat dict for logging or the CLI."""
+    def ids(items):
+        return [item.id for item in items]
+    summary: dict = {"passed": result.passed, "fixes_applied": len(result.applied)}
+    summary["cells_changed"] = sum(fix.cells_changed for fix in result.applied)
+    summary["skipped"] = ids(result.skipped_findings)
+    summary["pending"] = ids(result.pending_findings)
+    summary["blocking"] = ids(result.blocking_findings)
+    return summary
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -1096,6 +1096,49 @@ class _StashedUpload:
        return self._data


+def require_normalization_gate() -> None:
+    """Stop a tool page from rendering until the upload has cleared the gate.
+
+    Intended to be called right after a tool page's imports. If the
+    session holds an upload but there is no ``normalization_result``, the
+    stored result belongs to different bytes (``normalization_for`` does
+    not match the upload's SHA-256), or the result did not pass, a warning
+    banner and a button leading to the Review page are rendered and the
+    remainder of the page is cut off with ``st.stop()``.
+
+    With no upload in the session this returns immediately. Pages that do
+    not need a normalized upload can simply skip the call.
+    """
+    import hashlib
+    state = st.session_state
+    uploaded = state.get("home_uploaded_bytes")
+    if uploaded is None:
+        # Nothing uploaded so far: defer to the page's own uploader. The
+        # gate applies on a later run, once bytes are in the session.
+        return
+
+    digest = hashlib.sha256(uploaded).hexdigest()
+    gate_result = state.get("normalization_result")
+    # All three must hold: a result exists, it was computed for exactly
+    # these bytes, and it reports a passing gate.
+    if (
+        gate_result is not None
+        and state.get("normalization_for") == digest
+        and getattr(gate_result, "passed", False)
+    ):
+        return
+
+    name = state.get("home_uploaded_name", "the uploaded file")
+    st.warning(
+        f"**{name}** must pass the CSV-normalization gate before you can "
+        f"use this tool. Open the Review page to apply the fixes our "
+        f"analyzer recommends."
+    )
+    if st.button("Go to Review & Normalize", type="primary"):
+        st.switch_page("pages/0_Review.py")
+    st.stop()
+
+
 def pickup_or_upload(
    *,
    label: str,
--- a/src/gui/pages/0_Review.py
+++ b/src/gui/pages/0_Review.py
@@ -0,0 +1,675 @@
+"""Review & normalize gate page.
+
+Sits between the home-page upload and every tool page. Walks the user
+through every analyzer finding, lets them auto-fix, preview, customize,
+or skip each one, and produces a :class:`NormalizationResult` stashed in
+session state. Tool pages refuse to load until this gate has passed.
+
+State contract
+--------------
+Session state read:
+* ``home_uploaded_bytes`` / ``home_uploaded_name`` — current upload.
+* ``home_findings`` — list of :class:`Finding` from the home-page scan.
+* ``review_decisions`` — dict[finding_id, Decision]; user's choices so far.
+
+Session state written:
+* ``review_decisions`` — updated as the user flips controls.
+* ``normalization_result`` — :class:`NormalizationResult` after Apply.
+* ``normalization_for`` — content hash of the upload the result is for.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import io
+import sys
+from pathlib import Path
+from typing import Optional
+
+import pandas as pd
+import streamlit as st
+
+# Project root on sys.path (mirrors app.py).
+_project_root = Path(__file__).resolve().parent.parent.parent.parent
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
+
+from src.core.analyze import Finding, analyze
+from src.core.fixes import get_fix
+from src.core.io import detect_encoding, repair_bytes
+from src.core.normalize import (
+    Decision,
+    NormalizationResult,
+    apply_decisions,
+    auto_fix,
+    gate_summary,
+    is_normalized,
+)
+from src.gui.components import hide_streamlit_chrome
+
+
+# Common single-byte and multi-byte encodings the user might pick to
+# correct a misdetection. Ordered by frequency in real-world Western /
+# multilingual data; keep the list short — too many options just adds
+# noise. The user can type a custom encoding via the "Other" entry.
+_OVERRIDE_ENCODINGS = [
+    "(detected)",
+    "utf-8",
+    "utf-8-sig",
+    "cp1252",
+    "iso-8859-1",
+    "iso-8859-15",
+    "cp1250",
+    "iso-8859-2",
+    "cp1251",
+    "koi8-r",
+    "mac-roman",
+    "shift_jis",
+    "cp932",
+    "gb18030",
+    "big5",
+    "euc-kr",
+    "cp949",
+    "utf-16",
+    "utf-16-le",
+    "utf-16-be",
+    "Other…",
+]
+
+
+st.set_page_config(page_title="Review & Normalize", page_icon="🛡️", layout="wide")
+hide_streamlit_chrome()
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _upload_hash() -> Optional[str]:
+    # SHA-256 hex digest of the current upload, or None when the session
+    # holds no bytes (missing or empty).
+    payload = st.session_state.get("home_uploaded_bytes")
+    return hashlib.sha256(payload).hexdigest() if payload else None
+
+
+def _detected_encoding_for_session() -> Optional[str]:
+    """Detect the session upload's charset via a throwaway temp file."""
+    import tempfile
+    data = st.session_state.get("home_uploaded_bytes")
+    if not data:
+        return None
+    name = st.session_state.get("home_uploaded_name") or "tmp.csv"
+    suffix = "." + name.rpartition(".")[2] if "." in name else ".csv"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as fh:
+        scratch = Path(fh.name)
+        fh.write(data)
+    try:
+        return detect_encoding(scratch)
+    finally:
+        scratch.unlink(missing_ok=True)
+
+
+def _load_df_from_session(encoding_override: Optional[str] = None) -> Optional[pd.DataFrame]:
+    """Parse the session upload into an all-string DataFrame.
+
+    Returns ``None`` without an upload. Excel files (``.xlsx``/``.xls``)
+    go straight to ``pd.read_excel``. Anything else is run through
+    ``repair_bytes`` — using *encoding_override* when given, otherwise
+    UTF-8 — and the repaired bytes are then read as UTF-8 CSV.
+    """
+    data = st.session_state.get("home_uploaded_bytes")
+    if not data:
+        return None
+    name = st.session_state.get("home_uploaded_name") or ""
+    suffix = name.rpartition(".")[2].lower() if "." in name else ""
+    if suffix in ("xlsx", "xls"):
+        return pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
+    if suffix == "tsv":
+        delim = "\t"
+    else:
+        # Default to comma unless another candidate clearly outnumbers it
+        # in the first 4 KiB; the first such candidate wins.
+        head = data[:4096].decode("utf-8", errors="replace")
+        comma_bar = head.count(",") * 1.5
+        delim = next((c for c in ("\t", ";", "|") if head.count(c) > comma_bar), ",")
+    repair = repair_bytes(data, encoding=encoding_override or "utf-8", delimiter=delim)
+    parse_opts = dict(dtype=str, keep_default_na=False, on_bad_lines="warn")
+    return pd.read_csv(
+        io.BytesIO(repair.repaired_bytes),
+        encoding="utf-8", delimiter=delim, **parse_opts,
+    )
+
+
+def _run_analysis_with_override(encoding_override: Optional[str]) -> list[Finding]:
+    """Analyze the session upload again, forcing *encoding_override*.
+
+    ``analyze`` is given a path here rather than a DataFrame, so the
+    upload is spilled to a temporary file that carries the original
+    extension (``.csv`` if the name has none) and is deleted afterwards.
+    With no upload in the session the result is an empty list.
+    """
+    import tempfile
+    data = st.session_state.get("home_uploaded_bytes")
+    if not data:
+        return []
+    name = st.session_state.get("home_uploaded_name") or "tmp.csv"
+    suffix = "." + name.rpartition(".")[2] if "." in name else ".csv"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as fh:
+        scratch = Path(fh.name)
+        fh.write(data)
+    try:
+        return analyze(scratch, encoding_override=encoding_override)
+    finally:
+        scratch.unlink(missing_ok=True)
+
+
+def _confidence_pill(c: str) -> str:
+    """Render the confidence tier as a colored Streamlit-markdown badge."""
+    color = {"high": "green", "medium": "orange", "low": "red"}.get(c, "gray")
+    return f":{color}-background[**{c.upper()}**]"
+
+
+def _severity_pill(s: str) -> str:
+    color = {"info": "blue", "warn": "orange", "error": "red"}.get(s, "gray")  # unlisted severities render gray
+    return f":{color}-background[**{s}**]"
+
+
+# ---------------------------------------------------------------------------
+# Output options (Advanced — re-encode the cleaned DataFrame for download)
+# ---------------------------------------------------------------------------
+
+# (label_shown_to_user, codec_passed_to_pandas)
+_OUTPUT_ENCODINGS = [
+    ("UTF-8 (recommended)", "utf-8"),
+    ("UTF-8 with BOM (Excel)", "utf-8-sig"),
+    ("Windows-1252 (Western Europe)", "cp1252"),
+    ("ISO-8859-1 / Latin-1", "iso-8859-1"),
+    ("ISO-8859-15 / Latin-9", "iso-8859-15"),
+    ("Windows-1250 (Central Europe)", "cp1250"),
+    ("ISO-8859-2 / Latin-2", "iso-8859-2"),
+    ("Windows-1251 (Cyrillic)", "cp1251"),
+    ("Shift_JIS (Japanese)", "shift_jis"),
+    ("GB18030 (Chinese)", "gb18030"),
+    ("Big5 (Traditional Chinese)", "big5"),
+    ("EUC-KR (Korean)", "euc-kr"),
+    ("UTF-16 LE with BOM", "utf-16"),
+]
+
+_OUTPUT_DELIMITERS = [
+    ("Comma  ,", ","),
+    ("Tab  \\t", "\t"),
+    ("Semicolon  ;", ";"),
+    ("Pipe  |", "|"),
+]
+
+_OUTPUT_LINE_TERMINATORS = [
+    ("LF — \\n (Unix / web / git default)", "\n"),
+    ("CRLF — \\r\\n (Windows / classic Excel)", "\r\n"),
+    ("CR — \\r (classic Mac, very rare)", "\r"),
+]
+
+
+def _build_output_bytes(
+    df: pd.DataFrame,
+    *,
+    encoding: str,
+    delimiter: str,
+    line_terminator: str,
+) -> tuple[bytes, Optional[str]]:
+    """Serialize *df* with the user's output options.
+
+    Returns ``(bytes, error_message)``. ``error_message`` is non-None when
+    the chosen encoding cannot represent at least one cell — characters
+    that don't exist in the target codepage are replaced with ``?`` so
+    the user still gets a download, plus a warning telling them which
+    target was lossy.
+    """
+    buf = io.StringIO()
+    df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
+    text = buf.getvalue()
+    try:
+        return text.encode(encoding), None
+    except UnicodeEncodeError as exc:
+        # The codec reports where encoding stopped, so take the first
+        # offending character from the exception rather than re-encoding
+        # the text one character at a time, which meant a Python-level
+        # pass (and a codec call per character) over large exports.
+        offender_at = exc.start
+        bad: Optional[str] = (
+            text[offender_at] if 0 <= offender_at < len(text) else None
+        )
+        msg = (
+            f"Some characters cannot be represented in {encoding}"
+            + (f" (first offender: {bad!r})" if bad else "")
+            + ". Falling back to '?' replacement; non-Latin content will be lost."
+        )
+        return text.encode(encoding, errors="replace"), msg
+
+
+def _preview_table(f: Finding, decision_action: str, payload: Optional[dict]) -> Optional[pd.DataFrame]:
+    """Build a before/after preview from finding samples.
+
+    Runs the registered fix function on each sample value individually so
+    the user sees exactly what would change. Returns None when *f* has no
+    samples; with no registered fix the samples are shown unchanged.
+    """
+    if not f.samples:
+        return None
+    fix_fn = get_fix(f.fix_action)  # NOTE(review): decision_action is never consulted in this function
+    if fix_fn is None:
+        # No fix to preview; show samples as-is.
+        return pd.DataFrame(
+            [{"row": r, "column": c, "value": v} for r, c, v in f.samples]
+        )
+    rows = []
+    for r, col, val in f.samples:
+        # Run the fix on a tiny single-cell DataFrame so payload semantics
+        # (e.g. lowercase_email's column targeting) are honored.
+        mini = pd.DataFrame({col: [val]})
+        try:
+            new_df, _ = fix_fn(mini, payload)
+            new_val = new_df[col].iloc[0]
+        except Exception as e:  # broad catch: any fix failure is rendered in the row instead of raised
+            new_val = f"<preview error: {e}>"
+        rows.append({"row": r, "column": col, "before": val, "after": new_val})
+    return pd.DataFrame(rows)
+
+
+# ---------------------------------------------------------------------------
+# Page body
+# ---------------------------------------------------------------------------
+
+st.title("🛡️ Review & Normalize")
+st.caption(
+    "Every finding is shown below with the algorithm that would fix it. "
+    "Auto-fix the high-confidence ones in one click; preview or customize "
+    "the rest before applying."
+)
+
+# Pre-flight: nothing to review without an upload.
+findings: list[Finding] = st.session_state.get("home_findings") or []  # missing or None becomes an empty list
+upload_name = st.session_state.get("home_uploaded_name")
+
+if not upload_name:  # the upload's presence is judged by its stored name
+    st.warning("No file uploaded. Go back to the home page and upload a CSV or Excel file first.")
+    if st.button("Back to home"):
+        st.switch_page("app.py")
+    st.stop()  # nothing below this line runs without an upload name
+
+# ---- Encoding picker --------------------------------------------------------
+#
+# Charset detection misfires on small files, byte-equivalent codepages
+# (cp1252 vs Latin-1 vs cp1250), and content where every byte happens to
+# decode under the wrong encoding (KOI8-R bytes that look like Shift_JIS).
+# When the user spots mojibake or U+FFFD chars in the findings list, this
+# picker is the escape hatch — pick the right encoding, re-run the analyzer.
+
+with st.container(border=True):
+    detected_enc = _detected_encoding_for_session()
+    current_override = st.session_state.get("encoding_override")
+    suffix = (st.session_state.get("home_uploaded_name") or "")
+    suffix = suffix.rsplit(".", 1)[-1].lower() if "." in suffix else ""
+    is_excel = suffix in ("xlsx", "xls")
+
+    st.markdown("**File encoding**")
+    if is_excel:
+        st.caption(
+            "Excel files store text as Unicode internally — encoding override "
+            "doesn't apply. Skip this section."
+        )
+    else:
+        cap_parts = [f"Detected: `{detected_enc or 'unknown'}`"]
+        if current_override:
+            cap_parts.append(f"Currently using: `{current_override}`")
+        st.caption(
+            " · ".join(cap_parts)
+            + " · Override only if you see mojibake (e.g. `Ã©` for `é`) or U+FFFD"
+            " (`<60>`) in the findings below."
+        )
+
+        col_pick, col_custom, col_apply = st.columns([2, 2, 1])
+
+        with col_pick:
+            current_label = current_override or "(detected)"
+            try:
+                idx = _OVERRIDE_ENCODINGS.index(current_label)
+            except ValueError:  # the active override is not one of the preset entries
+                idx = _OVERRIDE_ENCODINGS.index("Other…")  # so preselect the free-text entry
+            chosen = st.selectbox(
+                "Encoding",
+                options=_OVERRIDE_ENCODINGS,
+                index=idx,
+                key="encoding_override_select",
+                label_visibility="collapsed",
+            )
+
+        custom_value: Optional[str] = None  # stays None unless "Other…" is selected
+        with col_custom:
+            if chosen == "Other…":
+                custom_value = st.text_input(
+                    "Custom encoding (e.g. `cp1257`, `iso-8859-9`)",
+                    value=current_override if current_override and current_override not in _OVERRIDE_ENCODINGS else "",
+                    key="encoding_override_custom",
+                    label_visibility="collapsed",
+                    placeholder="cp1257",
+                )
+
+        with col_apply:
+            if st.button("Re-analyze", use_container_width=True):
+                if chosen == "(detected)":
+                    new_override = None
+                elif chosen == "Other…":
+                    new_override = (custom_value or "").strip() or None
+                else:
+                    new_override = chosen
+
+                # Sanity-check the override actually decodes the bytes.
+                data = st.session_state.get("home_uploaded_bytes") or b""
+                decode_err = None
+                if new_override is not None:
+                    try:
+                        data.decode(new_override, errors="strict")
+                    except (UnicodeDecodeError, LookupError) as e:
+                        decode_err = str(e)
+
+                # st.rerun() below aborts this run, so a warning drawn here
+                # would vanish at once; stash it for the next run instead.
+                if decode_err is not None:
+                    st.session_state["encoding_override_warning"] = (
+                        f"`{new_override}` cannot decode this file: {decode_err}. "
+                        f"Re-running anyway with replacement-character fallback so "
+                        f"you can see where the failures are."
+                    )
+
+                # Re-run analysis with the override and refresh session state.
+                st.session_state["encoding_override"] = new_override
+                st.session_state["home_findings"] = _run_analysis_with_override(new_override)
+                # Drop any prior gate result; the user must re-apply.
+                st.session_state.pop("normalization_result", None)
+                st.session_state.pop("normalization_for", None)
+                st.session_state.pop("review_decisions", None)
+                st.rerun()
+        stashed_warning = st.session_state.pop("encoding_override_warning", None)  # left by the previous Re-analyze click
+        if stashed_warning:
+            st.warning(stashed_warning)
+
+# Reload findings — the picker above may have just rewritten them.
+findings = st.session_state.get("home_findings") or []
+
+if not findings:
+    st.success("✓ No findings to review. The file is already clean — open any tool to begin.")
+    st.stop()  # NOTE(review): this path stores no normalization_result — confirm tool pages aren't left gated
+
+
+# ---- Top-line counters -------------------------------------------------------
+
+n_high = sum(1 for f in findings if f.confidence == "high" and not f.pre_applied and f.fix_action)  # only this tier also requires a fix_action
+n_medium = sum(1 for f in findings if f.confidence == "medium" and not f.pre_applied)
+n_low = sum(1 for f in findings if f.confidence == "low" and not f.pre_applied)
+n_pre = sum(1 for f in findings if f.pre_applied)
+n_block = sum(1 for f in findings if f.severity == "error")  # counts every error finding, pre-applied ones included
+
+c1, c2, c3, c4, c5 = st.columns(5)
+c1.metric("High confidence", n_high, help="Round-trip safe — eligible for auto-fix.")
+c2.metric("Medium", n_medium, help="Right call in the common case; preview before applying.")
+c3.metric("Low", n_low, help="Heuristic — opt in only.")
+c4.metric("Already applied", n_pre, help="Fixed during the read pass (BOM, NUL, line endings).")
+c5.metric("Blocking", n_block, help="Severity = error; must be resolved or waived.")
+
+st.divider()
+
+
+# ---- Top-level controls ------------------------------------------------------
+
+decisions_state: dict = st.session_state.setdefault("review_decisions", {})  # finding id -> Decision, kept in session state
+
+bar_left, bar_mid, bar_right = st.columns([1.2, 1.2, 3])
+
+with bar_left:
+    if st.button("✨ Auto-fix high-confidence", type="primary", use_container_width=True):
+        for f in findings:
+            if (  # same conditions as normalize._is_actionable, plus confidence == "high"
+                not f.pre_applied
+                and f.confidence == "high"
+                and f.fix_action
+                and get_fix(f.fix_action) is not None
+            ):
+                decisions_state[f.id] = Decision(finding_id=f.id, action="auto")
+        st.rerun()
+
+with bar_mid:
+    if st.button("Skip everything (not recommended)", use_container_width=True):
+        for f in findings:
+            if not f.pre_applied:  # error-severity findings are marked skip (waived) as well
+                decisions_state[f.id] = Decision(finding_id=f.id, action="skip")
+        st.rerun()
+
+
+# ---- Per-finding cards -------------------------------------------------------
+
+# Sort: not-yet-applied first, then by severity, confidence and id; unrecognized tiers sort last.
+def _sort_key(f: Finding) -> tuple:
+    severity_rank = {"error": 0, "warn": 1, "info": 2}.get(f.severity, 3)
+    confidence_rank = {"high": 0, "medium": 1, "low": 2}.get(f.confidence, 3)
+    return (int(f.pre_applied), severity_rank, confidence_rank, f.id)
+
+
+for f in sorted(findings, key=_sort_key):  # one expander card per finding
+    decision = decisions_state.get(f.id)
+    decision_action = decision.action if decision else (
+        "auto" if (f.pre_applied or (f.confidence == "high" and f.fix_action)) else "skip"
+    )
+    # (No stored decision: default is "auto" for pre-applied or high-confidence fixable findings, else "skip".)
+    title_bits = [
+        _severity_pill(f.severity),
+        _confidence_pill(f.confidence),
+        f"**{f.id}**",
+        f"({f.count})",
+    ]
+    if f.pre_applied:
+        title_bits.append(":gray-background[applied during read]")
+    # Error-severity findings start expanded; everything else starts collapsed.
+    with st.expander(" ".join(title_bits), expanded=(f.severity == "error")):
+        st.caption(f.description)
+        if f.tool:
+            st.caption(f"Owned by: `{f.tool}`")
+        # Pre-applied findings are display-only: skip the decision controls for this card.
+        if f.pre_applied:
+            st.info("This was already applied during the file read pass — no decision needed.")
+            continue
+        # No fix_action: show a notice only — the decision radio below is still rendered.
+        if not f.fix_action:
+            if f.severity == "error":
+                st.error(
+                    "Blocking finding with no auto-fix. Choose **Skip / waive** to "
+                    "acknowledge and proceed (not recommended), or fix the file outside "
+                    "DataTools and re-upload."
+                )
+            else:
+                st.info("Informational only — no fix to apply.")
+
+        # Decision radio
+        choice_labels = {
+            "auto": "Auto-fix with our algorithm",
+            "skip": "Skip / waive (no change)",
+        }
+        # Customize is offered for fixes that take a meaningful payload.
+        if f.fix_action in ("replace_null_sentinels",):
+            choice_labels["modified"] = "Customize"
+        # index preselects decision_action when it is among the offered options, else the first option.
+        chosen = st.radio(
+            "Decision",
+            options=list(choice_labels.keys()),
+            index=list(choice_labels.keys()).index(decision_action)
+                if decision_action in choice_labels else 0,
+            format_func=lambda k: choice_labels[k],
+            key=f"decision_{f.id}",
+            horizontal=True,
+        )
+
+        # Customize payload editor (only for the modified action)
+        payload: Optional[dict] = None
+        if chosen == "modified" and f.fix_action == "replace_null_sentinels":
+            default_sentinels = ", ".join(sorted([
+                "n/a", "na", "nan", "null", "none", "-", "--", "tbd", "unknown",
+            ]))
+            text = st.text_area(
+                "Sentinels (comma-separated, case-insensitive):",
+                value=(decision.payload or {}).get(
+                    "sentinels_raw", default_sentinels,
+                ) if decision else default_sentinels,
+                key=f"sentinels_{f.id}",
+            )
+            sentinels = [s.strip() for s in text.split(",") if s.strip()]
+            payload = {"sentinels": sentinels, "sentinels_raw": text}
+
+        # Persist the current choice (and payload, if any) on every run that reaches this point
+        decisions_state[f.id] = Decision(
+            finding_id=f.id, action=chosen, payload=payload,
+        )
+
+        # Preview
+        if chosen != "skip" and f.samples:
+            preview = _preview_table(f, chosen, payload)
+            if preview is not None and not preview.empty:
+                st.markdown("**Preview** (showing up to 5 affected cells)")
+                st.dataframe(preview, use_container_width=True, hide_index=True)
+
+st.divider()
+
+
+# ---- Apply ------------------------------------------------------------------
+# Apply re-loads the upload and runs the recorded decisions; Reset drops stored state and reruns.
+bottom_left, bottom_mid, _bottom_spacer = st.columns([1, 1, 3])
+
+with bottom_left:
+    apply_clicked = st.button(
+        "✅ Apply & enter tools", type="primary", use_container_width=True,
+        disabled=not decisions_state,
+    )
+
+with bottom_mid:
+    reset_clicked = st.button("Reset all decisions", use_container_width=True)
+
+if reset_clicked:
+    # Forget the decisions and any stored gate result, then start the page over.
+    for stale_key in ("review_decisions", "normalization_result", "normalization_for"):
+        st.session_state.pop(stale_key, None)
+    st.rerun()
+
+if apply_clicked:
+    chosen_encoding = st.session_state.get("encoding_override")
+    df = _load_df_from_session(encoding_override=chosen_encoding)
+    if df is None:
+        st.error("Could not re-read the uploaded file. Try re-uploading.")
+        st.stop()
+    # Anything in the store that is not a Decision instance is ignored.
+    decisions_list = list(filter(lambda d: isinstance(d, Decision), decisions_state.values()))
+    result = apply_decisions(df, findings, decisions_list)
+    st.session_state["normalization_result"] = result
+    st.session_state["normalization_for"] = _upload_hash()
+
+    summary = gate_summary(result)
+    if result.passed and is_normalized(findings, result):
+        st.success(
+            f"✓ Gate passed — {summary['fixes_applied']} fix(es) applied, "
+            f"{summary['cells_changed']} cell(s) changed. You can now open any tool."
+        )
+    elif result.blocking_findings:
+        blocking_ids = ", ".join(b.id for b in result.blocking_findings)
+        st.error(
+            f"Gate blocked by error-level findings: {blocking_ids}. "
+            f"Resolve or waive them above before continuing."
+        )
+    elif result.pending_findings:
+        pending_ids = ", ".join(f.id for f in result.pending_findings)
+        st.warning(
+            f"Pending decisions remain on: {pending_ids}. "
+            f"Choose Auto-fix or Skip for each before continuing."
+        )
+
+# Persisted summary (re-render on reload)
+result: Optional[NormalizationResult] = st.session_state.get("normalization_result")
+if result is not None and st.session_state.get("normalization_for") == _upload_hash():
+    with st.expander("Audit log"):
+        if result.applied:
+            st.markdown("**Applied fixes**")
+            st.dataframe(
+                pd.DataFrame([
+                    {
+                        "finding": a.finding_id,
+                        "fix_action": a.fix_action,
+                        "decision": a.decision,
+                        "cells_changed": a.cells_changed,
+                    }
+                    for a in result.applied
+                ]),
+                use_container_width=True, hide_index=True,
+            )
+        if result.skipped_findings:
+            st.markdown("**Skipped (waived by user)**")
+            st.write([f.id for f in result.skipped_findings])
+    # The download section sits beside the audit log, not inside it: older Streamlit
+    # releases raise when one expander is nested in another.
+    if result.passed:
+        st.markdown("---")
+        st.markdown("**Download normalized file**")
+        with st.expander("⚙️  Advanced output options"):
+            st.caption(
+                "Defaults match what the analyzer normalized to: UTF-8, "
+                "comma-separated, LF line endings. Override only if your "
+                "destination tool requires a specific format."
+            )
+            col_enc, col_delim, col_le = st.columns(3)
+            with col_enc:
+                enc_choice = st.selectbox(
+                    "Encoding (code page)",
+                    options=[label for label, _ in _OUTPUT_ENCODINGS],
+                    index=0,
+                    key="output_encoding_select",
+                )
+                out_encoding = next(
+                    codec for label, codec in _OUTPUT_ENCODINGS if label == enc_choice
+                )
+
+            with col_delim:
+                delim_choice = st.selectbox(
+                    "Delimiter",
+                    options=[label for label, _ in _OUTPUT_DELIMITERS],
+                    index=0,
+                    key="output_delim_select",
+                )
+                out_delim = next(
+                    ch for label, ch in _OUTPUT_DELIMITERS if label == delim_choice
+                )
+
+            with col_le:
+                le_choice = st.selectbox(
+                    "Line terminator",
+                    options=[label for label, _ in _OUTPUT_LINE_TERMINATORS],
+                    index=0,
+                    key="output_le_select",
+                )
+                out_le = next(
+                    ch for label, ch in _OUTPUT_LINE_TERMINATORS if label == le_choice
+                )
+
+        data, encode_warn = _build_output_bytes(
+            result.cleaned_df,
+            encoding=out_encoding,
+            delimiter=out_delim,
+            line_terminator=out_le,
+        )
+        if encode_warn:
+            st.warning(encode_warn)
+
+        ext = "tsv" if out_delim == "\t" else "csv"
+        mime = "text/tab-separated-values" if out_delim == "\t" else "text/csv"
+        file_name = f"{Path(upload_name).stem}.normalized.{ext}"
+        st.download_button(
+            f"⬇️  Download {file_name}",
+            data=data,
+            file_name=file_name,
+            mime=mime,
+            type="primary",
+        )
--- a/src/gui/pages/1_Deduplicator.py
+++ b/src/gui/pages/1_Deduplicator.py
@@ -22,10 +22,12 @@ from src.gui.components import (
    hide_streamlit_chrome,
    match_group_card,
    pickup_or_upload,
+    require_normalization_gate,
    results_summary,
 )

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Session state defaults
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -18,6 +18,7 @@ from src.gui.components import (
    hide_streamlit_chrome,
    pickup_or_upload,
    render_hidden_aware_preview,
+    require_normalization_gate,
 )
 from src.core.text_clean import (
    PRESETS,
@@ -28,6 +29,7 @@ from src.core.text_clean import (
 )

 hide_streamlit_chrome()
+require_normalization_gate()


 # ---------------------------------------------------------------------------
--- a/src/gui/pages/3_Format_Standardizer.py
+++ b/src/gui/pages/3_Format_Standardizer.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/4_Missing_Values.py
+++ b/src/gui/pages/4_Missing_Values.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/5_Column_Mapper.py
+++ b/src/gui/pages/5_Column_Mapper.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/6_Outlier_Detector.py
+++ b/src/gui/pages/6_Outlier_Detector.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/7_Multi_File_Merger.py
+++ b/src/gui/pages/7_Multi_File_Merger.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/8_Validator_Reporter.py
+++ b/src/gui/pages/8_Validator_Reporter.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header
--- a/src/gui/pages/9_Pipeline_Runner.py
+++ b/src/gui/pages/9_Pipeline_Runner.py
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import hide_streamlit_chrome, require_normalization_gate

 hide_streamlit_chrome()
+require_normalization_gate()

 # ---------------------------------------------------------------------------
 # Header