feat(gate): CSV-normalization gate with confidence-tiered findings

Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.

Core (src/core/):
  - analyze.py: Finding gains confidence, fix_action, pre_applied; new
    detectors for encoding_uncertain, encoding_decode_failed; new top-
    level encoding_override parameter.
  - fixes.py: registry of fix algorithms keyed by fix_action id.
  - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
    the NormalizationResult / Decision dataclasses the gate consumes.
  - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
    transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
    and normalizes line endings (fixes bare-CR parser crash); empty file
    handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
  - pages/0_Review.py: gate page with per-finding decision controls,
    encoding override picker (16 codepages + custom), and Advanced output
    options (encoding, delimiter, line terminator) on the download.
  - components.py: require_normalization_gate() helper.
  - pages/1-9: gate guard wired on every tool page.

Test corpora:
  - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
    UTF-8 files + manifest, synced from Business/DataTools.
  - test-cases/text-cleaner-corpus/test_data/17: synced malformed input
    (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
  - test_normalize.py (48): finding fields, fix registry, auto_fix scope,
    decision paths, gate idempotency, output-options helper.
  - test_encodings_corpus.py (90, 16 xfailed): parametric detection +
    decode + analyzer-no-crash sweep against the manifest.
  - test_analyze.py: encoding override + encoding_uncertain detectors.
  - test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 20:35:27 +00:00
parent e9c490ae1b
commit 82d7fef21e
68 changed files with 2883 additions and 34 deletions

349
tests/test_normalize.py Normal file
View File

@@ -0,0 +1,349 @@
"""Tests for the CSV-normalization gate.
Covers:
* ``Finding.confidence`` and ``Finding.fix_action`` field defaults.
* ``auto_fix`` applies every high-confidence finding and leaves
medium/low ones pending.
* ``apply_decisions`` honors per-finding skip / modified payloads.
* ``is_normalized`` re-checks high-confidence detectors after a fix pass.
* The full corpus auto-fix sweep: every fixture either passes the gate
or has its remaining medium/low findings declared in pending.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
analyze,
_load_for_analysis,
FIX_FOLD_SMART_PUNCT,
FIX_LOWERCASE_EMAIL,
FIX_REPLACE_NULL_SENTINELS,
FIX_NONE,
)
from src.core.fixes import get_fix, available_actions
from src.core.normalize import (
Decision,
NormalizationResult,
auto_fix,
apply_decisions,
is_normalized,
gate_summary,
)
# Directory holding the text-cleaner CSV fixtures, located two levels above
# this test file under test-cases/text-cleaner-corpus/test_data.
CORPUS = Path(__file__).parent.parent / "test-cases" / "text-cleaner-corpus" / "test_data"
# ---------------------------------------------------------------------------
# Field defaults
# ---------------------------------------------------------------------------
class TestFindingFields:
    """Defaults of the gate-related ``Finding`` fields, plus the confidence
    and fix_action the analyzer attaches to two specific findings."""

    @staticmethod
    def _minimal_finding():
        """Build a ``Finding`` from only the arguments these tests supply, so
        every gate-related field is left at its default."""
        return Finding(id="x", severity="warn", tool="", count=1, description="d")

    def test_default_confidence_is_high(self):
        assert self._minimal_finding().confidence == "high"

    def test_default_fix_action_is_empty(self):
        assert self._minimal_finding().fix_action == ""

    def test_pre_applied_default_false(self):
        assert self._minimal_finding().pre_applied is False

    def test_smart_punct_finding_carries_fix_action(self):
        # Curly quotes are spelled as escapes so a re-encoding of this file
        # cannot turn them into mojibake and silently change the input.
        df = pd.DataFrame({"x": ["\u201chello\u201d"]})
        findings = analyze(df)
        smart = next(
            (f for f in findings if f.id == "smart_punctuation_in_data"), None
        )
        # Fail with a readable message instead of a bare StopIteration.
        assert smart is not None, "analyzer did not report smart_punctuation_in_data"
        assert smart.confidence == "high"
        assert smart.fix_action == FIX_FOLD_SMART_PUNCT

    def test_mojibake_finding_is_low_confidence(self):
        # The garbled text below is intentional: it is the input that should
        # trigger the suspected_mojibake detector.
        df = pd.DataFrame({"x": ["café"]})
        findings = analyze(df)
        moji = next((f for f in findings if f.id == "suspected_mojibake"), None)
        assert moji is not None, "analyzer did not report suspected_mojibake"
        assert moji.confidence == "low"
# ---------------------------------------------------------------------------
# Fix registry
# ---------------------------------------------------------------------------
class TestFixRegistry:
    """Lookup behaviour of the fix registry: known ids resolve to callables,
    unknown ids resolve to ``None``."""

    def test_high_confidence_fixes_registered(self):
        registered = available_actions()
        for action in (
            FIX_FOLD_SMART_PUNCT,
            FIX_LOWERCASE_EMAIL,
            FIX_REPLACE_NULL_SENTINELS,
        ):
            assert action in registered

    def test_get_fix_returns_callable(self):
        assert callable(get_fix(FIX_FOLD_SMART_PUNCT))

    def test_get_fix_unknown_returns_none(self):
        missing = get_fix("not_a_real_action")
        assert missing is None
# ---------------------------------------------------------------------------
# auto_fix
# ---------------------------------------------------------------------------
class TestAutoFix:
    """``auto_fix`` applies the high-confidence findings, leaves medium ones
    pending, and reports error-level findings as blocking."""

    def test_applies_high_confidence_only(self):
        df = pd.DataFrame({
            # ASCII padding on the first value, a trailing NBSP on the second.
            # The NBSP is an explicit escape: as a raw character it is
            # indistinguishable from a space and easily lost in editing.
            "name": [" Alice ", "Bob\u00a0"],  # whitespace + NBSP -> high
            "email": ["A@X.com", "b@x.com"],  # mixed case -> medium
        })
        findings = analyze(df)
        result = auto_fix(df, findings)

        # whitespace_padding and nbsp_or_unicode_whitespace should be applied.
        applied_ids = {a.finding_id for a in result.applied}
        assert "whitespace_padding" in applied_ids
        assert "nbsp_or_unicode_whitespace" in applied_ids

        # mixed_case_email_column is medium -> pending.
        pending_ids = {f.id for f in result.pending_findings}
        assert "mixed_case_email_column" in pending_ids

    def test_cells_actually_changed(self):
        df = pd.DataFrame({"x": [" hi ", "ok"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.cleaned_df["x"].tolist() == ["hi", "ok"]

    def test_no_findings_no_fixes(self):
        df = pd.DataFrame({"id": ["1", "2"], "name": ["a", "b"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.applied == []
        assert result.passed is True

    def test_blocks_on_severity_error(self, tmp_path):
        # A zero-byte file is expected to yield the empty_input finding,
        # which must block the gate.
        empty_csv = tmp_path / "empty.csv"
        empty_csv.write_bytes(b"")
        findings = analyze(empty_csv)
        df, _, _ = _load_for_analysis(empty_csv, sample_rows=1000)
        result = auto_fix(df, findings)
        assert any(b.id == "empty_input" for b in result.blocking_findings)
        assert result.passed is False
# ---------------------------------------------------------------------------
# apply_decisions
# ---------------------------------------------------------------------------
class TestApplyDecisions:
    """``apply_decisions`` honours per-finding skip / auto / modified
    decisions and leaves undecided medium findings pending."""

    def test_skip_decision_records_skipped(self):
        # Curly quotes written as escapes so the input is unambiguous.
        df = pd.DataFrame({"x": ["\u201csmart\u201d"]})
        findings = analyze(df)
        decisions = [Decision(finding_id="smart_punctuation_in_data", action="skip")]
        result = apply_decisions(df, findings, decisions)
        assert any(s.id == "smart_punctuation_in_data" for s in result.skipped_findings)
        # And the smart quotes survived. (Checking for "" would always pass.)
        assert "\u201c" in result.cleaned_df["x"].iloc[0]

    def test_auto_decision_runs_fix(self):
        df = pd.DataFrame({"x": ["\u201csmart\u201d"]})
        findings = analyze(df)
        decisions = [Decision(finding_id="smart_punctuation_in_data", action="auto")]
        result = apply_decisions(df, findings, decisions)
        # Both curly quotes are folded to straight ASCII double quotes.
        assert result.cleaned_df["x"].iloc[0] == '"smart"'

    def test_modified_decision_uses_payload(self):
        df = pd.DataFrame({"status": ["ACTIVE", "TBD", "TBD", "active"]})
        findings = analyze(df)
        # null_like_sentinels needs to be present for the decision to apply.
        if not any(f.id == "null_like_sentinels" for f in findings):
            pytest.skip("analyzer didn't surface null sentinels for this fixture")
        # Restrict the null-sentinel set to only "TBD" via payload.
        decisions = [Decision(
            finding_id="null_like_sentinels",
            action="modified",
            payload={"sentinels": ["TBD"]},
        )]
        result = apply_decisions(df, findings, decisions)
        assert result.cleaned_df["status"].tolist() == ["ACTIVE", "", "", "active"]

    def test_lowercase_email_uses_finding_column(self):
        df = pd.DataFrame({
            "email": ["ALICE@X.com", "bob@x.com"],
            "name": ["Alice", "Bob"],
        })
        findings = analyze(df)
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        decisions = [Decision(finding_id="mixed_case_email_column", action="auto")]
        result = apply_decisions(df, findings, decisions)
        assert result.cleaned_df["email"].tolist() == ["alice@x.com", "bob@x.com"]
        # Other columns untouched.
        assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"]

    def test_undecided_medium_finding_stays_pending(self):
        df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        findings = analyze(df)
        # Check the precondition before exercising the code under test, the
        # same order the other tests in this class use.
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        result = apply_decisions(df, findings, decisions=[])
        assert any(f.id == "mixed_case_email_column" for f in result.pending_findings)
# ---------------------------------------------------------------------------
# is_normalized
# ---------------------------------------------------------------------------
class TestIsNormalized:
    """``is_normalized`` verdicts for clean, skipped, pending and missing
    gate results."""

    def test_clean_dataframe_passes(self):
        df = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert is_normalized(findings, result) is True

    def test_unnormalized_after_skip_high_confidence(self):
        df = pd.DataFrame({"x": [" padded "]})
        findings = analyze(df)
        # Skip the only high-confidence fix.
        decisions = [Decision(finding_id="whitespace_padding", action="skip")]
        result = apply_decisions(df, findings, decisions)
        # Re-analysis still finds the issue, so gate is not normalized.
        assert is_normalized(findings, result) is False

    def test_pending_medium_blocks_gate(self):
        df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        findings = analyze(df)
        # Report a skip rather than a silent pass when the analyzer does not
        # surface the finding this test depends on.
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        result = auto_fix(df, findings)
        # auto_fix leaves medium pending -> gate not passed.
        assert is_normalized(findings, result) is False

    def test_none_result_not_normalized(self):
        assert is_normalized([], None) is False
# ---------------------------------------------------------------------------
# Corpus sweep — every fixture either passes or has declared pending
# ---------------------------------------------------------------------------
# Sorted list of corpus CSV fixtures; empty when the corpus directory is
# absent so the parametrized sweep simply has no cases.
if CORPUS.exists():
    CORPUS_FILES = sorted(CORPUS.glob("*.csv"))
else:
    CORPUS_FILES = []

# Fixture stem -> ids of the medium/low findings expected to remain pending
# once auto_fix has run. Fixtures not listed are expected to have none.
EXPECTED_PENDING_AFTER_AUTOFIX = {
    "11_embedded_newlines": {"mixed_case_email_column"},
    "12_case_variations": {"mixed_case_email_column"},
    "14_mojibake": {"suspected_mojibake"},
    "17_preserve_intended": {"null_like_sentinels"},
    "20_kitchen_sink": {"mixed_case_email_column"},
}

# Fixture stem -> ids of the severity=error findings expected to block the
# gate. Fixtures not listed are expected to have none.
EXPECTED_BLOCKING = {"18_empty_file": {"empty_input"}}
@pytest.mark.parametrize("path", CORPUS_FILES, ids=lambda p: p.stem)
def test_corpus_auto_fix_state(path):
    """Check one corpus fixture against the declared expectations.

    After ``auto_fix``, the ids of the pending and blocking findings must
    equal the sets declared for the fixture's stem (empty set when the
    fixture is not listed).
    """
    stem = path.stem
    expected_pending = EXPECTED_PENDING_AFTER_AUTOFIX.get(stem, set())
    expected_blocking = EXPECTED_BLOCKING.get(stem, set())

    findings = analyze(path, sample_rows=1000)
    loaded = _load_for_analysis(path, sample_rows=1000)
    df = loaded[0]
    result = auto_fix(df, findings)

    pending_ids = set()
    for finding in result.pending_findings:
        pending_ids.add(finding.id)
    blocking_ids = set()
    for finding in result.blocking_findings:
        blocking_ids.add(finding.id)

    assert pending_ids == expected_pending, (
        f"{path.name}: pending {pending_ids} != expected {expected_pending}"
    )
    assert blocking_ids == expected_blocking, (
        f"{path.name}: blocking {blocking_ids} != expected {expected_blocking}"
    )
def test_corpus_auto_fix_idempotent():
    """A second analyze + auto_fix pass over already-cleaned output must
    produce the same bytes as the first pass.

    Skipped when the specific fixture used here is not on disk.
    """
    path = CORPUS / "20_kitchen_sink.csv"
    # Guard on the file this test reads, not on "any corpus file exists":
    # a partial corpus would otherwise error instead of skipping.
    if not path.is_file():
        pytest.skip(f"corpus fixture not present: {path.name}")

    findings = analyze(path, sample_rows=1000)
    df, _, _ = _load_for_analysis(path, sample_rows=1000)
    r1 = auto_fix(df, findings)

    # Re-analyze the cleaned frame and run again.
    f2 = analyze(r1.cleaned_df)
    r2 = auto_fix(r1.cleaned_df, f2)
    assert r1.cleaned_bytes == r2.cleaned_bytes
# ---------------------------------------------------------------------------
# Output options (download helper) and gate_summary
# ---------------------------------------------------------------------------
class TestOutputOptions:
    """Contract of the download-flow output builder.

    ``_build`` below is a local copy of the shape of the Review page's
    ``_build_output_bytes`` helper rather than an import of it, because the
    page runs Streamlit code at module load. Keeping the spec here lets a
    future refactor that moves the helper into core/io.py preserve the same
    contract.
    """

    @staticmethod
    def _build(df, *, encoding, delimiter, line_terminator):
        """Serialize *df* to CSV bytes; return ``(bytes, warning)`` where the
        warning is ``None`` on a clean encode and ``"lossy"`` when characters
        had to be replaced."""
        import io as _io

        sink = _io.StringIO()
        df.to_csv(sink, index=False, sep=delimiter, lineterminator=line_terminator)
        csv_text = sink.getvalue()
        try:
            payload = csv_text.encode(encoding)
        except UnicodeEncodeError:
            # Target codepage cannot represent everything: degrade, but say so.
            return csv_text.encode(encoding, errors="replace"), "lossy"
        return payload, None

    def test_utf8_with_bom_starts_with_bom(self):
        frame = pd.DataFrame({"x": ["a"]})
        out, _ = self._build(
            frame, encoding="utf-8-sig", delimiter=",", line_terminator="\n"
        )
        assert out.startswith(b"\xef\xbb\xbf")

    def test_crlf_line_terminator(self):
        frame = pd.DataFrame({"x": ["a", "b"]})
        out, _ = self._build(
            frame, encoding="utf-8", delimiter=",", line_terminator="\r\n"
        )
        assert b"\r\n" in out
        # With every CRLF removed, no bare LF may remain in front of "b".
        without_crlf = out.replace(b"\r\n", b"")
        assert b"\nb" not in without_crlf

    def test_tab_delimiter(self):
        frame = pd.DataFrame({"a": ["x"], "b": ["y"]})
        out, _ = self._build(
            frame, encoding="utf-8", delimiter="\t", line_terminator="\n"
        )
        assert out.startswith(b"a\tb\n")

    def test_cp1252_single_byte_accents(self):
        frame = pd.DataFrame({"name": ["José"]})
        out, _ = self._build(
            frame, encoding="cp1252", delimiter=",", line_terminator="\n"
        )
        # 'é' is single byte 0xE9 in cp1252 (vs 0xC3 0xA9 in UTF-8)
        assert b"\xe9" in out
        assert b"\xc3\xa9" not in out

    def test_lossy_codepage_returns_warning(self):
        frame = pd.DataFrame({"name": ["Иван"]})  # Cyrillic
        out, warning = self._build(
            frame, encoding="cp1252", delimiter=",", line_terminator="\n"
        )
        assert warning is not None
        assert b"?" in out  # replacement chars
class TestGateSummary:
    """Shape of the mapping returned by ``gate_summary``."""

    def test_summary_keys(self):
        expected_keys = {
            "passed", "fixes_applied", "cells_changed",
            "skipped", "pending", "blocking",
        }
        df = pd.DataFrame({"x": [" hi "]})
        result = auto_fix(df, analyze(df))
        summary = gate_summary(result)
        assert set(summary.keys()) == expected_keys