Files
datatools-dev/tests/test_normalize.py
Michael 82d7fef21e feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.

Core (src/core/):
  - analyze.py: Finding gains confidence, fix_action, pre_applied; new
    detectors for encoding_uncertain, encoding_decode_failed; new top-
    level encoding_override parameter.
  - fixes.py: registry of fix algorithms keyed by fix_action id.
  - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
    the NormalizationResult / Decision dataclasses the gate consumes.
  - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
    transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
    and normalizes line endings (fixes bare-CR parser crash); empty file
    handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
  - pages/0_Review.py: gate page with per-finding decision controls,
    encoding override picker (16 codepages + custom), and Advanced output
    options (encoding, delimiter, line terminator) on the download.
  - components.py: require_normalization_gate() helper.
  - pages/1-9: gate guard wired on every tool page.

Test corpora:
  - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
    UTF-8 files + manifest, synced from Business/DataTools.
  - test-cases/text-cleaner-corpus/test_data/17: synced malformed input
    (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
  - test_normalize.py (48): finding fields, fix registry, auto_fix scope,
    decision paths, gate idempotency, output-options helper.
  - test_encodings_corpus.py (90, 16 xfailed): parametric detection +
    decode + analyzer-no-crash sweep against the manifest.
  - test_analyze.py: encoding override + encoding_uncertain detectors.
  - test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00

350 lines
14 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for the CSV-normalization gate.
Covers:

* ``Finding.confidence`` and ``Finding.fix_action`` field defaults.
* The fix registry: ``available_actions`` and ``get_fix`` lookups.
* ``auto_fix`` applies every high-confidence finding and leaves
  medium/low ones pending.
* ``apply_decisions`` honors per-finding skip / modified payloads.
* ``is_normalized`` re-checks high-confidence detectors after a fix pass.
* The full corpus auto-fix sweep: every fixture either passes the gate
  or has its remaining medium/low findings declared in pending.
* The download output-options helper contract and the ``gate_summary`` keys.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
analyze,
_load_for_analysis,
FIX_FOLD_SMART_PUNCT,
FIX_LOWERCASE_EMAIL,
FIX_REPLACE_NULL_SENTINELS,
FIX_NONE,
)
from src.core.fixes import get_fix, available_actions
from src.core.normalize import (
Decision,
NormalizationResult,
auto_fix,
apply_decisions,
is_normalized,
gate_summary,
)
# Text-cleaner fixture directory: <parent of this file's directory>/test-cases/text-cleaner-corpus/test_data
CORPUS = Path(__file__).parent.parent.joinpath(
    "test-cases", "text-cleaner-corpus", "test_data"
)
# ---------------------------------------------------------------------------
# Field defaults
# ---------------------------------------------------------------------------
class TestFindingFields:
    """Defaults of the gate-related ``Finding`` fields and the values the
    analyzer attaches to two specific findings.

    Non-ASCII fixture characters are written as ``\\u`` escapes so the
    fixtures cannot be silently altered by editors, copy/paste, or
    encoding-repair tools.
    """

    @staticmethod
    def _minimal_finding():
        """Build a ``Finding`` passing only the arguments these tests supply."""
        return Finding(id="x", severity="warn", tool="", count=1, description="d")

    @staticmethod
    def _require(findings, finding_id):
        """Return the finding whose ``id`` is ``finding_id``.

        Fails the test with a readable message when the analyzer did not
        report it, instead of leaking a bare ``StopIteration``.
        """
        match = next((f for f in findings if f.id == finding_id), None)
        assert match is not None, f"analyzer did not report {finding_id!r}"
        return match

    def test_default_confidence_is_high(self):
        assert self._minimal_finding().confidence == "high"

    def test_default_fix_action_is_empty(self):
        assert self._minimal_finding().fix_action == ""

    def test_pre_applied_default_false(self):
        assert self._minimal_finding().pre_applied is False

    def test_smart_punct_finding_carries_fix_action(self):
        # U+201C / U+201D are the left/right curly double quotes.
        df = pd.DataFrame({"x": ["\u201chello\u201d"]})
        smart = self._require(analyze(df), "smart_punctuation_in_data")
        assert smart.confidence == "high"
        assert smart.fix_action == FIX_FOLD_SMART_PUNCT

    def test_mojibake_finding_is_low_confidence(self):
        # The UTF-8 bytes of "café" (0xC3 0xA9 for the accent) mis-decoded as
        # Latin-1 read as "caf" + U+00C3 + U+00A9.  A correctly decoded
        # "café" is not mojibake and gives the detector nothing to flag.
        df = pd.DataFrame({"x": ["caf\u00c3\u00a9"]})
        moji = self._require(analyze(df), "suspected_mojibake")
        assert moji.confidence == "low"
# ---------------------------------------------------------------------------
# Fix registry
# ---------------------------------------------------------------------------
class TestFixRegistry:
    """Lookups against the fix registry (``available_actions`` / ``get_fix``)."""

    def test_high_confidence_fixes_registered(self):
        registered = available_actions()
        # Each of these action ids must be present in the registry.
        for action_id in (
            FIX_FOLD_SMART_PUNCT,
            FIX_LOWERCASE_EMAIL,
            FIX_REPLACE_NULL_SENTINELS,
        ):
            assert action_id in registered

    def test_get_fix_returns_callable(self):
        assert callable(get_fix(FIX_FOLD_SMART_PUNCT))

    def test_get_fix_unknown_returns_none(self):
        # An id that is not registered yields None.
        unknown = get_fix("not_a_real_action")
        assert unknown is None
# ---------------------------------------------------------------------------
# auto_fix
# ---------------------------------------------------------------------------
class TestAutoFix:
    """``auto_fix`` applies high-confidence fixes, leaves medium ones
    pending, and reports error-severity findings as blocking."""

    def test_applies_high_confidence_only(self):
        df = pd.DataFrame({
            # Plain-space padding plus a trailing NBSP -> both high confidence.
            # The NBSP is spelled as an escape: as a literal it is invisible
            # and easily replaced by an ordinary space without anyone noticing.
            "name": [" Alice ", "Bob\u00a0"],
            "email": ["A@X.com", "b@x.com"],  # mixed case -> medium
        })
        findings = analyze(df)
        result = auto_fix(df, findings)

        # whitespace_padding and nbsp_or_unicode_whitespace should be applied.
        applied_ids = {a.finding_id for a in result.applied}
        assert "whitespace_padding" in applied_ids
        assert "nbsp_or_unicode_whitespace" in applied_ids

        # mixed_case_email_column is medium -> pending.
        pending_ids = {f.id for f in result.pending_findings}
        assert "mixed_case_email_column" in pending_ids

    def test_cells_actually_changed(self):
        df = pd.DataFrame({"x": [" hi ", "ok"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.cleaned_df["x"].tolist() == ["hi", "ok"]

    def test_no_findings_no_fixes(self):
        df = pd.DataFrame({"id": ["1", "2"], "name": ["a", "b"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.applied == []
        assert result.passed is True

    def test_blocks_on_severity_error(self, tmp_path):
        # A zero-byte file is expected to surface the ``empty_input`` finding.
        empty_csv = tmp_path / "empty.csv"
        empty_csv.write_bytes(b"")
        findings = analyze(empty_csv)
        df, _, _ = _load_for_analysis(empty_csv, sample_rows=1000)
        result = auto_fix(df, findings)
        assert any(b.id == "empty_input" for b in result.blocking_findings)
        assert result.passed is False
# ---------------------------------------------------------------------------
# apply_decisions
# ---------------------------------------------------------------------------
class TestApplyDecisions:
    """``apply_decisions`` with skip / auto / modified decisions, and with
    no decision at all for a medium-confidence finding."""

    # "smart" wrapped in curly double quotes (U+201C ... U+201D), written
    # with escapes so the characters cannot be lost or altered unnoticed.
    _SMART_QUOTED = "\u201csmart\u201d"

    def test_skip_decision_records_skipped(self):
        df = pd.DataFrame({"x": [self._SMART_QUOTED]})
        findings = analyze(df)
        decisions = [Decision(finding_id="smart_punctuation_in_data", action="skip")]
        result = apply_decisions(df, findings, decisions)
        assert any(s.id == "smart_punctuation_in_data" for s in result.skipped_findings)
        # And the smart quotes survived.  (Checking for "" here would be
        # vacuous: the empty string is contained in every string.)
        cell = result.cleaned_df["x"].iloc[0]
        assert "\u201c" in cell
        assert "\u201d" in cell

    def test_auto_decision_runs_fix(self):
        df = pd.DataFrame({"x": [self._SMART_QUOTED]})
        findings = analyze(df)
        decisions = [Decision(finding_id="smart_punctuation_in_data", action="auto")]
        result = apply_decisions(df, findings, decisions)
        assert result.cleaned_df["x"].iloc[0] == '"smart"'

    def test_modified_decision_uses_payload(self):
        df = pd.DataFrame({"status": ["ACTIVE", "TBD", "TBD", "active"]})
        findings = analyze(df)
        # null_like_sentinels needs to be present for the decision to apply.
        if not any(f.id == "null_like_sentinels" for f in findings):
            pytest.skip("analyzer didn't surface null sentinels for this fixture")
        # Restrict the null-sentinel set to only "TBD" via payload.
        decisions = [Decision(
            finding_id="null_like_sentinels",
            action="modified",
            payload={"sentinels": ["TBD"]},
        )]
        result = apply_decisions(df, findings, decisions)
        assert result.cleaned_df["status"].tolist() == ["ACTIVE", "", "", "active"]

    def test_lowercase_email_uses_finding_column(self):
        df = pd.DataFrame({
            "email": ["ALICE@X.com", "bob@x.com"],
            "name": ["Alice", "Bob"],
        })
        findings = analyze(df)
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        decisions = [Decision(finding_id="mixed_case_email_column", action="auto")]
        result = apply_decisions(df, findings, decisions)
        assert result.cleaned_df["email"].tolist() == ["alice@x.com", "bob@x.com"]
        # Other columns untouched.
        assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"]

    def test_undecided_medium_finding_stays_pending(self):
        df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        findings = analyze(df)
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        # No decision supplied -> the finding must remain pending.
        result = apply_decisions(df, findings, decisions=[])
        assert any(f.id == "mixed_case_email_column" for f in result.pending_findings)
# ---------------------------------------------------------------------------
# is_normalized
# ---------------------------------------------------------------------------
class TestIsNormalized:
    """``is_normalized`` verdicts for clean, skipped, pending and missing results."""

    def test_clean_dataframe_passes(self):
        df = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert is_normalized(findings, result) is True

    def test_unnormalized_after_skip_high_confidence(self):
        df = pd.DataFrame({"x": [" padded "]})
        findings = analyze(df)
        # Skip the only high-confidence fix.
        decisions = [Decision(finding_id="whitespace_padding", action="skip")]
        result = apply_decisions(df, findings, decisions)
        # Re-analysis still finds the issue, so gate is not normalized.
        assert is_normalized(findings, result) is False

    def test_pending_medium_blocks_gate(self):
        df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        findings = analyze(df)
        # Report a skip rather than passing silently when the analyzer does
        # not raise the finding this test depends on.
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        result = auto_fix(df, findings)
        # auto_fix leaves medium pending -> gate not passed.
        assert is_normalized(findings, result) is False

    def test_none_result_not_normalized(self):
        assert is_normalized([], None) is False
# ---------------------------------------------------------------------------
# Corpus sweep — every fixture either passes or has declared pending
# ---------------------------------------------------------------------------
# Sorted list of the corpus' *.csv fixtures; empty when the corpus directory
# does not exist, so the parametrized sweep simply collects no cases.
if CORPUS.exists():
    CORPUS_FILES = sorted(CORPUS.glob("*.csv"))
else:
    CORPUS_FILES = []

# Fixture stem -> finding ids expected to stay pending (medium/low
# confidence) after auto_fix.
EXPECTED_PENDING_AFTER_AUTOFIX = {
    "11_embedded_newlines": {"mixed_case_email_column"},
    "12_case_variations": {"mixed_case_email_column"},
    "14_mojibake": {"suspected_mojibake"},
    "17_preserve_intended": {"null_like_sentinels"},
    "20_kitchen_sink": {"mixed_case_email_column"},
}

# Fixture stem -> finding ids expected to block the gate (severity=error).
EXPECTED_BLOCKING = {
    "18_empty_file": {"empty_input"},
}
@pytest.mark.parametrize("path", CORPUS_FILES, ids=lambda p: p.stem)
def test_corpus_auto_fix_state(path):
    """Check one corpus fixture's post-auto_fix state against the tables.

    The pending and blocking finding ids must equal the sets declared for
    the fixture's stem in ``EXPECTED_PENDING_AFTER_AUTOFIX`` and
    ``EXPECTED_BLOCKING``; a stem absent from a table is expected to have
    none of that kind.
    """
    findings = analyze(path, sample_rows=1000)
    frame, _, _ = _load_for_analysis(path, sample_rows=1000)
    outcome = auto_fix(frame, findings)

    stem = path.stem
    expected_pending = EXPECTED_PENDING_AFTER_AUTOFIX.get(stem, set())
    expected_blocking = EXPECTED_BLOCKING.get(stem, set())

    pending_ids = {finding.id for finding in outcome.pending_findings}
    assert pending_ids == expected_pending, (
        f"{path.name}: pending {pending_ids} != expected {expected_pending}"
    )

    blocking_ids = {finding.id for finding in outcome.blocking_findings}
    assert blocking_ids == expected_blocking, (
        f"{path.name}: blocking {blocking_ids} != expected {expected_blocking}"
    )
def test_corpus_auto_fix_idempotent():
    """A second analyze + auto_fix pass over already-cleaned data must
    produce the same ``cleaned_bytes`` as the first pass."""
    path = CORPUS / "20_kitchen_sink.csv"
    # Guard on the exact fixture this test reads: a corpus that exists but
    # lacks this one file should be reported as a skip, not as an error.
    if not path.exists():
        pytest.skip(f"corpus fixture {path.name} not present")

    findings = analyze(path, sample_rows=1000)
    df, _, _ = _load_for_analysis(path, sample_rows=1000)
    r1 = auto_fix(df, findings)

    # Re-analyze the cleaned frame and run again.
    f2 = analyze(r1.cleaned_df)
    r2 = auto_fix(r1.cleaned_df, f2)
    assert r1.cleaned_bytes == r2.cleaned_bytes
# ---------------------------------------------------------------------------
# Output options (download helper) and gate_summary
# ---------------------------------------------------------------------------
class TestOutputOptions:
    """Contract of the Review page's ``_build_output_bytes`` download helper.

    The page module itself is not imported here (it is described as running
    Streamlit code at module load).  Instead ``_build`` below is a local copy
    of the helper's shape that serves as a compact spec, so a future refactor
    that moves the helper into core/io.py can keep the same contract.
    """

    @staticmethod
    def _build(df, *, encoding, delimiter, line_terminator):
        """Serialize ``df`` to CSV bytes.

        Returns ``(data, None)`` when the text encodes cleanly in
        ``encoding``, or ``(data, "lossy")`` when unencodable characters had
        to be substituted via ``errors="replace"``.
        """
        import io as _io

        buf = _io.StringIO()
        df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
        text = buf.getvalue()
        try:
            return text.encode(encoding), None
        except UnicodeEncodeError:
            return text.encode(encoding, errors="replace"), "lossy"

    def test_utf8_with_bom_starts_with_bom(self):
        df = pd.DataFrame({"x": ["a"]})
        data, _ = self._build(df, encoding="utf-8-sig", delimiter=",", line_terminator="\n")
        assert data.startswith(b"\xef\xbb\xbf")

    def test_crlf_line_terminator(self):
        df = pd.DataFrame({"x": ["a", "b"]})
        data, _ = self._build(df, encoding="utf-8", delimiter=",", line_terminator="\r\n")
        assert b"\r\n" in data
        # Once every CRLF pair is removed, no stray CR or LF may remain
        # anywhere in the output (not just in front of the last row).
        remainder = data.replace(b"\r\n", b"")
        assert b"\n" not in remainder
        assert b"\r" not in remainder

    def test_tab_delimiter(self):
        df = pd.DataFrame({"a": ["x"], "b": ["y"]})
        data, _ = self._build(df, encoding="utf-8", delimiter="\t", line_terminator="\n")
        assert data.startswith(b"a\tb\n")

    def test_cp1252_single_byte_accents(self):
        df = pd.DataFrame({"name": ["José"]})
        data, _ = self._build(df, encoding="cp1252", delimiter=",", line_terminator="\n")
        # 'é' is single byte 0xE9 in cp1252 (vs 0xC3 0xA9 in UTF-8)
        assert b"\xe9" in data
        assert b"\xc3\xa9" not in data

    def test_lossy_codepage_returns_warning(self):
        df = pd.DataFrame({"name": ["Иван"]})  # Cyrillic
        data, warn = self._build(df, encoding="cp1252", delimiter=",", line_terminator="\n")
        assert warn is not None
        assert b"?" in data  # replacement chars
class TestGateSummary:
    """``gate_summary`` exposes exactly the expected set of keys."""

    def test_summary_keys(self):
        frame = pd.DataFrame({"x": [" hi "]})
        outcome = auto_fix(frame, analyze(frame))
        summary = gate_summary(outcome)

        expected_keys = {
            "passed",
            "fixes_applied",
            "cells_changed",
            "skipped",
            "pending",
            "blocking",
        }
        assert set(summary.keys()) == expected_keys