feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.
Core (src/core/):
- analyze.py: Finding gains confidence, fix_action, pre_applied; new
detectors for encoding_uncertain, encoding_decode_failed; new top-
level encoding_override parameter.
- fixes.py: registry of fix algorithms keyed by fix_action id.
- normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
the NormalizationResult / Decision dataclasses the gate consumes.
- io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
and normalizes line endings (fixes bare-CR parser crash); an empty file
is now handled gracefully instead of surfacing an EmptyDataError traceback.
GUI (src/gui/):
- pages/0_Review.py: gate page with per-finding decision controls,
encoding override picker (16 codepages + custom), and Advanced output
options (encoding, delimiter, line terminator) on the download.
- components.py: require_normalization_gate() helper.
- pages/1-9: gate guard wired on every tool page.
Test corpora:
- test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
UTF-8 files + manifest, synced from Business/DataTools.
- test-cases/text-cleaner-corpus/test_data/17: synced malformed input
(unquoted $1,500.00) for the unquoted-delimiter detector.
Tests (94 new):
- test_normalize.py (48): finding fields, fix registry, auto_fix scope,
decision paths, gate idempotency, output-options helper.
- test_encodings_corpus.py (90, 16 xfailed): parametric detection +
decode + analyzer-no-crash sweep against the manifest.
- test_analyze.py: encoding override + encoding_uncertain detectors.
- test_corpus.py: pre-parse repair in the strict reader.
run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.
Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.
Suite: 765 passed, 17 xfailed (was 458 passed).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,16 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
|
||||
if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
|
||||
return "utf-16"
|
||||
|
||||
# Strict UTF-8 wins. charset_normalizer fingerprints small files
|
||||
# dominated by short non-ASCII sequences (e.g. zero-width characters
|
||||
# such as U+200B) as mac_latin2 / cp1250 / similar — but if the bytes
|
||||
# decode cleanly as UTF-8, that's the right answer regardless.
|
||||
try:
|
||||
raw.decode("utf-8")
|
||||
return "utf-8"
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
result = from_bytes(raw).best()
|
||||
if result is None:
|
||||
return "utf-8"
|
||||
@@ -416,6 +426,7 @@ def repair_bytes(
|
||||
fold_quotes: bool = True,
|
||||
strip_nul: bool = True,
|
||||
repair_delims: bool = True,
|
||||
normalize_line_endings: bool = True,
|
||||
) -> RepairResult:
|
||||
"""Pre-parse repair on a raw delimited file.
|
||||
|
||||
@@ -423,8 +434,11 @@ def repair_bytes(
|
||||
|
||||
1. Strip a leading UTF-8 BOM.
|
||||
2. Strip embedded NUL bytes (the C parser truncates fields at NUL).
|
||||
3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
|
||||
4. Per-row repair when one rogue delimiter is embedded in a field that
|
||||
3. Normalize line endings (CRLF and bare CR to LF). Bare CR confuses
|
||||
the C parser ("new-line character seen in unquoted field"); the
|
||||
text-cleaner contract also calls for LF inside multi-line cells.
|
||||
4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
|
||||
5. Per-row repair when one rogue delimiter is embedded in a field that
|
||||
looks like currency or thousands-grouped digits — quote that field.
|
||||
|
||||
Single curly quotes and other punctuation are deferred to the cell-level
|
||||
@@ -434,12 +448,41 @@ def repair_bytes(
|
||||
unrepairable: list[int] = []
|
||||
data = raw
|
||||
|
||||
# If the input is a UTF-16 / UTF-32 byte stream, transcode it to UTF-8
|
||||
# up front. UTF-16 ASCII codepoints carry NUL as half of every 16-bit
|
||||
# unit, so the byte-level NUL-strip below would shred the file. Doing
|
||||
# the transcode here means the rest of the repair pipeline operates
|
||||
# on UTF-8 bytes regardless of the source encoding.
|
||||
enc_norm = encoding.lower().replace("-", "_") if encoding else ""
|
||||
is_wide = enc_norm.startswith(("utf_16", "utf_32"))
|
||||
# UTF-16 LE without a BOM that survives detection lands here too.
|
||||
if is_wide:
|
||||
try:
|
||||
decoded = data.decode(encoding)
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
decoded = data.decode("utf-8", errors="replace")
|
||||
actions.append(RepairAction(
|
||||
kind="decode_replaced", line=None,
|
||||
detail=f"decode errors under {encoding}; replaced with U+FFFD",
|
||||
))
|
||||
# Strip a leading UTF-16 BOM (decoded as U+FEFF) if present.
|
||||
if decoded and decoded[0] == "\ufeff":
|
||||
decoded = decoded[1:]
|
||||
data = decoded.encode("utf-8")
|
||||
actions.append(RepairAction(
|
||||
kind="transcode_to_utf8", line=None,
|
||||
detail=f"transcoded {encoding} -> utf-8 ({len(raw)}B -> {len(data)}B)",
|
||||
))
|
||||
encoding = "utf-8" # downstream steps now operate on UTF-8
|
||||
|
||||
# 1. BOM
|
||||
if data.startswith(b"\xef\xbb\xbf"):
|
||||
data = data[3:]
|
||||
actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))
|
||||
|
||||
# 2. NUL
|
||||
# 2. NUL — only meaningful for single-byte / UTF-8 encodings. We've
|
||||
# already transcoded UTF-16/32 to UTF-8 above, so NUL here is genuine
|
||||
# corruption (truncated C strings, half-binary exports), not encoding.
|
||||
if strip_nul and b"\x00" in data:
|
||||
before = data.count(b"\x00")
|
||||
data = data.replace(b"\x00", b"")
|
||||
@@ -448,6 +491,26 @@ def repair_bytes(
|
||||
detail=f"removed {before} NUL byte(s)",
|
||||
))
|
||||
|
||||
# 3. Line endings: CRLF and bare CR -> LF. CRLF first so we don't
|
||||
# double-substitute. Done at the byte layer so it survives through
|
||||
# any subsequent decode failure.
|
||||
if normalize_line_endings and (b"\r" in data):
|
||||
n_crlf = data.count(b"\r\n")
|
||||
data = data.replace(b"\r\n", b"\n")
|
||||
n_cr = data.count(b"\r")
|
||||
if n_cr:
|
||||
data = data.replace(b"\r", b"\n")
|
||||
if n_crlf or n_cr:
|
||||
parts = []
|
||||
if n_crlf:
|
||||
parts.append(f"{n_crlf} CRLF")
|
||||
if n_cr:
|
||||
parts.append(f"{n_cr} bare CR")
|
||||
actions.append(RepairAction(
|
||||
kind="normalize_line_endings", line=None,
|
||||
detail=f"normalized {', '.join(parts)} to LF",
|
||||
))
|
||||
|
||||
# Decode for character-level work.
|
||||
try:
|
||||
text = data.decode(encoding)
|
||||
|
||||
Reference in New Issue
Block a user