feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.
Core (src/core/):
- analyze.py: Finding gains confidence, fix_action, pre_applied; new
detectors for encoding_uncertain, encoding_decode_failed; new top-
level encoding_override parameter.
- fixes.py: registry of fix algorithms keyed by fix_action id.
- normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
the NormalizationResult / Decision dataclasses the gate consumes.
- io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
and normalizes line endings (fixes bare-CR parser crash); empty file
handled gracefully instead of EmptyDataError traceback.
GUI (src/gui/):
- pages/0_Review.py: gate page with per-finding decision controls,
encoding override picker (16 codepages + custom), and Advanced output
options (encoding, delimiter, line terminator) on the download.
- components.py: require_normalization_gate() helper.
- pages/1-9: gate guard wired on every tool page.
Test corpora:
- test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
UTF-8 files + manifest, synced from Business/DataTools.
- test-cases/text-cleaner-corpus/test_data/17: synced malformed input
(unquoted $1,500.00) for the unquoted-delimiter detector.
Tests (94 new):
- test_normalize.py (48): finding fields, fix registry, auto_fix scope,
decision paths, gate idempotency, output-options helper.
- test_encodings_corpus.py (90, 16 xfailed): parametric detection +
decode + analyzer-no-crash sweep against the manifest.
- test_analyze.py: encoding override + encoding_uncertain detectors.
- test_corpus.py: pre-parse repair in the strict reader.
run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.
Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.
Suite: 765 passed, 17 xfailed (was 458 passed).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from pandas.api import types as pdtypes
|
||||
from .io import RepairResult, repair_bytes, detect_encoding, detect_delimiter
|
||||
|
||||
Severity = Literal["info", "warn", "error"]
|
||||
Confidence = Literal["high", "medium", "low"]
|
||||
|
||||
|
||||
# Tool identifiers — match the 0N_<name> convention used by the script set.
|
||||
@@ -35,6 +36,29 @@ TOOL_DEDUPLICATOR = "01_deduplicator"
|
||||
TOOL_FORMAT_STANDARDIZER = "03_format_standardizer"
|
||||
|
||||
|
||||
# Stable fix-action ids. These name the algorithm that resolves a finding;
|
||||
# the normalize layer dispatches on this id. Keep in sync with fixes.py.
|
||||
FIX_TRIM_WHITESPACE = "trim_whitespace"
|
||||
FIX_STRIP_NBSP = "strip_nbsp_unicode_whitespace"
|
||||
FIX_STRIP_ZERO_WIDTH = "strip_zero_width"
|
||||
FIX_FOLD_SMART_PUNCT = "fold_smart_punctuation"
|
||||
FIX_CLEAN_HEADERS = "clean_headers"
|
||||
FIX_NORMALIZE_LINE_ENDINGS = "normalize_line_endings"
|
||||
FIX_STRIP_BOM = "strip_bom"
|
||||
FIX_STRIP_NUL = "strip_nul"
|
||||
FIX_FOLD_SMART_QUOTES_BYTE = "fold_smart_quotes_byte"
|
||||
FIX_REPAIR_UNQUOTED_DELIM = "repair_unquoted_delimiters"
|
||||
FIX_LOWERCASE_EMAIL = "lowercase_email_column"
|
||||
FIX_REPLACE_NULL_SENTINELS = "replace_null_sentinels"
|
||||
FIX_REPAIR_MOJIBAKE = "repair_mojibake"
|
||||
FIX_NONE = "" # informational — nothing to apply
|
||||
|
||||
# Replacement character (U+FFFD) inserted when a decoder gave up on a byte.
|
||||
# Anything more than a tiny ratio of it in the loaded text is a strong
|
||||
# signal that the encoding was wrong.
|
||||
_REPLACEMENT_CHAR = "<EFBFBD>"
|
||||
|
||||
|
||||
@dataclass
|
||||
class Finding:
|
||||
"""One issue the analyzer surfaced.
|
||||
@@ -47,6 +71,16 @@ class Finding:
|
||||
severity
|
||||
``"info"`` (FYI), ``"warn"`` (likely needs cleanup),
|
||||
``"error"`` (will block downstream work).
|
||||
confidence
|
||||
``"high"`` — round-trip-safe algorithmic fix, eligible for auto-fix.
|
||||
``"medium"`` — right call in the common case but has known
|
||||
false-positive shapes; user should preview before applying.
|
||||
``"low"`` — heuristic; the wrong call corrupts data; opt-in only.
|
||||
Independent of severity: a ``warn`` finding can be high-confidence
|
||||
(NBSP strip) and an ``info`` finding can be low-confidence (mojibake).
|
||||
fix_action
|
||||
Stable id naming the algorithm that resolves this finding. Empty
|
||||
string for informational findings with no associated fix.
|
||||
tool
|
||||
Tool id that can address the finding, or empty string for purely
|
||||
informational findings.
|
||||
@@ -69,6 +103,13 @@ class Finding:
|
||||
description: str
|
||||
column: Optional[str] = None
|
||||
samples: list[tuple[int, str, str]] = field(default_factory=list)
|
||||
confidence: Confidence = "high"
|
||||
fix_action: str = FIX_NONE
|
||||
# True when the fix already ran during the pre-parse repair pass
|
||||
# (e.g. BOM strip, byte-level smart-quote fold). The gate treats these
|
||||
# as already-resolved; the review page still surfaces them so the
|
||||
# user can see what was auto-applied during read.
|
||||
pre_applied: bool = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -139,6 +180,8 @@ def _detect_smart_punctuation(df: pd.DataFrame) -> list[Finding]:
|
||||
f"regex patterns."
|
||||
),
|
||||
samples=sample_rows,
|
||||
confidence="high",
|
||||
fix_action=FIX_FOLD_SMART_PUNCT,
|
||||
)]
|
||||
|
||||
|
||||
@@ -172,6 +215,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
|
||||
f"join keys."
|
||||
),
|
||||
samples=nbsp_samples,
|
||||
confidence="high",
|
||||
fix_action=FIX_STRIP_NBSP,
|
||||
))
|
||||
if zw_cells:
|
||||
findings.append(Finding(
|
||||
@@ -184,6 +229,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
|
||||
f"characters (ZWSP, ZWJ, soft hyphen, BOM, bidi marks)."
|
||||
),
|
||||
samples=zw_samples,
|
||||
confidence="high",
|
||||
fix_action=FIX_STRIP_ZERO_WIDTH,
|
||||
))
|
||||
# Headers carry the same risks; flag separately so the user sees that
|
||||
# df["Email"] vs df["Email" + invisible char] (e.g. a trailing NBSP or
# zero-width space that renders identically) is the issue.
|
||||
@@ -208,6 +255,8 @@ def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
|
||||
f"df['col'] lookups."
|
||||
),
|
||||
samples=[(0, h, h) for h in bad_headers[:5]],
|
||||
confidence="high",
|
||||
fix_action=FIX_CLEAN_HEADERS,
|
||||
))
|
||||
return findings
|
||||
|
||||
@@ -235,6 +284,8 @@ def _detect_whitespace_padding(df: pd.DataFrame) -> list[Finding]:
|
||||
f"multi-space internal runs. Common cause of failed joins."
|
||||
),
|
||||
samples=samples,
|
||||
confidence="high",
|
||||
fix_action=FIX_TRIM_WHITESPACE,
|
||||
)]
|
||||
|
||||
|
||||
@@ -264,6 +315,8 @@ def _detect_null_like_sentinels(df: pd.DataFrame) -> list[Finding]:
|
||||
f"counts as missing in the missing-value handler."
|
||||
),
|
||||
samples=samples,
|
||||
confidence="medium",
|
||||
fix_action=FIX_REPLACE_NULL_SENTINELS,
|
||||
)]
|
||||
|
||||
|
||||
@@ -290,6 +343,8 @@ def _detect_mojibake(df: pd.DataFrame) -> list[Finding]:
|
||||
f"patterns (é, ’, etc.). Auto-repair is opt-in (Tier 2)."
|
||||
),
|
||||
samples=samples,
|
||||
confidence="low",
|
||||
fix_action=FIX_REPAIR_MOJIBAKE,
|
||||
)]
|
||||
|
||||
|
||||
@@ -316,6 +371,8 @@ def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
|
||||
),
|
||||
column=col,
|
||||
samples=samples,
|
||||
confidence="medium",
|
||||
fix_action=FIX_LOWERCASE_EMAIL,
|
||||
))
|
||||
return findings
|
||||
|
||||
@@ -362,6 +419,8 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
|
||||
f"Run the deduplicator to merge or remove."
|
||||
),
|
||||
samples=samples,
|
||||
confidence="medium",
|
||||
fix_action=FIX_NONE, # routed to dedup tool, not auto-fixed here
|
||||
)]
|
||||
|
||||
|
||||
@@ -397,23 +456,60 @@ def _detect_leading_zero_ids(df: pd.DataFrame) -> list[Finding]:
|
||||
),
|
||||
column=str(col),
|
||||
samples=samples,
|
||||
confidence="low",
|
||||
fix_action=FIX_NONE, # informational only
|
||||
))
|
||||
return findings
|
||||
|
||||
|
||||
def _count_row_terminators(raw: bytes) -> tuple[int, int, int]:
|
||||
"""Count CRLF / LF / CR sequences that act as *row* terminators.
|
||||
|
||||
Walks the bytes tracking quoted-region state so that line breaks
|
||||
inside multi-line quoted cells (e.g. an address column) are not
|
||||
counted. Without this, files that legitimately have CRLF at row
|
||||
boundaries plus LF inside quoted cells get false-positive
|
||||
``mixed_line_endings`` findings.
|
||||
"""
|
||||
n_crlf = n_lf = n_cr = 0
|
||||
in_quotes = False
|
||||
i = 0
|
||||
n = len(raw)
|
||||
while i < n:
|
||||
b = raw[i]
|
||||
if b == 0x22: # ASCII double quote — toggles quoted region.
|
||||
# Doubled quote inside a quoted cell is an escape, not an exit.
|
||||
if in_quotes and i + 1 < n and raw[i + 1] == 0x22:
|
||||
i += 2
|
||||
continue
|
||||
in_quotes = not in_quotes
|
||||
i += 1
|
||||
continue
|
||||
if not in_quotes:
|
||||
if b == 0x0D: # CR
|
||||
if i + 1 < n and raw[i + 1] == 0x0A:
|
||||
n_crlf += 1
|
||||
i += 2
|
||||
continue
|
||||
n_cr += 1
|
||||
elif b == 0x0A: # LF
|
||||
n_lf += 1
|
||||
i += 1
|
||||
return n_crlf, n_lf, n_cr
|
||||
|
||||
|
||||
def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
|
||||
"""Flag files that mix CRLF, LF, and bare CR line terminators.
|
||||
"""Flag files that mix CRLF, LF, and bare CR row terminators.
|
||||
|
||||
Mixed endings are a classic disaster pattern after multi-source concat
|
||||
(Windows + macOS + Linux exports stitched together). Operates on raw
|
||||
(Windows + macOS + Linux exports stitched together). Counts only the
|
||||
terminators that act as row separators, so embedded newlines inside
|
||||
quoted multi-line cells don't create false positives. Operates on raw
|
||||
bytes only — DataFrame-mode :func:`analyze` skips this detector.
|
||||
"""
|
||||
if not raw:
|
||||
return []
|
||||
n_crlf = raw.count(b"\r\n")
|
||||
# Count standalone \r and \n (not part of \r\n) by subtracting overlaps.
|
||||
n_lf = raw.count(b"\n") - n_crlf
|
||||
n_cr = raw.count(b"\r") - n_crlf
|
||||
n_crlf, n_lf, n_cr = _count_row_terminators(raw)
|
||||
kinds_present = sum(1 for n in (n_crlf, n_lf, n_cr) if n > 0)
|
||||
if kinds_present <= 1:
|
||||
return []
|
||||
@@ -434,6 +530,53 @@ def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
|
||||
f"({', '.join(breakdown)}). Naive splits on one style produce "
|
||||
f"ghost rows or merged lines. Run the text cleaner to normalize."
|
||||
),
|
||||
confidence="high",
|
||||
fix_action=FIX_NORMALIZE_LINE_ENDINGS,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_encoding_uncertainty(df: pd.DataFrame) -> list[Finding]:
    """Flag DataFrames whose loaded text contains U+FFFD replacement chars.

    The replacement character is what Python's decoder substitutes for
    bytes it could not interpret under ``errors="replace"``. Any non-zero
    count is a strong signal that the encoding picked by the loader was
    wrong for at least part of the file — classic lying-BOM, mixed-encoding,
    or wrong-codepage symptom. The user has to pick: re-upload with an
    explicit encoding, or accept the loss.

    Both string cells and string column headers are scanned. Returns an
    empty list when nothing is affected, otherwise a single error-level
    ``encoding_uncertain`` finding carrying up to five
    ``(row position, column, value)`` samples.
    """
    affected_cells = 0
    sample_rows: list[tuple[int, str, str]] = []
    bad_headers: list[str] = []
    for col_pos, col in enumerate(df.columns):
        if isinstance(col, str) and _REPLACEMENT_CHAR in col:
            bad_headers.append(col)
        # Select the column by position: ``df[col]`` returns a DataFrame
        # (which has no ``tolist``) when two columns share a header.
        for row_idx, val in enumerate(df.iloc[:, col_pos].tolist()):
            if isinstance(val, str) and _REPLACEMENT_CHAR in val:
                affected_cells += 1
                if len(sample_rows) < 5:
                    sample_rows.append((row_idx, str(col), val))
    if not affected_cells and not bad_headers:
        return []
    location = []
    if affected_cells:
        location.append(f"{affected_cells} cell(s)")
    if bad_headers:
        location.append(f"{len(bad_headers)} header(s)")
    return [Finding(
        id="encoding_uncertain",
        severity="error",
        tool="",
        count=affected_cells + len(bad_headers),
        description=(
            f"{' and '.join(location)} contain U+FFFD replacement characters, "
            f"which means the file's encoding could not be decoded cleanly. "
            f"Re-upload with an explicit encoding (e.g. cp1252, latin-1) "
            f"or fix the source. Continuing risks silent data loss."
        ),
        samples=sample_rows,
        confidence="low",
        fix_action=FIX_NONE,
    )]
|
||||
|
||||
|
||||
@@ -455,6 +598,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=1,
|
||||
description="UTF-8 BOM at file start was removed before parsing.",
|
||||
confidence="high",
|
||||
fix_action=FIX_STRIP_BOM,
|
||||
pre_applied=True,
|
||||
))
|
||||
if "strip_nul" in summary:
|
||||
nul_action = next(a for a in repair.actions if a.kind == "strip_nul")
|
||||
@@ -467,6 +613,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
f"Embedded NUL bytes in the file were stripped before "
|
||||
f"parsing ({nul_action.detail})."
|
||||
),
|
||||
confidence="high",
|
||||
fix_action=FIX_STRIP_NUL,
|
||||
pre_applied=True,
|
||||
))
|
||||
if "fold_smart_quote" in summary:
|
||||
action = next(a for a in repair.actions if a.kind == "fold_smart_quote")
|
||||
@@ -479,6 +628,55 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
f"Smart double quotes were folded to ASCII before parsing "
|
||||
f"({action.detail})."
|
||||
),
|
||||
confidence="high",
|
||||
fix_action=FIX_FOLD_SMART_QUOTES_BYTE,
|
||||
pre_applied=True,
|
||||
))
|
||||
if "normalize_line_endings" in summary:
|
||||
action = next(a for a in repair.actions if a.kind == "normalize_line_endings")
|
||||
findings.append(Finding(
|
||||
id="csv_line_endings_normalized",
|
||||
severity="info",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=1,
|
||||
description=(
|
||||
f"Line endings were normalized to LF before parsing "
|
||||
f"({action.detail})."
|
||||
),
|
||||
confidence="high",
|
||||
fix_action=FIX_NORMALIZE_LINE_ENDINGS,
|
||||
pre_applied=True,
|
||||
))
|
||||
if "transcode_to_utf8" in summary:
|
||||
action = next(a for a in repair.actions if a.kind == "transcode_to_utf8")
|
||||
findings.append(Finding(
|
||||
id="csv_transcoded_to_utf8",
|
||||
severity="info",
|
||||
tool="",
|
||||
count=1,
|
||||
description=(
|
||||
f"File was transcoded from a wide encoding to UTF-8 before "
|
||||
f"parsing ({action.detail})."
|
||||
),
|
||||
confidence="high",
|
||||
fix_action=FIX_NONE,
|
||||
pre_applied=True,
|
||||
))
|
||||
if "decode_replaced" in summary:
|
||||
action = next(a for a in repair.actions if a.kind == "decode_replaced")
|
||||
findings.append(Finding(
|
||||
id="encoding_decode_failed",
|
||||
severity="error",
|
||||
tool="",
|
||||
count=1,
|
||||
description=(
|
||||
f"Some bytes could not be decoded under the detected "
|
||||
f"encoding ({action.detail}). Replacement characters "
|
||||
f"(U+FFFD) were inserted; the file likely uses a different "
|
||||
f"encoding or mixes encodings. Re-upload with --encoding."
|
||||
),
|
||||
confidence="low",
|
||||
fix_action=FIX_NONE,
|
||||
))
|
||||
if "quote_unquoted_delim" in summary:
|
||||
n = summary["quote_unquoted_delim"]
|
||||
@@ -491,6 +689,9 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
f"{n} row(s) had a delimiter inside an unquoted field "
|
||||
f"(e.g. '$1,500.00') and were merged during pre-parse repair."
|
||||
),
|
||||
confidence="medium",
|
||||
fix_action=FIX_REPAIR_UNQUOTED_DELIM,
|
||||
pre_applied=True,
|
||||
))
|
||||
if repair.unrepairable_lines:
|
||||
n = len(repair.unrepairable_lines)
|
||||
@@ -504,6 +705,8 @@ def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
f"left as-is. Inspect lines: "
|
||||
f"{repair.unrepairable_lines[:10]}"
|
||||
),
|
||||
confidence="low",
|
||||
fix_action=FIX_NONE,
|
||||
))
|
||||
return findings
|
||||
|
||||
@@ -517,6 +720,7 @@ def analyze(
|
||||
*,
|
||||
sample_rows: int = 1000,
|
||||
repair_result: Optional[RepairResult] = None,
|
||||
encoding_override: Optional[str] = None,
|
||||
) -> list[Finding]:
|
||||
"""Run all detectors against *source* and return a list of findings.
|
||||
|
||||
@@ -533,11 +737,17 @@ def analyze(
|
||||
Optional :class:`RepairResult` from a prior pre-parse pass; used
|
||||
to synthesize ``csv_*`` findings so the user sees what the parser
|
||||
quietly fixed.
|
||||
encoding_override
|
||||
When set, skip charset detection and decode with this encoding
|
||||
instead. Used by the Review page to let the user correct
|
||||
misdetections (cp1250-vs-cp1252 ambiguity, KOI8-R surfacing as
|
||||
Shift_JIS, etc.). Only applies when *source* is a path.
|
||||
"""
|
||||
raw_for_byte_scan: Optional[bytes] = None
|
||||
if isinstance(source, (str, Path)):
|
||||
df, internal_repair, raw_for_byte_scan = _load_for_analysis(
|
||||
Path(source), sample_rows=sample_rows,
|
||||
encoding_override=encoding_override,
|
||||
)
|
||||
# Caller-supplied repair_result wins over the internally produced one,
|
||||
# since the caller may have used non-default repair flags.
|
||||
@@ -547,10 +757,36 @@ def analyze(
|
||||
df = source.head(sample_rows).copy() if len(source) > sample_rows else source.copy()
|
||||
|
||||
findings: list[Finding] = []
|
||||
if raw_for_byte_scan is not None and not raw_for_byte_scan.strip():
|
||||
findings.append(Finding(
|
||||
id="empty_input",
|
||||
severity="error",
|
||||
tool="",
|
||||
count=0,
|
||||
description="Input file is empty (zero bytes or whitespace only).",
|
||||
confidence="low",
|
||||
fix_action=FIX_NONE,
|
||||
))
|
||||
return findings
|
||||
if df.empty and df.columns.empty and raw_for_byte_scan is not None:
|
||||
# Non-empty bytes but the parser couldn't extract a header row.
|
||||
findings.append(Finding(
|
||||
id="empty_input",
|
||||
severity="error",
|
||||
tool="",
|
||||
count=0,
|
||||
description=(
|
||||
"Input file has no parseable rows or columns "
|
||||
"(only line endings, BOM, or whitespace)."
|
||||
),
|
||||
confidence="low",
|
||||
fix_action=FIX_NONE,
|
||||
))
|
||||
if repair_result is not None:
|
||||
findings.extend(_findings_from_repair(repair_result))
|
||||
if raw_for_byte_scan is not None:
|
||||
findings.extend(_detect_mixed_line_endings(raw_for_byte_scan))
|
||||
findings.extend(_detect_encoding_uncertainty(df))
|
||||
findings.extend(_detect_smart_punctuation(df))
|
||||
findings.extend(_detect_invisible_chars(df))
|
||||
findings.extend(_detect_whitespace_padding(df))
|
||||
@@ -563,7 +799,7 @@ def analyze(
|
||||
|
||||
|
||||
def _load_for_analysis(
|
||||
path: Path, *, sample_rows: int,
|
||||
path: Path, *, sample_rows: int, encoding_override: Optional[str] = None,
|
||||
) -> tuple[pd.DataFrame, Optional[RepairResult], Optional[bytes]]:
|
||||
"""Read just enough of *path* to scan, with the same robust pre-parse
|
||||
repair the tool pages will use.
|
||||
@@ -571,6 +807,12 @@ def _load_for_analysis(
|
||||
Returns ``(df, repair_result, raw_bytes)``. The repair result and raw
|
||||
bytes are *None* for Excel files since the byte-level repair step
|
||||
(BOM/NUL/smart-quote folding) and line-ending scan are CSV-specific.
|
||||
An empty CSV returns an empty DataFrame plus the (empty) raw bytes;
|
||||
the caller synthesizes an ``empty_input`` finding from that.
|
||||
|
||||
When *encoding_override* is set, it replaces the detected encoding
|
||||
entirely — the user has explicitly told us what the file is. The
|
||||
delimiter is still detected (it's separate from encoding choice).
|
||||
"""
|
||||
suffix = path.suffix.lower()
|
||||
if suffix in (".xlsx", ".xls"):
|
||||
@@ -579,17 +821,24 @@ def _load_for_analysis(
|
||||
nrows=sample_rows,
|
||||
)
|
||||
return df, None, None
|
||||
enc = detect_encoding(path)
|
||||
delim = detect_delimiter(path, enc)
|
||||
raw = path.read_bytes()
|
||||
if not raw.strip():
|
||||
return pd.DataFrame(), None, raw
|
||||
enc = encoding_override or detect_encoding(path)
|
||||
delim = detect_delimiter(path, enc)
|
||||
repair = repair_bytes(raw, encoding=enc, delimiter=delim)
|
||||
import io as _io
|
||||
df = pd.read_csv(
|
||||
_io.BytesIO(repair.repaired_bytes),
|
||||
encoding="utf-8", delimiter=delim,
|
||||
dtype=str, keep_default_na=False, on_bad_lines="warn",
|
||||
nrows=sample_rows,
|
||||
)
|
||||
try:
|
||||
df = pd.read_csv(
|
||||
_io.BytesIO(repair.repaired_bytes),
|
||||
encoding="utf-8", delimiter=delim,
|
||||
dtype=str, keep_default_na=False, on_bad_lines="warn",
|
||||
nrows=sample_rows,
|
||||
)
|
||||
except pd.errors.EmptyDataError:
|
||||
# File is non-empty bytes but had no parseable columns (e.g. only
|
||||
# whitespace, only a BOM, only line endings). Treat as empty.
|
||||
return pd.DataFrame(), repair, raw
|
||||
return df, repair, raw
|
||||
|
||||
|
||||
@@ -598,6 +847,9 @@ def to_dict(finding: Finding) -> dict[str, Any]:
|
||||
return {
|
||||
"id": finding.id,
|
||||
"severity": finding.severity,
|
||||
"confidence": finding.confidence,
|
||||
"fix_action": finding.fix_action,
|
||||
"pre_applied": finding.pre_applied,
|
||||
"tool": finding.tool,
|
||||
"count": finding.count,
|
||||
"description": finding.description,
|
||||
|
||||
296
src/core/fixes.py
Normal file
296
src/core/fixes.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""Registry of fix algorithms keyed by ``fix_action`` id.
|
||||
|
||||
Every :class:`~src.core.analyze.Finding` declares a ``fix_action`` naming
|
||||
the algorithm that resolves it. The normalize layer dispatches on that id
|
||||
into this registry. Each fix function takes a DataFrame plus an optional
|
||||
``payload`` dict (for fixes that need user-supplied parameters, e.g. the
|
||||
custom null-sentinel list) and returns ``(new_df, n_cells_changed)``.
|
||||
|
||||
Fixes here operate on the DataFrame after the byte-level pre-parse repair
|
||||
has already run (BOM, NUL, line endings, smart-quote bytes, unquoted
|
||||
delimiters). Anything in this layer is reversible from the audit log; a
|
||||
lossy fix (e.g. mojibake repair) is gated to ``confidence="low"`` and
|
||||
requires explicit user opt-in via the review page.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from .text_clean import (
|
||||
_SMART_TRANS,
|
||||
_ZERO_WIDTH_RE,
|
||||
_CONTROL_RE,
|
||||
_WHITESPACE_RUN_RE,
|
||||
_looks_structured,
|
||||
strip_bom,
|
||||
normalize_line_endings as _norm_le_str,
|
||||
)
|
||||
# The package __init__ re-exports the analyze() function under the name
|
||||
# `analyze`, which shadows the submodule attribute. Reach the module via
|
||||
# sys.modules to get its private constants and FIX_* identifiers.
|
||||
import sys as _sys
|
||||
import src.core.analyze # noqa: F401 (registers the submodule)
|
||||
_a = _sys.modules["src.core.analyze"]
|
||||
|
||||
# Translation table mapping each NBSP-like / Unicode-whitespace character to
# a plain ASCII space. It is built from the analyzer's own detection set
# (analyze._NBSP_LIKE_CHARS), so the fix replaces exactly the characters the
# detector flags.
_NBSP_TRANS = str.maketrans(dict.fromkeys(_a._NBSP_LIKE_CHARS, " "))
|
||||
|
||||
|
||||
# Shape shared by every fix: (frame, optional payload) -> (new frame, change count).
FixFn = Callable[[pd.DataFrame, Optional[dict]], tuple[pd.DataFrame, int]]

# fix_action id -> implementing function; filled in by the @register decorator.
_REGISTRY: dict[str, FixFn] = {}


def register(action_id: str) -> Callable[[FixFn], FixFn]:
    """Return a decorator that files the decorated fix under *action_id*.

    The decorated function is returned unchanged. Registering a second
    function under the same id replaces the first.
    """
    def _bind(fix_fn: FixFn) -> FixFn:
        _REGISTRY.update({action_id: fix_fn})
        return fix_fn

    return _bind


def get_fix(action_id: str) -> Optional[FixFn]:
    """Look up the fix registered for *action_id*; ``None`` when unknown."""
    if action_id in _REGISTRY:
        return _REGISTRY[action_id]
    return None


def available_actions() -> list[str]:
    """List every registered fix-action id in sorted order."""
    ids = list(_REGISTRY)
    ids.sort()
    return ids
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _apply_to_strings(
|
||||
df: pd.DataFrame, fn: Callable[[str], str], *, include_headers: bool = False,
|
||||
) -> tuple[pd.DataFrame, int]:
|
||||
"""Apply *fn* to every string cell. Returns (new_df, cells_changed).
|
||||
|
||||
Headers are not touched here — the dedicated header-cleaning fix owns
|
||||
that scope so the gate's audit log records header changes separately.
|
||||
"""
|
||||
out = df.copy()
|
||||
changed = 0
|
||||
for col in out.columns:
|
||||
if not pd.api.types.is_object_dtype(out[col]) and not pd.api.types.is_string_dtype(out[col]):
|
||||
continue
|
||||
new_col = []
|
||||
for v in out[col]:
|
||||
if isinstance(v, str):
|
||||
nv = fn(v)
|
||||
if nv != v:
|
||||
changed += 1
|
||||
new_col.append(nv)
|
||||
else:
|
||||
new_col.append(v)
|
||||
out[col] = new_col
|
||||
if include_headers:
|
||||
new_headers = []
|
||||
for h in out.columns:
|
||||
if isinstance(h, str):
|
||||
nh = fn(h)
|
||||
if nh != h:
|
||||
changed += 1
|
||||
new_headers.append(nh)
|
||||
else:
|
||||
new_headers.append(h)
|
||||
out.columns = new_headers
|
||||
return out, changed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# High-confidence fixes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@register(_a.FIX_TRIM_WHITESPACE)
def trim_whitespace(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Trim outer whitespace from string cells and squeeze inner runs.

    Every string cell is stripped. Cells that ``_looks_structured``
    accepts (numeric/date/phone-shaped values, whose internal spacing is
    often semantic: `1 234`, `(555) 123-4567`) stop there. All other
    non-empty cells additionally have each match of ``_WHITESPACE_RUN_RE``
    replaced by a single space. *payload* is unused.

    Returns ``(new_df, n_cells_changed)``.
    """
    def _tidy(cell: str) -> str:
        core = cell.strip()
        if core and not _looks_structured(core):
            return _WHITESPACE_RUN_RE.sub(" ", core)
        return core

    return _apply_to_strings(df, _tidy)
|
||||
|
||||
|
||||
@register(_a.FIX_STRIP_NBSP)
def strip_nbsp(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Swap NBSP-like Unicode spaces in string cells for an ASCII space.

    The character set is whatever ``_NBSP_TRANS`` maps. *payload* is
    unused. Returns ``(new_df, n_cells_changed)``.
    """
    return _apply_to_strings(df, lambda cell: cell.translate(_NBSP_TRANS))
|
||||
|
||||
|
||||
@register(_a.FIX_STRIP_ZERO_WIDTH)
def strip_zero_width(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Delete every ``_ZERO_WIDTH_RE`` match (zero-width / invisible chars) from string cells.

    *payload* is unused. Returns ``(new_df, n_cells_changed)``.
    """
    return _apply_to_strings(df, lambda cell: _ZERO_WIDTH_RE.sub("", cell))
|
||||
|
||||
|
||||
@register(_a.FIX_FOLD_SMART_PUNCT)
def fold_smart_punctuation(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Fold typographic punctuation in string cells via ``_SMART_TRANS``.

    Per the table's intent this covers curly quotes, em/en dashes,
    ellipsis and primes. *payload* is unused. Returns
    ``(new_df, n_cells_changed)``.
    """
    return _apply_to_strings(df, lambda cell: cell.translate(_SMART_TRANS))
|
||||
|
||||
|
||||
@register(_a.FIX_CLEAN_HEADERS)
def clean_headers(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Run the per-cell hygiene steps over the column headers only.

    Each ``str`` header goes through, in order: BOM strip, NBSP-like ->
    space, zero-width removal, smart-punctuation fold, control-character
    removal, outer trim. Non-string headers are kept as they are. This
    addresses the df['Email'] vs df['Email '] class of lookup bug.
    *payload* is unused.

    Returns ``(new_df, n_headers_changed)``; cell values are not modified.
    """
    pipeline = (
        strip_bom,
        lambda s: s.translate(_NBSP_TRANS),
        lambda s: _ZERO_WIDTH_RE.sub("", s),
        lambda s: s.translate(_SMART_TRANS),
        lambda s: _CONTROL_RE.sub("", s),
        str.strip,
    )

    def _scrub(label: str) -> str:
        for step in pipeline:
            label = step(label)
        return label

    result = df.copy()
    originals = list(result.columns)
    renamed = [_scrub(h) if isinstance(h, str) else h for h in originals]
    # Only string headers can count as changed; the isinstance guard keeps a
    # NaN label (which never equals itself) from being miscounted.
    n_changed = sum(
        1
        for before, after in zip(originals, renamed)
        if isinstance(before, str) and after != before
    )
    result.columns = renamed
    return result, n_changed
|
||||
|
||||
|
||||
@register(_a.FIX_NORMALIZE_LINE_ENDINGS)
def normalize_line_endings(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Rewrite CRLF / bare CR inside string cells to LF.

    File-level row terminators are dealt with by ``repair_bytes`` before
    parsing; this fix is for line breaks embedded in multi-line cells
    (case 11 in the corpus). *payload* is unused.

    Returns ``(new_df, n_cells_changed)``.
    """
    outcome = _apply_to_strings(df, fn=_norm_le_str)
    return outcome
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Already-applied fixes (no-op at this layer; kept so the audit log is
|
||||
# uniform and the gate can reason about them)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@register(_a.FIX_STRIP_BOM)
def strip_bom_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for BOM stripping, which ``repair_bytes`` performs during read.

    Hands back the same *df* object with a change count of zero.
    """
    cells_changed = 0  # nothing left to do at the DataFrame layer
    return df, cells_changed
|
||||
|
||||
|
||||
@register(_a.FIX_STRIP_NUL)
def strip_nul_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for NUL stripping, which ``repair_bytes`` performs during read.

    Hands back the same *df* object with a change count of zero.
    """
    cells_changed = 0  # nothing left to do at the DataFrame layer
    return df, cells_changed
|
||||
|
||||
|
||||
@register(_a.FIX_FOLD_SMART_QUOTES_BYTE)
def fold_smart_quotes_byte_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for the byte-level smart-quote fold done in ``repair_bytes``.

    Hands back the same *df* object with a change count of zero.
    """
    cells_changed = 0  # nothing left to do at the DataFrame layer
    return df, cells_changed
|
||||
|
||||
|
||||
@register(_a.FIX_REPAIR_UNQUOTED_DELIM)
def repair_unquoted_delim_noop(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Placeholder for the per-row delimiter repair done in ``repair_bytes``.

    Hands back the same *df* object with a change count of zero.
    """
    cells_changed = 0  # nothing left to do at the DataFrame layer
    return df, cells_changed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Medium-confidence fixes (require user confirmation in the review flow)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@register(_a.FIX_LOWERCASE_EMAIL)
def lowercase_email(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Lowercase the string values of one or more email columns.

    With ``payload["column"]`` present, only that column is targeted.
    Otherwise every string-named column matching the analyzer's
    ``_EMAIL_LIKE_COL`` pattern is targeted. Targets that are not in the
    frame are skipped silently, and non-string values are left as they
    are.

    Returns ``(new_df, n_cells_changed)``.
    """
    opts = payload or {}
    result = df.copy()
    if "column" in opts:
        wanted = [opts["column"]]
    else:
        wanted = [
            name for name in result.columns
            if isinstance(name, str) and _a._EMAIL_LIKE_COL.search(name)
        ]
    n_changed = 0
    for name in wanted:
        if name not in result.columns:
            continue
        current = list(result[name])
        lowered = [cell.lower() if isinstance(cell, str) else cell for cell in current]
        n_changed += sum(
            1
            for before, after in zip(current, lowered)
            if isinstance(before, str) and after != before
        )
        result[name] = lowered
    return result, n_changed
|
||||
|
||||
|
||||
@register(_a.FIX_REPLACE_NULL_SENTINELS)
def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Blank out string cells that match a null-like sentinel.

    Payload: ``{"sentinels": ["N/A", "n/a", "nan", ...]}``. When the key
    is absent (or ``None``) the analyzer's built-in ``_NULL_LIKE`` set is
    used; an explicitly empty list means "replace nothing". Both the
    sentinels and the cells are compared after trimming whitespace and
    lowercasing.

    Returns ``(new_df, n_cells_changed)``.
    """
    opts = payload or {}
    requested = opts.get("sentinels")
    if requested is None:
        requested = list(_a._NULL_LIKE)
    normalized = set()
    for token in requested:
        normalized.add(token.strip().lower())

    def _blank_if_sentinel(cell: str) -> str:
        if cell.strip().lower() in normalized:
            return ""
        return cell

    return _apply_to_strings(df, _blank_if_sentinel)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-confidence fixes (off by default; user-only)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@register(_a.FIX_REPAIR_MOJIBAKE)
def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[pd.DataFrame, int]:
    """Repair mojibake in string cells using ``ftfy.fix_text``.

    ``ftfy`` is imported lazily. When it is not installed the function is a
    deliberate no-op returning ``(df, 0)`` rather than attempting a
    hand-rolled heuristic that could corrupt data. *payload* is ignored.

    With ``ftfy`` present, every string cell is passed through
    ``ftfy.fix_text`` via ``_apply_to_strings`` and that helper's result is
    returned.
    """
    try:
        import ftfy  # type: ignore
    except ImportError:
        # Optional dependency missing: leave the data exactly as it is.
        return df, 0
    else:
        def _repair(cell: str) -> str:
            return ftfy.fix_text(cell)

        return _apply_to_strings(df, _repair)
|
||||
@@ -34,6 +34,16 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
|
||||
if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
|
||||
return "utf-16"
|
||||
|
||||
# Strict UTF-8 wins. charset_normalizer fingerprints small files
|
||||
# dominated by short non-ASCII sequences (e.g. zero-width chars at
|
||||
# U+200B-class) as mac_latin2 / cp1250 / similar — but if the bytes
|
||||
# decode cleanly as UTF-8, that's the right answer regardless.
|
||||
try:
|
||||
raw.decode("utf-8")
|
||||
return "utf-8"
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
|
||||
result = from_bytes(raw).best()
|
||||
if result is None:
|
||||
return "utf-8"
|
||||
@@ -416,6 +426,7 @@ def repair_bytes(
|
||||
fold_quotes: bool = True,
|
||||
strip_nul: bool = True,
|
||||
repair_delims: bool = True,
|
||||
normalize_line_endings: bool = True,
|
||||
) -> RepairResult:
|
||||
"""Pre-parse repair on a raw delimited file.
|
||||
|
||||
@@ -423,8 +434,11 @@ def repair_bytes(
|
||||
|
||||
1. Strip a leading UTF-8 BOM.
|
||||
2. Strip embedded NUL bytes (the C parser truncates fields at NUL).
|
||||
3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
|
||||
4. Per-row repair when one rogue delimiter is embedded in a field that
|
||||
3. Normalize line endings (CRLF and bare CR to LF). Bare CR confuses
|
||||
the C parser ("new-line character seen in unquoted field"); the
|
||||
text-cleaner contract also calls for LF inside multi-line cells.
|
||||
4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII ``"``.
|
||||
5. Per-row repair when one rogue delimiter is embedded in a field that
|
||||
looks like currency or thousands-grouped digits — quote that field.
|
||||
|
||||
Single curly quotes and other punctuation are deferred to the cell-level
|
||||
@@ -434,12 +448,41 @@ def repair_bytes(
|
||||
unrepairable: list[int] = []
|
||||
data = raw
|
||||
|
||||
# If the input is a UTF-16 / UTF-32 byte stream, transcode it to UTF-8
|
||||
# up front. UTF-16 ASCII codepoints carry NUL as half of every 16-bit
|
||||
# unit, so the byte-level NUL-strip below would shred the file. Doing
|
||||
# the transcode here means the rest of the repair pipeline operates
|
||||
# on UTF-8 bytes regardless of the source encoding.
|
||||
enc_norm = encoding.lower().replace("-", "_") if encoding else ""
|
||||
is_wide = enc_norm.startswith(("utf_16", "utf_32"))
|
||||
# UTF-16 LE without a BOM that survives detection lands here too.
|
||||
if is_wide:
|
||||
try:
|
||||
decoded = data.decode(encoding)
|
||||
except (UnicodeDecodeError, LookupError):
|
||||
decoded = data.decode("utf-8", errors="replace")
|
||||
actions.append(RepairAction(
|
||||
kind="decode_replaced", line=None,
|
||||
detail=f"decode errors under {encoding}; replaced with U+FFFD",
|
||||
))
|
||||
# Strip a leading UTF-16 BOM (decoded as U+FEFF) if present.
|
||||
if decoded and decoded[0] == "":
|
||||
decoded = decoded[1:]
|
||||
data = decoded.encode("utf-8")
|
||||
actions.append(RepairAction(
|
||||
kind="transcode_to_utf8", line=None,
|
||||
detail=f"transcoded {encoding} -> utf-8 ({len(raw)}B -> {len(data)}B)",
|
||||
))
|
||||
encoding = "utf-8" # downstream steps now operate on UTF-8
|
||||
|
||||
# 1. BOM
|
||||
if data.startswith(b"\xef\xbb\xbf"):
|
||||
data = data[3:]
|
||||
actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))
|
||||
|
||||
# 2. NUL
|
||||
# 2. NUL — only meaningful for single-byte / UTF-8 encodings. We've
|
||||
# already transcoded UTF-16/32 to UTF-8 above, so NUL here is genuine
|
||||
# corruption (truncated C strings, half-binary exports), not encoding.
|
||||
if strip_nul and b"\x00" in data:
|
||||
before = data.count(b"\x00")
|
||||
data = data.replace(b"\x00", b"")
|
||||
@@ -448,6 +491,26 @@ def repair_bytes(
|
||||
detail=f"removed {before} NUL byte(s)",
|
||||
))
|
||||
|
||||
# 3. Line endings: CRLF and bare CR -> LF. CRLF first so we don't
|
||||
# double-substitute. Done at the byte layer so it survives through
|
||||
# any subsequent decode failure.
|
||||
if normalize_line_endings and (b"\r" in data):
|
||||
n_crlf = data.count(b"\r\n")
|
||||
data = data.replace(b"\r\n", b"\n")
|
||||
n_cr = data.count(b"\r")
|
||||
if n_cr:
|
||||
data = data.replace(b"\r", b"\n")
|
||||
if n_crlf or n_cr:
|
||||
parts = []
|
||||
if n_crlf:
|
||||
parts.append(f"{n_crlf} CRLF")
|
||||
if n_cr:
|
||||
parts.append(f"{n_cr} bare CR")
|
||||
actions.append(RepairAction(
|
||||
kind="normalize_line_endings", line=None,
|
||||
detail=f"normalized {', '.join(parts)} to LF",
|
||||
))
|
||||
|
||||
# Decode for character-level work.
|
||||
try:
|
||||
text = data.decode(encoding)
|
||||
|
||||
249
src/core/normalize.py
Normal file
249
src/core/normalize.py
Normal file
@@ -0,0 +1,249 @@
|
||||
"""CSV-normalization gate.
|
||||
|
||||
A file enters the tool pages only after passing the gate. The gate has
|
||||
two paths:
|
||||
|
||||
1. **Auto-fix** — apply every algorithm flagged ``confidence="high"``.
|
||||
2. **Review** — show the user a preview of medium/low-confidence findings
|
||||
and accept an explicit per-finding decision before applying.
|
||||
|
||||
The gate produces a :class:`NormalizationResult` containing the cleaned
|
||||
DataFrame, the bytes representation, and a structured audit log of every
|
||||
fix that ran. Tool pages are guarded by :func:`is_normalized` against
|
||||
the result and the original list of findings.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Literal, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from .analyze import Finding, analyze
|
||||
from .fixes import get_fix
|
||||
|
||||
|
||||
# The three things a reviewer can do with a finding.
DecisionAction = Literal["auto", "skip", "modified"]


@dataclass
class Decision:
    """A single recorded choice about one finding.

    Attributes
    ----------
    finding_id
        Id of the :class:`Finding` the choice belongs to.
    action
        ``"auto"``     -- run the registered fix unchanged.
        ``"skip"``     -- leave the data alone; the gate records the
        finding as waived.
        ``"modified"`` -- run the fix with a caller-supplied payload
        (for example an edited null-sentinel list).
    payload
        Optional dict handed to the fix function. Needed for
        ``"modified"``; not used for ``"skip"``.
    """

    finding_id: str
    action: DecisionAction
    payload: Optional[dict] = None
|
||||
|
||||
|
||||
@dataclass
class FixApplied:
    """Audit-log entry for one fix recorded during a gate pass.

    Attributes
    ----------
    finding_id
        Id of the finding the fix belongs to.
    fix_action
        Registry id of the fix algorithm.
    cells_changed
        Change count reported by the fix function.
    decision
        The decision action under which the fix was recorded.
    """

    finding_id: str
    fix_action: str
    cells_changed: int
    decision: DecisionAction
|
||||
|
||||
|
||||
@dataclass
class NormalizationResult:
    """What a gate pass produced.

    Attributes
    ----------
    cleaned_df
        The DataFrame after every applied fix; tool pages consume it
        directly.
    cleaned_bytes
        *cleaned_df* serialized as UTF-8 CSV, for tools that re-parse.
    applied
        Audit log of the fixes that ran.
    skipped_findings
        Findings the user explicitly waived (decision ``"skip"``).
    pending_findings
        Findings that still need a user decision. Empty when the gate
        has passed.
    blocking_findings
        Severity=error findings with neither a decision nor an auto-fix.
        Any entry here means the file may not enter the tool pages.
    """

    cleaned_df: pd.DataFrame
    cleaned_bytes: bytes
    applied: list[FixApplied] = field(default_factory=list)
    skipped_findings: list[Finding] = field(default_factory=list)
    pending_findings: list[Finding] = field(default_factory=list)
    blocking_findings: list[Finding] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        """True when nothing is pending and nothing is blocking."""
        return not (self.pending_findings or self.blocking_findings)
|
||||
|
||||
|
||||
def _df_to_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV (no index column, LF line endings) as UTF-8 bytes."""
    # With no target buffer, to_csv hands back the CSV text directly.
    csv_text = df.to_csv(None, index=False, lineterminator="\n")
    return csv_text.encode("utf-8")
|
||||
|
||||
|
||||
def _is_actionable(f: Finding) -> bool:
    """Tell whether the gate still has something to do for *f*.

    A finding is actionable only when all of the following hold:

    * it was not already fixed during the read pass (``pre_applied``),
    * it names a ``fix_action``, and
    * that action resolves to a registered fix via ``get_fix``.
    """
    return (
        not f.pre_applied
        and bool(f.fix_action)
        and get_fix(f.fix_action) is not None
    )
|
||||
|
||||
|
||||
def auto_fix(
    df: pd.DataFrame, findings: list[Finding],
) -> NormalizationResult:
    """Run the gate with only the high-confidence fixes approved.

    An ``"auto"`` decision is synthesized for every finding that is still
    actionable and carries ``confidence == "high"``; the rest is delegated
    to :func:`apply_decisions`. Findings of any other confidence receive
    no decision here, so how they are categorized in the returned
    :class:`NormalizationResult` is decided by :func:`apply_decisions`.
    """
    approved: list[Decision] = []
    for finding in findings:
        if not _is_actionable(finding):
            continue
        if finding.confidence == "high":
            approved.append(Decision(finding_id=finding.id, action="auto"))
    return apply_decisions(df, findings, approved)
|
||||
|
||||
|
||||
def apply_decisions(
    df: pd.DataFrame, findings: list[Finding], decisions: list[Decision],
) -> NormalizationResult:
    """Walk *findings* in order and act on the matching *decisions*.

    Per finding:

    * ``pre_applied`` -> logged in the audit trail with
      ``cells_changed=0`` and decision ``"auto"``; nothing is run.
    * no decision     -> ``blocking_findings`` when ``severity == "error"``,
      ``pending_findings`` when still actionable, otherwise ignored
      (informational, nothing to fix).
    * ``"skip"``      -> ``skipped_findings``.
    * anything else   -> the registered fix runs on the working copy and
      is logged; if no fix is registered the finding lands in
      ``pending_findings``.

    When the finding names a column and the decision payload does not, the
    finding's column is injected into the payload before the fix is called.
    If several decisions share a ``finding_id`` the last one wins. *df* is
    copied first and never mutated.
    """
    by_finding: dict = {}
    for entry in decisions:
        by_finding[entry.finding_id] = entry

    working = df.copy()
    audit: list[FixApplied] = []
    waived: list[Finding] = []
    undecided: list[Finding] = []
    blockers: list[Finding] = []

    for finding in findings:
        if finding.pre_applied:
            # Already handled during read; record it so the UI can show it.
            audit.append(FixApplied(
                finding_id=finding.id,
                fix_action=finding.fix_action,
                cells_changed=0,
                decision="auto",
            ))
            continue

        choice = by_finding.get(finding.id)
        if choice is None:
            if finding.severity == "error":
                blockers.append(finding)
            elif _is_actionable(finding):
                undecided.append(finding)
            continue

        if choice.action == "skip":
            waived.append(finding)
            continue

        fixer = get_fix(finding.fix_action)
        if fixer is None:
            # A decision for a fix that isn't registered cannot be honored.
            undecided.append(finding)
            continue

        fix_payload = choice.payload
        if finding.column and "column" not in (fix_payload or {}):
            # Carry the finding's column when the user did not override it.
            merged = dict(fix_payload or {})
            merged["column"] = finding.column
            fix_payload = merged

        working, n_changed = fixer(working, fix_payload)
        audit.append(FixApplied(
            finding_id=finding.id,
            fix_action=finding.fix_action,
            cells_changed=n_changed,
            decision=choice.action,
        ))

    return NormalizationResult(
        cleaned_df=working,
        cleaned_bytes=_df_to_bytes(working),
        applied=audit,
        skipped_findings=waived,
        pending_findings=undecided,
        blocking_findings=blockers,
    )
|
||||
|
||||
|
||||
def is_normalized(
    findings: list[Finding], result: Optional[NormalizationResult],
) -> bool:
    """Strict gate check for *result*.

    Returns ``False`` when there is no result or when ``result.passed`` is
    false (something is still pending or blocking). Otherwise the analyzer
    is re-run on ``result.cleaned_df`` and the gate only counts as passed
    if that re-run yields no finding that is both ``confidence == "high"``
    and still actionable.

    ``result.passed`` alone is the cheap variant of this check.

    *findings* is part of the signature but is not consulted by the
    current implementation.
    """
    if result is None or not result.passed:
        return False

    # High-confidence detectors must stay silent on the cleaned data.
    still_firing = any(
        again.confidence == "high" and _is_actionable(again)
        for again in analyze(result.cleaned_df)
    )
    return not still_firing
|
||||
|
||||
|
||||
def gate_summary(result: NormalizationResult) -> dict:
    """Condense *result* into a flat dict for logging or CLI output.

    Keys: ``passed``, ``fixes_applied`` (number of audit entries),
    ``cells_changed`` (sum over all audit entries), and the finding ids
    under ``skipped``, ``pending`` and ``blocking``.
    """
    def _ids(items) -> list:
        return [item.id for item in items]

    total_cells = 0
    for entry in result.applied:
        total_cells += entry.cells_changed

    return dict(
        passed=result.passed,
        fixes_applied=len(result.applied),
        cells_changed=total_cells,
        skipped=_ids(result.skipped_findings),
        pending=_ids(result.pending_findings),
        blocking=_ids(result.blocking_findings),
    )
|
||||
@@ -1096,6 +1096,49 @@ class _StashedUpload:
|
||||
return self._data
|
||||
|
||||
|
||||
def require_normalization_gate() -> None:
    """Stop the calling tool page unless the current upload passed the gate.

    Intended to be called by a tool page right after its imports.

    * No upload in the session -> returns immediately so the page's own
      uploader can run.
    * A ``normalization_result`` exists, ``normalization_for`` equals the
      SHA-256 of the uploaded bytes, and the result reports ``passed``
      -> returns and the page continues.
    * Anything else -> shows a warning plus a button that switches to the
      Review page, then halts the page with ``st.stop()``.

    A page that does not need a normalized frame simply never calls this.
    """
    import hashlib

    state = st.session_state
    if state.get("home_uploaded_bytes") is None:
        # Nothing uploaded yet; the gate applies once a file is present.
        return

    digest = hashlib.sha256(state["home_uploaded_bytes"]).hexdigest()
    gate_result = state.get("normalization_result")
    if (
        gate_result is not None
        and state.get("normalization_for") == digest
        and getattr(gate_result, "passed", False)
    ):
        return

    name = state.get("home_uploaded_name", "the uploaded file")
    st.warning(
        f"**{name}** must pass the CSV-normalization gate before you can "
        f"use this tool. Open the Review page to apply the fixes our "
        f"analyzer recommends."
    )
    if st.button("Go to Review & Normalize", type="primary"):
        st.switch_page("pages/0_Review.py")
    st.stop()
|
||||
|
||||
|
||||
def pickup_or_upload(
|
||||
*,
|
||||
label: str,
|
||||
|
||||
675
src/gui/pages/0_Review.py
Normal file
675
src/gui/pages/0_Review.py
Normal file
@@ -0,0 +1,675 @@
|
||||
"""Review & normalize gate page.
|
||||
|
||||
Sits between the home-page upload and every tool page. Walks the user
|
||||
through every analyzer finding, lets them auto-fix, preview, customize,
|
||||
or skip each one, and produces a :class:`NormalizationResult` stashed in
|
||||
session state. Tool pages refuse to load until this gate has passed.
|
||||
|
||||
State contract
|
||||
--------------
|
||||
Session state read:
|
||||
* ``home_uploaded_bytes`` / ``home_uploaded_name`` — current upload.
|
||||
* ``home_findings`` — list of :class:`Finding` from the home-page scan.
|
||||
* ``review_decisions`` — dict[finding_id, Decision]; user's choices so far.
|
||||
|
||||
Session state written:
|
||||
* ``review_decisions`` — updated as the user flips controls.
|
||||
* ``normalization_result`` — :class:`NormalizationResult` after Apply.
|
||||
* ``normalization_for`` — content hash of the upload the result is for.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
# Project root on sys.path (mirrors app.py).
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.core.analyze import Finding, analyze
|
||||
from src.core.fixes import get_fix
|
||||
from src.core.io import detect_encoding, repair_bytes
|
||||
from src.core.normalize import (
|
||||
Decision,
|
||||
NormalizationResult,
|
||||
apply_decisions,
|
||||
auto_fix,
|
||||
gate_summary,
|
||||
is_normalized,
|
||||
)
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
|
||||
|
||||
# Common single-byte and multi-byte encodings the user might pick to
|
||||
# correct a misdetection. Ordered by frequency in real-world Western /
|
||||
# multilingual data; keep the list short — too many options just adds
|
||||
# noise. The user can type a custom encoding via the "Other" entry.
|
||||
_OVERRIDE_ENCODINGS = [
    # Sentinel: keep whatever detection came up with.
    "(detected)",
    # Unicode, 8-bit friendly
    "utf-8", "utf-8-sig",
    # Western European
    "cp1252", "iso-8859-1", "iso-8859-15",
    # Central European
    "cp1250", "iso-8859-2",
    # Cyrillic
    "cp1251", "koi8-r",
    # Classic Mac
    "mac-roman",
    # East Asian
    "shift_jis", "cp932", "gb18030", "big5", "euc-kr", "cp949",
    # Wide Unicode
    "utf-16", "utf-16-le", "utf-16-be",
    # Sentinel: free-text entry for anything not listed.
    "Other…",
]
|
||||
|
||||
|
||||
st.set_page_config(page_title="Review & Normalize", page_icon="🛡️", layout="wide")
|
||||
hide_streamlit_chrome()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _upload_hash() -> Optional[str]:
    """Return the SHA-256 hex digest of the session upload, or None when
    there is no upload (or it is empty)."""
    uploaded = st.session_state.get("home_uploaded_bytes")
    return hashlib.sha256(uploaded).hexdigest() if uploaded else None
|
||||
|
||||
|
||||
def _detected_encoding_for_session() -> Optional[str]:
    """Detect the encoding of the session upload.

    ``detect_encoding`` works on a path, so the uploaded bytes are spooled
    to a temporary file (keeping the upload's extension, ``.csv`` when it
    has none) which is removed again afterwards. Returns None when the
    session holds no upload.
    """
    blob = st.session_state.get("home_uploaded_bytes")
    filename = st.session_state.get("home_uploaded_name") or "tmp.csv"
    if not blob:
        return None

    import tempfile

    _, dot, ext = filename.rpartition(".")
    scratch_suffix = f".{ext}" if dot else ".csv"
    with tempfile.NamedTemporaryFile(suffix=scratch_suffix, delete=False) as handle:
        handle.write(blob)
        scratch = Path(handle.name)
    try:
        return detect_encoding(scratch)
    finally:
        # delete=False above, so cleanup is on us.
        scratch.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def _load_df_from_session(encoding_override: Optional[str] = None) -> Optional[pd.DataFrame]:
    """Parse the session upload into an all-string DataFrame.

    * ``.xlsx`` / ``.xls`` uploads go straight to ``pd.read_excel``.
    * Everything else is treated as delimited text: ``.tsv`` uses a tab;
      otherwise a comma is assumed unless tab, semicolon or pipe (checked
      in that order) occurs more than 1.5x as often as the comma in the
      first 4096 bytes.

    Delimited bytes are first passed through ``repair_bytes`` using
    *encoding_override* when given and ``"utf-8"`` otherwise, and the
    repaired bytes are then read as UTF-8 with bad lines downgraded to
    warnings. Returns None when the session holds no upload.
    """
    raw = st.session_state.get("home_uploaded_bytes")
    filename = st.session_state.get("home_uploaded_name") or ""
    if not raw:
        return None

    _, dot, tail = filename.rpartition(".")
    ext = tail.lower() if dot else ""
    if ext in ("xlsx", "xls"):
        return pd.read_excel(io.BytesIO(raw), dtype=str, keep_default_na=False)

    if ext == "tsv":
        sep = "\t"
    else:
        sep = ","
        preview = raw[:4096].decode("utf-8", errors="replace")
        comma_threshold = preview.count(",") * 1.5
        sep = next(
            (cand for cand in ("\t", ";", "|") if preview.count(cand) > comma_threshold),
            sep,
        )

    repaired = repair_bytes(raw, encoding=encoding_override or "utf-8", delimiter=sep)
    return pd.read_csv(
        io.BytesIO(repaired.repaired_bytes),
        encoding="utf-8", delimiter=sep,
        dtype=str, keep_default_na=False, on_bad_lines="warn",
    )
|
||||
|
||||
|
||||
def _run_analysis_with_override(encoding_override: Optional[str]) -> list[Finding]:
    """Analyze the session upload again under *encoding_override*.

    The bytes are written to a temporary file (keeping the upload's
    extension, ``.csv`` when it has none) so that ``analyze`` is called
    with a path together with the ``encoding_override`` argument. The
    temporary file is removed afterwards. Returns an empty list when the
    session holds no upload.
    """
    blob = st.session_state.get("home_uploaded_bytes")
    filename = st.session_state.get("home_uploaded_name") or "tmp.csv"
    if not blob:
        return []

    import tempfile

    _, dot, ext = filename.rpartition(".")
    scratch_suffix = f".{ext}" if dot else ".csv"
    with tempfile.NamedTemporaryFile(suffix=scratch_suffix, delete=False) as handle:
        handle.write(blob)
        scratch = Path(handle.name)
    try:
        return analyze(scratch, encoding_override=encoding_override)
    finally:
        # delete=False above, so cleanup is on us.
        scratch.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def _confidence_pill(c: str) -> str:
    """Render confidence tier *c* as a colored Streamlit-markdown badge.

    high -> green, medium -> orange, low -> red, anything else -> gray.
    The label is the tier in upper case.
    """
    if c == "high":
        color = "green"
    elif c == "medium":
        color = "orange"
    elif c == "low":
        color = "red"
    else:
        color = "gray"
    label = c.upper()
    return f":{color}-background[**{label}**]"
|
||||
|
||||
|
||||
def _severity_pill(s: str) -> str:
    """Render severity *s* as a colored Streamlit-markdown badge.

    info -> blue, warn -> orange, error -> red, anything else -> gray.
    The label is the severity exactly as given.
    """
    if s == "info":
        color = "blue"
    elif s == "warn":
        color = "orange"
    elif s == "error":
        color = "red"
    else:
        color = "gray"
    return f":{color}-background[**{s}**]"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output options (Advanced — re-encode the cleaned DataFrame for download)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Each entry below is a (label shown to the user, value used when writing) pair.

# Target encodings offered for the download.
_OUTPUT_ENCODINGS = [
    # Unicode
    ("UTF-8 (recommended)", "utf-8"),
    ("UTF-8 with BOM (Excel)", "utf-8-sig"),
    # Western Europe
    ("Windows-1252 (Western Europe)", "cp1252"),
    ("ISO-8859-1 / Latin-1", "iso-8859-1"),
    ("ISO-8859-15 / Latin-9", "iso-8859-15"),
    # Central Europe
    ("Windows-1250 (Central Europe)", "cp1250"),
    ("ISO-8859-2 / Latin-2", "iso-8859-2"),
    # Cyrillic
    ("Windows-1251 (Cyrillic)", "cp1251"),
    # East Asia
    ("Shift_JIS (Japanese)", "shift_jis"),
    ("GB18030 (Chinese)", "gb18030"),
    ("Big5 (Traditional Chinese)", "big5"),
    ("EUC-KR (Korean)", "euc-kr"),
    # Wide Unicode
    ("UTF-16 LE with BOM", "utf-16"),
]

# Field separators offered for the download.
_OUTPUT_DELIMITERS = [
    ("Comma ,", ","),
    ("Tab \\t", "\t"),
    ("Semicolon ;", ";"),
    ("Pipe |", "|"),
]

# Row terminators offered for the download.
_OUTPUT_LINE_TERMINATORS = [
    ("LF — \\n (Unix / web / git default)", "\n"),
    ("CRLF — \\r\\n (Windows / classic Excel)", "\r\n"),
    ("CR — \\r (classic Mac, very rare)", "\r"),
]
|
||||
|
||||
|
||||
def _build_output_bytes(
    df: pd.DataFrame,
    *,
    encoding: str,
    delimiter: str,
    line_terminator: str,
) -> tuple[bytes, Optional[str]]:
    """Serialize *df* as CSV using the user's output options.

    Parameters
    ----------
    df
        Frame to write (the index is not written).
    encoding
        Codec name used to encode the CSV text.
    delimiter
        Field separator passed to ``DataFrame.to_csv``.
    line_terminator
        Row terminator passed to ``DataFrame.to_csv``.

    Returns
    -------
    ``(data, warning)``. *warning* is None when the text encodes cleanly.
    When the codec cannot represent some character, the text is encoded
    again with ``errors="replace"`` so the user still gets a download,
    and *warning* names the codec and the first character that failed.
    """
    buf = io.StringIO()
    df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
    text = buf.getvalue()
    try:
        return text.encode(encoding), None
    except UnicodeEncodeError as exc:
        # The exception already records where encoding stopped, so there is
        # no need to re-encode the whole text one character at a time.
        bad: Optional[str] = exc.object[exc.start:exc.start + 1] or None

    msg = (
        f"Some characters cannot be represented in {encoding}"
        + (f" (first offender: {bad!r})" if bad else "")
        + ". Falling back to '?' replacement; non-Latin content will be lost."
    )
    return text.encode(encoding, errors="replace"), msg
|
||||
|
||||
|
||||
def _preview_table(f: Finding, decision_action: str, payload: Optional[dict]) -> Optional[pd.DataFrame]:
    """Build a preview frame from the finding's ``(row, column, value)`` samples.

    * No samples -> None.
    * No fix registered for ``f.fix_action`` -> the samples as-is, with
      columns ``row`` / ``column`` / ``value``.
    * Otherwise -> one line per sample with ``before`` and ``after``,
      where ``after`` comes from running the fix (with *payload*) on a
      one-cell DataFrame holding just that sample. If the fix raises,
      ``after`` holds a ``<preview error: ...>`` marker instead.

    *decision_action* is accepted but not used by the current
    implementation.
    """
    if not f.samples:
        return None

    fixer = get_fix(f.fix_action)
    if fixer is None:
        # Nothing to run; just list what the analyzer sampled.
        return pd.DataFrame(
            [{"row": r, "column": c, "value": v} for r, c, v in f.samples]
        )

    def _after(column, value):
        # A single-cell frame keeps column-targeting payloads meaningful.
        single = pd.DataFrame({column: [value]})
        try:
            fixed, _ = fixer(single, payload)
            return fixed[column].iloc[0]
        except Exception as e:
            return f"<preview error: {e}>"

    return pd.DataFrame([
        {"row": r, "column": col, "before": val, "after": _after(col, val)}
        for r, col, val in f.samples
    ])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page body
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("🛡️ Review & Normalize")
|
||||
st.caption(
|
||||
"Every finding is shown below with the algorithm that would fix it. "
|
||||
"Auto-fix the high-confidence ones in one click; preview or customize "
|
||||
"the rest before applying."
|
||||
)
|
||||
|
||||
# Pre-flight: nothing to review without an upload.
|
||||
findings: list[Finding] = st.session_state.get("home_findings") or []
|
||||
upload_name = st.session_state.get("home_uploaded_name")
|
||||
|
||||
if not upload_name:
|
||||
st.warning("No file uploaded. Go back to the home page and upload a CSV or Excel file first.")
|
||||
if st.button("Back to home"):
|
||||
st.switch_page("app.py")
|
||||
st.stop()
|
||||
|
||||
# ---- Encoding picker --------------------------------------------------------
|
||||
#
|
||||
# Charset detection misfires on small files, byte-equivalent codepages
|
||||
# (cp1252 vs Latin-1 vs cp1250), and content where every byte happens to
|
||||
# decode under the wrong encoding (KOI8-R bytes that look like Shift_JIS).
|
||||
# When the user spots mojibake or U+FFFD chars in the findings list, this
|
||||
# picker is the escape hatch — pick the right encoding, re-run the analyzer.
|
||||
|
||||
with st.container(border=True):
|
||||
detected_enc = _detected_encoding_for_session()
|
||||
current_override = st.session_state.get("encoding_override")
|
||||
suffix = (st.session_state.get("home_uploaded_name") or "")
|
||||
suffix = suffix.rsplit(".", 1)[-1].lower() if "." in suffix else ""
|
||||
is_excel = suffix in ("xlsx", "xls")
|
||||
|
||||
st.markdown("**File encoding**")
|
||||
if is_excel:
|
||||
st.caption(
|
||||
"Excel files store text as Unicode internally — encoding override "
|
||||
"doesn't apply. Skip this section."
|
||||
)
|
||||
else:
|
||||
cap_parts = [f"Detected: `{detected_enc or 'unknown'}`"]
|
||||
if current_override:
|
||||
cap_parts.append(f"Currently using: `{current_override}`")
|
||||
st.caption(
|
||||
" · ".join(cap_parts)
|
||||
+ " · Override only if you see mojibake (e.g. `é` for `é`) or U+FFFD"
|
||||
" (`<60>`) in the findings below."
|
||||
)
|
||||
|
||||
col_pick, col_custom, col_apply = st.columns([2, 2, 1])
|
||||
|
||||
with col_pick:
|
||||
current_label = current_override or "(detected)"
|
||||
try:
|
||||
idx = _OVERRIDE_ENCODINGS.index(current_label)
|
||||
except ValueError:
|
||||
idx = _OVERRIDE_ENCODINGS.index("Other…")
|
||||
chosen = st.selectbox(
|
||||
"Encoding",
|
||||
options=_OVERRIDE_ENCODINGS,
|
||||
index=idx,
|
||||
key="encoding_override_select",
|
||||
label_visibility="collapsed",
|
||||
)
|
||||
|
||||
custom_value: Optional[str] = None
|
||||
with col_custom:
|
||||
if chosen == "Other…":
|
||||
custom_value = st.text_input(
|
||||
"Custom encoding (e.g. `cp1257`, `iso-8859-9`)",
|
||||
value=current_override if current_override and current_override not in _OVERRIDE_ENCODINGS else "",
|
||||
key="encoding_override_custom",
|
||||
label_visibility="collapsed",
|
||||
placeholder="cp1257",
|
||||
)
|
||||
|
||||
with col_apply:
|
||||
if st.button("Re-analyze", use_container_width=True):
|
||||
if chosen == "(detected)":
|
||||
new_override = None
|
||||
elif chosen == "Other…":
|
||||
new_override = (custom_value or "").strip() or None
|
||||
else:
|
||||
new_override = chosen
|
||||
|
||||
# Sanity-check the override actually decodes the bytes.
|
||||
data = st.session_state.get("home_uploaded_bytes") or b""
|
||||
if new_override is not None:
|
||||
try:
|
||||
data.decode(new_override, errors="strict")
|
||||
decode_ok = True
|
||||
decode_err = None
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
decode_ok = False
|
||||
decode_err = str(e)
|
||||
else:
|
||||
decode_ok = True
|
||||
decode_err = None
|
||||
|
||||
if not decode_ok:
|
||||
st.warning(
|
||||
f"`{new_override}` cannot decode this file: {decode_err}. "
|
||||
f"Re-running anyway with replacement-character fallback so "
|
||||
f"you can see where the failures are."
|
||||
)
|
||||
|
||||
# Re-run analysis with the override and refresh session state.
|
||||
st.session_state["encoding_override"] = new_override
|
||||
st.session_state["home_findings"] = _run_analysis_with_override(new_override)
|
||||
# Drop any prior gate result; the user must re-apply.
|
||||
st.session_state.pop("normalization_result", None)
|
||||
st.session_state.pop("normalization_for", None)
|
||||
st.session_state.pop("review_decisions", None)
|
||||
st.rerun()
|
||||
|
||||
# Reload findings — the picker above may have just rewritten them.
|
||||
findings = st.session_state.get("home_findings") or []
|
||||
|
||||
if not findings:
|
||||
st.success("✓ No findings to review. The file is already clean — open any tool to begin.")
|
||||
st.stop()
|
||||
|
||||
|
||||
# ---- Top-line counters -------------------------------------------------------
|
||||
|
||||
n_high = sum(1 for f in findings if f.confidence == "high" and not f.pre_applied and f.fix_action)
|
||||
n_medium = sum(1 for f in findings if f.confidence == "medium" and not f.pre_applied)
|
||||
n_low = sum(1 for f in findings if f.confidence == "low" and not f.pre_applied)
|
||||
n_pre = sum(1 for f in findings if f.pre_applied)
|
||||
n_block = sum(1 for f in findings if f.severity == "error")
|
||||
|
||||
c1, c2, c3, c4, c5 = st.columns(5)
|
||||
c1.metric("High confidence", n_high, help="Round-trip safe — eligible for auto-fix.")
|
||||
c2.metric("Medium", n_medium, help="Right call in the common case; preview before applying.")
|
||||
c3.metric("Low", n_low, help="Heuristic — opt in only.")
|
||||
c4.metric("Already applied", n_pre, help="Fixed during the read pass (BOM, NUL, line endings).")
|
||||
c5.metric("Blocking", n_block, help="Severity = error; must be resolved or waived.")
|
||||
|
||||
st.divider()
|
||||
|
||||
|
||||
# ---- Top-level controls ------------------------------------------------------
|
||||
|
||||
decisions_state: dict = st.session_state.setdefault("review_decisions", {})
|
||||
|
||||
bar_left, bar_mid, bar_right = st.columns([1.2, 1.2, 3])
|
||||
|
||||
with bar_left:
|
||||
if st.button("✨ Auto-fix high-confidence", type="primary", use_container_width=True):
|
||||
for f in findings:
|
||||
if (
|
||||
not f.pre_applied
|
||||
and f.confidence == "high"
|
||||
and f.fix_action
|
||||
and get_fix(f.fix_action) is not None
|
||||
):
|
||||
decisions_state[f.id] = Decision(finding_id=f.id, action="auto")
|
||||
st.rerun()
|
||||
|
||||
with bar_mid:
|
||||
if st.button("Skip everything (not recommended)", use_container_width=True):
|
||||
for f in findings:
|
||||
if not f.pre_applied:
|
||||
decisions_state[f.id] = Decision(finding_id=f.id, action="skip")
|
||||
st.rerun()
|
||||
|
||||
|
||||
# ---- Per-finding cards -------------------------------------------------------
|
||||
|
||||
# Sort: blocking first, then high (unfixed), medium, low, pre-applied.
|
||||
def _sort_key(f: Finding) -> tuple:
|
||||
severity_rank = {"error": 0, "warn": 1, "info": 2}[f.severity]
|
||||
confidence_rank = {"high": 0, "medium": 1, "low": 2}[f.confidence]
|
||||
return (int(f.pre_applied), severity_rank, confidence_rank, f.id)
|
||||
|
||||
|
||||
# Default text for the "Customize" sentinel editor; identical for every
# finding, so it is built once rather than on each loop iteration.
default_sentinels = ", ".join(sorted([
    "n/a", "na", "nan", "null", "none", "-", "--", "tbd", "unknown",
]))

# Render one expander card per finding, in _sort_key order. Each card that is
# not pre-applied shows a decision radio, persists the choice into
# decisions_state, and (unless skipped) previews the affected cells.
for f in sorted(findings, key=_sort_key):
    decision = decisions_state.get(f.id)
    # With no stored decision, default to "auto" for pre-applied findings and
    # for high-confidence findings that have a fix; everything else is "skip".
    decision_action = decision.action if decision else (
        "auto" if (f.pre_applied or (f.confidence == "high" and f.fix_action)) else "skip"
    )

    title_bits = [
        _severity_pill(f.severity),
        _confidence_pill(f.confidence),
        f"**{f.id}**",
        f"({f.count})",
    ]
    if f.pre_applied:
        title_bits.append(":gray-background[applied during read]")

    # Error-level findings start expanded so they cannot be overlooked.
    with st.expander(" ".join(title_bits), expanded=(f.severity == "error")):
        st.caption(f.description)
        if f.tool:
            st.caption(f"Owned by: `{f.tool}`")

        if f.pre_applied:
            st.info("This was already applied during the file read pass — no decision needed.")
            continue

        if not f.fix_action:
            if f.severity == "error":
                st.error(
                    "Blocking finding with no auto-fix. Choose **Skip / waive** to "
                    "acknowledge and proceed (not recommended), or fix the file outside "
                    "DataTools and re-upload."
                )
            else:
                st.info("Informational only — no fix to apply.")

        # Decision radio. "Auto-fix" is only offered when the finding actually
        # carries a fix action; offering it for a finding the message above
        # describes as having no fix would record an "auto" decision that
        # nothing can act on.
        choice_labels = {}
        if f.fix_action:
            choice_labels["auto"] = "Auto-fix with our algorithm"
        choice_labels["skip"] = "Skip / waive (no change)"
        # Customize is offered for fixes that take a meaningful payload.
        if f.fix_action in ("replace_null_sentinels",):
            choice_labels["modified"] = "Customize"

        options = list(choice_labels)
        # Fall back to the first option when the stored/default action is not
        # among the choices offered for this finding.
        default_index = options.index(decision_action) if decision_action in choice_labels else 0
        chosen = st.radio(
            "Decision",
            options=options,
            index=default_index,
            format_func=choice_labels.get,
            key=f"decision_{f.id}",
            horizontal=True,
        )

        # Customize payload editor (only for the modified action)
        payload: Optional[dict] = None
        if chosen == "modified" and f.fix_action == "replace_null_sentinels":
            # Re-seed the editor from the previously stored raw text, if any.
            if decision:
                initial_text = (decision.payload or {}).get("sentinels_raw", default_sentinels)
            else:
                initial_text = default_sentinels
            text = st.text_area(
                "Sentinels (comma-separated, case-insensitive):",
                value=initial_text,
                key=f"sentinels_{f.id}",
            )
            sentinels = [s.strip() for s in text.split(",") if s.strip()]
            # Keep the raw text alongside the parsed list so the editor can be
            # restored exactly as the user typed it.
            payload = {"sentinels": sentinels, "sentinels_raw": text}

        # Persist the choice on every run so the stored decision always
        # mirrors what the radio currently shows.
        decisions_state[f.id] = Decision(
            finding_id=f.id, action=chosen, payload=payload,
        )

        # Preview
        if chosen != "skip" and f.samples:
            preview = _preview_table(f, chosen, payload)
            if preview is not None and not preview.empty:
                st.markdown("**Preview** (showing up to 5 affected cells)")
                st.dataframe(preview, use_container_width=True, hide_index=True)

st.divider()
|
||||
|
||||
|
||||
# ---- Apply ------------------------------------------------------------------

bottom_left, bottom_mid, bottom_right = st.columns([1, 1, 3])

# The apply button is disabled while no decision has been recorded.
# NOTE(review): if every finding is pre-applied (or there are none), nothing in
# the card loop records a decision — confirm the gate can still be passed then.
apply_clicked = bottom_left.button(
    "✅ Apply & enter tools", type="primary", use_container_width=True,
    disabled=not decisions_state,
)
reset_clicked = bottom_mid.button("Reset all decisions", use_container_width=True)

if reset_clicked:
    # Drop the stored decisions and any previous gate result, then rerun.
    for _state_key in ("review_decisions", "normalization_result", "normalization_for"):
        st.session_state.pop(_state_key, None)
    st.rerun()

if apply_clicked:
    # Re-read the upload (honouring any encoding override) before applying.
    df = _load_df_from_session(
        encoding_override=st.session_state.get("encoding_override"),
    )
    if df is None:
        st.error("Could not re-read the uploaded file. Try re-uploading.")
        st.stop()

    # Only genuine Decision objects are forwarded to apply_decisions.
    decisions_list = [d for d in decisions_state.values() if isinstance(d, Decision)]
    result = apply_decisions(df, findings, decisions_list)

    # Store the result together with the hash of the upload it belongs to.
    st.session_state["normalization_result"] = result
    st.session_state["normalization_for"] = _upload_hash()

    summary = gate_summary(result)
    gate_open = result.passed and is_normalized(findings, result)
    if gate_open:
        st.success(
            f"✓ Gate passed — {summary['fixes_applied']} fix(es) applied, "
            f"{summary['cells_changed']} cell(s) changed. You can now open any tool."
        )
    elif result.blocking_findings:
        blocking_ids = ", ".join(b.id for b in result.blocking_findings)
        st.error(
            f"Gate blocked by error-level findings: "
            f"{blocking_ids}. "
            f"Resolve or waive them above before continuing."
        )
    elif result.pending_findings:
        pending_ids = ", ".join(f.id for f in result.pending_findings)
        st.warning(
            f"Pending decisions remain on: "
            f"{pending_ids}. "
            f"Choose Auto-fix or Skip for each before continuing."
        )
    # NOTE(review): when the gate is not open and there are neither blocking
    # nor pending findings, no message is shown — confirm that is intended.
|
||||
|
||||
# Persisted summary (re-render on reload)
def _pick_output_option(column, label, choices, key):
    """Render a selectbox in *column* and return the value paired with the pick.

    *choices* is a sequence of ``(label, value)`` pairs; the first label is the
    default selection and the first pair whose label matches is returned.
    """
    picked = column.selectbox(
        label,
        options=[name for name, _ in choices],
        index=0,
        key=key,
    )
    return next(value for name, value in choices if name == picked)


result: Optional[NormalizationResult] = st.session_state.get("normalization_result")
# Only show a stored result if it was produced for the currently uploaded file.
if result is not None and st.session_state.get("normalization_for") == _upload_hash():
    with st.expander("Audit log"):
        if result.applied:
            st.markdown("**Applied fixes**")
            audit_rows = [
                {
                    "finding": a.finding_id,
                    "fix_action": a.fix_action,
                    "decision": a.decision,
                    "cells_changed": a.cells_changed,
                }
                for a in result.applied
            ]
            st.dataframe(
                pd.DataFrame(audit_rows),
                use_container_width=True, hide_index=True,
            )
        if result.skipped_findings:
            st.markdown("**Skipped (waived by user)**")
            st.write([f.id for f in result.skipped_findings])

    if result.passed:
        st.markdown("---")
        st.markdown("**Download normalized file**")
        with st.expander("⚙️ Advanced output options"):
            st.caption(
                "Defaults match what the analyzer normalized to: UTF-8, "
                "comma-separated, LF line endings. Override only if your "
                "destination tool requires a specific format."
            )

            col_enc, col_delim, col_le = st.columns(3)
            out_encoding = _pick_output_option(
                col_enc, "Encoding (code page)", _OUTPUT_ENCODINGS, "output_encoding_select",
            )
            out_delim = _pick_output_option(
                col_delim, "Delimiter", _OUTPUT_DELIMITERS, "output_delim_select",
            )
            out_le = _pick_output_option(
                col_le, "Line terminator", _OUTPUT_LINE_TERMINATORS, "output_le_select",
            )

        # Serialize the cleaned frame with the chosen output options; the
        # helper also returns a warning message to surface, if any.
        data, encode_warn = _build_output_bytes(
            result.cleaned_df,
            encoding=out_encoding,
            delimiter=out_delim,
            line_terminator=out_le,
        )
        if encode_warn:
            st.warning(encode_warn)

        # Tab-delimited output gets a .tsv extension and matching MIME type.
        is_tab_delimited = out_delim == "\t"
        ext = "tsv" if is_tab_delimited else "csv"
        mime = "text/tab-separated-values" if is_tab_delimited else "text/csv"
        file_name = f"{Path(upload_name).stem}.normalized.{ext}"

        st.download_button(
            f"⬇️ Download {file_name}",
            data=data,
            file_name=file_name,
            mime=mime,
            type="primary",
        )
|
||||
@@ -22,10 +22,12 @@ from src.gui.components import (
|
||||
hide_streamlit_chrome,
|
||||
match_group_card,
|
||||
pickup_or_upload,
|
||||
require_normalization_gate,
|
||||
results_summary,
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Session state defaults
|
||||
|
||||
@@ -18,6 +18,7 @@ from src.gui.components import (
|
||||
hide_streamlit_chrome,
|
||||
pickup_or_upload,
|
||||
render_hidden_aware_preview,
|
||||
require_normalization_gate,
|
||||
)
|
||||
from src.core.text_clean import (
|
||||
PRESETS,
|
||||
@@ -28,6 +29,7 @@ from src.core.text_clean import (
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
@@ -11,9 +11,10 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
|
||||
Reference in New Issue
Block a user