Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.
Core (src/core/):
- analyze.py: Finding gains confidence, fix_action, pre_applied; new
detectors for encoding_uncertain, encoding_decode_failed; new
top-level encoding_override parameter.
- fixes.py: registry of fix algorithms keyed by fix_action id.
- normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
the NormalizationResult / Decision dataclasses the gate consumes.
- io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
and normalizes line endings (fixes bare-CR parser crash); empty file
handled gracefully instead of EmptyDataError traceback.
GUI (src/gui/):
- pages/0_Review.py: gate page with per-finding decision controls,
encoding override picker (16 codepages + custom), and Advanced output
options (encoding, delimiter, line terminator) on the download.
- components.py: require_normalization_gate() helper.
- pages/1-9: gate guard wired on every tool page.
Test corpora:
- test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
UTF-8 files + manifest, synced from Business/DataTools.
- test-cases/text-cleaner-corpus/test_data/17: synced malformed input
(unquoted $1,500.00) for the unquoted-delimiter detector.
Tests (94 new):
- test_normalize.py (48): finding fields, fix registry, auto_fix scope,
decision paths, gate idempotency, output-options helper.
- test_encodings_corpus.py (90, 16 xfailed): parametric detection +
decode + analyzer-no-crash sweep against the manifest.
- test_analyze.py: encoding override + encoding_uncertain detectors.
- test_corpus.py: pre-parse repair in the strict reader.
run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.
Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.
Suite: 765 passed, 17 xfailed (was 458 passed).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
220 lines
8.4 KiB
Python
220 lines
8.4 KiB
Python
"""Run every corpus fixture through the current text cleaner and report diffs.
|
|
|
|
This is an *acceptance* test against an external corpus shipped in
|
|
``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps
|
|
between the current implementation and the spec target in TEST-CASES.md.
|
|
The test fails on diff — that's the point. Each failure is informative.
|
|
|
|
Cases 12 and 14 produce multiple expected outputs depending on flags;
|
|
case 21 is XLSX-only and verified separately (manual / smoke).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.text_clean import CleanOptions, clean_dataframe
|
|
|
|
# Corpus layout, resolved relative to this test module: the tests below read
# their inputs from TEST_DATA and the reference outputs from EXPECTED.
CORPUS = Path(__file__).parent.parent.joinpath("test-cases", "text-cleaner-corpus")
TEST_DATA = CORPUS.joinpath("test_data")
EXPECTED = CORPUS.joinpath("expected")
|
|
|
|
|
|
# Fixture names (without the ".csv" suffix) for which one run with default
# options is expected to reproduce the matching file under EXPECTED.
DEFAULT_CASES = [
    # whitespace, punctuation and Unicode handling
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    # file-level framing: BOM and line endings
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    # content and structure
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "19_headers_only",
    "20_kitchen_sink",
]
|
|
|
|
|
|
def _read_csv_strict(path: Path) -> pd.DataFrame:
    """Load a corpus CSV with every cell kept as a string.

    Only the structural repairs needed for the file to parse at all are
    applied before the text is handed to pandas:

    * NUL bytes are removed (case 06);
    * CRLF and bare CR line endings are turned into LF (cases 09/10);
    * rows go through ``_repair_rows`` (unquoted ``$1,500.00`` in case 17).

    Character-level folds that belong to the cleaner itself (smart quotes,
    NBSP, etc.) are deliberately not applied here, so that the cleaner's own
    behavior is what the tests exercise.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed frame, read with ``dtype=str`` and ``keep_default_na=False``.
    """
    payload = path.read_bytes()
    payload = payload.replace(b"\x00", b"")

    # CRLF must be collapsed before the bare-CR pass; otherwise each CRLF
    # would turn into two line feeds.
    payload = payload.replace(b"\r\n", b"\n")
    payload = payload.replace(b"\r", b"\n")

    from src.core.io import _repair_rows

    # "utf-8-sig" decodes UTF-8 and drops a leading BOM when one is present.
    decoded = payload.decode("utf-8-sig")
    repaired, _, _ = _repair_rows(decoded, ",")

    buffer = io.StringIO(repaired)
    return pd.read_csv(buffer, dtype=str, keep_default_na=False)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DataFrame-level diff (covers cell content; ignores file-level encoding/EOL)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("name", DEFAULT_CASES)
def test_corpus_dataframe_diff(name):
    """Clean one corpus input and diff it cell by cell against the expected DF.

    Input and expected files are both loaded through ``_read_csv_strict``; the
    input is then run through ``clean_dataframe`` with default options. The
    test fails on a header mismatch, a row-count mismatch, or any differing
    cell (the first five differing cells are listed in the failure message).
    A zero-byte input is skipped.
    """
    inp_path = TEST_DATA / f"{name}.csv"
    exp_path = EXPECTED / f"{name}.csv"

    if inp_path.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty (file-level test)")

    df_in = _read_csv_strict(inp_path)
    df_expected = _read_csv_strict(exp_path)

    result = clean_dataframe(df_in)

    # Drop whatever index either frame carries so rows are compared by position.
    actual = result.cleaned_df.reset_index(drop=True)
    expected = df_expected.reset_index(drop=True)

    # Frame-level diff: equal columns, equal row count, equal cell content
    assert list(actual.columns) == list(expected.columns), (
        f"{name}: header mismatch.\n"
        f" actual: {list(actual.columns)!r}\n"
        f" expected: {list(expected.columns)!r}"
    )

    # zip() in the cell loop stops at the shorter column, so rows that were
    # dropped or added by the cleaner would otherwise go unreported.
    assert len(actual) == len(expected), (
        f"{name}: row count mismatch.\n"
        f" actual: {len(actual)}\n"
        f" expected: {len(expected)}"
    )

    diffs = []
    for col in expected.columns:
        pairs = zip(actual[col].tolist(), expected[col].tolist())
        for i, (a, e) in enumerate(pairs):
            if a != e:
                diffs.append((i, col, repr(a), repr(e)))
    assert not diffs, (
        f"{name}: {len(diffs)} cell mismatch(es). First 5:\n"
        + "\n".join(f" row {i} col {c}: actual={a} expected={e}"
                    for i, c, a, e in diffs[:5])
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Idempotency property (every case)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("name", [*DEFAULT_CASES, "12_case_variations", "14_mojibake"])
def test_corpus_idempotent(name):
    """Cleaning an already-cleaned frame must give back the same frame.

    The fixture is cleaned once with default options, the result is cleaned
    again, and the two results are compared with ``DataFrame.equals``.
    A zero-byte input is skipped.
    """
    source = TEST_DATA / f"{name}.csv"
    if source.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty")

    loaded = _read_csv_strict(source)
    first_pass = clean_dataframe(loaded).cleaned_df.reset_index(drop=True)
    second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)

    unchanged = first_pass.equals(second_pass)
    assert unchanged, f"{name}: not idempotent"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCaseVariations:
    """Case 12: one input, three expected outputs depending on case options.

    Covers the default run plus the ``email=lower`` and ``name=title``
    per-column case variants.
    """

    def _variant(self, expected_file, case_columns=None):
        """Return ``(actual, expected)`` frames for one expected-output file.

        With ``case_columns`` left as ``None`` the cleaner is called without
        options; otherwise it receives ``CleanOptions(case_columns=...)``.
        """
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        expected = _read_csv_strict(EXPECTED / expected_file)
        if case_columns is None:
            outcome = clean_dataframe(source)
        else:
            outcome = clean_dataframe(source, CleanOptions(case_columns=case_columns))
        return outcome.cleaned_df.reset_index(drop=True), expected

    def test_default_is_identity_for_case(self):
        """A default run must match the ``__default`` expected file."""
        actual, expected = self._variant("12_case_variations__default.csv")
        # Default should not change case
        assert actual.equals(expected), (
            "12 default: cells differ (case mutated under default config)"
        )

    def test_email_lower(self):
        """Lower-casing the ``email`` column must match its expected file."""
        actual, expected = self._variant(
            "12_case_variations__email_lower.csv", {"email": "lower"},
        )
        assert actual.equals(expected), "12 email_lower variant differs"

    def test_name_title(self):
        """Title-casing the ``name`` column must match its expected file."""
        actual, expected = self._variant(
            "12_case_variations__name_title.csv", {"name": "title"},
        )
        assert actual.equals(expected), "12 name_title variant differs"
|
|
|
|
|
|
class TestMojibake:
    """Case 14: mojibake input, with and without repair."""

    def test_default_no_repair(self):
        """A default run must match the ``__default`` expected file."""
        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        golden = _read_csv_strict(EXPECTED / "14_mojibake__default.csv")
        cleaned = clean_dataframe(source).cleaned_df
        actual = cleaned.reset_index(drop=True)
        assert actual.equals(golden), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
        """Placeholder for the repaired variant; reported as an expected failure."""
        # --fix-mojibake is Tier 2 and the cleaner does not implement it, so
        # the test is ended immediately with an imperative xfail.
        pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
|
|
|
|
|
|
class TestEmptyFile:
    """Case 18: the zero-byte fixture."""

    def test_empty_no_crash(self, tmp_path):
        """Case 18: zero-byte file should not crash.

        As written, this only pins what plain ``pd.read_csv`` does with the
        fixture; the cleaner itself is not called.
        """
        # NOTE(review): ``tmp_path`` is requested but never used in this test.
        empty_csv = TEST_DATA / "18_empty_file.csv"
        size = empty_csv.stat().st_size
        assert size == 0

        # Reading an empty CSV with pandas raises EmptyDataError; corpus says
        # the cleaner must handle it gracefully. Not yet wired in core.
        with pytest.raises(pd.errors.EmptyDataError):
            pd.read_csv(empty_csv)
|
|
|
|
|
|
class TestXlsxPollution:
    """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet."""

    @pytest.fixture(scope="class")
    def workbook(self):
        """Yield the case-21 workbook and close it once the class is done.

        The file handle is opened once per class (``scope="class"``); the
        ``with`` block closes it during fixture teardown instead of leaving
        it open for the rest of the session.
        """
        path = TEST_DATA / "21_excel_pollution.xlsx"
        with pd.ExcelFile(path, engine="openpyxl") as book:
            yield book

    def test_sheets_present(self, workbook):
        """The four sheets the other tests rely on must exist."""
        names = set(workbook.sheet_names)
        assert {"Customers", "Notes", "International", "ForceText"}.issubset(names)

    def test_each_sheet_runs_without_error(self, workbook):
        """Every sheet must clean without raising and keep its row count."""
        for sheet in workbook.sheet_names:
            df = pd.read_excel(
                workbook, sheet_name=sheet, dtype=str, keep_default_na=False,
            )
            result = clean_dataframe(df)
            assert result.cleaned_df.shape[0] == df.shape[0], (
                f"sheet {sheet}: row count changed"
            )

    def test_force_text_leading_zeros_preserved(self, workbook):
        """All-digit ids in the first ForceText column must survive cleaning.

        Each input cell that is made of ASCII digits (ignoring surrounding
        whitespace and a leading apostrophe) is compared with the cleaned cell
        in the same row: the digits, leading zeros included, must be unchanged
        and the cleaned cell must carry no surrounding whitespace.
        """
        df = pd.read_excel(
            workbook, sheet_name="ForceText", dtype=str, keep_default_na=False,
        )
        result = clean_dataframe(df)

        # First column likely an id with leading zeros — make sure it isn't
        # numerically coerced or stripped. (If the column holds no all-digit
        # values, the loop below checks nothing.)
        before = df.iloc[:, 0].tolist()
        after = result.cleaned_df.iloc[:, 0].tolist()

        # Rows are paired by position, so the counts have to agree first.
        assert len(after) == len(before), "ForceText: row count changed"

        for row, (raw, cleaned) in enumerate(zip(before, after)):
            if not isinstance(raw, str):
                continue
            digits = raw.strip().lstrip("'")
            # Restrict to ASCII digits: other Unicode digit forms could be
            # legitimately rewritten by the cleaner.
            if not (digits.isascii() and digits.isdigit()):
                continue
            assert isinstance(cleaned, str) and cleaned.lstrip("'") == digits, (
                f"ForceText row {row}: id changed, "
                f"input={raw!r} cleaned={cleaned!r}"
            )
|