Files
datatools-dev/tests/test_analyze.py
Michael 82d7fef21e feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.

Core (src/core/):
  - analyze.py: Finding gains confidence, fix_action, pre_applied; new
    detectors for encoding_uncertain, encoding_decode_failed; new top-
    level encoding_override parameter.
  - fixes.py: registry of fix algorithms keyed by fix_action id.
  - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
    the NormalizationResult / Decision dataclasses the gate consumes.
  - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
    transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
    and normalizes line endings (fixes bare-CR parser crash); empty file
    handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
  - pages/0_Review.py: gate page with per-finding decision controls,
    encoding override picker (16 codepages + custom), and Advanced output
    options (encoding, delimiter, line terminator) on the download.
  - components.py: require_normalization_gate() helper.
  - pages/1-9: gate guard wired on every tool page.

Test corpora:
  - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
    UTF-8 files + manifest, synced from Business/DataTools.
  - test-cases/text-cleaner-corpus/test_data/17: synced malformed input
    (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
  - test_normalize.py (48): finding fields, fix registry, auto_fix scope,
    decision paths, gate idempotency, output-options helper.
  - test_encodings_corpus.py (90, 16 xfailed): parametric detection +
    decode + analyzer-no-crash sweep against the manifest.
  - test_analyze.py: encoding override + encoding_uncertain detectors.
  - test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00

381 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for src.core.analyze — upload-time data quality detectors."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
TOOL_DEDUPLICATOR,
TOOL_MISSING_HANDLER,
TOOL_TEXT_CLEANER,
analyze,
findings_by_tool,
to_dict,
)
from src.core.io import RepairAction, RepairResult, repair_bytes
def _ids(findings: list[Finding]) -> set[str]:
    """Return the distinct ``id`` values carried by *findings*."""
    return {finding.id for finding in findings}
# ---------------------------------------------------------------------------
# Smart punctuation
# ---------------------------------------------------------------------------
class TestSmartPunctuation:
    """Detector coverage for the ``smart_punctuation_in_data`` finding.

    Non-ASCII punctuation is written with ``\\u`` escapes so the fixtures
    survive editors, viewers, and copy/paste steps that fold or drop
    look-alike characters.
    """

    def test_finds_curly_quotes(self):
        # U+201C / U+201D are curly double quotes, U+2019 a curly apostrophe.
        # NOTE(review): the apostrophe in the third cell was not visible in
        # the reviewed copy; it is restored on the assumption that ``count``
        # is per affected cell (two cells -> 2). Confirm against the raw file.
        df = pd.DataFrame({"note": ["plain", "\u201cfancy\u201d", "it\u2019s"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" in _ids(findings)
        f = next(f for f in findings if f.id == "smart_punctuation_in_data")
        assert f.severity == "warn"
        assert f.tool == TOOL_TEXT_CLEANER
        assert f.count == 2

    def test_finds_dashes_and_ellipsis(self):
        # U+2014 EM DASH and U+2026 HORIZONTAL ELLIPSIS.
        df = pd.DataFrame({"note": ["a\u2014b", "wait\u2026"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" in _ids(findings)

    def test_clean_data_no_finding(self):
        # Pure-ASCII control: the detector must stay silent.
        df = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" not in _ids(findings)
# ---------------------------------------------------------------------------
# Invisible / NBSP / dirty headers
# ---------------------------------------------------------------------------
class TestInvisibleChars:
    """Detector coverage for NBSP, zero-width characters, and dirty headers.

    NOTE(review): the hosting view warns that this file contains invisible
    Unicode characters, which do not survive copy/paste or some editors.
    The fixtures below spell them as ``\\u`` escapes instead; confirm the
    exact code points and positions against the raw file.
    """

    def test_finds_nbsp(self):
        # U+00A0 NO-BREAK SPACE trailing the first value.
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        # U+200B ZERO WIDTH SPACE embedded in the first value.
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        # The first header is space-padded. The second carries a trailing
        # U+200B: presumably the original also held an invisible character
        # there, since count == 2 needs both headers flagged -- TODO confirm.
        df = pd.DataFrame({" id ": [1], "Email\u200b": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        # Plain lowercase ASCII headers are the negative control.
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
# ---------------------------------------------------------------------------
# Whitespace padding
# ---------------------------------------------------------------------------
class TestWhitespacePadding:
    """Detector coverage for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        # One value padded on both sides, one clean control value.
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        # The first value must contain two consecutive spaces; with a single
        # space it would be indistinguishable from the control value.
        df = pd.DataFrame({"x": ["double  space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        # No padding and no repeated spaces: the detector must stay silent.
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
# ---------------------------------------------------------------------------
# Null-like sentinels
# ---------------------------------------------------------------------------
class TestNullLikeSentinels:
    """Detector coverage for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        # Four placeholder strings plus one genuine value.
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(item for item in results if item.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        # Ordinary values must not be reported as sentinels.
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        assert "null_like_sentinels" not in _ids(analyze(frame))
# ---------------------------------------------------------------------------
# Mojibake
# ---------------------------------------------------------------------------
class TestMojibake:
    """Detector coverage for the ``suspected_mojibake`` finding.

    Fixtures use ``\\u`` escapes so that tools which silently repair or
    normalize text cannot turn the broken sample into the clean one.
    """

    def test_finds_classic_pattern(self):
        # UTF-8 bytes misread as a single-byte codepage: U+00E9 shows up as
        # U+00C3 U+00A9 and U+00FC as U+00C3 U+00BC. The middle value is a
        # correctly decoded control.
        # NOTE(review): the reviewed copy showed only clean text here;
        # confirm the exact mojibake sequences against the raw file.
        df = pd.DataFrame({
            "name": ["caf\u00c3\u00a9", "caf\u00e9", "M\u00c3\u00bcller"],
        })
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        # Properly decoded accented text (U+00E9, U+00EF, U+00FC) must not
        # be mistaken for mojibake.
        df = pd.DataFrame({"name": ["caf\u00e9", "na\u00efve", "M\u00fcnchen"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
# ---------------------------------------------------------------------------
# Mixed-case email column
# ---------------------------------------------------------------------------
class TestMixedCaseEmail:
    """Detector coverage for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        # One address with upper-case letters, one all lower-case.
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        assert "mixed_case_email_column" in _ids(analyze(frame))

    def test_all_lower_no_finding(self):
        # Uniformly lower-case addresses are clean.
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        assert "mixed_case_email_column" not in _ids(analyze(frame))

    def test_non_email_column_ignored(self):
        # Mixed case in a column holding no email addresses is not reported.
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        assert "mixed_case_email_column" not in _ids(analyze(frame))
# ---------------------------------------------------------------------------
# Leading-zero IDs
# ---------------------------------------------------------------------------
class TestLeadingZeroIds:
    """Detector coverage for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        # Six zero-padded identifiers in a single column.
        padded = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": padded})
        assert "leading_zero_ids" in _ids(analyze(frame))

    def test_no_finding_when_no_leading_zero(self):
        # "1" .. "99": numeric strings without any zero padding.
        plain = list(map(str, range(1, 100)))
        frame = pd.DataFrame({"id": plain})
        assert "leading_zero_ids" not in _ids(analyze(frame))
# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------
class TestNearDuplicates:
    """Detector coverage for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        # Rows 0 and 1 differ only by letter case and a trailing space.
        names = ["Alice", "alice ", "Bob"]
        emails = ["a@b.com", "A@B.COM", "bob@b.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        assert "near_duplicate_rows" in _ids(analyze(frame))

    def test_unique_rows_no_finding(self):
        # Three rows with distinct names and addresses.
        names = ["Alice", "Bob", "Carol"]
        emails = ["a@x.com", "b@x.com", "c@x.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        assert "near_duplicate_rows" not in _ids(analyze(frame))

    def test_single_row_no_finding(self):
        # A lone row has nothing to be a duplicate of.
        frame = pd.DataFrame({"x": ["only"]})
        assert "near_duplicate_rows" not in _ids(analyze(frame))
# ---------------------------------------------------------------------------
# Encoding detection and mixed line endings
# ---------------------------------------------------------------------------
class TestEncodingUncertainty:
    """Detector coverage for the ``encoding_uncertain`` finding.

    U+FFFD REPLACEMENT CHARACTER is written as ``\\ufffd`` so the fixture
    cannot be degraded into literal placeholder text by viewers or editors.
    """

    def test_replacement_chars_in_data_flagged(self):
        # Two cells, each holding one replacement character.
        df = pd.DataFrame({"name": ["Caf\ufffd", "Ber\ufffdin"]})
        findings = analyze(df)
        f = next(f for f in findings if f.id == "encoding_uncertain")
        assert f.severity == "error"
        assert f.confidence == "low"
        assert f.count == 2

    def test_replacement_chars_in_header_flagged(self):
        # A replacement character inside a column name is reported as well.
        df = pd.DataFrame({"emai\ufffdl": ["a@x.com"]})
        findings = analyze(df)
        assert "encoding_uncertain" in _ids(findings)

    def test_clean_data_no_finding(self):
        # Plain ASCII data: nothing to report.
        df = pd.DataFrame({"name": ["Alice", "Bob"]})
        findings = analyze(df)
        assert "encoding_uncertain" not in _ids(findings)
class TestEncodingOverride:
    """An explicit ``encoding_override`` must be honored when loading files."""

    def test_override_corrects_misdetected_codepage(self, tmp_path):
        # WESTERN_BASIC bytes encoded as cp1252; charset-normalizer guesses
        # cp1250, which gets 0xF1 wrong (ń vs ñ).
        f = tmp_path / "cp1252.csv"
        # U+00F1 is escaped so the literal always encodes to the single
        # cp1252 byte 0xF1, whatever normalization the source file picks up.
        f.write_bytes("id,name\n1,Espa\u00f1a\n".encode("cp1252"))
        from src.core.analyze import _load_for_analysis

        # The auto-detected load is exercised only to prove it does not
        # raise; its decoded result is not asserted on.
        _load_for_analysis(f, sample_rows=10)
        df_overridden, _, _ = _load_for_analysis(
            f, sample_rows=10, encoding_override="cp1252",
        )
        # Override yields the correct character.
        assert df_overridden["name"].iloc[0] == "Espa\u00f1a"

    def test_override_propagates_through_top_level_analyze(self, tmp_path):
        f = tmp_path / "koi8.csv"
        # KOI8-R Cyrillic; default detection guesses Shift_JIS.
        f.write_bytes("id,name\n1,Иван\n".encode("koi8-r"))
        # With the override, neither encoding-related finding may be
        # reported for this clean fixture (no mojibake, no U+FFFD).
        findings = analyze(f, encoding_override="koi8-r")
        ids = _ids(findings)
        assert "encoding_uncertain" not in ids
        assert "encoding_decode_failed" not in ids
class TestEncodingDecodeFailedFromRepair:
    """A failed decode must surface as an ``encoding_decode_failed`` finding."""

    def test_decode_replaced_action_surfaces_error_finding(self, tmp_path):
        # The file claims UTF-8 through its BOM, but the body holds the
        # cp1252 byte 0x80 (the euro sign), which utf-8-sig cannot decode.
        target = tmp_path / "lying_bom.csv"
        target.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")
        results = analyze(target)
        assert "encoding_decode_failed" in _ids(results)
        failure = next(
            item for item in results if item.id == "encoding_decode_failed"
        )
        assert failure.severity == "error"
class TestMixedLineEndings:
    """Detector coverage for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        # CRLF on the first and last rows, bare LF in between.
        target = tmp_path / "mixed.csv"
        target.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        assert "mixed_line_endings" in _ids(analyze(target))

    def test_uniform_lf_not_flagged(self, tmp_path):
        # Every row ends with LF only.
        target = tmp_path / "uniform.csv"
        target.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        assert "mixed_line_endings" not in _ids(analyze(target))

    def test_dataframe_mode_skips_detector(self):
        # No raw bytes -> mixed_line_endings cannot be detected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        assert "mixed_line_endings" not in _ids(analyze(frame))
# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------
class TestFindingsFromRepair:
    """Findings derived from a ``RepairResult`` passed to ``analyze``."""

    def test_bom_strip_surfaces(self):
        # Input bytes start with a UTF-8 BOM.
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        results = analyze(frame, repair_result=outcome)
        assert "csv_bom_stripped" in _ids(results)

    def test_nul_strip_surfaces(self):
        # Input bytes contain a NUL in the middle of a cell.
        outcome = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        results = analyze(frame, repair_result=outcome)
        assert "csv_nul_stripped" in _ids(results)

    def test_unrepairable_surfaces_as_error(self):
        # Synthesize a result with an unrepairable line.
        outcome = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=outcome)
        hit = next(item for item in results if item.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
# ---------------------------------------------------------------------------
# End-to-end on the corpus kitchen-sink fixture
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Whole-pipeline checks: one corpus fixture and one clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        found = _ids(analyze(fixture))
        # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers.
        # Pre-parse repair handles the file-level smart-quote/BOM, so they
        # show up as csv_* findings; the cell-level NBSP/ZW remain as
        # data findings.
        assert "csv_bom_stripped" in found or "csv_smart_quotes_folded" in found
        # NBSP-padded headers should still surface — pre-parse repair only
        # touches double-quote characters.
        data_level_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(name.startswith(data_level_prefixes) for name in found)

    def test_clean_dataframe_returns_empty_findings(self):
        # A frame with nothing wrong must yield an empty findings list.
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        assert analyze(pd.DataFrame(columns)) == []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class TestHelpers:
    """Tests for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        # One column triggers text-cleaner findings, the other a sentinel.
        frame = pd.DataFrame({
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        })
        by_tool = findings_by_tool(analyze(frame))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        outcome = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=outcome)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear.
        assert all(tool_name for tool_name in by_tool)

    def test_to_dict_is_json_serializable(self):
        import json

        frame = pd.DataFrame({"x": [" padded "]})
        payload = to_dict(analyze(frame)[0])
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload