Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
385 lines
14 KiB
Python
385 lines
14 KiB
Python
"""Tests for src.core.analyze — upload-time data quality detectors."""
|
||
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
import pandas as pd
|
||
import pytest
|
||
|
||
from src.core.analyze import (
|
||
Finding,
|
||
TOOL_DEDUPLICATOR,
|
||
TOOL_MISSING_HANDLER,
|
||
TOOL_TEXT_CLEANER,
|
||
analyze,
|
||
findings_by_tool,
|
||
to_dict,
|
||
)
|
||
from src.core.io import RepairAction, RepairResult, repair_bytes
|
||
|
||
|
||
def _ids(findings: list[Finding]) -> set[str]:
|
||
return {f.id for f in findings}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Smart punctuation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding on cell values."""

    def test_finds_curly_quotes(self):
        """Two of the three cells contain curly quotes; expect a warn finding."""
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]})
        results = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(results)
        hit = next(
            item for item in results if item.id == "smart_punctuation_in_data"
        )
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash and a horizontal ellipsis are reported as well."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII text must not produce the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Invisible / NBSP / dirty headers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters and dirty column headers.

    Invisible characters in the fixtures are spelled as explicit Unicode
    escapes. Written literally they are indistinguishable from a plain
    space (or from nothing at all) and are easily lost when the file is
    copied, reformatted or viewed through a tool that normalises text.
    """

    def test_finds_nbsp(self):
        """One cell ending in U+00A0 yields the NBSP finding with count 1."""
        # A regular trailing space would be whitespace padding, not NBSP,
        # so the non-breaking space is written as an escape.
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A cell containing U+200B yields the zero-width finding."""
        # NOTE(review): the fixture previously showed no visible character;
        # a ZERO WIDTH SPACE in the middle of the word is assumed here.
        # Confirm against the intended fixture.
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both headers are expected to be reported as dirty."""
        # NOTE(review): "Email" looks clean as written, yet count == 2 is
        # asserted. Verify that this header has not lost an invisible
        # character (or that the detector treats it as dirty for another
        # reason) before relying on this fixture.
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Plain lowercase headers must not be reported."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Whitespace padding
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A cell with leading and trailing spaces is reported."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """A run of two spaces inside a cell is reported."""
        # The first value must contain TWO consecutive spaces. With a single
        # space it has the same shape as "also clean" in the negative test
        # below, which is asserted to produce no finding.
        df = pd.DataFrame({"x": ["double  space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Single internal spaces and no padding produce no finding."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Null-like sentinels
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four of five cells are placeholder strings; expect an info finding."""
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(item for item in results if item.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not be reported as null-like."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mojibake
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """Values containing mis-decoded UTF-8 sequences are reported."""
        # The UTF-8 bytes of "é" (C3 A9) and "ü" (C3 BC), decoded with a
        # single-byte Western code page, show up as "Ã©" and "Ã¼". They are
        # written as escapes so that a text-repair tool or editor cannot
        # silently "fix" the fixture into clean text, which would make it
        # identical to the negative test below.
        # NOTE(review): which of the three values were mojibake in the
        # intended fixture is assumed; the middle value is left clean.
        df = pd.DataFrame(
            {"name": ["caf\u00c3\u00a9", "café", "M\u00c3\u00bcller"]}
        )
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly decoded accented text must not be reported."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mixed-case email column
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An email column containing upper-case letters is reported."""
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """An all-lowercase email column is not reported."""
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a column without email values is not reported."""
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Leading-zero IDs
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded SKU strings is reported."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": skus})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" in reported

    def test_no_finding_when_no_leading_zero(self):
        """String IDs "1" through "99" carry no leading zero; nothing is reported."""
        frame = pd.DataFrame({"id": list(map(str, range(1, 100)))})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Near-duplicate rows
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """The first two rows differ only in letter case and a trailing space."""
        frame = pd.DataFrame(
            {
                "name": ["Alice", "alice ", "Bob"],
                "email": ["a@b.com", "A@B.COM", "bob@b.com"],
            }
        )
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Three distinct rows produce no near-duplicate finding."""
        frame = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Carol"],
                "email": ["a@x.com", "b@x.com", "c@x.com"],
            }
        )
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame produces no near-duplicate finding."""
        frame = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Encoding uncertainty, encoding override, lying BOM, and mixed line endings
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestEncodingUncertainty:
    """Checks for the ``encoding_uncertain`` finding.

    The fixtures embed U+FFFD REPLACEMENT CHARACTER, written as an escape.
    Its UTF-8 encoding is the byte sequence EF BF BD; the literal text
    "<EFBFBD>" is not the same thing and contains no replacement character.
    """

    def test_replacement_chars_in_data_flagged(self):
        """Two cells containing U+FFFD yield a low-confidence error, count 2."""
        df = pd.DataFrame({"name": ["Caf\ufffd", "Ber\ufffdin"]})
        findings = analyze(df)
        f = next(f for f in findings if f.id == "encoding_uncertain")
        assert f.severity == "error"
        assert f.confidence == "low"
        assert f.count == 2

    def test_replacement_chars_in_header_flagged(self):
        """A U+FFFD inside a column header is reported as well."""
        df = pd.DataFrame({"emai\ufffdl": ["a@x.com"]})
        findings = analyze(df)
        ids = {f.id for f in findings}
        assert "encoding_uncertain" in ids

    def test_clean_data_no_finding(self):
        """Plain ASCII data must not be reported."""
        df = pd.DataFrame({"name": ["Alice", "Bob"]})
        findings = analyze(df)
        assert "encoding_uncertain" not in {f.id for f in findings}
|
||
|
||
|
||
class TestEncodingOverride:
    """Checks that a caller-supplied ``encoding_override`` is honoured."""

    def test_override_corrects_misdetected_codepage(self, tmp_path):
        """Loading with ``encoding_override="cp1252"`` decodes 0xF1 as ñ."""
        from src.core.analyze import _load_for_analysis

        # WESTERN_BASIC bytes encoded as cp1252; charset-normalizer guesses
        # cp1250, which gets 0xF1 wrong (ń vs ñ).
        csv_path = tmp_path / "cp1252.csv"
        payload = "id,name\n1,España\n".encode("cp1252")
        csv_path.write_bytes(payload)

        # The auto-detected load is still executed, although its result is
        # not asserted on.
        df_auto, _, _ = _load_for_analysis(csv_path, sample_rows=10)
        forced, _, _ = _load_for_analysis(
            csv_path,
            sample_rows=10,
            encoding_override="cp1252",
        )
        # Override yields the correct character.
        first_name = forced["name"].iloc[0]
        assert first_name == "España"

    def test_override_propagates_through_top_level_analyze(self, tmp_path):
        """``analyze`` accepts the override and reports no encoding problems."""
        csv_path = tmp_path / "koi8.csv"
        # KOI8-R Cyrillic; default detection guesses Shift_JIS.
        csv_path.write_bytes("id,name\n1,Иван\n".encode("koi8-r"))
        # With the override the analyzer should produce zero findings
        # against this clean fixture (no mojibake, no U+FFFD).
        results = analyze(csv_path, encoding_override="koi8-r")
        reported = {item.id for item in results}
        assert "encoding_uncertain" not in reported
        assert "encoding_decode_failed" not in reported
|
||
|
||
|
||
class TestEncodingDecodeFailedFromRepair:
    """Checks the finding raised for a file whose BOM contradicts its body."""

    def test_lying_bom_recovered_and_flagged(self, tmp_path):
        """A UTF-8 BOM followed by a cp1252 body yields ``encoding_lying_bom``."""
        # File has a UTF-8 BOM but the body bytes are cp1252 (0x80 = € in
        # cp1252; not a valid UTF-8 continuation byte). Detector should
        # recover transparently to cp1252 and surface an
        # ``encoding_lying_bom`` warn so the user knows.
        csv_path = tmp_path / "lying_bom.csv"
        csv_path.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")

        results = analyze(csv_path)
        reported = {item.id for item in results}
        assert "encoding_lying_bom" in reported

        flagged = next(item for item in results if item.id == "encoding_lying_bom")
        assert flagged.severity == "warn"
        # Decode should have succeeded — no replacement-character finding.
        assert "encoding_decode_failed" not in reported
|
||
|
||
|
||
class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file mixing CRLF and bare LF terminators is reported."""
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file using only LF terminators is not reported."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """Analyzing an in-memory frame never reports mixed line endings."""
        # No raw bytes -> mixed_line_endings cannot be detected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Findings synthesized from RepairResult
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestFindingsFromRepair:
    """Checks the findings ``analyze`` derives from a ``repair_result``."""

    def test_bom_strip_surfaces(self):
        """Repairing BOM-prefixed bytes leads to ``csv_bom_stripped``."""
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL leads to ``csv_nul_stripped``."""
        outcome = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """An unrepairable line is reported with severity ``error``."""
        # Synthesize a result with an unrepairable line.
        outcome = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=outcome)
        hit = next(item for item in results if item.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# End-to-end on the corpus kitchen-sink fixture
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestEndToEnd:
    """Whole-pipeline checks: a corpus fixture and a fully clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink corpus file yields both repair and data findings."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")

        reported = _ids(analyze(fixture))
        # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers.
        # Pre-parse repair handles the file-level smart-quote/BOM, so they
        # show up as csv_* findings; the cell-level NBSP/ZW remain as
        # data findings.
        assert "csv_bom_stripped" in reported or "csv_smart_quotes_folded" in reported
        # NBSP-padded headers should still surface — pre-parse repair only
        # touches double-quote characters.
        data_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(name.startswith(data_prefixes) for name in reported)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with nothing wrong returns an empty findings list."""
        frame = pd.DataFrame(
            {
                "id": ["1", "2", "3"],
                "name": ["Alice", "Bob", "Carol"],
                "email": ["a@x.com", "b@x.com", "c@x.com"],
            }
        )
        results = analyze(frame)
        assert results == []
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Text-cleaner and missing-handler findings each get their own group."""
        frame = pd.DataFrame(
            {
                "name": [" padded ", "“smart”"],
                "x": ["N/A", "valid"],
            }
        )
        by_tool = findings_by_tool(analyze(frame))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """No group key in the result may be empty/falsy."""
        outcome = RepairResult(
            repaired_bytes=b"",
            actions=[],
            unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=outcome)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear.
        assert all(bool(tool) for tool in by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output can be passed to ``json.dumps`` unchanged."""
        import json

        frame = pd.DataFrame({"x": [" padded "]})
        results = analyze(frame)
        payload = to_dict(results[0])
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload
|