Files
datatools-dev/tests/test_analyze.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

385 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for src.core.analyze — upload-time data quality detectors."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
TOOL_DEDUPLICATOR,
TOOL_MISSING_HANDLER,
TOOL_TEXT_CLEANER,
analyze,
findings_by_tool,
to_dict,
)
from src.core.io import RepairAction, RepairResult, repair_bytes
def _ids(findings: list[Finding]) -> set[str]:
    """Collect the distinct ``id`` attribute values carried by *findings*."""
    unique_ids: set[str] = set()
    for finding in findings:
        unique_ids.add(finding.id)
    return unique_ids
# ---------------------------------------------------------------------------
# Smart punctuation
# ---------------------------------------------------------------------------
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding.

    Typographic characters in the fixtures are written as Unicode escapes so
    that editors, formatters and web views cannot silently alter or drop them.
    """

    def test_finds_curly_quotes(self):
        """Curly quotes and a curly apostrophe are reported as a text-cleaner warning."""
        # U+201C / U+201D wrap "fancy"; U+2019 is the apostrophe in "it's".
        # NOTE(review): count == 2 is read here as "two affected cells" --
        # confirm against the detector's counting rule.
        df = pd.DataFrame({"note": ["plain", "\u201cfancy\u201d", "it\u2019s"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" in _ids(findings)
        f = next(f for f in findings if f.id == "smart_punctuation_in_data")
        assert f.severity == "warn"
        assert f.tool == TOOL_TEXT_CLEANER
        assert f.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash (U+2014) or a horizontal ellipsis (U+2026) also raises the finding."""
        df = pd.DataFrame({"note": ["a\u2014b", "wait\u2026"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" in _ids(findings)

    def test_clean_data_no_finding(self):
        """Pure-ASCII cells must not raise the finding."""
        df = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        findings = analyze(df)
        assert "smart_punctuation_in_data" not in _ids(findings)
# ---------------------------------------------------------------------------
# Invisible / NBSP / dirty headers
# ---------------------------------------------------------------------------
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters and dirty column headers.

    Invisible characters are written as Unicode escapes: embedded literally
    they cannot be reviewed and are easily stripped by tooling, which would
    silently turn these tests into checks of plain ASCII data.
    """

    def test_finds_nbsp(self):
        """A single cell ending in U+00A0 is reported with count 1."""
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A zero-width space (U+200B) inside a cell raises the invisible-char finding."""
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both column headers are expected to be reported as dirty."""
        # NOTE(review): if the original header literals carried invisible
        # characters (e.g. NBSP padding), re-enter them as escapes -- they
        # cannot be recovered from the rendered text.
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Lower-case, unpadded headers must not raise the finding."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
# ---------------------------------------------------------------------------
# Whitespace padding
# ---------------------------------------------------------------------------
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A cell padded with a space on both sides raises the finding."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """Two consecutive internal spaces raise the finding; a single one is the control."""
        # The run of two spaces is spelled with \x20 escapes so that
        # whitespace-collapsing tools cannot reduce it to a single space and
        # make this fixture identical in shape to the clean control.
        df = pd.DataFrame({"x": ["double\x20\x20space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Cells with at most single internal spaces must not raise the finding."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
# ---------------------------------------------------------------------------
# Null-like sentinels
# ---------------------------------------------------------------------------
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four sentinel spellings beside one real value give an info finding with count 4."""
        cells = ["valid", "N/A", "nan", "None", "-"]
        results = analyze(pd.DataFrame({"x": cells}))
        hit = next(r for r in results if r.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not raise the finding."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported
# ---------------------------------------------------------------------------
# Mojibake
# ---------------------------------------------------------------------------
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """UTF-8 text that was mis-decoded as Latin-1 raises the finding."""
        # "é" (UTF-8 C3 A9) read as Latin-1 becomes U+00C3 U+00A9, and "ü"
        # (C3 BC) becomes U+00C3 U+00BC.  The broken forms are written as
        # escapes so encoding-repair tooling cannot "fix" the fixture into
        # clean text, which would make it identical to the negative control.
        df = pd.DataFrame({"name": ["caf\u00c3\u00a9", "café", "M\u00c3\u00bcller"]})
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly decoded accented text must not raise the finding."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
# ---------------------------------------------------------------------------
# Mixed-case email column
# ---------------------------------------------------------------------------
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An address containing upper-case letters raises the finding."""
        addresses = ["Alice@Example.COM", "bob@example.com"]
        reported = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """All-lower-case addresses must not raise the finding."""
        addresses = ["a@b.com", "c@d.com"]
        reported = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed-case values in a ``name`` column without addresses are not flagged."""
        names = ["Alice", "bob"]
        reported = _ids(analyze(pd.DataFrame({"name": names})))
        assert "mixed_case_email_column" not in reported
# ---------------------------------------------------------------------------
# Leading-zero IDs
# ---------------------------------------------------------------------------
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of six zero-padded SKU strings raises the finding."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        reported = _ids(analyze(pd.DataFrame({"sku": skus})))
        assert "leading_zero_ids" in reported

    def test_no_finding_when_no_leading_zero(self):
        """The strings "1" through "99" carry no leading zero and are not flagged."""
        plain_ids = list(map(str, range(1, 100)))
        reported = _ids(analyze(pd.DataFrame({"id": plain_ids})))
        assert "leading_zero_ids" not in reported
# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------
class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """Rows that differ only by letter case and a trailing space are flagged."""
        columns = {
            "name": ["Alice", "alice ", "Bob"],
            "email": ["a@b.com", "A@B.COM", "bob@b.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Three distinct rows must not raise the finding."""
        columns = {
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame must not raise the finding."""
        lone_row = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(lone_row))
        assert "near_duplicate_rows" not in reported
# ---------------------------------------------------------------------------
# Encoding uncertainty, overrides, and lying-BOM recovery
# ---------------------------------------------------------------------------
class TestEncodingUncertainty:
    """Checks for the ``encoding_uncertain`` finding.

    The fixtures contain U+FFFD REPLACEMENT CHARACTER, written as an escape
    so the character survives tools that would otherwise render or rewrite it
    as some other text.
    """

    def test_replacement_chars_in_data_flagged(self):
        """Two cells containing U+FFFD give an error-severity, low-confidence finding."""
        df = pd.DataFrame({"name": ["Caf\ufffd", "Ber\ufffdin"]})
        findings = analyze(df)
        f = next(f for f in findings if f.id == "encoding_uncertain")
        assert f.severity == "error"
        assert f.confidence == "low"
        assert f.count == 2

    def test_replacement_chars_in_header_flagged(self):
        """U+FFFD inside a column header also raises the finding."""
        df = pd.DataFrame({"emai\ufffdl": ["a@x.com"]})
        findings = analyze(df)
        assert "encoding_uncertain" in _ids(findings)

    def test_clean_data_no_finding(self):
        """ASCII-only data must not raise the finding."""
        df = pd.DataFrame({"name": ["Alice", "Bob"]})
        findings = analyze(df)
        assert "encoding_uncertain" not in _ids(findings)
class TestEncodingOverride:
    """Checks that an explicit ``encoding_override`` is honoured when loading a file."""

    def test_override_corrects_misdetected_codepage(self, tmp_path):
        """Loading cp1252 bytes with ``encoding_override="cp1252"`` decodes the name correctly."""
        from src.core.analyze import _load_for_analysis

        # Western text stored as cp1252.  Per the original author's note,
        # charset-normalizer guesses cp1250 here, which maps byte 0xF1 to the
        # wrong letter (ń rather than ñ).
        csv_path = tmp_path / "cp1252.csv"
        csv_path.write_bytes("id,name\n1,España\n".encode("cp1252"))

        # The auto-detected load is exercised but its result is not asserted on.
        _auto_frame, _, _ = _load_for_analysis(csv_path, sample_rows=10)
        overridden_frame, _, _ = _load_for_analysis(
            csv_path,
            sample_rows=10,
            encoding_override="cp1252",
        )

        # With the override in place the expected character comes back.
        first_name = overridden_frame["name"].iloc[0]
        assert first_name == "España"

    def test_override_propagates_through_top_level_analyze(self, tmp_path):
        """``analyze`` accepts ``encoding_override`` and reports no encoding findings."""
        # KOI8-R Cyrillic; per the original author's note, default detection
        # guesses Shift_JIS for these bytes.
        csv_path = tmp_path / "koi8.csv"
        csv_path.write_bytes("id,name\n1,Иван\n".encode("koi8-r"))

        # The fixture is clean (no mojibake, no U+FFFD), so with the override
        # neither encoding-related finding should be present.
        reported = _ids(analyze(csv_path, encoding_override="koi8-r"))
        assert "encoding_uncertain" not in reported
        assert "encoding_decode_failed" not in reported
class TestEncodingDecodeFailedFromRepair:
    """Checks how a file whose BOM disagrees with its body is reported."""

    def test_lying_bom_recovered_and_flagged(self, tmp_path):
        """A UTF-8 BOM followed by cp1252 bytes yields an ``encoding_lying_bom`` warning."""
        # The file opens with a UTF-8 BOM, but the body holds byte 0x80 (the
        # euro sign in cp1252), which is not valid there as UTF-8.  The test
        # expects the load to succeed and the mismatch to surface as a
        # warning rather than as a decode failure.
        csv_path = tmp_path / "lying_bom.csv"
        csv_path.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")

        results = analyze(csv_path)
        reported = _ids(results)
        assert "encoding_lying_bom" in reported

        lying_bom = next(r for r in results if r.id == "encoding_lying_bom")
        assert lying_bom.severity == "warn"

        # Decoding is expected to have succeeded, so no decode-failed finding.
        assert "encoding_decode_failed" not in reported
class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file mixing CRLF and bare LF terminators raises the finding."""
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file using LF throughout must not raise the finding."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """A DataFrame input must not raise the finding."""
        # There are no raw bytes to inspect when a DataFrame is passed in, so
        # line endings cannot be detected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------
class TestFindingsFromRepair:
    """Checks that a ``RepairResult`` passed to ``analyze`` surfaces as findings."""

    def test_bom_strip_surfaces(self):
        """Repairing bytes that start with a UTF-8 BOM leads to ``csv_bom_stripped``."""
        repaired = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame, repair_result=repaired))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL leads to ``csv_nul_stripped``."""
        repaired = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = _ids(analyze(frame, repair_result=repaired))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """An unrepairable line in the result is reported with error severity."""
        # Build the result by hand so it carries one unrepairable line.
        hand_built = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=hand_built)
        unrepairable = next(r for r in results if r.id == "csv_unrepairable_rows")
        assert unrepairable.severity == "error"
# ---------------------------------------------------------------------------
# End-to-end on the corpus kitchen-sink fixture
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """End-to-end checks: the kitchen-sink corpus file and a fully clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink CSV yields a file-level and a cell/header-level finding."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")

        reported = _ids(analyze(fixture))

        # Per the original author's note the fixture holds a BOM, smart
        # quotes, NBSP, ZWSP and dirty headers.  Pre-parse repair deals with
        # the file-level BOM / smart quotes, so those appear as csv_*
        # findings, while cell-level NBSP / zero-width stay as data findings.
        assert reported & {"csv_bom_stripped", "csv_smart_quotes_folded"}

        # NBSP-padded headers should still surface, since pre-parse repair
        # only touches double-quote characters.
        data_level_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(name.startswith(data_level_prefixes) for name in reported)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with nothing to report produces an empty findings list."""
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        results = analyze(pd.DataFrame(columns))
        assert results == []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Text-cleaner and missing-handler findings appear under their tool keys."""
        columns = {
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        }
        by_tool = findings_by_tool(analyze(pd.DataFrame(columns)))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """Findings without a tool must not create an empty group key."""
        hand_built = RepairResult(
            repaired_bytes=b"",
            actions=[],
            unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=hand_built)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear, so every
        # group key must be truthy.
        assert all(by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output survives ``json.dumps`` and exposes ``id`` and ``samples``."""
        import json

        results = analyze(pd.DataFrame({"x": [" padded "]}))
        payload = to_dict(results[0])
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload