feat(analyze): upload-time data quality analyzer
Pure, advisory scan over an uploaded file or DataFrame that returns a list of
Finding objects naming each issue, the affected count, and which downstream
tool can fix it. The GUI uses this to badge tool nav items at upload; the CLI
will print findings as a table or JSON.
src/core/analyze.py:
Finding dataclass (id, severity, tool, count, description, column, samples)
analyze(source, *, sample_rows=1000, repair_result=None) -> list[Finding]
- source: DataFrame, path, or str. A path source scans only the first sample_rows rows (default 1000).
- When source is a path, runs the same pre-parse repair the tool pages
will use; the resulting RepairResult is auto-surfaced as csv_*
findings. A caller-supplied repair_result wins so non-default repair
flags are respected.
Detectors (each independent, samples capped at 5):
- smart_punctuation_in_data -> 02
- nbsp_or_unicode_whitespace -> 02
- zero_width_or_invisible -> 02
- dirty_column_headers -> 02
- whitespace_padding -> 02
- null_like_sentinels -> 04
- suspected_mojibake -> 02 (Tier 2)
- mixed_case_email_column -> 02 case op
- leading_zero_ids -> informational, no tool
Helpers: findings_by_tool() for sidebar grouping, to_dict() for JSON.
Detectors are decoupled from the GUI display layer — they emit stable tool
ids ("02_text_cleaner") and the GUI maps those to display names.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
268
tests/test_analyze.py
Normal file
268
tests/test_analyze.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""Tests for src.core.analyze — upload-time data quality detectors."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.analyze import (
|
||||
Finding,
|
||||
TOOL_DEDUPLICATOR,
|
||||
TOOL_MISSING_HANDLER,
|
||||
TOOL_TEXT_CLEANER,
|
||||
analyze,
|
||||
findings_by_tool,
|
||||
to_dict,
|
||||
)
|
||||
from src.core.io import RepairAction, RepairResult, repair_bytes
|
||||
|
||||
|
||||
def _ids(findings: list[Finding]) -> set[str]:
    """Return the set of distinct ``id`` values carried by *findings*.

    Duplicates collapse, so membership checks in the tests below do not
    depend on how many findings share the same id.
    """
    return {finding.id for finding in findings}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Smart punctuation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    FINDING_ID = "smart_punctuation_in_data"

    def test_finds_curly_quotes(self):
        """Curly quotes and a curly apostrophe are reported with count 2."""
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]})
        results = analyze(frame)
        assert self.FINDING_ID in _ids(results)
        hit = next(r for r in results if r.id == self.FINDING_ID)
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash and an ellipsis character also raise the finding."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII cells must not raise the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Invisible / NBSP / dirty headers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInvisibleChars:
    """Checks for NBSP, zero-width, and dirty-header findings.

    The invisible characters are written as explicit escapes so the inputs
    stay unambiguous: a literal NBSP or zero-width character cannot be told
    apart from an ordinary space (or from nothing) when reading the file and
    is easily lost when the text is copied or re-rendered.
    """

    def test_finds_nbsp(self):
        """A single cell ending in U+00A0 is reported with count 1."""
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A cell containing U+200B (zero-width space) raises the finding."""
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both column headers are expected to be counted as dirty."""
        # NOTE(review): count == 2 means "Email" is flagged as well as the
        # padded " id ". If the original header literals carried invisible
        # characters that did not survive, re-add them as escapes.
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Lower-case, unpadded headers must not raise the finding."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Whitespace padding
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A cell with leading and trailing spaces raises the finding."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """A run of two internal spaces raises the finding."""
        # The double space is spelled with \x20 escapes so it cannot be
        # collapsed to a single space by whitespace-normalising tools; with
        # only one space this input would match the clean case below.
        df = pd.DataFrame({"x": ["double\x20\x20space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Unpadded cells with single internal spaces must not raise it."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Null-like sentinels
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    FINDING_ID = "null_like_sentinels"

    def test_finds_n_a_and_nan(self):
        """Four of the five cells are sentinel spellings; expect an info finding."""
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(r for r in results if r.id == self.FINDING_ID)
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not raise the finding."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mojibake
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """UTF-8 text mis-decoded as Latin-1 raises the finding."""
        # "caf\u00c3\u00a9" and "M\u00c3\u00bcller" are what "café" and
        # "Müller" look like after that mis-decoding. Escapes keep the
        # garbled form from being silently "repaired" by text tooling, which
        # would make this input identical to the clean case below.
        df = pd.DataFrame(
            {"name": ["caf\u00c3\u00a9", "caf\u00e9", "M\u00c3\u00bcller"]}
        )
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly encoded accented text must not raise the finding."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mixed-case email column
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    FINDING_ID = "mixed_case_email_column"

    def test_finds_mixed_case(self):
        """An email column containing upper-case letters raises the finding."""
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID in reported

    def test_all_lower_no_finding(self):
        """All-lower-case addresses must not raise the finding."""
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a column of plain names must not raise the finding."""
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Leading-zero IDs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    FINDING_ID = "leading_zero_ids"

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded numeric strings raises the finding."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": skus})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID in reported

    def test_no_finding_when_no_leading_zero(self):
        """Numeric strings "1".."99" have no leading zero and must not be flagged."""
        frame = pd.DataFrame({"id": list(map(str, range(1, 100)))})
        reported = _ids(analyze(frame))
        assert self.FINDING_ID not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Findings synthesized from RepairResult
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFindingsFromRepair:
    """Checks that a caller-supplied RepairResult surfaces as ``csv_*`` findings."""

    def test_bom_strip_surfaces(self):
        """Repairing bytes that start with a UTF-8 BOM yields ``csv_bom_stripped``."""
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL yields ``csv_nul_stripped``."""
        outcome = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """An unrepairable line is reported at ``error`` severity."""
        # Hand-built result: no actions, one line marked unrepairable.
        outcome = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=outcome)
        hit = next(r for r in results if r.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end on the corpus kitchen-sink fixture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEndToEnd:
    """Whole-pipeline checks: one on a corpus fixture file, one on clean data."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink CSV yields both file-level and cell-level findings."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        reported = _ids(analyze(fixture))
        # The fixture combines a BOM, smart quotes, NBSP, ZWSP, and dirty
        # headers. The BOM and file-level smart quotes are dealt with by the
        # pre-parse repair, so they are expected as csv_* findings, while
        # the cell-level NBSP/zero-width pollution stays as data findings.
        assert "csv_bom_stripped" in reported or "csv_smart_quotes_folded" in reported
        # Pre-parse repair only touches double-quote characters, so the
        # NBSP-padded headers should still be reported.
        data_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(found.startswith(data_prefixes) for found in reported)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with nothing to report produces an empty list."""
        frame = pd.DataFrame({
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        })
        assert analyze(frame) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Findings for two different tools produce a group key for each."""
        frame = pd.DataFrame({
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        })
        by_tool = findings_by_tool(analyze(frame))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """Findings with an empty tool id are left out of the grouping."""
        outcome = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=outcome)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows carries tool="" and must not become a key.
        assert all(tool_id for tool_id in by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output survives ``json.dumps`` and keeps id and samples."""
        import json

        frame = pd.DataFrame({"x": [" padded "]})
        results = analyze(frame)
        payload = to_dict(results[0])
        json.dumps(payload)  # raises if any value is not serializable
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload
|
||||
Reference in New Issue
Block a user