Files
datatools-dev/tests/test_analyze.py
Michael edf6ccf90b feat(analyze): upload-time data quality analyzer
Pure, advisory scan over an uploaded file or DataFrame that returns a list of
Finding objects naming each issue, the affected count, and which downstream
tool can fix it. The GUI uses this to badge tool nav items at upload; the CLI
will print findings as a table or JSON.

src/core/analyze.py:
  Finding dataclass (id, severity, tool, count, description, column, samples)
  analyze(source, *, sample_rows=1000, repair_result=None) -> list[Finding]
    - source: DataFrame, path, or str. When given a path, only the first 1000 rows are scanned.
    - When source is a path, runs the same pre-parse repair the tool pages
      will use; the resulting RepairResult is auto-surfaced as csv_*
      findings. A caller-supplied repair_result wins so non-default repair
      flags are respected.
  Detectors (each independent, samples capped at 5):
    - smart_punctuation_in_data        -> 02
    - nbsp_or_unicode_whitespace       -> 02
    - zero_width_or_invisible          -> 02
    - dirty_column_headers             -> 02
    - whitespace_padding               -> 02
    - null_like_sentinels              -> 04
    - suspected_mojibake               -> 02 (Tier 2)
    - mixed_case_email_column          -> 02 case op
    - leading_zero_ids                 -> informational, no tool
  Helpers: findings_by_tool() for sidebar grouping, to_dict() for JSON.

Detectors are decoupled from the GUI display layer — they emit stable tool
ids ("02_text_cleaner") and the GUI maps those to display names.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:41:36 +00:00

269 lines
9.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for src.core.analyze — upload-time data quality detectors."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
TOOL_DEDUPLICATOR,
TOOL_MISSING_HANDLER,
TOOL_TEXT_CLEANER,
analyze,
findings_by_tool,
to_dict,
)
from src.core.io import RepairAction, RepairResult, repair_bytes
def _ids(findings: list[Finding]) -> set[str]:
    """Return the distinct ``id`` values found on the given findings."""
    collected: set[str] = set()
    for finding in findings:
        collected.add(finding.id)
    return collected
# ---------------------------------------------------------------------------
# Smart punctuation
# ---------------------------------------------------------------------------
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    def test_finds_curly_quotes(self):
        """Curly-quoted data yields a Text Cleaner warning with count 2."""
        # NOTE(review): confirm what ``count`` measures for this sample
        # (affected cells vs. individual characters).
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "its"]})
        results = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(results)
        hit = next(
            item for item in results if item.id == "smart_punctuation_in_data"
        )
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash and an ellipsis character are reported as well."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII text must not raise the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported
# ---------------------------------------------------------------------------
# Invisible / NBSP / dirty headers
# ---------------------------------------------------------------------------
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters and dirty column headers.

    The invisible characters in the fixtures are written as ``\\u`` escapes
    rather than embedded literally, so they stay visible in review and are
    not silently dropped by editors, formatters or copy/paste.
    """

    def test_finds_nbsp(self):
        """A single cell ending in U+00A0 (no-break space) is reported once."""
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A cell carrying U+200B (zero-width space) is reported."""
        df = pd.DataFrame({"name": ["Alice\u200b", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both column headers in the frame are counted as dirty."""
        # NOTE(review): " id " is dirty through its padding; confirm what makes
        # "Email" dirty (capitalisation vs. a hidden character in the original).
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Trimmed lowercase headers must not raise the finding."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
# ---------------------------------------------------------------------------
# Whitespace padding
# ---------------------------------------------------------------------------
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A cell with a leading and a trailing space is reported."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """A run of two spaces inside a cell is reported."""
        # The first value has TWO consecutive spaces between the words. With a
        # single space it would look just like the "also clean" value that
        # test_no_finding_when_clean expects to produce no finding.
        df = pd.DataFrame({"x": ["double  space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Trimmed, single-spaced values must not raise the finding."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
# ---------------------------------------------------------------------------
# Null-like sentinels
# ---------------------------------------------------------------------------
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four placeholder strings give an info finding for the missing handler."""
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(item for item in results if item.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not raise the finding."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported
# ---------------------------------------------------------------------------
# Mojibake
# ---------------------------------------------------------------------------
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """UTF-8 text mis-decoded as Latin-1 is reported."""
        # The mojibake is written with escapes so it cannot be "repaired" by
        # tooling on the way in: "\u00c3\u00a9" is the mis-decoded form of
        # "é" and "\u00c3\u00bc" that of "ü". The middle value stays clean.
        df = pd.DataFrame(
            {"name": ["caf\u00c3\u00a9", "café", "M\u00c3\u00bcller"]}
        )
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly decoded accented text must not raise the finding."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
# ---------------------------------------------------------------------------
# Mixed-case email column
# ---------------------------------------------------------------------------
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An email column containing uppercase letters is reported."""
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """An all-lowercase email column must not raise the finding."""
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a column without email values is not reported."""
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported
# ---------------------------------------------------------------------------
# Leading-zero IDs
# ---------------------------------------------------------------------------
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded identifiers is reported."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": skus})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" in reported

    def test_no_finding_when_no_leading_zero(self):
        """Identifiers "1" through "99" must not raise the finding."""
        plain_ids = list(map(str, range(1, 100)))
        frame = pd.DataFrame({"id": plain_ids})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" not in reported
# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------
class TestFindingsFromRepair:
    """Checks that a supplied ``RepairResult`` is surfaced as ``csv_*`` findings."""

    def test_bom_strip_surfaces(self):
        """Repairing bytes that start with a UTF-8 BOM yields ``csv_bom_stripped``."""
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL yields ``csv_nul_stripped``."""
        outcome = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """An unrepairable line is reported with severity ``error``."""
        # Hand-built result: no repair actions, one line marked unrepairable.
        outcome = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=outcome)
        hit = next(item for item in results if item.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
# ---------------------------------------------------------------------------
# End-to-end on the corpus kitchen-sink fixture
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Whole-pipeline checks: a polluted corpus file and a spotless frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink CSV yields file-level and data-level findings."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        reported = _ids(analyze(fixture))
        # The fixture carries a BOM, smart quotes, NBSP, ZWSP and dirty
        # headers. The file-level BOM / smart-quote problems are handled by
        # the pre-parse repair and therefore arrive as csv_* findings.
        assert reported & {"csv_bom_stripped", "csv_smart_quotes_folded"}
        # Pre-parse repair only touches double-quote characters, so the
        # NBSP-padded headers and cell-level NBSP / zero-width characters
        # must still be reported as data findings.
        data_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(finding_id.startswith(data_prefixes) for finding_id in reported)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with nothing to flag produces an empty list."""
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        results = analyze(pd.DataFrame(columns))
        assert results == []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Text-cleaner and missing-handler findings each get a group."""
        columns = {
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        }
        by_tool = findings_by_tool(analyze(pd.DataFrame(columns)))
        for tool_id in (TOOL_TEXT_CLEANER, TOOL_MISSING_HANDLER):
            assert tool_id in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """Findings without a tool must not create a group."""
        outcome = RepairResult(
            repaired_bytes=b"",
            actions=[],
            unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=outcome)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear, so every
        # group key has to be truthy.
        assert all(by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output survives ``json.dumps`` and keeps its fields."""
        import json

        results = analyze(pd.DataFrame({"x": [" padded "]}))
        payload = to_dict(results[0])
        json.dumps(payload)  # raises if any value is not serializable
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload