Two more detectors close the analyzer gap list:

mixed_line_endings (warn, tool=02): scans raw bytes for combinations of CRLF / LF / bare CR. This is a disaster pattern after multi-source concatenation (Windows + macOS + Linux exports stitched together). It operates on raw bytes only — DataFrame-mode analyze() skips it because raw bytes aren't available. _load_for_analysis now returns the raw bytes alongside the DataFrame and repair result so the detector has them.

near_duplicate_rows (info, tool=01): cheap dedup signal — strip and lowercase every string column, then count df.duplicated(). This catches the most common case (same customer entered twice with subtle formatting differences) without paying for fuzzy matching. Anything more sophisticated stays in tool 01.

Six new tests cover both detectors plus the DataFrame-mode skip path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
320 lines
12 KiB
Python
320 lines
12 KiB
Python
"""Tests for src.core.analyze — upload-time data quality detectors."""
|
||
|
||
from __future__ import annotations
|
||
|
||
from pathlib import Path
|
||
|
||
import pandas as pd
|
||
import pytest
|
||
|
||
from src.core.analyze import (
|
||
Finding,
|
||
TOOL_DEDUPLICATOR,
|
||
TOOL_MISSING_HANDLER,
|
||
TOOL_TEXT_CLEANER,
|
||
analyze,
|
||
findings_by_tool,
|
||
to_dict,
|
||
)
|
||
from src.core.io import RepairAction, RepairResult, repair_bytes
|
||
|
||
|
||
def _ids(findings: list[Finding]) -> set[str]:
    """Collect the ``id`` attribute of every finding into a set."""
    collected: set[str] = set()
    for finding in findings:
        collected.add(finding.id)
    return collected
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Smart punctuation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    def test_finds_curly_quotes(self):
        """Two of the three cells hold curly quotes; the finding must count both."""
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]})
        results = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(results)
        # The membership assertion above guarantees at least one match.
        hit = [item for item in results if item.id == "smart_punctuation_in_data"][0]
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash and an ellipsis character also trigger the finding."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII text must not trigger the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Invisible / NBSP / dirty headers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters and dirty column headers.

    The invisible characters are written as ``\\uXXXX`` escapes rather than
    as literal characters, so the test data stays visible in reviews and
    cannot be silently normalized away by an editor or a copy/paste.
    """

    def test_finds_nbsp(self):
        """One cell ends with a NO-BREAK SPACE; the finding must count it once."""
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A ZERO WIDTH SPACE inside a value must be reported."""
        # NOTE(review): U+200B mid-word is assumed; confirm against the
        # original fixture if a different invisible character was intended.
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both headers (space-padded ``id`` and capitalized ``Email``) are counted."""
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Lowercase, unpadded headers must not be reported."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Whitespace padding
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A value padded with spaces on both sides must be reported."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """A run of two interior spaces must be reported."""
        # The first value needs two consecutive spaces; with a single space
        # it has the same shape as the "single space" control value and the
        # test would not exercise the detector at all.
        df = pd.DataFrame({"x": ["double  space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Values with single interior spaces and no padding are not reported."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Null-like sentinels
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four of the five cells are placeholder strings; all four are counted."""
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(item for item in results if item.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not be reported as sentinels."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mojibake
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """UTF-8 text mis-decoded as Latin-1 must be reported."""
        # The mojibake is spelled with escapes ("caf\u00c3\u00a9" renders as
        # 'cafÃ©', "M\u00c3\u00bcller" as 'MÃ¼ller') so that tooling which
        # "repairs" encodings cannot silently turn the fixture into clean
        # text. With clean text this test would contradict
        # test_clean_unicode_no_finding, which uses the same word "café".
        df = pd.DataFrame({"name": ["caf\u00c3\u00a9", "café", "M\u00c3\u00bcller"]})
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly decoded accented text must not be reported."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mixed-case email column
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An email column containing upper-case characters is reported."""
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """An all-lowercase email column is not reported."""
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a column that does not hold emails is not reported."""
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Leading-zero IDs
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded digit strings is reported."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": skus})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" in reported

    def test_no_finding_when_no_leading_zero(self):
        """Digit strings without leading zeros are not reported."""
        plain_ids = list(map(str, range(1, 100)))
        frame = pd.DataFrame({"id": plain_ids})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Near-duplicate rows
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """Rows differing only by case and trailing whitespace are reported."""
        columns = {
            "name": ["Alice", "alice ", "Bob"],
            "email": ["a@b.com", "A@B.COM", "bob@b.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Three distinct rows are not reported."""
        columns = {
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame is not reported."""
        frame = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mixed line endings
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file mixing CRLF and bare LF terminators is reported."""
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file using only LF terminators is not reported."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """Passing a DataFrame directly never yields the finding."""
        # A DataFrame carries no raw bytes, so line endings cannot be inspected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Findings synthesized from RepairResult
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestFindingsFromRepair:
    """Checks that a ``repair_result`` passed to ``analyze`` surfaces as findings."""

    def test_bom_strip_surfaces(self):
        """Repairing BOM-prefixed bytes yields ``csv_bom_stripped``."""
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL yields ``csv_nul_stripped``."""
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        outcome = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        reported = _ids(analyze(frame, repair_result=outcome))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """A result listing unrepairable lines yields an error-severity finding."""
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        # Hand-built result carrying one unrepairable line number.
        outcome = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        results = analyze(frame, repair_result=outcome)
        hit = next(item for item in results if item.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# End-to-end on the corpus kitchen-sink fixture
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestEndToEnd:
    """Whole-pipeline checks: one corpus fixture file and one clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink corpus file yields both file-level and data-level findings."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        ids = _ids(analyze(fixture))
        # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers.
        # Pre-parse repair handles the file-level smart-quote/BOM, so those
        # appear as csv_* findings; the cell-level NBSP/ZW remain as data
        # findings.
        file_level_hit = "csv_bom_stripped" in ids or "csv_smart_quotes_folded" in ids
        assert file_level_hit
        # NBSP-padded headers should still surface — pre-parse repair only
        # touches double-quote characters.
        data_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(found.startswith(data_prefixes) for found in ids)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with nothing to flag produces an empty findings list."""
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        results = analyze(pd.DataFrame(columns))
        assert results == []
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Findings from two different tools appear under both tool keys."""
        columns = {
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        }
        by_tool = findings_by_tool(analyze(pd.DataFrame(columns)))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """Every key in the grouping is truthy, i.e. no empty tool name."""
        outcome = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=outcome)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear.
        assert all(bool(tool) for tool in by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output survives ``json.dumps`` and keeps id and samples."""
        import json

        frame = pd.DataFrame({"x": [" padded "]})
        results = analyze(frame)
        payload = to_dict(results[0])
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload
|