Files
datatools-dev/tests/test_analyze.py
Michael 8dfc6ad8ae feat(analyze): add mixed_line_endings + near_duplicate_rows detectors
Two more detectors close the analyzer gap list:

mixed_line_endings (warn, tool=02): scans raw bytes for combinations of
  CRLF / LF / bare CR. Disaster pattern after multi-source concat
  (Windows + macOS + Linux exports stitched together). Operates on raw
  bytes only — DataFrame-mode analyze() skips it because raw bytes
  aren't available. _load_for_analysis now returns the raw bytes
  alongside the DataFrame and repair result so the detector has them.

near_duplicate_rows (info, tool=01): cheap dedup signal — strip and
  lowercase every string column, then count df.duplicated(). Catches the
  most common case (same customer entered twice with subtle formatting
  differences) without paying for fuzzy matching. Anything more
  sophisticated stays in tool 01.

Six new tests cover both detectors plus the dataframe-mode skip path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:09:42 +00:00

320 lines
12 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Tests for src.core.analyze — upload-time data quality detectors."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.analyze import (
Finding,
TOOL_DEDUPLICATOR,
TOOL_MISSING_HANDLER,
TOOL_TEXT_CLEANER,
analyze,
findings_by_tool,
to_dict,
)
from src.core.io import RepairAction, RepairResult, repair_bytes
def _ids(findings: list[Finding]) -> set[str]:
    """Return the distinct ``id`` attributes found across *findings*."""
    collected: set[str] = set()
    for finding in findings:
        collected.add(finding.id)
    return collected
# ---------------------------------------------------------------------------
# Smart punctuation
# ---------------------------------------------------------------------------
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    def test_finds_curly_quotes(self):
        """Curly quotes yield a warn-level finding assigned to the text cleaner."""
        # NOTE(review): the assertion below expects count == 2, yet "its" as
        # written holds no smart punctuation. Confirm whether a typographic
        # apostrophe (U+2019) was lost from this literal.
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "its"]})
        results = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(results)
        hit = next(
            item for item in results if item.id == "smart_punctuation_in_data"
        )
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em dash and a horizontal ellipsis are reported as well."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII notes must not raise the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported
# ---------------------------------------------------------------------------
# Invisible / NBSP / dirty headers
# ---------------------------------------------------------------------------
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters, and dirty column headers.

    Invisible code points in the fixtures are spelled as explicit Unicode
    escapes. Written as raw characters they are indistinguishable from
    ordinary text and are easily dropped or normalised by editors, diff
    viewers, and copy/paste, which silently turns the test into a no-op.
    """

    def test_finds_nbsp(self):
        """A single cell ending in NO-BREAK SPACE is counted once."""
        # U+00A0 is escaped so it cannot be confused with an ASCII space.
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        """A cell containing ZERO WIDTH SPACE is reported."""
        # U+200B is escaped; as a raw character it renders as nothing at all.
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        """Both headers of this frame are expected to be reported."""
        df = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        """Headers ``id`` and ``email`` must not be reported."""
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
# ---------------------------------------------------------------------------
# Whitespace padding
# ---------------------------------------------------------------------------
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        """A cell with leading and trailing spaces is reported."""
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        """A cell with two consecutive interior spaces is reported."""
        # The run of two spaces is escaped (\x20\x20) so that whitespace
        # collapsing in editors or web views cannot reduce it to a single
        # space, which would make this row identical in kind to the
        # "single space" control value.
        df = pd.DataFrame({"x": ["double\x20\x20space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        """Values with at most single interior spaces are not reported."""
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
# ---------------------------------------------------------------------------
# Null-like sentinels
# ---------------------------------------------------------------------------
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four of the five values are placeholder strings and are counted."""
        values = ["valid", "N/A", "nan", "None", "-"]
        results = analyze(pd.DataFrame({"x": values}))
        sentinel = next(
            item for item in results if item.id == "null_like_sentinels"
        )
        assert sentinel.count == 4
        assert sentinel.tool == TOOL_MISSING_HANDLER
        assert sentinel.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values do not raise the finding."""
        results = analyze(pd.DataFrame({"x": ["a", "b", "c"]}))
        reported = _ids(results)
        assert "null_like_sentinels" not in reported
# ---------------------------------------------------------------------------
# Mojibake
# ---------------------------------------------------------------------------
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        """UTF-8 text mis-decoded as Latin-1 is reported."""
        # The garbled sequences are escaped so that encoding "repair" steps
        # in editors or pipelines cannot turn them back into correct text:
        #   "caf\u00c3\u00a9"    is "café" (UTF-8 bytes C3 A9) read as Latin-1
        #   "M\u00c3\u00bcller"  is "Müller" (UTF-8 bytes C3 BC) read as Latin-1
        # The middle value is correctly encoded and acts as a control.
        df = pd.DataFrame(
            {"name": ["caf\u00c3\u00a9", "café", "M\u00c3\u00bcller"]}
        )
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        """Correctly encoded accented text must not be reported."""
        df = pd.DataFrame({"name": ["café", "naïve", "München"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
# ---------------------------------------------------------------------------
# Mixed-case email column
# ---------------------------------------------------------------------------
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An email column mixing upper- and lower-case addresses is reported."""
        addresses = ["Alice@Example.COM", "bob@example.com"]
        reported = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """An all-lowercase email column is not reported."""
        addresses = ["a@b.com", "c@d.com"]
        reported = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a column of plain names is not reported."""
        names = ["Alice", "bob"]
        reported = _ids(analyze(pd.DataFrame({"name": names})))
        assert "mixed_case_email_column" not in reported
# ---------------------------------------------------------------------------
# Leading-zero IDs
# ---------------------------------------------------------------------------
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded SKU strings is reported."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        results = analyze(pd.DataFrame({"sku": skus}))
        assert "leading_zero_ids" in _ids(results)

    def test_no_finding_when_no_leading_zero(self):
        """The strings "1" through "99" carry no leading zero and are not reported."""
        plain_ids = list(map(str, range(1, 100)))
        results = analyze(pd.DataFrame({"id": plain_ids}))
        assert "leading_zero_ids" not in _ids(results)
# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------
class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """Rows differing only in letter case and trailing space are reported."""
        columns = {
            "name": ["Alice", "alice ", "Bob"],
            "email": ["a@b.com", "A@B.COM", "bob@b.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Three distinct rows are not reported."""
        columns = {
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        reported = _ids(analyze(pd.DataFrame(columns)))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame is not reported."""
        reported = _ids(analyze(pd.DataFrame({"x": ["only"]})))
        assert "near_duplicate_rows" not in reported
# ---------------------------------------------------------------------------
# Mixed line endings
# ---------------------------------------------------------------------------
class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file that combines CRLF and LF terminators is reported."""
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file that uses LF throughout is not reported."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """Analyzing a DataFrame directly never yields the finding."""
        # A DataFrame input carries no raw bytes, so there is nothing for
        # the line-ending detector to inspect.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------
class TestFindingsFromRepair:
    """Checks that a supplied ``repair_result`` is turned into findings."""

    def test_bom_strip_surfaces(self):
        """Repairing bytes that begin with a UTF-8 BOM yields ``csv_bom_stripped``."""
        repaired = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        results = analyze(frame, repair_result=repaired)
        assert "csv_bom_stripped" in _ids(results)

    def test_nul_strip_surfaces(self):
        """Repairing bytes that contain a NUL yields ``csv_nul_stripped``."""
        repaired = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        results = analyze(frame, repair_result=repaired)
        assert "csv_nul_stripped" in _ids(results)

    def test_unrepairable_surfaces_as_error(self):
        """An unrepairable line is reported with severity ``error``."""
        # Build the repair result by hand so it lists one unrepairable line.
        synthetic = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=synthetic)
        unrepairable = next(
            item for item in results if item.id == "csv_unrepairable_rows"
        )
        assert unrepairable.severity == "error"
# ---------------------------------------------------------------------------
# End-to-end on the corpus kitchen-sink fixture
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Whole-pipeline checks: one corpus fixture and one clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        """The kitchen-sink CSV yields file-level and cell-level findings."""
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        ids = _ids(analyze(fixture))
        # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers.
        # Pre-parse repair handles the file-level smart-quote/BOM, so they
        # show up as csv_* findings; the cell-level NBSP/ZW remain as
        # data findings.
        file_level = {"csv_bom_stripped", "csv_smart_quotes_folded"}
        assert not ids.isdisjoint(file_level)
        # NBSP-padded headers should still surface, because pre-parse repair
        # only touches double-quote characters.
        data_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(found.startswith(data_prefixes) for found in ids)

    def test_clean_dataframe_returns_empty_findings(self):
        """A frame with no pollution yields an empty findings list."""
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        results = analyze(pd.DataFrame(columns))
        assert results == []
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        """Findings from two different tools each get their own group."""
        columns = {
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        }
        by_tool = findings_by_tool(analyze(pd.DataFrame(columns)))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        """No group is created under a falsy tool identifier."""
        synthetic = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        results = analyze(pd.DataFrame({"x": ["a"]}), repair_result=synthetic)
        by_tool = findings_by_tool(results)
        # csv_unrepairable_rows has tool="" and should not appear.
        assert all(by_tool)

    def test_to_dict_is_json_serializable(self):
        """``to_dict`` output survives ``json.dumps`` and exposes id and samples."""
        import json

        results = analyze(pd.DataFrame({"x": [" padded "]}))
        payload = to_dict(results[0])
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload