"""Tests for src.core.analyze — upload-time data quality detectors.""" from __future__ import annotations from pathlib import Path import pandas as pd import pytest from src.core.analyze import ( Finding, TOOL_DEDUPLICATOR, TOOL_MISSING_HANDLER, TOOL_TEXT_CLEANER, analyze, findings_by_tool, to_dict, ) from src.core.io import RepairAction, RepairResult, repair_bytes def _ids(findings: list[Finding]) -> set[str]: return {f.id for f in findings} # --------------------------------------------------------------------------- # Smart punctuation # --------------------------------------------------------------------------- class TestSmartPunctuation: def test_finds_curly_quotes(self): df = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]}) findings = analyze(df) assert "smart_punctuation_in_data" in _ids(findings) f = next(f for f in findings if f.id == "smart_punctuation_in_data") assert f.severity == "warn" assert f.tool == TOOL_TEXT_CLEANER assert f.count == 2 def test_finds_dashes_and_ellipsis(self): df = pd.DataFrame({"note": ["a—b", "wait…"]}) findings = analyze(df) assert "smart_punctuation_in_data" in _ids(findings) def test_clean_data_no_finding(self): df = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]}) findings = analyze(df) assert "smart_punctuation_in_data" not in _ids(findings) # --------------------------------------------------------------------------- # Invisible / NBSP / dirty headers # --------------------------------------------------------------------------- class TestInvisibleChars: def test_finds_nbsp(self): df = pd.DataFrame({"name": ["Alice ", "Bob"]}) findings = analyze(df) assert "nbsp_or_unicode_whitespace" in _ids(findings) f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace") assert f.count == 1 def test_finds_zero_width(self): df = pd.DataFrame({"name": ["Alice​", "Bob"]}) findings = analyze(df) assert "zero_width_or_invisible" in _ids(findings) def test_flags_dirty_headers(self): df = pd.DataFrame({" id ": [1], "Email​": ["a@b.com"]}) findings = analyze(df) assert "dirty_column_headers" in _ids(findings) f = next(f for f in findings if f.id == "dirty_column_headers") assert f.count == 2 def test_clean_headers_no_finding(self): df = pd.DataFrame({"id": [1], "email": ["a@b.com"]}) findings = analyze(df) assert "dirty_column_headers" not in _ids(findings) # --------------------------------------------------------------------------- # Whitespace padding # --------------------------------------------------------------------------- class TestWhitespacePadding: def test_finds_leading_trailing_space(self): df = pd.DataFrame({"x": [" padded ", "clean"]}) findings = analyze(df) assert "whitespace_padding" in _ids(findings) def test_finds_internal_double_space(self): df = pd.DataFrame({"x": ["double space", "single space"]}) findings = analyze(df) assert "whitespace_padding" in _ids(findings) def test_no_finding_when_clean(self): df = pd.DataFrame({"x": ["clean", "also clean"]}) findings = analyze(df) assert "whitespace_padding" not in _ids(findings) # --------------------------------------------------------------------------- # Null-like sentinels # --------------------------------------------------------------------------- class TestNullLikeSentinels: def test_finds_n_a_and_nan(self): df = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]}) findings = analyze(df) f = next(f for f in findings if f.id == "null_like_sentinels") assert f.count == 4 assert f.tool == TOOL_MISSING_HANDLER assert f.severity == "info" def test_clean_data_no_finding(self): df = pd.DataFrame({"x": ["a", "b", "c"]}) findings = analyze(df) assert "null_like_sentinels" not in _ids(findings) # --------------------------------------------------------------------------- # Mojibake # --------------------------------------------------------------------------- class TestMojibake: def test_finds_classic_pattern(self): df = pd.DataFrame({"name": ["café", "café", "Müller"]}) findings = analyze(df) assert "suspected_mojibake" in _ids(findings) def test_clean_unicode_no_finding(self): df = pd.DataFrame({"name": ["café", "naïve", "München"]}) findings = analyze(df) assert "suspected_mojibake" not in _ids(findings) # --------------------------------------------------------------------------- # Mixed-case email column # --------------------------------------------------------------------------- class TestMixedCaseEmail: def test_finds_mixed_case(self): df = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]}) findings = analyze(df) assert "mixed_case_email_column" in _ids(findings) def test_all_lower_no_finding(self): df = pd.DataFrame({"email": ["a@b.com", "c@d.com"]}) findings = analyze(df) assert "mixed_case_email_column" not in _ids(findings) def test_non_email_column_ignored(self): df = pd.DataFrame({"name": ["Alice", "bob"]}) findings = analyze(df) assert "mixed_case_email_column" not in _ids(findings) # --------------------------------------------------------------------------- # Leading-zero IDs # --------------------------------------------------------------------------- class TestLeadingZeroIds: def test_finds_zero_padded_ids(self): df = pd.DataFrame({ "sku": ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"], }) findings = analyze(df) assert "leading_zero_ids" in _ids(findings) def test_no_finding_when_no_leading_zero(self): df = pd.DataFrame({"id": [str(i) for i in range(1, 100)]}) findings = analyze(df) assert "leading_zero_ids" not in _ids(findings) # --------------------------------------------------------------------------- # Near-duplicate rows # --------------------------------------------------------------------------- class TestNearDuplicates: def test_finds_case_insensitive_dupes(self): df = pd.DataFrame({ "name": ["Alice", "alice ", "Bob"], "email": ["a@b.com", "A@B.COM", "bob@b.com"], }) findings = analyze(df) assert "near_duplicate_rows" in _ids(findings) def test_unique_rows_no_finding(self): df = pd.DataFrame({ "name": ["Alice", "Bob", "Carol"], "email": ["a@x.com", "b@x.com", "c@x.com"], }) findings = analyze(df) assert "near_duplicate_rows" not in _ids(findings) def test_single_row_no_finding(self): df = pd.DataFrame({"x": ["only"]}) findings = analyze(df) assert "near_duplicate_rows" not in _ids(findings) # --------------------------------------------------------------------------- # Mixed line endings # --------------------------------------------------------------------------- class TestMixedLineEndings: def test_crlf_plus_lf_flagged(self, tmp_path): f = tmp_path / "mixed.csv" f.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n") findings = analyze(f) assert "mixed_line_endings" in _ids(findings) def test_uniform_lf_not_flagged(self, tmp_path): f = tmp_path / "uniform.csv" f.write_bytes(b"id,name\n1,Alice\n2,Bob\n") findings = analyze(f) assert "mixed_line_endings" not in _ids(findings) def test_dataframe_mode_skips_detector(self): # No raw bytes -> mixed_line_endings cannot be detected. df = pd.DataFrame({"id": ["1"], "name": ["Alice"]}) findings = analyze(df) assert "mixed_line_endings" not in _ids(findings) # --------------------------------------------------------------------------- # Findings synthesized from RepairResult # --------------------------------------------------------------------------- class TestFindingsFromRepair: def test_bom_strip_surfaces(self): repair = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n") findings = analyze(pd.DataFrame({"id": ["1"], "name": ["Alice"]}), repair_result=repair) assert "csv_bom_stripped" in _ids(findings) def test_nul_strip_surfaces(self): repair = repair_bytes(b"id,name\n1,Hel\x00lo\n") findings = analyze(pd.DataFrame({"id": ["1"], "name": ["Hello"]}), repair_result=repair) assert "csv_nul_stripped" in _ids(findings) def test_unrepairable_surfaces_as_error(self): # Synthesize a result with an unrepairable line. repair = RepairResult( repaired_bytes=b"id,a,b\n1,foo,bar\n", actions=[], unrepairable_lines=[3], ) findings = analyze(pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]}), repair_result=repair) f = next(f for f in findings if f.id == "csv_unrepairable_rows") assert f.severity == "error" # --------------------------------------------------------------------------- # End-to-end on the corpus kitchen-sink fixture # --------------------------------------------------------------------------- class TestEndToEnd: def test_kitchen_sink_fixture_finds_pollution(self): path = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv") if not path.exists(): pytest.skip("corpus fixture not present") findings = analyze(path) ids = _ids(findings) # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers. # Pre-parse repair handles the file-level smart-quote/BOM, so they # show up as csv_* findings; the cell-level NBSP/ZW remain as # data findings. assert "csv_bom_stripped" in ids or "csv_smart_quotes_folded" in ids # NBSP-padded headers should still surface — pre-parse repair only # touches double-quote characters. assert any(i.startswith("dirty_") or i.startswith("nbsp") or i.startswith("zero_width") for i in ids) def test_clean_dataframe_returns_empty_findings(self): df = pd.DataFrame({ "id": ["1", "2", "3"], "name": ["Alice", "Bob", "Carol"], "email": ["a@x.com", "b@x.com", "c@x.com"], }) findings = analyze(df) assert findings == [] # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- class TestHelpers: def test_findings_by_tool_groups_correctly(self): df = pd.DataFrame({ "name": [" padded ", "“smart”"], "x": ["N/A", "valid"], }) findings = analyze(df) grouped = findings_by_tool(findings) assert TOOL_TEXT_CLEANER in grouped assert TOOL_MISSING_HANDLER in grouped def test_findings_by_tool_skips_toolless(self): repair = RepairResult( repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7], ) findings = analyze(pd.DataFrame({"x": ["a"]}), repair_result=repair) grouped = findings_by_tool(findings) # csv_unrepairable_rows has tool="" and should not appear. assert all(t for t in grouped) def test_to_dict_is_json_serializable(self): df = pd.DataFrame({"x": [" padded "]}) findings = analyze(df) d = to_dict(findings[0]) import json json.dumps(d) # would raise on non-serializable values assert d["id"] == "whitespace_padding" assert "samples" in d