# datatools-dev/tests/test_analyze.py

"""Tests for src.core.analyze — upload-time data quality detectors."""

from __future__ import annotations

from pathlib import Path

import pandas as pd
import pytest

from src.core.analyze import (
    Finding,
    TOOL_DEDUPLICATOR,
    TOOL_MISSING_HANDLER,
    TOOL_TEXT_CLEANER,
    analyze,
    findings_by_tool,
    to_dict,
)
from src.core.io import RepairAction, RepairResult, repair_bytes


def _ids(findings: list[Finding]) -> set[str]:
    return {f.id for f in findings}


# ---------------------------------------------------------------------------
# Smart punctuation
# ---------------------------------------------------------------------------

class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    def test_finds_curly_quotes(self):
        # Two of the three cells carry curly quotes / a curly apostrophe.
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]})
        reported = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(reported)
        hit = next(
            item for item in reported
            if item.id == "smart_punctuation_in_data"
        )
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        # An em dash and a single-character ellipsis.
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported_ids = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported_ids

    def test_clean_data_no_finding(self):
        ascii_notes = ["plain", "ASCII only", "no smart chars"]
        frame = pd.DataFrame({"note": ascii_notes})
        reported_ids = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported_ids


# ---------------------------------------------------------------------------
# Invisible / NBSP / dirty headers
# ---------------------------------------------------------------------------

class TestInvisibleChars:
    """Checks for NBSP, zero-width and dirty-header findings.

    Invisible characters in the fixtures are written as escape sequences so
    the test data stays readable and survives editors, formatters and
    copy/paste that would otherwise normalise or drop them silently.
    """

    def test_finds_nbsp(self):
        # Exactly one cell ends with U+00A0 (NO-BREAK SPACE).
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        # U+200B (ZERO WIDTH SPACE) embedded inside an otherwise plain value.
        df = pd.DataFrame({"name": ["Al\u200bice", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        # Both headers are expected to be reported (count == 2).
        # NOTE(review): if these header literals are meant to contain
        # non-ASCII whitespace, spell it with escapes as above - confirm.
        df = pd.DataFrame({"  id  ": [1], "Email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)


# ---------------------------------------------------------------------------
# Whitespace padding
# ---------------------------------------------------------------------------

class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        # First cell is padded on both sides; second is clean.
        frame = pd.DataFrame({"x": ["  padded  ", "clean"]})
        reported_ids = _ids(analyze(frame))
        assert "whitespace_padding" in reported_ids

    def test_finds_internal_double_space(self):
        # A run of two spaces inside the value, none at the edges.
        frame = pd.DataFrame({"x": ["double  space", "single space"]})
        reported_ids = _ids(analyze(frame))
        assert "whitespace_padding" in reported_ids

    def test_no_finding_when_clean(self):
        frame = pd.DataFrame({"x": ["clean", "also clean"]})
        reported_ids = _ids(analyze(frame))
        assert "whitespace_padding" not in reported_ids


# ---------------------------------------------------------------------------
# Null-like sentinels
# ---------------------------------------------------------------------------

class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        # Four of the five values are placeholder strings; "valid" is not.
        values = ["valid", "N/A", "nan", "None", "-"]
        reported = analyze(pd.DataFrame({"x": values}))
        hit = next(
            item for item in reported if item.id == "null_like_sentinels"
        )
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported_ids = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported_ids


# ---------------------------------------------------------------------------
# Mojibake
# ---------------------------------------------------------------------------

class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding."""

    def test_finds_classic_pattern(self):
        # One correctly decoded value next to two garbled ones.
        names = ["café", "cafÃ©", "MÃ¼ller"]
        reported_ids = _ids(analyze(pd.DataFrame({"name": names})))
        assert "suspected_mojibake" in reported_ids

    def test_clean_unicode_no_finding(self):
        # Legitimate accented text must not be reported.
        names = ["café", "naïve", "München"]
        reported_ids = _ids(analyze(pd.DataFrame({"name": names})))
        assert "suspected_mojibake" not in reported_ids


# ---------------------------------------------------------------------------
# Mixed-case email column
# ---------------------------------------------------------------------------

class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        addresses = ["Alice@Example.COM", "bob@example.com"]
        reported_ids = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" in reported_ids

    def test_all_lower_no_finding(self):
        addresses = ["a@b.com", "c@d.com"]
        reported_ids = _ids(analyze(pd.DataFrame({"email": addresses})))
        assert "mixed_case_email_column" not in reported_ids

    def test_non_email_column_ignored(self):
        # Mixed-case values in a column that holds no e-mail addresses.
        people = ["Alice", "bob"]
        reported_ids = _ids(analyze(pd.DataFrame({"name": people})))
        assert "mixed_case_email_column" not in reported_ids


# ---------------------------------------------------------------------------
# Leading-zero IDs
# ---------------------------------------------------------------------------

class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        # Six identifiers, every one zero-padded.
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        reported_ids = _ids(analyze(pd.DataFrame({"sku": skus})))
        assert "leading_zero_ids" in reported_ids

    def test_no_finding_when_no_leading_zero(self):
        # "1" through "99" as strings - none starts with a zero.
        plain_ids = list(map(str, range(1, 100)))
        reported_ids = _ids(analyze(pd.DataFrame({"id": plain_ids})))
        assert "leading_zero_ids" not in reported_ids


# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------

class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        # The first two rows differ only by letter case and a trailing space.
        names = ["Alice", "alice ", "Bob"]
        emails = ["a@b.com", "A@B.COM", "bob@b.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        reported_ids = _ids(analyze(frame))
        assert "near_duplicate_rows" in reported_ids

    def test_unique_rows_no_finding(self):
        names = ["Alice", "Bob", "Carol"]
        emails = ["a@x.com", "b@x.com", "c@x.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        reported_ids = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported_ids

    def test_single_row_no_finding(self):
        # A single row has nothing to be compared against.
        frame = pd.DataFrame({"x": ["only"]})
        reported_ids = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported_ids


# ---------------------------------------------------------------------------
# Encoding detection and mixed line endings
# ---------------------------------------------------------------------------

class TestEncodingUncertainty:
    """Checks for the ``encoding_uncertain`` finding.

    U+FFFD (REPLACEMENT CHARACTER) is written as an escape sequence so the
    fixtures really contain the character rather than a textual rendering
    of its UTF-8 bytes.
    """

    def test_replacement_chars_in_data_flagged(self):
        # Both cells contain one U+FFFD.
        df = pd.DataFrame({"name": ["Caf\ufffd", "Ber\ufffdin"]})
        findings = analyze(df)
        f = next(f for f in findings if f.id == "encoding_uncertain")
        assert f.severity == "error"
        assert f.confidence == "low"
        assert f.count == 2

    def test_replacement_chars_in_header_flagged(self):
        # U+FFFD appears only in the column name, not in the data.
        df = pd.DataFrame({"emai\ufffdl": ["a@x.com"]})
        findings = analyze(df)
        assert "encoding_uncertain" in _ids(findings)

    def test_clean_data_no_finding(self):
        df = pd.DataFrame({"name": ["Alice", "Bob"]})
        findings = analyze(df)
        assert "encoding_uncertain" not in _ids(findings)


class TestEncodingOverride:
    """Checks that an explicit ``encoding_override`` is honoured."""

    def test_override_corrects_misdetected_codepage(self, tmp_path):
        from src.core.analyze import _load_for_analysis

        # Western text stored as cp1252; charset-normalizer guesses cp1250,
        # which decodes byte 0xF1 to the wrong letter (ń instead of ñ).
        csv_path = tmp_path / "cp1252.csv"
        csv_path.write_bytes("id,name\n1,España\n".encode("cp1252"))

        # The auto-detected load is still executed; its result is not
        # asserted on.
        _auto_frame, _, _ = _load_for_analysis(csv_path, sample_rows=10)
        forced_frame, _, _ = _load_for_analysis(
            csv_path, sample_rows=10, encoding_override="cp1252",
        )
        # With the override the name decodes to the intended character.
        first_name = forced_frame["name"].iloc[0]
        assert first_name == "España"

    def test_override_propagates_through_top_level_analyze(self, tmp_path):
        # KOI8-R Cyrillic; default detection guesses Shift_JIS.
        csv_path = tmp_path / "koi8.csv"
        csv_path.write_bytes("id,name\n1,Иван\n".encode("koi8-r"))
        # The fixture is clean (no mojibake, no U+FFFD), so with the
        # override neither encoding finding may be reported.
        reported = analyze(csv_path, encoding_override="koi8-r")
        reported_ids = {item.id for item in reported}
        assert "encoding_uncertain" not in reported_ids
        assert "encoding_decode_failed" not in reported_ids


class TestEncodingDecodeFailedFromRepair:
    """Checks the finding reported for a BOM that contradicts the body."""

    def test_lying_bom_recovered_and_flagged(self, tmp_path):
        # The file opens with a UTF-8 BOM, yet the body is cp1252: byte 0x80
        # is the euro sign there and is not a valid UTF-8 continuation byte.
        # The expectation is a transparent recovery to cp1252 plus an
        # ``encoding_lying_bom`` warning so the user is informed.
        csv_path = tmp_path / "lying_bom.csv"
        csv_path.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")
        reported = analyze(csv_path)
        reported_ids = {item.id for item in reported}
        assert "encoding_lying_bom" in reported_ids
        warning = next(
            item for item in reported if item.id == "encoding_lying_bom"
        )
        assert warning.severity == "warn"
        # Decoding must have succeeded, so no decode-failure finding.
        assert "encoding_decode_failed" not in reported_ids


class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        # Rows one and three end in CRLF, row two in a bare LF.
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported_ids = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported_ids

    def test_uniform_lf_not_flagged(self, tmp_path):
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported_ids = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported_ids

    def test_dataframe_mode_skips_detector(self):
        # A DataFrame carries no raw bytes, so line endings cannot be
        # inspected and the finding must be absent.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported_ids = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported_ids


# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------

class TestFindingsFromRepair:
    """Checks findings derived from a ``RepairResult`` passed to ``analyze``."""

    def test_bom_strip_surfaces(self):
        # Input bytes start with a UTF-8 BOM.
        repaired = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = analyze(frame, repair_result=repaired)
        assert "csv_bom_stripped" in _ids(reported)

    def test_nul_strip_surfaces(self):
        # Input bytes contain a NUL in the middle of a cell.
        repaired = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = analyze(frame, repair_result=repaired)
        assert "csv_nul_stripped" in _ids(reported)

    def test_unrepairable_surfaces_as_error(self):
        # Hand-built result that reports one unrepairable line.
        repaired = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        reported = analyze(frame, repair_result=repaired)
        hit = next(
            item for item in reported if item.id == "csv_unrepairable_rows"
        )
        assert hit.severity == "error"


# ---------------------------------------------------------------------------
# End-to-end on the corpus kitchen-sink fixture
# ---------------------------------------------------------------------------

class TestEndToEnd:
    """Whole-pipeline checks: one corpus fixture and one clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        fixture = Path("test-cases/text-cleaner-corpus/test_data/20_kitchen_sink.csv")
        if not fixture.exists():
            pytest.skip("corpus fixture not present")
        reported_ids = _ids(analyze(fixture))
        # The fixture has a BOM, smart quotes, NBSP, ZWSP and dirty headers.
        # File-level BOM / smart-quote problems are handled by the pre-parse
        # repair and therefore appear as csv_* findings, while cell-level
        # NBSP / zero-width characters remain as data findings.
        file_level = (
            "csv_bom_stripped" in reported_ids
            or "csv_smart_quotes_folded" in reported_ids
        )
        assert file_level
        # NBSP-padded headers should still be reported, since the pre-parse
        # repair only touches double-quote characters.
        data_level_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(
            finding_id.startswith(data_level_prefixes)
            for finding_id in reported_ids
        )

    def test_clean_dataframe_returns_empty_findings(self):
        columns = {
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        }
        reported = analyze(pd.DataFrame(columns))
        assert reported == []


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

class TestHelpers:
    """Checks for the ``findings_by_tool`` and ``to_dict`` helpers."""

    def test_findings_by_tool_groups_correctly(self):
        # One column with padding / smart quotes, one with an "N/A" value.
        columns = {
            "name": ["  padded  ", "“smart”"],
            "x": ["N/A", "valid"],
        }
        by_tool = findings_by_tool(analyze(pd.DataFrame(columns)))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        unrepairable = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        frame = pd.DataFrame({"x": ["a"]})
        reported = analyze(frame, repair_result=unrepairable)
        by_tool = findings_by_tool(reported)
        # csv_unrepairable_rows has tool="" and should not appear, so every
        # grouping key must be truthy.
        assert all(by_tool)

    def test_to_dict_is_json_serializable(self):
        import json

        reported = analyze(pd.DataFrame({"x": ["  padded  "]}))
        payload = to_dict(reported[0])
        # json.dumps raises if the payload holds a non-serializable value.
        json.dumps(payload)
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload