# datatools-dev/tests/test_normalize.py

"""Tests for the CSV-normalization gate.

Covers:
* ``Finding.confidence`` and ``Finding.fix_action`` field defaults.
* ``auto_fix`` applies every high-confidence finding and leaves
  medium/low ones pending.
* ``apply_decisions`` honors per-finding skip / modified payloads.
* ``is_normalized`` re-checks high-confidence detectors after a fix pass.
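* The full corpus auto-fix sweep: every fixture either passes the gate
  or has its remaining pending/blocking findings declared in the
  expected tables.
* Fix-registry lookups, the download output-bytes contract, and the
  keys reported by ``gate_summary``.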
"""

from __future__ import annotations

from pathlib import Path

import pandas as pd
import pytest

from src.core.analyze import (
    Finding,
    analyze,
    _load_for_analysis,
    FIX_FOLD_SMART_PUNCT,
    FIX_LOWERCASE_EMAIL,
    FIX_REPLACE_NULL_SENTINELS,
    FIX_NONE,
)
from src.core.fixes import get_fix, available_actions
from src.core.normalize import (
    Decision,
    NormalizationResult,
    auto_fix,
    apply_decisions,
    is_normalized,
    gate_summary,
)


# Corpus fixtures live two directory levels above this file, under
# test-cases/text-cleaner-corpus/test_data.
CORPUS = Path(__file__).parent.parent.joinpath(
    "test-cases", "text-cleaner-corpus", "test_data"
)


# ---------------------------------------------------------------------------
# Field defaults
# ---------------------------------------------------------------------------

class TestFindingFields:
    """Defaults of ``Finding`` fields and metadata carried by analyzer findings."""

    @staticmethod
    def _bare_finding():
        """Build a ``Finding`` from the same minimal arguments in every test."""
        return Finding(id="x", severity="warn", tool="", count=1, description="d")

    @staticmethod
    def _first_with_id(findings, finding_id):
        """Return the first finding whose id matches.

        Raises ``StopIteration`` when no finding has that id, which fails
        the calling test.
        """
        return next(item for item in findings if item.id == finding_id)

    def test_default_confidence_is_high(self):
        assert self._bare_finding().confidence == "high"

    def test_default_fix_action_is_empty(self):
        assert self._bare_finding().fix_action == ""

    def test_pre_applied_default_false(self):
        assert self._bare_finding().pre_applied is False

    def test_smart_punct_finding_carries_fix_action(self):
        # Curly quotes in a cell should yield a high-confidence finding
        # wired to the smart-punctuation fold action.
        frame = pd.DataFrame({"x": ["“hello”"]})
        smart = self._first_with_id(analyze(frame), "smart_punctuation_in_data")
        assert smart.confidence == "high"
        assert smart.fix_action == FIX_FOLD_SMART_PUNCT

    def test_mojibake_finding_is_low_confidence(self):
        frame = pd.DataFrame({"x": ["cafÃ©"]})
        moji = self._first_with_id(analyze(frame), "suspected_mojibake")
        assert moji.confidence == "low"


# ---------------------------------------------------------------------------
# Fix registry
# ---------------------------------------------------------------------------

class TestFixRegistry:
    """Lookup behaviour of the fix-action registry."""

    def test_high_confidence_fixes_registered(self):
        registered = available_actions()
        expected_actions = (
            FIX_FOLD_SMART_PUNCT,
            FIX_LOWERCASE_EMAIL,
            FIX_REPLACE_NULL_SENTINELS,
        )
        # Checked one by one so a failure names the first missing action.
        for action in expected_actions:
            assert action in registered

    def test_get_fix_returns_callable(self):
        assert callable(get_fix(FIX_FOLD_SMART_PUNCT))

    def test_get_fix_unknown_returns_none(self):
        unknown = get_fix("not_a_real_action")
        assert unknown is None


# ---------------------------------------------------------------------------
# auto_fix
# ---------------------------------------------------------------------------

class TestAutoFix:
    """``auto_fix`` applies high-confidence fixes and reports the rest."""

    def test_applies_high_confidence_only(self):
        df = pd.DataFrame({
            # Padded value plus a trailing NBSP -> both high confidence.
            # The NBSP is written as an explicit escape: a literal U+00A0
            # is invisible in source and easily normalized to a plain
            # space by editors or formatters, which would silently drop
            # the nbsp_or_unicode_whitespace finding this test relies on.
            "name": ["  Alice  ", "Bob\u00a0"],
            "email": ["A@X.com", "b@x.com"],       # mixed case -> medium
        })
        findings = analyze(df)
        result = auto_fix(df, findings)

        # whitespace_padding and nbsp_or_unicode_whitespace should be applied.
        applied_ids = {a.finding_id for a in result.applied}
        assert "whitespace_padding" in applied_ids
        assert "nbsp_or_unicode_whitespace" in applied_ids

        # mixed_case_email_column is medium -> pending.
        pending_ids = {f.id for f in result.pending_findings}
        assert "mixed_case_email_column" in pending_ids

    def test_cells_actually_changed(self):
        # The cleaned frame must contain the stripped value, not just a
        # record that a fix was applied.
        df = pd.DataFrame({"x": ["  hi  ", "ok"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.cleaned_df["x"].tolist() == ["hi", "ok"]

    def test_no_findings_no_fixes(self):
        df = pd.DataFrame({"id": ["1", "2"], "name": ["a", "b"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert result.applied == []
        assert result.passed is True

    def test_blocks_on_severity_error(self, tmp_path):
        # A zero-byte file is expected to surface an "empty_input" finding
        # that blocks the gate.
        empty_csv = tmp_path / "empty.csv"
        empty_csv.write_bytes(b"")
        findings = analyze(empty_csv)
        df, _, _ = _load_for_analysis(empty_csv, sample_rows=1000)
        result = auto_fix(df, findings)
        assert any(b.id == "empty_input" for b in result.blocking_findings)
        assert result.passed is False


# ---------------------------------------------------------------------------
# apply_decisions
# ---------------------------------------------------------------------------

class TestApplyDecisions:
    """``apply_decisions`` honours skip / auto / modified decisions per finding."""

    @staticmethod
    def _require_finding(findings, finding_id, reason):
        """Skip the calling test unless ``findings`` contains ``finding_id``."""
        if all(item.id != finding_id for item in findings):
            pytest.skip(reason)

    def test_skip_decision_records_skipped(self):
        frame = pd.DataFrame({"x": ["“smart”"]})
        found = analyze(frame)
        skip_it = Decision(finding_id="smart_punctuation_in_data", action="skip")
        outcome = apply_decisions(frame, found, [skip_it])
        skipped_ids = [s.id for s in outcome.skipped_findings]
        assert "smart_punctuation_in_data" in skipped_ids
        # And the smart quotes survived.
        assert "“" in outcome.cleaned_df["x"].iloc[0]

    def test_auto_decision_runs_fix(self):
        frame = pd.DataFrame({"x": ["“smart”"]})
        found = analyze(frame)
        run_it = Decision(finding_id="smart_punctuation_in_data", action="auto")
        outcome = apply_decisions(frame, found, [run_it])
        assert outcome.cleaned_df["x"].iloc[0] == '"smart"'

    def test_modified_decision_uses_payload(self):
        frame = pd.DataFrame({"status": ["ACTIVE", "TBD", "TBD", "active"]})
        found = analyze(frame)
        # Restrict the null-sentinel set to only "TBD" via payload.
        narrowed = Decision(
            finding_id="null_like_sentinels",
            action="modified",
            payload={"sentinels": ["TBD"]},
        )
        # null_like_sentinels needs to be present for the decision to apply.
        self._require_finding(
            found,
            "null_like_sentinels",
            "analyzer didn't surface null sentinels for this fixture",
        )
        outcome = apply_decisions(frame, found, [narrowed])
        assert outcome.cleaned_df["status"].tolist() == ["ACTIVE", "", "", "active"]

    def test_lowercase_email_uses_finding_column(self):
        frame = pd.DataFrame({
            "email": ["ALICE@X.com", "bob@x.com"],
            "name": ["Alice", "Bob"],
        })
        found = analyze(frame)
        lower_it = Decision(finding_id="mixed_case_email_column", action="auto")
        self._require_finding(
            found,
            "mixed_case_email_column",
            "analyzer didn't surface mixed-case email",
        )
        cleaned = apply_decisions(frame, found, [lower_it]).cleaned_df
        assert cleaned["email"].tolist() == ["alice@x.com", "bob@x.com"]
        # Other columns untouched.
        assert cleaned["name"].tolist() == ["Alice", "Bob"]

    def test_undecided_medium_finding_stays_pending(self):
        frame = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        found = analyze(frame)
        outcome = apply_decisions(frame, found, decisions=[])
        self._require_finding(
            found,
            "mixed_case_email_column",
            "analyzer didn't surface mixed-case email",
        )
        still_pending = {f.id for f in outcome.pending_findings}
        assert "mixed_case_email_column" in still_pending


# ---------------------------------------------------------------------------
# is_normalized
# ---------------------------------------------------------------------------

class TestIsNormalized:
    """``is_normalized`` reports whether a fix pass left the data gate-clean."""

    def test_clean_dataframe_passes(self):
        df = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        assert is_normalized(findings, result) is True

    def test_unnormalized_after_skip_high_confidence(self):
        df = pd.DataFrame({"x": ["  padded  "]})
        findings = analyze(df)
        # Skip the only high-confidence fix.
        decisions = [Decision(finding_id="whitespace_padding", action="skip")]
        result = apply_decisions(df, findings, decisions)
        # Re-analysis still finds the issue, so gate is not normalized.
        assert is_normalized(findings, result) is False

    def test_pending_medium_blocks_gate(self):
        df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        # Without the medium finding there is nothing to leave pending.
        # Skip visibly instead of passing with no assertion executed, so a
        # detector regression cannot hide behind a green test.
        if not any(f.id == "mixed_case_email_column" for f in findings):
            pytest.skip("analyzer didn't surface mixed-case email")
        # auto_fix leaves medium pending -> gate not passed.
        assert is_normalized(findings, result) is False

    def test_none_result_not_normalized(self):
        # A missing result must never be treated as a passed gate.
        assert is_normalized([], None) is False


# ---------------------------------------------------------------------------
# Corpus sweep — every fixture either passes or has declared pending
# ---------------------------------------------------------------------------

# Sorted CSV fixtures of the corpus; empty when the corpus directory is
# absent so the parametrized sweep below has nothing to run.
if CORPUS.exists():
    CORPUS_FILES = sorted(CORPUS.glob("*.csv"))
else:
    CORPUS_FILES = []

# Fixture stem -> ids of medium/low findings expected to stay pending
# after auto_fix.
EXPECTED_PENDING_AFTER_AUTOFIX = {
    "11_embedded_newlines": {"mixed_case_email_column"},
    "12_case_variations": {"mixed_case_email_column"},
    "14_mojibake": {"suspected_mojibake"},
    "17_preserve_intended": {"null_like_sentinels"},
    "20_kitchen_sink": {"mixed_case_email_column"},
}

# Fixture stem -> ids of severity=error findings expected to block the gate.
EXPECTED_BLOCKING = {"18_empty_file": {"empty_input"}}


@pytest.mark.parametrize("path", CORPUS_FILES, ids=lambda p: p.stem)
def test_corpus_auto_fix_state(path):
    """Check each corpus fixture's post-auto_fix state against the tables.

    The ids of pending and blocking findings left by ``auto_fix`` must equal
    the sets declared for the fixture's stem in
    ``EXPECTED_PENDING_AFTER_AUTOFIX`` and ``EXPECTED_BLOCKING``; a fixture
    not listed there is expected to have none.
    """
    detected = analyze(path, sample_rows=1000)
    frame, _, _ = _load_for_analysis(path, sample_rows=1000)
    outcome = auto_fix(frame, detected)

    stem = path.stem
    # (label, actual ids, expected ids) -- pending is compared first.
    comparisons = (
        (
            "pending",
            {f.id for f in outcome.pending_findings},
            EXPECTED_PENDING_AFTER_AUTOFIX.get(stem, set()),
        ),
        (
            "blocking",
            {f.id for f in outcome.blocking_findings},
            EXPECTED_BLOCKING.get(stem, set()),
        ),
    )
    for label, actual, expected in comparisons:
        assert actual == expected, (
            f"{path.name}: {label} {actual} != expected {expected}"
        )


def test_corpus_auto_fix_idempotent():
    """A second auto_fix pass over already-cleaned data changes nothing.

    The kitchen-sink fixture is auto-fixed, the cleaned frame is re-analyzed
    and auto-fixed again, and the serialized bytes of both passes must match.
    """
    if not CORPUS_FILES:
        pytest.skip("corpus not present")
    path = CORPUS / "20_kitchen_sink.csv"
    # A partial corpus checkout can contain other CSVs but not this one;
    # skip rather than erroring inside the loader.
    if not path.is_file():
        pytest.skip(f"{path.name} not present in corpus")
    findings = analyze(path, sample_rows=1000)
    df, _, _ = _load_for_analysis(path, sample_rows=1000)
    r1 = auto_fix(df, findings)
    # Re-analyze the cleaned frame and run again.
    f2 = analyze(r1.cleaned_df)
    r2 = auto_fix(r1.cleaned_df, f2)
    assert r1.cleaned_bytes == r2.cleaned_bytes


# ---------------------------------------------------------------------------
# Download output options and gate_summary
# ---------------------------------------------------------------------------

class TestOutputOptions:
    """Contract for the Review page's _build_output_bytes download helper.

    The page itself runs Streamlit code at module load, so the helper is
    not imported here. Instead ``_build`` copies the function shape as a
    compact spec, so a future refactor that moves the helper into
    core/io.py can keep the same contract.
    """

    @staticmethod
    def _build(df, *, encoding, delimiter, line_terminator):
        """Serialize ``df`` to CSV bytes in the requested encoding.

        Returns a ``(data, warning)`` tuple. ``warning`` is ``None`` when
        the text encodes cleanly and ``"lossy"`` when unencodable
        characters had to be substituted (``errors="replace"``).
        """
        import io as _io
        buf = _io.StringIO()
        df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
        text = buf.getvalue()
        try:
            return text.encode(encoding), None
        except UnicodeEncodeError:
            return text.encode(encoding, errors="replace"), "lossy"

    def test_utf8_with_bom_starts_with_bom(self):
        df = pd.DataFrame({"x": ["a"]})
        data, _ = self._build(df, encoding="utf-8-sig", delimiter=",", line_terminator="\n")
        assert data.startswith(b"\xef\xbb\xbf")

    def test_crlf_line_terminator(self):
        df = pd.DataFrame({"x": ["a", "b"]})
        data, _ = self._build(df, encoding="utf-8", delimiter=",", line_terminator="\r\n")
        assert b"\r\n" in data
        # Once every CRLF pair is removed no LF may remain anywhere, so a
        # bare LF after the header or at end of file is caught too.
        assert b"\n" not in data.replace(b"\r\n", b"")

    def test_tab_delimiter(self):
        df = pd.DataFrame({"a": ["x"], "b": ["y"]})
        data, _ = self._build(df, encoding="utf-8", delimiter="\t", line_terminator="\n")
        assert data.startswith(b"a\tb\n")

    def test_cp1252_single_byte_accents(self):
        df = pd.DataFrame({"name": ["José"]})
        data, _ = self._build(df, encoding="cp1252", delimiter=",", line_terminator="\n")
        # 'é' is single byte 0xE9 in cp1252 (vs 0xC3 0xA9 in UTF-8)
        assert b"\xe9" in data
        assert b"\xc3\xa9" not in data

    def test_lossy_codepage_returns_warning(self):
        df = pd.DataFrame({"name": ["Иван"]})  # Cyrillic
        data, warn = self._build(df, encoding="cp1252", delimiter=",", line_terminator="\n")
        assert warn == "lossy"
        assert b"?" in data  # replacement chars

class TestGateSummary:
    """Shape of the mapping returned by ``gate_summary``."""

    def test_summary_keys(self):
        frame = pd.DataFrame({"x": ["  hi  "]})
        outcome = auto_fix(frame, analyze(frame))
        summary = gate_summary(outcome)
        # Exact key set: no missing and no extra entries.
        expected_keys = {
            "passed",
            "fixes_applied",
            "cells_changed",
            "skipped",
            "pending",
            "blocking",
        }
        assert set(summary.keys()) == expected_keys