"""Tests for the CSV-normalization gate. Covers: * ``Finding.confidence`` and ``Finding.fix_action`` field defaults. * ``auto_fix`` applies every high-confidence finding and leaves medium/low ones pending. * ``apply_decisions`` honors per-finding skip / modified payloads. * ``is_normalized`` re-checks high-confidence detectors after a fix pass. * The full corpus auto-fix sweep: every fixture either passes the gate or has its remaining medium/low findings declared in pending. """ from __future__ import annotations from pathlib import Path import pandas as pd import pytest from src.core.analyze import ( Finding, analyze, _load_for_analysis, FIX_FOLD_SMART_PUNCT, FIX_LOWERCASE_EMAIL, FIX_REPLACE_NULL_SENTINELS, FIX_NONE, ) from src.core.fixes import get_fix, available_actions from src.core.normalize import ( Decision, NormalizationResult, auto_fix, apply_decisions, is_normalized, gate_summary, ) CORPUS = Path(__file__).parent.parent / "test-cases" / "text-cleaner-corpus" / "test_data" # --------------------------------------------------------------------------- # Field defaults # --------------------------------------------------------------------------- class TestFindingFields: def test_default_confidence_is_high(self): f = Finding(id="x", severity="warn", tool="", count=1, description="d") assert f.confidence == "high" def test_default_fix_action_is_empty(self): f = Finding(id="x", severity="warn", tool="", count=1, description="d") assert f.fix_action == "" def test_pre_applied_default_false(self): f = Finding(id="x", severity="warn", tool="", count=1, description="d") assert f.pre_applied is False def test_smart_punct_finding_carries_fix_action(self): df = pd.DataFrame({"x": ["“hello”"]}) findings = analyze(df) smart = next(f for f in findings if f.id == "smart_punctuation_in_data") assert smart.confidence == "high" assert smart.fix_action == FIX_FOLD_SMART_PUNCT def test_mojibake_finding_is_low_confidence(self): df = pd.DataFrame({"x": ["café"]}) findings = 
analyze(df) moji = next(f for f in findings if f.id == "suspected_mojibake") assert moji.confidence == "low" # --------------------------------------------------------------------------- # Fix registry # --------------------------------------------------------------------------- class TestFixRegistry: def test_high_confidence_fixes_registered(self): actions = available_actions() assert FIX_FOLD_SMART_PUNCT in actions assert FIX_LOWERCASE_EMAIL in actions assert FIX_REPLACE_NULL_SENTINELS in actions def test_get_fix_returns_callable(self): fn = get_fix(FIX_FOLD_SMART_PUNCT) assert callable(fn) def test_get_fix_unknown_returns_none(self): assert get_fix("not_a_real_action") is None # --------------------------------------------------------------------------- # auto_fix # --------------------------------------------------------------------------- class TestAutoFix: def test_applies_high_confidence_only(self): df = pd.DataFrame({ "name": [" Alice ", "Bob "], # whitespace + NBSP -> high "email": ["A@X.com", "b@x.com"], # mixed case -> medium }) findings = analyze(df) result = auto_fix(df, findings) # whitespace_padding and nbsp_or_unicode_whitespace should be applied. applied_ids = {a.finding_id for a in result.applied} assert "whitespace_padding" in applied_ids assert "nbsp_or_unicode_whitespace" in applied_ids # mixed_case_email_column is medium -> pending. 
pending_ids = {f.id for f in result.pending_findings} assert "mixed_case_email_column" in pending_ids def test_cells_actually_changed(self): df = pd.DataFrame({"x": [" hi ", "ok"]}) findings = analyze(df) result = auto_fix(df, findings) assert result.cleaned_df["x"].tolist() == ["hi", "ok"] def test_no_findings_no_fixes(self): df = pd.DataFrame({"id": ["1", "2"], "name": ["a", "b"]}) findings = analyze(df) result = auto_fix(df, findings) assert result.applied == [] assert result.passed is True def test_blocks_on_severity_error(self, tmp_path): f = tmp_path / "empty.csv" f.write_bytes(b"") findings = analyze(f) df, _, _ = _load_for_analysis(f, sample_rows=1000) result = auto_fix(df, findings) assert any(b.id == "empty_input" for b in result.blocking_findings) assert result.passed is False # --------------------------------------------------------------------------- # apply_decisions # --------------------------------------------------------------------------- class TestApplyDecisions: def test_skip_decision_records_skipped(self): df = pd.DataFrame({"x": ["“smart”"]}) findings = analyze(df) decisions = [Decision(finding_id="smart_punctuation_in_data", action="skip")] result = apply_decisions(df, findings, decisions) assert any(s.id == "smart_punctuation_in_data" for s in result.skipped_findings) # And the smart quotes survived. assert "“" in result.cleaned_df["x"].iloc[0] def test_auto_decision_runs_fix(self): df = pd.DataFrame({"x": ["“smart”"]}) findings = analyze(df) decisions = [Decision(finding_id="smart_punctuation_in_data", action="auto")] result = apply_decisions(df, findings, decisions) assert result.cleaned_df["x"].iloc[0] == '"smart"' def test_modified_decision_uses_payload(self): df = pd.DataFrame({"status": ["ACTIVE", "TBD", "TBD", "active"]}) findings = analyze(df) # Restrict the null-sentinel set to only "TBD" via payload. 
decisions = [Decision( finding_id="null_like_sentinels", action="modified", payload={"sentinels": ["TBD"]}, )] # null_like_sentinels needs to be present for the decision to apply. if not any(f.id == "null_like_sentinels" for f in findings): pytest.skip("analyzer didn't surface null sentinels for this fixture") result = apply_decisions(df, findings, decisions) assert result.cleaned_df["status"].tolist() == ["ACTIVE", "", "", "active"] def test_lowercase_email_uses_finding_column(self): df = pd.DataFrame({ "email": ["ALICE@X.com", "bob@x.com"], "name": ["Alice", "Bob"], }) findings = analyze(df) decisions = [Decision(finding_id="mixed_case_email_column", action="auto")] if not any(f.id == "mixed_case_email_column" for f in findings): pytest.skip("analyzer didn't surface mixed-case email") result = apply_decisions(df, findings, decisions) assert result.cleaned_df["email"].tolist() == ["alice@x.com", "bob@x.com"] # Other columns untouched. assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"] def test_undecided_medium_finding_stays_pending(self): df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]}) findings = analyze(df) result = apply_decisions(df, findings, decisions=[]) if not any(f.id == "mixed_case_email_column" for f in findings): pytest.skip("analyzer didn't surface mixed-case email") assert any(f.id == "mixed_case_email_column" for f in result.pending_findings) # --------------------------------------------------------------------------- # is_normalized # --------------------------------------------------------------------------- class TestIsNormalized: def test_clean_dataframe_passes(self): df = pd.DataFrame({"id": ["1"], "name": ["Alice"]}) findings = analyze(df) result = auto_fix(df, findings) assert is_normalized(findings, result) is True def test_unnormalized_after_skip_high_confidence(self): df = pd.DataFrame({"x": [" padded "]}) findings = analyze(df) # Skip the only high-confidence fix. 
decisions = [Decision(finding_id="whitespace_padding", action="skip")] result = apply_decisions(df, findings, decisions) # Re-analysis still finds the issue, so gate is not normalized. assert is_normalized(findings, result) is False def test_pending_medium_blocks_gate(self): df = pd.DataFrame({"email": ["A@X.com", "b@x.com"]}) findings = analyze(df) result = auto_fix(df, findings) # auto_fix leaves medium pending -> gate not passed. if any(f.id == "mixed_case_email_column" for f in findings): assert is_normalized(findings, result) is False def test_none_result_not_normalized(self): assert is_normalized([], None) is False # --------------------------------------------------------------------------- # Corpus sweep — every fixture either passes or has declared pending # --------------------------------------------------------------------------- CORPUS_FILES = sorted(CORPUS.glob("*.csv")) if CORPUS.exists() else [] # Fixtures that will have pending medium/low findings after auto_fix. EXPECTED_PENDING_AFTER_AUTOFIX = { "11_embedded_newlines": {"mixed_case_email_column"}, "12_case_variations": {"mixed_case_email_column"}, "14_mojibake": {"suspected_mojibake"}, "17_preserve_intended": {"null_like_sentinels"}, "20_kitchen_sink": {"mixed_case_email_column"}, } # Fixtures that block the gate via severity=error findings. 
EXPECTED_BLOCKING = {
    "18_empty_file": {"empty_input"},
}


@pytest.mark.parametrize("path", CORPUS_FILES, ids=lambda p: p.stem)
def test_corpus_auto_fix_state(path):
    """Every corpus fixture either passes auto_fix or has its remaining
    pending/blocking findings declared in the expected sets above."""
    findings = analyze(path, sample_rows=1000)
    df, _, _ = _load_for_analysis(path, sample_rows=1000)
    result = auto_fix(df, findings)

    pending_ids = {f.id for f in result.pending_findings}
    blocking_ids = {f.id for f in result.blocking_findings}

    # Fixtures absent from the expectation tables must end with nothing
    # pending and nothing blocking.
    expected_pending = EXPECTED_PENDING_AFTER_AUTOFIX.get(path.stem, set())
    expected_blocking = EXPECTED_BLOCKING.get(path.stem, set())

    assert pending_ids == expected_pending, (
        f"{path.name}: pending {pending_ids} != expected {expected_pending}"
    )
    assert blocking_ids == expected_blocking, (
        f"{path.name}: blocking {blocking_ids} != expected {expected_blocking}"
    )


def test_corpus_auto_fix_idempotent():
    """Running auto_fix twice on the same input yields the same bytes."""
    path = CORPUS / "20_kitchen_sink.csv"
    # Guard on the fixture this test actually reads: a corpus directory that
    # holds other CSVs but not this one must skip as well.
    if not path.is_file():
        pytest.skip("corpus not present")

    findings = analyze(path, sample_rows=1000)
    df, _, _ = _load_for_analysis(path, sample_rows=1000)
    r1 = auto_fix(df, findings)

    # Re-analyze the cleaned frame and run again.
    f2 = analyze(r1.cleaned_df)
    r2 = auto_fix(r1.cleaned_df, f2)
    assert r1.cleaned_bytes == r2.cleaned_bytes


# ---------------------------------------------------------------------------
# Output options (download flow)
# ---------------------------------------------------------------------------


class TestOutputOptions:
    """Spec for the Review page's ``_build_output_bytes`` download helper.

    The page itself runs Streamlit code at module load, so the helper is not
    imported here. Instead ``_build`` copies the function shape as a compact
    spec, so a future refactor that moves the helper into core/io.py can keep
    the same contract.
    """

    @staticmethod
    def _build(df, *, encoding, delimiter, line_terminator):
        """Serialize *df* to CSV and encode the text with *encoding*.

        Returns:
            ``(data, None)`` when the text encodes cleanly, otherwise
            ``(data, "lossy")`` with *data* encoded using ``errors="replace"``.
        """
        import io as _io

        buf = _io.StringIO()
        df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
        text = buf.getvalue()
        try:
            return text.encode(encoding), None
        except UnicodeEncodeError:
            return text.encode(encoding, errors="replace"), "lossy"

    def test_utf8_with_bom_starts_with_bom(self):
        df = pd.DataFrame({"x": ["a"]})
        data, _ = self._build(
            df, encoding="utf-8-sig", delimiter=",", line_terminator="\n"
        )
        assert data.startswith(b"\xef\xbb\xbf")

    def test_crlf_line_terminator(self):
        df = pd.DataFrame({"x": ["a", "b"]})
        data, _ = self._build(
            df, encoding="utf-8", delimiter=",", line_terminator="\r\n"
        )
        assert b"\r\n" in data
        # With every CRLF pair removed, no bare LF may remain anywhere in the
        # output (not only directly before the "b" row).
        assert b"\n" not in data.replace(b"\r\n", b"")

    def test_tab_delimiter(self):
        df = pd.DataFrame({"a": ["x"], "b": ["y"]})
        data, _ = self._build(
            df, encoding="utf-8", delimiter="\t", line_terminator="\n"
        )
        assert data.startswith(b"a\tb\n")

    def test_cp1252_single_byte_accents(self):
        df = pd.DataFrame({"name": ["José"]})
        data, _ = self._build(
            df, encoding="cp1252", delimiter=",", line_terminator="\n"
        )
        # 'é' is single byte 0xE9 in cp1252 (vs 0xC3 0xA9 in UTF-8)
        assert b"\xe9" in data
        assert b"\xc3\xa9" not in data

    def test_lossy_codepage_returns_warning(self):
        df = pd.DataFrame({"name": ["Иван"]})  # Cyrillic
        data, warn = self._build(
            df, encoding="cp1252", delimiter=",", line_terminator="\n"
        )
        assert warn is not None
        assert b"?" in data  # replacement chars


# ---------------------------------------------------------------------------
# gate_summary
# ---------------------------------------------------------------------------


class TestGateSummary:
    """Shape of the dictionary returned by ``gate_summary``."""

    def test_summary_keys(self):
        df = pd.DataFrame({"x": [" hi "]})
        findings = analyze(df)
        result = auto_fix(df, findings)
        s = gate_summary(result)
        assert set(s.keys()) == {
            "passed",
            "fixes_applied",
            "cells_changed",
            "skipped",
            "pending",
            "blocking",
        }