# datatools-dev/tests/test_gap_coverage.py

"""Tests added to close gaps surfaced by the test audit.

These cover edges that existing suites missed:

- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
- ``analyze()`` over a path-based Excel file.
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
- ``findings_by_tool`` on an empty list.
- BOM that appears mid-cell rather than at file start.
- The collapse-whitespace heuristic for numeric/date/phone-shaped cells
  (spec §4.17), now wired in via ``_smart_collapse_whitespace``.
"""

from __future__ import annotations

import io

import pandas as pd
import pytest

from src.core.analyze import analyze, findings_by_tool
from src.core.io import RepairAction, repair_bytes
from src.core.text_clean import CleanOptions, clean_dataframe


# ---------------------------------------------------------------------------
# clean_headers toggle
# ---------------------------------------------------------------------------

class TestCleanHeadersToggle:
    """Exercise the ``CleanOptions.clean_headers`` switch of ``clean_dataframe``."""

    def test_default_cleans_headers(self):
        """Default options: the padded ``"  id  "`` header comes back as ``"id"``."""
        frame = pd.DataFrame({"  id  ": [1], "Email": ["a@b.com"]})
        columns = list(clean_dataframe(frame).cleaned_df.columns)
        assert columns == ["id", "Email"]

    def test_disable_preserves_dirty_headers(self):
        """``clean_headers=False``: column names are returned untouched."""
        frame = pd.DataFrame({"  id  ": [1], "Email": ["a@b.com"]})
        options = CleanOptions(clean_headers=False)
        columns = list(clean_dataframe(frame, options).cleaned_df.columns)
        assert columns == ["  id  ", "Email"]

    def test_disable_still_cleans_data_cells(self):
        """``clean_headers=False`` must not switch off cell-level cleaning."""
        frame = pd.DataFrame({"name": ["  Alice  ", "Bob "]})
        options = CleanOptions(clean_headers=False)
        cleaned = clean_dataframe(frame, options).cleaned_df
        assert cleaned["name"].tolist() == ["Alice", "Bob"]


# ---------------------------------------------------------------------------
# repair_bytes — non-comma delimiters and combined fixes
# ---------------------------------------------------------------------------

class TestRepairBytesDelimiters:
    """``repair_bytes`` called with an explicit non-comma ``delimiter``."""

    def test_tab_delimited_smart_quote_fold(self):
        """Curly quotes are removed from TSV input while the tabs survive."""
        payload = "id\tnote\n1\t“hi”\n".encode("utf-8")
        repaired = repair_bytes(payload, delimiter="\t").repaired_bytes
        decoded = repaired.decode("utf-8")
        for curly in ("“", "”"):
            assert curly not in decoded
        # The delimiter itself must be preserved.
        assert "\t" in decoded

    def test_semicolon_delimited_unrepairable_extras(self):
        """A row with surplus fields ends up in ``unrepairable_lines``."""
        payload = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
        outcome = repair_bytes(payload, delimiter=";")
        # Line 3 has extra fields and no clear merge candidate, so it is
        # logged as unrepairable.
        assert 3 in outcome.unrepairable_lines


class TestRepairBytesCombinedFixes:
    """Several byte-level defects present in one input at the same time."""

    def test_bom_plus_nul_plus_smart_quotes(self):
        """BOM, NUL byte and curly quotes are each reported and all repaired."""
        bom = b"\xef\xbb\xbf"
        header = b"id,note\n"
        # NUL inside "Hello", then UTF-8 encoded curly quotes around "world".
        row = b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n"
        outcome = repair_bytes(bom + header + row)

        seen_kinds = {action.kind for action in outcome.actions}
        expected_kinds = {"strip_bom", "strip_nul", "fold_smart_quote"}
        assert expected_kinds.issubset(seen_kinds)

        # The repaired bytes must parse cleanly as CSV.
        parsed = pd.read_csv(io.BytesIO(outcome.repaired_bytes))
        assert parsed.iloc[0]["note"] == 'Hello "world"'


# ---------------------------------------------------------------------------
# analyze() — path-based Excel and large-sample edges
# ---------------------------------------------------------------------------

class TestAnalyzeXlsxPath:
    """``analyze()`` given a filesystem path to an ``.xlsx`` workbook."""

    def test_excel_path_runs_without_repair(self, tmp_path):
        """Padding in the sheet is reported; no ``csv_*`` finding is emitted."""
        workbook = tmp_path / "small.xlsx"
        sheet = pd.DataFrame({
            "id": ["1", "2"],
            "name": ["  Alice  ", "Bob"],   # padded cell stored in the xlsx
        })
        sheet.to_excel(workbook, index=False, engine="openpyxl")

        finding_ids = {finding.id for finding in analyze(workbook)}

        assert "whitespace_padding" in finding_ids
        # Excel skips csv_* findings — no pre-parse repair on xlsx.
        assert all(not fid.startswith("csv_") for fid in finding_ids)


class TestAnalyzeSampleRowsEdge:
    """``analyze()`` when ``sample_rows`` exceeds the number of rows."""

    def test_sample_rows_larger_than_df(self):
        """Asking for 1000 sample rows of a 2-row frame still yields findings."""
        frame = pd.DataFrame({"x": ["  pad  ", "clean"]})
        # The frame has only 2 rows; an oversized sample must not crash.
        reported = [finding.id for finding in analyze(frame, sample_rows=1000)]
        assert "whitespace_padding" in reported


class TestAnalyzeMidCellBom:
    """A BOM (U+FEFF) in the middle of a cell is reported by ``analyze()``."""

    def test_bom_inside_cell_treated_as_zero_width(self):
        """Expect a ``zero_width_or_invisible`` finding for a mid-cell U+FEFF."""
        # Spell the BOM as an escape sequence: a raw U+FEFF is invisible in
        # editors and diffs and is easily stripped by tooling, which would
        # silently turn this into a test of the plain string "Hello".
        df = pd.DataFrame({"name": ["Hel\ufefflo"]})
        findings = analyze(df)
        assert any(f.id == "zero_width_or_invisible" for f in findings)


# ---------------------------------------------------------------------------
# findings_by_tool — edge cases
# ---------------------------------------------------------------------------

class TestFindingsByToolEdges:
    """Degenerate inputs to ``findings_by_tool``."""

    def test_empty_list_returns_empty_dict(self):
        """No findings in, empty mapping out."""
        grouped = findings_by_tool([])
        assert grouped == {}

    def test_only_toolless_findings_returns_empty_dict(self):
        """A finding whose ``tool`` is the empty string produces no group."""
        from src.core.analyze import Finding

        # Construct a Finding with no tool — like csv_unrepairable_rows.
        toolless = Finding(
            id="x",
            severity="info",
            tool="",
            count=1,
            description="d",
        )
        grouped = findings_by_tool([toolless])
        assert grouped == {}


# ---------------------------------------------------------------------------
# Collapse-whitespace heuristic on numeric/date/phone-shaped cells (spec §4.17)
# ---------------------------------------------------------------------------

class TestStructuredCellWhitespacePreservation:
    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""

    @staticmethod
    def _clean_single_cell(column, value):
        """Clean a one-column, one-row frame with defaults; return the cell."""
        frame = pd.DataFrame({column: [value]})
        outcome = clean_dataframe(frame)
        return outcome.cleaned_df.iloc[0][column]

    def test_phone_internal_double_space_preserved(self):
        """The double space inside a phone-shaped value is left alone."""
        raw = "(555)  123-4567"
        assert self._clean_single_cell("phone", raw) == "(555)  123-4567"

    def test_european_thousands_sep_preserved(self):
        """A space used as a thousands separator is left alone."""
        raw = "1 234"
        assert self._clean_single_cell("price", raw) == "1 234"

    def test_iso_date_passes_through(self):
        """An ISO-formatted date comes back unchanged."""
        raw = "2024-01-15"
        assert self._clean_single_cell("date", raw) == "2024-01-15"

    def test_textual_date_preserves_spaces(self):
        """A textual date keeps its single spaces."""
        raw = "Jan 15 2024"
        assert self._clean_single_cell("date", raw) == "Jan 15 2024"

    def test_free_text_double_space_still_collapsed(self):
        """Prose is still collapsed to single spaces."""
        # Crucially, the heuristic must NOT trigger on prose with letters.
        raw = "hello  world"
        assert self._clean_single_cell("note", raw) == "hello world"