Closes the §4.17 spec gap that test_gap_coverage.py was tracking via xfail:
collapse_whitespace must NOT touch cells whose shape carries meaningful
internal whitespace.

Adds _looks_structured(s) — returns True when s matches:
- numeric (currency optional, thousand-grouping by , . or single space)
- date (ISO/slash/dot separator, or 'Mon DD YYYY' / 'DD Mon YYYY')
- phone (digits + parens/dots/dashes/+/spaces, >= 7 digits, no letters)

The pipeline uses a new _smart_collapse_whitespace wrapper that defers to
collapse_whitespace only when _looks_structured returns False. The raw
collapse_whitespace function is unchanged so direct callers and existing
unit tests remain valid.

Five new positive tests replace the xfail:
- "(555)  123-4567" preserved (phone, double space inside)
- "1 234" preserved (European thousands)
- "2024-01-15" preserved (ISO date)
- "Jan 15 2024" preserved (textual date)
- "hello  world" (double space) still collapsed to "hello world" (free-text negative case)

Conservative on purpose: a false negative just collapses (existing behavior);
a false positive leaves intentional double spaces in prose.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
168 lines
6.6 KiB
Python
168 lines
6.6 KiB
Python
"""Tests added to close gaps surfaced by the test audit.
|
||
|
||
These cover edges that existing suites missed:
|
||
|
||
- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
|
||
- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
|
||
- ``analyze()`` over a path-based Excel file.
|
||
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
|
||
- ``findings_by_tool`` on an empty list.
|
||
- BOM that appears mid-cell rather than at file start.
|
||
|
||
The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
§4.17) is covered by the positive tests in
``TestStructuredCellWhitespacePreservation`` (phone, thousands-grouped number,
ISO and textual dates, plus a free-text case that must still collapse).
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import io
|
||
|
||
import pandas as pd
|
||
import pytest
|
||
|
||
from src.core.analyze import analyze, findings_by_tool
|
||
from src.core.io import RepairAction, repair_bytes
|
||
from src.core.text_clean import CleanOptions, clean_dataframe
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# clean_headers toggle
# ---------------------------------------------------------------------------
|
||
|
||
class TestCleanHeadersToggle:
    """Exercise the ``CleanOptions.clean_headers`` switch of ``clean_dataframe``."""

    def test_default_cleans_headers(self):
        # With default options the padded " id " header comes back stripped.
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        cleaned = clean_dataframe(frame).cleaned_df
        assert list(cleaned.columns) == ["id", "Email"]

    def test_disable_preserves_dirty_headers(self):
        # With the toggle off, the column labels must come back unchanged.
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        options = CleanOptions(clean_headers=False)
        cleaned = clean_dataframe(frame, options).cleaned_df
        assert list(cleaned.columns) == [" id ", "Email"]

    def test_disable_still_cleans_data_cells(self):
        # Disabling header cleaning must not disable cell cleaning.
        frame = pd.DataFrame({"name": [" Alice ", "Bob "]})
        options = CleanOptions(clean_headers=False)
        names = clean_dataframe(frame, options).cleaned_df["name"]
        assert names.tolist() == ["Alice", "Bob"]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# repair_bytes — non-comma delimiters and combined fixes
# ---------------------------------------------------------------------------
|
||
|
||
class TestRepairBytesDelimiters:
    """``repair_bytes`` called with an explicit non-comma ``delimiter``."""

    def test_tab_delimited_smart_quote_fold(self):
        payload = "id\tnote\n1\t“hi”\n".encode("utf-8")
        repaired = repair_bytes(payload, delimiter="\t").repaired_bytes
        decoded = repaired.decode("utf-8")
        # Neither curly quote may survive the repair ...
        assert "“" not in decoded
        assert "”" not in decoded
        # ... while the tab delimiter must still be there.
        assert "\t" in decoded  # delimiter preserved

    def test_semicolon_delimited_unrepairable_extras(self):
        payload = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
        outcome = repair_bytes(payload, delimiter=";")
        # Extra-field row with no clear merge candidate is logged unrepairable.
        assert 3 in outcome.unrepairable_lines
|
||
|
||
|
||
class TestRepairBytesCombinedFixes:
    """Several byte-level problems present in a single input at once."""

    def test_bom_plus_nul_plus_smart_quotes(self):
        payload = b"".join(
            (
                b"\xef\xbb\xbf",  # UTF-8 byte-order mark
                b"id,note\n",
                b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n",  # NUL + curly quotes
            )
        )
        outcome = repair_bytes(payload)

        # Collect the distinct action kinds that were recorded.
        seen_kinds = set()
        for action in outcome.actions:
            seen_kinds.add(action.kind)
        assert {"strip_bom", "strip_nul", "fold_smart_quote"}.issubset(seen_kinds)

        # Resulting bytes parse cleanly.
        parsed = pd.read_csv(io.BytesIO(outcome.repaired_bytes))
        assert parsed.iloc[0]["note"] == 'Hello "world"'
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# analyze() — path-based Excel and large-sample edges
# ---------------------------------------------------------------------------
|
||
|
||
class TestAnalyzeXlsxPath:
    """``analyze`` given a filesystem path to an ``.xlsx`` workbook."""

    def test_excel_path_runs_without_repair(self, tmp_path):
        workbook = tmp_path / "small.xlsx"
        columns = {
            "id": ["1", "2"],
            "name": [" Alice ", "Bob"],  # padding in xlsx
        }
        pd.DataFrame(columns).to_excel(workbook, index=False, engine="openpyxl")

        finding_ids = set()
        for finding in analyze(workbook):
            finding_ids.add(finding.id)

        assert "whitespace_padding" in finding_ids
        # Excel skips csv_* findings — no pre-parse repair on xlsx.
        assert all(not fid.startswith("csv_") for fid in finding_ids)
|
||
|
||
|
||
class TestAnalyzeSampleRowsEdge:
    """``analyze`` with ``sample_rows`` larger than the frame itself."""

    def test_sample_rows_larger_than_df(self):
        frame = pd.DataFrame({"x": [" pad ", "clean"]})
        # sample_rows=1000 but df has only 2 rows; must not crash.
        reported = [finding.id for finding in analyze(frame, sample_rows=1000)]
        assert "whitespace_padding" in reported
|
||
|
||
|
||
class TestAnalyzeMidCellBom:
    """A U+FEFF (BOM) inside a cell value is reported as an invisible character."""

    def test_bom_inside_cell_treated_as_zero_width(self):
        # The BOM is written as an escape on purpose: a literal U+FEFF is
        # invisible in editors and diffs and is easily stripped by tooling,
        # which would silently reduce the cell to plain "Hello" and leave
        # this test without the input it is named for.
        df = pd.DataFrame({"name": ["Hel\ufefflo"]})
        findings = analyze(df)
        assert any(f.id == "zero_width_or_invisible" for f in findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# findings_by_tool — edge cases
# ---------------------------------------------------------------------------
|
||
|
||
class TestFindingsByToolEdges:
    """Degenerate inputs to ``findings_by_tool``."""

    def test_empty_list_returns_empty_dict(self):
        grouped = findings_by_tool([])
        assert grouped == {}

    def test_only_toolless_findings_returns_empty_dict(self):
        from src.core.analyze import Finding

        # Construct a Finding with no tool — like csv_unrepairable_rows.
        toolless = Finding(
            id="x",
            severity="info",
            tool="",
            count=1,
            description="d",
        )
        grouped = findings_by_tool([toolless])
        assert grouped == {}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Spec §4.17: collapse_whitespace on numeric/date/phone-shaped cells
# ---------------------------------------------------------------------------
|
||
|
||
class TestStructuredCellWhitespacePreservation:
    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells.

    Each test pushes a single-cell frame through ``clean_dataframe`` with
    default options and checks the cleaned value. Where a test is about a
    double space, the run is built with ``" " * 2`` so it cannot be silently
    reduced to one space by whitespace-normalising tooling, which would make
    the test pass without exercising anything.
    """

    def test_phone_internal_double_space_preserved(self):
        # Two spaces after the area code; a phone-shaped cell must keep both.
        raw = "(555)" + " " * 2 + "123-4567"
        df = pd.DataFrame({"phone": [raw]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["phone"] == raw

    def test_european_thousands_sep_preserved(self):
        # Space used as the thousands separator.
        df = pd.DataFrame({"price": ["1 234"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["price"] == "1 234"

    def test_iso_date_passes_through(self):
        df = pd.DataFrame({"date": ["2024-01-15"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"

    def test_textual_date_preserves_spaces(self):
        df = pd.DataFrame({"date": ["Jan 15 2024"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"

    def test_free_text_double_space_still_collapsed(self):
        # Crucially, the heuristic must NOT trigger on prose with letters:
        # the double space in free text is still collapsed to a single one.
        raw = "hello" + " " * 2 + "world"
        df = pd.DataFrame({"note": [raw]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["note"] == "hello world"
|