"""Tests added to close gaps surfaced by the test audit. These cover edges that existing suites missed: - ``CleanOptions.clean_headers=False`` toggle (added but not directly tested). - ``repair_bytes`` with non-comma delimiters and combined-fix scenarios. - ``analyze()`` over a path-based Excel file. - ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()). - ``findings_by_tool`` on an empty list. - BOM that appears mid-cell rather than at file start. The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec §4.17) is *not yet implemented* and is captured here as a known-gap xfail so it's surfaced rather than silently missing. """ from __future__ import annotations import io import pandas as pd import pytest from src.core.analyze import analyze, findings_by_tool from src.core.io import RepairAction, repair_bytes from src.core.text_clean import CleanOptions, clean_dataframe # --------------------------------------------------------------------------- # clean_headers toggle # --------------------------------------------------------------------------- class TestCleanHeadersToggle: def test_default_cleans_headers(self): df = pd.DataFrame({" id ": [1], "Email​": ["a@b.com"]}) result = clean_dataframe(df) assert list(result.cleaned_df.columns) == ["id", "Email"] def test_disable_preserves_dirty_headers(self): df = pd.DataFrame({" id ": [1], "Email​": ["a@b.com"]}) result = clean_dataframe(df, CleanOptions(clean_headers=False)) assert list(result.cleaned_df.columns) == [" id ", "Email​"] def test_disable_still_cleans_data_cells(self): df = pd.DataFrame({"name": [" Alice ", "Bob "]}) result = clean_dataframe(df, CleanOptions(clean_headers=False)) assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"] # --------------------------------------------------------------------------- # repair_bytes — non-comma delimiters and combined fixes # --------------------------------------------------------------------------- class TestRepairBytesDelimiters: def test_tab_delimited_smart_quote_fold(self): raw = "id\tnote\n1\t“hi”\n".encode("utf-8") result = repair_bytes(raw, delimiter="\t") text = result.repaired_bytes.decode("utf-8") assert "“" not in text and "”" not in text assert "\t" in text # delimiter preserved def test_semicolon_delimited_unrepairable_extras(self): raw = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n" result = repair_bytes(raw, delimiter=";") # Extra-field row with no clear merge candidate is logged unrepairable. assert 3 in result.unrepairable_lines class TestRepairBytesCombinedFixes: def test_bom_plus_nul_plus_smart_quotes(self): raw = ( b"\xef\xbb\xbf" b"id,note\n" b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n" ) result = repair_bytes(raw) kinds = {a.kind for a in result.actions} assert {"strip_bom", "strip_nul", "fold_smart_quote"} <= kinds # Resulting bytes parse cleanly. df = pd.read_csv(io.BytesIO(result.repaired_bytes)) assert df.iloc[0]["note"] == 'Hello "world"' # --------------------------------------------------------------------------- # analyze() — path-based Excel and large-sample edges # --------------------------------------------------------------------------- class TestAnalyzeXlsxPath: def test_excel_path_runs_without_repair(self, tmp_path): path = tmp_path / "small.xlsx" df = pd.DataFrame({ "id": ["1", "2"], "name": [" Alice ", "Bob"], # padding in xlsx }) df.to_excel(path, index=False, engine="openpyxl") findings = analyze(path) ids = {f.id for f in findings} assert "whitespace_padding" in ids # Excel skips csv_* findings — no pre-parse repair on xlsx. assert not any(i.startswith("csv_") for i in ids) class TestAnalyzeSampleRowsEdge: def test_sample_rows_larger_than_df(self): df = pd.DataFrame({"x": [" pad ", "clean"]}) # sample_rows=1000 but df has only 2 rows; must not crash. findings = analyze(df, sample_rows=1000) assert any(f.id == "whitespace_padding" for f in findings) class TestAnalyzeMidCellBom: def test_bom_inside_cell_treated_as_zero_width(self): df = pd.DataFrame({"name": ["Hello"]}) findings = analyze(df) assert any(f.id == "zero_width_or_invisible" for f in findings) # --------------------------------------------------------------------------- # findings_by_tool — edge cases # --------------------------------------------------------------------------- class TestFindingsByToolEdges: def test_empty_list_returns_empty_dict(self): assert findings_by_tool([]) == {} def test_only_toolless_findings_returns_empty_dict(self): from src.core.analyze import Finding # Construct a Finding with no tool — like csv_unrepairable_rows. f = Finding( id="x", severity="info", tool="", count=1, description="d", ) assert findings_by_tool([f]) == {} # --------------------------------------------------------------------------- # Known gap: collapse_whitespace on numeric/date/phone-shaped cells # --------------------------------------------------------------------------- class TestStructuredCellWhitespacePreservation: """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells.""" def test_phone_internal_double_space_preserved(self): df = pd.DataFrame({"phone": ["(555) 123-4567"]}) result = clean_dataframe(df) assert result.cleaned_df.iloc[0]["phone"] == "(555) 123-4567" def test_european_thousands_sep_preserved(self): df = pd.DataFrame({"price": ["1 234"]}) result = clean_dataframe(df) assert result.cleaned_df.iloc[0]["price"] == "1 234" def test_iso_date_passes_through(self): df = pd.DataFrame({"date": ["2024-01-15"]}) result = clean_dataframe(df) assert result.cleaned_df.iloc[0]["date"] == "2024-01-15" def test_textual_date_preserves_spaces(self): df = pd.DataFrame({"date": ["Jan 15 2024"]}) result = clean_dataframe(df) assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024" def test_free_text_double_space_still_collapsed(self): # Crucially, the heuristic must NOT trigger on prose with letters. df = pd.DataFrame({"note": ["hello world"]}) result = clean_dataframe(df) assert result.cleaned_df.iloc[0]["note"] == "hello world"