feat(io): pre-parse CSV repair (BOM/NUL/smart-quotes/unquoted-delim)

Some pollution patterns block pandas before the cell-level cleaner can run. Add a pre-parse pass on raw bytes that fixes only what breaks parsing, and returns a structured action log the GUI/CLI can surface to the user. repair_bytes(raw, *, encoding, delimiter, fold_quotes, strip_nul, repair_delims): 1. Strip leading UTF-8 BOM. 2. Strip embedded NUL bytes (the C parser truncates fields at NUL). 3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII '"'. Curly singles are NOT folded here; they don't conflict with CSV and the cell-level cleaner handles them more accurately. 4. Per-row repair when one rogue delimiter is embedded in a field that looks like currency or thousands-grouped digits. Tiered scoring keeps " $1,500.00 ,7" unambiguous: the strict currency regex match wins over the loose digit/sigil heuristic. read_csv_repaired(path) -> (DataFrame, RepairResult). RepairResult exposes .actions, .unrepairable_lines, and a summary() grouped by kind. Out of scope for this pass: encoding repair, delimiter conversion, multi-delimiter merges (k>1) — logged as unrepairable so callers can see what was left alone instead of silently parsing wrong. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:49 +00:00
parent c349a90e18
commit b8a9fa1b09
2 changed files with 380 additions and 0 deletions
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -1,5 +1,7 @@
 """Tests for src.core.io — file reading, encoding/delimiter detection."""

+import io
+
 import pandas as pd
 import pytest
 from pathlib import Path
@@ -11,6 +13,8 @@ from src.core.io import (
    read_file,
    write_file,
    list_sheets,
+    repair_bytes,
+    read_csv_repaired,
 )


@@ -128,3 +132,98 @@ class TestListSheets:
            simple_df.to_excel(writer, sheet_name="Sheet2", index=False)
        sheets = list_sheets(path)
        assert sheets == ["Sheet1", "Sheet2"]
+
+
+# ---------------------------------------------------------------------------
+# Pre-parse repair
+# ---------------------------------------------------------------------------
+
+class TestRepairBytes:  # exercises repair_bytes() on raw CSV bytes: checks repaired_bytes and the action log
+    def test_strips_bom(self):  # a leading UTF-8 BOM (EF BB BF) is removed and logged as "strip_bom"
+        raw = b"\xef\xbb\xbfid,name\n1,Alice\n"
+        result = repair_bytes(raw)
+        assert result.repaired_bytes == b"id,name\n1,Alice\n"
+        assert any(a.kind == "strip_bom" for a in result.actions)
+
+    def test_strips_nul_bytes(self):  # embedded NUL bytes are removed and a "strip_nul" action reports them
+        raw = b"id,name\n1,Hel\x00lo\n2,Wo\x00\x00rld\n"
+        result = repair_bytes(raw)
+        assert b"\x00" not in result.repaired_bytes
+        nul_action = next(a for a in result.actions if a.kind == "strip_nul")
+        assert "3" in nul_action.detail  # input holds 3 NULs (1 + 2); substring check only, exact wording is not pinned
+
+    def test_folds_smart_double_quotes(self):  # curly doubles and guillemets must not survive the repair
+        raw = "id,note\n1,“hello”\n2,«bonjour»\n".encode("utf-8")
+        result = repair_bytes(raw)
+        text = result.repaired_bytes.decode("utf-8")
+        assert "“" not in text and "”" not in text
+        assert "«" not in text and "»" not in text
+        assert any(a.kind == "fold_smart_quote" for a in result.actions)
+
+    def test_does_not_fold_curly_singles(self):
+        # A curly single quote (U+2019) must pass through unchanged and log no fold action; the cell-level cleaner is expected to handle it later.
+        raw = "id,note\n1,it’s fine\n".encode("utf-8")
+        result = repair_bytes(raw)
+        text = result.repaired_bytes.decode("utf-8")
+        assert "’" in text
+        assert not any(a.kind == "fold_smart_quote" for a in result.actions)
+
+    def test_no_changes_when_clean(self):  # clean input: bytes returned unchanged, empty action log, changed is False
+        raw = b"id,name\n1,Alice\n2,Bob\n"
+        result = repair_bytes(raw)
+        assert result.repaired_bytes == raw
+        assert result.actions == []
+        assert result.changed is False
+
+    def test_repairs_unquoted_currency_comma(self):  # one rogue comma inside a currency value gets repaired
+        raw = (
+            b"id,price,qty\n"
+            b"1,100,5\n"
+            b"2,  $1,500.00  ,7\n"   # the comma inside $1,500.00 is unquoted: 4 fields instead of 3
+            b"3,200,9\n"
+        )
+        result = repair_bytes(raw)
+        # After repair the bytes must parse with pandas into the 3 header columns and 3 data rows.
+        df = pd.read_csv(io.BytesIO(result.repaired_bytes))
+        assert list(df.columns) == ["id", "price", "qty"]
+        assert len(df) == 3
+        assert any(a.kind == "quote_unquoted_delim" and a.line == 3 for a in result.actions)  # line is 1-based, header = line 1
+
+    def test_logs_unrepairable_when_ambiguous(self):
+        # Header has 4 columns but the row has 6 fields (2 extra delimiters): no single merge fixes it, so it must be logged as unrepairable.
+        raw = (
+            b"id,a,b,c\n"
+            b"1,foo,bar,baz\n"
+            b"2,1,2,3,4,5\n"   # 6 fields vs 4 columns, no clear merge
+        )
+        result = repair_bytes(raw)
+        assert 3 in result.unrepairable_lines  # the bad row is physical line 3 (header is line 1)
+
+    def test_summary_groups_by_kind(self):  # summary() is read as a mapping keyed by action kind
+        raw = b"\xef\xbb\xbfid,name\n1,Hel\x00lo\n"
+        result = repair_bytes(raw)
+        summary = result.summary()
+        assert summary.get("strip_bom") == 1
+        assert summary.get("strip_nul") == 1
+
+
+class TestReadCsvRepaired:  # read_csv_repaired(path) is unpacked as a (DataFrame, repair result) pair
+    def test_recovers_malformed_currency_row(self, tmp_path):  # file with one row carrying an unquoted "$1,500.00" comma
+        f = tmp_path / "bad.csv"
+        f.write_bytes(
+            b"id,price,qty\n"
+            b"1,100,5\n"
+            b"2,  $1,500.00  ,7\n"
+            b"3,200,9\n"
+        )
+        df, repair = read_csv_repaired(f)
+        assert len(df) == 3  # all 3 data rows survive
+        assert "1,500.00" in df.iloc[1]["price"]  # the split value is back in a single price cell (row index 1)
+        assert repair.changed  # a repair was reported
+
+    def test_passthrough_when_clean(self, tmp_path):  # clean file: 2 rows parsed, nothing reported as changed
+        f = tmp_path / "ok.csv"
+        f.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
+        df, repair = read_csv_repaired(f)
+        assert len(df) == 2
+        assert repair.changed is False