feat(io): pre-parse CSV repair (BOM/NUL/smart-quotes/unquoted-delim)

Some pollution patterns block pandas before the cell-level cleaner can run. Add a pre-parse pass on raw bytes that fixes only what breaks parsing, and returns a structured action log the GUI/CLI can surface to the user. repair_bytes(raw, *, encoding, delimiter, fold_quotes, strip_nul, repair_delims): 1. Strip leading UTF-8 BOM. 2. Strip embedded NUL bytes (the C parser truncates fields at NUL). 3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII '"'. Curly singles are NOT folded here; they don't conflict with CSV and the cell-level cleaner handles them more accurately. 4. Per-row repair when one rogue delimiter is embedded in a field that looks like currency or thousands-grouped digits. Tiered scoring keeps " $1,500.00 ,7" unambiguous: the strict currency regex match wins over the loose digit/sigil heuristic. read_csv_repaired(path) -> (DataFrame, RepairResult). RepairResult exposes .actions, .unrepairable_lines, and a summary() grouped by kind. Out of scope for this pass: encoding repair, delimiter conversion, multi-delimiter merges (k>1) — logged as unrepairable so callers can see what was left alone instead of silently parsing wrong. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:49 +00:00
parent c349a90e18
commit b8a9fa1b09
2 changed files with 380 additions and 0 deletions
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -4,6 +4,8 @@ from __future__ import annotations

 import csv
 import io
+import re
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Generator, Optional

@@ -245,3 +247,282 @@ def write_file(
        df.to_csv(out, index=False, encoding=encoding)
    logger.info("Wrote {} rows to {}", len(df), out)
    return out
+
+
+# ---------------------------------------------------------------------------
+# Pre-parse repair (CSV / delimited text)
+# ---------------------------------------------------------------------------
+#
+# Some pollution patterns confuse pandas' parser before the cleaner can ever
+# see the data. Smart double quotes inside an unquoted field, NUL bytes, and
+# unquoted delimiters embedded in numeric/currency cells all cause structural
+# parse failures or silent truncation. These helpers operate on raw bytes
+# (or decoded text) and produce a parseable byte stream plus an audit log.
+#
+# Design notes:
+#   - Single curly quotes (U+2018/U+2019) are NOT folded here: they don't
+#     conflict with the default CSV quote char and the cell-level cleaner
+#     handles them more accurately. Only double-quote-equivalents are folded.
+#   - Delimiter-row repair only attempts the unambiguous case (one extra
+#     field, one merge candidate that looks like currency/thousands-sep).
+#     Anything else is logged as unrepairable and the line is left alone.
+
+# Smart double-quote look-alikes, each folded to ASCII '"' (the default CSV
+# quote character). Single curly quotes are deliberately absent from this
+# table.
+_CSV_SMART_QUOTE_TRANS = str.maketrans({
+    "“": '"',   # U+201C LEFT DOUBLE QUOTATION MARK
+    "”": '"',   # U+201D RIGHT DOUBLE QUOTATION MARK
+    "„": '"',   # U+201E DOUBLE LOW-9 QUOTATION MARK
+    "‟": '"',   # U+201F DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    "«": '"',   # U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    "»": '"',   # U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+    "″": '"',   # U+2033 DOUBLE PRIME
+})
+
+# A merged value is "currency-shaped" when it looks like $1,500.00 or 1.234,56:
+# optional sigil, 1-3 leading digits, one or more 3-digit groups separated by
+# ',', '.' or whitespace, then an optional ','/'.' fraction. Surrounding
+# whitespace is tolerated.
+_CURRENCY_SHAPED = re.compile(r"^\s*[$€£¥]?\s*\d{1,3}([,.\s]\d{3})+([,.]\d+)?\s*$")
+# Or a plain decimal with comma thousands grouping and '.' fraction (no sigil).
+# NOTE(review): every string this accepts also appears to satisfy
+# _CURRENCY_SHAPED (sigil is optional there and ','/'.' are allowed), so this
+# pattern may be redundant — confirm before removing.
+_THOUSANDS_SHAPED = re.compile(r"^\s*\d{1,3}(,\d{3})+(\.\d+)?\s*$")
+
+
+@dataclass
+class RepairAction:
+    """One repair the pre-parse pass made to the raw bytes.
+
+    A plain record: ``kind`` is the machine-readable category, ``line`` the
+    position the repair applies to, ``detail`` a human-readable description.
+    """
+
+    kind: str           # e.g. "strip_bom", "strip_nul", "decode_replaced",
+                        #      "fold_smart_quote", "quote_unquoted_delim"
+    line: Optional[int] # 1-indexed position reported by the row-repair pass;
+                        #      None for file-level actions
+    detail: str         # human-readable description of what was changed
+
+
+@dataclass
+class RepairResult:
+    """Bundle returned by :func:`repair_bytes`: repaired payload plus audit trail."""
+
+    repaired_bytes: bytes
+    actions: list[RepairAction] = field(default_factory=list)
+    unrepairable_lines: list[int] = field(default_factory=list)
+
+    @property
+    def changed(self) -> bool:
+        """``True`` when at least one repair action was recorded."""
+        return len(self.actions) > 0
+
+    def summary(self) -> dict[str, int]:
+        """Return how many actions were recorded for each ``kind``.
+
+        Keys appear in the order each kind was first seen.
+        """
+        tally: dict[str, int] = {}
+        for action in self.actions:
+            tally.setdefault(action.kind, 0)
+            tally[action.kind] += 1
+        return tally
+
+
+def _merge_score(left: str, right: str, delimiter: str) -> int:
+    """Score how likely ``left + delimiter + right`` was originally one field.
+
+    Returns one of three tiers (higher = more confident):
+
+    - 3: the joined text matches the currency-shaped or thousands-shaped
+         pattern.
+    - 1: loose signal only — the delimiter is ``,`` or ``.``, *left* contains
+         a currency sigil followed by a digit or ends in a digit, and *right*
+         begins with a digit (leading whitespace allowed).
+    - 0: none of the above; the merge is implausible.
+
+    The tiers exist so that ``"  $1,500.00  ,7"`` resolves cleanly: both
+    ``$1+500.00`` and ``500.00+7`` are raw candidates, but only the first
+    reaches tier 3.
+    """
+    candidate = f"{left}{delimiter}{right}"
+    strict_patterns = (_CURRENCY_SHAPED, _THOUSANDS_SHAPED)
+    if any(pattern.match(candidate) for pattern in strict_patterns):
+        return 3
+
+    if delimiter not in ".,":
+        return 0
+
+    sigil_then_digit = re.search(r"[$€£¥]\s*\d", left) is not None
+    trailing_digit = re.search(r"\d\s*$", left) is not None
+    if not (sigil_then_digit or trailing_digit):
+        return 0
+
+    return 1 if re.match(r"\s*\d", right) else 0
+
+
+def _repair_extra_field_row(
+    fields: list[str], expected: int, delimiter: str,
+) -> Optional[list[str]]:
+    """Try to merge one adjacent pair so the row has *expected* fields.
+
+    Every adjacent pair is scored with ``_merge_score``; the merge happens
+    only when exactly one pair holds the highest non-zero score.
+
+    Args:
+        fields: Parsed fields of the offending row.
+        expected: Field count the row should have.
+        delimiter: Delimiter re-inserted between the two merged fields.
+
+    Returns:
+        A new field list of length *expected*, or *None* when the row is not
+        exactly one field too long, no pair scores above zero, or several
+        pairs tie for the best score.
+    """
+    if len(fields) != expected + 1:
+        return None
+    scores = [
+        _merge_score(left, right, delimiter)
+        for left, right in zip(fields, fields[1:])
+    ]
+    # ``default=0`` covers a one-field row (expected == 0, e.g. the first
+    # line of the file was blank): there are no pairs to score and a bare
+    # max() over an empty sequence raises ValueError.
+    best = max(scores, default=0)
+    if best == 0:
+        return None
+    if scores.count(best) != 1:
+        return None
+    i = scores.index(best)
+    merged = f"{fields[i]}{delimiter}{fields[i + 1]}"
+    return fields[:i] + [merged] + fields[i + 2:]
+
+
+def repair_bytes(
+    raw: bytes,
+    *,
+    encoding: str = "utf-8",
+    delimiter: str = ",",
+    fold_quotes: bool = True,
+    strip_nul: bool = True,
+    repair_delims: bool = True,
+) -> RepairResult:
+    """Pre-parse repair on a raw delimited file.
+
+    Performs, in order:
+
+    1. Strip a leading UTF-8 BOM (always; not toggleable).
+    2. Decode with *encoding*. If that fails (bad bytes or unknown codec),
+       decode as UTF-8 with U+FFFD replacement and record a
+       ``"decode_replaced"`` action.
+    3. Strip embedded NUL characters when *strip_nul* is true (the C parser
+       truncates fields at NUL).
+    4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII
+       ``"`` when *fold_quotes* is true.
+    5. Per-row repair when *repair_delims* is true: a row with one rogue
+       *delimiter* embedded in a field that looks like currency or
+       thousands-grouped digits has that field merged back together.
+
+    Single curly quotes and other punctuation are deferred to the cell-level
+    cleaner; this layer only fixes things that break CSV *parsing*.
+
+    Returns:
+        A :class:`RepairResult` whose ``repaired_bytes`` are always UTF-8
+        encoded, regardless of the input *encoding*.
+    """
+    actions: list[RepairAction] = []
+    unrepairable: list[int] = []
+    data = raw
+
+    # 1. BOM
+    if data.startswith(b"\xef\xbb\xbf"):
+        data = data[3:]
+        actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))
+
+    # 2. Decode for character-level work.
+    try:
+        text = data.decode(encoding)
+    except (UnicodeDecodeError, LookupError):
+        text = data.decode("utf-8", errors="replace")
+        actions.append(RepairAction(
+            kind="decode_replaced", line=None,
+            detail=f"decode errors under {encoding}; replaced with U+FFFD",
+        ))
+
+    # 3. NUL. Stripped from the decoded text rather than the raw bytes: in
+    # UTF-16/UTF-32 a 0x00 byte is part of ordinary code units, so removing
+    # it before decoding would corrupt every ASCII character in the file.
+    if strip_nul and "\x00" in text:
+        before = text.count("\x00")
+        text = text.replace("\x00", "")
+        actions.append(RepairAction(
+            kind="strip_nul", line=None,
+            detail=f"removed {before} NUL byte(s)",
+        ))
+
+    # 4. Smart double quotes
+    if fold_quotes:
+        folded = text.translate(_CSV_SMART_QUOTE_TRANS)
+        if folded != text:
+            # The table maps one character to one character, so the two
+            # strings align position-by-position and this count is exact.
+            n = sum(1 for a, b in zip(text, folded) if a != b)
+            actions.append(RepairAction(
+                kind="fold_smart_quote", line=None,
+                detail=f"replaced {n} smart double-quote char(s) with ASCII '\"'",
+            ))
+            text = folded
+
+    # 5. Per-row delimiter repair
+    if repair_delims:
+        text, row_actions, unrepairable = _repair_rows(text, delimiter)
+        actions.extend(row_actions)
+
+    return RepairResult(
+        repaired_bytes=text.encode("utf-8"),
+        actions=actions,
+        unrepairable_lines=unrepairable,
+    )
+
+
+def _repair_rows(
+    text: str, delimiter: str,
+) -> tuple[str, list[RepairAction], list[int]]:
+    """Per-line field-count repair. Operates on already-decoded text.
+
+    The first record sets the expected field count. Each later record that
+    is exactly one field too long is offered to ``_repair_extra_field_row``;
+    every other mismatched record is left as-is and its line number is
+    recorded as unrepairable. Empty records (blank lines) are passed through.
+
+    Returns:
+        ``(text, actions, unrepairable_lines)``. When no row was merged the
+        input *text* is returned untouched; otherwise all rows are
+        re-serialised through ``csv.writer`` with ``"\\n"`` line endings.
+        Line numbers are 1-indexed physical lines on which a record starts.
+    """
+    actions: list[RepairAction] = []
+    unrepairable: list[int] = []
+
+    reader = csv.reader(io.StringIO(text), delimiter=delimiter)
+    # Pair every record with the physical line it starts on. A quoted field
+    # may span several lines, so the record index alone drifts away from the
+    # source line; ``reader.line_num`` counts lines consumed so far, hence a
+    # record starts one past where the previous record ended.
+    rows: list[list[str]] = []
+    start_lines: list[int] = []
+    consumed = 0
+    for row in reader:
+        rows.append(row)
+        start_lines.append(consumed + 1)
+        consumed = reader.line_num
+    if not rows:
+        return text, actions, unrepairable
+
+    expected = len(rows[0])
+    repaired_rows: list[list[str]] = [rows[0]]
+    needs_rewrite = False
+
+    for line_no, row in zip(start_lines[1:], rows[1:]):
+        if len(row) == expected or not row:
+            repaired_rows.append(row)
+            continue
+        if len(row) > expected:
+            fixed = _repair_extra_field_row(row, expected, delimiter)
+            if fixed is not None:
+                repaired_rows.append(fixed)
+                needs_rewrite = True
+                actions.append(RepairAction(
+                    kind="quote_unquoted_delim", line=line_no,
+                    detail=(
+                        f"line {line_no}: merged adjacent fields to fix "
+                        f"unquoted '{delimiter}' (saw {len(row)} fields, "
+                        f"expected {expected})"
+                    ),
+                ))
+                continue
+            unrepairable.append(line_no)
+            repaired_rows.append(row)
+        else:
+            # Too few fields: leave the row alone and record it as
+            # unrepairable so the caller can see it was not touched.
+            unrepairable.append(line_no)
+            repaired_rows.append(row)
+
+    if not needs_rewrite:
+        return text, actions, unrepairable
+
+    buf = io.StringIO()
+    writer = csv.writer(buf, delimiter=delimiter, lineterminator="\n")
+    writer.writerows(repaired_rows)
+    return buf.getvalue(), actions, unrepairable
+
+
+def read_csv_repaired(
+    path: str | Path,
+    *,
+    encoding: Optional[str] = None,
+    delimiter: Optional[str] = None,
+    header_row: Optional[int] = None,
+    fold_quotes: bool = True,
+    strip_nul: bool = True,
+    repair_delims: bool = True,
+) -> tuple[pd.DataFrame, RepairResult]:
+    """Load a delimited file through :func:`repair_bytes`, then parse it.
+
+    *encoding* and *delimiter* fall back to ``detect_encoding`` /
+    ``detect_delimiter`` when not given; *header_row* falls back to ``0``.
+    The three boolean flags are forwarded to :func:`repair_bytes` unchanged.
+    The repaired bytes are parsed as UTF-8 with every column read as ``str``
+    and default NA conversion disabled.
+
+    Returns ``(df, repair_result)`` so callers can surface the action log.
+    """
+    source = Path(path)
+    resolved_encoding = encoding if encoding else detect_encoding(source)
+    resolved_delimiter = (
+        delimiter if delimiter else detect_delimiter(source, resolved_encoding)
+    )
+
+    repair = repair_bytes(
+        source.read_bytes(),
+        encoding=resolved_encoding,
+        delimiter=resolved_delimiter,
+        fold_quotes=fold_quotes,
+        strip_nul=strip_nul,
+        repair_delims=repair_delims,
+    )
+
+    frame = pd.read_csv(
+        io.BytesIO(repair.repaired_bytes),
+        encoding="utf-8",
+        delimiter=resolved_delimiter,
+        header=0 if header_row is None else header_row,
+        dtype=str,
+        keep_default_na=False,
+        on_bad_lines="warn",
+    )
+
+    if repair.actions:
+        logger.info("Pre-parse repair on {}: {}", source.name, repair.summary())
+    return frame, repair
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -1,5 +1,7 @@
 """Tests for src.core.io — file reading, encoding/delimiter detection."""

+import io
+
 import pandas as pd
 import pytest
 from pathlib import Path
@@ -11,6 +13,8 @@ from src.core.io import (
    read_file,
    write_file,
    list_sheets,
+    repair_bytes,
+    read_csv_repaired,
 )


@@ -128,3 +132,98 @@ class TestListSheets:
            simple_df.to_excel(writer, sheet_name="Sheet2", index=False)
        sheets = list_sheets(path)
        assert sheets == ["Sheet1", "Sheet2"]
+
+
+# ---------------------------------------------------------------------------
+# Pre-parse repair
+# ---------------------------------------------------------------------------
+
+class TestRepairBytes:
+    """Byte-level behaviour of ``repair_bytes`` with default options."""
+
+    def test_strips_bom(self):
+        """A leading UTF-8 BOM is removed and reported as ``strip_bom``."""
+        raw = b"\xef\xbb\xbfid,name\n1,Alice\n"
+        result = repair_bytes(raw)
+        assert result.repaired_bytes == b"id,name\n1,Alice\n"
+        assert any(a.kind == "strip_bom" for a in result.actions)
+
+    def test_strips_nul_bytes(self):
+        """Embedded NULs are removed and the action detail carries the count."""
+        raw = b"id,name\n1,Hel\x00lo\n2,Wo\x00\x00rld\n"
+        result = repair_bytes(raw)
+        assert b"\x00" not in result.repaired_bytes
+        nul_action = next(a for a in result.actions if a.kind == "strip_nul")
+        assert "3" in nul_action.detail  # 3 NUL bytes
+
+    def test_folds_smart_double_quotes(self):
+        """Curly double quotes and guillemets disappear from the output."""
+        raw = "id,note\n1,“hello”\n2,«bonjour»\n".encode("utf-8")
+        result = repair_bytes(raw)
+        text = result.repaired_bytes.decode("utf-8")
+        assert "“" not in text and "”" not in text
+        assert "«" not in text and "»" not in text
+        assert any(a.kind == "fold_smart_quote" for a in result.actions)
+
+    def test_does_not_fold_curly_singles(self):
+        """A curly apostrophe survives and no fold action is logged."""
+        # Single curly quotes should pass through; cell-level cleaner handles them.
+        raw = "id,note\n1,it’s fine\n".encode("utf-8")
+        result = repair_bytes(raw)
+        text = result.repaired_bytes.decode("utf-8")
+        assert "’" in text
+        assert not any(a.kind == "fold_smart_quote" for a in result.actions)
+
+    def test_no_changes_when_clean(self):
+        """Well-formed input comes back byte-identical with an empty log."""
+        raw = b"id,name\n1,Alice\n2,Bob\n"
+        result = repair_bytes(raw)
+        assert result.repaired_bytes == raw
+        assert result.actions == []
+        assert result.changed is False
+
+    def test_repairs_unquoted_currency_comma(self):
+        """A row split by an unquoted currency comma is merged back to 3 fields."""
+        raw = (
+            b"id,price,qty\n"
+            b"1,100,5\n"
+            b"2,  $1,500.00  ,7\n"   # 4 fields instead of 3
+            b"3,200,9\n"
+        )
+        result = repair_bytes(raw)
+        # After repair, every row should have 3 fields when re-parsed.
+        df = pd.read_csv(io.BytesIO(result.repaired_bytes))
+        assert list(df.columns) == ["id", "price", "qty"]
+        assert len(df) == 3
+        assert any(a.kind == "quote_unquoted_delim" and a.line == 3 for a in result.actions)
+
+    def test_logs_unrepairable_when_ambiguous(self):
+        """A row more than one field too long is reported, not repaired."""
+        # Six fields against a four-column header: only rows with exactly one
+        # extra field are ever merged, so this one is logged as unrepairable.
+        raw = (
+            b"id,a,b,c\n"
+            b"1,foo,bar,baz\n"
+            b"2,1,2,3,4,5\n"   # way too many extras, no clear merge
+        )
+        result = repair_bytes(raw)
+        assert 3 in result.unrepairable_lines
+
+    def test_summary_groups_by_kind(self):
+        """``summary()`` counts one ``strip_bom`` and one ``strip_nul``."""
+        raw = b"\xef\xbb\xbfid,name\n1,Hel\x00lo\n"
+        result = repair_bytes(raw)
+        summary = result.summary()
+        assert summary.get("strip_bom") == 1
+        assert summary.get("strip_nul") == 1
+
+
+class TestReadCsvRepaired:
+    """End-to-end checks for ``read_csv_repaired`` on files under ``tmp_path``."""
+
+    def test_recovers_malformed_currency_row(self, tmp_path):
+        """The split currency row loads as one row with the price re-joined."""
+        f = tmp_path / "bad.csv"
+        f.write_bytes(
+            b"id,price,qty\n"
+            b"1,100,5\n"
+            b"2,  $1,500.00  ,7\n"
+            b"3,200,9\n"
+        )
+        df, repair = read_csv_repaired(f)
+        assert len(df) == 3
+        assert "1,500.00" in df.iloc[1]["price"]
+        assert repair.changed
+
+    def test_passthrough_when_clean(self, tmp_path):
+        """A clean file loads normally and reports no repairs."""
+        f = tmp_path / "ok.csv"
+        f.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
+        df, repair = read_csv_repaired(f)
+        assert len(df) == 2
+        assert repair.changed is False