feat(io): pre-parse CSV repair (BOM/NUL/smart-quotes/unquoted-delim)

Some pollution patterns block pandas before the cell-level cleaner can run.
Add a pre-parse pass on raw bytes that fixes only what breaks parsing, and
returns a structured action log the GUI/CLI can surface to the user.

repair_bytes(raw, *, encoding, delimiter, fold_quotes, strip_nul, repair_delims):
  1. Strip leading UTF-8 BOM.
  2. Strip embedded NUL bytes (the C parser truncates fields at NUL).
  3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII '"'.
     Curly singles are NOT folded here; they don't conflict with CSV and the
     cell-level cleaner handles them more accurately.
  4. Per-row repair when one rogue delimiter is embedded in a field that
     looks like currency or thousands-grouped digits. Tiered scoring keeps
     "  $1,500.00  ,7" unambiguous: the strict currency regex match wins
     over the loose digit/sigil heuristic.

read_csv_repaired(path) -> (DataFrame, RepairResult). RepairResult exposes
.actions, .unrepairable_lines, and a summary() grouped by kind.

Out of scope for this pass: encoding repair, delimiter conversion, multi-
delimiter merges (k>1) — logged as unrepairable so callers can see what was
left alone instead of silently parsing wrong.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:37:49 +00:00
parent c349a90e18
commit b8a9fa1b09
2 changed files with 380 additions and 0 deletions

View File

@@ -4,6 +4,8 @@ from __future__ import annotations
import csv import csv
import io import io
import re
from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import Generator, Optional from typing import Generator, Optional
@@ -245,3 +247,282 @@ def write_file(
df.to_csv(out, index=False, encoding=encoding) df.to_csv(out, index=False, encoding=encoding)
logger.info("Wrote {} rows to {}", len(df), out) logger.info("Wrote {} rows to {}", len(df), out)
return out return out
# ---------------------------------------------------------------------------
# Pre-parse repair (CSV / delimited text)
# ---------------------------------------------------------------------------
#
# Some pollution patterns confuse pandas' parser before the cleaner can ever
# see the data. Smart double quotes inside an unquoted field, NUL bytes, and
# unquoted delimiters embedded in numeric/currency cells all cause structural
# parse failures or silent truncation. These helpers operate on raw bytes
# (or decoded text) and produce a parseable byte stream plus an audit log.
#
# Design notes:
# - Single curly quotes (U+2018/U+2019) are NOT folded here: they don't
# conflict with the default CSV quote char and the cell-level cleaner
# handles them more accurately. Only double-quote-equivalents are folded.
# - Delimiter-row repair only attempts the unambiguous case (one extra
# field, one merge candidate that looks like currency/thousands-sep).
# Anything else is logged as unrepairable and the line is left alone.
# Smart double-quote characters that confuse CSV parsing.
_CSV_SMART_QUOTE_TRANS = str.maketrans({
"": '"', # LEFT DOUBLE QUOTATION MARK
"": '"', # RIGHT DOUBLE QUOTATION MARK
"": '"', # DOUBLE LOW-9 QUOTATION MARK
"": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"": '"', # DOUBLE PRIME
})
# A merged value is "currency-shaped" when it looks like $1,500.00 or 1.234,56
# (i.e., a sequence of digits, separators, and an optional currency sigil).
# Concretely: an optional sigil ($, €, £ or ¥), a leading group of 1-3 digits,
# one or more 3-digit groups each introduced by ",", "." or a whitespace
# character, then an optional decimal tail introduced by "," or ".".
# Surrounding whitespace is tolerated. At least one 3-digit group is required,
# so a bare "12.50" does not match.
_CURRENCY_SHAPED = re.compile(r"^\s*[$€£¥]?\s*\d{1,3}([,.\s]\d{3})+([,.]\d+)?\s*$")
# Or a plain decimal with thousands grouping (no currency sigil): comma-grouped
# digits with an optional "."-introduced decimal tail, e.g. 1,234,567.89.
# NOTE(review): every string this accepts is also accepted by the pattern
# above (sigil optional, "," and "." are in its separator sets), so it acts as
# a stricter, self-documenting special case rather than widening the match.
_THOUSANDS_SHAPED = re.compile(r"^\s*\d{1,3}(,\d{3})+(\.\d+)?\s*$")
@dataclass
class RepairAction:
    """One repair the pre-parse pass made to the raw bytes.

    A plain record with three fields: a machine-readable ``kind``, the source
    ``line`` the repair applies to (``None`` when it concerns the file as a
    whole), and a human-readable ``detail`` message.
    """
    kind: str  # e.g. "strip_bom", "strip_nul", "fold_smart_quote",
               # "quote_unquoted_delim", "decode_replaced"
    line: Optional[int]  # 1-indexed source line; None for file-level
    detail: str  # free-text description intended for display in the action log
@dataclass
class RepairResult:
    """Everything :func:`repair_bytes` hands back to its caller.

    ``repaired_bytes`` is the cleaned payload, ``actions`` is the log of
    repairs that were applied, and ``unrepairable_lines`` holds the line
    numbers of rows that were found malformed but left untouched.
    """

    repaired_bytes: bytes
    actions: list[RepairAction] = field(default_factory=list)
    unrepairable_lines: list[int] = field(default_factory=list)

    @property
    def changed(self) -> bool:
        """Whether at least one repair action was recorded."""
        return True if self.actions else False

    def summary(self) -> dict[str, int]:
        """Tally the recorded actions per ``kind``, in first-seen order."""
        tally: dict[str, int] = {}
        for action in self.actions:
            tally.setdefault(action.kind, 0)
            tally[action.kind] += 1
        return tally
def _merge_score(left: str, right: str, delimiter: str) -> int:
    """Rank how plausible it is that ``left+delimiter+right`` is one field.

    Higher = more confident. ``0`` means the merge is implausible.

    - 3: merged value matches a currency-shaped or thousands-shaped pattern.
    - 1: loose heuristic (left has $/€/digit and right starts with digit, and
      delimiter is one of ``,``/``.``).
    - 0: no signal.

    Tiering matters because ``" $1,500.00 ,7"`` has two raw candidates
    (``$1+500.00`` and ``500.00+7``) but only the first produces a strict
    currency shape.

    Args:
        left: Text of the field before the suspect delimiter.
        right: Text of the field after the suspect delimiter.
        delimiter: The delimiter that split the two fields.

    Returns:
        ``3``, ``1`` or ``0`` as described above.
    """
    merged = f"{left}{delimiter}{right}"
    if _CURRENCY_SHAPED.match(merged) or _THOUSANDS_SHAPED.match(merged):
        return 3

    # Exact membership, not a substring test: ``delimiter in ".,"`` is also
    # true for the empty string and for the two-character string ".,".
    if delimiter not in (",", "."):
        return 0

    # "Money on the left" means either a sigil directly followed by digits, or
    # simply a value whose last non-blank character is a digit.
    left_has_money = bool(
        re.search(r"[$€£¥]\s*\d", left) or re.search(r"\d\s*$", left)
    )
    right_starts_digits = bool(re.match(r"\s*\d", right))
    return 1 if left_has_money and right_starts_digits else 0
def _repair_extra_field_row(
fields: list[str], expected: int, delimiter: str,
) -> Optional[list[str]]:
"""Try to merge one adjacent pair so the row has *expected* fields.
Returns the repaired field list, or *None* if no unambiguous merge exists.
"""
if len(fields) != expected + 1:
return None
scores = [
(i, _merge_score(fields[i], fields[i + 1], delimiter))
for i in range(len(fields) - 1)
]
best = max(s for _, s in scores)
if best == 0:
return None
winners = [i for i, s in scores if s == best]
if len(winners) != 1:
return None
i = winners[0]
merged = f"{fields[i]}{delimiter}{fields[i + 1]}"
return fields[:i] + [merged] + fields[i + 2:]
def repair_bytes(
    raw: bytes,
    *,
    encoding: str = "utf-8",
    delimiter: str = ",",
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> RepairResult:
    """Pre-parse repair on a raw delimited file.

    Performs, in order:

    1. Strip a leading UTF-8 BOM (always; there is no toggle for this step).
    2. Strip embedded NULs (the C parser truncates fields at NUL) when
       *strip_nul* is set.
    3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII
       ``"`` when *fold_quotes* is set.
    4. Per-row repair when one rogue delimiter is embedded in a field that
       looks like currency or thousands-grouped digits, when *repair_delims*
       is set.

    Single curly quotes and other punctuation are deferred to the cell-level
    cleaner; this layer only fixes things that break CSV *parsing*.

    Args:
        raw: File content exactly as read from disk.
        encoding: Codec used to decode *raw*. If decoding fails (or the codec
            is unknown) the data is decoded as UTF-8 with replacement
            characters and a ``decode_replaced`` action is logged.
        delimiter: Field delimiter used for the per-row repair.
        fold_quotes: Enable step 3.
        strip_nul: Enable step 2.
        repair_delims: Enable step 4.

    Returns:
        A :class:`RepairResult` whose ``repaired_bytes`` are always UTF-8
        encoded, regardless of *encoding*.
    """
    actions: list[RepairAction] = []
    unrepairable: list[int] = []
    data = raw

    # 1. BOM
    if data.startswith(b"\xef\xbb\xbf"):
        data = data[3:]
        actions.append(RepairAction(
            kind="strip_bom", line=None, detail="UTF-8 BOM removed",
        ))

    # 2. NUL
    # Wide encodings (UTF-16/UTF-32) use 0x00 bytes as part of ordinary
    # characters, so deleting them from the raw bytes would destroy the file.
    # Probe the codec: if even "a" encodes with a NUL byte, defer the strip
    # until after decoding and remove U+0000 code points instead.
    try:
        nul_is_structural = b"\x00" in "a".encode(encoding)
    except (LookupError, UnicodeError):
        nul_is_structural = False

    if strip_nul and not nul_is_structural and b"\x00" in data:
        before = data.count(b"\x00")
        data = data.replace(b"\x00", b"")
        actions.append(RepairAction(
            kind="strip_nul", line=None,
            detail=f"removed {before} NUL byte(s)",
        ))

    # Decode for character-level work.
    try:
        text = data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        text = data.decode("utf-8", errors="replace")
        actions.append(RepairAction(
            kind="decode_replaced", line=None,
            detail=f"decode errors under {encoding}; replaced with U+FFFD",
        ))

    if strip_nul and nul_is_structural and "\x00" in text:
        nul_chars = text.count("\x00")
        text = text.replace("\x00", "")
        actions.append(RepairAction(
            kind="strip_nul", line=None,
            detail=f"removed {nul_chars} NUL character(s)",
        ))

    # 3. Smart double quotes
    if fold_quotes:
        folded = text.translate(_CSV_SMART_QUOTE_TRANS)
        if folded != text:
            # The table maps one character to one character, so both strings
            # have equal length and differing positions are the replacements.
            n = sum(1 for a, b in zip(text, folded) if a != b)
            actions.append(RepairAction(
                kind="fold_smart_quote", line=None,
                detail=f"replaced {n} smart double-quote char(s) with ASCII '\"'",
            ))
            text = folded

    # 4. Per-row delimiter repair
    if repair_delims:
        text, row_actions, unrepairable = _repair_rows(text, delimiter)
        actions.extend(row_actions)

    return RepairResult(
        repaired_bytes=text.encode("utf-8"),
        actions=actions,
        unrepairable_lines=unrepairable,
    )
def _repair_rows(
text: str, delimiter: str,
) -> tuple[str, list[RepairAction], list[int]]:
"""Per-line field-count repair. Operates on already-decoded text."""
actions: list[RepairAction] = []
unrepairable: list[int] = []
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
rows = list(reader)
if not rows:
return text, actions, unrepairable
expected = len(rows[0])
repaired_rows: list[list[str]] = [rows[0]]
needs_rewrite = False
for idx, row in enumerate(rows[1:], start=2): # 1-indexed; header is line 1
if len(row) == expected or not row:
repaired_rows.append(row)
continue
if len(row) > expected:
fixed = _repair_extra_field_row(row, expected, delimiter)
if fixed is not None:
repaired_rows.append(fixed)
needs_rewrite = True
actions.append(RepairAction(
kind="quote_unquoted_delim", line=idx,
detail=(
f"line {idx}: merged adjacent fields to fix "
f"unquoted '{delimiter}' (saw {len(row)} fields, "
f"expected {expected})"
),
))
continue
unrepairable.append(idx)
repaired_rows.append(row)
else:
# Too few fields: leave alone, log info-level only.
unrepairable.append(idx)
repaired_rows.append(row)
if not needs_rewrite:
return text, actions, unrepairable
buf = io.StringIO()
writer = csv.writer(buf, delimiter=delimiter, lineterminator="\n")
for row in repaired_rows:
writer.writerow(row)
return buf.getvalue(), actions, unrepairable
def read_csv_repaired(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> tuple[pd.DataFrame, RepairResult]:
    """Load a CSV through :func:`repair_bytes` and parse the repaired payload.

    Encoding and delimiter are auto-detected from the file when not supplied.
    The repaired bytes are parsed as UTF-8 with every column read as ``str``
    and default NA conversion disabled; rows pandas still rejects produce a
    warning instead of an error.

    Args:
        path: Location of the CSV file.
        encoding: Source encoding; detected when falsy.
        delimiter: Field delimiter; detected when falsy.
        header_row: Row index to use as the header; ``None`` means row 0.
        fold_quotes: Forwarded to :func:`repair_bytes`.
        strip_nul: Forwarded to :func:`repair_bytes`.
        repair_delims: Forwarded to :func:`repair_bytes`.

    Returns:
        ``(df, repair_result)`` so callers can surface the action log.
    """
    source = Path(path)
    resolved_encoding = encoding or detect_encoding(source)
    resolved_delimiter = delimiter or detect_delimiter(source, resolved_encoding)

    result = repair_bytes(
        source.read_bytes(),
        encoding=resolved_encoding,
        delimiter=resolved_delimiter,
        fold_quotes=fold_quotes,
        strip_nul=strip_nul,
        repair_delims=repair_delims,
    )

    frame = pd.read_csv(
        io.BytesIO(result.repaired_bytes),
        encoding="utf-8",
        delimiter=resolved_delimiter,
        header=0 if header_row is None else header_row,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )

    # Only mention the repair pass in the log when it actually did something.
    if result.actions:
        logger.info("Pre-parse repair on {}: {}", source.name, result.summary())
    return frame, result

View File

@@ -1,5 +1,7 @@
"""Tests for src.core.io — file reading, encoding/delimiter detection.""" """Tests for src.core.io — file reading, encoding/delimiter detection."""
import io
import pandas as pd import pandas as pd
import pytest import pytest
from pathlib import Path from pathlib import Path
@@ -11,6 +13,8 @@ from src.core.io import (
read_file, read_file,
write_file, write_file,
list_sheets, list_sheets,
repair_bytes,
read_csv_repaired,
) )
@@ -128,3 +132,98 @@ class TestListSheets:
simple_df.to_excel(writer, sheet_name="Sheet2", index=False) simple_df.to_excel(writer, sheet_name="Sheet2", index=False)
sheets = list_sheets(path) sheets = list_sheets(path)
assert sheets == ["Sheet1", "Sheet2"] assert sheets == ["Sheet1", "Sheet2"]
# ---------------------------------------------------------------------------
# Pre-parse repair
# ---------------------------------------------------------------------------
class TestRepairBytes:
    """Unit tests for repair_bytes: one test per repair step plus the log API.

    Non-ASCII fixtures are written as escapes so the assertions cannot be
    silently hollowed out by tools that drop or mangle curly punctuation.
    """

    def test_strips_bom(self):
        raw = b"\xef\xbb\xbfid,name\n1,Alice\n"
        result = repair_bytes(raw)
        assert result.repaired_bytes == b"id,name\n1,Alice\n"
        assert any(a.kind == "strip_bom" for a in result.actions)

    def test_strips_nul_bytes(self):
        raw = b"id,name\n1,Hel\x00lo\n2,Wo\x00\x00rld\n"
        result = repair_bytes(raw)
        assert b"\x00" not in result.repaired_bytes
        nul_action = next(a for a in result.actions if a.kind == "strip_nul")
        assert "3" in nul_action.detail  # 3 NUL bytes

    def test_folds_smart_double_quotes(self):
        # U+201C/U+201D curly doubles and U+00AB/U+00BB guillemets.
        raw = "id,note\n1,\u201chello\u201d\n2,\u00abbonjour\u00bb\n".encode("utf-8")
        result = repair_bytes(raw)
        text = result.repaired_bytes.decode("utf-8")
        assert "\u201c" not in text and "\u201d" not in text
        assert "\u00ab" not in text and "\u00bb" not in text
        assert any(a.kind == "fold_smart_quote" for a in result.actions)

    def test_does_not_fold_curly_singles(self):
        # Single curly quotes should pass through; cell-level cleaner handles them.
        raw = "id,note\n1,it\u2019s fine\n".encode("utf-8")
        result = repair_bytes(raw)
        text = result.repaired_bytes.decode("utf-8")
        assert "\u2019" in text
        assert not any(a.kind == "fold_smart_quote" for a in result.actions)

    def test_no_changes_when_clean(self):
        raw = b"id,name\n1,Alice\n2,Bob\n"
        result = repair_bytes(raw)
        assert result.repaired_bytes == raw
        assert result.actions == []
        assert result.changed is False

    def test_repairs_unquoted_currency_comma(self):
        raw = (
            b"id,price,qty\n"
            b"1,100,5\n"
            b"2, $1,500.00 ,7\n"  # 4 fields instead of 3
            b"3,200,9\n"
        )
        result = repair_bytes(raw)
        # After repair, every row should have 3 fields when re-parsed.
        df = pd.read_csv(io.BytesIO(result.repaired_bytes))
        assert list(df.columns) == ["id", "price", "qty"]
        assert len(df) == 3
        assert any(a.kind == "quote_unquoted_delim" and a.line == 3 for a in result.actions)

    def test_logs_unrepairable_when_ambiguous(self):
        # Two surplus fields would need more than one merge, which is out of
        # scope for the repair pass -> the row is logged as unrepairable.
        raw = (
            b"id,a,b,c\n"
            b"1,foo,bar,baz\n"
            b"2,1,2,3,4,5\n"  # 6 fields instead of 4, no single merge fixes it
        )
        result = repair_bytes(raw)
        assert 3 in result.unrepairable_lines

    def test_summary_groups_by_kind(self):
        raw = b"\xef\xbb\xbfid,name\n1,Hel\x00lo\n"
        result = repair_bytes(raw)
        summary = result.summary()
        assert summary.get("strip_bom") == 1
        assert summary.get("strip_nul") == 1
class TestReadCsvRepaired:
    """End-to-end checks for read_csv_repaired against files on disk."""

    def test_recovers_malformed_currency_row(self, tmp_path):
        # The second data row carries an unquoted comma inside the price cell.
        csv_path = tmp_path / "bad.csv"
        csv_path.write_bytes(
            b"id,price,qty\n"
            b"1,100,5\n"
            b"2, $1,500.00 ,7\n"
            b"3,200,9\n"
        )

        frame, result = read_csv_repaired(csv_path)

        assert len(frame) == 3
        assert "1,500.00" in frame.iloc[1]["price"]
        assert result.changed

    def test_passthrough_when_clean(self, tmp_path):
        # A well-formed file must load unchanged and report no repairs.
        csv_path = tmp_path / "ok.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")

        frame, result = read_csv_repaired(csv_path)

        assert len(frame) == 2
        assert result.changed is False