feat(io): pre-parse CSV repair (BOM/NUL/smart-quotes/unquoted-delim)
Some pollution patterns block pandas before the cell-level cleaner can run.
Add a pre-parse pass on raw bytes that fixes only what breaks parsing, and
returns a structured action log the GUI/CLI can surface to the user.
repair_bytes(raw, *, encoding, delimiter, fold_quotes, strip_nul, repair_delims):
1. Strip leading UTF-8 BOM.
2. Strip embedded NUL bytes (the C parser truncates fields at NUL).
3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII '"'.
Curly singles are NOT folded here; they don't conflict with CSV and the
cell-level cleaner handles them more accurately.
4. Per-row repair when one rogue delimiter is embedded in a field that
looks like currency or thousands-grouped digits. Tiered scoring keeps
" $1,500.00 ,7" unambiguous: the strict currency regex match wins
over the loose digit/sigil heuristic.
read_csv_repaired(path) -> (DataFrame, RepairResult). RepairResult exposes
.actions, .unrepairable_lines, and a summary() grouped by kind.
Out of scope for this pass: encoding repair, delimiter conversion, multi-
delimiter merges (k>1) — logged as unrepairable so callers can see what was
left alone instead of silently parsing wrong.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
281
src/core/io.py
281
src/core/io.py
@@ -4,6 +4,8 @@ from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Generator, Optional
|
||||
|
||||
@@ -245,3 +247,282 @@ def write_file(
|
||||
df.to_csv(out, index=False, encoding=encoding)
|
||||
logger.info("Wrote {} rows to {}", len(df), out)
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pre-parse repair (CSV / delimited text)
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Some pollution patterns confuse pandas' parser before the cleaner can ever
|
||||
# see the data. Smart double quotes inside an unquoted field, NUL bytes, and
|
||||
# unquoted delimiters embedded in numeric/currency cells all cause structural
|
||||
# parse failures or silent truncation. These helpers operate on raw bytes
|
||||
# (or decoded text) and produce a parseable byte stream plus an audit log.
|
||||
#
|
||||
# Design notes:
|
||||
# - Single curly quotes (U+2018/U+2019) are NOT folded here: they don't
|
||||
# conflict with the default CSV quote char and the cell-level cleaner
|
||||
# handles them more accurately. Only double-quote-equivalents are folded.
|
||||
# - Delimiter-row repair only attempts the unambiguous case (one extra
|
||||
# field, one merge candidate that looks like currency/thousands-sep).
|
||||
# Anything else is logged as unrepairable and the line is left alone.
|
||||
|
||||
# Smart double-quote characters that confuse CSV parsing.
|
||||
_CSV_SMART_QUOTE_TRANS = str.maketrans({
|
||||
"“": '"', # LEFT DOUBLE QUOTATION MARK
|
||||
"”": '"', # RIGHT DOUBLE QUOTATION MARK
|
||||
"„": '"', # DOUBLE LOW-9 QUOTATION MARK
|
||||
"‟": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
|
||||
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"″": '"', # DOUBLE PRIME
|
||||
})
|
||||
|
||||
# A merged value is "currency-shaped" when it looks like $1,500.00 or 1.234,56
|
||||
# (i.e., a sequence of digits, separators, and an optional currency sigil).
|
||||
_CURRENCY_SHAPED = re.compile(r"^\s*[$€£¥]?\s*\d{1,3}([,.\s]\d{3})+([,.]\d+)?\s*$")
|
||||
# Or a plain decimal with thousands grouping (no currency sigil).
|
||||
_THOUSANDS_SHAPED = re.compile(r"^\s*\d{1,3}(,\d{3})+(\.\d+)?\s*$")
|
||||
|
||||
|
||||
@dataclass
class RepairAction:
    """Audit-log entry describing a single change made by the pre-parse pass."""

    # Category tag, e.g. "strip_bom", "strip_nul", "fold_smart_quote",
    # "quote_unquoted_delim".
    kind: str
    # 1-indexed source line the change applies to; None for file-level changes.
    line: Optional[int]
    # Human-readable description of what was changed.
    detail: str
|
||||
|
||||
|
||||
@dataclass
class RepairResult:
    """What :func:`repair_bytes` hands back: repaired bytes plus an audit trail."""

    # The repaired byte stream.
    repaired_bytes: bytes
    # Every repair that was applied, in the order it was recorded.
    actions: list[RepairAction] = field(default_factory=list)
    # Line numbers of rows that looked wrong but were left untouched.
    unrepairable_lines: list[int] = field(default_factory=list)

    @property
    def changed(self) -> bool:
        """True when at least one repair action was recorded."""
        return len(self.actions) > 0

    def summary(self) -> dict[str, int]:
        """Return the number of recorded actions per ``kind``.

        Keys appear in order of first occurrence in ``actions``.
        """
        counts: dict[str, int] = {}
        for action in self.actions:
            counts.setdefault(action.kind, 0)
            counts[action.kind] += 1
        return counts
|
||||
|
||||
|
||||
def _merge_score(left: str, right: str, delimiter: str) -> int:
    """Rank how plausible it is that ``left + delimiter + right`` is one field.

    Args:
        left: Text of the field before the suspect delimiter.
        right: Text of the field after the suspect delimiter.
        delimiter: The delimiter that split them.

    Returns:
        - ``3`` when the re-joined value matches the strict currency-shaped
          or thousands-shaped pattern.
        - ``1`` when only the loose heuristic fires: the delimiter is ``,``
          or ``.``, *left* either contains a currency sigil followed by a
          digit or ends in a digit, and *right* starts with a digit.
        - ``0`` when there is no signal.

    Tiering matters because ``" $1,500.00 ,7"`` has two raw candidates
    (``$1+500.00`` and ``500.00+7``) but only the first produces a strict
    currency shape, so it wins outright.
    """
    merged = f"{left}{delimiter}{right}"
    if _CURRENCY_SHAPED.match(merged) or _THOUSANDS_SHAPED.match(merged):
        return 3

    # Exact membership, not a substring test: ``"" in ".,"`` is True, which
    # would let an empty delimiter through to the loose heuristic.
    if delimiter not in (",", "."):
        return 0

    left_has_money = bool(
        re.search(r"[$€£¥]\s*\d", left) or re.search(r"\d\s*$", left)
    )
    right_starts_digits = re.match(r"\s*\d", right) is not None
    return 1 if left_has_money and right_starts_digits else 0
|
||||
|
||||
|
||||
def _repair_extra_field_row(
|
||||
fields: list[str], expected: int, delimiter: str,
|
||||
) -> Optional[list[str]]:
|
||||
"""Try to merge one adjacent pair so the row has *expected* fields.
|
||||
|
||||
Returns the repaired field list, or *None* if no unambiguous merge exists.
|
||||
"""
|
||||
if len(fields) != expected + 1:
|
||||
return None
|
||||
scores = [
|
||||
(i, _merge_score(fields[i], fields[i + 1], delimiter))
|
||||
for i in range(len(fields) - 1)
|
||||
]
|
||||
best = max(s for _, s in scores)
|
||||
if best == 0:
|
||||
return None
|
||||
winners = [i for i, s in scores if s == best]
|
||||
if len(winners) != 1:
|
||||
return None
|
||||
i = winners[0]
|
||||
merged = f"{fields[i]}{delimiter}{fields[i + 1]}"
|
||||
return fields[:i] + [merged] + fields[i + 2:]
|
||||
|
||||
|
||||
def repair_bytes(
    raw: bytes,
    *,
    encoding: str = "utf-8",
    delimiter: str = ",",
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> RepairResult:
    """Pre-parse repair on a raw delimited file.

    Steps, in order:

    1. Strip a leading UTF-8 BOM (always; there is no toggle for this step).
    2. Decode with *encoding*. If that fails, or the codec name is unknown,
       decode as UTF-8 with replacement characters and record a
       ``decode_replaced`` action.
    3. Strip NUL characters (*strip_nul*). This runs on the decoded text
       rather than on the raw bytes, so encodings that legitimately contain
       0x00 bytes (UTF-16, UTF-32) are not corrupted.
    4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII
       ``"`` (*fold_quotes*).
    5. Per-row repair when one rogue delimiter is embedded in a field that
       looks like currency or thousands-grouped digits (*repair_delims*).

    Single curly quotes and other punctuation are deferred to the cell-level
    cleaner; this layer only fixes things that break CSV *parsing*.

    Args:
        raw: The file contents as bytes.
        encoding: Codec used to decode *raw*.
        delimiter: Field delimiter used by the per-row repair.
        fold_quotes: Enable smart double-quote folding.
        strip_nul: Enable NUL removal.
        repair_delims: Enable the per-row delimiter repair.

    Returns:
        A :class:`RepairResult` whose ``repaired_bytes`` are always UTF-8
        encoded, whatever *encoding* was. ``unrepairable_lines`` is empty
        when *repair_delims* is false.
    """
    actions: list[RepairAction] = []
    unrepairable: list[int] = []
    data = raw

    # 1. BOM
    if data.startswith(b"\xef\xbb\xbf"):
        data = data[3:]
        actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))

    # 2. Decode for character-level work.
    try:
        text = data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        text = data.decode("utf-8", errors="replace")
        actions.append(RepairAction(
            kind="decode_replaced", line=None,
            detail=f"decode errors under {encoding}; replaced with U+FFFD",
        ))

    # 3. NUL -- removed after decoding. Stripping 0x00 from the raw bytes
    # would delete half of every ASCII code unit in UTF-16/UTF-32 input.
    if strip_nul:
        nul_count = text.count("\x00")
        if nul_count:
            text = text.replace("\x00", "")
            actions.append(RepairAction(
                kind="strip_nul", line=None,
                detail=f"removed {nul_count} NUL byte(s)",
            ))

    # 4. Smart double quotes
    if fold_quotes:
        folded = text.translate(_CSV_SMART_QUOTE_TRANS)
        if folded != text:
            # The table maps one character to one character, so a
            # positional comparison counts the replacements exactly.
            n = sum(1 for a, b in zip(text, folded) if a != b)
            actions.append(RepairAction(
                kind="fold_smart_quote", line=None,
                detail=f"replaced {n} smart double-quote char(s) with ASCII '\"'",
            ))
            text = folded

    # 5. Per-row delimiter repair
    if repair_delims:
        text, row_actions, unrepairable = _repair_rows(text, delimiter)
        actions.extend(row_actions)

    return RepairResult(
        repaired_bytes=text.encode("utf-8"),
        actions=actions,
        unrepairable_lines=unrepairable,
    )
|
||||
|
||||
|
||||
def _repair_rows(
|
||||
text: str, delimiter: str,
|
||||
) -> tuple[str, list[RepairAction], list[int]]:
|
||||
"""Per-line field-count repair. Operates on already-decoded text."""
|
||||
actions: list[RepairAction] = []
|
||||
unrepairable: list[int] = []
|
||||
|
||||
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
||||
rows = list(reader)
|
||||
if not rows:
|
||||
return text, actions, unrepairable
|
||||
|
||||
expected = len(rows[0])
|
||||
repaired_rows: list[list[str]] = [rows[0]]
|
||||
needs_rewrite = False
|
||||
|
||||
for idx, row in enumerate(rows[1:], start=2): # 1-indexed; header is line 1
|
||||
if len(row) == expected or not row:
|
||||
repaired_rows.append(row)
|
||||
continue
|
||||
if len(row) > expected:
|
||||
fixed = _repair_extra_field_row(row, expected, delimiter)
|
||||
if fixed is not None:
|
||||
repaired_rows.append(fixed)
|
||||
needs_rewrite = True
|
||||
actions.append(RepairAction(
|
||||
kind="quote_unquoted_delim", line=idx,
|
||||
detail=(
|
||||
f"line {idx}: merged adjacent fields to fix "
|
||||
f"unquoted '{delimiter}' (saw {len(row)} fields, "
|
||||
f"expected {expected})"
|
||||
),
|
||||
))
|
||||
continue
|
||||
unrepairable.append(idx)
|
||||
repaired_rows.append(row)
|
||||
else:
|
||||
# Too few fields: leave alone, log info-level only.
|
||||
unrepairable.append(idx)
|
||||
repaired_rows.append(row)
|
||||
|
||||
if not needs_rewrite:
|
||||
return text, actions, unrepairable
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, delimiter=delimiter, lineterminator="\n")
|
||||
for row in repaired_rows:
|
||||
writer.writerow(row)
|
||||
return buf.getvalue(), actions, unrepairable
|
||||
|
||||
|
||||
def read_csv_repaired(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> tuple[pd.DataFrame, RepairResult]:
    """Load a delimited file through :func:`repair_bytes`, then pandas.

    Args:
        path: Location of the file to read.
        encoding: Source encoding; when falsy, ``detect_encoding`` is asked.
        delimiter: Field delimiter; when falsy, ``detect_delimiter`` is asked.
        header_row: Row index passed to pandas as ``header``; ``None`` means 0.
        fold_quotes: Forwarded to :func:`repair_bytes`.
        strip_nul: Forwarded to :func:`repair_bytes`.
        repair_delims: Forwarded to :func:`repair_bytes`.

    Returns:
        ``(df, repair_result)``: the parsed frame (every column read as
        ``str``, default NA detection off) and the repair log, so callers
        can surface what was changed.
    """
    source = Path(path)
    source_encoding = encoding or detect_encoding(source)
    field_delimiter = delimiter or detect_delimiter(source, source_encoding)

    result = repair_bytes(
        source.read_bytes(),
        encoding=source_encoding,
        delimiter=field_delimiter,
        fold_quotes=fold_quotes,
        strip_nul=strip_nul,
        repair_delims=repair_delims,
    )

    # The repaired stream is always UTF-8, whatever the source encoding was.
    frame = pd.read_csv(
        io.BytesIO(result.repaired_bytes),
        encoding="utf-8",
        delimiter=field_delimiter,
        header=0 if header_row is None else header_row,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )

    if result.actions:
        logger.info("Pre-parse repair on {}: {}", source.name, result.summary())
    return frame, result
|
||||
|
||||
Reference in New Issue
Block a user