feat(io): route read_file through pre-parse repair by default

Previously only analyze() and direct read_csv_repaired() callers got the
byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold,
unquoted-delimiter merge). The dedup CLI and any other read_file consumer
silently missed it.

read_file gains a repair=True default. CSV/TSV inputs run through
repair_bytes before pandas sees them; Excel inputs still pass through
unchanged. Chunked reads (chunk_size set) bypass repair because the
pre-parse pass loads the whole file; skipping it preserves streaming
behavior on huge files. Repair actions are logged at INFO and
unrepairable lines at WARNING.

cli_text_clean opts out (repair=False): the cleaner offers fine-grained
control via --preset and per-op flags, and a byte-level smart-quote fold
under the user's "minimal" preset would violate that contract. The
cell-level cleaner does the equivalent work itself when its options ask
for it.

Tests: read_file default strips BOM and folds curly double quotes;
repair=False preserves smart quotes; chunked reads still work and skip
repair as documented.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 16:09:35 +00:00
parent 0b959dee93
commit 0671ef277e
3 changed files with 91 additions and 9 deletions

View File

@@ -280,6 +280,10 @@ def clean(
encoding=encoding_override, encoding=encoding_override,
header_row=header_row, header_row=header_row,
sheet_name=sheet_arg if sheet_arg is not None else 0, sheet_name=sheet_arg if sheet_arg is not None else 0,
# Bypass byte-level repair so the user's preset/flag choices
# remain authoritative. The cell-level cleaner does the
# smart-quote / NUL / BOM work itself.
repair=False,
) )
if not isinstance(df, pd.DataFrame): if not isinstance(df, pd.DataFrame):
df = pd.concat(list(df), ignore_index=True) df = pd.concat(list(df), ignore_index=True)

View File

@@ -137,6 +137,7 @@ def read_file(
header_row: Optional[int] = None, header_row: Optional[int] = None,
sheet_name: Optional[str | int] = 0, sheet_name: Optional[str | int] = 0,
chunk_size: Optional[int] = None, chunk_size: Optional[int] = None,
repair: bool = True,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]: ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
"""Read a CSV, TSV, or Excel file into a DataFrame. """Read a CSV, TSV, or Excel file into a DataFrame.
@@ -147,7 +148,13 @@ def read_file(
delimiter : override detected delimiter (CSV only) delimiter : override detected delimiter (CSV only)
header_row : 0-based row index for the header; auto-detected if *None* header_row : 0-based row index for the header; auto-detected if *None*
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV. sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
chunk_size : if set, return a generator of DataFrames (CSV only). chunk_size : if set, return a generator of DataFrames (CSV only). When
*chunk_size* is set, *repair* is forced off because the pre-parse
pass loads the entire file into memory.
repair : run :func:`repair_bytes` over the raw CSV before parsing
(default ``True``). Excel files always skip this step. Pass
``repair=False`` when you specifically need pandas' raw view of
the input.
Returns a DataFrame (or generator when *chunk_size* is set). Returns a DataFrame (or generator when *chunk_size* is set).
""" """
@@ -165,6 +172,7 @@ def read_file(
delimiter=delimiter, delimiter=delimiter,
header_row=header_row, header_row=header_row,
chunk_size=chunk_size, chunk_size=chunk_size,
repair=repair,
) )
@@ -175,15 +183,56 @@ def _read_csv(
delimiter: Optional[str] = None, delimiter: Optional[str] = None,
header_row: Optional[int] = None, header_row: Optional[int] = None,
chunk_size: Optional[int] = None, chunk_size: Optional[int] = None,
repair: bool = True,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]: ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
enc = encoding or detect_encoding(path) enc = encoding or detect_encoding(path)
delim = delimiter or detect_delimiter(path, enc) delim = delimiter or detect_delimiter(path, enc)
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim) hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})", logger.debug(
path.name, enc, delim, hdr) "Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
path.name, enc, delim, hdr, repair,
)
kwargs: dict = dict( if chunk_size:
# Streaming reads can't share memory with the repair pass; fall back
# to direct pandas read so chunked workflows on huge files still
# work.
return pd.read_csv(
filepath_or_buffer=path,
encoding=enc,
delimiter=delim,
header=hdr,
dtype=str,
keep_default_na=False,
on_bad_lines="warn",
chunksize=chunk_size,
)
if repair:
raw = path.read_bytes()
repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
if repair_result.changed:
logger.info(
"Pre-parse repair on {}: {}", path.name, repair_result.summary(),
)
if repair_result.unrepairable_lines:
logger.warning(
"Pre-parse repair on {}: {} unrepairable line(s) at {}",
path.name, len(repair_result.unrepairable_lines),
repair_result.unrepairable_lines[:10],
)
return pd.read_csv(
io.BytesIO(repair_result.repaired_bytes),
encoding="utf-8",
delimiter=delim,
header=hdr,
dtype=str,
keep_default_na=False,
on_bad_lines="warn",
)
return pd.read_csv(
filepath_or_buffer=path, filepath_or_buffer=path,
encoding=enc, encoding=enc,
delimiter=delim, delimiter=delim,
@@ -193,11 +242,6 @@ def _read_csv(
on_bad_lines="warn", on_bad_lines="warn",
) )
if chunk_size:
return pd.read_csv(**kwargs, chunksize=chunk_size)
return pd.read_csv(**kwargs)
def _read_excel( def _read_excel(
path: Path, path: Path,

View File

@@ -207,6 +207,40 @@ class TestRepairBytes:
assert summary.get("strip_nul") == 1 assert summary.get("strip_nul") == 1
class TestReadFileWithRepair:
    """``read_file(repair=True)`` (default) routes CSV through repair_bytes."""

    def test_default_strips_bom_via_repair(self, tmp_path):
        """A leading UTF-8 BOM must not leak into the first column name."""
        f = tmp_path / "bom.csv"
        f.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")

        df = read_file(f)

        # First column header must be 'id', not '\ufeffid'.
        assert list(df.columns)[0] == "id"

    def test_default_folds_smart_double_quotes(self, tmp_path):
        """Curly double quotes are folded to ASCII quotes by default."""
        # Curly quotes are *unquoted* here — outer ASCII quotes would create
        # a CSV-quoting collision once the fold runs.
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))

        df = read_file(f)

        assert df.iloc[0]["note"] == 'curly "hello" world'

    def test_repair_false_preserves_smart_quotes(self, tmp_path):
        """With ``repair=False`` the curly quotes reach the DataFrame as-is."""
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))

        df = read_file(f, repair=False)

        note = df.iloc[0]["note"]
        # The needles are written as escapes on purpose: an empty-string
        # needle makes ``in`` true for every string, which would turn this
        # check into a no-op. Both the opening and the closing curly quote
        # must survive.
        assert "\u201c" in note and "\u201d" in note

    def test_chunked_read_skips_repair(self, tmp_path):
        """Chunked reads still return every row when repair is bypassed."""
        # Chunked reads bypass repair (memory budget). Verify they still work.
        rows = "id,name\n" + "\n".join(f"{i},Alice" for i in range(1, 21))
        f = tmp_path / "chunked.csv"
        f.write_text(rows)

        chunks = list(read_file(f, chunk_size=5))

        total = sum(len(c) for c in chunks)
        assert total == 20
class TestReadCsvRepaired: class TestReadCsvRepaired:
def test_recovers_malformed_currency_row(self, tmp_path): def test_recovers_malformed_currency_row(self, tmp_path):
f = tmp_path / "bad.csv" f = tmp_path / "bad.csv"