feat(io): route read_file through pre-parse repair by default
Previously only analyze() and direct read_csv_repaired() callers got the byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold, unquoted-delimiter merge). The dedup CLI and any other read_file consumer silently missed it. read_file gains a repair=True default. CSV/TSV inputs run through repair_bytes before pandas sees them; Excel inputs still pass through unchanged. Chunked reads (chunk_size set) bypass repair because the pre-parse pass loads the whole file — preserving streaming behavior on huge files. Repair actions and unrepairable lines are logged at INFO/WARNING. cli_text_clean opts out (repair=False): the cleaner offers fine-grained control via --preset and per-op flags, and a byte-level smart-quote fold under the user's "minimal" preset would violate that contract. The cell-level cleaner does the equivalent work itself when its options ask for it. Tests: read_file default strips BOM and folds curly double quotes; repair=False preserves smart quotes; chunked reads still work and skip repair as documented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -280,6 +280,10 @@ def clean(
|
|||||||
encoding=encoding_override,
|
encoding=encoding_override,
|
||||||
header_row=header_row,
|
header_row=header_row,
|
||||||
sheet_name=sheet_arg if sheet_arg is not None else 0,
|
sheet_name=sheet_arg if sheet_arg is not None else 0,
|
||||||
|
# Bypass byte-level repair so the user's preset/flag choices
|
||||||
|
# remain authoritative. The cell-level cleaner does the
|
||||||
|
# smart-quote / NUL / BOM work itself.
|
||||||
|
repair=False,
|
||||||
)
|
)
|
||||||
if not isinstance(df, pd.DataFrame):
|
if not isinstance(df, pd.DataFrame):
|
||||||
df = pd.concat(list(df), ignore_index=True)
|
df = pd.concat(list(df), ignore_index=True)
|
||||||
|
|||||||
@@ -137,6 +137,7 @@ def read_file(
|
|||||||
header_row: Optional[int] = None,
|
header_row: Optional[int] = None,
|
||||||
sheet_name: Optional[str | int] = 0,
|
sheet_name: Optional[str | int] = 0,
|
||||||
chunk_size: Optional[int] = None,
|
chunk_size: Optional[int] = None,
|
||||||
|
repair: bool = True,
|
||||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||||
"""Read a CSV, TSV, or Excel file into a DataFrame.
|
"""Read a CSV, TSV, or Excel file into a DataFrame.
|
||||||
|
|
||||||
@@ -147,7 +148,13 @@ def read_file(
|
|||||||
delimiter : override detected delimiter (CSV only)
|
delimiter : override detected delimiter (CSV only)
|
||||||
header_row : 0-based row index for the header; auto-detected if *None*
|
header_row : 0-based row index for the header; auto-detected if *None*
|
||||||
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
|
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
|
||||||
chunk_size : if set, return a generator of DataFrames (CSV only).
|
chunk_size : if set, return a generator of DataFrames (CSV only). When
|
||||||
|
*chunk_size* is set, *repair* is forced off because the pre-parse
|
||||||
|
pass loads the entire file into memory.
|
||||||
|
repair : run :func:`repair_bytes` over the raw CSV before parsing
|
||||||
|
(default ``True``). Excel files always skip this step. Pass
|
||||||
|
``repair=False`` when you specifically need pandas' raw view of
|
||||||
|
the input.
|
||||||
|
|
||||||
Returns a DataFrame (or generator when *chunk_size* is set).
|
Returns a DataFrame (or generator when *chunk_size* is set).
|
||||||
"""
|
"""
|
||||||
@@ -165,6 +172,7 @@ def read_file(
|
|||||||
delimiter=delimiter,
|
delimiter=delimiter,
|
||||||
header_row=header_row,
|
header_row=header_row,
|
||||||
chunk_size=chunk_size,
|
chunk_size=chunk_size,
|
||||||
|
repair=repair,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -175,15 +183,56 @@ def _read_csv(
|
|||||||
delimiter: Optional[str] = None,
|
delimiter: Optional[str] = None,
|
||||||
header_row: Optional[int] = None,
|
header_row: Optional[int] = None,
|
||||||
chunk_size: Optional[int] = None,
|
chunk_size: Optional[int] = None,
|
||||||
|
repair: bool = True,
|
||||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||||
enc = encoding or detect_encoding(path)
|
enc = encoding or detect_encoding(path)
|
||||||
delim = delimiter or detect_delimiter(path, enc)
|
delim = delimiter or detect_delimiter(path, enc)
|
||||||
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
|
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
|
||||||
|
|
||||||
logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
|
logger.debug(
|
||||||
path.name, enc, delim, hdr)
|
"Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
|
||||||
|
path.name, enc, delim, hdr, repair,
|
||||||
|
)
|
||||||
|
|
||||||
kwargs: dict = dict(
|
if chunk_size:
|
||||||
|
# The repair pass must load the whole file into memory, which defeats streaming; fall back
|
||||||
|
# to direct pandas read so chunked workflows on huge files still
|
||||||
|
# work.
|
||||||
|
return pd.read_csv(
|
||||||
|
filepath_or_buffer=path,
|
||||||
|
encoding=enc,
|
||||||
|
delimiter=delim,
|
||||||
|
header=hdr,
|
||||||
|
dtype=str,
|
||||||
|
keep_default_na=False,
|
||||||
|
on_bad_lines="warn",
|
||||||
|
chunksize=chunk_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
if repair:
|
||||||
|
raw = path.read_bytes()
|
||||||
|
repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
|
||||||
|
if repair_result.changed:
|
||||||
|
logger.info(
|
||||||
|
"Pre-parse repair on {}: {}", path.name, repair_result.summary(),
|
||||||
|
)
|
||||||
|
if repair_result.unrepairable_lines:
|
||||||
|
logger.warning(
|
||||||
|
"Pre-parse repair on {}: {} unrepairable line(s) at {}",
|
||||||
|
path.name, len(repair_result.unrepairable_lines),
|
||||||
|
repair_result.unrepairable_lines[:10],
|
||||||
|
)
|
||||||
|
return pd.read_csv(
|
||||||
|
io.BytesIO(repair_result.repaired_bytes),
|
||||||
|
encoding="utf-8",
|
||||||
|
delimiter=delim,
|
||||||
|
header=hdr,
|
||||||
|
dtype=str,
|
||||||
|
keep_default_na=False,
|
||||||
|
on_bad_lines="warn",
|
||||||
|
)
|
||||||
|
|
||||||
|
return pd.read_csv(
|
||||||
filepath_or_buffer=path,
|
filepath_or_buffer=path,
|
||||||
encoding=enc,
|
encoding=enc,
|
||||||
delimiter=delim,
|
delimiter=delim,
|
||||||
@@ -193,11 +242,6 @@ def _read_csv(
|
|||||||
on_bad_lines="warn",
|
on_bad_lines="warn",
|
||||||
)
|
)
|
||||||
|
|
||||||
if chunk_size:
|
|
||||||
return pd.read_csv(**kwargs, chunksize=chunk_size)
|
|
||||||
|
|
||||||
return pd.read_csv(**kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def _read_excel(
|
def _read_excel(
|
||||||
path: Path,
|
path: Path,
|
||||||
|
|||||||
@@ -207,6 +207,40 @@ class TestRepairBytes:
|
|||||||
assert summary.get("strip_nul") == 1
|
assert summary.get("strip_nul") == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadFileWithRepair:
|
||||||
|
"""``read_file(repair=True)`` (default) routes CSV through repair_bytes."""
|
||||||
|
|
||||||
|
def test_default_strips_bom_via_repair(self, tmp_path):
|
||||||
|
f = tmp_path / "bom.csv"
|
||||||
|
f.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
|
||||||
|
df = read_file(f)
|
||||||
|
# First column header must be 'id', not '\ufeffid' (i.e. the BOM must not leak into the name).
|
||||||
|
assert list(df.columns)[0] == "id"
|
||||||
|
|
||||||
|
def test_default_folds_smart_double_quotes(self, tmp_path):
|
||||||
|
# Curly quotes are *unquoted* here — outer ASCII quotes would create
|
||||||
|
# a CSV-quoting collision once the fold runs.
|
||||||
|
f = tmp_path / "quoted.csv"
|
||||||
|
f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
|
||||||
|
df = read_file(f)
|
||||||
|
assert df.iloc[0]["note"] == 'curly "hello" world'
|
||||||
|
|
||||||
|
def test_repair_false_preserves_smart_quotes(self, tmp_path):
|
||||||
|
f = tmp_path / "quoted.csv"
|
||||||
|
f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
|
||||||
|
df = read_file(f, repair=False)
|
||||||
|
assert "“" in df.iloc[0]["note"] or "”" in df.iloc[0]["note"]
|
||||||
|
|
||||||
|
def test_chunked_read_skips_repair(self, tmp_path):
|
||||||
|
# Chunked reads bypass repair (memory budget). Verify they still work.
|
||||||
|
rows = "id,name\n" + "\n".join(f"{i},Alice" for i in range(1, 21))
|
||||||
|
f = tmp_path / "chunked.csv"
|
||||||
|
f.write_text(rows)
|
||||||
|
chunks = list(read_file(f, chunk_size=5))
|
||||||
|
total = sum(len(c) for c in chunks)
|
||||||
|
assert total == 20
|
||||||
|
|
||||||
|
|
||||||
class TestReadCsvRepaired:
|
class TestReadCsvRepaired:
|
||||||
def test_recovers_malformed_currency_row(self, tmp_path):
|
def test_recovers_malformed_currency_row(self, tmp_path):
|
||||||
f = tmp_path / "bad.csv"
|
f = tmp_path / "bad.csv"
|
||||||
|
|||||||
Reference in New Issue
Block a user