feat(io): route read_file through pre-parse repair by default
Previously only analyze() and direct read_csv_repaired() callers got the byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold, unquoted-delimiter merge). The dedup CLI and any other read_file consumer silently missed it. read_file gains a repair=True default. CSV/TSV inputs run through repair_bytes before pandas sees them; Excel inputs still pass through unchanged. Chunked reads (chunk_size set) bypass repair because the pre-parse pass loads the whole file; skipping it preserves streaming behavior on huge files. Repair actions and unrepairable lines are logged at INFO/WARNING. cli_text_clean opts out (repair=False): the cleaner offers fine-grained control via --preset and per-op flags, and a byte-level smart-quote fold under the user's "minimal" preset would violate that contract. The cell-level cleaner does the equivalent work itself when its options ask for it. Tests: read_file default strips BOM and folds curly double quotes; repair=False preserves smart quotes; chunked reads still work and skip repair as documented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -137,6 +137,7 @@ def read_file(
|
||||
header_row: Optional[int] = None,
|
||||
sheet_name: Optional[str | int] = 0,
|
||||
chunk_size: Optional[int] = None,
|
||||
repair: bool = True,
|
||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||
"""Read a CSV, TSV, or Excel file into a DataFrame.
|
||||
|
||||
@@ -147,7 +148,13 @@ def read_file(
|
||||
delimiter : override detected delimiter (CSV only)
|
||||
header_row : 0-based row index for the header; auto-detected if *None*
|
||||
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
|
||||
chunk_size : if set, return a generator of DataFrames (CSV only).
|
||||
chunk_size : if set, return a generator of DataFrames (CSV only). When
|
||||
*chunk_size* is set, *repair* is forced off because the pre-parse
|
||||
pass loads the entire file into memory.
|
||||
repair : run :func:`repair_bytes` over the raw CSV before parsing
|
||||
(default ``True``). Excel files always skip this step. Pass
|
||||
``repair=False`` when you specifically need pandas' raw view of
|
||||
the input.
|
||||
|
||||
Returns a DataFrame (or generator when *chunk_size* is set).
|
||||
"""
|
||||
@@ -165,6 +172,7 @@ def read_file(
|
||||
delimiter=delimiter,
|
||||
header_row=header_row,
|
||||
chunk_size=chunk_size,
|
||||
repair=repair,
|
||||
)
|
||||
|
||||
|
||||
@@ -175,15 +183,56 @@ def _read_csv(
|
||||
delimiter: Optional[str] = None,
|
||||
header_row: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
repair: bool = True,
|
||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||
enc = encoding or detect_encoding(path)
|
||||
delim = delimiter or detect_delimiter(path, enc)
|
||||
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
|
||||
|
||||
logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
|
||||
path.name, enc, delim, hdr)
|
||||
logger.debug(
|
||||
"Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
|
||||
path.name, enc, delim, hdr, repair,
|
||||
)
|
||||
|
||||
kwargs: dict = dict(
|
||||
if chunk_size:
|
||||
# Streaming reads can't share memory with the repair pass; fall back
|
||||
# to direct pandas read so chunked workflows on huge files still
|
||||
# work.
|
||||
return pd.read_csv(
|
||||
filepath_or_buffer=path,
|
||||
encoding=enc,
|
||||
delimiter=delim,
|
||||
header=hdr,
|
||||
dtype=str,
|
||||
keep_default_na=False,
|
||||
on_bad_lines="warn",
|
||||
chunksize=chunk_size,
|
||||
)
|
||||
|
||||
if repair:
|
||||
raw = path.read_bytes()
|
||||
repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
|
||||
if repair_result.changed:
|
||||
logger.info(
|
||||
"Pre-parse repair on {}: {}", path.name, repair_result.summary(),
|
||||
)
|
||||
if repair_result.unrepairable_lines:
|
||||
logger.warning(
|
||||
"Pre-parse repair on {}: {} unrepairable line(s) at {}",
|
||||
path.name, len(repair_result.unrepairable_lines),
|
||||
repair_result.unrepairable_lines[:10],
|
||||
)
|
||||
return pd.read_csv(
|
||||
io.BytesIO(repair_result.repaired_bytes),
|
||||
encoding="utf-8",
|
||||
delimiter=delim,
|
||||
header=hdr,
|
||||
dtype=str,
|
||||
keep_default_na=False,
|
||||
on_bad_lines="warn",
|
||||
)
|
||||
|
||||
return pd.read_csv(
|
||||
filepath_or_buffer=path,
|
||||
encoding=enc,
|
||||
delimiter=delim,
|
||||
@@ -193,11 +242,6 @@ def _read_csv(
|
||||
on_bad_lines="warn",
|
||||
)
|
||||
|
||||
if chunk_size:
|
||||
return pd.read_csv(**kwargs, chunksize=chunk_size)
|
||||
|
||||
return pd.read_csv(**kwargs)
|
||||
|
||||
|
||||
def _read_excel(
|
||||
path: Path,
|
||||
|
||||
Reference in New Issue
Block a user