feat(io): route read_file through pre-parse repair by default

Previously only analyze() and direct read_csv_repaired() callers got the byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold, unquoted-delimiter merge). The dedup CLI and any other read_file consumer silently missed it. read_file gains a repair=True default. CSV/TSV inputs run through repair_bytes before pandas sees them; Excel inputs still pass through unchanged. Chunked reads (chunk_size set) bypass repair because the pre-parse pass loads the whole file — preserving streaming behavior on huge files. Repair actions and unrepairable lines are logged at INFO/WARNING. cli_text_clean opts out (repair=False): the cleaner offers fine-grained control via --preset and per-op flags, and a byte-level smart-quote fold under the user's "minimal" preset would violate that contract. The cell-level cleaner does the equivalent work itself when its options ask for it. Tests: read_file default strips BOM and folds curly double quotes; repair=False preserves smart quotes; chunked reads still work and skip repair as documented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:09:35 +00:00
parent 0b959dee93
commit 0671ef277e
3 changed files with 91 additions and 9 deletions
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -137,6 +137,7 @@ def read_file(
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
    chunk_size: Optional[int] = None,
+    repair: bool = True,
 ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Read a CSV, TSV, or Excel file into a DataFrame.

@@ -147,7 +148,13 @@ def read_file(
    delimiter : override detected delimiter (CSV only)
    header_row : 0-based row index for the header; auto-detected if *None*
    sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
-    chunk_size : if set, return a generator of DataFrames (CSV only).
+    chunk_size : if set, return a generator of DataFrames (CSV only). When
+        *chunk_size* is set, *repair* is forced off because the pre-parse
+        pass loads the entire file into memory.
+    repair : run :func:`repair_bytes` over the raw CSV before parsing
+        (default ``True``). Excel files always skip this step. Pass
+        ``repair=False`` when you specifically need pandas' raw view of
+        the input.

    Returns a DataFrame (or generator when *chunk_size* is set).
    """
@@ -165,6 +172,7 @@ def read_file(
            delimiter=delimiter,
            header_row=header_row,
            chunk_size=chunk_size,
+            repair=repair,
        )


@@ -175,15 +183,56 @@ def _read_csv(
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    chunk_size: Optional[int] = None,
+    repair: bool = True,
 ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    enc = encoding or detect_encoding(path)
    delim = delimiter or detect_delimiter(path, enc)
    hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)

-    logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
-                 path.name, enc, delim, hdr)
+    logger.debug(
+        "Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
+        path.name, enc, delim, hdr, repair,
+    )

-    kwargs: dict = dict(
+    if chunk_size:
+        # Streaming reads can't share memory with the repair pass; fall back
+        # to direct pandas read so chunked workflows on huge files still
+        # work.
+        return pd.read_csv(
+            filepath_or_buffer=path,
+            encoding=enc,
+            delimiter=delim,
+            header=hdr,
+            dtype=str,
+            keep_default_na=False,
+            on_bad_lines="warn",
+            chunksize=chunk_size,
+        )
+
+    if repair:
+        raw = path.read_bytes()
+        repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
+        if repair_result.changed:
+            logger.info(
+                "Pre-parse repair on {}: {}", path.name, repair_result.summary(),
+            )
+        if repair_result.unrepairable_lines:
+            logger.warning(
+                "Pre-parse repair on {}: {} unrepairable line(s) at {}",
+                path.name, len(repair_result.unrepairable_lines),
+                repair_result.unrepairable_lines[:10],
+            )
+        return pd.read_csv(
+            io.BytesIO(repair_result.repaired_bytes),
+            encoding="utf-8",
+            delimiter=delim,
+            header=hdr,
+            dtype=str,
+            keep_default_na=False,
+            on_bad_lines="warn",
+        )
+
+    return pd.read_csv(
        filepath_or_buffer=path,
        encoding=enc,
        delimiter=delim,
@@ -193,11 +242,6 @@ def _read_csv(
        on_bad_lines="warn",
    )

-    if chunk_size:
-        return pd.read_csv(**kwargs, chunksize=chunk_size)
-
-    return pd.read_csv(**kwargs)
-

 def _read_excel(
    path: Path,