feat(io): route read_file through pre-parse repair by default

Previously only analyze() and direct read_csv_repaired() callers got the byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold, unquoted-delimiter merge). The dedup CLI and any other read_file consumer silently missed it. read_file gains a repair=True default. CSV/TSV inputs run through repair_bytes before pandas sees them; Excel inputs still pass through unchanged. Chunked reads (chunk_size set) bypass repair because the pre-parse pass loads the whole file into memory; skipping it preserves streaming behavior on huge files. Repair actions and unrepairable lines are logged at INFO/WARNING. cli_text_clean opts out (repair=False): the cleaner offers fine-grained control via --preset and per-op flags, and a byte-level smart-quote fold under the user's "minimal" preset would violate that contract. The cell-level cleaner does the equivalent work itself when its options ask for it. Tests: read_file default strips BOM and folds curly double quotes; repair=False preserves smart quotes; chunked reads still work and skip repair as documented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:09:35 +00:00
parent 0b959dee93
commit 0671ef277e
3 changed files with 91 additions and 9 deletions
--- a/src/cli_text_clean.py
+++ b/src/cli_text_clean.py
@@ -280,6 +280,10 @@ def clean(
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            # Bypass byte-level repair so the user's preset/flag choices
            # remain authoritative. The cell-level cleaner does the
            # smart-quote / NUL / BOM work itself.
            repair=False,
        )
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -137,6 +137,7 @@ def read_file(
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
    chunk_size: Optional[int] = None,
    repair: bool = True,
 ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Read a CSV, TSV, or Excel file into a DataFrame.
@@ -147,7 +148,13 @@ def read_file(
    delimiter : override detected delimiter (CSV only)
    header_row : 0-based row index for the header; auto-detected if *None*
    sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
-    chunk_size : if set, return a generator of DataFrames (CSV only).
+    chunk_size : if set, return a generator of DataFrames (CSV only). When
        *chunk_size* is set, *repair* is forced off because the pre-parse
        pass loads the entire file into memory.
    repair : run :func:`repair_bytes` over the raw CSV before parsing
        (default ``True``). Excel files always skip this step. Pass
        ``repair=False`` when you specifically need pandas' raw view of
        the input.
    Returns a DataFrame (or generator when *chunk_size* is set).
    """
@@ -165,6 +172,7 @@ def read_file(
            delimiter=delimiter,
            header_row=header_row,
            chunk_size=chunk_size,
            repair=repair,
        )
@@ -175,15 +183,56 @@ def _read_csv(
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    chunk_size: Optional[int] = None,
    repair: bool = True,
 ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    enc = encoding or detect_encoding(path)
    delim = delimiter or detect_delimiter(path, enc)
    hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
-    logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
+    logger.debug(
-                 path.name, enc, delim, hdr)
+        "Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
        path.name, enc, delim, hdr, repair,
    )
-    kwargs: dict = dict(
+    if chunk_size:
        # The repair pass needs the whole file in memory, which defeats
        # streaming; fall back to a direct pandas read so chunked workflows
        # on huge files still work.
        return pd.read_csv(
            filepath_or_buffer=path,
            encoding=enc,
            delimiter=delim,
            header=hdr,
            dtype=str,
            keep_default_na=False,
            on_bad_lines="warn",
            chunksize=chunk_size,
        )
    if repair:
        raw = path.read_bytes()
        repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
        if repair_result.changed:
            logger.info(
                "Pre-parse repair on {}: {}", path.name, repair_result.summary(),
            )
        if repair_result.unrepairable_lines:
            logger.warning(
                "Pre-parse repair on {}: {} unrepairable line(s) at {}",
                path.name, len(repair_result.unrepairable_lines),
                repair_result.unrepairable_lines[:10],
            )
        return pd.read_csv(
            io.BytesIO(repair_result.repaired_bytes),
            encoding="utf-8",
            delimiter=delim,
            header=hdr,
            dtype=str,
            keep_default_na=False,
            on_bad_lines="warn",
        )
    return pd.read_csv(
        filepath_or_buffer=path,
        encoding=enc,
        delimiter=delim,
@@ -193,11 +242,6 @@ def _read_csv(
        on_bad_lines="warn",
    )
    if chunk_size:
        return pd.read_csv(**kwargs, chunksize=chunk_size)
    return pd.read_csv(**kwargs)
 def _read_excel(
    path: Path,
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -207,6 +207,40 @@ class TestRepairBytes:
        assert summary.get("strip_nul") == 1
 class TestReadFileWithRepair:
    """``read_file(repair=True)`` (default) routes CSV through repair_bytes."""
    def test_default_strips_bom_via_repair(self, tmp_path):
        # The fixture starts with the UTF-8 BOM bytes (EF BB BF).
        f = tmp_path / "bom.csv"
        f.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        df = read_file(f)
        # First column header must be 'id', not '\ufeffid' (BOM-prefixed).
        assert list(df.columns)[0] == "id"
    def test_default_folds_smart_double_quotes(self, tmp_path):
        # Curly quotes are *unquoted* here — outer ASCII quotes would create
        # a CSV-quoting collision once the fold runs.
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        df = read_file(f)
        # With the default repair=True the curly quotes come back as ASCII '"'.
        assert df.iloc[0]["note"] == 'curly "hello" world'
    def test_repair_false_preserves_smart_quotes(self, tmp_path):
        # Same fixture as the fold test, but read with repair disabled.
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        df = read_file(f, repair=False)
        # At least one of the original curly quotes must survive in the cell.
        assert "“" in df.iloc[0]["note"] or "”" in df.iloc[0]["note"]
    def test_chunked_read_skips_repair(self, tmp_path):
        # Chunked reads bypass repair (memory budget). Verify they still work.
        # NOTE(review): only the total row count is asserted; nothing here
        # checks that the repair pass was actually skipped.
        rows = "id,name\n" + "\n".join(f"{i},Alice" for i in range(1, 21))
        f = tmp_path / "chunked.csv"
        f.write_text(rows)
        # 20 data rows read in chunks of 5; the generator is drained fully.
        chunks = list(read_file(f, chunk_size=5))
        total = sum(len(c) for c in chunks)
        assert total == 20
 class TestReadCsvRepaired:
    def test_recovers_malformed_currency_row(self, tmp_path):
        f = tmp_path / "bad.csv"