diff --git a/src/cli_text_clean.py b/src/cli_text_clean.py index a234166..bc5c163 100644 --- a/src/cli_text_clean.py +++ b/src/cli_text_clean.py @@ -280,6 +280,10 @@ def clean( encoding=encoding_override, header_row=header_row, sheet_name=sheet_arg if sheet_arg is not None else 0, + # Bypass byte-level repair so the user's preset/flag choices + # remain authoritative. The cell-level cleaner does the + # smart-quote / NUL / BOM work itself. + repair=False, ) if not isinstance(df, pd.DataFrame): df = pd.concat(list(df), ignore_index=True) diff --git a/src/core/io.py b/src/core/io.py index d668e45..dd45b87 100644 --- a/src/core/io.py +++ b/src/core/io.py @@ -137,6 +137,7 @@ def read_file( header_row: Optional[int] = None, sheet_name: Optional[str | int] = 0, chunk_size: Optional[int] = None, + repair: bool = True, ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]: """Read a CSV, TSV, or Excel file into a DataFrame. @@ -147,7 +148,13 @@ def read_file( delimiter : override detected delimiter (CSV only) header_row : 0-based row index for the header; auto-detected if *None* sheet_name : Excel sheet (name or 0-based index). Ignored for CSV. - chunk_size : if set, return a generator of DataFrames (CSV only). + chunk_size : if set, return a generator of DataFrames (CSV only). When + *chunk_size* is set, *repair* is forced off because the pre-parse + pass loads the entire file into memory. + repair : run :func:`repair_bytes` over the raw CSV before parsing + (default ``True``). Excel files always skip this step. Pass + ``repair=False`` when you specifically need pandas' raw view of + the input. Returns a DataFrame (or generator when *chunk_size* is set). 
""" @@ -165,6 +172,7 @@ def read_file( delimiter=delimiter, header_row=header_row, chunk_size=chunk_size, + repair=repair, ) @@ -175,15 +183,56 @@ def _read_csv( delimiter: Optional[str] = None, header_row: Optional[int] = None, chunk_size: Optional[int] = None, + repair: bool = True, ) -> pd.DataFrame | Generator[pd.DataFrame, None, None]: enc = encoding or detect_encoding(path) delim = delimiter or detect_delimiter(path, enc) hdr = header_row if header_row is not None else detect_header_row(path, enc, delim) - logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})", - path.name, enc, delim, hdr) + logger.debug( + "Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})", + path.name, enc, delim, hdr, repair, + ) - kwargs: dict = dict( + if chunk_size: + # Streaming reads can't share memory with the repair pass; fall back + # to direct pandas read so chunked workflows on huge files still + # work. + return pd.read_csv( + filepath_or_buffer=path, + encoding=enc, + delimiter=delim, + header=hdr, + dtype=str, + keep_default_na=False, + on_bad_lines="warn", + chunksize=chunk_size, + ) + + if repair: + raw = path.read_bytes() + repair_result = repair_bytes(raw, encoding=enc, delimiter=delim) + if repair_result.changed: + logger.info( + "Pre-parse repair on {}: {}", path.name, repair_result.summary(), + ) + if repair_result.unrepairable_lines: + logger.warning( + "Pre-parse repair on {}: {} unrepairable line(s) at {}", + path.name, len(repair_result.unrepairable_lines), + repair_result.unrepairable_lines[:10], + ) + return pd.read_csv( + io.BytesIO(repair_result.repaired_bytes), + encoding="utf-8", + delimiter=delim, + header=hdr, + dtype=str, + keep_default_na=False, + on_bad_lines="warn", + ) + + return pd.read_csv( filepath_or_buffer=path, encoding=enc, delimiter=delim, @@ -193,11 +242,6 @@ def _read_csv( on_bad_lines="warn", ) - if chunk_size: - return pd.read_csv(**kwargs, chunksize=chunk_size) - - return pd.read_csv(**kwargs) - 
def _read_excel( path: Path, diff --git a/tests/test_io.py b/tests/test_io.py index 598b5ae..514b6d4 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -207,6 +207,40 @@ class TestRepairBytes: assert summary.get("strip_nul") == 1 +class TestReadFileWithRepair: + """``read_file(repair=True)`` (default) routes CSV through repair_bytes.""" + + def test_default_strips_bom_via_repair(self, tmp_path): + f = tmp_path / "bom.csv" + f.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n") + df = read_file(f) + # First column header must be 'id', not '\ufeffid' (a BOM-prefixed name). + assert list(df.columns)[0] == "id" + + def test_default_folds_smart_double_quotes(self, tmp_path): + # Curly quotes are *unquoted* here — outer ASCII quotes would create + # a CSV-quoting collision once the fold runs. + f = tmp_path / "quoted.csv" + f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8")) + df = read_file(f) + assert df.iloc[0]["note"] == 'curly "hello" world' + + def test_repair_false_preserves_smart_quotes(self, tmp_path): + f = tmp_path / "quoted.csv" + f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8")) + df = read_file(f, repair=False) + assert "“" in df.iloc[0]["note"] or "”" in df.iloc[0]["note"] + + def test_chunked_read_skips_repair(self, tmp_path): + # Chunked reads bypass repair (memory budget). Verify they still work. + rows = "id,name\n" + "\n".join(f"{i},Alice" for i in range(1, 21)) + f = tmp_path / "chunked.csv" + f.write_text(rows) + chunks = list(read_file(f, chunk_size=5)) + total = sum(len(c) for c in chunks) + assert total == 20 + + class TestReadCsvRepaired: def test_recovers_malformed_currency_row(self, tmp_path): f = tmp_path / "bad.csv"