feat(io): route read_file through pre-parse repair by default
Previously only analyze() and direct read_csv_repaired() callers got the byte-level repair pass (BOM strip, NUL strip, smart-double-quote fold, unquoted-delimiter merge). The dedup CLI and any other read_file consumer silently missed it. read_file gains a repair=True default. CSV/TSV inputs run through repair_bytes before pandas sees them; Excel inputs still pass through unchanged. Chunked reads (chunk_size set) bypass repair because the pre-parse pass loads the whole file — preserving streaming behavior on huge files. Repair actions and unrepairable lines are logged at INFO/WARNING. cli_text_clean opts out (repair=False): the cleaner offers fine-grained control via --preset and per-op flags, and a byte-level smart-quote fold under the user's "minimal" preset would violate that contract. The cell-level cleaner does the equivalent work itself when its options ask for it. Tests: read_file default strips BOM and folds curly double quotes; repair=False preserves smart quotes; chunked reads still work and skip repair as documented. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -207,6 +207,40 @@ class TestRepairBytes:
|
||||
assert summary.get("strip_nul") == 1
|
||||
|
||||
|
||||
class TestReadFileWithRepair:
    """``read_file(repair=True)`` (default) routes CSV through repair_bytes."""

    def test_default_strips_bom_via_repair(self, tmp_path):
        """A leading UTF-8 BOM must not leak into the first column name."""
        f = tmp_path / "bom.csv"
        f.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        df = read_file(f)
        # First column header must be 'id', not '\ufeffid' (BOM still attached).
        assert list(df.columns)[0] == "id"

    def test_default_folds_smart_double_quotes(self, tmp_path):
        """Curly double quotes are folded to ASCII '"' by the default read."""
        # Curly quotes are *unquoted* here — outer ASCII quotes would create
        # a CSV-quoting collision once the fold runs.
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        df = read_file(f)
        assert df.iloc[0]["note"] == 'curly "hello" world'

    def test_repair_false_preserves_smart_quotes(self, tmp_path):
        """With ``repair=False`` both curly quotes survive untouched."""
        f = tmp_path / "quoted.csv"
        f.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        df = read_file(f, repair=False)
        note = df.iloc[0]["note"]
        # Require BOTH quotes: an ``or`` here would still pass if only one of
        # the two characters had been folded.
        assert "“" in note and "”" in note

    def test_chunked_read_skips_repair(self, tmp_path):
        """Chunked reads still yield every row when ``chunk_size`` is set."""
        # Chunked reads bypass repair (memory budget). Verify they still work.
        # NOTE: this only checks the total row count; it does not by itself
        # prove that the repair pass was skipped.
        rows = "id,name\n" + "\n".join(f"{i},Alice" for i in range(1, 21))
        f = tmp_path / "chunked.csv"
        # Explicit encoding so the fixture does not depend on the platform
        # default text encoding.
        f.write_text(rows, encoding="utf-8")
        chunks = list(read_file(f, chunk_size=5))
        total = sum(len(c) for c in chunks)
        assert total == 20
|
||||
|
||||
|
||||
class TestReadCsvRepaired:
|
||||
def test_recovers_malformed_currency_row(self, tmp_path):
|
||||
f = tmp_path / "bad.csv"
|
||||
|
||||
Reference in New Issue
Block a user