Files
datatools-dev/src/core/io.py
Michael 5b672370a6 perf: cache hot paths, drop wasted allocations, lift 1 GB → 1.5 GB
Five targeted wins driven by an end-to-end audit, with shape-pinning
regression tests so reverts are loud:

- format_standardize: fuse the dispatcher loop into one pass — was
  calling Series.tolist() three times per typed column and materialising
  an intermediate triples list; now one tolist, one walk. On a
  synthetic 1M-row phone+email frame this measures ~2.7M rows/sec
  (vs. the previous 150k/sec doc target).
- dedup: wrap normalizers in a per-call lru_cache so repeat phones /
  emails / addresses skip re-parsing. phonenumbers.parse is the
  expensive call; ~2–5x faster on the normalisation step for realistic
  workloads.
- analyze: _detect_near_duplicates no longer copies the full input
  frame; builds only the normalised string columns via a dict and
  references non-string columns by view. Skips the redundant
  astype(str) when a column is already pandas string dtype.
- text_clean: hoist _build_pipeline out of the per-cell loop and add a
  per-call string cache so 100k repeats of "Active" only run the
  pipeline once. ~1M rows/sec on repetition-heavy columns.
- io.repair_bytes: the non-UTF-8 smart-quote fold path used a
  Python-level zip walk over the entire decoded string to count
  replacements — replaced with sum(text.count(c) ...) which runs in
  C at ~GB/s. Was a latent ~100s on a 1 GB cp1252 file; now <1s.

Updates REQUIREMENTS §10 with measured numbers and bumps the buyer-
facing upload limit from 1 GB to 1.5 GB across the i18n packs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 15:37:26 +00:00

1106 lines
41 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""File I/O: encoding/delimiter detection, CSV/Excel reading, output writing."""
from __future__ import annotations
import csv
import io
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Generator, Optional
import pandas as pd
from charset_normalizer import from_bytes
from loguru import logger
# ---------------------------------------------------------------------------
# Encoding detection
# ---------------------------------------------------------------------------
# charset-normalizer often picks an Eastern-European code page (cp1250,
# cp1258) for byte-equivalent Western content, mac_iceland over mac_roman
# in the Mac family, and shift_jis_2004 for short Cyrillic samples. The
# arbiter below resolves these specific false positives without
# overruling the detector when its top pick is genuinely the right
# answer.
#
# Mapping is *over-picked encoding* → *more plausible substitutes (in
# priority order)*. We accept either the candidate's primary encoding
# name or any of its ``could_be_from_charset`` aliases.
_ENCODING_FALLBACKS: dict[str, tuple[str, ...]] = {
    # Key: encoding the detector tends to over-pick.
    # Value: substitutes to consider, most preferred first.
    # Eastern-European code pages picked for byte-equivalent Western content.
    "cp1250": ("cp1252", "latin_1", "iso8859_15", "iso8859_2"),
    "cp1258": ("iso8859_2", "cp1250", "cp1252"),
    # Mac family: mac_iceland gets picked over mac_roman.
    "mac_iceland": ("mac_roman",),
    # Japanese fingerprints on short Cyrillic samples; both variants share
    # the same substitute list.
    "shift_jis_2004": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
    "shift_jisx0213": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
}
def _arbitrate_charset_match(matches) -> Optional[str]:
"""Pick the most plausible encoding from a charset-normalizer match list.
Two distinguishing signals separate a false positive from a real
pick when the top encoding is one we've recorded as over-picked:
* If the top match's own ``could_be_from_charset`` alias list
already names a preferred fallback (e.g. cp1250 with cp1252 as a
sibling), we substitute — charset-normalizer has flagged the
byte content as ambiguous.
* If the second-ranked match shares identical *chaos* and
*coherence* scores with the top — meaning the bytes decode
byte-equivalently under both — we substitute when the second
match is the preferred Western default.
When neither signal fires (real cp1250 / cp1258 content where
charset-normalizer is genuinely confident), the top pick is
returned unchanged.
"""
ranked = list(matches)
if not ranked:
return None
top = ranked[0]
top_enc = top.encoding.lower()
fallbacks = _ENCODING_FALLBACKS.get(top_enc)
if not fallbacks:
return top_enc
# The decisive signal: a lower-ranked candidate that ties the top
# pick on both chaos and coherence has decoded the bytes
# *identically*, so the choice between them is byte-equivalent. When
# one of those tied candidates is a preferred Western default,
# substitute. We walk the fallbacks in priority order so the most
# canonical alternative wins (cp1252 over iso8859_2 over iso8859_15).
#
# When no tied candidate matches, we leave the top pick alone — that
# is the "real cp1250 / cp1258 content" path where charset-normalizer
# is genuinely confident.
top_chaos = getattr(top, "chaos", None)
top_coherence = getattr(top, "coherence", None)
tied: list = []
for m in ranked[1:]:
if m.chaos != top_chaos or m.coherence != top_coherence:
break # ranked list is monotonically less confident
tied.append(m)
if tied:
for preferred in fallbacks:
for m in tied:
candidates = {
m.encoding.lower(),
*(a.lower() for a in m.could_be_from_charset),
}
if preferred in candidates:
return preferred
# No tied alternative — but charset-normalizer occasionally folds
# the more popular Western alias into the *top pick's own* alias
# list (cp1250 with cp1252 listed alongside). When that happens,
# prefer the canonical Western form.
top_aliases = {a.lower() for a in top.could_be_from_charset}
for preferred in fallbacks:
# Only honour an in-alias swap if the preferred encoding is a
# different family from the top pick (cp1252 swap from cp1250 is
# legitimate; iso8859_2 swap from cp1250 is not — they differ
# bytewise on accented Eastern letters).
if preferred in top_aliases and not _same_byte_family(top_enc, preferred):
return preferred
return top_enc
# ---------------------------------------------------------------------------
# Language-aware probe: distinguish KOI8-R from Shift_JIS, ISO-8859-2 from
# cp1258 when charset-normalizer cannot.
# ---------------------------------------------------------------------------
# Unicode ranges / letter sets that identify each language family. A
# candidate encoding "wins" the probe when its decoding of the raw bytes
# produces the highest *coverage ratio* (target-script letters divided by
# all non-ASCII characters) and that ratio clears the checks applied in
# ``_probe_language``.
# Inclusive codepoint bounds used by ``_cyrillic_coverage``.
_CYRILLIC_RANGE = (0x0400, 0x04FF)
# Accented letters counted as Eastern-European Latin by
# ``_ee_latin_coverage``. Letters shared between the alphabets below
# collapse into a single set entry.
_EE_LATIN_LETTERS = frozenset(
    "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"  # Polish
    "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"  # Czech
    "áéíóöőúüűÁÉÍÓÖŐÚÜŰ"  # Hungarian
    "äčďéíĺľňóôŕšťúýžÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ"  # Slovak
)
# Encodings to probe when charset-normalizer fingerprints the file as
# Japanese (a frequent misfire on short Cyrillic samples whose byte
# patterns happen to coincide with shift_jis lead bytes). Tried in order;
# on equal scores the earlier entry is kept.
_CYRILLIC_PROBES: tuple[str, ...] = ("koi8_r", "cp1251", "iso8859_5")
# Encodings to probe when the detector picked cp1258 or iso8859_16
# (see ``_probe_language``). Tried in order.
_EE_LATIN_PROBES: tuple[str, ...] = ("iso8859_2", "cp1250")
def _cyrillic_coverage(text: str) -> float:
"""Fraction of *all non-ASCII characters* in *text* that are Cyrillic letters.
Dividing by all non-ASCII (rather than only letters) penalises
decodings that produce mostly symbols/box-drawing with a sprinkle
of incidental Cyrillic glyphs — a real KOI8-R Russian text scores
>0.7 because nearly every non-ASCII codepoint IS a Cyrillic letter,
whereas a Japanese-shift_jis-decoded-as-koi8r text scores low.
"""
non_ascii = [c for c in text if ord(c) >= 0x80]
if not non_ascii:
return 0.0
cyr = sum(
1 for c in non_ascii
if c.isalpha() and _CYRILLIC_RANGE[0] <= ord(c) <= _CYRILLIC_RANGE[1]
)
return cyr / len(non_ascii)
def _ee_latin_coverage(text: str) -> float:
"""Fraction of *all non-ASCII characters* in *text* that look like EE Latin."""
non_ascii = [c for c in text if ord(c) >= 0x80]
if not non_ascii:
return 0.0
ee = sum(1 for c in non_ascii if c in _EE_LATIN_LETTERS)
return ee / len(non_ascii)
def _probe_language(raw: bytes, top_enc: str) -> Optional[str]:
"""Try language-specific decodings when charset-normalizer guessed wrong.
Returns a better encoding name when one of the probe candidates
decodes the bytes into a language-coherent text (Cyrillic ≥ 70 % for
Cyrillic probes, EE-Latin ≥ 50 % for EE Latin probes), else None.
"""
if top_enc in {"shift_jis_2004", "shift_jisx0213", "shift_jis", "cp932"}:
probes, scorer, threshold = _CYRILLIC_PROBES, _cyrillic_coverage, 0.70
elif top_enc in {"cp1258", "iso8859_16"}:
probes, scorer, threshold = _EE_LATIN_PROBES, _ee_latin_coverage, 0.50
else:
return None
# Score the top pick first. If the top encoding *itself* decodes the
# bytes into reasonable Cyrillic / EE Latin text, the bytes are
# genuinely in that script — don't override.
try:
top_decoded = raw.decode(top_enc, errors="replace")
top_score = scorer(top_decoded)
except LookupError:
top_score = 0.0
best_enc: Optional[str] = None
best_score = 0.0
for enc in probes:
try:
decoded = raw.decode(enc)
except (UnicodeDecodeError, LookupError):
continue
score = scorer(decoded)
if score > best_score:
best_score = score
best_enc = enc
# Require both an absolute coverage threshold AND a clear margin over
# the top pick — otherwise we risk hijacking real Japanese / Vietnamese
# content whose decode happens to produce a few Cyrillic / EE-Latin
# glyphs by coincidence.
if best_enc and best_score >= threshold and best_score >= top_score + 0.30:
return best_enc
return None
# Pairs of encoding names whose byte ranges DIFFER for accented letters.
# Used to refuse spurious in-alias swaps (e.g. cp1250 vs iso8859_2 are
# byte-distinct even though charset-normalizer lists them as siblings).
_SAME_FAMILY: set[frozenset[str]] = {
frozenset({"cp1250", "iso8859_2"}),
frozenset({"mac_iceland", "mac_turkish"}),
frozenset({"shift_jis_2004", "shift_jisx0213"}),
}
def _same_byte_family(a: str, b: str) -> bool:
return frozenset({a, b}) in _SAME_FAMILY
def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    """Detect a file's text encoding from its first *sample_bytes* bytes.

    Only the head of the file is read, so the cost does not grow with
    file size.

    Decision order:

    1. Empty file -> ``"utf-8"``.
    2. UTF-8 BOM followed by valid UTF-8 -> ``"utf-8-sig"``. A BOM followed
       by bytes that are *not* UTF-8 (a "lying" BOM) is stripped and the
       body goes through the remaining steps.
    3. UTF-32 BOM -> ``"utf-32"``; UTF-16 BOM -> ``"utf-16"``. UTF-32 is
       tested first because the UTF-32-LE BOM begins with the UTF-16-LE BOM.
    4. Sample decodes as UTF-8 -> ``"utf-8"``.
    5. Otherwise charset-normalizer's ranking, adjusted by
       ``_arbitrate_charset_match`` and ``_probe_language``. ``"utf-8"``
       is returned when detection is inconclusive or reports plain ASCII.

    Parameters
    ----------
    path : file to inspect
    sample_bytes : number of leading bytes to read

    Returns the encoding name as a string. Names from step 5 are
    charset-normalizer / Python codec names (lower-cased).
    """
    import codecs

    source = Path(path)
    with source.open("rb") as fh:
        raw = fh.read(sample_bytes)
    if not raw:
        return "utf-8"

    # If the read filled the whole budget, the sample may stop in the
    # middle of a multi-byte UTF-8 sequence. That is not evidence against
    # UTF-8, so a dangling tail is tolerated in that case only.
    sample_truncated = 0 < sample_bytes <= len(raw)

    def _decodes_as_utf8(chunk: bytes) -> bool:
        decoder = codecs.getincrementaldecoder("utf-8")()
        try:
            # final=False buffers an incomplete trailing sequence instead
            # of raising; invalid bytes anywhere else still raise.
            decoder.decode(chunk, final=not sample_truncated)
        except UnicodeDecodeError:
            return False
        return True

    # Check BOMs first.
    if raw.startswith(codecs.BOM_UTF8):
        body = raw[len(codecs.BOM_UTF8):]
        if _decodes_as_utf8(body):
            return "utf-8-sig"
        # Lying BOM: don't hand back utf-8-sig that will then fail to
        # read. Continue with detection on the BOM-stripped body.
        logger.debug(
            "detect_encoding({}): file has UTF-8 BOM but body is not "
            "valid UTF-8 — falling through to charset detection",
            source.name,
        )
        raw = body
    elif raw[:4] in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return "utf-32"
    elif raw[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return "utf-16"

    # Valid UTF-8 wins over statistical detection: small files dominated
    # by short non-ASCII sequences get fingerprinted as assorted legacy
    # code pages even though they decode cleanly as UTF-8.
    if _decodes_as_utf8(raw):
        return "utf-8"

    enc = _arbitrate_charset_match(from_bytes(raw))
    if enc is None:
        return "utf-8"

    # The language probe runs after the arbiter and only acts on the few
    # encodings it knows to be suspicious.
    probed = _probe_language(raw, enc)
    if probed:
        logger.debug(
            "detect_encoding({}): language probe overrode {} -> {}",
            source.name, enc, probed,
        )
        enc = probed

    if enc in ("ascii", "us-ascii"):
        enc = "utf-8"
    return enc
# ---------------------------------------------------------------------------
# Delimiter detection
# ---------------------------------------------------------------------------
# Candidate separators offered to ``csv.Sniffer``.
_COMMON_DELIMITERS = [",", "\t", ";", "|"]


def detect_delimiter(path: Path, encoding: str = "utf-8") -> str:
    """Guess the field separator from the first 20 lines of a text file.

    The head of the file is decoded with *encoding* (undecodable bytes
    are replaced) and handed to ``csv.Sniffer``, restricted to the
    characters in ``_COMMON_DELIMITERS``.

    Returns ``","`` when the file is empty or the sniffer cannot decide.
    """
    head: list[str] = []
    with Path(path).open("r", encoding=encoding, errors="replace") as handle:
        while len(head) < 20:
            line = handle.readline()
            if line == "":
                # End of file.
                break
            head.append(line)

    if not head:
        return ","

    allowed = "".join(_COMMON_DELIMITERS)
    try:
        return csv.Sniffer().sniff("".join(head), delimiters=allowed).delimiter
    except csv.Error:
        return ","
# ---------------------------------------------------------------------------
# Header-row detection
# ---------------------------------------------------------------------------
def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
                      max_scan: int = 20) -> int:
    """Return the 0-based index of the row most likely to be the header.

    Scans at most *max_scan* parsed rows and returns the first row that:

    * has at least two non-blank cells (one is enough when the row has
      only a single cell), and
    * has no non-blank cell that ``_looks_like_header`` rejects.

    The count requirement keeps blank rows and one-cell metadata banners
    such as ``["Report 2024", "", ""]`` from being taken for a header.

    Returns 0 when no row qualifies.
    """
    with Path(path).open("r", encoding=encoding, errors="replace") as handle:
        for row_index, cells in enumerate(csv.reader(handle, delimiter=delimiter)):
            if row_index >= max_scan:
                break
            if not cells:
                # csv.reader yields [] for a completely empty line.
                continue
            filled = [cell for cell in cells if cell.strip()]
            needed = 2 if len(cells) > 1 else 1
            if len(filled) < needed:
                continue
            if all(_looks_like_header(cell) for cell in filled):
                return row_index
    return 0
def _looks_like_header(value: str) -> bool:
"""True if *value* looks like a column header, not a data value."""
v = value.strip()
if not v:
return False
# Pure numbers are not headers
try:
float(v.replace(",", ""))
return False
except ValueError:
pass
return True
# ---------------------------------------------------------------------------
# Excel helpers
# ---------------------------------------------------------------------------
def list_sheets(path: Path) -> list[str]:
    """Return the sheet names of an Excel workbook, in workbook order.

    The workbook is opened with the ``openpyxl`` engine and closed again
    before returning, so no file handle is left open on *path*.
    """
    with pd.ExcelFile(path, engine="openpyxl") as workbook:
        return workbook.sheet_names
# ---------------------------------------------------------------------------
# Reading
# ---------------------------------------------------------------------------
def read_file(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
    chunk_size: Optional[int] = None,
    repair: bool = True,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Load a delimited text file or an Excel workbook.

    Files with a ``.xlsx`` / ``.xls`` suffix go to the Excel reader.
    Everything else is treated as delimited text.

    Parameters
    ----------
    path : location of the input file
    encoding : text encoding to use instead of auto-detection (CSV only)
    delimiter : field separator to use instead of auto-detection (CSV only)
    header_row : 0-based index of the header row; auto-detected when *None*
    sheet_name : Excel sheet, by name or 0-based position. Not used for CSV.
    chunk_size : when given, stream the CSV as DataFrames of this many
        rows. The pre-parse repair is skipped for chunked reads because it
        needs the whole file in memory.
    repair : pass the raw CSV bytes through :func:`repair_bytes` before
        parsing (default ``True``). Excel input never takes this step. Use
        ``repair=False`` to get pandas' view of the untouched file.

    Returns
    -------
    A DataFrame, or the chunked reader returned by pandas when
    *chunk_size* is set.

    Raises
    ------
    FileAccessError : *path* does not exist
    InputValidationError : *chunk_size* is zero or negative
    """
    from .errors import FileAccessError, InputValidationError

    filepath = Path(path)
    if not filepath.exists():
        parent_state = "exists" if filepath.parent.exists() else "does NOT exist"
        raise FileAccessError(
            "Input file not found",
            path=filepath,
            operation="read_file",
            suggestion=(
                f"Check the path is correct. Parent directory "
                f"{filepath.parent} {parent_state}."
            ),
        )

    invalid_chunk = chunk_size is not None and chunk_size <= 0
    if invalid_chunk:
        raise InputValidationError(
            f"chunk_size must be positive; got {chunk_size}",
            operation="read_file",
            suggestion="Pass a positive integer (e.g., chunk_size=10000) or omit for non-streaming reads.",
        )

    suffix = filepath.suffix.lower()
    logger.info(
        "read_file: {} (suffix={}, chunk_size={})",
        filepath, suffix, chunk_size,
    )

    if suffix in (".xlsx", ".xls"):
        return _read_excel(filepath, header_row=header_row, sheet_name=sheet_name)

    return _read_csv(
        filepath,
        encoding=encoding,
        delimiter=delimiter,
        header_row=header_row,
        chunk_size=chunk_size,
        repair=repair,
    )
def _read_csv(
    path: Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    chunk_size: Optional[int] = None,
    repair: bool = True,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Read a delimited text file, auto-detecting whatever was not supplied.

    Encoding, delimiter and header row fall back to ``detect_encoding``,
    ``detect_delimiter`` and ``detect_header_row``. All columns are read
    as strings with no NA inference. Malformed lines produce a pandas
    warning rather than an error.

    Three read paths:

    * *chunk_size* set: stream straight from disk, with no repair pass.
    * *repair* true: run :func:`repair_bytes` over the whole file, log what
      it changed, and parse the repaired UTF-8 bytes from memory.
    * otherwise: read straight from disk in the source encoding.
    """
    enc = encoding or detect_encoding(path)
    delim = delimiter or detect_delimiter(path, enc)
    hdr = detect_header_row(path, enc, delim) if header_row is None else header_row
    logger.debug(
        "Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
        path.name, enc, delim, hdr, repair,
    )

    # Options shared by every read path.
    parse_options = dict(
        delimiter=delim,
        header=hdr,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )

    if chunk_size:
        # The repair pass needs the full file in memory, which a streaming
        # read is meant to avoid. Read directly so chunked workflows on
        # huge files still work.
        return pd.read_csv(
            filepath_or_buffer=path,
            encoding=enc,
            chunksize=chunk_size,
            **parse_options,
        )

    if not repair:
        return pd.read_csv(filepath_or_buffer=path, encoding=enc, **parse_options)

    outcome = repair_bytes(path.read_bytes(), encoding=enc, delimiter=delim)
    if outcome.changed:
        logger.info(
            "Pre-parse repair on {}: {}", path.name, outcome.summary(),
        )
    if outcome.unrepairable_lines:
        logger.warning(
            "Pre-parse repair on {}: {} unrepairable line(s) at {}",
            path.name, len(outcome.unrepairable_lines),
            outcome.unrepairable_lines[:10],
        )
    # repair_bytes always emits UTF-8, whatever the source encoding was.
    return pd.read_csv(
        io.BytesIO(outcome.repaired_bytes),
        encoding="utf-8",
        **parse_options,
    )
def _read_excel(
    path: Path,
    *,
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
) -> pd.DataFrame:
    """Read one sheet of an Excel workbook with every column as a string.

    The header row is auto-detected via ``_detect_excel_header_row`` when
    *header_row* is ``None``. Failures from ``pd.read_excel`` are
    re-raised as ``FileFormatError`` carrying the path and a hint:

    * ``ValueError`` is reported as a sheet-lookup problem;
    * any other exception is reported as an unparseable workbook.
    """
    if header_row is None:
        hdr = _detect_excel_header_row(path, sheet_name)
    else:
        hdr = header_row
    logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)

    try:
        frame = pd.read_excel(
            path,
            sheet_name=sheet_name,
            header=hdr,
            dtype=str,
            keep_default_na=False,
            engine="openpyxl",
        )
    except ValueError as e:
        # pandas raises ValueError for "Worksheet named 'X' not found".
        from .errors import FileFormatError

        hint = (
            "Check the sheet name exists. List available sheets with "
            "`from src.core.io import list_sheets; list_sheets(path)`."
        )
        raise FileFormatError(
            "Could not read Excel sheet",
            path=path,
            operation=f"open sheet {sheet_name!r}",
            cause=e,
            suggestion=hint,
        ) from e
    except Exception as e:
        # openpyxl can raise BadZipFile or InvalidFileException for
        # corrupt / non-xlsx inputs. Wrap with file context.
        from .errors import FileFormatError

        hint = (
            "Confirm the file is a valid .xlsx workbook and not "
            "renamed/corrupted. Try opening it in Excel to verify."
        )
        raise FileFormatError(
            "Excel file could not be parsed",
            path=path,
            operation="pd.read_excel",
            cause=e,
            suggestion=hint,
        ) from e
    return frame
def _detect_excel_header_row(
    path: Path,
    sheet_name: Optional[str | int] = 0,
    max_scan: int = 20,
) -> int:
    """Mirror of :func:`detect_header_row` for Excel workbooks.

    Opens the workbook in read-only mode and scans at most *max_scan*
    rows of the target sheet. Returns the index of the first row that has
    enough non-blank cells (two, or one for a single-cell row), all of
    which ``_looks_like_header`` accepts.

    Sheet resolution: an in-range integer or an existing name selects
    that sheet; anything else falls back to the first sheet.

    Returns 0 when no row qualifies, when openpyxl is not installed, or
    when the workbook cannot be opened or read. That includes a file that
    is not a valid zip archive. These failures are logged at debug level
    only; the caller's ``pd.read_excel`` is expected to raise the
    user-facing error for the same file.
    """
    import zipfile

    try:
        from openpyxl import load_workbook
        from openpyxl.utils.exceptions import InvalidFileException
    except ImportError as e:
        logger.debug("openpyxl unavailable for header detection: {}", e)
        return 0

    wb = None
    try:
        wb = load_workbook(path, read_only=True, data_only=True)
        names = wb.sheetnames
        if isinstance(sheet_name, int):
            target = names[sheet_name] if 0 <= sheet_name < len(names) else names[0]
        elif isinstance(sheet_name, str):
            target = sheet_name if sheet_name in names else names[0]
        else:
            target = names[0]
        ws = wb[target]
        for idx, row in enumerate(ws.iter_rows(values_only=True)):
            if idx >= max_scan:
                break
            # Empty cells come back as None; normalise everything to text.
            cells = ["" if v is None else str(v) for v in row]
            non_empty = [c for c in cells if c.strip()]
            min_required = 1 if len(cells) <= 1 else 2
            if (
                len(non_empty) >= min_required
                and all(_looks_like_header(c) for c in non_empty)
            ):
                return idx
        return 0
    except (
        InvalidFileException,
        zipfile.BadZipFile,
        KeyError,
        IndexError,
        OSError,
    ) as e:
        # Unsupported or corrupt workbook (including "not a zip file"),
        # missing sheet, or read failure. Fall back to row 0 and let
        # pd.read_excel raise the user-facing error with full context.
        logger.debug(
            "Excel header detection failed for {} (sheet={}): {}",
            path, sheet_name, e,
        )
        return 0
    finally:
        if wb is not None:
            wb.close()
# ---------------------------------------------------------------------------
# Writing
# ---------------------------------------------------------------------------
def write_file(
    df: pd.DataFrame,
    path: str | Path,
    *,
    file_format: Optional[str] = None,
    encoding: str = "utf-8-sig",
    delimiter: Optional[str] = None,
) -> Path:
    """Write a DataFrame to CSV or Excel.

    Parameters
    ----------
    df : DataFrame to write
    path : output file path
    file_format : ``"csv"``, ``"tsv"``, or ``"xlsx"``; taken from the
        *path* suffix if *None*. Case, surrounding whitespace and a
        leading dot are ignored, so ``"XLSX"`` and ``".xlsx"`` both select
        Excel output.
    encoding : output encoding for delimited text (default ``utf-8-sig``
        for Windows Excel compat)
    delimiter : field separator for delimited output. Defaults to ``\\t``
        for the ``tsv`` format and ``,`` otherwise. Ignored for Excel
        formats.

    Returns the output Path.

    Raises whatever ``wrap_file_write`` builds from an ``OSError`` raised
    while writing.
    """
    from .errors import ensure_dataframe, wrap_file_write

    ensure_dataframe(df, function="write_file")
    out = Path(path)
    # Normalise the explicit format the same way as the suffix. Otherwise
    # "XLSX" or ".xlsx" would silently fall through to the CSV branch.
    fmt = (file_format or out.suffix).strip().lstrip(".").lower()
    try:
        if fmt in ("xlsx", "xls"):
            df.to_excel(out, index=False, engine="openpyxl")
        else:
            if delimiter is not None:
                sep = delimiter
            else:
                sep = "\t" if fmt == "tsv" else ","
            df.to_csv(out, index=False, encoding=encoding, sep=sep)
    except OSError as e:
        # PermissionError is a subclass of OSError, so this covers both.
        raise wrap_file_write(out, f"write_file (format={fmt})", e) from e
    logger.info("Wrote {} rows × {} cols to {}", len(df), len(df.columns), out)
    return out
# ---------------------------------------------------------------------------
# Pre-parse repair (CSV / delimited text)
# ---------------------------------------------------------------------------
#
# Some pollution patterns confuse pandas' parser before the cleaner can ever
# see the data. Smart double quotes inside an unquoted field, NUL bytes, and
# unquoted delimiters embedded in numeric/currency cells all cause structural
# parse failures or silent truncation. These helpers operate on raw bytes
# (or decoded text) and produce a parseable byte stream plus an audit log.
#
# Design notes:
# - Single curly quotes (U+2018/U+2019) are NOT folded here: they don't
# conflict with the default CSV quote char and the cell-level cleaner
# handles them more accurately. Only double-quote-equivalents are folded.
# - Delimiter-row repair only attempts the unambiguous case (one extra
# field, one merge candidate that looks like currency/thousands-sep).
# Anything else is logged as unrepairable and the line is left alone.
# Smart double-quote characters that confuse CSV parsing.
_CSV_SMART_QUOTE_CHARS: tuple[str, ...] = (
"", # LEFT DOUBLE QUOTATION MARK
"", # RIGHT DOUBLE QUOTATION MARK
"", # DOUBLE LOW-9 QUOTATION MARK
"", # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
"«", # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"»", # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
"", # DOUBLE PRIME
)
# ``str.maketrans`` builds a codepoint→codepoint dict the C translate
# uses directly. Iterating that dict yields ``int`` codepoints, which is
# why we keep ``_CSV_SMART_QUOTE_CHARS`` separately for the ``.count``
# loop in the non-UTF-8 fold path.
_CSV_SMART_QUOTE_TRANS = str.maketrans({c: '"' for c in _CSV_SMART_QUOTE_CHARS})
# Byte-level fast path: same characters but as UTF-8 byte sequences. Used
# when the file is already valid UTF-8 — folds in C without ever
# materializing a multi-GB decoded string.
_CSV_SMART_QUOTE_BYTE_MAP: list[tuple[bytes, bytes]] = [
("".encode("utf-8"), b'"'), # E2 80 9C
("".encode("utf-8"), b'"'), # E2 80 9D
("".encode("utf-8"), b'"'), # E2 80 9E
("".encode("utf-8"), b'"'), # E2 80 9F
("«".encode("utf-8"), b'"'), # C2 AB
("»".encode("utf-8"), b'"'), # C2 BB
("".encode("utf-8"), b'"'), # E2 80 B3
]
# Cheap probe: if none of these sentinel pairs appear in the bytes,
# skip the smart-quote stage entirely. Probing one byte per family hits
# the C-implemented ``bytes.__contains__`` which is sub-millisecond on a
# 1 GB buffer.
_CSV_SMART_QUOTE_PROBES = (b"\xe2\x80", b"\xc2\xab", b"\xc2\xbb")
# A merged value is "currency-shaped" when it looks like $1,500.00 or 1.234,56:
# an optional currency sigil ($, €, £, ¥), 1-3 leading digits, one or more
# three-digit groups separated by ",", "." or whitespace, and an optional
# decimal part. Surrounding whitespace is allowed.
_CURRENCY_SHAPED = re.compile(r"^\s*[$€£¥]?\s*\d{1,3}([,.\s]\d{3})+([,.]\d+)?\s*$")
# Or a plain decimal with thousands grouping (no currency sigil): comma
# groups only, "." as the decimal point.
# NOTE(review): every string this accepts also appears to be accepted by
# _CURRENCY_SHAPED (the sigil there is optional) — confirm before
# relying on the two being distinct.
_THOUSANDS_SHAPED = re.compile(r"^\s*\d{1,3}(,\d{3})+(\.\d+)?\s*$")
@dataclass
class RepairAction:
    """One repair the pre-parse pass made to the raw bytes."""

    # Category tag for the repair, e.g. "strip_bom", "strip_nul",
    # "fold_smart_quote", "quote_unquoted_delim".
    kind: str
    # 1-indexed source line the repair applies to; None for file-level
    # repairs.
    line: Optional[int]
    # Human-readable description of what was changed.
    detail: str
@dataclass
class RepairResult:
    """What :func:`repair_bytes` hands back to its caller."""

    # The repaired file content.
    repaired_bytes: bytes
    # Every repair that was applied, in the order it was recorded.
    actions: list[RepairAction] = field(default_factory=list)
    # 1-indexed line numbers that looked broken but were left untouched.
    unrepairable_lines: list[int] = field(default_factory=list)

    @property
    def changed(self) -> bool:
        """True when at least one repair action was recorded."""
        return len(self.actions) > 0

    def summary(self) -> dict[str, int]:
        """Count the recorded actions per ``kind``, in first-seen order."""
        tally: dict[str, int] = {}
        for action in self.actions:
            tally.setdefault(action.kind, 0)
            tally[action.kind] += 1
        return tally
def _merge_score(left: str, right: str, delimiter: str) -> int:
    """Score how likely ``left + delimiter + right`` was originally one field.

    Returns
    -------
    3 : the joined value matches ``_CURRENCY_SHAPED`` or ``_THOUSANDS_SHAPED``.
    1 : weaker evidence. The delimiter is ``,`` or ``.``, *right* begins
        with a digit (after optional whitespace), and *left* either
        contains a currency sigil followed by a digit or ends in a digit.
    0 : no evidence.

    The two tiers let the caller prefer a strict match when several
    adjacent pairs in the same row satisfy the loose test.
    """
    joined = f"{left}{delimiter}{right}"
    if _CURRENCY_SHAPED.match(joined) or _THOUSANDS_SHAPED.match(joined):
        return 3

    # The loose tier only applies to the two numeric separators.
    if delimiter not in ".,":
        return 0

    sigil_then_digit = re.search(r"[$€£¥]\s*\d", left) is not None
    ends_in_digit = re.search(r"\d\s*$", left) is not None
    left_has_money = sigil_then_digit or ends_in_digit
    right_starts_digits = re.match(r"\s*\d", right) is not None
    return 1 if (left_has_money and right_starts_digits) else 0
def _repair_extra_field_row(
fields: list[str], expected: int, delimiter: str,
) -> Optional[list[str]]:
"""Try to merge one adjacent pair so the row has *expected* fields.
Returns the repaired field list, or *None* if no unambiguous merge exists.
"""
if len(fields) != expected + 1:
return None
scores = [
(i, _merge_score(fields[i], fields[i + 1], delimiter))
for i in range(len(fields) - 1)
]
best = max(s for _, s in scores)
if best == 0:
return None
winners = [i for i, s in scores if s == best]
if len(winners) != 1:
return None
i = winners[0]
merged = f"{fields[i]}{delimiter}{fields[i + 1]}"
return fields[:i] + [merged] + fields[i + 2:]
def repair_bytes(
    raw: bytes,
    *,
    encoding: str = "utf-8",
    delimiter: str = ",",
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
    normalize_line_endings: bool = True,
) -> RepairResult:
    """Pre-parse repair on a raw delimited file.

    Steps, in order:

    0. UTF-16 / UTF-32 input is transcoded to UTF-8 first (always), and a
       leading U+FEFF left by the decode is dropped.
    1. Strip a leading UTF-8 BOM (always).
    2. Strip embedded NUL bytes (*strip_nul*); the C parser truncates
       fields at NUL.
    3. Normalize CRLF and bare CR to LF (*normalize_line_endings*).
    4. Fold smart double quotes (curly, guillemet, double-prime) to ASCII
       ``"`` (*fold_quotes*).
    5. Per-row repair (*repair_delims*) when one rogue delimiter sits
       inside a field that looks like currency or thousands-grouped
       digits. This step only runs when a cheap trigger suggests it is
       needed.

    Single curly quotes and other punctuation are left for the cell-level
    cleaner; this layer only fixes things that break CSV *parsing*.

    Parameters
    ----------
    raw : file content as read from disk
    encoding : encoding *raw* is believed to be in
    delimiter : field separator of the file
    fold_quotes, strip_nul, repair_delims, normalize_line_endings :
        switches for steps 4, 2, 5 and 3 respectively

    Returns a :class:`RepairResult` whose ``repaired_bytes`` are always
    UTF-8. If *raw* cannot be decoded under *encoding*, it is decoded as
    UTF-8 with U+FFFD substitution and a ``decode_replaced`` action is
    recorded.
    """
    actions: list[RepairAction] = []
    unrepairable: list[int] = []
    data = raw

    # Step 0. UTF-16 text stores ASCII with a NUL in every 16-bit unit, so
    # the byte-level NUL strip below would shred it. Transcoding here lets
    # every later step assume UTF-8-compatible bytes.
    enc_norm = encoding.lower().replace("-", "_") if encoding else ""
    is_wide = enc_norm.startswith(("utf_16", "utf_32"))
    if is_wide:
        try:
            decoded = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            decoded = data.decode("utf-8", errors="replace")
            actions.append(RepairAction(
                kind="decode_replaced", line=None,
                detail=f"decode errors under {encoding}; replaced with U+FFFD",
            ))
        # An endian-specific codec (e.g. utf-16-le) leaves the BOM in the
        # text as U+FEFF; drop it. Written as an escape because the
        # character is invisible.
        if decoded.startswith("\ufeff"):
            decoded = decoded[1:]
        data = decoded.encode("utf-8")
        actions.append(RepairAction(
            kind="transcode_to_utf8", line=None,
            detail=f"transcoded {encoding} -> utf-8 ({len(raw)}B -> {len(data)}B)",
        ))
        encoding = "utf-8"  # downstream steps now operate on UTF-8

    # Step 1. UTF-8 BOM.
    if data.startswith(b"\xef\xbb\xbf"):
        data = data[3:]
        actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))

    # Step 2. NUL bytes. Wide encodings were transcoded above, so a NUL
    # here is corruption rather than part of the encoding.
    if strip_nul and b"\x00" in data:
        before = data.count(b"\x00")
        data = data.replace(b"\x00", b"")
        actions.append(RepairAction(
            kind="strip_nul", line=None,
            detail=f"removed {before} NUL byte(s)",
        ))

    # Step 3. Line endings. CRLF is handled first so the bare-CR pass
    # cannot double-substitute. Done on bytes so it still applies if the
    # decode below fails.
    if normalize_line_endings and (b"\r" in data):
        n_crlf = data.count(b"\r\n")
        data = data.replace(b"\r\n", b"\n")
        n_cr = data.count(b"\r")
        if n_cr:
            data = data.replace(b"\r", b"\n")
        if n_crlf or n_cr:
            parts = []
            if n_crlf:
                parts.append(f"{n_crlf} CRLF")
            if n_cr:
                parts.append(f"{n_cr} bare CR")
            actions.append(RepairAction(
                kind="normalize_line_endings", line=None,
                detail=f"normalized {', '.join(parts)} to LF",
            ))

    # Step 4a. Smart-quote fast path for UTF-8 input: fold with
    # ``bytes.replace``, no decode needed. The probe skips the whole stage
    # when no candidate byte prefix is present. ``enc_norm`` is recomputed
    # because step 0 may have reassigned ``encoding``.
    enc_norm = encoding.lower().replace("-", "_") if encoding else ""
    is_utf8 = enc_norm in ("utf_8", "utf_8_sig", "utf8", "ascii")
    smart_folded_bytes = False
    if fold_quotes and is_utf8:
        if any(p in data for p in _CSV_SMART_QUOTE_PROBES):
            replaced_total = 0
            for src_bytes, dst in _CSV_SMART_QUOTE_BYTE_MAP:
                if src_bytes in data:
                    n = data.count(src_bytes)
                    if n:
                        data = data.replace(src_bytes, dst)
                        replaced_total += n
            if replaced_total:
                smart_folded_bytes = True
                actions.append(RepairAction(
                    kind="fold_smart_quote", line=None,
                    detail=f"replaced {replaced_total} smart double-quote char(s) with ASCII '\"'",
                ))

    # Always attempt the decode so encoding errors are caught and surface
    # as a ``decode_replaced`` action (the lying-BOM case relies on this).
    decode_failed = False
    try:
        text = data.decode(encoding if not smart_folded_bytes else "utf-8")
    except (UnicodeDecodeError, LookupError):
        text = data.decode("utf-8", errors="replace")
        decode_failed = True
        actions.append(RepairAction(
            kind="decode_replaced", line=None,
            detail=f"decode errors under {encoding}; replaced with U+FFFD",
        ))

    # Step 4b. Smart-quote fold for non-UTF-8 input, which bypassed the
    # byte fast path (the byte map only holds UTF-8 sequences). The count
    # uses ``str.count`` so the work stays in C.
    if fold_quotes and not is_utf8:
        n = sum(text.count(c) for c in _CSV_SMART_QUOTE_CHARS)
        if n:
            text = text.translate(_CSV_SMART_QUOTE_TRANS)
            actions.append(RepairAction(
                kind="fold_smart_quote", line=None,
                detail=f"replaced {n} smart double-quote char(s) with ASCII '\"'",
            ))

    # Step 5. Per-row delimiter repair. The csv.reader walk is costly, so
    # it only runs when one of these triggers fires (cheapest first):
    #   1. A currency sigil ($, or the UTF-8 bytes of € / £) is present.
    #   2. The delimiter is not a comma.
    #   3. The decoder had to substitute U+FFFD.
    #   4. Some row's delimiter count differs from the header's.
    # NOTE(review): the € / £ probes are UTF-8 byte sequences, so for
    # legacy code pages only "$" and trigger 4 can fire — confirm that
    # is intended.
    has_currency_sigil = (
        b"$" in data or b"\xe2\x82\xac" in data or b"\xc2\xa3" in data
    )
    needs_row_repair = repair_delims and (
        has_currency_sigil or delimiter != "," or decode_failed
        or _has_field_count_mismatch(text, delimiter)
    )
    if needs_row_repair:
        text, row_actions, unrepairable = _repair_rows(text, delimiter)
        actions.extend(row_actions)

    return RepairResult(
        repaired_bytes=text.encode("utf-8"),
        actions=actions,
        unrepairable_lines=unrepairable,
    )
def _has_field_count_mismatch(text: str, delimiter: str) -> bool:
"""Quick scan for rows whose unquoted-delimiter count differs from
the header's. Walks the text once with a hand-rolled quote-state
machine — much cheaper than running csv.reader, which materializes a
list of every row. Returns True at the first mismatch.
False negatives are acceptable here: the trigger only decides
whether to run the (slower, exact) ``_repair_rows`` pass. False
positives just mean we run the slow pass anyway.
"""
in_quote = False
header_count: int | None = None
current_count = 0
for ch in text:
if ch == '"':
in_quote = not in_quote
continue
if in_quote:
continue
if ch == delimiter:
current_count += 1
continue
if ch == "\n":
if header_count is None:
header_count = current_count
elif current_count != header_count and current_count != 0:
return True
current_count = 0
# Trailing line without a newline.
if (
header_count is not None
and current_count != 0
and current_count != header_count
):
return True
return False
def _repair_rows(
text: str, delimiter: str,
) -> tuple[str, list[RepairAction], list[int]]:
"""Per-line field-count repair. Operates on already-decoded text."""
actions: list[RepairAction] = []
unrepairable: list[int] = []
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
rows = list(reader)
if not rows:
return text, actions, unrepairable
expected = len(rows[0])
repaired_rows: list[list[str]] = [rows[0]]
needs_rewrite = False
for idx, row in enumerate(rows[1:], start=2): # 1-indexed; header is line 1
if len(row) == expected or not row:
repaired_rows.append(row)
continue
if len(row) > expected:
fixed = _repair_extra_field_row(row, expected, delimiter)
if fixed is not None:
repaired_rows.append(fixed)
needs_rewrite = True
actions.append(RepairAction(
kind="quote_unquoted_delim", line=idx,
detail=(
f"line {idx}: merged adjacent fields to fix "
f"unquoted '{delimiter}' (saw {len(row)} fields, "
f"expected {expected})"
),
))
continue
unrepairable.append(idx)
repaired_rows.append(row)
else:
# Too few fields: leave alone, log info-level only.
unrepairable.append(idx)
repaired_rows.append(row)
if not needs_rewrite:
return text, actions, unrepairable
buf = io.StringIO()
writer = csv.writer(buf, delimiter=delimiter, lineterminator="\n")
for row in repaired_rows:
writer.writerow(row)
return buf.getvalue(), actions, unrepairable
def read_csv_repaired(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> tuple[pd.DataFrame, RepairResult]:
    """Load a CSV file after passing its raw bytes through :func:`repair_bytes`.

    Args:
        path: File to read.
        encoding: Source encoding; when falsy it is obtained from
            ``detect_encoding``.
        delimiter: Field separator; when falsy it is obtained from
            ``detect_delimiter``.
        header_row: Row index handed to pandas as ``header``; ``None``
            means row 0.
        fold_quotes: Forwarded to :func:`repair_bytes`.
        strip_nul: Forwarded to :func:`repair_bytes`.
        repair_delims: Forwarded to :func:`repair_bytes`.

    Returns:
        ``(df, repair_result)`` so callers can surface the action log.
        The frame is parsed from the repaired UTF-8 bytes with
        ``dtype=str`` and ``keep_default_na=False``.
    """
    source = Path(path)
    resolved_encoding = encoding if encoding else detect_encoding(source)
    resolved_delimiter = (
        delimiter if delimiter else detect_delimiter(source, resolved_encoding)
    )

    outcome = repair_bytes(
        source.read_bytes(),
        encoding=resolved_encoding,
        delimiter=resolved_delimiter,
        fold_quotes=fold_quotes,
        strip_nul=strip_nul,
        repair_delims=repair_delims,
    )

    # The repaired payload is always UTF-8, whatever the source encoding was.
    frame = pd.read_csv(
        io.BytesIO(outcome.repaired_bytes),
        encoding="utf-8",
        delimiter=resolved_delimiter,
        header=0 if header_row is None else header_row,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )

    if outcome.actions:
        logger.info("Pre-parse repair on {}: {}", source.name, outcome.summary())
    return frame, outcome