test: add text-cleaner corpus and close gaps surfaced by it

The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes, one test-reader fix, and two fixture-generation
fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
  with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
  Smith") while still preserving acronyms inside mixed-case strings; keep an
  existing uppercase letter after an apostrophe in names ("O'CONNOR" ->
  "O'Connor") without promoting a lowercase one ("o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes before parsing (the C parser
  truncates at NUL, and the python engine is too strict about embedded
  literal double quotes), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in the case 03
  expected output; quote the rogue-comma price field in the case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:37:35 +00:00
parent 54f92ae47e
commit c349a90e18
50 changed files with 1644 additions and 4 deletions

View File

@@ -40,6 +40,10 @@ _SMART_CHARS: dict[str, str] = {
"": "-", # HORIZONTAL BAR
"": "-", # MINUS SIGN
"": "...", # HORIZONTAL ELLIPSIS
"": "'", # PRIME (foot / minute marker)
"": '"', # DOUBLE PRIME (inch / second marker)
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
" ": " ", # NO-BREAK SPACE
"": " ", # NARROW NO-BREAK SPACE
"": " ", # THIN SPACE
@@ -62,6 +66,7 @@ _ZERO_WIDTH = (
"" # LEFT-TO-RIGHT MARK
"" # RIGHT-TO-LEFT MARK
"" # ZERO WIDTH NO-BREAK SPACE / BOM
"­" # SOFT HYPHEN
)
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
@@ -159,27 +164,37 @@ def _is_all_caps_token(token: str) -> bool:
def smart_title_case(s: str) -> str:
"""Title-case that preserves all-caps tokens and lowercases mid-string particles.
- ``USA`` stays ``USA``.
- ``USA`` stays ``USA`` when surrounded by mixed-case words (acronym).
- ``ALICE SMITH`` becomes ``Alice Smith`` (entire string is shouting).
- ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
- Apostrophes inside words don't restart capitalization (``O'Neil``).
"""
if not isinstance(s, str) or not s:
return s
tokens = s.split(" ")
# If every cased token is all-caps, treat the whole string as SHOUT and
# title-case it. Otherwise preserve all-caps tokens as acronyms.
cased_tokens = [t for t in tokens if any(c.isalpha() for c in t)]
all_shouting = bool(cased_tokens) and all(
not any(c.islower() for c in t) for t in cased_tokens
)
out: list[str] = []
last_idx = len(tokens) - 1
for i, tok in enumerate(tokens):
if not tok:
out.append(tok)
continue
if _is_all_caps_token(tok):
if not all_shouting and _is_all_caps_token(tok):
out.append(tok)
continue
lowered = tok.lower()
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
out.append(lowered)
continue
# Capitalize first cased character; preserve apostrophes/hyphens
# Capitalize first cased character. Inside a token, preserve the
# original capitalization of the letter immediately after an
# apostrophe so name patterns like ``O'Connor``/``D'Angelo`` survive
# while ``o'neil`` -> ``O'neil`` stays lowercase.
chars = list(tok)
capitalized = False
for j, c in enumerate(chars):
@@ -188,7 +203,12 @@ def smart_title_case(s: str) -> str:
chars[j] = c.upper()
capitalized = True
else:
chars[j] = c.lower()
prev = chars[j - 1] if j > 0 else ""
if prev == "'" and c.isupper():
# Preserve original uppercase after apostrophe.
pass
else:
chars[j] = c.lower()
out.append("".join(chars))
return " ".join(out)
@@ -291,6 +311,11 @@ class CleanOptions:
strip_control: bool = True
normalize_line_endings: bool = True
# Apply the same character-level pipeline to column headers. Headers carry
# the same pollution as data cells (NBSP padding, smart quotes, ZWSP);
# not cleaning them silently breaks df["col"] lookups downstream.
clean_headers: bool = True
# Case conversion: either a single mode applied to all selected columns,
# or a dict mapping column name -> mode for per-column control.
case: Optional[CaseMode] = None
@@ -440,6 +465,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
out = df.copy()
columns = _select_columns(out, options)
if options.clean_headers:
new_columns = [clean_value(c, options)[0] for c in out.columns]
if new_columns != list(out.columns):
# Track column mapping so case_columns/columns/skip_columns based
# on the original (dirty) names continue to work after rename.
rename = dict(zip(out.columns, new_columns))
columns = [rename.get(c, c) for c in columns]
out.columns = new_columns
case_per_col: dict[str, CaseMode] = dict(options.case_columns)
if options.case is not None:
for c in columns: