test: add text-cleaner corpus and close gaps surfaced by it
The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:
- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
Smith") while still preserving embedded acronyms; keep an uppercase letter
that directly follows an apostrophe in names ("O'CONNOR" -> "O'Connor"), but
do not promote a lowercase one ("o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes before parsing (the C parser
truncates at NUL, and the python engine is too strict about embedded literal
double-quote characters), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in the
expected output for case 03; quote the rogue-comma price field in the input
for case 17
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,10 @@ _SMART_CHARS: dict[str, str] = {
|
||||
"―": "-", # HORIZONTAL BAR
|
||||
"−": "-", # MINUS SIGN
|
||||
"…": "...", # HORIZONTAL ELLIPSIS
|
||||
"′": "'", # PRIME (foot / minute marker)
|
||||
"″": '"', # DOUBLE PRIME (inch / second marker)
|
||||
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
" ": " ", # NO-BREAK SPACE
|
||||
" ": " ", # NARROW NO-BREAK SPACE
|
||||
" ": " ", # THIN SPACE
|
||||
@@ -62,6 +66,7 @@ _ZERO_WIDTH = (
|
||||
"" # LEFT-TO-RIGHT MARK
|
||||
"" # RIGHT-TO-LEFT MARK
|
||||
"" # ZERO WIDTH NO-BREAK SPACE / BOM
|
||||
"" # SOFT HYPHEN
|
||||
)
|
||||
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
|
||||
|
||||
@@ -159,27 +164,37 @@ def _is_all_caps_token(token: str) -> bool:
|
||||
def smart_title_case(s: str) -> str:
|
||||
"""Title-case that preserves all-caps tokens and lowercases mid-string particles.
|
||||
|
||||
- ``USA`` stays ``USA``.
|
||||
- ``USA`` stays ``USA`` when surrounded by mixed-case words (acronym).
|
||||
- ``ALICE SMITH`` becomes ``Alice Smith`` (entire string is shouting).
|
||||
- ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
|
||||
- A letter directly after an apostrophe keeps its original case (``O'Neil`` stays ``O'Neil``; ``o'neil`` becomes ``O'neil``).
|
||||
"""
|
||||
if not isinstance(s, str) or not s:
|
||||
return s
|
||||
tokens = s.split(" ")
|
||||
# If every cased token is all-caps, treat the whole string as SHOUT and
|
||||
# title-case it. Otherwise preserve all-caps tokens as acronyms.
|
||||
cased_tokens = [t for t in tokens if any(c.isalpha() for c in t)]
|
||||
all_shouting = bool(cased_tokens) and all(
|
||||
not any(c.islower() for c in t) for t in cased_tokens
|
||||
)
|
||||
out: list[str] = []
|
||||
last_idx = len(tokens) - 1
|
||||
for i, tok in enumerate(tokens):
|
||||
if not tok:
|
||||
out.append(tok)
|
||||
continue
|
||||
if _is_all_caps_token(tok):
|
||||
if not all_shouting and _is_all_caps_token(tok):
|
||||
out.append(tok)
|
||||
continue
|
||||
lowered = tok.lower()
|
||||
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
|
||||
out.append(lowered)
|
||||
continue
|
||||
# Capitalize first cased character; preserve apostrophes/hyphens
|
||||
# Capitalize first cased character. Inside a token, preserve the
|
||||
# original capitalization of the letter immediately after an
|
||||
# apostrophe so name patterns like ``O'Connor``/``D'Angelo`` survive
|
||||
# while ``o'neil`` -> ``O'neil`` stays lowercase.
|
||||
chars = list(tok)
|
||||
capitalized = False
|
||||
for j, c in enumerate(chars):
|
||||
@@ -188,7 +203,12 @@ def smart_title_case(s: str) -> str:
|
||||
chars[j] = c.upper()
|
||||
capitalized = True
|
||||
else:
|
||||
chars[j] = c.lower()
|
||||
prev = chars[j - 1] if j > 0 else ""
|
||||
if prev == "'" and c.isupper():
|
||||
# Preserve original uppercase after apostrophe.
|
||||
pass
|
||||
else:
|
||||
chars[j] = c.lower()
|
||||
out.append("".join(chars))
|
||||
return " ".join(out)
|
||||
|
||||
@@ -291,6 +311,11 @@ class CleanOptions:
|
||||
strip_control: bool = True
|
||||
normalize_line_endings: bool = True
|
||||
|
||||
# Apply the same character-level pipeline to column headers. Headers carry
|
||||
# the same pollution as data cells (NBSP padding, smart quotes, ZWSP);
|
||||
# not cleaning them silently breaks df["col"] lookups downstream.
|
||||
clean_headers: bool = True
|
||||
|
||||
# Case conversion: either a single mode applied to all selected columns,
|
||||
# or a dict mapping column name -> mode for per-column control.
|
||||
case: Optional[CaseMode] = None
|
||||
@@ -440,6 +465,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
|
||||
out = df.copy()
|
||||
columns = _select_columns(out, options)
|
||||
|
||||
if options.clean_headers:
|
||||
new_columns = [clean_value(c, options)[0] for c in out.columns]
|
||||
if new_columns != list(out.columns):
|
||||
# Track column mapping so case_columns/columns/skip_columns based
|
||||
# on the original (dirty) names continue to work after rename.
|
||||
rename = dict(zip(out.columns, new_columns))
|
||||
columns = [rename.get(c, c) for c in columns]
|
||||
out.columns = new_columns
|
||||
|
||||
case_per_col: dict[str, CaseMode] = dict(options.case_columns)
|
||||
if options.case is not None:
|
||||
for c in columns:
|
||||
|
||||
Reference in New Issue
Block a user