test: add text-cleaner corpus and close gaps surfaced by it

The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the
cleaner end-to-end against the spec in TEST-CASES.md. Closing the failing
cases drove five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, and guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline
  (cases 16/19/20), with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" ->
  "Alice Smith") while still preserving embedded acronyms; keep a letter
  that is already uppercase after an apostrophe in names ("O'CONNOR" ->
  "O'Connor") without promoting a lowercase one ("o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes before parsing (the C parser
  truncates at NUL, and the python engine is too strict about embedded
  literal " characters), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in the
  case 03 expected output; quote the rogue-comma price field in the
  case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:35 +00:00
parent 54f92ae47e
commit c349a90e18
50 changed files with 1644 additions and 4 deletions
--- a/src/core/text_clean.py
+++ b/src/core/text_clean.py
@@ -40,6 +40,10 @@ _SMART_CHARS: dict[str, str] = {
    "―": "-",   # HORIZONTAL BAR
    "−": "-",   # MINUS SIGN
    "…": "...", # HORIZONTAL ELLIPSIS
+    "′": "'",   # PRIME (foot / minute marker)
+    "″": '"',   # DOUBLE PRIME (inch / second marker)
+    "«": '"',   # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    "»": '"',   # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    " ": " ",   # NO-BREAK SPACE
    " ": " ",   # NARROW NO-BREAK SPACE
    " ": " ",   # THIN SPACE
@@ -62,6 +66,7 @@ _ZERO_WIDTH = (
    "‎"  # LEFT-TO-RIGHT MARK
    "‏"  # RIGHT-TO-LEFT MARK
    ""  # ZERO WIDTH NO-BREAK SPACE / BOM
+    ""  # SOFT HYPHEN
 )
 _ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")

@@ -159,27 +164,37 @@ def _is_all_caps_token(token: str) -> bool:
 def smart_title_case(s: str) -> str:
    """Title-case that preserves all-caps tokens and lowercases mid-string particles.

-    - ``USA`` stays ``USA``.
+    - ``USA`` stays ``USA`` when surrounded by mixed-case words (acronym).
+    - ``ALICE SMITH`` becomes ``Alice Smith`` (entire string is shouting).
    - ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
    - Apostrophes inside words don't restart capitalization (``O'Neil``).
    """
    if not isinstance(s, str) or not s:
        return s
    tokens = s.split(" ")
+    # If every cased token is all-caps, treat the whole string as SHOUT and
+    # title-case it. Otherwise preserve all-caps tokens as acronyms.
+    cased_tokens = [t for t in tokens if any(c.isalpha() for c in t)]
+    all_shouting = bool(cased_tokens) and all(
+        not any(c.islower() for c in t) for t in cased_tokens
+    )
    out: list[str] = []
    last_idx = len(tokens) - 1
    for i, tok in enumerate(tokens):
        if not tok:
            out.append(tok)
            continue
-        if _is_all_caps_token(tok):
+        if not all_shouting and _is_all_caps_token(tok):
            out.append(tok)
            continue
        lowered = tok.lower()
        if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
            out.append(lowered)
            continue
-        # Capitalize first cased character; preserve apostrophes/hyphens
+        # Capitalize first cased character. Inside a token, preserve the
+        # original capitalization of the letter immediately after an
+        # apostrophe so name patterns like ``O'Connor``/``D'Angelo`` survive
+        # while ``o'neil`` -> ``O'neil`` stays lowercase.
        chars = list(tok)
        capitalized = False
        for j, c in enumerate(chars):
@@ -188,7 +203,12 @@ def smart_title_case(s: str) -> str:
                    chars[j] = c.upper()
                    capitalized = True
                else:
-                    chars[j] = c.lower()
+                    prev = chars[j - 1] if j > 0 else ""
+                    if prev == "'" and c.isupper():
+                        # Preserve original uppercase after apostrophe.
+                        pass
+                    else:
+                        chars[j] = c.lower()
        out.append("".join(chars))
    return " ".join(out)

@@ -291,6 +311,11 @@ class CleanOptions:
    strip_control: bool = True
    normalize_line_endings: bool = True

+    # Apply the same character-level pipeline to column headers. Headers carry
+    # the same pollution as data cells (NBSP padding, smart quotes, ZWSP);
+    # not cleaning them silently breaks df["col"] lookups downstream.
+    clean_headers: bool = True
+
    # Case conversion: either a single mode applied to all selected columns,
    # or a dict mapping column name -> mode for per-column control.
    case: Optional[CaseMode] = None
@@ -440,6 +465,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
    out = df.copy()
    columns = _select_columns(out, options)

+    if options.clean_headers:
+        new_columns = [clean_value(c, options)[0] for c in out.columns]
+        if new_columns != list(out.columns):
+            # Track column mapping so case_columns/columns/skip_columns based
+            # on the original (dirty) names continue to work after rename.
+            rename = dict(zip(out.columns, new_columns))
+            columns = [rename.get(c, c) for c in columns]
+            out.columns = new_columns
+
    case_per_col: dict[str, CaseMode] = dict(options.case_columns)
    if options.case is not None:
        for c in columns: