feat(text_clean): preserve internal whitespace in numeric/date/phone cells

Closes the §4.17 spec gap that test_gap_coverage.py was tracking via xfail: collapse_whitespace must NOT touch cells whose shape carries meaningful internal whitespace. Adds _looks_structured(s) — returns True when s matches: - numeric (currency optional, thousand-grouping by , . or single space) - date (ISO/slash/dot separator, or 'Mon DD YYYY' / 'DD Mon YYYY') - phone (digits + parens/dots/dashes/+/spaces, >= 7 digits, no letters) The pipeline uses a new _smart_collapse_whitespace wrapper that defers to collapse_whitespace only when _looks_structured returns False. The raw collapse_whitespace function is unchanged so direct callers and existing unit tests remain valid. Five new positive tests replace the xfail: - "(555) 123-4567" preserved (phone, double space inside) - "1 234" preserved (European thousands) - "2024-01-15" preserved (ISO date) - "Jan 15 2024" preserved (textual date) - "hello world" still collapsed to "hello world" (free-text negative case) Conservative on purpose: a false negative just collapses (existing behavior); a false positive leaves intentional double spaces in prose. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:09:25 +00:00
parent 4687cf87b4
commit 0b959dee93
2 changed files with 91 additions and 19 deletions
--- a/src/core/text_clean.py
+++ b/src/core/text_clean.py
@@ -92,12 +92,76 @@ def collapse_whitespace(s: str) -> str:
    Preserves leading/trailing whitespace boundaries (use ``trim`` to remove
    them). Tabs and other whitespace inside the string become a single
    regular space.
+
+    This is the *raw* operation — it always collapses. The cell-level
+    pipeline uses :func:`_smart_collapse_whitespace` instead, which skips
+    cells that look structured (numeric, dated, or phone-shaped) per
+    TEST-CASES.md §4.17.
    """
    if not isinstance(s, str):
        return s
    return _WHITESPACE_RUN_RE.sub(" ", s)


+# Cell-shape predicates used to skip ``collapse_whitespace`` on values that
+# carry meaningful internal whitespace (European thousands separators,
+# phone formatting, dates with space-separated tokens).
+
+# Numeric: optional sign / currency, digits with optional thousand-grouping
+# by comma, dot, or single space, and optional decimal portion.
+_NUMERIC_SHAPED = re.compile(
+    r"^\s*[$€£¥]?\s*[+-]?\d{1,3}(?:[, ]\d{3})+(?:[.,]\d+)?\s*$"
+    r"|^\s*[$€£¥]?\s*[+-]?\d+(?:[.,]\d+)?\s*$"
+)
+# Date: ISO, slash, or dot separators with two- or four-digit year, plus
+# the ``Mon DD YYYY`` / ``DD Mon YYYY`` shapes.
+_DATE_SHAPED = re.compile(
+    r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
+    r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
+    r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}\s+\d{2,4}\s*$"
+)
+# Phone: a string that's mostly digits with parens / dots / dashes / +
+# / spaces, contains at least 7 digits total, and has no letters.
+_PHONE_DIGIT_RE = re.compile(r"\d")
+_PHONE_NON_DIGIT_RE = re.compile(r"[A-Za-z]")
+_PHONE_ALLOWED_RE = re.compile(r"^[\d\s().+\-]+$")
+
+
+def _looks_structured(s: str) -> bool:
+    """True when *s* looks numeric, dated, or phone-shaped.
+
+    Used by the pipeline-level collapse to leave meaningful internal
+    whitespace alone (``1 234`` European thousand-sep, ``(555) 123-4567``
+    phone formatting, ``Jan 15 2024`` date, etc.). Conservative on purpose:
+    a false negative just means the cell gets collapsed (the existing
+    behavior); a false positive leaves intentional double spaces in free
+    text, which is a worse outcome.
+    """
+    if not s or not isinstance(s, str):
+        return False
+    stripped = s.strip()
+    if not stripped:
+        return False
+    if _NUMERIC_SHAPED.match(stripped) or _DATE_SHAPED.match(stripped):
+        return True
+    if (
+        _PHONE_ALLOWED_RE.match(stripped)
+        and not _PHONE_NON_DIGIT_RE.search(stripped)
+        and len(_PHONE_DIGIT_RE.findall(stripped)) >= 7
+    ):
+        return True
+    return False
+
+
+def _smart_collapse_whitespace(s: str) -> str:
+    """Pipeline variant of :func:`collapse_whitespace` that skips structured cells."""
+    if not isinstance(s, str):
+        return s
+    if _looks_structured(s):
+        return s
+    return _WHITESPACE_RUN_RE.sub(" ", s)
+
+
 def to_nfc(s: str) -> str:
    """Apply Unicode NFC (canonical composition)."""
    if not isinstance(s, str):
@@ -398,7 +462,9 @@ def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], st
    if options.strip_zero_width:
        ops.append(("strip_zero_width", strip_zero_width))
    if options.collapse_whitespace:
-        ops.append(("collapse_whitespace", collapse_whitespace))
+        # The pipeline uses the structured-cell-aware variant so phone /
+        # date / numeric cells keep their meaningful internal whitespace.
+        ops.append(("collapse_whitespace", _smart_collapse_whitespace))
    if options.trim:
        ops.append(("trim", trim))
    return ops
--- a/tests/test_gap_coverage.py
+++ b/tests/test_gap_coverage.py
@@ -137,25 +137,31 @@ class TestFindingsByToolEdges:
 # Known gap: collapse_whitespace on numeric/date/phone-shaped cells
 # ---------------------------------------------------------------------------

-class TestNumericPhoneWhitespaceGap:
-    """Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
-    whitespace in cells that look numeric, dated, or phone-shaped.
+class TestStructuredCellWhitespacePreservation:
+    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""

-    Currently unconditional. Marked xfail so the suite tracks the gap
-    without silently allowing regressions on the cells that *do* get
-    correctly collapsed.
-    """
-
-    @pytest.mark.xfail(
-        reason=(
-            "Heuristic not yet implemented — collapse_whitespace runs on every "
-            "string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
-        ),
-        strict=True,
-    )
    def test_phone_internal_double_space_preserved(self):
-        df = pd.DataFrame({"phone": ["(555)  123-4567"]})  # double space inside
+        df = pd.DataFrame({"phone": ["(555)  123-4567"]})
        result = clean_dataframe(df)
-        # Spec requires the double space to survive because the cell looks
-        # phone-shaped. Today the cleaner collapses it.
        assert result.cleaned_df.iloc[0]["phone"] == "(555)  123-4567"
+
+    def test_european_thousands_sep_preserved(self):
+        df = pd.DataFrame({"price": ["1 234"]})
+        result = clean_dataframe(df)
+        assert result.cleaned_df.iloc[0]["price"] == "1 234"
+
+    def test_iso_date_passes_through(self):
+        df = pd.DataFrame({"date": ["2024-01-15"]})
+        result = clean_dataframe(df)
+        assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"
+
+    def test_textual_date_preserves_spaces(self):
+        df = pd.DataFrame({"date": ["Jan 15 2024"]})
+        result = clean_dataframe(df)
+        assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"
+
+    def test_free_text_double_space_still_collapsed(self):
+        # Crucially, the heuristic must NOT trigger on prose with letters.
+        df = pd.DataFrame({"note": ["hello  world"]})
+        result = clean_dataframe(df)
+        assert result.cleaned_df.iloc[0]["note"] == "hello world"