feat(text_clean): preserve internal whitespace in numeric/date/phone cells

Closes the §4.17 spec gap that test_gap_coverage.py was tracking via xfail:
collapse_whitespace must NOT touch cells whose shape carries meaningful
internal whitespace.

Adds _looks_structured(s) — returns True when s matches:
  - numeric (currency optional, thousand-grouping by , . or single space)
  - date (ISO/slash/dot separator, or 'Mon DD YYYY' / 'DD Mon YYYY')
  - phone (digits + parens/dots/dashes/+/spaces, >= 7 digits, no letters)

The pipeline uses a new _smart_collapse_whitespace wrapper that defers to
collapse_whitespace only when _looks_structured returns False. The raw
collapse_whitespace function is unchanged so direct callers and existing
unit tests remain valid.

Five new positive tests replace the xfail:
  - "(555)  123-4567" preserved (phone, double space inside)
  - "1 234" preserved (European thousands)
  - "2024-01-15" preserved (ISO date)
  - "Jan 15 2024" preserved (textual date)
  - "hello  world" still collapsed to "hello world" (free-text negative case)

Conservative on purpose: a false negative just collapses (existing
behavior); a false positive leaves intentional double spaces in prose.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 16:09:25 +00:00
parent 4687cf87b4
commit 0b959dee93
2 changed files with 91 additions and 19 deletions

View File

@@ -92,12 +92,76 @@ def collapse_whitespace(s: str) -> str:
Preserves leading/trailing whitespace boundaries (use ``trim`` to remove
them). Tabs and other whitespace inside the string become a single
regular space.
This is the *raw* operation — it always collapses. The cell-level
pipeline uses :func:`_smart_collapse_whitespace` instead, which skips
cells that look structured (numeric, dated, or phone-shaped) per
TEST-CASES.md §4.17.
"""
if not isinstance(s, str):
return s
return _WHITESPACE_RUN_RE.sub(" ", s)
# Cell-shape predicates used to skip ``collapse_whitespace`` on values that
# carry meaningful internal whitespace (European thousands separators,
# phone formatting, dates with space-separated tokens).
# Numeric: optional sign / currency, digits with optional thousand-grouping
# by comma, dot, or single space, and optional decimal portion.
_NUMERIC_SHAPED = re.compile(
r"^\s*[$€£¥]?\s*[+-]?\d{1,3}(?:[, ]\d{3})+(?:[.,]\d+)?\s*$"
r"|^\s*[$€£¥]?\s*[+-]?\d+(?:[.,]\d+)?\s*$"
)
# Date: ISO, slash, or dot separators with two- or four-digit year, plus
# the ``Mon DD YYYY`` / ``DD Mon YYYY`` shapes.
_DATE_SHAPED = re.compile(
r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}\s+\d{2,4}\s*$"
)
# Phone: a string that's mostly digits with parens / dots / dashes / +
# / spaces, contains at least 7 digits total, and has no letters.
_PHONE_DIGIT_RE = re.compile(r"\d")
_PHONE_NON_DIGIT_RE = re.compile(r"[A-Za-z]")
_PHONE_ALLOWED_RE = re.compile(r"^[\d\s().+\-]+$")
def _looks_structured(s: str) -> bool:
"""True when *s* looks numeric, dated, or phone-shaped.
Used by the pipeline-level collapse to leave meaningful internal
whitespace alone (``1 234`` European thousand-sep, ``(555) 123-4567``
phone formatting, ``Jan 15 2024`` date, etc.). Conservative on purpose:
a false negative just means the cell gets collapsed (the existing
behavior); a false positive leaves intentional double spaces in free
text, which is a worse outcome.
"""
if not s or not isinstance(s, str):
return False
stripped = s.strip()
if not stripped:
return False
if _NUMERIC_SHAPED.match(stripped) or _DATE_SHAPED.match(stripped):
return True
if (
_PHONE_ALLOWED_RE.match(stripped)
and not _PHONE_NON_DIGIT_RE.search(stripped)
and len(_PHONE_DIGIT_RE.findall(stripped)) >= 7
):
return True
return False
def _smart_collapse_whitespace(s: str) -> str:
"""Pipeline variant of :func:`collapse_whitespace` that skips structured cells."""
if not isinstance(s, str):
return s
if _looks_structured(s):
return s
return _WHITESPACE_RUN_RE.sub(" ", s)
def to_nfc(s: str) -> str:
"""Apply Unicode NFC (canonical composition)."""
if not isinstance(s, str):
@@ -398,7 +462,9 @@ def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], st
if options.strip_zero_width:
ops.append(("strip_zero_width", strip_zero_width))
if options.collapse_whitespace:
ops.append(("collapse_whitespace", collapse_whitespace))
# The pipeline uses the structured-cell-aware variant so phone /
# date / numeric cells keep their meaningful internal whitespace.
ops.append(("collapse_whitespace", _smart_collapse_whitespace))
if options.trim:
ops.append(("trim", trim))
return ops

View File

@@ -137,25 +137,31 @@ class TestFindingsByToolEdges:
# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
# ---------------------------------------------------------------------------
class TestNumericPhoneWhitespaceGap:
"""Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
whitespace in cells that look numeric, dated, or phone-shaped.
class TestStructuredCellWhitespacePreservation:
"""Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""
Currently unconditional. Marked xfail so the suite tracks the gap
without silently allowing regressions on the cells that *do* get
correctly collapsed.
"""
@pytest.mark.xfail(
reason=(
"Heuristic not yet implemented — collapse_whitespace runs on every "
"string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
),
strict=True,
)
def test_phone_internal_double_space_preserved(self):
df = pd.DataFrame({"phone": ["(555) 123-4567"]}) # double space inside
df = pd.DataFrame({"phone": ["(555) 123-4567"]})
result = clean_dataframe(df)
# Spec requires the double space to survive because the cell looks
# phone-shaped. Today the cleaner collapses it.
assert result.cleaned_df.iloc[0]["phone"] == "(555) 123-4567"
def test_european_thousands_sep_preserved(self):
df = pd.DataFrame({"price": ["1 234"]})
result = clean_dataframe(df)
assert result.cleaned_df.iloc[0]["price"] == "1 234"
def test_iso_date_passes_through(self):
df = pd.DataFrame({"date": ["2024-01-15"]})
result = clean_dataframe(df)
assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"
def test_textual_date_preserves_spaces(self):
df = pd.DataFrame({"date": ["Jan 15 2024"]})
result = clean_dataframe(df)
assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"
def test_free_text_double_space_still_collapsed(self):
# Crucially, the heuristic must NOT trigger on prose with letters.
df = pd.DataFrame({"note": ["hello world"]})
result = clean_dataframe(df)
assert result.cleaned_df.iloc[0]["note"] == "hello world"