feat(text_clean): preserve internal whitespace in numeric/date/phone cells
Closes the §4.17 spec gap that test_gap_coverage.py was tracking via xfail: collapse_whitespace must NOT touch cells whose shape carries meaningful internal whitespace. Adds _looks_structured(s) — returns True when s matches: - numeric (currency optional, thousand-grouping by , . or single space) - date (ISO/slash/dot separator, or 'Mon DD YYYY' / 'DD Mon YYYY') - phone (digits + parens/dots/dashes/+/spaces, >= 7 digits, no letters) The pipeline uses a new _smart_collapse_whitespace wrapper that defers to collapse_whitespace only when _looks_structured returns False. The raw collapse_whitespace function is unchanged so direct callers and existing unit tests remain valid. Five new positive tests replace the xfail: - "(555) 123-4567" preserved (phone, double space inside) - "1 234" preserved (European thousands) - "2024-01-15" preserved (ISO date) - "Jan 15 2024" preserved (textual date) - "hello world" still collapsed to "hello world" (free-text negative case) Conservative on purpose: a false negative just collapses (existing behavior); a false positive leaves intentional double spaces in prose. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -92,12 +92,76 @@ def collapse_whitespace(s: str) -> str:
|
||||
Preserves leading/trailing whitespace boundaries (use ``trim`` to remove
|
||||
them). Tabs and other whitespace inside the string become a single
|
||||
regular space.
|
||||
|
||||
This is the *raw* operation — it always collapses. The cell-level
|
||||
pipeline uses :func:`_smart_collapse_whitespace` instead, which skips
|
||||
cells that look structured (numeric, dated, or phone-shaped) per
|
||||
TEST-CASES.md §4.17.
|
||||
"""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
return _WHITESPACE_RUN_RE.sub(" ", s)
|
||||
|
||||
|
||||
# Cell-shape predicates used to skip ``collapse_whitespace`` on values that
|
||||
# carry meaningful internal whitespace (European thousands separators,
|
||||
# phone formatting, dates with space-separated tokens).
|
||||
|
||||
# Numeric: optional sign / currency, digits with optional thousand-grouping
|
||||
# by comma, dot, or single space, and optional decimal portion.
|
||||
_NUMERIC_SHAPED = re.compile(
|
||||
r"^\s*[$€£¥]?\s*[+-]?\d{1,3}(?:[, ]\d{3})+(?:[.,]\d+)?\s*$"
|
||||
r"|^\s*[$€£¥]?\s*[+-]?\d+(?:[.,]\d+)?\s*$"
|
||||
)
|
||||
# Date: ISO, slash, or dot separators with two- or four-digit year, plus
|
||||
# the ``Mon DD YYYY`` / ``DD Mon YYYY`` shapes.
|
||||
_DATE_SHAPED = re.compile(
|
||||
r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
|
||||
r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
|
||||
r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}\s+\d{2,4}\s*$"
|
||||
)
|
||||
# Phone: a string that's mostly digits with parens / dots / dashes / +
|
||||
# / spaces, contains at least 7 digits total, and has no letters.
|
||||
_PHONE_DIGIT_RE = re.compile(r"\d")
|
||||
_PHONE_NON_DIGIT_RE = re.compile(r"[A-Za-z]")
|
||||
_PHONE_ALLOWED_RE = re.compile(r"^[\d\s().+\-]+$")
|
||||
|
||||
|
||||
def _looks_structured(s: str) -> bool:
|
||||
"""True when *s* looks numeric, dated, or phone-shaped.
|
||||
|
||||
Used by the pipeline-level collapse to leave meaningful internal
|
||||
whitespace alone (``1 234`` European thousand-sep, ``(555) 123-4567``
|
||||
phone formatting, ``Jan 15 2024`` date, etc.). Conservative on purpose:
|
||||
a false negative just means the cell gets collapsed (the existing
|
||||
behavior); a false positive leaves intentional double spaces in free
|
||||
text, which is a worse outcome.
|
||||
"""
|
||||
if not s or not isinstance(s, str):
|
||||
return False
|
||||
stripped = s.strip()
|
||||
if not stripped:
|
||||
return False
|
||||
if _NUMERIC_SHAPED.match(stripped) or _DATE_SHAPED.match(stripped):
|
||||
return True
|
||||
if (
|
||||
_PHONE_ALLOWED_RE.match(stripped)
|
||||
and not _PHONE_NON_DIGIT_RE.search(stripped)
|
||||
and len(_PHONE_DIGIT_RE.findall(stripped)) >= 7
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _smart_collapse_whitespace(s: str) -> str:
|
||||
"""Pipeline variant of :func:`collapse_whitespace` that skips structured cells."""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
if _looks_structured(s):
|
||||
return s
|
||||
return _WHITESPACE_RUN_RE.sub(" ", s)
|
||||
|
||||
|
||||
def to_nfc(s: str) -> str:
|
||||
"""Apply Unicode NFC (canonical composition)."""
|
||||
if not isinstance(s, str):
|
||||
@@ -398,7 +462,9 @@ def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], st
|
||||
if options.strip_zero_width:
|
||||
ops.append(("strip_zero_width", strip_zero_width))
|
||||
if options.collapse_whitespace:
|
||||
ops.append(("collapse_whitespace", collapse_whitespace))
|
||||
# The pipeline uses the structured-cell-aware variant so phone /
|
||||
# date / numeric cells keep their meaningful internal whitespace.
|
||||
ops.append(("collapse_whitespace", _smart_collapse_whitespace))
|
||||
if options.trim:
|
||||
ops.append(("trim", trim))
|
||||
return ops
|
||||
|
||||
@@ -137,25 +137,31 @@ class TestFindingsByToolEdges:
|
||||
# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNumericPhoneWhitespaceGap:
|
||||
"""Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
|
||||
whitespace in cells that look numeric, dated, or phone-shaped.
|
||||
class TestStructuredCellWhitespacePreservation:
|
||||
"""Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""
|
||||
|
||||
Currently unconditional. Marked xfail so the suite tracks the gap
|
||||
without silently allowing regressions on the cells that *do* get
|
||||
correctly collapsed.
|
||||
"""
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Heuristic not yet implemented — collapse_whitespace runs on every "
|
||||
"string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
|
||||
),
|
||||
strict=True,
|
||||
)
|
||||
def test_phone_internal_double_space_preserved(self):
|
||||
df = pd.DataFrame({"phone": ["(555) 123-4567"]}) # double space inside
|
||||
df = pd.DataFrame({"phone": ["(555) 123-4567"]})
|
||||
result = clean_dataframe(df)
|
||||
# Spec requires the double space to survive because the cell looks
|
||||
# phone-shaped. Today the cleaner collapses it.
|
||||
assert result.cleaned_df.iloc[0]["phone"] == "(555) 123-4567"
|
||||
|
||||
def test_european_thousands_sep_preserved(self):
|
||||
df = pd.DataFrame({"price": ["1 234"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["price"] == "1 234"
|
||||
|
||||
def test_iso_date_passes_through(self):
|
||||
df = pd.DataFrame({"date": ["2024-01-15"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"
|
||||
|
||||
def test_textual_date_preserves_spaces(self):
|
||||
df = pd.DataFrame({"date": ["Jan 15 2024"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"
|
||||
|
||||
def test_free_text_double_space_still_collapsed(self):
|
||||
# Crucially, the heuristic must NOT trigger on prose with letters.
|
||||
df = pd.DataFrame({"note": ["hello world"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["note"] == "hello world"
|
||||
|
||||
Reference in New Issue
Block a user