feat(text_clean): preserve internal whitespace in numeric/date/phone cells
Closes the §4.17 spec gap that test_gap_coverage.py was tracking via xfail: collapse_whitespace must NOT touch cells whose shape carries meaningful internal whitespace.

Adds _looks_structured(s), which returns True when s matches one of:
- numeric (currency optional; thousands grouped by ",", ".", or a single space)
- date (ISO/slash/dot separator, or 'Mon DD YYYY' / 'DD Mon YYYY')
- phone (digits + parens/dots/dashes/+/spaces, >= 7 digits, no letters)

The pipeline uses a new _smart_collapse_whitespace wrapper that defers to collapse_whitespace only when _looks_structured returns False. The raw collapse_whitespace function is unchanged, so direct callers and existing unit tests remain valid.

Five new positive tests replace the xfail:
- "(555)  123-4567" preserved (phone, double space inside)
- "1 234" preserved (European thousands)
- "2024-01-15" preserved (ISO date)
- "Jan 15 2024" preserved (textual date)
- "hello  world" still collapsed to "hello world" (free-text negative case)

Conservative on purpose: a false negative just collapses the cell (the existing behavior), whereas a false positive would leave double spaces uncollapsed in prose.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -137,25 +137,31 @@ class TestFindingsByToolEdges:
|
||||
# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNumericPhoneWhitespaceGap:
|
||||
"""Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
|
||||
whitespace in cells that look numeric, dated, or phone-shaped.
|
||||
class TestStructuredCellWhitespacePreservation:
|
||||
"""Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""
|
||||
|
||||
Currently unconditional. Marked xfail so the suite tracks the gap
|
||||
without silently allowing regressions on the cells that *do* get
|
||||
correctly collapsed.
|
||||
"""
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Heuristic not yet implemented — collapse_whitespace runs on every "
|
||||
"string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
|
||||
),
|
||||
strict=True,
|
||||
)
|
||||
def test_phone_internal_double_space_preserved(self):
|
||||
df = pd.DataFrame({"phone": ["(555) 123-4567"]}) # double space inside
|
||||
df = pd.DataFrame({"phone": ["(555) 123-4567"]})
|
||||
result = clean_dataframe(df)
|
||||
# Spec requires the double space to survive because the cell looks
|
||||
# phone-shaped. Today the cleaner collapses it.
|
||||
assert result.cleaned_df.iloc[0]["phone"] == "(555) 123-4567"
|
||||
|
||||
def test_european_thousands_sep_preserved(self):
|
||||
df = pd.DataFrame({"price": ["1 234"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["price"] == "1 234"
|
||||
|
||||
def test_iso_date_passes_through(self):
|
||||
df = pd.DataFrame({"date": ["2024-01-15"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"
|
||||
|
||||
def test_textual_date_preserves_spaces(self):
|
||||
df = pd.DataFrame({"date": ["Jan 15 2024"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"
|
||||
|
||||
def test_free_text_double_space_still_collapsed(self):
|
||||
# Crucially, the heuristic must NOT trigger on prose with letters.
|
||||
df = pd.DataFrame({"note": ["hello world"]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cleaned_df.iloc[0]["note"] == "hello world"
|
||||
|
||||
Reference in New Issue
Block a user