# Cell-shape predicates used to skip ``collapse_whitespace`` on values that
# carry meaningful internal whitespace (European thousands separators,
# phone formatting, dates with space-separated tokens).

# Numeric: optional currency symbol and sign, then either 1-3 digits followed
# by groups of three digits separated by a comma or a single regular space,
# or a plain run of digits. Both forms accept an optional decimal portion
# introduced by "." or ",". (A dot is NOT accepted as a grouping separator.)
_NUMERIC_SHAPED = re.compile(
    r"^\s*[$€£¥]?\s*[+-]?\d{1,3}(?:[, ]\d{3})+(?:[.,]\d+)?\s*$"
    r"|^\s*[$€£¥]?\s*[+-]?\d+(?:[.,]\d+)?\s*$"
)
# Date: three digit groups joined by "-", "/" or "." (first and last group
# 1-4 digits, middle group 1-2 digits), plus the whitespace-separated
# ``DD Mon YYYY`` / ``Mon DD YYYY`` shapes with a 3-9 letter month token.
_DATE_SHAPED = re.compile(
    r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
    r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
    r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}\s+\d{2,4}\s*$"
)
# Phone: a string made up only of digits, whitespace, parens, dots, plus and
# dash characters that contains at least 7 digits in total.
_PHONE_DIGIT_RE = re.compile(r"\d")
# Matches ASCII letters. ``_looks_structured`` does not consult it: the
# allow-list below already rejects any string containing a letter. The name
# stays defined so that any other reference to it keeps working.
_PHONE_NON_DIGIT_RE = re.compile(r"[A-Za-z]")
_PHONE_ALLOWED_RE = re.compile(r"^[\d\s().+\-]+$")


def _looks_structured(s: str) -> bool:
    """Return True when *s* looks numeric, dated, or phone-shaped.

    Used by the pipeline-level collapse to leave meaningful internal
    whitespace alone (``1 234`` European thousand-sep, ``(555) 123-4567``
    phone formatting, ``Jan 15 2024`` date, etc.). Conservative on purpose:
    a false negative just means the cell gets collapsed (the existing
    behavior); a false positive leaves intentional double spaces in free
    text, which is a worse outcome.

    Args:
        s: The cell value to classify. Non-string values are accepted and
            always classified as unstructured.

    Returns:
        ``True`` if the stripped value matches the numeric or date shape, or
        consists solely of phone punctuation/whitespace plus at least seven
        digits; ``False`` otherwise (including empty and blank strings).
    """
    # Check the type before anything else: truth-testing an arbitrary object
    # may raise (``pandas.NA`` and multi-element NumPy arrays both do), which
    # would defeat this guard if ``not s`` were evaluated first.
    if not isinstance(s, str):
        return False
    candidate = s.strip()
    if not candidate:
        return False
    if _NUMERIC_SHAPED.match(candidate) or _DATE_SHAPED.match(candidate):
        return True
    # The allow-list admits no letters at all, so a separate "contains a
    # letter" test would be redundant; only the digit count remains.
    return (
        _PHONE_ALLOWED_RE.match(candidate) is not None
        and len(_PHONE_DIGIT_RE.findall(candidate)) >= 7
    )


def _smart_collapse_whitespace(s: str) -> str:
    """Pipeline variant of :func:`collapse_whitespace` that skips structured cells.

    Non-string values and values for which :func:`_looks_structured` is true
    are returned unchanged; every other string has each whitespace run
    replaced by a single regular space.
    """
    if not isinstance(s, str):
        return s
    if _looks_structured(s):
        return s
    return _WHITESPACE_RUN_RE.sub(" ", s)
# Closed gap: collapse_whitespace skips numeric/date/phone-shaped cells
# ---------------------------------------------------------------------------

class TestStructuredCellWhitespacePreservation:
    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells.

    Inputs that exercise the skip contain a *double* space. A single-space
    input would pass whether or not the heuristic runs, so it would not
    detect a regression.
    """

    def test_phone_internal_double_space_preserved(self):
        # Double space after the area code; it must survive cleaning because
        # the cell is phone-shaped.
        df = pd.DataFrame({"phone": ["(555)  123-4567"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["phone"] == "(555)  123-4567"

    def test_european_thousands_sep_preserved(self):
        # NOTE(review): a single regular space is presumably unchanged by a
        # plain collapse too, so this pins pass-through rather than the skip
        # itself.
        df = pd.DataFrame({"price": ["1 234"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["price"] == "1 234"

    def test_iso_date_passes_through(self):
        df = pd.DataFrame({"date": ["2024-01-15"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"

    def test_textual_date_preserves_spaces(self):
        # Double space before the year; it is preserved because the cell is
        # date-shaped.
        df = pd.DataFrame({"date": ["Jan 15  2024"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "Jan 15  2024"

    def test_free_text_double_space_still_collapsed(self):
        # Crucially, the heuristic must NOT trigger on prose with letters:
        # the double space in the input collapses to a single space.
        df = pd.DataFrame({"note": ["hello  world"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["note"] == "hello world"