feat: implement text cleaner (script 02) with CLI, GUI, and tests

Builds 02_text_cleaner.py from stub to working: character-level hygiene
for CSV/Excel inputs covering trim, whitespace collapse, smart-character
folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char
strip, line-ending normalization, and per-column case conversion. Three
presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult +
  clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape
  (dry-run by default, --apply writes cleaned + changes audit, JSON
  config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset
  picker, advanced toggles, preview, before/after metrics, and three
  download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests
  covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10
  in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case
  fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7
entry locking the spec, CLI-REFERENCE.md gains the text cleaner
section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md
status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:14:15 +00:00
parent b2ca04e6f4
commit 54f92ae47e
28 changed files with 2093 additions and 58 deletions

View File

@@ -59,6 +59,25 @@ from .config import (
DeduplicationConfig,
StrategyConfig,
)
from .text_clean import (
CleanOptions,
CleanResult,
PRESETS,
apply_case,
clean_dataframe,
clean_value,
collapse_whitespace,
fold_smart_chars,
normalize_line_endings,
sentence_case,
smart_title_case,
strip_bom,
strip_control,
strip_zero_width,
to_nfc,
to_nfkc,
trim,
)
__all__ = [
# Core
@@ -90,4 +109,22 @@ __all__ = [
"DeduplicationConfig",
"StrategyConfig",
"ColumnStrategyConfig",
# Text cleaning
"CleanOptions",
"CleanResult",
"PRESETS",
"clean_dataframe",
"clean_value",
"trim",
"collapse_whitespace",
"to_nfc",
"to_nfkc",
"fold_smart_chars",
"strip_zero_width",
"strip_bom",
"strip_control",
"normalize_line_endings",
"smart_title_case",
"sentence_case",
"apply_case",
]

489
src/core/text_clean.py Normal file
View File

@@ -0,0 +1,489 @@
"""Character-level text hygiene for DataFrames.
Operations are independently toggleable, idempotent, and safe to compose.
Each per-string helper is ``str -> str``. Numeric, datetime, and boolean
columns pass through ``clean_dataframe`` untouched; only string cells are
modified.
See TECHNICAL.md Section 10.2 for the full functional spec.
"""
from __future__ import annotations
import json
import re
import unicodedata
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable, Iterable, Literal, Optional
import pandas as pd
from pandas.api import types as pdtypes
# ---------------------------------------------------------------------------
# Per-string helpers
# ---------------------------------------------------------------------------
# Smart-character map (curly quotes, dashes, ellipsis, NBSP and other
# non-ASCII spaces) -> plain ASCII replacement.
#
# Keys are spelled as ``\u`` escapes instead of literal characters: most of
# them are invisible or visually ambiguous, and literal copies are easily
# dropped by editors, diff viewers and copy/paste. A lost key becomes ``""``,
# which ``str.maketrans`` rejects (keys must be exactly one character).
_SMART_CHARS: dict[str, str] = {
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\u201a": "'",  # SINGLE LOW-9 QUOTATION MARK
    "\u201b": "'",  # SINGLE HIGH-REVERSED-9 QUOTATION MARK
    "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
    "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
    "\u201e": '"',  # DOUBLE LOW-9 QUOTATION MARK
    "\u201f": '"',  # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    "\u2013": "-",  # EN DASH
    "\u2014": "-",  # EM DASH
    "\u2015": "-",  # HORIZONTAL BAR
    "\u2212": "-",  # MINUS SIGN
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u00a0": " ",  # NO-BREAK SPACE
    "\u202f": " ",  # NARROW NO-BREAK SPACE
    "\u2009": " ",  # THIN SPACE
    "\u200a": " ",  # HAIR SPACE
    "\u2002": " ",  # EN SPACE
    "\u2003": " ",  # EM SPACE
    "\u3000": " ",  # IDEOGRAPHIC SPACE
}
_SMART_TRANS = str.maketrans(_SMART_CHARS)

# Zero-width / invisible characters. ``U+FEFF`` (BOM/ZWNBSP) is included: a
# leading one is removed by the BOM-strip op when that op is enabled; anywhere
# else in a cell it is treated as just another zero-width character.
_ZERO_WIDTH = (
    "\u200b"  # ZERO WIDTH SPACE
    "\u200c"  # ZERO WIDTH NON-JOINER
    "\u200d"  # ZERO WIDTH JOINER
    "\u2060"  # WORD JOINER
    "\u200e"  # LEFT-TO-RIGHT MARK
    "\u200f"  # RIGHT-TO-LEFT MARK
    "\ufeff"  # ZERO WIDTH NO-BREAK SPACE / BOM
)
# None of the characters above is special inside a regex character class, so
# they can be interpolated without escaping.
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")

# Control characters: U+0000-U+001F and U+007F, but preserve \t \n \r.
_CONTROL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")

# Any run of *horizontal* whitespace (spaces, tabs, form/vertical feeds).
# Newlines and carriage returns are excluded so multi-line cells keep their
# line structure; the line-ending op normalizes the actual line terminators.
_WHITESPACE_RUN_RE = re.compile(r"[^\S\n\r]+")
def trim(s: str) -> str:
    """Return *s* with whitespace removed from both ends.

    Values that are not ``str`` are handed back untouched so the helper can
    be applied blindly to mixed-type cells.
    """
    return s.strip() if isinstance(s, str) else s
def collapse_whitespace(s: str) -> str:
    """Replace each run matched by ``_WHITESPACE_RUN_RE`` with one space.

    That pattern covers whitespace other than ``\\n`` and ``\\r``, so tabs
    and repeated spaces become a single regular space while line breaks
    are left in place. A run at the start or end of the string is reduced
    to one space, not removed (use ``trim`` for that). Non-string values
    are returned unchanged.
    """
    if isinstance(s, str):
        return _WHITESPACE_RUN_RE.sub(" ", s)
    return s
def to_nfc(s: str) -> str:
    """Return *s* in Unicode Normalization Form C (canonical composition).

    Non-string values are returned unchanged.
    """
    return unicodedata.normalize("NFC", s) if isinstance(s, str) else s
def to_nfkc(s: str) -> str:
    """Return *s* in Unicode Normalization Form KC (compatibility composition).

    This is lossy: compatibility characters such as ligatures and full-width
    letters are replaced by their plain equivalents. Non-string values are
    returned unchanged.
    """
    return unicodedata.normalize("NFKC", s) if isinstance(s, str) else s
def fold_smart_chars(s: str) -> str:
    """Translate *s* through the module's smart-character table.

    Curly quotes, en/em dashes, the ellipsis character and the NBSP family
    of spaces are replaced by their ASCII stand-ins as listed in
    ``_SMART_CHARS``. Non-string values are returned unchanged.
    """
    if isinstance(s, str):
        return s.translate(_SMART_TRANS)
    return s
def strip_zero_width(s: str) -> str:
    """Delete every character of *s* matched by ``_ZERO_WIDTH_RE``.

    That covers the zero-width and bidi-mark characters listed in
    ``_ZERO_WIDTH``. Non-string values are returned unchanged.
    """
    if isinstance(s, str):
        return _ZERO_WIDTH_RE.sub("", s)
    return s
def strip_bom(s: str) -> str:
    """Remove leading ``U+FEFF`` (BOM) characters from the start of *s*.

    Only the start of the string is touched; a ``U+FEFF`` that appears later
    in the text is left alone. Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Spelled as an escape on purpose: a literal BOM is invisible and, if it
    # gets lost from the source, ``lstrip("")`` silently strips nothing.
    return s.lstrip("\ufeff")
def strip_control(s: str) -> str:
    """Delete control characters from *s*, keeping ``\\t \\n \\r``.

    The set of characters removed is defined by ``_CONTROL_RE``. Non-string
    values are returned unchanged.
    """
    if isinstance(s, str):
        return _CONTROL_RE.sub("", s)
    return s
def normalize_line_endings(s: str) -> str:
    """Rewrite ``\\r\\n`` pairs and lone ``\\r`` characters as ``\\n``.

    Non-string values are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Handle CRLF first so each pair yields one newline rather than two.
    without_crlf = s.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")
# Smart title-case helpers
# Words that stay lowercase when they appear between the first and last word.
_TITLE_LOWERCASE_PARTICLES = {
    "a", "an", "and", "as", "at", "but", "by", "en", "for", "if", "in", "nor",
    "of", "on", "or", "per", "the", "to", "v", "v.", "vs", "vs.", "via",
}


def _is_all_caps_token(token: str) -> bool:
    """Return True for tokens such as ``USA`` that should keep their casing.

    A token qualifies when it is at least two characters long, contains at
    least one alphabetic character, and contains no lowercase character.
    """
    has_letter = any(c.isalpha() for c in token)
    has_lower = any(c.islower() for c in token)
    return has_letter and not has_lower and len(token) >= 2


def smart_title_case(s: str) -> str:
    """Title-case *s*, keeping all-caps tokens and lowercasing particles.

    The string is split on single spaces and each token is handled alone:

    - All-caps tokens are kept verbatim (``USA`` stays ``USA``).
    - ``of``, ``and``, ``the``, etc. are lowercased unless they are the
      first or last word of the string.
    - Any other token gets its first letter uppercased and every later
      letter lowercased. Apostrophes and hyphens do not restart
      capitalization, so ``o'neil`` becomes ``O'neil`` (not ``O'Neil``) and
      ``don't`` becomes ``Don't``.

    Empty tokens produced by leading, trailing or repeated spaces are
    preserved, so spacing is unchanged. Non-string and empty inputs are
    returned as-is.
    """
    if not isinstance(s, str) or not s:
        return s
    tokens = s.split(" ")
    # "First" and "last" mean the first/last *non-empty* token; otherwise a
    # leading or trailing space would make an edge particle look like a
    # mid-string word and leave it lowercase.
    word_positions = [i for i, tok in enumerate(tokens) if tok]
    if not word_positions:
        return s  # spaces only; nothing to capitalize
    first_word = word_positions[0]
    last_word = word_positions[-1]
    out: list[str] = []
    for i, tok in enumerate(tokens):
        if not tok or _is_all_caps_token(tok):
            out.append(tok)
            continue
        lowered = tok.lower()
        if first_word < i < last_word and lowered in _TITLE_LOWERCASE_PARTICLES:
            out.append(lowered)
            continue
        # Uppercase the first letter, lowercase the remaining letters, and
        # copy non-letters (apostrophes, hyphens, digits) through unchanged.
        chars: list[str] = []
        capitalized = False
        for c in tok:
            if not c.isalpha():
                chars.append(c)
            elif capitalized:
                chars.append(c.lower())
            else:
                chars.append(c.upper())
                capitalized = True
        out.append("".join(chars))
    return " ".join(out)
def sentence_case(s: str) -> str:
    """Lowercase *s*, then capitalize the first letter of each sentence.

    The first alphabetic character of the string, and the first alphabetic
    character following any ``.``, ``!`` or ``?``, is uppercased. Characters
    that are not letters (spaces, quotes, digits, parentheses) in between
    do not cancel the pending capitalization. A terminator is recognized
    wherever it occurs, including inside a token, so ``a.b`` becomes
    ``A.B``.

    Non-string and empty inputs are returned as-is.
    """
    if not isinstance(s, str) or not s:
        return s
    chars = list(s.lower())
    capitalize_next = True
    for i, c in enumerate(chars):
        if c in ".!?":
            capitalize_next = True
        elif capitalize_next and c.isalpha():
            chars[i] = c.upper()
            capitalize_next = False
        # Any other character leaves ``capitalize_next`` as it is: a letter
        # reaching this point implies the flag is already False, and a
        # non-letter must not consume a pending capitalization.
    return "".join(chars)
CaseMode = Literal["upper", "lower", "title", "sentence"]


def apply_case(s: str, mode: CaseMode) -> str:
    """Convert *s* to the casing named by *mode*.

    ``"upper"`` and ``"lower"`` use the built-in string methods; ``"title"``
    delegates to ``smart_title_case`` and ``"sentence"`` to
    ``sentence_case``. Non-string values are returned unchanged without
    looking at *mode*.

    Raises:
        ValueError: if *s* is a string and *mode* is not one of the four
            supported names.
    """
    if isinstance(s, str):
        if mode == "title":
            return smart_title_case(s)
        if mode == "sentence":
            return sentence_case(s)
        if mode == "lower":
            return s.lower()
        if mode == "upper":
            return s.upper()
        raise ValueError(f"Unknown case mode: {mode}")
    return s
# ---------------------------------------------------------------------------
# Options / result dataclasses
# ---------------------------------------------------------------------------
# Named bundles of operation toggles. Every preset spells out all nine
# operation flags so that it fully determines the pipeline.
PRESETS: dict[str, dict[str, Any]] = {
    # Whitespace tidy-up only.
    "minimal": dict(
        trim=True,
        collapse_whitespace=True,
        nfc=False,
        nfkc=False,
        fold_smart_chars=False,
        strip_zero_width=False,
        strip_bom=False,
        strip_control=False,
        normalize_line_endings=False,
    ),
    # Everything except the lossy NFKC normalization.
    "excel-hygiene": dict(
        trim=True,
        collapse_whitespace=True,
        nfc=True,
        nfkc=False,
        fold_smart_chars=True,
        strip_zero_width=True,
        strip_bom=True,
        strip_control=True,
        normalize_line_endings=True,
    ),
    # Every operation enabled, including NFKC.
    "paranoid": dict(
        trim=True,
        collapse_whitespace=True,
        nfc=True,
        nfkc=True,
        fold_smart_chars=True,
        strip_zero_width=True,
        strip_bom=True,
        strip_control=True,
        normalize_line_endings=True,
    ),
}
@dataclass
class CleanOptions:
    """Toggles for character-level cleaning operations.

    Defaults match the ``excel-hygiene`` preset. Instances can be saved to
    and restored from a JSON file via ``to_file`` / ``from_file``.
    """

    # Operations
    trim: bool = True
    collapse_whitespace: bool = True
    nfc: bool = True
    nfkc: bool = False
    fold_smart_chars: bool = True
    strip_zero_width: bool = True
    strip_bom: bool = True
    strip_control: bool = True
    normalize_line_endings: bool = True
    # Case conversion: either a single mode applied to all selected columns,
    # or a dict mapping column name -> mode for per-column control.
    case: Optional[CaseMode] = None
    case_columns: dict[str, CaseMode] = field(default_factory=dict)
    # Scope control
    columns: Optional[list[str]] = None  # None = all string-typed columns
    skip_columns: list[str] = field(default_factory=list)

    @classmethod
    def from_preset(cls, name: str) -> CleanOptions:
        """Build options from the named entry of ``PRESETS``.

        Raises:
            ValueError: if *name* is not a known preset; the message lists
                the available names.
        """
        if name not in PRESETS:
            raise ValueError(
                f"Unknown preset '{name}'. "
                f"Available: {', '.join(sorted(PRESETS))}."
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> CleanOptions:
        """Build options from a plain mapping, ignoring unknown keys.

        Keys that are not dataclass fields are dropped rather than rejected;
        fields missing from *data* keep their defaults.
        """
        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return the options as a plain ``dict`` (via ``dataclasses.asdict``)."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Write the options to *path* as indented JSON and return the path.

        The file is always written as UTF-8 so a config saved on one machine
        loads identically on another, whatever the locale encoding is.
        """
        out = Path(path)
        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> CleanOptions:
        """Load options from a JSON file written by ``to_file`` or by hand.

        The file is decoded as UTF-8. ``utf-8-sig`` is used so that a BOM
        added by some editors is skipped instead of breaking JSON parsing.
        """
        text = Path(path).read_text(encoding="utf-8-sig")
        return cls.from_dict(json.loads(text))
@dataclass
class CleanResult:
    """Bundle of everything ``clean_dataframe`` produces.

    Attributes:
        cleaned_df: The cleaned copy of the input frame.
        changes: Audit log with one row per modified cell; its columns are
            ``row``, ``column``, ``old``, ``new`` and ``ops_applied``.
        cells_changed: Number of cells whose value was modified.
        cells_total: Number of cells that were examined.
        columns_processed: Names of the columns the pipeline ran on.
    """

    cleaned_df: pd.DataFrame
    changes: pd.DataFrame
    cells_changed: int
    cells_total: int
    columns_processed: list[str]
# ---------------------------------------------------------------------------
# Cell-level pipeline
# ---------------------------------------------------------------------------
def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], str]]]:
    """Return the enabled operations as ordered ``(op_name, fn)`` pairs.

    Each operation is named after the ``CleanOptions`` flag that switches it
    on. The order below is fixed and deliberate:

    1. BOM strip first, so a leading FEFF never reaches the other ops.
    2. Line-ending normalization and control-char strip before any
       whitespace handling.
    3. Smart-char fold before the Unicode normalizations.
    4. NFC, then NFKC. Both are listed separately so each can be reported
       in the applied-ops log.
    5. Zero-width strip after Unicode normalization.
    6. Whitespace collapse, then trim, last, so that spaces produced by
       the earlier steps are tidied up as well.
    """
    ordered_ops: tuple[tuple[str, Callable[[str], str]], ...] = (
        ("strip_bom", strip_bom),
        ("normalize_line_endings", normalize_line_endings),
        ("strip_control", strip_control),
        ("fold_smart_chars", fold_smart_chars),
        ("nfc", to_nfc),
        ("nfkc", to_nfkc),
        ("strip_zero_width", strip_zero_width),
        ("collapse_whitespace", collapse_whitespace),
        ("trim", trim),
    )
    # Op names double as the option attribute names, so one lookup decides
    # whether a step is included.
    return [(name, fn) for name, fn in ordered_ops if getattr(options, name)]
def clean_value(value: Any, options: CleanOptions) -> tuple[Any, list[str]]:
    """Run the configured pipeline over one cell.

    Returns a ``(cleaned_value, ops_applied)`` pair, where ``ops_applied``
    names, in pipeline order, only those operations that actually altered
    the text. Anything that is not a ``str`` (``None``, NaN, numbers and
    so on) is returned unchanged together with an empty list.
    """
    # Missing values are never ``str``, so this single guard covers them too.
    if not isinstance(value, str):
        return value, []
    text = value
    fired: list[str] = []
    for op_name, op in _build_pipeline(options):
        result = op(text)
        if result != text:
            fired.append(op_name)
            text = result
    return text, fired
# ---------------------------------------------------------------------------
# DataFrame-level entry point
# ---------------------------------------------------------------------------
def _select_columns(df: pd.DataFrame, options: CleanOptions) -> list[str]:
    """Work out which columns of *df* the pipeline should touch.

    - With an explicit ``options.columns`` list, exactly those columns are
      used, in the order given, after checking that they all exist.
    - With ``options.columns`` set to ``None``, every column whose pandas
      dtype is object or string is used.
    - Columns named in ``options.skip_columns`` are removed in both cases.

    Raises:
        ValueError: if ``options.columns`` names a column missing from *df*.
    """
    requested = options.columns
    if requested is None:
        candidates: Iterable[str] = [
            name
            for name in df.columns
            if pdtypes.is_object_dtype(df[name]) or pdtypes.is_string_dtype(df[name])
        ]
    else:
        unknown = [name for name in requested if name not in df.columns]
        if unknown:
            raise ValueError(
                f"Columns not found in input: {unknown}. "
                f"Available: {list(df.columns)}"
            )
        candidates = requested
    excluded = set(options.skip_columns)
    return [name for name in candidates if name not in excluded]
def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> CleanResult:
    """Clean the selected text columns of *df* and report what changed.

    Columns are chosen by ``_select_columns`` (object/string columns unless
    ``options.columns`` says otherwise). Every cell of a chosen column goes
    through ``clean_value`` and then, if a case mode applies to that column,
    through ``apply_case``. A per-column entry in ``options.case_columns``
    takes precedence over the global ``options.case``.

    The input frame is not mutated: all work happens on a copy, which is
    returned as ``CleanResult.cleaned_df``. ``CleanResult.changes`` holds one
    row per modified cell, where ``row`` is the cell's 0-based position
    within the column (not its index label).
    """
    opts = options or CleanOptions()
    result_df = df.copy()
    selected = _select_columns(result_df, opts)

    # Explicit per-column modes first; the global mode only fills the gaps.
    modes: dict[str, CaseMode] = dict(opts.case_columns)
    if opts.case is not None:
        for name in selected:
            modes.setdefault(name, opts.case)

    records: list[dict[str, Any]] = []
    total = 0
    for name in selected:
        mode = modes.get(name)
        cells = result_df[name].tolist()
        total += len(cells)
        rewritten: list[Any] = []
        for position, before in enumerate(cells):
            after, ops = clean_value(before, opts)
            if mode is not None and isinstance(after, str):
                recased = apply_case(after, mode)
                if recased != after:
                    ops.append(f"case:{mode}")
                    after = recased
            if ops and after != before:
                records.append({
                    "row": position,
                    "column": name,
                    "old": before,
                    "new": after,
                    "ops_applied": ",".join(ops),
                })
            rewritten.append(after)
        result_df[name] = rewritten

    audit = pd.DataFrame(
        records,
        columns=["row", "column", "old", "new", "ops_applied"],
    )
    return CleanResult(
        cleaned_df=result_df,
        changes=audit,
        # One audit record is written per changed cell.
        cells_changed=len(records),
        cells_total=total,
        columns_processed=selected,
    )