feat: implement text cleaner (script 02) with CLI, GUI, and tests

Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:14:15 +00:00
parent b2ca04e6f4
commit 54f92ae47e
28 changed files with 2093 additions and 58 deletions
--- a/src/cli_text_clean.py
+++ b/src/cli_text_clean.py
@@ -0,0 +1,373 @@
+"""CLI for the DataTools text cleaner (script 02).
+
+Usage:
+    python -m src.cli_text_clean input.csv                 # dry-run preview
+    python -m src.cli_text_clean input.csv --apply         # write cleaned file
+    python -m src.cli_text_clean input.csv --preset minimal --apply
+    python -m src.cli_text_clean input.csv --case upper:name --apply
+    python -m src.cli_text_clean --help                    # full help
+"""
+
+from __future__ import annotations
+
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import typer
+from loguru import logger
+
+# Top-level help text shown for the application. Kept as one implicitly
+# concatenated literal so the blank-line layout is visible at a glance.
+_APP_HELP = (
+    "Clean and normalize text content in CSV and Excel files.\n\n"
+    "By default, runs in preview mode — shows what would change without "
+    "modifying anything. Add --apply to write the output.\n\n"
+    "Examples:\n\n"
+    "  # Preview what would change\n"
+    "  python -m src.cli_text_clean messy.csv\n\n"
+    "  # Apply the safe defaults (excel-hygiene preset)\n"
+    "  python -m src.cli_text_clean messy.csv --apply\n\n"
+    "  # Minimal: only trim and collapse whitespace\n"
+    "  python -m src.cli_text_clean messy.csv --preset minimal --apply\n\n"
+    "  # Title-case the 'name' column, leave others alone for case\n"
+    "  python -m src.cli_text_clean people.csv --case title:name --apply\n\n"
+    "  # Clean only specific columns\n"
+    "  python -m src.cli_text_clean orders.csv --columns vendor,product --apply\n\n"
+    "  # Skip a free-text column from cleaning\n"
+    "  python -m src.cli_text_clean tickets.csv --skip notes --apply\n"
+)
+
+# The Typer application; running with no arguments prints the help text.
+app = typer.Typer(
+    name="text-clean",
+    help=_APP_HELP,
+    add_completion=False,
+    no_args_is_help=True,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _setup_logging(log_dir: Path) -> Path:
+    """Point loguru at stderr and at a fresh timestamped file in *log_dir*.
+
+    Creates *log_dir* when missing, removes every previously registered sink,
+    then registers a WARNING-level stderr sink and a DEBUG-level file sink.
+
+    Returns:
+        The path of the log file that was registered.
+    """
+    log_dir.mkdir(parents=True, exist_ok=True)
+    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
+    destination = log_dir / f"text_clean_{stamp}.log"
+
+    # Drop existing sinks first so repeated calls do not stack handlers.
+    logger.remove()
+    sinks = (
+        (sys.stderr, "WARNING", "{message}"),
+        (
+            str(destination),
+            "DEBUG",
+            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
+        ),
+    )
+    for sink, level, fmt in sinks:
+        logger.add(sink, level=level, format=fmt)
+    return destination
+
+
+def _parse_case(raw: Optional[str]) -> tuple[Optional[str], dict[str, str]]:
+    """Parse the ``--case`` argument into a bare mode and per-column modes.
+
+    Forms:
+      --case upper                 -> ("upper", {})       (apply to all selected)
+      --case title:name            -> (None, {"name": "title"})
+      --case upper:code,title:name -> (None, {...})
+
+    Modes are matched case-insensitively and returned lowercased. They are
+    validated here so that a typo is reported before any config is saved or
+    any file is read, instead of surfacing in the middle of the cleaning run.
+
+    Args:
+        raw: The raw option value, or ``None`` when ``--case`` was not given.
+
+    Returns:
+        ``(bare_mode, per_column)``. Exactly one of the two is populated;
+        both are empty when *raw* is ``None`` or blank.
+
+    Raises:
+        typer.BadParameter: On an unknown mode, a piece without ``:`` in
+            per-column form, or a piece with an empty column name.
+    """
+    valid_modes = ("upper", "lower", "title", "sentence")
+
+    def _checked_mode(text: str) -> str:
+        mode = text.strip().lower()
+        if mode not in valid_modes:
+            raise typer.BadParameter(
+                f"Unknown case mode: '{text.strip()}'. "
+                f"Choose from: {', '.join(valid_modes)}."
+            )
+        return mode
+
+    if raw is None or not raw.strip():
+        return None, {}
+    if ":" not in raw:
+        # Bare mode applies to all selected columns
+        return _checked_mode(raw), {}
+
+    per_col: dict[str, str] = {}
+    for piece in raw.split(","):
+        piece = piece.strip()
+        if not piece:
+            continue
+        # Split on the first colon only so column names may contain ':'.
+        mode, sep, col = piece.partition(":")
+        if not sep:
+            raise typer.BadParameter(
+                f"Invalid --case piece: '{piece}'. "
+                f"Expected 'mode' or 'mode:col[,mode:col...]' "
+                f"(e.g., 'upper' or 'title:name,upper:code')."
+            )
+        col = col.strip()
+        if not col:
+            raise typer.BadParameter(
+                f"Invalid --case piece: '{piece}'. "
+                f"Column name is missing after ':'."
+            )
+        per_col[col] = _checked_mode(mode)
+    return None, per_col
+
+
+def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
+    """Split a comma-separated option value into trimmed, non-empty items.
+
+    ``None`` (option not supplied) is passed through unchanged; an empty or
+    all-blank value yields an empty list.
+    """
+    if raw is None:
+        return None
+    trimmed = (part.strip() for part in raw.split(","))
+    return list(filter(None, trimmed))
+
+
+# ---------------------------------------------------------------------------
+# Main command
+# ---------------------------------------------------------------------------
+
+# The single CLI command. Flow: validate inputs -> build CleanOptions (config
+# file or preset, then CLI overrides) -> optionally save the merged config ->
+# read the input -> run clean_dataframe -> print a summary -> write outputs
+# only when --apply is given.
+# NOTE(review): the function docstring is presumably surfaced by Typer as the
+# command help text, so treat its wording as user-facing — confirm.
+@app.command()
+def clean(
+    input_file: str = typer.Argument(
+        ...,
+        help="Path to the CSV or Excel file to clean.",
+    ),
+    output: Optional[str] = typer.Option(
+        None, "--output", "-o",
+        help="Output file path. Default: {input}_cleaned.csv",
+    ),
+    apply: bool = typer.Option(
+        False, "--apply",
+        help="Write the output files. Without this flag, only a preview is shown.",
+    ),
+    preset: str = typer.Option(
+        "excel-hygiene", "--preset",
+        help="Preset: minimal, excel-hygiene, or paranoid.",
+    ),
+    columns: Optional[str] = typer.Option(
+        None, "--columns",
+        help="Comma-separated columns to clean (default: all string columns).",
+    ),
+    skip: Optional[str] = typer.Option(
+        None, "--skip",
+        help="Comma-separated columns to skip even if they look like text.",
+    ),
+    case: Optional[str] = typer.Option(
+        None, "--case",
+        help=(
+            "Case conversion. Bare mode 'upper'|'lower'|'title'|'sentence' applies to "
+            "all selected columns. Per-column form: 'mode:col[,mode:col]' "
+            "(e.g., 'title:name,upper:code')."
+        ),
+    ),
+    no_trim: bool = typer.Option(False, "--no-trim", help="Disable whitespace trim."),
+    no_collapse: bool = typer.Option(
+        False, "--no-collapse", help="Disable internal whitespace collapse.",
+    ),
+    no_nfc: bool = typer.Option(False, "--no-nfc", help="Disable Unicode NFC normalization."),
+    nfkc: bool = typer.Option(
+        False, "--nfkc",
+        help="Enable NFKC compat fold (lossy: ① → 1, ﬁ → fi). Default off.",
+    ),
+    no_smart_chars: bool = typer.Option(
+        False, "--no-smart-chars",
+        help="Disable smart-character folding (curly quotes, em/en-dash, NBSP).",
+    ),
+    no_zero_width: bool = typer.Option(
+        False, "--no-zero-width", help="Disable zero-width / invisible char strip.",
+    ),
+    no_bom: bool = typer.Option(False, "--no-bom", help="Disable BOM strip."),
+    no_control: bool = typer.Option(
+        False, "--no-control", help="Disable control-character strip.",
+    ),
+    no_line_endings: bool = typer.Option(
+        False, "--no-line-endings", help="Disable line-ending normalization.",
+    ),
+    full_changelog: bool = typer.Option(
+        False, "--full-changelog",
+        help="Write every cell change to the audit CSV (default caps to first 1000).",
+    ),
+    config: Optional[str] = typer.Option(
+        None, "--config",
+        help="Load options from a saved JSON config file.",
+    ),
+    save_config: Optional[str] = typer.Option(
+        None, "--save-config",
+        help="Save current options to a JSON config file.",
+    ),
+    sheet: Optional[str] = typer.Option(
+        None, "--sheet",
+        help="Excel sheet name or index (default: first sheet).",
+    ),
+    encoding_override: Optional[str] = typer.Option(
+        None, "--encoding",
+        help="Override auto-detected file encoding.",
+    ),
+    header_row: Optional[int] = typer.Option(
+        None, "--header-row",
+        help="0-based row index for the header (default: auto-detect).",
+    ),
+):
+    """Clean and normalize text in a CSV or Excel file."""
+    # Project and pandas imports are local to the command rather than at
+    # module top (presumably to keep `--help` start-up light — confirm).
+    from src.core.io import read_file, write_file
+    from src.core.text_clean import (
+        CleanOptions,
+        PRESETS,
+        clean_dataframe,
+    )
+    import pandas as pd
+
+    # ------------------------------------------------------------------
+    # Validate inputs
+    # ------------------------------------------------------------------
+    input_path = Path(input_file)
+    if not input_path.exists():
+        typer.echo(f"Error: File not found: {input_path}", err=True)
+        raise typer.Exit(1)
+
+    # The preset name is validated even when --config will override it below.
+    if preset not in PRESETS:
+        typer.echo(
+            f"Error: Unknown preset '{preset}'. "
+            f"Choose from: {', '.join(sorted(PRESETS))}.",
+            err=True,
+        )
+        raise typer.Exit(1)
+
+    # Logs go to a "logs" directory relative to the current working directory.
+    log_path = _setup_logging(Path("logs"))
+
+    # ------------------------------------------------------------------
+    # Build CleanOptions
+    # ------------------------------------------------------------------
+    # A config file replaces the preset entirely; --preset is only consulted
+    # when no --config is given.
+    if config:
+        cfg_path = Path(config)
+        if not cfg_path.exists():
+            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
+            raise typer.Exit(1)
+        options = CleanOptions.from_file(cfg_path)
+        logger.info("Loaded config from {}", cfg_path)
+    else:
+        options = CleanOptions.from_preset(preset)
+
+    # CLI overrides on top of preset/config
+    # Each flag can only switch an operation off (or NFKC on); an absent flag
+    # leaves the preset/config value untouched.
+    if no_trim:
+        options.trim = False
+    if no_collapse:
+        options.collapse_whitespace = False
+    if no_nfc:
+        options.nfc = False
+    if nfkc:
+        options.nfkc = True
+    if no_smart_chars:
+        options.fold_smart_chars = False
+    if no_zero_width:
+        options.strip_zero_width = False
+    if no_bom:
+        options.strip_bom = False
+    if no_control:
+        options.strip_control = False
+    if no_line_endings:
+        options.normalize_line_endings = False
+
+    # --columns replaces the selection whenever it is given (even if it parses
+    # to an empty list); --skip only replaces skip_columns when non-empty.
+    cols_list = _split_csv_arg(columns)
+    if cols_list is not None:
+        options.columns = cols_list
+    skip_list = _split_csv_arg(skip)
+    if skip_list:
+        options.skip_columns = skip_list
+
+    # _parse_case yields either a bare mode or a per-column map; per-column
+    # entries are merged over any already present on the options.
+    bare_case, per_col_case = _parse_case(case)
+    if bare_case:
+        options.case = bare_case  # type: ignore[assignment]
+    if per_col_case:
+        options.case_columns = {**options.case_columns, **per_col_case}  # type: ignore[dict-item]
+
+    # ------------------------------------------------------------------
+    # Save config if requested (after CLI merge so the file reflects intent)
+    # ------------------------------------------------------------------
+    if save_config:
+        saved = options.to_file(save_config)
+        typer.echo(f"Config saved to {saved}")
+
+    # ------------------------------------------------------------------
+    # Read input
+    # ------------------------------------------------------------------
+    typer.echo(f"Reading {input_path.name}...")
+    try:
+        # A --sheet value that parses as an integer is passed as a sheet
+        # index; anything else is passed through as a sheet name.
+        sheet_arg: str | int | None = None
+        if sheet is not None:
+            try:
+                sheet_arg = int(sheet)
+            except ValueError:
+                sheet_arg = sheet
+
+        df = read_file(
+            input_path,
+            encoding=encoding_override,
+            header_row=header_row,
+            sheet_name=sheet_arg if sheet_arg is not None else 0,
+        )
+        # NOTE(review): read_file can apparently return a non-DataFrame
+        # iterable (chunks?); it is concatenated into one frame here —
+        # confirm against src.core.io.
+        if not isinstance(df, pd.DataFrame):
+            df = pd.concat(list(df), ignore_index=True)
+    except Exception as e:
+        typer.echo(f"Error reading file: {e}", err=True)
+        raise typer.Exit(1)
+
+    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")
+
+    # ------------------------------------------------------------------
+    # Run pipeline
+    # ------------------------------------------------------------------
+    typer.echo("Cleaning text...")
+    try:
+        result = clean_dataframe(df, options)
+    except ValueError as e:
+        # ValueError is the pipeline's user-facing failure, e.g. a requested
+        # column that does not exist in the input.
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1)
+
+    _print_results(result, input_path, options)
+
+    # ------------------------------------------------------------------
+    # Write output
+    # ------------------------------------------------------------------
+    if apply:
+        stem = input_path.stem
+        # Default output name is "<stem>_cleaned.csv" next to the input.
+        out_path = Path(output) if output else input_path.parent / f"{stem}_cleaned.csv"
+        write_file(result.cleaned_df, out_path)
+        typer.echo(f"\nCleaned file:    {out_path}")
+
+        if not result.changes.empty:
+            # The audit file is always written next to the input file, not
+            # next to a custom --output path.
+            changes_path = input_path.parent / f"{stem}_changes.csv"
+            audit_df = result.changes
+            # Without --full-changelog only the first `cap` changes are kept.
+            cap = 1000
+            if not full_changelog and len(audit_df) > cap:
+                typer.echo(
+                    f"Note: changelog capped at {cap} rows. "
+                    f"Use --full-changelog to write all {len(audit_df)} changes."
+                )
+                audit_df = audit_df.head(cap)
+            write_file(audit_df, changes_path)
+            typer.echo(f"Changes audit:   {changes_path}")
+    else:
+        typer.echo("\nThis was a preview. Add --apply to write the output files.")
+
+    typer.echo(f"Log: {log_path}")
+
+
+# ---------------------------------------------------------------------------
+# Output formatting
+# ---------------------------------------------------------------------------
+
+def _print_results(result, input_path: Path, options) -> None:
+    """Echo a human-readable summary of a cleaning run.
+
+    Prints the headline counts, then (when anything changed) the per-column
+    change counts for up to ten columns and the first five individual
+    changes. *options* is accepted for interface symmetry but is not read.
+    """
+    say = typer.echo
+    rule = "─" * 50
+    total = result.cells_total
+    share = result.cells_changed / total * 100.0 if total else 0.0
+
+    say(f"\n{rule}")
+    say(f"  File:             {input_path.name}")
+    say(f"  Columns processed: {len(result.columns_processed)}")
+    say(f"  Cells scanned:     {result.cells_total}")
+    say(f"  Cells changed:     {result.cells_changed} ({share:.1f}%)")
+    say(rule)
+
+    if not result.cells_changed or result.changes.empty:
+        return
+
+    # Per-column change counts, most affected columns first.
+    per_column = result.changes["column"].value_counts()
+    say("\nChanges by column:")
+    for col_name, hits in per_column.head(10).items():
+        say(f"  {col_name}: {hits} cell(s)")
+    overflow = len(per_column) - 10
+    if overflow > 0:
+        say(f"  ... and {overflow} more columns")
+
+    # A handful of concrete before/after examples; reprs are truncated so
+    # one long cell cannot flood the terminal. Rows are shown 1-based.
+    say("\nFirst examples:")
+    for _, change in result.changes.head(5).iterrows():
+        before = repr(change["old"])[:40]
+        after = repr(change["new"])[:40]
+        say(
+            f"  Row {change['row'] + 1}, {change['column']}: {before} → {after} "
+            f"[{change['ops_applied']}]"
+        )
+
+
+# ---------------------------------------------------------------------------
+# __main__
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Console entry point: hand control to the Typer application."""
+    app()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/core/init.py
+++ b/src/core/init.py
@@ -59,6 +59,25 @@ from .config import (
    DeduplicationConfig,
    StrategyConfig,
 )
+from .text_clean import (
+    CleanOptions,
+    CleanResult,
+    PRESETS,
+    apply_case,
+    clean_dataframe,
+    clean_value,
+    collapse_whitespace,
+    fold_smart_chars,
+    normalize_line_endings,
+    sentence_case,
+    smart_title_case,
+    strip_bom,
+    strip_control,
+    strip_zero_width,
+    to_nfc,
+    to_nfkc,
+    trim,
+)

 __all__ = [
    # Core
@@ -90,4 +109,22 @@ __all__ = [
    "DeduplicationConfig",
    "StrategyConfig",
    "ColumnStrategyConfig",
+    # Text cleaning
+    "CleanOptions",
+    "CleanResult",
+    "PRESETS",
+    "clean_dataframe",
+    "clean_value",
+    "trim",
+    "collapse_whitespace",
+    "to_nfc",
+    "to_nfkc",
+    "fold_smart_chars",
+    "strip_zero_width",
+    "strip_bom",
+    "strip_control",
+    "normalize_line_endings",
+    "smart_title_case",
+    "sentence_case",
+    "apply_case",
 ]
--- a/src/core/text_clean.py
+++ b/src/core/text_clean.py
@@ -0,0 +1,489 @@
+"""Character-level text hygiene for DataFrames.
+
+Operations are independently toggleable, idempotent, and safe to compose.
+Each per-string helper is ``str -> str``. Numeric, datetime, and boolean
+columns pass through ``clean_dataframe`` untouched; only string cells are
+modified.
+
+See TECHNICAL.md Section 10.2 for the full functional spec.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import unicodedata
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable, Literal, Optional
+
+import pandas as pd
+from pandas.api import types as pdtypes
+
+
+# ---------------------------------------------------------------------------
+# Per-string helpers
+# ---------------------------------------------------------------------------
+
+# Smart-character map: curly quotes, dashes, ellipsis, and non-ASCII spaces,
+# each folded to its plain-ASCII equivalent.
+#
+# Every key is written as a ``\u`` escape on purpose: several of these code
+# points are invisible or indistinguishable from a regular space, so literal
+# characters are easily destroyed by editors, formatters, or copy/paste
+# without the diff showing anything.
+_SMART_CHARS: dict[str, str] = {
+    "\u2018": "'",    # LEFT SINGLE QUOTATION MARK
+    "\u2019": "'",    # RIGHT SINGLE QUOTATION MARK
+    "\u201a": "'",    # SINGLE LOW-9 QUOTATION MARK
+    "\u201b": "'",    # SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    "\u201c": '"',    # LEFT DOUBLE QUOTATION MARK
+    "\u201d": '"',    # RIGHT DOUBLE QUOTATION MARK
+    "\u201e": '"',    # DOUBLE LOW-9 QUOTATION MARK
+    "\u201f": '"',    # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    "\u2013": "-",    # EN DASH
+    "\u2014": "-",    # EM DASH
+    "\u2015": "-",    # HORIZONTAL BAR
+    "\u2212": "-",    # MINUS SIGN
+    "\u2026": "...",  # HORIZONTAL ELLIPSIS
+    "\u00a0": " ",    # NO-BREAK SPACE
+    "\u202f": " ",    # NARROW NO-BREAK SPACE
+    "\u2009": " ",    # THIN SPACE
+    "\u200a": " ",    # HAIR SPACE
+    "\u2002": " ",    # EN SPACE
+    "\u2003": " ",    # EM SPACE
+    "\u3000": " ",    # IDEOGRAPHIC SPACE
+}
+
+_SMART_TRANS = str.maketrans(_SMART_CHARS)
+
+# Zero-width / invisible characters. ``U+FEFF`` (BOM/ZWNBSP) is included; if
+# it appears at the *very start* of a cell, the BOM-strip op handles it;
+# elsewhere it is treated as a zero-width char.
+_ZERO_WIDTH = (
+    "\u200b"  # ZERO WIDTH SPACE
+    "\u200c"  # ZERO WIDTH NON-JOINER
+    "\u200d"  # ZERO WIDTH JOINER
+    "\u2060"  # WORD JOINER
+    "\u200e"  # LEFT-TO-RIGHT MARK
+    "\u200f"  # RIGHT-TO-LEFT MARK
+    "\ufeff"  # ZERO WIDTH NO-BREAK SPACE / BOM
+)
+_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
+
+# Control characters: U+0000-U+001F and U+007F, but preserve \t \n \r.
+_CONTROL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
+
+# Any run of whitespace other than \n and \r (spaces, tabs, form/vertical
+# feeds, Unicode spaces). Newlines and carriage returns are excluded so
+# multi-line cells keep their line structure; the line-ending op normalizes
+# the actual line terminators.
+_WHITESPACE_RUN_RE = re.compile(r"[^\S\n\r]+")
+
+
+def trim(s: str) -> str:
+    """Return *s* without leading/trailing whitespace; non-strings pass through."""
+    return s.strip() if isinstance(s, str) else s
+
+
+def collapse_whitespace(s: str) -> str:
+    """Replace every whitespace run matched by ``_WHITESPACE_RUN_RE`` with one space.
+
+    Newlines and carriage returns are not part of a run, so line structure
+    survives. Leading and trailing runs are collapsed to a single space, not
+    removed (use ``trim`` for that). Non-strings are returned unchanged.
+    """
+    return _WHITESPACE_RUN_RE.sub(" ", s) if isinstance(s, str) else s
+
+
+def to_nfc(s: str) -> str:
+    """Return *s* in Unicode NFC (canonical composition); non-strings pass through."""
+    return unicodedata.normalize("NFC", s) if isinstance(s, str) else s
+
+
+def to_nfkc(s: str) -> str:
+    """Return *s* in Unicode NFKC (compatibility composition). Lossy.
+
+    Non-strings are returned unchanged.
+    """
+    return unicodedata.normalize("NFKC", s) if isinstance(s, str) else s
+
+
+def fold_smart_chars(s: str) -> str:
+    """Translate *s* through ``_SMART_TRANS``, folding smart characters to ASCII.
+
+    Non-strings are returned unchanged.
+    """
+    return s.translate(_SMART_TRANS) if isinstance(s, str) else s
+
+
+def strip_zero_width(s: str) -> str:
+    """Delete every character matched by ``_ZERO_WIDTH_RE`` from *s*.
+
+    Non-strings are returned unchanged.
+    """
+    return _ZERO_WIDTH_RE.sub("", s) if isinstance(s, str) else s
+
+
+def strip_bom(s: str) -> str:
+    """Remove leading ``U+FEFF`` (BOM) characters from the start of the string.
+
+    All consecutive leading BOMs are removed; a ``U+FEFF`` anywhere else in
+    the string is left alone. Non-strings are returned unchanged.
+    """
+    if not isinstance(s, str):
+        return s
+    # Spelled as an escape: a literal BOM is invisible in source and is
+    # silently dropped by many editors and tools, which would turn this call
+    # into ``lstrip("")`` — a no-op.
+    return s.lstrip("\ufeff")
+
+
+def strip_control(s: str) -> str:
+    """Delete every character matched by ``_CONTROL_RE`` from *s*.
+
+    Non-strings are returned unchanged.
+    """
+    return _CONTROL_RE.sub("", s) if isinstance(s, str) else s
+
+
+def normalize_line_endings(s: str) -> str:
+    """Rewrite ``\\r\\n`` pairs and lone ``\\r`` characters as ``\\n``.
+
+    Non-strings are returned unchanged.
+    """
+    if not isinstance(s, str):
+        return s
+    # Collapse CRLF pairs first, then turn whatever CRs remain into LFs.
+    without_crlf = s.replace("\r\n", "\n")
+    return "\n".join(without_crlf.split("\r"))
+
+
+# Smart title-case helpers
+_TITLE_LOWERCASE_PARTICLES = {
+    "a", "an", "and", "as", "at", "but", "by", "en", "for", "if", "in", "nor",
+    "of", "on", "or", "per", "the", "to", "v", "v.", "vs", "vs.", "via",
+}
+
+
+def _is_all_caps_token(token: str) -> bool:
+    """Return True for tokens of two or more characters with a letter and no lowercase."""
+    has_letter = any(c.isalpha() for c in token)
+    has_lower = any(c.islower() for c in token)
+    return has_letter and not has_lower and len(token) >= 2
+
+
+def smart_title_case(s: str) -> str:
+    """Title-case that preserves all-caps tokens and lowercases mid-string particles.
+
+    - ``USA`` stays ``USA``.
+    - ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
+    - Only the first letter of each word is capitalized and the remaining
+      letters are lowercased, so apostrophes and hyphens do not restart
+      capitalization (``don't`` -> ``Don't``, ``o'neil`` -> ``O'neil``).
+
+    Words are delimited by any whitespace run (spaces, tabs, newlines) and
+    the original separators are kept verbatim, so every line of a multi-line
+    cell is title-cased. "First" and "last" refer to actual words: leading or
+    trailing whitespace does not stop the outermost words being capitalized.
+
+    Non-strings and the empty string are returned unchanged.
+    """
+    if not isinstance(s, str) or not s:
+        return s
+
+    # With a capturing group, re.split keeps the separators: even indices are
+    # words (possibly empty at either end), odd indices are whitespace runs.
+    parts = re.split(r"(\s+)", s)
+    word_positions = [i for i in range(0, len(parts), 2) if parts[i]]
+    if not word_positions:
+        return s
+    first_idx, last_idx = word_positions[0], word_positions[-1]
+
+    for i in word_positions:
+        tok = parts[i]
+        if _is_all_caps_token(tok):
+            continue
+        lowered = tok.lower()
+        if first_idx < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
+            parts[i] = lowered
+            continue
+        # Capitalize the first letter, lowercase every later letter, and
+        # leave non-letters (digits, apostrophes, hyphens) untouched.
+        rebuilt: list[str] = []
+        capitalized = False
+        for c in tok:
+            if not c.isalpha():
+                rebuilt.append(c)
+            elif capitalized:
+                rebuilt.append(c.lower())
+            else:
+                rebuilt.append(c.upper())
+                capitalized = True
+        parts[i] = "".join(rebuilt)
+    return "".join(parts)
+
+
+def sentence_case(s: str) -> str:
+    """Lowercase, then capitalize the first letter of each sentence.
+
+    A sentence starts at the beginning of the string and after each
+    ``. ! ?``. Quotes, parentheses, and whitespace between the terminator and
+    the next word are skipped over. A digit consumes the pending
+    capitalization without being changed, so the period inside ``2.0`` or
+    ``3.50`` does not capitalize the word that follows the number.
+
+    Non-strings and the empty string are returned unchanged.
+    """
+    if not isinstance(s, str) or not s:
+        return s
+    chars = list(s.lower())
+    capitalize_next = True
+    for i, c in enumerate(chars):
+        if c in ".!?":
+            capitalize_next = True
+        elif capitalize_next and c.isalnum():
+            # The first letter or digit after a terminator starts the
+            # sentence; only letters can be upper-cased.
+            if c.isalpha():
+                chars[i] = c.upper()
+            capitalize_next = False
+    return "".join(chars)
+
+
+# The case conversions understood by ``apply_case``.
+CaseMode = Literal["upper", "lower", "title", "sentence"]
+
+
+def apply_case(s: str, mode: CaseMode) -> str:
+    """Convert *s* according to *mode*.
+
+    Non-strings are returned unchanged before *mode* is even looked at.
+
+    Raises:
+        ValueError: If *s* is a string and *mode* is not a known case mode.
+    """
+    if isinstance(s, str):
+        if mode == "upper":
+            converted = s.upper()
+        elif mode == "lower":
+            converted = s.lower()
+        elif mode == "title":
+            converted = smart_title_case(s)
+        elif mode == "sentence":
+            converted = sentence_case(s)
+        else:
+            raise ValueError(f"Unknown case mode: {mode}")
+        return converted
+    return s
+
+
+# ---------------------------------------------------------------------------
+# Options / result dataclasses
+# ---------------------------------------------------------------------------
+
+# Every toggle a preset controls, in the key order used by each preset dict.
+_PRESET_TOGGLES: tuple[str, ...] = (
+    "trim",
+    "collapse_whitespace",
+    "nfc",
+    "nfkc",
+    "fold_smart_chars",
+    "strip_zero_width",
+    "strip_bom",
+    "strip_control",
+    "normalize_line_endings",
+)
+
+
+def _preset(*enabled: str) -> dict[str, Any]:
+    """Return a full toggle dict with *enabled* set True and the rest False."""
+    return {toggle: toggle in enabled for toggle in _PRESET_TOGGLES}
+
+
+# Named option bundles:
+#   minimal        - trim and whitespace collapse only
+#   excel-hygiene  - everything except the lossy NFKC fold
+#   paranoid       - everything, NFKC included
+PRESETS: dict[str, dict[str, Any]] = {
+    "minimal": _preset("trim", "collapse_whitespace"),
+    "excel-hygiene": _preset(*(t for t in _PRESET_TOGGLES if t != "nfkc")),
+    "paranoid": _preset(*_PRESET_TOGGLES),
+}
+
+
+@dataclass
+class CleanOptions:
+    """Toggles for character-level cleaning operations.
+
+    Defaults match the ``excel-hygiene`` preset. Options can be built from a
+    named preset, from a plain dict, or from a JSON file written by
+    ``to_file``.
+    """
+
+    # Operations
+    trim: bool = True
+    collapse_whitespace: bool = True
+    nfc: bool = True
+    nfkc: bool = False
+    fold_smart_chars: bool = True
+    strip_zero_width: bool = True
+    strip_bom: bool = True
+    strip_control: bool = True
+    normalize_line_endings: bool = True
+
+    # Case conversion: either a single mode applied to all selected columns,
+    # or a dict mapping column name -> mode for per-column control.
+    case: Optional[CaseMode] = None
+    case_columns: dict[str, CaseMode] = field(default_factory=dict)
+
+    # Scope control
+    columns: Optional[list[str]] = None  # None = all string-typed columns
+    skip_columns: list[str] = field(default_factory=list)
+
+    @classmethod
+    def from_preset(cls, name: str) -> CleanOptions:
+        """Build options from a named entry in ``PRESETS``.
+
+        Raises:
+            ValueError: If *name* is not a known preset.
+        """
+        if name not in PRESETS:
+            raise ValueError(
+                f"Unknown preset '{name}'. "
+                f"Available: {', '.join(sorted(PRESETS))}."
+            )
+        return cls(**PRESETS[name])
+
+    @classmethod
+    def from_dict(cls, data: dict) -> CleanOptions:
+        """Build options from a mapping, ignoring keys that are not fields."""
+        known = set(cls.__dataclass_fields__)
+        kwargs = {k: v for k, v in data.items() if k in known}
+        return cls(**kwargs)
+
+    def to_dict(self) -> dict:
+        """Return the options as a plain, JSON-serializable dict."""
+        return asdict(self)
+
+    def to_file(self, path: str | Path) -> Path:
+        """Write the options to *path* as indented JSON and return the path.
+
+        The file is always written as UTF-8 so it reads back identically on
+        any platform, regardless of the locale's default encoding.
+        """
+        out = Path(path)
+        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
+        return out
+
+    @classmethod
+    def from_file(cls, path: str | Path) -> CleanOptions:
+        """Load options from a JSON file written by ``to_file`` or by hand.
+
+        The file is decoded as UTF-8 rather than the locale encoding, so
+        non-ASCII column names in a hand-edited config load correctly on
+        Windows. ``utf-8-sig`` also tolerates the BOM some editors prepend.
+        """
+        text = Path(path).read_text(encoding="utf-8-sig")
+        return cls.from_dict(json.loads(text))
+
+
+@dataclass
+class CleanResult:
+    """Output of ``clean_dataframe``."""
+
+    # Copy of the input frame with the selected columns cleaned.
+    cleaned_df: pd.DataFrame
+    # One row per changed cell.
+    changes: pd.DataFrame  # cols: row, column, old, new, ops_applied
+    # Number of cells whose value differs after cleaning.
+    cells_changed: int
+    # Number of cells visited across all processed columns.
+    cells_total: int
+    # Names of the columns the pipeline ran on, in processing order.
+    columns_processed: list[str]
+
+
+# ---------------------------------------------------------------------------
+# Cell-level pipeline
+# ---------------------------------------------------------------------------
+
+def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], str]]]:
+    """Return the enabled cleaning ops as ordered ``(op_name, fn)`` pairs.
+
+    Each op is included when the option of the same name is truthy. The
+    relative order is fixed:
+      1. BOM strip, so a leading FEFF never reaches the other ops.
+      2. Line-ending normalization.
+      3. Control-character strip.
+      4. Smart-character fold.
+      5. NFC, then NFKC.
+      6. Zero-width strip.
+      7. Whitespace collapse, then trim, last.
+    """
+    ordered: tuple[tuple[str, Callable[[str], str]], ...] = (
+        ("strip_bom", strip_bom),
+        ("normalize_line_endings", normalize_line_endings),
+        ("strip_control", strip_control),
+        ("fold_smart_chars", fold_smart_chars),
+        ("nfc", to_nfc),
+        ("nfkc", to_nfkc),
+        ("strip_zero_width", strip_zero_width),
+        ("collapse_whitespace", collapse_whitespace),
+        ("trim", trim),
+    )
+    # Op names double as the CleanOptions attribute that enables them.
+    return [(op_name, fn) for op_name, fn in ordered if getattr(options, op_name)]
+
+
+def clean_value(value: Any, options: CleanOptions) -> tuple[Any, list[str]]:
+    """Run the configured pipeline over one cell.
+
+    Returns ``(cleaned_value, ops_applied)`` where ``ops_applied`` names, in
+    order, each op that actually altered the text. Anything that is not a
+    ``str`` — including ``None`` and NaN — is handed back untouched with an
+    empty list.
+    """
+    # None and float NaN are never str, so one check covers all pass-throughs.
+    if not isinstance(value, str):
+        return value, []
+
+    current = value
+    fired: list[str] = []
+    for op_name, op in _build_pipeline(options):
+        candidate = op(current)
+        if candidate == current:
+            continue
+        fired.append(op_name)
+        current = candidate
+    return current, fired
+
+
+# ---------------------------------------------------------------------------
+# DataFrame-level entry point
+# ---------------------------------------------------------------------------
+
+def _select_columns(df: pd.DataFrame, options: CleanOptions) -> list[str]:
+    """Decide which columns the pipeline should run on.
+
+    An explicit ``options.columns`` list is used as given, in its own order.
+    Otherwise every column with an object or string dtype is chosen, in frame
+    order. Names in ``options.skip_columns`` are removed either way.
+
+    Raises:
+        ValueError: If ``options.columns`` names a column that *df* lacks.
+    """
+    requested = options.columns
+    if requested is None:
+        candidates: Iterable[str] = [
+            name
+            for name in df.columns
+            if pdtypes.is_object_dtype(df[name]) or pdtypes.is_string_dtype(df[name])
+        ]
+    else:
+        absent = [name for name in requested if name not in df.columns]
+        if absent:
+            raise ValueError(
+                f"Columns not found in input: {absent}. "
+                f"Available: {list(df.columns)}"
+            )
+        candidates = requested
+
+    excluded = set(options.skip_columns)
+    return [name for name in candidates if name not in excluded]
+
+
+def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> CleanResult:
+    """Apply text-cleaning ops to selected columns of *df*.
+
+    Numeric, datetime, and boolean columns are skipped by default. The input
+    DataFrame is not mutated; a copy is returned in ``CleanResult.cleaned_df``.
+
+    A column is only written back when at least one of its cells changed, and
+    object/string columns keep their dtype when they are. Untouched columns
+    therefore come back exactly as they went in, instead of having their
+    dtype re-inferred from a Python list.
+
+    Args:
+        df: The frame to clean.
+        options: Cleaning options; defaults to ``CleanOptions()``.
+
+    Returns:
+        A ``CleanResult`` holding the cleaned copy, a per-cell change log
+        (``row`` is the 0-based positional row number), and the counts.
+
+    Raises:
+        ValueError: If ``options.columns`` names a column missing from *df*,
+            or a configured case mode is unknown.
+    """
+    options = options or CleanOptions()
+    out = df.copy()
+    columns = _select_columns(out, options)
+
+    # Per-column case modes win; the global mode fills in every other
+    # selected column.
+    case_per_col: dict[str, CaseMode] = dict(options.case_columns)
+    if options.case is not None:
+        for c in columns:
+            case_per_col.setdefault(c, options.case)
+
+    change_records: list[dict[str, Any]] = []
+    cells_total = 0
+
+    for col in columns:
+        series = out[col]
+        col_case = case_per_col.get(col)
+        new_values: list[Any] = []
+        col_changed = False
+        for row_idx, original in enumerate(series.tolist()):
+            cells_total += 1
+            cleaned, ops_applied = clean_value(original, options)
+
+            if col_case is not None and isinstance(cleaned, str):
+                cased = apply_case(cleaned, col_case)
+                if cased != cleaned:
+                    ops_applied.append(f"case:{col_case}")
+                    cleaned = cased
+
+            if ops_applied and cleaned != original:
+                col_changed = True
+                change_records.append({
+                    "row": row_idx,
+                    "column": col,
+                    "old": original,
+                    "new": cleaned,
+                    "ops_applied": ",".join(ops_applied),
+                })
+            new_values.append(cleaned)
+
+        if not col_changed:
+            # Nothing to write back; leave the column and its dtype alone.
+            continue
+        dtype = series.dtype
+        if pdtypes.is_object_dtype(dtype) or pdtypes.is_string_dtype(dtype):
+            # Pin the dtype so e.g. a pandas "string" column does not
+            # silently degrade to object.
+            out[col] = pd.Series(new_values, index=out.index, dtype=dtype)
+        else:
+            out[col] = new_values
+
+    changes_df = pd.DataFrame(
+        change_records,
+        columns=["row", "column", "old", "new", "ops_applied"],
+    )
+
+    return CleanResult(
+        cleaned_df=out,
+        changes=changes_df,
+        cells_changed=len(change_records),
+        cells_total=cells_total,
+        columns_processed=columns,
+    )
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -1,10 +1,13 @@
-"""DataTools Text Cleaner — stub page."""
+"""DataTools Text Cleaner — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
@@ -12,82 +15,236 @@ if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

 from src.gui.components import hide_streamlit_chrome
+from src.core.text_clean import (
+    PRESETS,
+    CleanOptions,
+    clean_dataframe,
+)

 hide_streamlit_chrome()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("✂️ Text Cleaner")
-st.caption("Clean and normalize text content across your data.")
-
-st.info("This tool is under development.")
+st.caption(
+    "Trim whitespace, fold smart quotes, strip invisible characters, and "
+    "normalize line endings. Runs locally — your data never leaves this computer."
+)

 # ---------------------------------------------------------------------------
-# What this tool will do
-# ---------------------------------------------------------------------------
-
-st.markdown("""
-**Features:**
- Trim leading/trailing whitespace
- Collapse multiple spaces into one
- Unicode normalization (NFC/NFKC)
- Strip non-printable / control characters
- Remove BOM (byte order mark)
- Normalize line endings (CRLF → LF)
- Case conversion (upper, lower, title, sentence)
-""")
-
-st.divider()
-
-# ---------------------------------------------------------------------------
-# File upload (functional)
+# File upload
 # ---------------------------------------------------------------------------

 uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
    key="textclean_file_upload",
 )

-if uploaded is not None:
-    import pandas as pd
-    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
-    except Exception as e:
-        st.error(f"Failed to read file: {e}")
+# Nothing below can run without a file: show a hint and end this script run.
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()

-# ---------------------------------------------------------------------------
-# Placeholder options
-# ---------------------------------------------------------------------------

-st.subheader("Operations")
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Read the uploaded bytes into a DataFrame, treating all cells as strings."""
+    suffix = Path(name).suffix.lower()
+    bio = io.BytesIO(data)
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(bio, dtype=str, keep_default_na=False)
+    # CSV / TSV — try utf-8 then utf-8-sig then latin-1 as a fallback
+    for enc in ("utf-8", "utf-8-sig", "latin-1"):
+        try:
+            bio.seek(0)
+            sep = "\t" if suffix == ".tsv" else ","
+            return pd.read_csv(
+                bio, dtype=str, keep_default_na=False,
+                encoding=enc, sep=sep, on_bad_lines="warn",
+            )
+        except UnicodeDecodeError:
+            continue
+    bio.seek(0)
+    return pd.read_csv(bio, dtype=str, keep_default_na=False, encoding="latin-1")

-st.checkbox("Trim whitespace", value=True, disabled=True)
-st.checkbox("Collapse multiple spaces", value=True, disabled=True)
-st.checkbox("Unicode normalization (NFC)", value=False, disabled=True)
-st.checkbox("Strip non-printable characters", value=False, disabled=True)
-st.checkbox("Remove BOM", value=False, disabled=True)
-st.checkbox("Normalize line endings", value=False, disabled=True)
-st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True)
+
+# Parse the upload; any failure is reported on the page and ends this run.
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as e:
+    st.error(f"Failed to read file: {e}")
+    st.stop()
+
+# Show the file's size and its first 10 rows before any cleaning is applied.
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+st.dataframe(df.head(10), use_container_width=True)

 st.divider()
-st.button("Clean Text", type="primary", use_container_width=True, disabled=True)

 # ---------------------------------------------------------------------------
-# Footer
+# Options
 # ---------------------------------------------------------------------------

-st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
+st.subheader("Options")
+
+preset_label = st.radio(
+    "Preset",
+    ["excel-hygiene (recommended)", "minimal", "paranoid"],
+    index=0,
+    horizontal=True,
+    help=(
+        "excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
+        "invisible chars, normalize line endings, NFC. "
+        "minimal: only trim and collapse. "
+        "paranoid: everything including NFKC compat fold (lossy)."
+    ),
 )
+preset_key = preset_label.split(" ", 1)[0]
+options = CleanOptions.from_preset(preset_key)
+
+with st.expander("Advanced options"):
+    col_a, col_b = st.columns(2)
+    with col_a:
+        options.trim = st.checkbox("Trim leading/trailing whitespace", value=options.trim)
+        options.collapse_whitespace = st.checkbox(
+            "Collapse internal whitespace", value=options.collapse_whitespace,
+        )
+        options.normalize_line_endings = st.checkbox(
+            "Normalize line endings (\\r\\n → \\n)", value=options.normalize_line_endings,
+        )
+        options.strip_control = st.checkbox(
+            "Strip control characters", value=options.strip_control,
+        )
+        options.strip_bom = st.checkbox("Strip BOM", value=options.strip_bom)
+    with col_b:
+        options.fold_smart_chars = st.checkbox(
+            "Fold smart characters (curly quotes, em-dash, NBSP)",
+            value=options.fold_smart_chars,
+        )
+        options.strip_zero_width = st.checkbox(
+            "Strip zero-width / invisible characters", value=options.strip_zero_width,
+        )
+        options.nfc = st.checkbox("Unicode NFC normalization", value=options.nfc)
+        options.nfkc = st.checkbox(
+            "Unicode NFKC compat fold (lossy: ① → 1, ﬁ → fi)",
+            value=options.nfkc,
+        )
+
+    st.markdown("**Scope**")
+    string_cols = [
+        c for c in df.columns
+        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
+    ]
+    selected_cols = st.multiselect(
+        "Columns to clean (default: all string columns)",
+        options=list(df.columns),
+        default=string_cols,
+    )
+    skip_cols = st.multiselect(
+        "Columns to skip even if they look like text",
+        options=list(df.columns),
+        default=[],
+    )
+    options.columns = selected_cols if selected_cols else None
+    options.skip_columns = list(skip_cols)
+
+    st.markdown("**Case conversion**")
+    case_global = st.selectbox(
+        "Apply case conversion to selected columns",
+        ["None", "UPPER", "lower", "Title", "Sentence"],
+        index=0,
+    )
+    case_map = {
+        "UPPER": "upper", "lower": "lower",
+        "Title": "title", "Sentence": "sentence",
+    }
+    if case_global != "None":
+        options.case = case_map[case_global]  # type: ignore[assignment]
+
+# ---------------------------------------------------------------------------
+# Run
+# ---------------------------------------------------------------------------
+
+st.divider()
+
+if st.button("Clean Text", type="primary", use_container_width=True):
+    with st.spinner("Cleaning..."):
+        try:
+            result = clean_dataframe(df, options)
+        except ValueError as e:
+            st.error(str(e))
+            st.stop()
+    st.session_state["textclean_result"] = result
+    st.session_state["textclean_input_name"] = uploaded.name
+
+result = st.session_state.get("textclean_result")
+if result is None:
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Cells scanned", result.cells_total)
+m2.metric("Cells changed", result.cells_changed)
+m3.metric("% changed", f"{pct:.1f}%")
+m4.metric("Columns processed", len(result.columns_processed))
+
+if result.cells_changed:
+    counts = result.changes["column"].value_counts()
+    st.markdown("**Changes by column**")
+    st.dataframe(
+        counts.rename("cells_changed").to_frame(),
+        use_container_width=True,
+    )
+
+    st.markdown("**Examples (first 25 changes)**")
+    examples = result.changes.head(25).copy()
+    examples["row"] = examples["row"] + 1
+    st.dataframe(examples, use_container_width=True, hide_index=True)
+
+st.markdown("**Cleaned preview (first 10 rows)**")
+st.dataframe(result.cleaned_df.head(10), use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
+# ---------------------------------------------------------------------------
+
+st.divider()
+stem = Path(st.session_state.get("textclean_input_name", "input")).stem
+
+dl_a, dl_b, dl_c = st.columns(3)
+with dl_a:
+    cleaned_bytes = result.cleaned_df.to_csv(index=False).encode("utf-8-sig")
+    st.download_button(
+        "Download cleaned CSV",
+        data=cleaned_bytes,
+        file_name=f"{stem}_cleaned.csv",
+        mime="text/csv",
+    )
+with dl_b:
+    if not result.changes.empty:
+        changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
+        st.download_button(
+            "Download changes audit",
+            data=changes_bytes,
+            file_name=f"{stem}_changes.csv",
+            mime="text/csv",
+        )
+with dl_c:
+    config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
+    st.download_button(
+        "Download config JSON",
+        data=config_bytes,
+        file_name="text_clean_config.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")