feat: implement text cleaner (script 02) with CLI, GUI, and tests

Builds 02_text_cleaner.py from stub to working: character-level hygiene
for CSV/Excel inputs covering trim, whitespace collapse, smart-character
folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char
strip, line-ending normalization, and per-column case conversion. Three
presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult +
  clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape
  (dry-run by default, --apply writes cleaned + changes audit, JSON
  config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset
  picker, advanced toggles, preview, before/after metrics, and three
  download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests
  covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10
  in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case
  fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7
entry locking the spec, CLI-REFERENCE.md gains the text cleaner
section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md
status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:14:15 +00:00
parent b2ca04e6f4
commit 54f92ae47e
28 changed files with 2093 additions and 58 deletions

373
src/cli_text_clean.py Normal file
View File

@@ -0,0 +1,373 @@
"""CLI for the DataTools text cleaner (script 02).
Usage:
python -m src.cli_text_clean input.csv # dry-run preview
python -m src.cli_text_clean input.csv --apply # write cleaned file
python -m src.cli_text_clean input.csv --preset minimal --apply
python -m src.cli_text_clean input.csv --case upper:name --apply
python -m src.cli_text_clean --help # full help
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the text-cleaner CLI. The ``help`` text below is
# the top-level usage guide shown by ``--help``; ``no_args_is_help=True`` also
# prints it when the program is invoked with no arguments, and
# ``add_completion=False`` leaves out the shell-completion install options.
app = typer.Typer(
    name="text-clean",
    help=(
        "Clean and normalize text content in CSV and Excel files.\n\n"
        "By default, runs in preview mode — shows what would change without "
        "modifying anything. Add --apply to write the output.\n\n"
        "Examples:\n\n"
        " # Preview what would change\n"
        " python -m src.cli_text_clean messy.csv\n\n"
        " # Apply the safe defaults (excel-hygiene preset)\n"
        " python -m src.cli_text_clean messy.csv --apply\n\n"
        " # Minimal: only trim and collapse whitespace\n"
        " python -m src.cli_text_clean messy.csv --preset minimal --apply\n\n"
        " # Title-case the 'name' column, leave others alone for case\n"
        " python -m src.cli_text_clean people.csv --case title:name --apply\n\n"
        " # Clean only specific columns\n"
        " python -m src.cli_text_clean orders.csv --columns vendor,product --apply\n\n"
        " # Skip a free-text column from cleaning\n"
        " python -m src.cli_text_clean tickets.csv --skip notes --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Route loguru output to stderr and to a fresh timestamped file.

    The directory is created if needed. Existing loguru sinks are removed
    first, then two sinks are installed: stderr at WARNING (message only) and
    ``text_clean_<YYYYmmdd_HHMMSS>.log`` inside *log_dir* at DEBUG.

    Returns the path of the log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = log_dir / f"text_clean_{stamp}.log"

    # Start from a clean slate so only the two sinks below are active.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
    logger.add(str(destination), level="DEBUG", format=file_format)
    return destination
def _parse_case(raw: Optional[str]) -> tuple[Optional[str], dict[str, str]]:
    """Split a ``--case`` value into a blanket mode and per-column modes.

    Accepted shapes:
        ``upper``                 -> ``("upper", {})``
        ``title:name``            -> ``(None, {"name": "title"})``
        ``upper:code,title:name`` -> ``(None, {"code": "upper", "name": "title"})``

    A value without any ``:`` is returned (stripped) as the blanket mode.
    Otherwise every non-empty comma-separated piece must be ``mode:col``; a
    piece lacking ``:`` raises ``typer.BadParameter``. Mode names are not
    validated here.
    """
    if not raw:
        return None, {}
    if ":" not in raw:
        # Bare form: one mode for every selected column.
        return raw.strip(), {}

    mapping: dict[str, str] = {}
    for chunk in (part.strip() for part in raw.split(",")):
        if not chunk:
            continue
        mode, colon, column = chunk.partition(":")
        if not colon:
            raise typer.BadParameter(
                f"Invalid --case piece: '{chunk}'. "
                f"Expected 'mode' or 'mode:col[,mode:col...]' "
                f"(e.g., 'upper' or 'title:name,upper:code')."
            )
        mapping[column.strip()] = mode.strip()
    return None, mapping
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
    """Split a comma-separated option value into trimmed, non-empty items.

    ``None`` (option not given) is passed through as ``None`` so callers can
    tell "absent" from "given but empty" (which yields ``[]``).
    """
    if raw is None:
        return None
    trimmed = (item.strip() for item in raw.split(","))
    return [item for item in trimmed if item]
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
# NOTE: the function docstring below is kept verbatim on purpose; only ``#``
# comments are added in this command.
@app.command()
def clean(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to clean.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_cleaned.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output files. Without this flag, only a preview is shown.",
    ),
    preset: str = typer.Option(
        "excel-hygiene", "--preset",
        help="Preset: minimal, excel-hygiene, or paranoid.",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to clean (default: all string columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip even if they look like text.",
    ),
    case: Optional[str] = typer.Option(
        None, "--case",
        help=(
            "Case conversion. Bare mode 'upper'|'lower'|'title'|'sentence' applies to "
            "all selected columns. Per-column form: 'mode:col[,mode:col]' "
            "(e.g., 'title:name,upper:code')."
        ),
    ),
    no_trim: bool = typer.Option(False, "--no-trim", help="Disable whitespace trim."),
    no_collapse: bool = typer.Option(
        False, "--no-collapse", help="Disable internal whitespace collapse.",
    ),
    no_nfc: bool = typer.Option(False, "--no-nfc", help="Disable Unicode NFC normalization."),
    nfkc: bool = typer.Option(
        False, "--nfkc",
        help="Enable NFKC compat fold (lossy: ① → 1, fi → fi). Default off.",
    ),
    no_smart_chars: bool = typer.Option(
        False, "--no-smart-chars",
        help="Disable smart-character folding (curly quotes, em/en-dash, NBSP).",
    ),
    no_zero_width: bool = typer.Option(
        False, "--no-zero-width", help="Disable zero-width / invisible char strip.",
    ),
    no_bom: bool = typer.Option(False, "--no-bom", help="Disable BOM strip."),
    no_control: bool = typer.Option(
        False, "--no-control", help="Disable control-character strip.",
    ),
    no_line_endings: bool = typer.Option(
        False, "--no-line-endings", help="Disable line-ending normalization.",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every cell change to the audit CSV (default caps to first 1000).",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Clean and normalize text in a CSV or Excel file."""
    # Project and pandas imports are deferred to call time.
    # NOTE(review): presumably to keep `--help` fast — confirm.
    from src.core.io import read_file, write_file
    from src.core.text_clean import (
        CleanOptions,
        PRESETS,
        clean_dataframe,
    )
    import pandas as pd
    # ------------------------------------------------------------------
    # Validate inputs
    # ------------------------------------------------------------------
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    # The preset name is checked even when --config is given, although a
    # loaded config replaces the preset entirely further down.
    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)
    # Logs go to a "logs" directory relative to the current working directory.
    log_path = _setup_logging(Path("logs"))
    # ------------------------------------------------------------------
    # Build CleanOptions
    # ------------------------------------------------------------------
    # Base options come from the config file if one is given, else the preset.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = CleanOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = CleanOptions.from_preset(preset)
    # CLI overrides on top of preset/config
    # Each flag can only push its option one way (off, or on for --nfkc); an
    # absent flag leaves the preset/config value untouched.
    if no_trim:
        options.trim = False
    if no_collapse:
        options.collapse_whitespace = False
    if no_nfc:
        options.nfc = False
    if nfkc:
        options.nfkc = True
    if no_smart_chars:
        options.fold_smart_chars = False
    if no_zero_width:
        options.strip_zero_width = False
    if no_bom:
        options.strip_bom = False
    if no_control:
        options.strip_control = False
    if no_line_endings:
        options.normalize_line_endings = False
    # --columns overrides whenever it was passed (even if it parses to an
    # empty list); --skip overrides only when it yields at least one name.
    cols_list = _split_csv_arg(columns)
    if cols_list is not None:
        options.columns = cols_list
    skip_list = _split_csv_arg(skip)
    if skip_list:
        options.skip_columns = skip_list
    # A bare mode replaces options.case; per-column modes are merged over any
    # already present, with the CLI value winning on conflict.
    bare_case, per_col_case = _parse_case(case)
    if bare_case:
        options.case = bare_case  # type: ignore[assignment]
    if per_col_case:
        options.case_columns = {**options.case_columns, **per_col_case}  # type: ignore[dict-item]
    # ------------------------------------------------------------------
    # Save config if requested (after CLI merge so the file reflects intent)
    # ------------------------------------------------------------------
    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")
    # ------------------------------------------------------------------
    # Read input
    # ------------------------------------------------------------------
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A --sheet value that parses as an integer is used as a sheet index;
        # anything else is used as a sheet name.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
        )
        # NOTE(review): read_file can apparently return an iterable of
        # DataFrames (chunked read?) — confirm against src.core.io.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")
    # ------------------------------------------------------------------
    # Run pipeline
    # ------------------------------------------------------------------
    typer.echo("Cleaning text...")
    try:
        result = clean_dataframe(df, options)
    except ValueError as e:
        # Only ValueError is reported as a user error; anything else propagates.
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    _print_results(result, input_path, options)
    # ------------------------------------------------------------------
    # Write output
    # ------------------------------------------------------------------
    if apply:
        stem = input_path.stem
        # Default output sits next to the input and always uses a .csv suffix.
        out_path = Path(output) if output else input_path.parent / f"{stem}_cleaned.csv"
        write_file(result.cleaned_df, out_path)
        typer.echo(f"\nCleaned file: {out_path}")
        if not result.changes.empty:
            # The audit file is written beside the *input*, regardless of --output.
            changes_path = input_path.parent / f"{stem}_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            write_file(audit_df, changes_path)
            typer.echo(f"Changes audit: {changes_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output files.")
    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path, options) -> None:
    """Print a human-readable summary of a cleaning run.

    Shows the file name, the number of processed columns, scanned and changed
    cell counts (with percentage), then — if anything changed — per-column
    change counts (top 10) and the first five changes as ``old → new``.

    Args:
        result: Object exposing ``cells_changed``, ``cells_total``,
            ``columns_processed`` and a ``changes`` DataFrame with columns
            ``row``, ``column``, ``old``, ``new``, ``ops_applied``.
        input_path: Path of the input file; only its name is printed.
        options: Accepted for call-site compatibility; not used here.
    """
    # Built outside the f-strings: a backslash escape inside an f-string
    # expression is a syntax error before Python 3.12.
    rule = "\u2500" * 50
    pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
    typer.echo(f"\n{rule}")
    typer.echo(f" File: {input_path.name}")
    typer.echo(f" Columns processed: {len(result.columns_processed)}")
    typer.echo(f" Cells scanned: {result.cells_total}")
    typer.echo(f" Cells changed: {result.cells_changed} ({pct:.1f}%)")
    typer.echo(f"{rule}")
    if not result.cells_changed or result.changes.empty:
        return

    # Per-column change counts, most-changed first.
    counts = result.changes["column"].value_counts()
    typer.echo("\nChanges by column:")
    for col, n in counts.head(10).items():
        typer.echo(f" {col}: {n} cell(s)")
    if len(counts) > 10:
        typer.echo(f" ... and {len(counts) - 10} more columns")

    # Show the first few examples; repr() makes invisible characters visible
    # and each side is truncated to 40 characters.
    typer.echo("\nFirst examples:")
    for _, row in result.changes.head(5).iterrows():
        old = repr(row["old"])[:40]
        new = repr(row["new"])[:40]
        # Stored row numbers are 0-based; display them 1-based.
        typer.echo(
            f" Row {row['row'] + 1}, {row['column']}: {old} \u2192 {new} "
            f"[{row['ops_applied']}]"
        )
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
def main():
    """Run the Typer application."""
    app()


# Allow `python -m src.cli_text_clean ...` / direct script execution.
if __name__ == "__main__":
    main()

View File

@@ -59,6 +59,25 @@ from .config import (
DeduplicationConfig,
StrategyConfig,
)
from .text_clean import (
CleanOptions,
CleanResult,
PRESETS,
apply_case,
clean_dataframe,
clean_value,
collapse_whitespace,
fold_smart_chars,
normalize_line_endings,
sentence_case,
smart_title_case,
strip_bom,
strip_control,
strip_zero_width,
to_nfc,
to_nfkc,
trim,
)
__all__ = [
# Core
@@ -90,4 +109,22 @@ __all__ = [
"DeduplicationConfig",
"StrategyConfig",
"ColumnStrategyConfig",
# Text cleaning
"CleanOptions",
"CleanResult",
"PRESETS",
"clean_dataframe",
"clean_value",
"trim",
"collapse_whitespace",
"to_nfc",
"to_nfkc",
"fold_smart_chars",
"strip_zero_width",
"strip_bom",
"strip_control",
"normalize_line_endings",
"smart_title_case",
"sentence_case",
"apply_case",
]

489
src/core/text_clean.py Normal file
View File

@@ -0,0 +1,489 @@
"""Character-level text hygiene for DataFrames.
Operations are independently toggleable, idempotent, and safe to compose.
Each per-string helper is ``str -> str``. Numeric, datetime, and boolean
columns pass through ``clean_dataframe`` untouched; only string cells are
modified.
See TECHNICAL.md Section 10.2 for the full functional spec.
"""
from __future__ import annotations
import json
import re
import unicodedata
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable, Iterable, Literal, Optional
import pandas as pd
from pandas.api import types as pdtypes
# ---------------------------------------------------------------------------
# Per-string helpers
# ---------------------------------------------------------------------------
# Smart-character map: typographic quotes, dashes, the ellipsis, and
# non-breaking / fixed-width spaces, each folded to a plain ASCII equivalent.
# Keys are written as explicit \u escapes so the exact code point is visible
# in any editor and survives copy/paste and re-encoding.
_SMART_CHARS: dict[str, str] = {
    "\u2018": "'",    # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",    # RIGHT SINGLE QUOTATION MARK
    "\u201a": "'",    # SINGLE LOW-9 QUOTATION MARK
    "\u201b": "'",    # SINGLE HIGH-REVERSED-9 QUOTATION MARK
    "\u201c": '"',    # LEFT DOUBLE QUOTATION MARK
    "\u201d": '"',    # RIGHT DOUBLE QUOTATION MARK
    "\u201e": '"',    # DOUBLE LOW-9 QUOTATION MARK
    "\u201f": '"',    # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
    "\u2013": "-",    # EN DASH
    "\u2014": "-",    # EM DASH
    "\u2015": "-",    # HORIZONTAL BAR
    "\u2212": "-",    # MINUS SIGN
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u00a0": " ",    # NO-BREAK SPACE
    "\u202f": " ",    # NARROW NO-BREAK SPACE
    "\u2009": " ",    # THIN SPACE
    "\u200a": " ",    # HAIR SPACE
    "\u2002": " ",    # EN SPACE
    "\u2003": " ",    # EM SPACE
    "\u3000": " ",    # IDEOGRAPHIC SPACE
}
# Translation table for ``str.translate``; every key above must be exactly one
# character or ``str.maketrans`` raises ``ValueError``.
_SMART_TRANS = str.maketrans(_SMART_CHARS)
# Zero-width / invisible characters, written as explicit escapes because the
# characters themselves cannot be seen in an editor. ``U+FEFF`` (BOM / ZWNBSP)
# is included: ``strip_bom`` removes it only from the start of a string, while
# this set removes it wherever it occurs.
_ZERO_WIDTH = (
    "\u200b"  # ZERO WIDTH SPACE
    "\u200c"  # ZERO WIDTH NON-JOINER
    "\u200d"  # ZERO WIDTH JOINER
    "\u2060"  # WORD JOINER
    "\u200e"  # LEFT-TO-RIGHT MARK
    "\u200f"  # RIGHT-TO-LEFT MARK
    "\ufeff"  # ZERO WIDTH NO-BREAK SPACE / BOM
)
# Character class matching any single character listed above.
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
# Control characters to delete: U+0000-U+001F and U+007F (DEL), except tab
# (\x09), line feed (\x0A) and carriage return (\x0D), which the character
# class deliberately skips over.
_CONTROL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
# Any run of whitespace other than newline and carriage return. The class
# reads "not non-whitespace, not \n, not \r", so it covers spaces, tabs and
# every other character the regex engine treats as whitespace. Newlines and
# carriage returns are excluded so multi-line cells keep their line
# structure; the line-ending op normalizes the actual line terminators.
_WHITESPACE_RUN_RE = re.compile(r"[^\S\n\r]+")
def trim(s: str) -> str:
    """Return *s* without leading or trailing whitespace.

    Non-string input is handed back untouched.
    """
    return s.strip() if isinstance(s, str) else s
def collapse_whitespace(s: str) -> str:
    """Replace each run of non-newline whitespace with one regular space.

    Runs at the very start or end of the string are collapsed too, not
    removed (``trim`` does that). ``\\n`` and ``\\r`` are left alone.
    Non-string input is handed back untouched.
    """
    if isinstance(s, str):
        return _WHITESPACE_RUN_RE.sub(" ", s)
    return s
def to_nfc(s: str) -> str:
    """Return *s* in Unicode Normalization Form C (canonical composition).

    Non-string input is handed back untouched.
    """
    return unicodedata.normalize("NFC", s) if isinstance(s, str) else s
def to_nfkc(s: str) -> str:
    """Return *s* in Unicode Normalization Form KC (compatibility composition).

    This is lossy: compatibility characters are replaced by their plain
    equivalents. Non-string input is handed back untouched.
    """
    return unicodedata.normalize("NFKC", s) if isinstance(s, str) else s
def fold_smart_chars(s: str) -> str:
    """Translate typographic characters to ASCII using ``_SMART_TRANS``.

    Covers the curly quotes, dashes, ellipsis and special spaces listed in
    ``_SMART_CHARS``. Non-string input is handed back untouched.
    """
    if isinstance(s, str):
        return s.translate(_SMART_TRANS)
    return s
def strip_zero_width(s: str) -> str:
    """Delete every character matched by ``_ZERO_WIDTH_RE`` from *s*.

    Non-string input is handed back untouched.
    """
    if isinstance(s, str):
        return _ZERO_WIDTH_RE.sub("", s)
    return s
def strip_bom(s: str) -> str:
    """Remove leading ``U+FEFF`` (byte-order mark) characters from *s*.

    Only the start of the string is affected; a ``U+FEFF`` elsewhere is left
    for the zero-width strip. Consecutive leading marks are all removed.
    Non-string input is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Explicit escape: the character is invisible, and an empty-string
    # argument would make lstrip() a silent no-op.
    return s.lstrip("\ufeff")
def strip_control(s: str) -> str:
    """Delete every character matched by ``_CONTROL_RE`` from *s*.

    Tab, line feed and carriage return are not in that pattern and survive.
    Non-string input is handed back untouched.
    """
    if isinstance(s, str):
        return _CONTROL_RE.sub("", s)
    return s
def normalize_line_endings(s: str) -> str:
    """Convert ``\\r\\n`` pairs and lone ``\\r`` characters to ``\\n``.

    Non-string input is handed back untouched.
    """
    if isinstance(s, str):
        # Pairs first, so a CRLF becomes one newline rather than two.
        without_crlf = s.replace("\r\n", "\n")
        return without_crlf.replace("\r", "\n")
    return s
# Smart title-case helpers
# Words kept lowercase when they appear strictly between the first and last
# space-separated token.
_TITLE_LOWERCASE_PARTICLES = {
    "a", "an", "and", "as", "at", "but", "by", "en", "for", "if", "in",
    "nor", "of", "on", "or", "per", "the", "to", "v", "v.", "vs", "vs.",
    "via",
}


def _is_all_caps_token(token: str) -> bool:
    """Tell whether *token* should be treated as an acronym.

    True when the token is at least two characters long, contains at least
    one alphabetic character, and contains no lowercase character.
    """
    if len(token) < 2:
        return False
    if any(ch.islower() for ch in token):
        return False
    return any(ch.isalpha() for ch in token)


def _capitalize_first_letter(token: str) -> str:
    """Uppercase the first alphabetic character of *token*, lowercase the rest.

    Non-alphabetic characters (digits, apostrophes, hyphens, ...) are copied
    through and do not restart capitalization.
    """
    pieces: list[str] = []
    seen_letter = False
    for ch in token:
        if not ch.isalpha():
            pieces.append(ch)
        elif seen_letter:
            pieces.append(ch.lower())
        else:
            pieces.append(ch.upper())
            seen_letter = True
    return "".join(pieces)


def smart_title_case(s: str) -> str:
    """Title-case *s* token by token, splitting on single spaces.

    Rules, applied to each token in this order:
    - Empty tokens (from consecutive spaces) are kept as they are.
    - All-caps tokens of two or more characters stay unchanged (``USA``).
    - Particles such as ``of``, ``and``, ``the`` are lowercased unless the
      token is the first or the last one.
    - Otherwise the first letter is uppercased and all later letters are
      lowercased, so ``o'neil`` becomes ``O'neil``.

    Empty strings and non-string input are handed back untouched.
    """
    if not isinstance(s, str) or not s:
        return s

    words = s.split(" ")
    final_position = len(words) - 1

    def convert(position: int, word: str) -> str:
        if not word or _is_all_caps_token(word):
            return word
        lowered = word.lower()
        is_interior = 0 < position < final_position
        if is_interior and lowered in _TITLE_LOWERCASE_PARTICLES:
            return lowered
        return _capitalize_first_letter(word)

    return " ".join(convert(i, w) for i, w in enumerate(words))
def sentence_case(s: str) -> str:
    """Lowercase *s*, then uppercase the first letter of each sentence.

    A "sentence start" is the first alphabetic character at the beginning of
    the string or after any ``.``, ``!`` or ``?``. Characters in between that
    are not letters (spaces, quotes, parentheses, digits) do not use up the
    pending capitalization. Empty strings and non-string input are handed
    back untouched.
    """
    if not isinstance(s, str) or not s:
        return s

    pending = True
    built: list[str] = []
    for ch in s.lower():
        if ch in ".!?":
            pending = True
        elif pending and ch.isalpha():
            ch = ch.upper()
            pending = False
        built.append(ch)
    return "".join(built)
# The four supported case-conversion modes.
CaseMode = Literal["upper", "lower", "title", "sentence"]


def apply_case(s: str, mode: CaseMode) -> str:
    """Convert *s* according to *mode*.

    ``"upper"`` / ``"lower"`` use the built-in string methods, ``"title"``
    delegates to ``smart_title_case`` and ``"sentence"`` to
    ``sentence_case``. Non-string input is handed back untouched (the mode is
    not checked in that case).

    Raises:
        ValueError: if *mode* is not one of the four supported names.
    """
    if not isinstance(s, str):
        return s
    if mode == "title":
        return smart_title_case(s)
    if mode == "sentence":
        return sentence_case(s)
    if mode == "upper" or mode == "lower":
        return s.upper() if mode == "upper" else s.lower()
    raise ValueError(f"Unknown case mode: {mode}")
# ---------------------------------------------------------------------------
# Options / result dataclasses
# ---------------------------------------------------------------------------
# Named option bundles. Each preset maps the nine operation toggles of
# ``CleanOptions`` to on/off; ``CleanOptions.from_preset`` passes the chosen
# dict straight to the constructor as keyword arguments.
PRESETS: dict[str, dict[str, Any]] = {
    # Whitespace only: trim + collapse, everything else off.
    "minimal": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": False,
        "nfkc": False,
        "fold_smart_chars": False,
        "strip_zero_width": False,
        "strip_bom": False,
        "strip_control": False,
        "normalize_line_endings": False,
    },
    # Every operation on except the lossy NFKC fold.
    "excel-hygiene": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": True,
        "nfkc": False,
        "fold_smart_chars": True,
        "strip_zero_width": True,
        "strip_bom": True,
        "strip_control": True,
        "normalize_line_endings": True,
    },
    # Every operation on, including NFKC.
    "paranoid": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": True,
        "nfkc": True,
        "fold_smart_chars": True,
        "strip_zero_width": True,
        "strip_bom": True,
        "strip_control": True,
        "normalize_line_endings": True,
    },
}
@dataclass
class CleanOptions:
    """Toggles for character-level cleaning operations.

    Defaults match the ``excel-hygiene`` preset. Instances can be built from
    a preset name, from a plain dict, or from a JSON file, and serialized
    back to a dict or JSON file.
    """

    # Operations
    trim: bool = True
    collapse_whitespace: bool = True
    nfc: bool = True
    nfkc: bool = False
    fold_smart_chars: bool = True
    strip_zero_width: bool = True
    strip_bom: bool = True
    strip_control: bool = True
    normalize_line_endings: bool = True
    # Case conversion: either a single mode applied to all selected columns,
    # or a dict mapping column name -> mode for per-column control.
    case: Optional[CaseMode] = None
    case_columns: dict[str, CaseMode] = field(default_factory=dict)
    # Scope control
    columns: Optional[list[str]] = None  # None = all string-typed columns
    skip_columns: list[str] = field(default_factory=list)

    @classmethod
    def from_preset(cls, name: str) -> CleanOptions:
        """Build options from a named entry of ``PRESETS``.

        Raises:
            ValueError: if *name* is not a known preset.
        """
        if name not in PRESETS:
            raise ValueError(
                f"Unknown preset '{name}'. "
                f"Available: {', '.join(sorted(PRESETS))}."
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> CleanOptions:
        """Build options from *data*, silently ignoring unrecognized keys."""
        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return all fields as a plain (JSON-serializable) dict."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Write the options as indented JSON to *path* and return the path.

        The file is always written as UTF-8 so it reads back identically on
        every platform.
        """
        out = Path(path)
        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> CleanOptions:
        """Load options from a JSON file written by ``to_file`` or by hand.

        The file is decoded as ``utf-8-sig`` rather than the platform default:
        hand-edited configs with non-ASCII column names decode correctly, and
        a leading BOM (which ``json.loads`` rejects) is tolerated.
        """
        text = Path(path).read_text(encoding="utf-8-sig")
        return cls.from_dict(json.loads(text))
@dataclass
class CleanResult:
    """Output of ``clean_dataframe``."""

    # Copy of the input frame with the selected columns cleaned.
    cleaned_df: pd.DataFrame
    # One row per changed cell.
    changes: pd.DataFrame  # cols: row, column, old, new, ops_applied
    # Number of cells whose value differs after cleaning.
    cells_changed: int
    # Number of cells visited in the processed columns (changed or not).
    cells_total: int
    # Names of the columns the pipeline ran on, in processing order.
    columns_processed: list[str]
# ---------------------------------------------------------------------------
# Cell-level pipeline
# ---------------------------------------------------------------------------
def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], str]]]:
    """Return the enabled operations as ordered ``(op_name, fn)`` pairs.

    The order is fixed; toggles only decide which steps are present:
      1. ``strip_bom``
      2. ``normalize_line_endings``
      3. ``strip_control``
      4. ``fold_smart_chars``
      5. ``nfc``
      6. ``nfkc``
      7. ``strip_zero_width``
      8. ``collapse_whitespace``
      9. ``trim``

    BOM removal comes first and the two whitespace steps come last, so
    whitespace exposed by earlier removals is still collapsed and trimmed.
    """
    candidates = (
        (options.strip_bom, "strip_bom", strip_bom),
        (options.normalize_line_endings, "normalize_line_endings", normalize_line_endings),
        (options.strip_control, "strip_control", strip_control),
        (options.fold_smart_chars, "fold_smart_chars", fold_smart_chars),
        (options.nfc, "nfc", to_nfc),
        (options.nfkc, "nfkc", to_nfkc),
        (options.strip_zero_width, "strip_zero_width", strip_zero_width),
        (options.collapse_whitespace, "collapse_whitespace", collapse_whitespace),
        (options.trim, "trim", trim),
    )
    return [(label, fn) for enabled, label, fn in candidates if enabled]
def clean_value(value: Any, options: CleanOptions) -> tuple[Any, list[str]]:
    """Run the configured pipeline over one cell.

    Returns ``(cleaned_value, ops_applied)`` where ``ops_applied`` names, in
    order, only those steps that actually altered the text. Anything that is
    not a ``str`` — including ``None`` and NaN — is returned as-is with an
    empty list.
    """
    if not isinstance(value, str):
        return value, []

    text = value
    effective: list[str] = []
    for label, step in _build_pipeline(options):
        result = step(text)
        if result != text:
            effective.append(label)
            text = result
    return text, effective
# ---------------------------------------------------------------------------
# DataFrame-level entry point
# ---------------------------------------------------------------------------
def _select_columns(df: pd.DataFrame, options: CleanOptions) -> list[str]:
    """Work out which columns of *df* the pipeline should touch.

    With an explicit ``options.columns`` list, that list is used in the given
    order after checking every name exists. Without one, every column whose
    dtype pandas reports as object or string is taken, in frame order. In
    both cases names in ``options.skip_columns`` are removed.

    Raises:
        ValueError: if an explicitly requested column is not in *df*.
    """
    requested = options.columns
    if requested is None:
        candidates = [
            name for name in df.columns
            if pdtypes.is_object_dtype(df[name]) or pdtypes.is_string_dtype(df[name])
        ]
    else:
        missing = [name for name in requested if name not in df.columns]
        if missing:
            raise ValueError(
                f"Columns not found in input: {missing}. "
                f"Available: {list(df.columns)}"
            )
        candidates = requested

    excluded = set(options.skip_columns)
    return [name for name in candidates if name not in excluded]
def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> CleanResult:
    """Clean the selected columns of *df* and report every changed cell.

    Works on a copy, so *df* itself is left as it was. Columns are chosen by
    ``_select_columns``; each cell goes through ``clean_value`` and then, if
    the column has a case mode and the cell is a string, ``apply_case``.
    When *options* is omitted, default ``CleanOptions()`` are used.

    Returns a ``CleanResult`` whose ``changes`` frame has one row per changed
    cell with a 0-based positional ``row`` number.
    """
    opts = options or CleanOptions()
    result_df = df.copy()
    selected = _select_columns(result_df, opts)

    # Explicit per-column modes win; the blanket mode fills in the rest.
    modes: dict[str, CaseMode] = dict(opts.case_columns)
    if opts.case is not None:
        for name in selected:
            modes.setdefault(name, opts.case)

    records: list[dict[str, Any]] = []
    total = 0
    for name in selected:
        mode = modes.get(name)
        originals = result_df[name].tolist()
        total += len(originals)
        updated: list[Any] = []
        for position, before in enumerate(originals):
            after, ops = clean_value(before, opts)
            if mode is not None and isinstance(after, str):
                recased = apply_case(after, mode)
                if recased != after:
                    ops.append(f"case:{mode}")
                    after = recased
            # An empty ops list short-circuits the comparison, so missing
            # values are never compared with ``!=``.
            if ops and after != before:
                records.append({
                    "row": position,
                    "column": name,
                    "old": before,
                    "new": after,
                    "ops_applied": ",".join(ops),
                })
            updated.append(after)
        result_df[name] = updated

    changes = pd.DataFrame(
        records,
        columns=["row", "column", "old", "new", "ops_applied"],
    )
    return CleanResult(
        cleaned_df=result_df,
        changes=changes,
        cells_changed=len(records),
        cells_total=total,
        columns_processed=selected,
    )

View File

@@ -1,10 +1,13 @@
"""DataTools Text Cleaner — stub page."""
"""DataTools Text Cleaner — Streamlit page."""
from __future__ import annotations
import io
import json
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
_project_root = Path(__file__).resolve().parent.parent.parent.parent
@@ -12,82 +15,236 @@ if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome
from src.core.text_clean import (
PRESETS,
CleanOptions,
clean_dataframe,
)
hide_streamlit_chrome()
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("✂️ Text Cleaner")
st.caption("Clean and normalize text content across your data.")
st.info("This tool is under development.")
st.caption(
"Trim whitespace, fold smart quotes, strip invisible characters, and "
"normalize line endings. Runs locally — your data never leaves this computer."
)
# ---------------------------------------------------------------------------
# What this tool will do
# ---------------------------------------------------------------------------
st.markdown("""
**Features:**
- Trim leading/trailing whitespace
- Collapse multiple spaces into one
- Unicode normalization (NFC/NFKC)
- Strip non-printable / control characters
- Remove BOM (byte order mark)
- Normalize line endings (CRLF → LF)
- Case conversion (upper, lower, title, sentence)
""")
st.divider()
# ---------------------------------------------------------------------------
# File upload (functional)
# File upload
# ---------------------------------------------------------------------------
uploaded = st.file_uploader(
"Upload CSV or Excel file",
type=["csv", "tsv", "xlsx", "xls"],
help="Upload a file to preview. Processing is not yet available.",
key="textclean_file_upload",
)
if uploaded is not None:
import pandas as pd
try:
if uploaded.name.endswith((".xlsx", ".xls")):
df = pd.read_excel(uploaded)
else:
df = pd.read_csv(uploaded)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
except Exception as e:
st.error(f"Failed to read file: {e}")
if uploaded is None:
st.info("Upload a CSV, TSV, or Excel file to begin.")
st.stop()
# ---------------------------------------------------------------------------
# Placeholder options
# ---------------------------------------------------------------------------
st.subheader("Operations")
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame, treating all cells as strings.

    Excel files (``.xlsx`` / ``.xls``) go through ``pd.read_excel``. Anything
    else is parsed as delimited text: tab-separated for ``.tsv``, otherwise
    comma-separated. Text is decoded as UTF-8 (a leading BOM is accepted and
    dropped); if that fails the bytes are re-read as Latin-1, which accepts
    any byte sequence. Empty cells stay empty strings rather than NaN.
    """
    suffix = Path(name).suffix.lower()
    buffer = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(buffer, dtype=str, keep_default_na=False)

    # Same parser settings for both attempts, so the fallback cannot lose the
    # separator or the bad-line policy.
    csv_kwargs = {
        "dtype": str,
        "keep_default_na": False,
        "sep": "\t" if suffix == ".tsv" else ",",
        "on_bad_lines": "warn",
    }
    try:
        # "utf-8-sig" reads plain UTF-8 as well as BOM-prefixed UTF-8, so a
        # separate plain "utf-8" attempt is unnecessary.
        return pd.read_csv(buffer, encoding="utf-8-sig", **csv_kwargs)
    except UnicodeDecodeError:
        buffer.seek(0)
        return pd.read_csv(buffer, encoding="latin-1", **csv_kwargs)
# Placeholder controls: every widget below is created with disabled=True and
# its return value is discarded, so none of them influence the cleaning run.
# NOTE(review): these overlap with the working option widgets rendered further
# down the page — confirm whether they should still be shown.
st.checkbox("Trim whitespace", value=True, disabled=True)
st.checkbox("Collapse multiple spaces", value=True, disabled=True)
st.checkbox("Unicode normalization (NFC)", value=False, disabled=True)
st.checkbox("Strip non-printable characters", value=False, disabled=True)
st.checkbox("Remove BOM", value=False, disabled=True)
st.checkbox("Normalize line endings", value=False, disabled=True)
st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True)
# Load the upload through the cached reader, passing the raw bytes so the
# cache can key on file name and content. Any failure is reported on the page
# and this script run is halted.
try:
df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
st.error(f"Failed to read file: {e}")
st.stop()
# Preview: file name, dimensions, and the first 10 rows.
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider()
# Disabled placeholder button; the active "Clean Text" button is created later.
st.button("Clean Text", type="primary", use_container_width=True, disabled=True)
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------
# The page footer is rendered once, at the very end of the script, so no
# footer caption belongs here; only a rule and the section heading are drawn.
st.divider()
st.subheader("Options")
# Preset picker. Each label begins with the preset key, which is extracted
# from the chosen label below.
preset_label = st.radio(
"Preset",
["excel-hygiene (recommended)", "minimal", "paranoid"],
index=0,
horizontal=True,
help=(
"excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
"invisible chars, normalize line endings, NFC. "
"minimal: only trim and collapse. "
"paranoid: everything including NFKC compat fold (lossy)."
),
)
# Keep only the text before the first space,
# e.g. "excel-hygiene (recommended)" -> "excel-hygiene".
preset_key = preset_label.split(" ", 1)[0]
# Presumably returns the option set for the named preset (verify against
# CleanOptions.from_preset); the widgets below then overwrite its fields.
options = CleanOptions.from_preset(preset_key)
# Each checkbox is seeded with the current option value and its result is
# written straight back onto the same attribute of `options`.
with st.expander("Advanced options"):
col_a, col_b = st.columns(2)
with col_a:
options.trim = st.checkbox("Trim leading/trailing whitespace", value=options.trim)
options.collapse_whitespace = st.checkbox(
"Collapse internal whitespace", value=options.collapse_whitespace,
)
options.normalize_line_endings = st.checkbox(
"Normalize line endings (\\r\\n → \\n)", value=options.normalize_line_endings,
)
options.strip_control = st.checkbox(
"Strip control characters", value=options.strip_control,
)
options.strip_bom = st.checkbox("Strip BOM", value=options.strip_bom)
with col_b:
options.fold_smart_chars = st.checkbox(
"Fold smart characters (curly quotes, em-dash, NBSP)",
value=options.fold_smart_chars,
)
options.strip_zero_width = st.checkbox(
"Strip zero-width / invisible characters", value=options.strip_zero_width,
)
options.nfc = st.checkbox("Unicode NFC normalization", value=options.nfc)
options.nfkc = st.checkbox(
"Unicode NFKC compat fold (lossy: ① → 1, fi → fi)",
value=options.nfkc,
)
st.markdown("**Scope**")
# Columns whose dtype pandas reports as object or string; these are the
# default selection in the picker below.
string_cols = [
c for c in df.columns
if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
]
selected_cols = st.multiselect(
"Columns to clean (default: all string columns)",
options=list(df.columns),
default=string_cols,
)
skip_cols = st.multiselect(
"Columns to skip even if they look like text",
options=list(df.columns),
default=[],
)
# An empty selection is stored as None rather than an empty list.
# NOTE(review): presumably None means "use the default column set" — confirm
# against clean_dataframe.
options.columns = selected_cols if selected_cols else None
options.skip_columns = list(skip_cols)
st.markdown("**Case conversion**")
case_global = st.selectbox(
"Apply case conversion to selected columns",
["None", "UPPER", "lower", "Title", "Sentence"],
index=0,
)
# Display label -> value stored on options.case.
case_map = {
"UPPER": "upper", "lower": "lower",
"Title": "title", "Sentence": "sentence",
}
# options.case is only assigned when a conversion is chosen; with "None" the
# value that came from the preset is left untouched.
if case_global != "None":
options.case = case_map[case_global] # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
st.divider()
run_clicked = st.button("Clean Text", type="primary", use_container_width=True)
if run_clicked:
    with st.spinner("Cleaning..."):
        try:
            fresh_result = clean_dataframe(df, options)
        except ValueError as exc:
            # Show the validation message and end this script run.
            st.error(str(exc))
            st.stop()
    # Persist the outcome so it survives the reruns Streamlit performs on
    # later widget interactions (e.g. clicking a download button).
    st.session_state["textclean_result"] = fresh_result
    st.session_state["textclean_input_name"] = uploaded.name

# Everything below renders from the stored result, whether it was produced in
# this run or an earlier one; with nothing stored yet the page ends here.
# NOTE(review): the stored result is not cleared when a different file is
# uploaded, so results from a previous file may be shown — confirm intent.
result = st.session_state.get("textclean_result")
if result is None:
    st.stop()
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
st.subheader("Results")

# Share of scanned cells that changed; 0.0 when nothing was scanned, which
# also avoids dividing by zero.
if result.cells_total:
    changed_pct = result.cells_changed / result.cells_total * 100.0
else:
    changed_pct = 0.0

# Four headline metrics, one per column, left to right.
scanned_box, changed_box, pct_box, cols_box = st.columns(4)
scanned_box.metric("Cells scanned", result.cells_total)
changed_box.metric("Cells changed", result.cells_changed)
pct_box.metric("% changed", f"{changed_pct:.1f}%")
cols_box.metric("Columns processed", len(result.columns_processed))

# The change breakdown and examples are shown only when something changed.
if result.cells_changed:
    per_column = result.changes["column"].value_counts()
    st.markdown("**Changes by column**")
    st.dataframe(
        per_column.rename("cells_changed").to_frame(),
        use_container_width=True,
    )

    st.markdown("**Examples (first 25 changes)**")
    # Work on a new frame so the stored result is not modified; the "row"
    # values are shifted up by one for display.
    sample = result.changes.head(25).assign(row=lambda frame: frame["row"] + 1)
    st.dataframe(sample, use_container_width=True, hide_index=True)

st.markdown("**Cleaned preview (first 10 rows)**")
st.dataframe(result.cleaned_df.head(10), use_container_width=True)
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
st.divider()

# Download names are built from the stem of the file name stored with the
# result, falling back to "input" when none was stored.
source_name = st.session_state.get("textclean_input_name", "input")
stem = Path(source_name).stem

cleaned_slot, audit_slot, config_slot = st.columns(3)

with cleaned_slot:
    # Encoded as utf-8-sig, i.e. UTF-8 with a leading BOM.
    st.download_button(
        "Download cleaned CSV",
        data=result.cleaned_df.to_csv(index=False).encode("utf-8-sig"),
        file_name=f"{stem}_cleaned.csv",
        mime="text/csv",
    )

with audit_slot:
    # The audit download is offered only when at least one change was recorded.
    if not result.changes.empty:
        st.download_button(
            "Download changes audit",
            data=result.changes.to_csv(index=False).encode("utf-8-sig"),
            file_name=f"{stem}_changes.csv",
            mime="text/csv",
        )

with config_slot:
    # Serializes the options as currently set by the widgets on this run.
    config_text = json.dumps(options.to_dict(), indent=2)
    st.download_button(
        "Download config JSON",
        data=config_text.encode("utf-8"),
        file_name="text_clean_config.json",
        mime="application/json",
    )

st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")