feat: implement text cleaner (script 02) with CLI, GUI, and tests
Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small. - src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection - src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load) - src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons - tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec - samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows - test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working. 200/200 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
373
src/cli_text_clean.py
Normal file
373
src/cli_text_clean.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""CLI for the DataTools text cleaner (script 02).
|
||||
|
||||
Usage:
|
||||
python -m src.cli_text_clean input.csv # dry-run preview
|
||||
python -m src.cli_text_clean input.csv --apply # write cleaned file
|
||||
python -m src.cli_text_clean input.csv --preset minimal --apply
|
||||
python -m src.cli_text_clean input.csv --case upper:name --apply
|
||||
python -m src.cli_text_clean --help # full help
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
# Typer application object for the text cleaner CLI. ``no_args_is_help=True``
# prints the help text when the program is invoked without arguments, and
# ``add_completion=False`` leaves out the shell-completion install options.
app = typer.Typer(
    name="text-clean",
    help=(
        "Clean and normalize text content in CSV and Excel files.\n\n"
        "By default, runs in preview mode — shows what would change without "
        "modifying anything. Add --apply to write the output.\n\n"
        "Examples:\n\n"
        " # Preview what would change\n"
        " python -m src.cli_text_clean messy.csv\n\n"
        " # Apply the safe defaults (excel-hygiene preset)\n"
        " python -m src.cli_text_clean messy.csv --apply\n\n"
        " # Minimal: only trim and collapse whitespace\n"
        " python -m src.cli_text_clean messy.csv --preset minimal --apply\n\n"
        " # Title-case the 'name' column, leave others alone for case\n"
        " python -m src.cli_text_clean people.csv --case title:name --apply\n\n"
        " # Clean only specific columns\n"
        " python -m src.cli_text_clean orders.csv --columns vendor,product --apply\n\n"
        " # Skip a free-text column from cleaning\n"
        " python -m src.cli_text_clean tickets.csv --skip notes --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr and at a fresh timestamped file in *log_dir*.

    The directory is created when missing. Previously registered sinks are
    removed, then a WARNING-level stderr sink and a DEBUG-level file sink
    are added. Returns the path of the log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = log_dir / f"text_clean_{stamp}.log"

    # Drop whatever sinks were registered before so output is not duplicated.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")

    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
    logger.add(str(target), level="DEBUG", format=file_format)
    return target
|
||||
|
||||
|
||||
def _parse_case(raw: Optional[str]) -> tuple[Optional[str], dict[str, str]]:
|
||||
"""Parse --case argument.
|
||||
|
||||
Forms:
|
||||
--case upper -> ("upper", {}) (apply to all selected)
|
||||
--case title:name -> (None, {"name": "title"})
|
||||
--case upper:code,title:name -> (None, {...})
|
||||
"""
|
||||
if not raw:
|
||||
return None, {}
|
||||
if ":" not in raw:
|
||||
# Bare mode applies to all selected columns
|
||||
return raw.strip(), {}
|
||||
per_col: dict[str, str] = {}
|
||||
for piece in raw.split(","):
|
||||
piece = piece.strip()
|
||||
if not piece:
|
||||
continue
|
||||
if ":" not in piece:
|
||||
raise typer.BadParameter(
|
||||
f"Invalid --case piece: '{piece}'. "
|
||||
f"Expected 'mode' or 'mode:col[,mode:col...]' "
|
||||
f"(e.g., 'upper' or 'title:name,upper:code')."
|
||||
)
|
||||
mode, col = piece.split(":", 1)
|
||||
per_col[col.strip()] = mode.strip()
|
||||
return None, per_col
|
||||
|
||||
|
||||
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
|
||||
if raw is None:
|
||||
return None
|
||||
return [c.strip() for c in raw.split(",") if c.strip()]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main command
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.command()
def clean(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to clean.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_cleaned.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output files. Without this flag, only a preview is shown.",
    ),
    preset: str = typer.Option(
        "excel-hygiene", "--preset",
        help="Preset: minimal, excel-hygiene, or paranoid.",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to clean (default: all string columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip even if they look like text.",
    ),
    case: Optional[str] = typer.Option(
        None, "--case",
        help=(
            "Case conversion. Bare mode 'upper'|'lower'|'title'|'sentence' applies to "
            "all selected columns. Per-column form: 'mode:col[,mode:col]' "
            "(e.g., 'title:name,upper:code')."
        ),
    ),
    no_trim: bool = typer.Option(False, "--no-trim", help="Disable whitespace trim."),
    no_collapse: bool = typer.Option(
        False, "--no-collapse", help="Disable internal whitespace collapse.",
    ),
    no_nfc: bool = typer.Option(False, "--no-nfc", help="Disable Unicode NFC normalization."),
    nfkc: bool = typer.Option(
        False, "--nfkc",
        help="Enable NFKC compat fold (lossy: ① → 1, ﬁ → fi). Default off.",
    ),
    no_smart_chars: bool = typer.Option(
        False, "--no-smart-chars",
        help="Disable smart-character folding (curly quotes, em/en-dash, NBSP).",
    ),
    no_zero_width: bool = typer.Option(
        False, "--no-zero-width", help="Disable zero-width / invisible char strip.",
    ),
    no_bom: bool = typer.Option(False, "--no-bom", help="Disable BOM strip."),
    no_control: bool = typer.Option(
        False, "--no-control", help="Disable control-character strip.",
    ),
    no_line_endings: bool = typer.Option(
        False, "--no-line-endings", help="Disable line-ending normalization.",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every cell change to the audit CSV (default caps to first 1000).",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Clean and normalize text in a CSV or Excel file."""
    # NOTE: the docstring above is surfaced by Typer as command help, so it is
    # kept short; implementation notes live in comments.
    #
    # Flow: validate inputs -> build CleanOptions (config file or preset, then
    # CLI overrides) -> optionally save the merged options -> read the input
    # -> run the pipeline and print a summary -> with --apply, write the
    # cleaned file plus a changes audit next to the input.
    # Every user-facing failure prints "Error: ..." to stderr and exits 1.
    from src.core.io import read_file, write_file
    from src.core.text_clean import (
        CleanOptions,
        PRESETS,
        clean_dataframe,
    )
    import pandas as pd

    # ------------------------------------------------------------------
    # Validate inputs
    # ------------------------------------------------------------------
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)

    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # ------------------------------------------------------------------
    # Build CleanOptions
    # ------------------------------------------------------------------
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        try:
            options = CleanOptions.from_file(cfg_path)
        except Exception as e:
            # Unreadable file, invalid JSON, or values the dataclass rejects:
            # report it like every other input error instead of a traceback.
            typer.echo(f"Error: Could not load config {cfg_path}: {e}", err=True)
            raise typer.Exit(1)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = CleanOptions.from_preset(preset)

    # CLI overrides on top of preset/config. Flags only ever switch an
    # operation away from its default, so an absent flag leaves the
    # preset/config value untouched.
    if no_trim:
        options.trim = False
    if no_collapse:
        options.collapse_whitespace = False
    if no_nfc:
        options.nfc = False
    if nfkc:
        options.nfkc = True
    if no_smart_chars:
        options.fold_smart_chars = False
    if no_zero_width:
        options.strip_zero_width = False
    if no_bom:
        options.strip_bom = False
    if no_control:
        options.strip_control = False
    if no_line_endings:
        options.normalize_line_endings = False

    cols_list = _split_csv_arg(columns)
    if cols_list is not None:
        options.columns = cols_list
    skip_list = _split_csv_arg(skip)
    if skip_list:
        options.skip_columns = skip_list

    bare_case, per_col_case = _parse_case(case)
    if bare_case:
        options.case = bare_case  # type: ignore[assignment]
    if per_col_case:
        # Per-column modes from the CLI win over those loaded from a config.
        options.case_columns = {**options.case_columns, **per_col_case}  # type: ignore[dict-item]

    # ------------------------------------------------------------------
    # Save config if requested (after CLI merge so the file reflects intent)
    # ------------------------------------------------------------------
    if save_config:
        try:
            saved = options.to_file(save_config)
        except OSError as e:
            typer.echo(f"Error: Could not save config to {save_config}: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"Config saved to {saved}")

    # ------------------------------------------------------------------
    # Read input
    # ------------------------------------------------------------------
    typer.echo(f"Reading {input_path.name}...")
    try:
        # --sheet accepts either an index ("0") or a sheet name ("Data").
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet

        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
        )
        if not isinstance(df, pd.DataFrame):
            # Anything that is not a single DataFrame is treated as an
            # iterable of frames and stitched together.
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)

    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    # ------------------------------------------------------------------
    # Run pipeline
    # ------------------------------------------------------------------
    typer.echo("Cleaning text...")
    try:
        result = clean_dataframe(df, options)
    except ValueError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    _print_results(result, input_path, options)

    # ------------------------------------------------------------------
    # Write output
    # ------------------------------------------------------------------
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_cleaned.csv"
        write_file(result.cleaned_df, out_path)
        typer.echo(f"\nCleaned file: {out_path}")

        if not result.changes.empty:
            # The audit file always lands next to the input, even when
            # --output points elsewhere.
            changes_path = input_path.parent / f"{stem}_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            write_file(audit_df, changes_path)
            typer.echo(f"Changes audit: {changes_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output files.")

    typer.echo(f"Log: {log_path}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output formatting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _print_results(result, input_path: Path, options) -> None:
    """Echo a human-readable summary of a cleaning run.

    Prints the file name, the number of processed columns, scanned and
    changed cell counts with a percentage, then (when anything changed) the
    ten columns with the most changes and the first five individual changes.
    ``options`` is accepted for signature compatibility and is not read.
    """
    scanned = result.cells_total
    pct = (result.cells_changed / scanned * 100.0) if scanned else 0.0
    rule = "─" * 50

    for line in (
        f"\n{rule}",
        f" File: {input_path.name}",
        f" Columns processed: {len(result.columns_processed)}",
        f" Cells scanned: {result.cells_total}",
        f" Cells changed: {result.cells_changed} ({pct:.1f}%)",
        f"{rule}",
    ):
        typer.echo(line)

    if not result.cells_changed or result.changes.empty:
        return

    # Per-column change counts, most-changed first.
    per_column = result.changes["column"].value_counts()
    typer.echo("\nChanges by column:")
    for col_name, count in per_column.head(10).items():
        typer.echo(f" {col_name}: {count} cell(s)")
    hidden = len(per_column) - 10
    if hidden > 0:
        typer.echo(f" ... and {hidden} more columns")

    # A handful of concrete before/after examples; reprs are truncated to
    # 40 characters and row numbers are shown 1-based.
    typer.echo("\nFirst examples:")
    for rec in result.changes.head(5).itertuples(index=False):
        before = repr(rec.old)[:40]
        after = repr(rec.new)[:40]
        typer.echo(
            f" Row {rec.row + 1}, {rec.column}: {before} → {after} "
            f"[{rec.ops_applied}]"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# __main__
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Entry point: invoke the Typer application object ``app``."""
    app()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -59,6 +59,25 @@ from .config import (
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
from .text_clean import (
|
||||
CleanOptions,
|
||||
CleanResult,
|
||||
PRESETS,
|
||||
apply_case,
|
||||
clean_dataframe,
|
||||
clean_value,
|
||||
collapse_whitespace,
|
||||
fold_smart_chars,
|
||||
normalize_line_endings,
|
||||
sentence_case,
|
||||
smart_title_case,
|
||||
strip_bom,
|
||||
strip_control,
|
||||
strip_zero_width,
|
||||
to_nfc,
|
||||
to_nfkc,
|
||||
trim,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Core
|
||||
@@ -90,4 +109,22 @@ __all__ = [
|
||||
"DeduplicationConfig",
|
||||
"StrategyConfig",
|
||||
"ColumnStrategyConfig",
|
||||
# Text cleaning
|
||||
"CleanOptions",
|
||||
"CleanResult",
|
||||
"PRESETS",
|
||||
"clean_dataframe",
|
||||
"clean_value",
|
||||
"trim",
|
||||
"collapse_whitespace",
|
||||
"to_nfc",
|
||||
"to_nfkc",
|
||||
"fold_smart_chars",
|
||||
"strip_zero_width",
|
||||
"strip_bom",
|
||||
"strip_control",
|
||||
"normalize_line_endings",
|
||||
"smart_title_case",
|
||||
"sentence_case",
|
||||
"apply_case",
|
||||
]
|
||||
|
||||
489
src/core/text_clean.py
Normal file
489
src/core/text_clean.py
Normal file
@@ -0,0 +1,489 @@
|
||||
"""Character-level text hygiene for DataFrames.
|
||||
|
||||
Operations are independently toggleable, idempotent, and safe to compose.
|
||||
Each per-string helper is ``str -> str``. Numeric, datetime, and boolean
|
||||
columns pass through ``clean_dataframe`` untouched; only string cells are
|
||||
modified.
|
||||
|
||||
See TECHNICAL.md Section 10.2 for the full functional spec.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, Literal, Optional
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api import types as pdtypes
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-string helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Smart-character map (curly quotes, dashes, ellipsis, NBSP, narrow NBSP).
|
||||
_SMART_CHARS: dict[str, str] = {
|
||||
"‘": "'", # LEFT SINGLE QUOTATION MARK
|
||||
"’": "'", # RIGHT SINGLE QUOTATION MARK
|
||||
"‚": "'", # SINGLE LOW-9 QUOTATION MARK
|
||||
"‛": "'", # SINGLE HIGH-REVERSED-9 QUOTATION MARK
|
||||
"“": '"', # LEFT DOUBLE QUOTATION MARK
|
||||
"”": '"', # RIGHT DOUBLE QUOTATION MARK
|
||||
"„": '"', # DOUBLE LOW-9 QUOTATION MARK
|
||||
"‟": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
|
||||
"–": "-", # EN DASH
|
||||
"—": "-", # EM DASH
|
||||
"―": "-", # HORIZONTAL BAR
|
||||
"−": "-", # MINUS SIGN
|
||||
"…": "...", # HORIZONTAL ELLIPSIS
|
||||
" ": " ", # NO-BREAK SPACE
|
||||
" ": " ", # NARROW NO-BREAK SPACE
|
||||
" ": " ", # THIN SPACE
|
||||
" ": " ", # HAIR SPACE
|
||||
" ": " ", # EN SPACE
|
||||
" ": " ", # EM SPACE
|
||||
" ": " ", # IDEOGRAPHIC SPACE
|
||||
}
|
||||
|
||||
_SMART_TRANS = str.maketrans(_SMART_CHARS)
|
||||
|
||||
# Zero-width / invisible characters. ``U+FEFF`` (BOM/ZWNBSP) is included; if
|
||||
# it appears at the *very start* of the first cell of the first column, the
|
||||
# BOM-strip op handles it; elsewhere it is treated as a zero-width char.
|
||||
_ZERO_WIDTH = (
|
||||
"" # ZERO WIDTH SPACE
|
||||
"" # ZERO WIDTH NON-JOINER
|
||||
"" # ZERO WIDTH JOINER
|
||||
"" # WORD JOINER
|
||||
"" # LEFT-TO-RIGHT MARK
|
||||
"" # RIGHT-TO-LEFT MARK
|
||||
"" # ZERO WIDTH NO-BREAK SPACE / BOM
|
||||
)
|
||||
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
|
||||
|
||||
# Control characters: U+0000-U+001F and U+007F, but preserve \t \n \r
# (0x09, 0x0A and 0x0D are the gaps in the ranges below).
_CONTROL_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")

# Any run of whitespace other than "\n" and "\r" (the class is "not
# non-whitespace, not newline, not carriage return"). Newlines and carriage
# returns are excluded so multi-line cells keep their line structure; the
# line-ending op normalizes the actual line terminators.
_WHITESPACE_RUN_RE = re.compile(r"[^\S\n\r]+")
|
||||
|
||||
|
||||
def trim(s: str) -> str:
    """Return *s* without leading/trailing whitespace; non-strings pass through."""
    return s.strip() if isinstance(s, str) else s
|
||||
|
||||
|
||||
def collapse_whitespace(s: str) -> str:
    """Replace every run matched by ``_WHITESPACE_RUN_RE`` with one space.

    Newlines and carriage returns are not part of that pattern, so line
    structure is kept. Leading and trailing runs are reduced to a single
    space rather than removed (use ``trim`` for that). Non-strings are
    returned unchanged.
    """
    if isinstance(s, str):
        return _WHITESPACE_RUN_RE.sub(" ", s)
    return s
|
||||
|
||||
|
||||
def to_nfc(s: str) -> str:
    """Return the NFC (canonical composition) form of *s*; non-strings pass through."""
    return unicodedata.normalize("NFC", s) if isinstance(s, str) else s
|
||||
|
||||
|
||||
def to_nfkc(s: str) -> str:
    """Return the NFKC (compatibility composition) form of *s*.

    This is lossy: compatibility characters are replaced by their plain
    equivalents. Non-strings pass through unchanged.
    """
    return unicodedata.normalize("NFKC", s) if isinstance(s, str) else s
|
||||
|
||||
|
||||
def fold_smart_chars(s: str) -> str:
    """Translate *s* through ``_SMART_TRANS`` (smart characters -> ASCII).

    Non-strings are returned unchanged.
    """
    if isinstance(s, str):
        return s.translate(_SMART_TRANS)
    return s
|
||||
|
||||
|
||||
def strip_zero_width(s: str) -> str:
    """Delete every character matched by ``_ZERO_WIDTH_RE`` from *s*.

    Non-strings are returned unchanged.
    """
    if isinstance(s, str):
        return _ZERO_WIDTH_RE.sub("", s)
    return s
|
||||
|
||||
|
||||
def strip_bom(s: str) -> str:
    """Remove leading ``U+FEFF`` (BOM) characters from the start of the string.

    Only the start is touched; a ``U+FEFF`` elsewhere is left in place.
    Non-strings are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Explicit escape: a raw BOM literal is invisible in editors and diffs,
    # and an empty argument would turn ``lstrip`` into a no-op.
    return s.lstrip("\ufeff")
|
||||
|
||||
|
||||
def strip_control(s: str) -> str:
    """Delete every character matched by ``_CONTROL_RE`` from *s*.

    Non-strings are returned unchanged.
    """
    if isinstance(s, str):
        return _CONTROL_RE.sub("", s)
    return s
|
||||
|
||||
|
||||
def normalize_line_endings(s: str) -> str:
    """Turn ``\\r\\n`` pairs and lone ``\\r`` characters into ``\\n``.

    Non-strings are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # CRLF first, so a pair becomes one newline rather than two.
    without_crlf = s.replace("\r\n", "\n")
    return "\n".join(without_crlf.split("\r"))
|
||||
|
||||
|
||||
# Smart title-case helpers

# Words that ``smart_title_case`` keeps lower-case when they appear between
# the first and last token. Entries are compared against the lower-cased
# token, so they must all be lower-case here.
_TITLE_LOWERCASE_PARTICLES = {
    "a", "an", "and", "as", "at", "but", "by", "en", "for", "if", "in", "nor",
    "of", "on", "or", "per", "the", "to", "v", "v.", "vs", "vs.", "via",
}
|
||||
|
||||
|
||||
def _is_all_caps_token(token: str) -> bool:
|
||||
"""A token is all-caps when it has at least one cased char and no lowercase."""
|
||||
has_letter = any(c.isalpha() for c in token)
|
||||
has_lower = any(c.islower() for c in token)
|
||||
return has_letter and not has_lower and len(token) >= 2
|
||||
|
||||
|
||||
def smart_title_case(s: str) -> str:
    """Title-case *s* token by token, splitting on single spaces only.

    Per token:
    - empty tokens (from consecutive spaces) are kept, so spacing survives;
    - tokens accepted by ``_is_all_caps_token`` are kept as written
      (``USA`` stays ``USA``);
    - tokens in ``_TITLE_LOWERCASE_PARTICLES`` are lower-cased unless they
      are the first or last token;
    - otherwise the first alphabetic character is upper-cased and every
      later alphabetic character is lower-cased. Apostrophes and hyphens
      do not restart capitalization (``o'neil`` -> ``O'neil``).

    Non-strings and the empty string are returned unchanged.
    """
    if not isinstance(s, str) or not s:
        return s

    def _capitalize(word: str) -> str:
        # Upper-case the first letter found, lower-case the remaining letters,
        # and copy every non-letter through untouched.
        seen_letter = False
        pieces: list[str] = []
        for ch in word:
            if ch.isalpha():
                pieces.append(ch.lower() if seen_letter else ch.upper())
                seen_letter = True
            else:
                pieces.append(ch)
        return "".join(pieces)

    words = s.split(" ")
    final_pos = len(words) - 1
    result: list[str] = []
    for pos, word in enumerate(words):
        if not word or _is_all_caps_token(word):
            result.append(word)
        elif 0 < pos < final_pos and word.lower() in _TITLE_LOWERCASE_PARTICLES:
            result.append(word.lower())
        else:
            result.append(_capitalize(word))
    return " ".join(result)
|
||||
|
||||
|
||||
def sentence_case(s: str) -> str:
    """Lower-case *s*, then upper-case the first letter of each sentence.

    A sentence starts at the beginning of the string and after every
    ``.``, ``!`` or ``?``. Characters that are not alphabetic (spaces,
    digits, quotes, brackets) between the terminator and the next letter do
    not use up the pending capitalization. Non-strings and the empty string
    are returned unchanged.
    """
    if not isinstance(s, str) or not s:
        return s

    pending = True  # the next alphabetic character must be upper-cased
    pieces: list[str] = []
    for ch in s.lower():
        if ch in ".!?":
            pending = True
        elif pending and ch.isalpha():
            ch = ch.upper()
            pending = False
        pieces.append(ch)
    return "".join(pieces)
|
||||
|
||||
|
||||
# The case-conversion modes understood by ``apply_case``.
CaseMode = Literal["upper", "lower", "title", "sentence"]


def apply_case(s: str, mode: CaseMode) -> str:
    """Convert *s* according to *mode*.

    ``"upper"`` and ``"lower"`` use the built-in string methods, ``"title"``
    delegates to ``smart_title_case`` and ``"sentence"`` to
    ``sentence_case``. Non-strings are returned unchanged before the mode
    is looked at.

    Raises:
        ValueError: if *s* is a string and *mode* is not one of the four modes.
    """
    if not isinstance(s, str):
        return s

    if mode == "upper":
        converted = s.upper()
    elif mode == "lower":
        converted = s.lower()
    elif mode == "title":
        converted = smart_title_case(s)
    elif mode == "sentence":
        converted = sentence_case(s)
    else:
        raise ValueError(f"Unknown case mode: {mode}")
    return converted
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Options / result dataclasses
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Named option bundles. Each value is a keyword mapping that
# ``CleanOptions.from_preset`` passes straight to the ``CleanOptions``
# constructor, so every key must be a ``CleanOptions`` field name.
PRESETS: dict[str, dict[str, Any]] = {
    # Whitespace trim + collapse only.
    "minimal": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": False,
        "nfkc": False,
        "fold_smart_chars": False,
        "strip_zero_width": False,
        "strip_bom": False,
        "strip_control": False,
        "normalize_line_endings": False,
    },
    # Everything except the lossy NFKC fold.
    "excel-hygiene": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": True,
        "nfkc": False,
        "fold_smart_chars": True,
        "strip_zero_width": True,
        "strip_bom": True,
        "strip_control": True,
        "normalize_line_endings": True,
    },
    # Every operation enabled, including NFKC.
    "paranoid": {
        "trim": True,
        "collapse_whitespace": True,
        "nfc": True,
        "nfkc": True,
        "fold_smart_chars": True,
        "strip_zero_width": True,
        "strip_bom": True,
        "strip_control": True,
        "normalize_line_endings": True,
    },
}
|
||||
|
||||
|
||||
@dataclass
class CleanOptions:
    """Toggles for character-level cleaning operations.

    Defaults match the ``excel-hygiene`` preset. Instances can be built
    from a named preset, from a plain dict, or from a JSON file, and can be
    written back to a JSON file.
    """

    # Operations
    trim: bool = True
    collapse_whitespace: bool = True
    nfc: bool = True
    nfkc: bool = False
    fold_smart_chars: bool = True
    strip_zero_width: bool = True
    strip_bom: bool = True
    strip_control: bool = True
    normalize_line_endings: bool = True

    # Case conversion: either a single mode applied to all selected columns,
    # or a dict mapping column name -> mode for per-column control.
    case: Optional[CaseMode] = None
    case_columns: dict[str, CaseMode] = field(default_factory=dict)

    # Scope control
    columns: Optional[list[str]] = None  # None = all string-typed columns
    skip_columns: list[str] = field(default_factory=list)

    @classmethod
    def from_preset(cls, name: str) -> CleanOptions:
        """Build options from a named entry in ``PRESETS``.

        Raises:
            ValueError: if *name* is not a known preset.
        """
        if name not in PRESETS:
            raise ValueError(
                f"Unknown preset '{name}'. "
                f"Available: {', '.join(sorted(PRESETS))}."
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> CleanOptions:
        """Build options from a mapping, ignoring keys that are not fields.

        Raises:
            ValueError: if *data* is not a dict (e.g. a JSON list or string).
        """
        if not isinstance(data, dict):
            raise ValueError(
                f"Config must be a JSON object, got {type(data).__name__}."
            )
        known = set(cls.__dataclass_fields__)
        return cls(**{key: value for key, value in data.items() if key in known})

    def to_dict(self) -> dict:
        """Return the options as a plain (JSON-serializable) dict."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Write the options as indented JSON to *path* and return the path."""
        out = Path(path)
        # Explicit UTF-8 so the file does not depend on the platform locale.
        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> CleanOptions:
        """Load options from a JSON file previously written by ``to_file``.

        The file is decoded as UTF-8; a leading BOM (as added by some
        editors) is tolerated so hand-edited configs still load.
        """
        text = Path(path).read_text(encoding="utf-8-sig")
        return cls.from_dict(json.loads(text))
|
||||
|
||||
|
||||
@dataclass
class CleanResult:
    """Output of ``clean_dataframe``."""

    # Copy of the input frame with the selected columns cleaned.
    cleaned_df: pd.DataFrame
    # One row per changed cell.
    changes: pd.DataFrame  # cols: row, column, old, new, ops_applied
    # Number of cells whose value changed.
    cells_changed: int
    # Number of cells visited in the processed columns.
    cells_total: int
    # Names of the columns the pipeline ran on.
    columns_processed: list[str]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cell-level pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], str]]]:
    """Return the enabled cleaning steps as ordered ``(op_name, fn)`` pairs.

    Steps always appear in this fixed order, each included only when its
    option is truthy:

    1. ``strip_bom`` — so a leading U+FEFF is gone before anything else runs.
    2. ``normalize_line_endings`` — before the whitespace steps.
    3. ``strip_control`` — before the whitespace steps.
    4. ``fold_smart_chars`` — before Unicode normalization.
    5. ``nfc`` then ``nfkc`` — both may run; they are kept as separate
       entries so each can be reported by name.
    6. ``strip_zero_width`` — after Unicode normalization.
    7. ``collapse_whitespace`` then ``trim`` — last.
    """
    stages = (
        ("strip_bom", options.strip_bom, strip_bom),
        ("normalize_line_endings", options.normalize_line_endings, normalize_line_endings),
        ("strip_control", options.strip_control, strip_control),
        ("fold_smart_chars", options.fold_smart_chars, fold_smart_chars),
        ("nfc", options.nfc, to_nfc),
        ("nfkc", options.nfkc, to_nfkc),
        ("strip_zero_width", options.strip_zero_width, strip_zero_width),
        ("collapse_whitespace", options.collapse_whitespace, collapse_whitespace),
        ("trim", options.trim, trim),
    )
    return [(op_name, fn) for op_name, enabled, fn in stages if enabled]
|
||||
|
||||
|
||||
def clean_value(value: Any, options: CleanOptions) -> tuple[Any, list[str]]:
    """Run the configured pipeline over one cell.

    Returns ``(cleaned_value, ops_applied)`` where ``ops_applied`` names, in
    order, the steps that actually altered the value. Anything that is not
    a ``str`` (including ``None`` and NaN) is returned untouched with an
    empty list.
    """
    if not isinstance(value, str):
        return value, []

    current = value
    fired: list[str] = []
    for op_name, op in _build_pipeline(options):
        candidate = op(current)
        if candidate != current:
            fired.append(op_name)
            current = candidate
    return current, fired
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DataFrame-level entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _select_columns(df: pd.DataFrame, options: CleanOptions) -> list[str]:
|
||||
"""Pick the columns the pipeline should operate on.
|
||||
|
||||
- If ``options.columns`` is explicit, use it (after validating).
|
||||
- Otherwise default to columns whose pandas dtype is object/string.
|
||||
- Always exclude ``options.skip_columns``.
|
||||
"""
|
||||
if options.columns is not None:
|
||||
missing = [c for c in options.columns if c not in df.columns]
|
||||
if missing:
|
||||
raise ValueError(
|
||||
f"Columns not found in input: {missing}. "
|
||||
f"Available: {list(df.columns)}"
|
||||
)
|
||||
chosen: Iterable[str] = options.columns
|
||||
else:
|
||||
chosen = [
|
||||
c for c in df.columns
|
||||
if pdtypes.is_object_dtype(df[c]) or pdtypes.is_string_dtype(df[c])
|
||||
]
|
||||
|
||||
skip = set(options.skip_columns)
|
||||
return [c for c in chosen if c not in skip]
|
||||
|
||||
|
||||
def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> CleanResult:
    """Clean the text cells of *df* and report what changed.

    The columns to process come from ``_select_columns``; when none are named
    explicitly that means object/string-dtype columns only. Each cell goes
    through ``clean_value`` and then through any case conversion configured
    for its column. *df* itself is left alone: the work happens on a copy,
    exposed as ``CleanResult.cleaned_df``. ``CleanResult.changes`` holds one
    row per altered cell, keyed by 0-based positional row and column name.
    """
    options = options or CleanOptions()
    result_df = df.copy()
    targets = _select_columns(result_df, options)

    # Per-column case settings win; the global case only fills in the
    # selected columns that have no explicit entry.
    case_lookup: dict[str, CaseMode] = dict(options.case_columns)
    fallback_case = options.case
    if fallback_case is not None:
        for name in targets:
            if name not in case_lookup:
                case_lookup[name] = fallback_case

    audit: list[dict[str, Any]] = []
    scanned = 0

    for name in targets:
        mode = case_lookup.get(name)
        originals = result_df[name].tolist()
        scanned += len(originals)
        replaced: list[Any] = []
        for position, before in enumerate(originals):
            after, ops = clean_value(before, options)

            if mode is not None and isinstance(after, str):
                recased = apply_case(after, mode)
                if recased != after:
                    ops.append(f"case:{mode}")
                    after = recased

            # Record the cell only when a step fired and the value differs.
            if ops and after != before:
                audit.append({
                    "row": position,
                    "column": name,
                    "old": before,
                    "new": after,
                    "ops_applied": ",".join(ops),
                })
            replaced.append(after)
        result_df[name] = replaced

    changes_df = pd.DataFrame(
        audit,
        columns=["row", "column", "old", "new", "ops_applied"],
    )

    # Every audit row corresponds to exactly one changed cell.
    return CleanResult(
        cleaned_df=result_df,
        changes=changes_df,
        cells_changed=len(audit),
        cells_total=scanned,
        columns_processed=targets,
    )
|
||||
@@ -1,10 +1,13 @@
|
||||
"""DataTools Text Cleaner — stub page."""
|
||||
"""DataTools Text Cleaner — Streamlit page."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
@@ -12,82 +15,236 @@ if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.core.text_clean import (
|
||||
PRESETS,
|
||||
CleanOptions,
|
||||
clean_dataframe,
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("✂️ Text Cleaner")
|
||||
st.caption("Clean and normalize text content across your data.")
|
||||
|
||||
st.info("This tool is under development.")
|
||||
st.caption(
|
||||
"Trim whitespace, fold smart quotes, strip invisible characters, and "
|
||||
"normalize line endings. Runs locally — your data never leaves this computer."
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# What this tool will do
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.markdown("""
|
||||
**Features:**
|
||||
- Trim leading/trailing whitespace
|
||||
- Collapse multiple spaces into one
|
||||
- Unicode normalization (NFC/NFKC)
|
||||
- Strip non-printable / control characters
|
||||
- Remove BOM (byte order mark)
|
||||
- Normalize line endings (CRLF → LF)
|
||||
- Case conversion (upper, lower, title, sentence)
|
||||
""")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload (functional)
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Upload a file to preview. Processing is not yet available.",
|
||||
key="textclean_file_upload",
|
||||
)
|
||||
|
||||
if uploaded is not None:
|
||||
import pandas as pd
|
||||
try:
|
||||
if uploaded.name.endswith((".xlsx", ".xls")):
|
||||
df = pd.read_excel(uploaded)
|
||||
else:
|
||||
df = pd.read_csv(uploaded)
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
except Exception as e:
|
||||
st.error(f"Failed to read file: {e}")
|
||||
if uploaded is None:
|
||||
st.info("Upload a CSV, TSV, or Excel file to begin.")
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Placeholder options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Operations")
|
||||
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame, treating all cells as strings.

    Args:
        name: Original file name; only its suffix is used, to pick the parser
            (``.xlsx``/``.xls`` -> Excel, ``.tsv`` -> tab-separated, anything
            else -> comma-separated).
        data: Raw file contents.

    Returns:
        A DataFrame read with ``dtype=str`` and ``keep_default_na=False``.

    Delimited text is decoded as ``utf-8-sig`` first, which accepts UTF-8 with
    or without a leading BOM. If that raises ``UnicodeDecodeError`` the file
    is re-read as ``latin-1``, using the same separator and options.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)

    # The separator depends only on the suffix, so compute it once and share
    # it between both decoding attempts.
    sep = "\t" if suffix == ".tsv" else ","
    csv_kwargs = {
        "dtype": str,
        "keep_default_na": False,
        "sep": sep,
        "on_bad_lines": "warn",
    }
    try:
        # A fresh buffer per attempt avoids relying on the read position left
        # behind by a failed parse.
        return pd.read_csv(io.BytesIO(data), encoding="utf-8-sig", **csv_kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(io.BytesIO(data), encoding="latin-1", **csv_kwargs)
|
||||
|
||||
st.checkbox("Trim whitespace", value=True, disabled=True)
|
||||
st.checkbox("Collapse multiple spaces", value=True, disabled=True)
|
||||
st.checkbox("Unicode normalization (NFC)", value=False, disabled=True)
|
||||
st.checkbox("Strip non-printable characters", value=False, disabled=True)
|
||||
st.checkbox("Remove BOM", value=False, disabled=True)
|
||||
st.checkbox("Normalize line endings", value=False, disabled=True)
|
||||
st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True)
|
||||
|
||||
try:
|
||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||
except Exception as e:
|
||||
st.error(f"Failed to read file: {e}")
|
||||
st.stop()
|
||||
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
|
||||
st.divider()
|
||||
st.button("Clean Text", type="primary", use_container_width=True, disabled=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Footer
|
||||
# Options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
st.caption(
|
||||
"Runs locally. Your data never leaves this computer. "
|
||||
"| DataTools v3.0"
|
||||
st.subheader("Options")
|
||||
|
||||
preset_label = st.radio(
|
||||
"Preset",
|
||||
["excel-hygiene (recommended)", "minimal", "paranoid"],
|
||||
index=0,
|
||||
horizontal=True,
|
||||
help=(
|
||||
"excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
|
||||
"invisible chars, normalize line endings, NFC. "
|
||||
"minimal: only trim and collapse. "
|
||||
"paranoid: everything including NFKC compat fold (lossy)."
|
||||
),
|
||||
)
|
||||
preset_key = preset_label.split(" ", 1)[0]
|
||||
options = CleanOptions.from_preset(preset_key)
|
||||
|
||||
with st.expander("Advanced options"):
|
||||
col_a, col_b = st.columns(2)
|
||||
with col_a:
|
||||
options.trim = st.checkbox("Trim leading/trailing whitespace", value=options.trim)
|
||||
options.collapse_whitespace = st.checkbox(
|
||||
"Collapse internal whitespace", value=options.collapse_whitespace,
|
||||
)
|
||||
options.normalize_line_endings = st.checkbox(
|
||||
"Normalize line endings (\\r\\n → \\n)", value=options.normalize_line_endings,
|
||||
)
|
||||
options.strip_control = st.checkbox(
|
||||
"Strip control characters", value=options.strip_control,
|
||||
)
|
||||
options.strip_bom = st.checkbox("Strip BOM", value=options.strip_bom)
|
||||
with col_b:
|
||||
options.fold_smart_chars = st.checkbox(
|
||||
"Fold smart characters (curly quotes, em-dash, NBSP)",
|
||||
value=options.fold_smart_chars,
|
||||
)
|
||||
options.strip_zero_width = st.checkbox(
|
||||
"Strip zero-width / invisible characters", value=options.strip_zero_width,
|
||||
)
|
||||
options.nfc = st.checkbox("Unicode NFC normalization", value=options.nfc)
|
||||
options.nfkc = st.checkbox(
|
||||
"Unicode NFKC compat fold (lossy: ① → 1, fi → fi)",
|
||||
value=options.nfkc,
|
||||
)
|
||||
|
||||
st.markdown("**Scope**")
|
||||
string_cols = [
|
||||
c for c in df.columns
|
||||
if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
|
||||
]
|
||||
selected_cols = st.multiselect(
|
||||
"Columns to clean (default: all string columns)",
|
||||
options=list(df.columns),
|
||||
default=string_cols,
|
||||
)
|
||||
skip_cols = st.multiselect(
|
||||
"Columns to skip even if they look like text",
|
||||
options=list(df.columns),
|
||||
default=[],
|
||||
)
|
||||
options.columns = selected_cols if selected_cols else None
|
||||
options.skip_columns = list(skip_cols)
|
||||
|
||||
st.markdown("**Case conversion**")
|
||||
case_global = st.selectbox(
|
||||
"Apply case conversion to selected columns",
|
||||
["None", "UPPER", "lower", "Title", "Sentence"],
|
||||
index=0,
|
||||
)
|
||||
case_map = {
|
||||
"UPPER": "upper", "lower": "lower",
|
||||
"Title": "title", "Sentence": "sentence",
|
||||
}
|
||||
if case_global != "None":
|
||||
options.case = case_map[case_global] # type: ignore[assignment]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Run
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
|
||||
if st.button("Clean Text", type="primary", use_container_width=True):
|
||||
with st.spinner("Cleaning..."):
|
||||
try:
|
||||
result = clean_dataframe(df, options)
|
||||
except ValueError as e:
|
||||
st.error(str(e))
|
||||
st.stop()
|
||||
st.session_state["textclean_result"] = result
|
||||
st.session_state["textclean_input_name"] = uploaded.name
|
||||
|
||||
result = st.session_state.get("textclean_result")
|
||||
if result is None:
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Results")
|
||||
|
||||
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
|
||||
m1, m2, m3, m4 = st.columns(4)
|
||||
m1.metric("Cells scanned", result.cells_total)
|
||||
m2.metric("Cells changed", result.cells_changed)
|
||||
m3.metric("% changed", f"{pct:.1f}%")
|
||||
m4.metric("Columns processed", len(result.columns_processed))
|
||||
|
||||
if result.cells_changed:
|
||||
counts = result.changes["column"].value_counts()
|
||||
st.markdown("**Changes by column**")
|
||||
st.dataframe(
|
||||
counts.rename("cells_changed").to_frame(),
|
||||
use_container_width=True,
|
||||
)
|
||||
|
||||
st.markdown("**Examples (first 25 changes)**")
|
||||
examples = result.changes.head(25).copy()
|
||||
examples["row"] = examples["row"] + 1
|
||||
st.dataframe(examples, use_container_width=True, hide_index=True)
|
||||
|
||||
st.markdown("**Cleaned preview (first 10 rows)**")
|
||||
st.dataframe(result.cleaned_df.head(10), use_container_width=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
stem = Path(st.session_state.get("textclean_input_name", "input")).stem
|
||||
|
||||
dl_a, dl_b, dl_c = st.columns(3)
|
||||
with dl_a:
|
||||
cleaned_bytes = result.cleaned_df.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download cleaned CSV",
|
||||
data=cleaned_bytes,
|
||||
file_name=f"{stem}_cleaned.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_b:
|
||||
if not result.changes.empty:
|
||||
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download changes audit",
|
||||
data=changes_bytes,
|
||||
file_name=f"{stem}_changes.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_c:
|
||||
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download config JSON",
|
||||
data=config_bytes,
|
||||
file_name="text_clean_config.json",
|
||||
mime="application/json",
|
||||
)
|
||||
|
||||
st.divider()
|
||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||
|
||||
Reference in New Issue
Block a user