feat: implement text cleaner (script 02) with CLI, GUI, and tests
Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,13 @@
|
||||
"""DataTools Text Cleaner — stub page."""
|
||||
"""DataTools Text Cleaner — Streamlit page."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
@@ -12,82 +15,236 @@ if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.core.text_clean import (
|
||||
PRESETS,
|
||||
CleanOptions,
|
||||
clean_dataframe,
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("✂️ Text Cleaner")
|
||||
st.caption("Clean and normalize text content across your data.")
|
||||
|
||||
st.info("This tool is under development.")
|
||||
st.caption(
|
||||
"Trim whitespace, fold smart quotes, strip invisible characters, and "
|
||||
"normalize line endings. Runs locally — your data never leaves this computer."
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# What this tool will do
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.markdown("""
|
||||
**Features:**
|
||||
- Trim leading/trailing whitespace
|
||||
- Collapse multiple spaces into one
|
||||
- Unicode normalization (NFC/NFKC)
|
||||
- Strip non-printable / control characters
|
||||
- Remove BOM (byte order mark)
|
||||
- Normalize line endings (CRLF → LF)
|
||||
- Case conversion (upper, lower, title, sentence)
|
||||
""")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload (functional)
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# File picker for the input table. The help text describes the cleaning
# workflow now that this page processes the file instead of only previewing it.
uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
    help="Upload a CSV, TSV, or Excel file to clean.",
    key="textclean_file_upload",
)
|
||||
|
||||
if uploaded is not None:
|
||||
import pandas as pd
|
||||
try:
|
||||
if uploaded.name.endswith((".xlsx", ".xls")):
|
||||
df = pd.read_excel(uploaded)
|
||||
else:
|
||||
df = pd.read_csv(uploaded)
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
except Exception as e:
|
||||
st.error(f"Failed to read file: {e}")
|
||||
if uploaded is None:
|
||||
st.info("Upload a CSV, TSV, or Excel file to begin.")
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Placeholder options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Operations")
|
||||
@st.cache_data(show_spinner=False)
|
||||
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
|
||||
"""Read the uploaded bytes into a DataFrame, treating all cells as strings."""
|
||||
suffix = Path(name).suffix.lower()
|
||||
bio = io.BytesIO(data)
|
||||
if suffix in (".xlsx", ".xls"):
|
||||
return pd.read_excel(bio, dtype=str, keep_default_na=False)
|
||||
# CSV / TSV — try utf-8 then utf-8-sig then latin-1 as a fallback
|
||||
for enc in ("utf-8", "utf-8-sig", "latin-1"):
|
||||
try:
|
||||
bio.seek(0)
|
||||
sep = "\t" if suffix == ".tsv" else ","
|
||||
return pd.read_csv(
|
||||
bio, dtype=str, keep_default_na=False,
|
||||
encoding=enc, sep=sep, on_bad_lines="warn",
|
||||
)
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
bio.seek(0)
|
||||
return pd.read_csv(bio, dtype=str, keep_default_na=False, encoding="latin-1")
|
||||
|
||||
st.checkbox("Trim whitespace", value=True, disabled=True)
|
||||
st.checkbox("Collapse multiple spaces", value=True, disabled=True)
|
||||
st.checkbox("Unicode normalization (NFC)", value=False, disabled=True)
|
||||
st.checkbox("Strip non-printable characters", value=False, disabled=True)
|
||||
st.checkbox("Remove BOM", value=False, disabled=True)
|
||||
st.checkbox("Normalize line endings", value=False, disabled=True)
|
||||
st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True)
|
||||
|
||||
try:
|
||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||
except Exception as e:
|
||||
st.error(f"Failed to read file: {e}")
|
||||
st.stop()
|
||||
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
|
||||
st.divider()
|
||||
st.button("Clean Text", type="primary", use_container_width=True, disabled=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Footer
|
||||
# Options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
st.caption(
|
||||
"Runs locally. Your data never leaves this computer. "
|
||||
"| DataTools v3.0"
|
||||
st.subheader("Options")
|
||||
|
||||
preset_label = st.radio(
|
||||
"Preset",
|
||||
["excel-hygiene (recommended)", "minimal", "paranoid"],
|
||||
index=0,
|
||||
horizontal=True,
|
||||
help=(
|
||||
"excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
|
||||
"invisible chars, normalize line endings, NFC. "
|
||||
"minimal: only trim and collapse. "
|
||||
"paranoid: everything including NFKC compat fold (lossy)."
|
||||
),
|
||||
)
|
||||
preset_key = preset_label.split(" ", 1)[0]
|
||||
options = CleanOptions.from_preset(preset_key)
|
||||
|
||||
with st.expander("Advanced options"):
|
||||
col_a, col_b = st.columns(2)
|
||||
with col_a:
|
||||
options.trim = st.checkbox("Trim leading/trailing whitespace", value=options.trim)
|
||||
options.collapse_whitespace = st.checkbox(
|
||||
"Collapse internal whitespace", value=options.collapse_whitespace,
|
||||
)
|
||||
options.normalize_line_endings = st.checkbox(
|
||||
"Normalize line endings (\\r\\n → \\n)", value=options.normalize_line_endings,
|
||||
)
|
||||
options.strip_control = st.checkbox(
|
||||
"Strip control characters", value=options.strip_control,
|
||||
)
|
||||
options.strip_bom = st.checkbox("Strip BOM", value=options.strip_bom)
|
||||
with col_b:
|
||||
options.fold_smart_chars = st.checkbox(
|
||||
"Fold smart characters (curly quotes, em-dash, NBSP)",
|
||||
value=options.fold_smart_chars,
|
||||
)
|
||||
options.strip_zero_width = st.checkbox(
|
||||
"Strip zero-width / invisible characters", value=options.strip_zero_width,
|
||||
)
|
||||
options.nfc = st.checkbox("Unicode NFC normalization", value=options.nfc)
|
||||
options.nfkc = st.checkbox(
|
||||
"Unicode NFKC compat fold (lossy: ① → 1, fi → fi)",
|
||||
value=options.nfkc,
|
||||
)
|
||||
|
||||
st.markdown("**Scope**")
|
||||
string_cols = [
|
||||
c for c in df.columns
|
||||
if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
|
||||
]
|
||||
selected_cols = st.multiselect(
|
||||
"Columns to clean (default: all string columns)",
|
||||
options=list(df.columns),
|
||||
default=string_cols,
|
||||
)
|
||||
skip_cols = st.multiselect(
|
||||
"Columns to skip even if they look like text",
|
||||
options=list(df.columns),
|
||||
default=[],
|
||||
)
|
||||
options.columns = selected_cols if selected_cols else None
|
||||
options.skip_columns = list(skip_cols)
|
||||
|
||||
st.markdown("**Case conversion**")
|
||||
case_global = st.selectbox(
|
||||
"Apply case conversion to selected columns",
|
||||
["None", "UPPER", "lower", "Title", "Sentence"],
|
||||
index=0,
|
||||
)
|
||||
case_map = {
|
||||
"UPPER": "upper", "lower": "lower",
|
||||
"Title": "title", "Sentence": "sentence",
|
||||
}
|
||||
if case_global != "None":
|
||||
options.case = case_map[case_global] # type: ignore[assignment]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Run
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
|
||||
if st.button("Clean Text", type="primary", use_container_width=True):
|
||||
with st.spinner("Cleaning..."):
|
||||
try:
|
||||
result = clean_dataframe(df, options)
|
||||
except ValueError as e:
|
||||
st.error(str(e))
|
||||
st.stop()
|
||||
st.session_state["textclean_result"] = result
|
||||
st.session_state["textclean_input_name"] = uploaded.name
|
||||
|
||||
result = st.session_state.get("textclean_result")
|
||||
if result is None:
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Results")
|
||||
|
||||
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
|
||||
m1, m2, m3, m4 = st.columns(4)
|
||||
m1.metric("Cells scanned", result.cells_total)
|
||||
m2.metric("Cells changed", result.cells_changed)
|
||||
m3.metric("% changed", f"{pct:.1f}%")
|
||||
m4.metric("Columns processed", len(result.columns_processed))
|
||||
|
||||
if result.cells_changed:
|
||||
counts = result.changes["column"].value_counts()
|
||||
st.markdown("**Changes by column**")
|
||||
st.dataframe(
|
||||
counts.rename("cells_changed").to_frame(),
|
||||
use_container_width=True,
|
||||
)
|
||||
|
||||
st.markdown("**Examples (first 25 changes)**")
|
||||
examples = result.changes.head(25).copy()
|
||||
examples["row"] = examples["row"] + 1
|
||||
st.dataframe(examples, use_container_width=True, hide_index=True)
|
||||
|
||||
st.markdown("**Cleaned preview (first 10 rows)**")
|
||||
st.dataframe(result.cleaned_df.head(10), use_container_width=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
stem = Path(st.session_state.get("textclean_input_name", "input")).stem
|
||||
|
||||
dl_a, dl_b, dl_c = st.columns(3)
|
||||
with dl_a:
|
||||
cleaned_bytes = result.cleaned_df.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download cleaned CSV",
|
||||
data=cleaned_bytes,
|
||||
file_name=f"{stem}_cleaned.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_b:
|
||||
if not result.changes.empty:
|
||||
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download changes audit",
|
||||
data=changes_bytes,
|
||||
file_name=f"{stem}_changes.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_c:
|
||||
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download config JSON",
|
||||
data=config_bytes,
|
||||
file_name="text_clean_config.json",
|
||||
mime="application/json",
|
||||
)
|
||||
|
||||
st.divider()
|
||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||
|
||||
Reference in New Issue
Block a user