Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.
Core (src/core/):
- analyze.py: Finding gains confidence, fix_action, pre_applied; new
detectors for encoding_uncertain, encoding_decode_failed; new top-
level encoding_override parameter.
- fixes.py: registry of fix algorithms keyed by fix_action id.
- normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
the NormalizationResult / Decision dataclasses the gate consumes.
- io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
and normalizes line endings (fixes bare-CR parser crash); empty file
handled gracefully instead of EmptyDataError traceback.
GUI (src/gui/):
- pages/0_Review.py: gate page with per-finding decision controls,
encoding override picker (16 codepages + custom), and Advanced output
options (encoding, delimiter, line terminator) on the download.
- components.py: require_normalization_gate() helper.
- pages/1-9: gate guard wired on every tool page.
Test corpora:
- test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
UTF-8 files + manifest, synced from Business/DataTools.
- test-cases/text-cleaner-corpus/test_data/17: synced malformed input
(unquoted $1,500.00) for the unquoted-delimiter detector.
Tests (94 new):
- test_normalize.py (48): finding fields, fix registry, auto_fix scope,
decision paths, gate idempotency, output-options helper.
- test_encodings_corpus.py (90, 16 xfailed): parametric detection +
decode + analyzer-no-crash sweep against the manifest.
- test_analyze.py: encoding override + encoding_uncertain detectors.
- test_corpus.py: pre-parse repair in the strict reader.
run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.
Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.
Suite: 765 passed, 17 xfailed (was 458 passed).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
318 lines
11 KiB
Python
318 lines
11 KiB
Python
"""DataTools Text Cleaner — Streamlit page."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import streamlit as st
|
|
|
|
# Put the directory four levels above this page file on sys.path (once) so the
# absolute ``src.*`` imports below resolve when the page is run directly.
_project_root = Path(__file__).resolve().parent.parent.parent.parent
_project_root_str = str(_project_root)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)
|
|
|
|
from src.gui.components import (
|
|
hide_streamlit_chrome,
|
|
pickup_or_upload,
|
|
render_hidden_aware_preview,
|
|
require_normalization_gate,
|
|
)
|
|
from src.core.text_clean import (
|
|
PRESETS,
|
|
CleanOptions,
|
|
clean_dataframe,
|
|
hidden_char_css,
|
|
visualize_hidden_html,
|
|
)
|
|
|
|
# Page chrome first, then the normalization gate guard.
# NOTE(review): the gate helper lives in src.gui.components; presumably it
# halts this page until the Review step is satisfied — confirm there.
hide_streamlit_chrome()
require_normalization_gate()


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

st.title("✂️ Text Cleaner")
st.caption(
    "Trim whitespace, fold smart quotes, strip invisible characters, and "
    "normalize line endings. Runs locally — your data never leaves this computer."
)

# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

uploaded = pickup_or_upload(
    key="textclean_file_upload",
    label="Upload CSV or Excel file",
    types=["csv", "tsv", "xlsx", "xls"],
)

# Nothing to work on yet: show a hint and end this script run here.
if uploaded is None:
    st.info("Upload a CSV, TSV, or Excel file to begin.")
    st.stop()
|
|
|
|
|
|
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame, treating all cells as strings.

    Args:
        name: Original file name. Only its suffix is inspected, to choose
            between the Excel reader and the CSV/TSV reader.
        data: Raw bytes of the uploaded file.

    Returns:
        A DataFrame in which every column is read with ``dtype=str`` and empty
        cells stay empty strings (``keep_default_na=False``) instead of NaN.

    Raises:
        Whatever ``pandas.read_excel`` / ``pandas.read_csv`` raise for input
        they cannot parse; the caller reports these to the user.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio, dtype=str, keep_default_na=False)

    # CSV / TSV. The delimiter depends only on the suffix, so compute it once.
    sep = "\t" if suffix == ".tsv" else ","

    # "utf-8-sig" decodes plain UTF-8 as well as BOM-prefixed UTF-8, so one
    # attempt covers both. "latin-1" maps every possible byte to a code point
    # and therefore can never raise UnicodeDecodeError: it is the terminal
    # fallback and no further branch is needed after it.
    for enc in ("utf-8-sig", "latin-1"):
        bio.seek(0)
        try:
            return pd.read_csv(
                bio,
                dtype=str,
                keep_default_na=False,
                encoding=enc,
                sep=sep,
                on_bad_lines="warn",  # skip malformed rows with a warning
            )
        except UnicodeDecodeError:
            continue

    # Defensive only: the latin-1 attempt above cannot fail on decoding.
    raise ValueError(f"Could not decode {name!r} as text")
|
|
|
|
|
|
# Parse the upload; any reader failure is reported in the page and ends the
# script run instead of surfacing a traceback.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as exc:
    st.error(f"Failed to read file: {exc}")
    st.stop()

n_rows, n_cols = len(df), len(df.columns)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{n_rows} rows, {n_cols} columns")

preview_show_hidden = st.toggle(
    "Show hidden characters in preview",
    key="textclean_preview_show_hidden",
    value=True,
    help="Highlights NBSP, zero-width chars, smart quotes, and leading/trailing whitespace.",
)
# First ten rows, either as a plain grid or through the hidden-character-aware
# renderer from src.gui.components.
if not preview_show_hidden:
    st.dataframe(df.head(10), use_container_width=True)
else:
    render_hidden_aware_preview(df, n_rows=10)

st.divider()
|
|
|
|
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------

st.subheader("Options")

preset_label = st.radio(
    "Preset",
    ["excel-hygiene (recommended)", "minimal", "paranoid"],
    index=0,
    horizontal=True,
    help=(
        "excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
        "invisible chars, normalize line endings, NFC. "
        "minimal: only trim and collapse. "
        "paranoid: everything including NFKC compat fold (lossy)."
    ),
)
# The preset id is the first space-delimited word of the radio label.
preset_key = preset_label.partition(" ")[0]
options = CleanOptions.from_preset(preset_key)

# NOTE(review): the nesting below is reconstructed; confirm that the Scope and
# Case conversion controls belong inside the "Advanced options" expander.
with st.expander("Advanced options"):
    col_a, col_b = st.columns(2)

    # Each checkbox starts from the preset's value and writes the user's
    # choice back onto the same CleanOptions attribute.
    left_toggles = (
        ("trim", "Trim leading/trailing whitespace"),
        ("collapse_whitespace", "Collapse internal whitespace"),
        ("normalize_line_endings", "Normalize line endings (\\r\\n → \\n)"),
        ("strip_control", "Strip control characters"),
        ("strip_bom", "Strip BOM"),
    )
    right_toggles = (
        ("fold_smart_chars", "Fold smart characters (curly quotes, em-dash, NBSP)"),
        ("strip_zero_width", "Strip zero-width / invisible characters"),
        ("nfc", "Unicode NFC normalization"),
        ("nfkc", "Unicode NFKC compat fold (lossy: ① → 1, ﬁ → fi)"),
    )
    for column, toggles in ((col_a, left_toggles), (col_b, right_toggles)):
        with column:
            for attr, label in toggles:
                setattr(options, attr, st.checkbox(label, value=getattr(options, attr)))

    st.markdown("**Scope**")
    string_cols = []
    for c in df.columns:
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
            string_cols.append(c)
    all_columns = list(df.columns)
    selected_cols = st.multiselect(
        "Columns to clean (default: all string columns)",
        options=all_columns,
        default=string_cols,
    )
    skip_cols = st.multiselect(
        "Columns to skip even if they look like text",
        options=list(df.columns),
        default=[],
    )
    # An empty selection is passed on as None rather than an empty list.
    options.columns = selected_cols or None
    options.skip_columns = list(skip_cols)

    st.markdown("**Case conversion**")
    case_global = st.selectbox(
        "Apply case conversion to selected columns",
        ["None", "UPPER", "lower", "Title", "Sentence"],
        index=0,
    )
    # UI label -> value stored on options.case; "None" leaves the preset value.
    case_map = {
        "UPPER": "upper",
        "lower": "lower",
        "Title": "title",
        "Sentence": "sentence",
    }
    if case_global != "None":
        options.case = case_map[case_global]  # type: ignore[assignment]
|
|
|
|
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

st.divider()

run_clicked = st.button("Clean Text", type="primary", use_container_width=True)
if run_clicked:
    with st.spinner("Cleaning..."):
        try:
            result = clean_dataframe(df, options)
        except ValueError as exc:
            # Show the cleaner's message in the page and end this run.
            st.error(str(exc))
            st.stop()
    # Keep the result (and the name of the file it came from) in session
    # state so it is still available on reruns where the button is not pressed.
    st.session_state["textclean_result"] = result
    st.session_state["textclean_input_name"] = uploaded.name

# Everything below needs a stored result; without one the page ends here.
result = st.session_state.get("textclean_result")
if result is None:
    st.stop()
|
|
|
|
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

st.subheader("Results")

# Share of scanned cells that changed; 0.0 when nothing was scanned.
if result.cells_total:
    pct = result.cells_changed / result.cells_total * 100.0
else:
    pct = 0.0

summary = (
    ("Cells scanned", result.cells_total),
    ("Cells changed", result.cells_changed),
    ("% changed", f"{pct:.1f}%"),
    ("Columns processed", len(result.columns_processed)),
)
for slot, (label, value) in zip(st.columns(4), summary):
    slot.metric(label, value)

# NOTE(review): the extent of this ``if`` is reconstructed. Everything that
# reads ``show_hidden`` is kept inside it so the name is always bound; confirm
# whether the cleaned preview should also appear when no cell changed.
if result.cells_changed:
    counts = result.changes["column"].value_counts()
    st.markdown("**Changes by column**")
    st.dataframe(
        counts.rename("cells_changed").to_frame(),
        use_container_width=True,
    )

    st.markdown("**Examples (first 25 changes)**")
    show_hidden = st.toggle(
        "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
        key="textclean_show_hidden",
        value=True,
        help=(
            "Highlights characters the cleaner is removing or replacing. "
            "Hover any badge to see the codepoint and label."
        ),
    )
    # Work on a copy of the first 25 changes, with the row number shifted by
    # one for display.
    examples = result.changes.head(25).copy()
    examples["row"] += 1

    if not show_hidden:
        st.dataframe(examples, use_container_width=True, hide_index=True)
    else:
        # Inject the badge CSS once, then render an HTML table so the
        # invisibles in old/new are actually visible to the user.
        st.markdown(hidden_char_css(), unsafe_allow_html=True)
        rows_html = [
            "<tr>"
            f"<td>{change['row']}</td>"
            f"<td><code>{visualize_hidden_html(str(change['column']))}</code></td>"
            f"<td>{visualize_hidden_html(str(change['old']))}</td>"
            f"<td>{visualize_hidden_html(str(change['new']))}</td>"
            f"<td><code>{change['ops_applied']}</code></td>"
            "</tr>"
            for _, change in examples.iterrows()
        ]
        header_html = "".join(
            f"<th style='text-align:left'>{title}</th>"
            for title in ("Row", "Column", "Before", "After", "Ops applied")
        )
        table_css = (
            "<style>"
            ".hidden-char-table { width: 100%; border-collapse: collapse; }"
            ".hidden-char-table th, .hidden-char-table td { "
            " padding: 4px 8px; border-bottom: 1px solid #eee; "
            " vertical-align: top; }"
            ".hidden-char-table tbody tr:hover { background: #fafafa; }"
            "</style>"
        )
        st.markdown(
            "<table class='hidden-char-table'>"
            f"<thead><tr>{header_html}</tr></thead>"
            f"<tbody>{''.join(rows_html)}</tbody>"
            "</table>" + table_css,
            unsafe_allow_html=True,
        )

    st.markdown("**Cleaned preview (first 10 rows)**")
    # Reuse the same toggle the Examples table uses so the user controls both
    # the changes audit and the cleaned preview with one switch.
    if not show_hidden:
        st.dataframe(result.cleaned_df.head(10), use_container_width=True)
    else:
        render_hidden_aware_preview(result.cleaned_df, n_rows=10)
|
|
|
|
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------

st.divider()

# Download names are based on the file name stored when the result was
# produced, falling back to "input" if none was stored.
source_name = st.session_state.get("textclean_input_name", "input")
stem = Path(source_name).stem

dl_a, dl_b, dl_c = st.columns(3)

# CSV payloads are encoded as "utf-8-sig", i.e. UTF-8 with a leading BOM.
with dl_a:
    cleaned_bytes = result.cleaned_df.to_csv(index=False).encode("utf-8-sig")
    st.download_button(
        "Download cleaned CSV",
        data=cleaned_bytes,
        file_name=f"{stem}_cleaned.csv",
        mime="text/csv",
    )

# The audit download is only offered when the changes table is non-empty.
has_changes = not result.changes.empty
with dl_b:
    if has_changes:
        changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
        st.download_button(
            "Download changes audit",
            data=changes_bytes,
            file_name=f"{stem}_changes.csv",
            mime="text/csv",
        )

# Serialises the options object as currently configured in the page.
with dl_c:
    config_text = json.dumps(options.to_dict(), indent=2)
    config_bytes = config_text.encode("utf-8")
    st.download_button(
        "Download config JSON",
        data=config_bytes,
        file_name="text_clean_config.json",
        mime="application/json",
    )

st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|