Files
datatools-dev/src/gui/pages/2_Text_Cleaner.py
Michael 444dffbc63 chore(ui): rename Upload → Import in user-facing strings
DataTools is local-first — "Upload" reads like "send data somewhere
remote", which contradicts the product positioning. Sweep replaces
the user-visible term throughout the UI:

- ``src/i18n/packs/en.json`` + ``es.json``: all ``upload.*`` strings
  (heading, intro, uploader labels, empty state, switch-back, etc.)
  and ``gate.default_name``. The ``intro_multi`` "no upload anywhere"
  phrasing dropped the verb entirely — now reads "nothing leaves
  this computer".
- All 9 tool pages: ``st.file_uploader(label="Upload …")`` →
  ``"Import …"``; matching ``st.info("Upload a …")`` empty-state
  banners; ``help="Upload …"`` strings on disabled uploaders.
- ``9_Pipeline_Runner`` + ``5_Column_Mapper``: radio-option text
  ``"Upload schema/pipeline JSON"`` → ``"Import …"`` plus the
  ``.startswith("Upload")`` branch guards that read those values.
- ``_home.py``: "**Uploaded files**" → "**Imported files**".
- ``app_demo.py``: "Uploaded file is …" → "Imported file is …".

Internal identifiers left untouched: function names
(``pickup_or_upload``, ``_StashedUpload``), session-state keys
(``home_upload``, ``home_uploads``, ``home_uploaded_*``,
``merger_file_upload``), audit-log event category (``"upload"``),
Streamlit testid CSS selectors. None of those are visible to the
user.

The file_uploader's dropzone button text is a baked-in React
literal that Streamlit's ``label=`` doesn't reach; rewritten at the
DOM level with a small ``_RENAME_UPLOAD_BUTTON_JS`` snippet shipped
through ``st.iframe`` (same pattern the sticky footer uses to mount
on ``<body>``). A ``MutationObserver`` on the parent document re-
applies the swap when Streamlit remounts the dropzone after file
add/remove or page navigation, throttled via ``requestAnimationFrame``.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 23:48:31 +00:00

413 lines
15 KiB
Python

"""DataTools Clean Text — Streamlit page."""
from __future__ import annotations
import io
import json
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
# The project root is four directory levels above this page file. Put it at
# the front of ``sys.path`` (only if it is not already listed) before the
# ``src.*`` imports that follow.
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))
from src.gui.components import (
back_to_home_link,
render_sticky_footer,
hide_streamlit_chrome,
html_download_button,
pickup_or_upload,
render_hidden_aware_preview,
require_feature_or_render_upgrade,
)
from src.i18n import t
from src.license import FeatureFlag
from src.core.text_clean import (
PRESETS,
CleanOptions,
clean_dataframe,
hidden_char_css,
visualize_hidden_html,
)
# Shared page chrome, rendered before any page-specific content.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()
# Imported here, mid-script, rather than in the top-of-file import group.
from src.audit import log_page_open
# Record that this page was opened, identified by its page name.
log_page_open("2_Text_Cleaner")
# NOTE(review): presumably renders an upgrade prompt and halts the page when
# the TEXT_CLEANER feature is not licensed — confirm in src.gui.components.
require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER)
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
# Title and caption are looked up by key through the i18n helper ``t``.
st.title(t("tools.02_text_cleaner.page_title"))
st.caption(t("tools.02_text_cleaner.page_caption"))
# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------
# NOTE(review): ``pickup_or_upload`` presumably hands back a file imported
# earlier or renders an uploader, returning ``None`` when no file is
# available — confirm in src.gui.components.
uploaded = pickup_or_upload(
    label="Import CSV or Excel file",
    key="textclean_file_upload",
    types=["csv", "tsv", "xlsx", "xls"],
)
# Everything below needs a file: show a hint and end this script pass.
if uploaded is None:
    st.info("Import a CSV, TSV, or Excel file to begin.")
    st.stop()
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse uploaded bytes into a DataFrame whose cells are all strings.

    The parser is picked from the extension of ``name``: ``.xlsx`` / ``.xls``
    go through ``pd.read_excel``; anything else is read as delimited text,
    tab-separated for ``.tsv`` and comma-separated otherwise. Empty cells
    stay empty strings (``keep_default_na=False``) instead of becoming NaN.

    Delimited text is decoded as UTF-8, then UTF-8 with BOM, then Latin-1.
    Every attempt uses the same delimiter and ``on_bad_lines="warn"``.

    Args:
        name: Original file name; only its suffix is inspected.
        data: Raw file contents.

    Returns:
        The parsed table, read with ``dtype=str``.

    Raises:
        Exception: Anything pandas raises while parsing propagates to the
            caller, except ``UnicodeDecodeError`` from the two UTF-8
            attempts, which only triggers the next encoding.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)

    # The delimiter depends only on the extension, so compute it once.
    sep = "\t" if suffix == ".tsv" else ","

    def _parse(encoding: str) -> pd.DataFrame:
        # A fresh buffer per attempt: no attempt depends on the stream
        # position left behind by an earlier, failed parse.
        return pd.read_csv(
            io.BytesIO(data), dtype=str, keep_default_na=False,
            encoding=encoding, sep=sep, on_bad_lines="warn",
        )

    for enc in ("utf-8", "utf-8-sig"):
        try:
            return _parse(enc)
        except UnicodeDecodeError:
            continue
    # Last resort. Latin-1 assigns a character to every byte value, so this
    # attempt is not expected to fail on decoding. It is parsed with the
    # same delimiter as the attempts above, so a TSV stays a TSV.
    return _parse("latin-1")
# Parse the imported file. Either failure path renders an in-page error and
# ends this script pass, so ``df`` is always bound for the code that follows.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except UnicodeDecodeError as decode_err:
    decode_help = (
        f"**Could not decode `{uploaded.name}`**\n\n"
        f"The file isn't UTF-8, UTF-8-with-BOM, or Latin-1.\n\n"
        f"_Underlying error: {decode_err}_\n\n"
        f"Try re-saving the file as UTF-8 from the source application, "
        f"or convert it with `iconv -f <source-encoding> -t utf-8`."
    )
    st.error(decode_help)
    st.stop()
except Exception as read_err:
    # Imported lazily: only this failure path needs the formatter.
    from src.core.errors import format_for_user
    read_help = (
        f"**Could not read `{uploaded.name}`**\n\n"
        f"```\n{format_for_user(read_err)}\n```"
    )
    st.error(read_help)
    st.stop()
# The input preview starts expanded and folds itself once a cleaning result
# is stored in session state, so the Results section further down becomes
# the visual focus. The user can still re-open it to inspect the source rows.
_has_result = st.session_state.get("textclean_result") is not None
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
    row_count, col_count = df.shape
    st.caption(f"{row_count} rows, {col_count} columns")
    preview_show_hidden = st.toggle(
        "Show hidden characters in preview",
        value=True,
        help="Highlights NBSP, zero-width chars, smart quotes, and leading/trailing whitespace.",
        key="textclean_preview_show_hidden",
    )
    # Plain grid when the toggle is off, badge-annotated rendering otherwise.
    if not preview_show_hidden:
        st.dataframe(df.head(10), width="stretch")
    else:
        render_hidden_aware_preview(df, n_rows=10)
st.divider()
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Clean Text. Together they push the Results section to the top
# of the visible area after a run.
#
# ``options`` starts from the chosen preset; every widget below is seeded
# with the preset's value and writes the user's choice back onto it.
#
# NOTE(review): nesting reconstructed from a flattened source — confirm that
# the Scope and Case conversion sections sit directly under "Options" and
# not inside "Advanced options".
with st.expander("Options", expanded=not _has_result):
    preset_label = st.radio(
        "Preset",
        ["excel-hygiene (recommended)", "minimal", "paranoid"],
        index=0,
        horizontal=True,
        help=(
            "excel-hygiene: trim, collapse whitespace, fold smart quotes, strip "
            "invisible chars, normalize line endings, NFC. "
            "minimal: only trim and collapse. "
            "paranoid: everything including NFKC compat fold (lossy)."
        ),
    )
    # The preset key is the first word of the radio label
    # ("excel-hygiene (recommended)" -> "excel-hygiene").
    preset_key = preset_label.split(" ", 1)[0]
    options = CleanOptions.from_preset(preset_key)

    with st.expander("Advanced options"):
        col_a, col_b = st.columns(2)
        with col_a:
            options.trim = st.checkbox("Trim leading/trailing whitespace", value=options.trim)
            options.collapse_whitespace = st.checkbox(
                "Collapse internal whitespace", value=options.collapse_whitespace,
            )
            options.normalize_line_endings = st.checkbox(
                "Normalize line endings (\\r\\n → \\n)", value=options.normalize_line_endings,
            )
            options.strip_control = st.checkbox(
                "Strip control characters", value=options.strip_control,
            )
            options.strip_bom = st.checkbox("Strip BOM", value=options.strip_bom)
        with col_b:
            options.fold_smart_chars = st.checkbox(
                "Fold smart characters (curly quotes, em-dash, NBSP)",
                value=options.fold_smart_chars,
            )
            options.strip_zero_width = st.checkbox(
                "Strip zero-width / invisible characters", value=options.strip_zero_width,
            )
            options.nfc = st.checkbox("Unicode NFC normalization", value=options.nfc)
            # ``\ufb01`` is the single-codepoint "fi" ligature. It is written
            # as an escape so the before/after example stays distinguishable
            # even when an editor or text tool expands ligatures.
            options.nfkc = st.checkbox(
                "Unicode NFKC compat fold (lossy: ① → 1, \ufb01 → fi)",
                value=options.nfkc,
            )

    st.markdown("**Scope**")
    # Columns pandas reports as object/string dtype are pre-selected.
    string_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    selected_cols = st.multiselect(
        "Columns to clean (default: all string columns)",
        options=list(df.columns),
        default=string_cols,
    )
    skip_cols = st.multiselect(
        "Columns to skip even if they look like text",
        options=list(df.columns),
        default=[],
    )
    # An empty selection is handed to the cleaner as ``None``, not ``[]``.
    options.columns = selected_cols or None
    options.skip_columns = list(skip_cols)

    st.markdown("**Case conversion**")
    case_global = st.selectbox(
        "Apply case conversion to selected columns",
        ["None", "UPPER", "lower", "Title", "Sentence"],
        index=0,
    )
    # Display label -> value stored on ``options.case``.
    case_map = {
        "UPPER": "upper", "lower": "lower",
        "Title": "title", "Sentence": "sentence",
    }
    # "None" leaves whatever case setting the preset supplied untouched.
    if case_global != "None":
        options.case = case_map[case_global]  # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
st.divider()
if st.button("Clean Text", type="primary", width="stretch"):
    with st.spinner("Cleaning..."):
        try:
            result = clean_dataframe(df, options)
        except ValueError as e:
            # Only ValueError is reported in-page; the pass ends here and
            # nothing is stored in session state.
            st.error(str(e))
            st.stop()
    # Keep the result in session state so it survives later script passes.
    st.session_state["textclean_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Clean Text run", page="2_Text_Cleaner")
    # Remember which file produced the result; the download section builds
    # its file names from this.
    st.session_state["textclean_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
    st.session_state["_textclean_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    # Without this they stay expanded until the user touches any
    # other widget.
    st.rerun()
# From here on, always work from the stored result. With none stored yet
# there is nothing to show, so the pass ends.
result = st.session_state.get("textclean_result")
if result is None:
    st.stop()
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
# Scroll target for the auto-scroll snippet at the end of the page. Per the
# original author's note, a bare ``<div id="...">`` survives Streamlit's HTML
# sanitizer (only ``<script>`` is stripped), and at 1px tall it does not
# visibly shift the layout. It sits before the subheader so the scrolled-to
# viewport starts just above the section heading rather than below it.
st.markdown(
    '<div id="textclean-results-anchor" style="height:1px"></div>',
    unsafe_allow_html=True,
)
st.subheader("Results")
# Share of scanned cells that changed; 0.0 when nothing was scanned.
if result.cells_total:
    pct = result.cells_changed / result.cells_total * 100.0
else:
    pct = 0.0
summary_metrics = (
    ("Cells scanned", result.cells_total),
    ("Cells changed", result.cells_changed),
    ("% changed", f"{pct:.1f}%"),
    ("Columns processed", len(result.columns_processed)),
)
# One metric tile per column, left to right.
for metric_col, (metric_label, metric_value) in zip(st.columns(4), summary_metrics):
    metric_col.metric(metric_label, metric_value)
# One toggle controls both the Examples table and the Cleaned preview. It is
# created outside the ``if result.cells_changed`` branch so ``show_hidden``
# is bound on every pass, including runs that changed nothing (the original
# author notes this used to raise ``NameError: show_hidden``).
show_hidden = st.toggle(
    "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
    value=True,
    help=(
        "Highlights characters the cleaner is removing or replacing. "
        "Hover any badge to see the codepoint and label."
    ),
    key="textclean_show_hidden",
)
# Per-column change counts plus a sample of individual changes. Skipped
# entirely when the run changed no cells.
if result.cells_changed:
    per_column = result.changes["column"].value_counts().rename("cells_changed")
    st.markdown("**Changes by column**")
    st.dataframe(per_column.to_frame(), width="stretch")

    st.markdown("**Examples (first 25 changes)**")
    # Work on a separate frame; the displayed row number is the stored row
    # value plus one.
    examples = result.changes.head(25).assign(row=lambda frame: frame["row"] + 1)

    if not show_hidden:
        st.dataframe(examples, width="stretch", hide_index=True)
    else:
        # Inject the badge CSS, then render a hand-built HTML table so the
        # invisible characters in the old/new values are shown as badges.
        # ``mark_outer_whitespace=True`` matches the input preview, so
        # leading/trailing spaces show up in the Before/After columns too.
        st.markdown(hidden_char_css(), unsafe_allow_html=True)

        def _badge(value):
            # Stringify, then mark hidden characters (outer whitespace too).
            return visualize_hidden_html(str(value), mark_outer_whitespace=True)

        header_html = "".join(
            f"<th style='text-align:left'>{title}</th>"
            for title in ("Row", "Column", "Before", "After", "Ops applied")
        )
        body_html = "".join(
            "<tr>"
            f"<td>{change['row']}</td>"
            f"<td><code>{_badge(change['column'])}</code></td>"
            f"<td>{_badge(change['old'])}</td>"
            f"<td>{_badge(change['new'])}</td>"
            f"<td><code>{change['ops_applied']}</code></td>"
            "</tr>"
            for _, change in examples.iterrows()
        )
        table_css = (
            "<style>"
            ".hidden-char-table { width: 100%; border-collapse: collapse; }"
            ".hidden-char-table th, .hidden-char-table td { "
            " padding: 4px 8px; border-bottom: 1px solid #eee; "
            " vertical-align: top; }"
            ".hidden-char-table tbody tr:hover { background: #fafafa; }"
            "</style>"
        )
        st.markdown(
            "<table class='hidden-char-table'>"
            + f"<thead><tr>{header_html}</tr></thead>"
            + f"<tbody>{body_html}</tbody>"
            + "</table>"
            + table_css,
            unsafe_allow_html=True,
        )
st.markdown("**Cleaned preview (first 10 rows)**")
# Driven by the same ``show_hidden`` toggle as the Examples table, so one
# switch controls both the change audit and this preview.
cleaned_preview_df = result.cleaned_df
if not show_hidden:
    st.dataframe(cleaned_preview_df.head(10), width="stretch")
else:
    render_hidden_aware_preview(cleaned_preview_df, n_rows=10)
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
#
# All three downloads go through ``html_download_button`` (a raw
# ``<a download>`` anchor) instead of ``st.download_button``. Per the
# original author's note, the latter has a long-standing bug where the
# second and third buttons rendered in one script pass never reach the
# browser's save dialog; the HTML helper sidesteps the widget system.
#
# NOTE(review): ``options`` is rebuilt from the widgets on every pass while
# ``result`` comes from session state, so the config JSON reflects the
# current widget values — confirm that is the intended pairing.
st.divider()
stem = Path(st.session_state.get("textclean_input_name", "input")).stem
cleaned_bytes = result.cleaned_df.to_csv(index=False).encode("utf-8-sig")
no_changes = result.changes.empty
if no_changes:
    # Nothing to audit: empty payload behind a disabled button.
    changes_bytes = b""
else:
    changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
    html_download_button(
        "Download cleaned CSV",
        cleaned_bytes,
        file_name=f"{stem}_cleaned.csv",
        mime="text/csv",
    )
with dl_b:
    html_download_button(
        "Download changes audit",
        changes_bytes,
        file_name=f"{stem}_changes.csv",
        mime="text/csv",
        disabled=no_changes,
        help="No changes to audit." if no_changes else None,
    )
with dl_c:
    html_download_button(
        "Download config JSON",
        config_bytes,
        file_name="text_clean_config.json",
        mime="application/json",
    )
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# After a Clean Text click the preview and options fold away, but nothing
# moves the viewport, so the user would have to hunt for the Results
# section. A 1px iframe carries a script that asks the parent document to
# ``scrollIntoView`` the Results anchor. (Original author's note: the main
# page is same-origin with these iframes, so ``window.parent.document`` is
# reachable.)
#
# ``pop`` consumes the flag, making the scroll one-shot: later passes
# triggered by other widgets in the Results section (for example the
# Show-hidden toggle) do not yank the viewport back to the top of Results.
_scroll_to_results_html = """
<script>
const doc = window.parent.document;
const target = doc.getElementById('textclean-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
"""
if st.session_state.pop("_textclean_scroll_to_results", False):
    st.iframe(_scroll_to_results_html, height=1)