Files
datatools-dev/src/gui/_home.py
Michael d807d3c11b feat(gui): add the one-click "Clean these files for me" front door
Issue #1 (the make-or-break UX fix): after the analyzer runs, Home now
leads with a primary "Clean these files for me" CTA that runs the
recommended pipeline (Clean Text -> Standardize -> Fix Missing -> Find
Duplicates, in order) on every imported file and hands back a cleaned
CSV per file — collapsing "which tool, what order" to one click. The
existing per-finding cards remain, reframed as "Or fix issues one at a
time" for users who want manual control.

- Reuses the core API verbatim (recommended_pipeline + run_pipeline);
  reader mirrors 9_Pipeline_Runner._read_uploaded so files load the same
  way the standalone orchestrator loads them.
- Per-file errors are captured so one bad file doesn't kill the batch;
  cleaned CSVs are cached in session_state so downloads survive reruns
  and are pruned when a file is removed or re-analyzed.

Verified: the read -> run_pipeline -> CSV data path executes correctly
(compile + a non-Streamlit functional smoke test). The Streamlit UI
scaffolding (button / download_button / progress / session_state)
mirrors the proven runner page but still needs a `streamlit run` check.
Front-door copy is English literals for now; i18n keys are a follow-up.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 17:06:30 +00:00

617 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Home-page renderer extracted into its own module.
This used to live inside ``src/gui/app.py`` as a local function. Pulling
it out into a side-effect-free module lets the ``back_to_home_link``
helper (in ``components/_legacy.py``) import the home callable to pass
into ``st.switch_page`` — without re-running ``app.py``'s navigation
setup, which would itself blow up because tool pages have a different
"main script" context that breaks the registry's relative ``pages/…``
paths.
Keep this module imports-light: nothing that runs Streamlit commands
at module top level, nothing that triggers config loads. Just the
``_home_page`` callable.
"""
from __future__ import annotations
import streamlit as st
class _StashedUpload:
    """In-memory stand-in for Streamlit's ``UploadedFile``.

    Entries restored from session state are wrapped in this class so
    ``_run_analysis_on_upload`` can consume them without changes. Only
    the members that read path relies on are provided: ``.name``,
    ``.size`` and ``.getvalue()``.
    """

    __slots__ = ("name", "size", "_data")

    def __init__(self, name: str, data: bytes) -> None:
        # ``size`` is derived from the payload instead of being passed in.
        self._data = data
        self.size = len(data)
        self.name = name

    def getvalue(self) -> bytes:
        """Return the stashed file contents as bytes."""
        return self._data
def _format_size(n: int) -> str:
    """Format a byte count as a short size label for the GUI.

    The result always carries exactly one decimal place. KB is the
    smallest unit used, so sub-kilobyte inputs appear as a fraction of
    a KB (512 bytes -> ``0.5 KB``). Counts of 1 MiB and above are shown
    in MB, and counts of 1 GiB and above in GB.
    """
    kib = 1 << 10
    mib = 1 << 20
    gib = 1 << 30
    # Check the largest unit first; the cut-offs are unchanged.
    if n >= gib:
        return f"{n / gib:.1f} GB"
    if n >= mib:
        return f"{n / mib:.1f} MB"
    return f"{n / kib:.1f} KB"
def _render_stats_overview(findings_by_file: dict) -> None:
    """Render the four-card summary grid shown above the per-file findings.

    Card layout follows ``datatools_layout_redesign2.html`` §stats. In
    order the cards are: Files analyzed, Total findings, Warnings
    (findings whose severity is ``warn`` or ``error``) and Info
    (severity ``info``). The Warnings and Info cards receive the
    ``is-warn`` / ``is-info`` modifier class, plus a unit caption, only
    when their count is non-zero.

    ``findings_by_file`` maps a file name to an iterable of finding
    objects; each finding is read through its ``severity`` attribute.
    """
    from html import escape

    def _stat_card(label: str, value: int, unit: str = "", kind: str = "") -> str:
        # One card: label on top, value below, optional unit caption.
        classes = f"dt-stat {kind}" if kind else "dt-stat"
        unit_markup = ""
        if unit:
            unit_markup = f'<span class="dt-stat-unit">{escape(unit)}</span>'
        return "".join((
            f'<div class="{classes}">',
            f'<div class="dt-stat-label">{escape(label)}</div>',
            f'<div class="dt-stat-value">{value}{unit_markup}</div>',
            "</div>",
        ))

    severities = [
        finding.severity
        for file_findings in findings_by_file.values()
        for finding in file_findings
    ]
    # Errors are grouped with warnings on the "to review" card: both
    # need the user to act. ``info`` is the lower-priority pile.
    warn_count = sum(1 for sev in severities if sev in ("warn", "error"))
    info_count = sum(1 for sev in severities if sev == "info")

    card_markup = [
        _stat_card("Files analyzed", len(findings_by_file)),
        _stat_card("Total findings", len(severities)),
        _stat_card(
            "Warnings",
            warn_count,
            unit="to review" if warn_count else "",
            kind="is-warn" if warn_count else "",
        ),
        _stat_card(
            "Info",
            info_count,
            unit="suggestions" if info_count else "",
            kind="is-info" if info_count else "",
        ),
    ]
    st.markdown(
        f'<div class="dt-stats">{"".join(card_markup)}</div>',
        unsafe_allow_html=True,
    )
def _sync_uploader_to_home_uploads() -> None:
    """``on_change`` callback for the home-page file_uploader.

    Reconciles ``home_uploads`` (the persistent stash) with the widget's
    current value: adds newly-uploaded files, and drops files the user
    explicitly removed via the widget's built-in remove button. Per
    Streamlit semantics ``on_change`` only runs for user-initiated
    value changes, so the navigation-induced ``[]`` reset never reaches
    here — the stash survives intact across page switches.

    When a file is dropped, everything cached under its name goes with
    it: its findings, its one-click clean output, and (if it was the
    active upload) the singular ``home_uploaded_*`` keys. Pruning the
    clean output matters because results are keyed by filename only; a
    different file imported later under the same name would otherwise
    be offered the previous file's cleaned CSV.
    """
    from src.audit import log_event

    widget_files = st.session_state.get("home_upload") or []
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})
    findings: dict = st.session_state.setdefault("home_findings_by_file", {})
    # Mutated in place when present; never created here.
    clean_results: dict = st.session_state.get("home_clean_results") or {}

    # Adds: stash bytes for every widget file not seen before.
    for f in widget_files:
        if f.name not in home_uploads:
            home_uploads[f.name] = {"bytes": f.getvalue(), "size": f.size}
            log_event("upload", f"Uploaded {f.name}", filename=f.name, bytes=f.size)

    # Removes: anything stashed that the widget no longer holds.
    widget_names = {f.name for f in widget_files}
    for name in list(home_uploads):
        if name in widget_names:
            continue
        del home_uploads[name]
        findings.pop(name, None)
        clean_results.pop(name, None)
        log_event("upload", f"Removed {name}", filename=name)
        if st.session_state.get("home_uploaded_name") == name:
            # The active upload was removed; clear the singular keys
            # so no stale bytes are left behind under them.
            st.session_state.pop("home_uploaded_name", None)
            st.session_state.pop("home_uploaded_size", None)
            st.session_state.pop("home_uploaded_bytes", None)

    st.session_state["home_uploads"] = home_uploads
    st.session_state["home_findings_by_file"] = findings
def _read_upload_df(name: str, data: bytes):
    """Parse raw upload bytes into a pandas DataFrame.

    The file type is chosen from the extension of ``name``
    (case-insensitive): ``.xlsx`` / ``.xls`` go through
    ``pd.read_excel``; everything else is read as delimited text, with
    a tab separator for ``.tsv`` and a comma otherwise. Text is decoded
    by trying ``utf-8``, ``utf-8-sig`` and ``latin-1`` in that order,
    moving on only when pandas raises ``UnicodeDecodeError``.

    Intended to stay in step with ``9_Pipeline_Runner._read_uploaded``
    so the one-click clean loads files the way the standalone
    orchestrator does.
    """
    import io as _io
    from pathlib import Path as _Path
    import pandas as pd

    ext = _Path(name).suffix.lower()
    buffer = _io.BytesIO(data)
    if ext in (".xlsx", ".xls"):
        return pd.read_excel(buffer)

    delimiter = "\t" if ext == ".tsv" else ","
    for encoding in ("utf-8", "utf-8-sig", "latin-1"):
        # Rewind before every attempt; a failed parse may have consumed
        # part of the buffer.
        buffer.seek(0)
        try:
            return pd.read_csv(
                buffer, encoding=encoding, sep=delimiter, on_bad_lines="warn",
            )
        except UnicodeDecodeError:
            pass

    # Reached only if every attempt above raised UnicodeDecodeError.
    # This last read uses pandas' default separator and bad-line handling.
    buffer.seek(0)
    return pd.read_csv(buffer, encoding="latin-1")
def _run_recommended_clean(home_uploads: dict) -> None:
    """Front-door action: run the recommended pipeline on every imported file.

    The pipeline is whatever ``recommended_pipeline()`` returns
    (described as Clean Text -> Standardize -> Fix Missing -> Find
    Duplicates, in that order). Each entry of ``home_uploads`` is read
    from its stashed ``"bytes"``, run through ``run_pipeline`` with
    ``stop_on_error=False``, and its result stored under the file name
    in ``st.session_state["home_clean_results"]``:

    * success: ``{"csv", "initial_rows", "final_rows", "error": None}``
      where ``csv`` is the UTF-8 encoded CSV of the final frame;
    * failure: ``{"csv": None, "error": <formatted message>}``.

    Any exception raised while reading or cleaning one file is captured
    for that file only, so one bad file does not kill the batch. The
    function finishes by triggering ``st.rerun()``.
    """
    from src.core.pipeline import recommended_pipeline, run_pipeline
    from src.core.errors import format_for_user
    from src.audit import log_event

    pipeline = recommended_pipeline()
    names = list(home_uploads.keys())
    results: dict = {}
    progress = st.progress(0.0, text="Cleaning…")
    denominator = max(len(names), 1)
    for done, name in enumerate(names):
        # Show the fraction of files finished so far and the current file.
        progress.progress(done / denominator, text=name)
        try:
            frame = _read_upload_df(name, home_uploads[name]["bytes"])
            outcome = run_pipeline(frame, pipeline, stop_on_error=False)
            entry = {
                "csv": outcome.final_df.to_csv(index=False).encode("utf-8"),
                "initial_rows": outcome.initial_rows,
                "final_rows": outcome.final_rows,
                "error": None,
            }
        except Exception as exc:  # noqa: BLE001 — surface per file, keep the batch alive
            entry = {"csv": None, "error": format_for_user(exc)}
        results[name] = entry
    progress.empty()
    log_event("tool_run", "Home one-click recommended clean", files=names)
    st.session_state["home_clean_results"] = results
    st.rerun()
def _render_clean_results() -> None:
    """Render the outcome of the last one-click clean, one file at a time.

    Reads ``st.session_state["home_clean_results"]`` (written by
    :func:`_run_recommended_clean`). For each entry whose file name is
    still present in ``home_uploads`` it shows either an error box
    (when the entry carries an ``"error"``) or a download button for
    the cleaned CSV followed by a rows-kept / rows-removed caption.
    Entries for files no longer in the stash are skipped, so a removed
    file's result is not displayed.
    """
    import hashlib as _hashlib

    results: dict = st.session_state.get("home_clean_results", {})
    if not results:
        return

    stash = st.session_state.get("home_uploads", {})
    for name, entry in results.items():
        if name not in stash:
            continue
        # Short, stable per-file suffix for the widget key.
        key_suffix = _hashlib.sha1(
            name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        if entry.get("error"):
            st.error(f"**Could not clean `{name}`**\n\n```\n{entry['error']}\n```")
        else:
            # Drop only the last extension; output is always CSV.
            base = name.rsplit(".", 1)[0]
            st.download_button(
                f"⬇ Download cleaned {name}",
                data=entry["csv"],
                file_name=f"{base}_cleaned.csv",
                mime="text/csv",
                key=f"home_clean_dl_{key_suffix}",
                width="stretch",
            )
            dropped = entry["initial_rows"] - entry["final_rows"]
            summary = f"{entry['final_rows']:,} rows kept"
            if dropped:
                summary += f" · {dropped:,} removed"
            else:
                summary += " · nothing to remove"
            st.caption(summary)
def _home_page() -> None:
    """Render the home page — multi-file upload + per-file analysis.

    Uploaded files live in ``st.session_state["home_uploads"]`` (a
    dict keyed by filename), NOT in the widget's transient state.
    Streamlit's ``st.file_uploader`` widget gets unmounted when the
    user navigates away to a tool page, and its ``UploadedFile``
    objects don't always re-attach on remount — so the bytes are
    captured into our own session-state stash on first sight and that
    stash is treated as the source of truth for everything downstream
    (active-file pickup, analysis, findings rendering).

    Removing a file: per-row remove buttons next to each uploaded
    filename. Removal drops the file from the stash together with its
    cached findings and its one-click clean output; results are keyed
    by filename only, so a stale entry would otherwise be shown for a
    different file later imported under the same name.

    Clearing findings: the "Clear results" button wipes the analysis
    cache and the one-click clean outputs, not the upload stash — the
    files persist until the user explicitly removes them.
    """
    from src.gui.components import (
        hide_streamlit_chrome,
        render_findings_panel,
        render_sticky_footer,
    )
    from src.gui.components._legacy import _run_analysis_on_upload
    from src.i18n import t
    from pathlib import Path as _Path

    _ICON_PATH = str(_Path(__file__).parent / "assets" / "datatools_icon_256.png")
    st.set_page_config(
        page_title=t("home.page_title"),
        page_icon=_ICON_PATH,
        layout="wide",
    )
    hide_streamlit_chrome()
    render_sticky_footer()

    import html as _html

    # Page header — brand block (D icon + "UNALOGIX" eyebrow over
    # "DataTools" wordmark + tagline) on the left, privacy pill on
    # the right. Matches the sidebar brand chip scaled up for the
    # hero. Bottom border replaces the explicit ``st.divider`` that
    # used to sit below the caption.
    privacy_label = _html.escape(t("home.privacy_pill"))
    st.markdown(
        '<header class="dt-page-header">'
        '<div class="dt-page-brand">'
        '<div class="dt-page-brand-row">'
        '<div class="dt-page-brand-mark">D</div>'
        '<div class="dt-page-brand-words">'
        '<span class="dt-page-eyebrow">UNALOGIX</span>'
        '<h1 class="dt-page-wordmark">DataTools</h1>'
        '</div>'
        '</div>'
        f'<p class="dt-page-subtitle">{_html.escape(t("home.caption"))}</p>'
        '</div>'
        '<span class="dt-privacy-pill">'
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<rect x="4" y="11" width="16" height="10" rx="2"/>'
        '<path d="M8 11V7a4 4 0 018 0v4"/>'
        '</svg>'
        f'{privacy_label}'
        '</span>'
        '</header>',
        unsafe_allow_html=True,
    )

    # Source of truth for uploaded files. dict[name -> {"bytes", "size"}].
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})

    # Streamlit's file_uploader is the only path that actually receives
    # bytes from the browser, but we don't want its dropzone UI to
    # compete with the in-card "Add more files" button below. Park the
    # whole widget off-screen via the ``dt-fileuploader-offscreen``
    # CSS rule (declared in ``_DESIGN_TOKENS_CSS``) while keeping the
    # underlying ``<input type="file">`` reachable to JS — the Add
    # button programmatically clicks it to open the OS file picker.
    #
    # ``on_change`` fires ONLY on user-initiated value changes (uploads
    # and the widget's built-in "✕" remove). It does NOT fire on the
    # remount-induced reset. That lets us treat the callback as ground
    # truth for both adds AND removes.
    st.markdown(
        '<style>[data-testid="stFileUploader"] {'
        'position:absolute!important;left:-10000px!important;'
        'width:1px!important;height:1px!important;overflow:hidden!important;'
        'pointer-events:none!important;}</style>',
        unsafe_allow_html=True,
    )
    st.file_uploader(
        t("upload.uploader_label_multi"),
        type=["csv", "tsv", "xlsx", "xls"],
        accept_multiple_files=True,
        key="home_upload",
        help=t("upload.uploader_help"),
        on_change=_sync_uploader_to_home_uploads,
        label_visibility="collapsed",
    )

    # ``Files`` section header — count + total size on the right, or
    # "No files imported yet" when empty (mockup §section-head).
    import hashlib

    n_files = len(home_uploads)
    if n_files:
        total_bytes = sum(meta["size"] for meta in home_uploads.values())
        files_word = "file" if n_files == 1 else "files"
        meta_html = (
            f'{n_files} {files_word} · '
            f'{_html.escape(_format_size(total_bytes))} total'
        )
    else:
        meta_html = "No files imported yet"
    st.markdown(
        '<div class="dt-files-section-head">'
        f'<h2>Files</h2>'
        f'<span class="dt-section-meta">{meta_html}</span>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Files card — always rendered. Body is file rows (if any) + the
    # in-card "Add more files" button that triggers the off-screen
    # file_uploader. Two-phase click capture for the X buttons: walk
    # all rows once, accumulate ``to_remove`` if any was clicked,
    # then mutate state + rerun ONCE after the loop.
    to_remove: str | None = None
    _DOC_SVG = (
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/>'
        '<path d="M14 2v6h6"/>'
        '</svg>'
    )
    _PLUS_SVG = (
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<path d="M12 5v14M5 12h14"/>'
        '</svg>'
    )
    with st.container(border=True):
        for name in list(home_uploads.keys()):
            # Short, stable per-file suffix for the widget key.
            digest = hashlib.sha1(
                name.encode("utf-8"), usedforsecurity=False,
            ).hexdigest()[:10]
            # X button on the LEFT of the row per UX feedback —
            # ``✕ | filename + chip | size``.
            col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
            # NOTE(review): the button label is an empty string in this
            # view of the file — confirm the intended glyph is present.
            if col_x.button(
                "",
                key=f"_home_remove_{digest}",
                help=f"Remove {name}",
                type="tertiary",
            ):
                to_remove = name
            col_name.markdown(
                '<div class="dt-file-row">'
                f'<span class="dt-file-icon-chip">{_DOC_SVG}</span>'
                f'<span class="dt-file-name">{_html.escape(name)}</span>'
                '</div>',
                unsafe_allow_html=True,
            )
            col_size.markdown(
                f'<div style="text-align:right;">'
                f'<span class="dt-file-size">'
                f'{_html.escape(_format_size(home_uploads[name]["size"]))}'
                '</span></div>',
                unsafe_allow_html=True,
            )
        # In-card "Add more files" — clicks the (off-screen)
        # ``stFileUploaderDropzoneInput`` so the OS file picker opens.
        # Inline ``onclick`` would be cleanest but Streamlit's HTML
        # sanitizer strips event-handler attributes from
        # ``unsafe_allow_html`` content; the wiring is done from
        # ``_ADD_FILES_BUTTON_JS`` further down via ``st.iframe``.
        st.markdown(
            '<button class="dt-file-add" type="button">'
            f'{_PLUS_SVG} Add more files'
            '</button>',
            unsafe_allow_html=True,
        )

    # Wire the in-card "Add more files" button to the off-screen
    # ``stFileUploaderDropzoneInput`` (Streamlit strips inline
    # ``onclick`` attributes; we have to do the binding from a real
    # script element, which Streamlit only ships through component
    # iframes — same pattern as the sticky footer + Upload→Import
    # rewriter). A ``MutationObserver`` re-wires after reruns when
    # Streamlit remounts the button.
    st.iframe(
        """
<script>
(function () {
function wire(doc) {
var btn = doc.querySelector('button.dt-file-add');
var input = doc.querySelector('[data-testid="stFileUploaderDropzoneInput"]');
if (!btn || !input) return;
if (btn.dataset.dtWired === '1') return;
btn.dataset.dtWired = '1';
btn.addEventListener('click', function (e) {
e.preventDefault();
input.click();
});
}
var doc;
try { doc = window.parent.document; }
catch (e) { doc = document; }
wire(doc);
var win = doc.defaultView || window.parent || window;
if ('MutationObserver' in win) {
var raf = 0;
try {
new win.MutationObserver(function () {
if (raf) return;
raf = win.requestAnimationFrame(function () { raf = 0; wire(doc); });
}).observe(doc.body, { childList: true, subtree: true });
} catch (e) {}
}
})();
</script>
""",
        height=1,
    )

    if to_remove is not None:
        from src.audit import log_event
        log_event(
            "upload",
            f"Removed {to_remove}",
            filename=to_remove,
        )
        del home_uploads[to_remove]
        # Drop any findings/results tied to the removed file.
        findings_by_file_drop = st.session_state.get(
            "home_findings_by_file", {}
        )
        findings_by_file_drop.pop(to_remove, None)
        # Also drop the one-click clean output; results are keyed by
        # filename only, so a stale entry would resurface if a
        # different file with the same name were imported later.
        clean_results_drop = st.session_state.get("home_clean_results")
        if clean_results_drop:
            clean_results_drop.pop(to_remove, None)
        st.session_state["home_uploads"] = home_uploads
        st.session_state["home_findings_by_file"] = findings_by_file_drop
        # If we just removed the active upload, also clear the
        # singular ``home_uploaded_*`` keys so tool pages don't
        # pick up stale bytes; the next render will repopulate
        # them from whatever file is now first.
        if st.session_state.get("home_uploaded_name") == to_remove:
            st.session_state.pop("home_uploaded_name", None)
            st.session_state.pop("home_uploaded_size", None)
            st.session_state.pop("home_uploaded_bytes", None)
        st.rerun()

    if not home_uploads:
        # Empty state — page ends cleanly after the Files card. The
        # in-card "Add more files" button is the only affordance the
        # user needs; the old ``upload.empty_state`` info alert was
        # redundant and out of step with the mockup.
        return

    # Expose the first uploaded file via the singular ``home_uploaded_*``
    # session keys so tool pages reached via "Open <Tool>" still find an
    # active upload through ``pickup_or_upload``.
    first_name = next(iter(home_uploads))
    first_meta = home_uploads[first_name]
    if (
        st.session_state.get("home_uploaded_name") != first_name
        or st.session_state.get("home_uploaded_size") != first_meta["size"]
    ):
        st.session_state["home_uploaded_name"] = first_name
        st.session_state["home_uploaded_size"] = first_meta["size"]
        st.session_state["home_uploaded_bytes"] = first_meta["bytes"]

    # Findings cache — drop entries whose underlying file is no longer
    # in the stash (e.g. user just clicked "✕").
    findings_by_file: dict = st.session_state.setdefault(
        "home_findings_by_file", {}
    )
    findings_by_file = {
        name: result for name, result in findings_by_file.items()
        if name in home_uploads
    }
    st.session_state["home_findings_by_file"] = findings_by_file
    # Same pruning for one-click clean outputs, so a file removed by
    # any path cannot leave a result behind under its name.
    clean_results = st.session_state.get("home_clean_results")
    if clean_results:
        st.session_state["home_clean_results"] = {
            name: result for name, result in clean_results.items()
            if name in home_uploads
        }
    pending = [name for name in home_uploads if name not in findings_by_file]

    # Action bar — Run analysis / Clear results.
    col_run, col_clear, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button(
            t("upload.run_button"),
            type="primary",
            key="home_run_analysis",
            disabled=not pending,
            width="stretch",
        )
    with col_clear:
        clear_clicked = st.button(
            t("upload.clear_results"),
            key="home_clear_results",
            disabled=not findings_by_file,
            width="stretch",
        )

    if clear_clicked:
        st.session_state["home_findings_by_file"] = {}
        st.session_state["home_clean_results"] = {}
        st.rerun()

    if run_clicked:
        from src.audit import log_event
        log_event(
            "analyze",
            f"Run analysis clicked on {len(pending)} file(s)",
            files=list(pending),
        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
            findings_by_file[name] = _run_analysis_on_upload(stashed)
            progress.progress(i / len(pending), text=name)
        st.session_state["home_findings_by_file"] = findings_by_file
        # A fresh analysis invalidates any prior one-click clean outputs.
        st.session_state["home_clean_results"] = {}
        progress.empty()
        st.rerun()

    if findings_by_file:
        st.divider()
        # Overview row before drilling into per-file detail. Mockup
        # layout (datatools_layout_redesign2.html §stats) puts a
        # 4-card summary above the findings panels so the user can
        # eyeball the run before expanding any one file.
        _render_stats_overview(findings_by_file)

        # ---- Front door: one-click recommended clean (primary path) ----
        # The analyzer has the findings; the majority case is "just fix
        # it." This primary button runs the recommended pipeline in the
        # correct order and hands back a cleaned file per upload, so the
        # user never has to decide which tool or what order. The per-file
        # findings below remain the "fix one thing at a time" path.
        if st.button(
            "✨ Clean these files for me",
            type="primary",
            key="home_clean_all",
            width="stretch",
        ):
            _run_recommended_clean(home_uploads)
        st.caption(
            "Recommended: cleans text, standardizes formats, fills blanks, "
            "and removes duplicates — in the right order — then gives you the "
            "cleaned file."
        )
        _render_clean_results()

        # ---- Manual path: per-file findings, fix one thing at a time ----
        st.markdown("###### Or fix issues one at a time")
        st.caption("Open any finding below to jump straight to the right tool.")
        # Preserve the upload-stash order so the user sees results in
        # the same order they appear in the file list above.
        for name in home_uploads:
            if name not in findings_by_file:
                continue
            findings = findings_by_file[name]
            with st.container(border=True):
                if not findings:
                    st.markdown(
                        '<div class="dt-finding-group-head">'
                        '<span class="dt-severity-dot success"></span>'
                        f'<span class="dt-group-filename">{_html.escape(name)}</span>'
                        '<div class="dt-group-counts">'
                        '<span class="dt-count-pill success">no issues</span>'
                        '</div>'
                        '</div>',
                        unsafe_allow_html=True,
                    )
                else:
                    render_findings_panel(
                        findings,
                        header=name,
                        key_namespace=name,
                    )