From d807d3c11bb1b4e7cc214d298e28de66cc167955 Mon Sep 17 00:00:00 2001
From: Michael
Date: Mon, 8 Jun 2026 17:06:30 +0000
Subject: [PATCH] feat(gui): add the one-click "Clean these files for me" front door
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #1 (the make-or-break UX fix): after the analyzer runs, Home now
leads with a primary "Clean these files for me" CTA that runs the
recommended pipeline (Clean Text -> Standardize -> Fix Missing -> Find
Duplicates, in order) on every imported file and hands back a cleaned
CSV per file — collapsing "which tool, what order" to one click. The
existing per-finding cards remain, reframed as "Or fix issues one at a
time" for users who want manual control.

- Reuses the core API verbatim (recommended_pipeline + run_pipeline);
  reader mirrors 9_Pipeline_Runner._read_uploaded so files load the
  same way the standalone orchestrator loads them.
- Per-file errors are captured so one bad file doesn't kill the batch;
  cleaned CSVs are cached in session_state so downloads survive reruns
  and are pruned when a file is removed or re-analyzed.

Verified: the read -> run_pipeline -> CSV data path executes correctly
(compile + a non-Streamlit functional smoke test). The Streamlit UI
scaffolding (button / download_button / progress / session_state)
mirrors the proven runner page but still needs a `streamlit run` check.
Front-door copy is English literals for now; i18n keys are a follow-up.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 src/gui/_home.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

diff --git a/src/gui/_home.py b/src/gui/_home.py
index 234a6fc..c065cce 100644
--- a/src/gui/_home.py
+++ b/src/gui/_home.py
@@ -146,6 +146,123 @@ def _sync_uploader_to_home_uploads() -> None:
     st.session_state["home_findings_by_file"] = findings
 
 
+def _read_upload_df(name: str, data: bytes):
+    """Parse uploaded file bytes into a pandas DataFrame.
+
+    Mirrors the Automated Workflows page reader: Excel by extension,
+    else CSV with encoding fallbacks. Kept in step with
+    ``9_Pipeline_Runner._read_uploaded`` so the one-click clean reads
+    files exactly as the standalone orchestrator would.
+
+    ``name`` is the upload's file name (only its suffix is inspected);
+    ``data`` is the raw file content. Parser errors other than
+    ``UnicodeDecodeError`` propagate to the caller.
+    """
+    import io as _io
+    from pathlib import Path as _Path
+    import pandas as pd
+
+    suffix = _Path(name).suffix.lower()
+    bio = _io.BytesIO(data)
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(bio)
+    # Loop-invariant: ``.tsv`` is tab-separated, everything else comma.
+    sep = "\t" if suffix == ".tsv" else ","
+    for enc in ("utf-8", "utf-8-sig"):
+        try:
+            bio.seek(0)
+            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
+        except UnicodeDecodeError:
+            continue
+    # latin-1 maps every byte value, so this final attempt cannot raise
+    # UnicodeDecodeError; keeping it outside the loop avoids an
+    # unreachable trailing fallback.
+    bio.seek(0)
+    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
+
+
+def _run_recommended_clean(home_uploads: dict) -> None:
+    """Run the recommended pipeline on every imported file.
+
+    Front-door action: runs Clean Text -> Standardize -> Fix Missing ->
+    Find Duplicates, in that order, and stashes a cleaned CSV per file
+    in ``st.session_state["home_clean_results"]`` for download. This is
+    the orchestrator wearing a friendly face — it consumes the same
+    ``recommended_pipeline`` the Automated Workflows page builds.
+
+    ``home_uploads`` maps file name -> stash dict holding the raw
+    ``"bytes"``. Per-file errors are captured so one bad file doesn't
+    kill the batch. Always ends by triggering ``st.rerun()``.
+    """
+    from src.core.pipeline import recommended_pipeline, run_pipeline
+    from src.core.errors import format_for_user
+    from src.audit import log_event
+
+    pipeline = recommended_pipeline()
+    names = list(home_uploads)
+    results: dict = {}
+    progress = st.progress(0.0, text="Cleaning…")
+    for i, name in enumerate(names):
+        # Fraction of files finished *before* this one starts.
+        progress.progress(i / len(names), text=name)
+        try:
+            df = _read_upload_df(name, home_uploads[name]["bytes"])
+            res = run_pipeline(df, pipeline, stop_on_error=False)
+            results[name] = {
+                "csv": res.final_df.to_csv(index=False).encode("utf-8"),
+                "initial_rows": res.initial_rows,
+                "final_rows": res.final_rows,
+                "error": None,
+            }
+        except Exception as e:  # noqa: BLE001 — surface per file, keep the batch alive
+            results[name] = {"csv": None, "error": format_for_user(e)}
+    progress.empty()
+    log_event("tool_run", "Home one-click recommended clean", files=names)
+    st.session_state["home_clean_results"] = results
+    st.rerun()
+
+
+def _render_clean_results() -> None:
+    """Render a download button + short summary per cleaned file.
+
+    Reads the stash produced by :func:`_run_recommended_clean`. Only
+    files still present in ``home_uploads`` are shown, so removing a
+    file drops its stale result. Files that failed to clean get an
+    error box instead of a download button.
+    """
+    import hashlib as _hashlib
+
+    results: dict = st.session_state.get("home_clean_results", {})
+    if not results:
+        return
+    current = st.session_state.get("home_uploads", {})
+    for name, r in results.items():
+        if name not in current:
+            continue
+        if r.get("error"):
+            st.error(f"**Could not clean `{name}`**\n\n```\n{r['error']}\n```")
+            continue
+        # Short digest of the name keeps each widget key unique per file.
+        digest = _hashlib.sha1(
+            name.encode("utf-8"), usedforsecurity=False,
+        ).hexdigest()[:10]
+        # A name like ".csv" would otherwise produce an empty stem.
+        stem = name.rsplit(".", 1)[0] or name
+        st.download_button(
+            f"⬇ Download cleaned {name}",
+            data=r["csv"],
+            file_name=f"{stem}_cleaned.csv",
+            mime="text/csv",
+            key=f"home_clean_dl_{digest}",
+            width="stretch",
+        )
+        removed = r["initial_rows"] - r["final_rows"]
+        st.caption(
+            f"{r['final_rows']:,} rows kept"
+            + (f" · {removed:,} removed" if removed else " · nothing to remove")
+        )
+
+
 def _home_page() -> None:
     """Render the home page — multi-file upload + per-file analysis.
 
@@ -443,6 +560,7 @@ def _home_page() -> None:
 
     if clear_clicked:
         st.session_state["home_findings_by_file"] = {}
+        st.session_state["home_clean_results"] = {}
         st.rerun()
 
     if run_clicked:
@@ -458,6 +576,8 @@ def _home_page() -> None:
             findings_by_file[name] = _run_analysis_on_upload(stashed)
             progress.progress(i / len(pending), text=name)
         st.session_state["home_findings_by_file"] = findings_by_file
+        # A fresh analysis invalidates any prior one-click clean outputs.
+        st.session_state["home_clean_results"] = {}
         progress.empty()
         st.rerun()
 
@@ -468,6 +588,30 @@ def _home_page() -> None:
         # 4-card summary above the findings panels so the user can
         # eyeball the run before expanding any one file.
         _render_stats_overview(findings_by_file)
+
+        # ---- Front door: one-click recommended clean (primary path) ----
+        # The analyzer has the findings; the majority case is "just fix
+        # it." This primary button runs the recommended pipeline in the
+        # correct order and hands back a cleaned file per upload, so the
+        # user never has to decide which tool or what order. The per-file
+        # findings below remain the "fix one thing at a time" path.
+        if st.button(
+            "✨ Clean these files for me",
+            type="primary",
+            key="home_clean_all",
+            width="stretch",
+        ):
+            _run_recommended_clean(home_uploads)
+        st.caption(
+            "Recommended: cleans text, standardizes formats, fills blanks, "
+            "and removes duplicates — in the right order — then gives you the "
+            "cleaned file."
+        )
+        _render_clean_results()
+
+        # ---- Manual path: per-file findings, fix one thing at a time ----
+        st.markdown("###### Or fix issues one at a time")
+        st.caption("Open any finding below to jump straight to the right tool.")
         # Preserve the upload-stash order so the user sees results in
         # the same order they appear in the file list above.
         for name in home_uploads: