"""Home-page renderer extracted into its own module.

This used to live inside ``src/gui/app.py`` as a local function. Pulling it
out into a side-effect-free module lets the ``back_to_home_link`` helper
(in ``components/_legacy.py``) import the home callable to pass into
``st.switch_page`` — without re-running ``app.py``'s navigation setup,
which would itself blow up because tool pages have a different "main
script" context that breaks the registry's relative ``pages/…`` paths.

Keep this module imports-light: nothing that runs Streamlit commands at
module top level, nothing that triggers config loads. Just the
``_home_page`` callable.
"""

from __future__ import annotations

import streamlit as st


class _StashedUpload:
    """Duck-types Streamlit's ``UploadedFile`` so ``_run_analysis_on_upload``
    accepts entries restored from session-state without changes.

    Exposes ``.name``, ``.size``, and ``.getvalue()`` — the contract used by
    the analyzer's read path.
    """

    __slots__ = ("name", "size", "_data")

    def __init__(self, name: str, data: bytes) -> None:
        self.name = name
        self.size = len(data)
        self._data = data

    def getvalue(self) -> bytes:
        """Return the stashed file content."""
        return self._data


def _format_size(n: int) -> str:
    """Human-readable byte count for file sizes shown in the GUI.

    Bytes are never displayed — the smallest unit is KB, even for sub-
    kilobyte files (e.g. ``0.5 KB`` for 512 bytes). Steps up to MB once the
    count reaches 1 MiB, then to GB at 1 GiB. Always one decimal place.
    """
    KB = 1024
    MB = 1024 * 1024
    GB = 1024 * 1024 * 1024
    if n < MB:
        return f"{n / KB:.1f} KB"
    if n < GB:
        return f"{n / MB:.1f} MB"
    return f"{n / GB:.1f} GB"


def _render_stats_overview(findings_by_file: dict) -> None:
    """4-card grid above the per-file findings — summarizes the run.

    Card layout follows ``datatools_layout_redesign2.html`` §stats: Files
    analyzed, Total findings, Warnings (severity ``warn`` ∪ ``error``),
    Info (severity ``info``). The warn + info cards are tinted via
    ``.is-warn`` / ``.is-info`` modifiers that read the severity colors
    theme.py declares.

    Args:
        findings_by_file: Maps a filename to an iterable of finding objects;
            each finding is read only through its ``severity`` attribute.
    """
    import html as _html

    n_files = len(findings_by_file)
    all_findings = [f for fs in findings_by_file.values() for f in fs]
    n_total = len(all_findings)
    # Mockup groups errors with warnings on the "to review" card —
    # both demand the user act. ``info`` is the lower-priority pile.
    n_warn = sum(1 for f in all_findings if f.severity in ("warn", "error"))
    n_info = sum(1 for f in all_findings if f.severity == "info")

    def _card(label: str, value: int, unit: str = "", kind: str = "") -> str:
        # NOTE(review): ``cls`` is built but never interpolated below, and the
        # literals hold only newlines — the wrapper markup that presumably
        # carried the class looks to be missing. Confirm against the intended
        # template before shipping.
        cls = "dt-stat" + (f" {kind}" if kind else "")
        unit_html = f'{_html.escape(unit)}' if unit else ""
        return (
            f'\n\n'
            f'\n\n{_html.escape(label)}\n\n'
            f'\n\n{value}{unit_html}\n\n'
            f"\n\n"
        )

    cards = (
        _card("Files analyzed", n_files)
        + _card("Total findings", n_total)
        + _card(
            "Warnings",
            n_warn,
            unit="to review" if n_warn else "",
            kind="is-warn" if n_warn else "",
        )
        + _card(
            "Info",
            n_info,
            unit="suggestions" if n_info else "",
            kind="is-info" if n_info else "",
        )
    )
    st.markdown(
        f'\n\n{cards}\n\n',
        unsafe_allow_html=True,
    )


def _forget_upload(name: str, home_uploads: dict, findings: dict) -> None:
    """Drop every trace of one uploaded file and log the removal.

    Shared by both removal paths (the uploader's ``on_change`` callback and
    the per-row "✕" button) so they cannot drift apart.

    Args:
        name: Filename to forget.
        home_uploads: The upload stash; mutated in place.
        findings: The per-file findings cache; mutated in place.

    Besides the two dicts passed in, this also removes the file's entry from
    ``home_clean_results`` — otherwise re-importing a *different* file under
    the same name would resurface the previous file's cleaned CSV — and
    clears the singular ``home_uploaded_*`` keys when they point at ``name``.
    Callers remain responsible for writing ``home_uploads`` / ``findings``
    back to session state.
    """
    from src.audit import log_event

    home_uploads.pop(name, None)
    findings.pop(name, None)

    clean_results = st.session_state.get("home_clean_results") or {}
    if name in clean_results:
        del clean_results[name]
        st.session_state["home_clean_results"] = clean_results

    log_event("upload", f"Removed {name}", filename=name)

    # If the removed file was the active upload, clear the singular keys so
    # tool pages don't pick up stale bytes.
    if st.session_state.get("home_uploaded_name") == name:
        st.session_state.pop("home_uploaded_name", None)
        st.session_state.pop("home_uploaded_size", None)
        st.session_state.pop("home_uploaded_bytes", None)


def _sync_uploader_to_home_uploads() -> None:
    """``on_change`` callback for the home-page file_uploader.

    Reconciles ``home_uploads`` (our persistent stash) with the widget's
    current value: adds newly-uploaded files, and drops files the user
    explicitly removed via the widget's built-in "✕" button.

    Per Streamlit semantics ``on_change`` only runs for user-initiated
    value changes, so the navigation-induced ``[]`` reset never reaches
    here — the stash survives intact across page switches.
    """
    from src.audit import log_event

    widget_files = st.session_state.get("home_upload") or []
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})
    findings: dict = st.session_state.setdefault("home_findings_by_file", {})

    widget_names = {f.name for f in widget_files}

    # Additions: the first sighting of a name captures its bytes.
    for f in widget_files:
        if f.name not in home_uploads:
            home_uploads[f.name] = {"bytes": f.getvalue(), "size": f.size}
            log_event("upload", f"Uploaded {f.name}", filename=f.name, bytes=f.size)

    # Removals: anything stashed that the widget no longer lists. Snapshot
    # the names first because ``_forget_upload`` mutates the dict.
    for name in [n for n in home_uploads if n not in widget_names]:
        _forget_upload(name, home_uploads, findings)

    st.session_state["home_uploads"] = home_uploads
    st.session_state["home_findings_by_file"] = findings


def _read_upload_df(name: str, data: bytes):
    """Bytes -> DataFrame.

    Mirrors the Automated Workflows page reader: Excel by extension, else
    CSV with encoding fallbacks. Kept in step with
    ``9_Pipeline_Runner._read_uploaded`` so the one-click clean reads files
    exactly as the standalone orchestrator would.

    Args:
        name: Original filename; only its (lower-cased) suffix is used, to
            pick the reader and the delimiter.
        data: Raw file content.

    Returns:
        The ``pandas.DataFrame`` produced by ``read_excel`` / ``read_csv``.
    """
    import io as _io
    from pathlib import Path as _Path

    import pandas as pd

    suffix = _Path(name).suffix.lower()
    bio = _io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    # The delimiter depends only on the extension, so decide it once.
    sep = "\t" if suffix == ".tsv" else ","
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        bio.seek(0)  # each attempt must start from the first byte
        try:
            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
        except UnicodeDecodeError:
            continue

    # Defensive last resort. latin-1 maps every byte, so the loop is expected
    # to have returned already; if this is ever reached it must at least
    # parse with the same delimiter and bad-line policy as the attempts above.
    bio.seek(0)
    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")


def _run_recommended_clean(home_uploads: dict) -> None:
    """Front-door action: run the recommended pipeline (Clean Text ->
    Standardize -> Fix Missing -> Find Duplicates, in that order) on every
    imported file and stash a cleaned CSV per file in ``session_state`` for
    download.

    This is the orchestrator wearing a friendly face — it consumes the same
    ``recommended_pipeline`` the Automated Workflows page builds. Per-file
    errors are captured so one bad file doesn't kill the batch.

    Args:
        home_uploads: The upload stash, ``name -> {"bytes", "size"}``.

    Writes ``st.session_state["home_clean_results"]`` and then triggers a
    rerun via ``st.rerun()``.
    """
    from src.core.pipeline import recommended_pipeline, run_pipeline
    from src.core.errors import format_for_user
    from src.audit import log_event

    pipeline = recommended_pipeline()
    names = list(home_uploads.keys())
    results: dict = {}
    progress = st.progress(0.0, text="Cleaning…")
    for i, name in enumerate(names, start=1):
        # Show the fraction completed *before* this file starts.
        progress.progress((i - 1) / max(len(names), 1), text=name)
        try:
            df = _read_upload_df(name, home_uploads[name]["bytes"])
            res = run_pipeline(df, pipeline, stop_on_error=False)
            results[name] = {
                "csv": res.final_df.to_csv(index=False).encode("utf-8"),
                "initial_rows": res.initial_rows,
                "final_rows": res.final_rows,
                "error": None,
            }
        except Exception as e:  # noqa: BLE001 — surface per file, keep the batch alive
            results[name] = {"csv": None, "error": format_for_user(e)}
    progress.empty()
    log_event("tool_run", "Home one-click recommended clean", files=names)
    st.session_state["home_clean_results"] = results
    st.rerun()


def _render_clean_results() -> None:
    """Render per-file cleaned-CSV download buttons + a short summary from
    the stash produced by :func:`_run_recommended_clean`.

    Only files still present in ``home_uploads`` are shown, so removing a
    file drops its stale result.
    """
    import hashlib as _hashlib

    results: dict = st.session_state.get("home_clean_results", {})
    if not results:
        return
    current = st.session_state.get("home_uploads", {})
    for name, r in results.items():
        if name not in current:
            continue
        if r.get("error"):
            st.error(f"**Could not clean `{name}`**\n\n```\n{r['error']}\n```")
            continue
        # Short, stable token so the widget key is safe for any filename.
        # Only the download button needs it, hence computed after the
        # error branch.
        digest = _hashlib.sha1(
            name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        stem = name.rsplit(".", 1)[0]
        st.download_button(
            f"⬇ Download cleaned {name}",
            data=r["csv"],
            file_name=f"{stem}_cleaned.csv",
            mime="text/csv",
            key=f"home_clean_dl_{digest}",
            width="stretch",
        )
        removed = r["initial_rows"] - r["final_rows"]
        st.caption(
            f"{r['final_rows']:,} rows kept"
            + (f" · {removed:,} removed" if removed else " · nothing to remove")
        )


def _home_page() -> None:
    """Render the home page — multi-file upload + per-file analysis.

    Uploaded files live in ``st.session_state["home_uploads"]`` (a dict
    keyed by filename), NOT in the widget's transient state. Streamlit's
    ``st.file_uploader`` widget gets unmounted when the user navigates
    away to a tool page, and its ``UploadedFile`` objects don't always
    re-attach on remount — so we capture the bytes into our own
    session-state stash on first sight and treat that stash as the source
    of truth for everything downstream (active-file pickup, analysis,
    findings rendering).

    Removing a file: per-row "✕" buttons next to each uploaded filename.
    Clearing findings: the "Clear results" button only wipes the analysis
    cache, not the upload stash — the files persist until the user
    explicitly removes them.
    """
    from src.gui.components import (
        hide_streamlit_chrome,
        render_findings_panel,
        render_sticky_footer,
    )
    from src.gui.components._legacy import _run_analysis_on_upload
    from src.i18n import t

    from pathlib import Path as _Path
    _ICON_PATH = str(_Path(__file__).parent / "assets" / "datatools_icon_256.png")
    st.set_page_config(
        page_title=t("home.page_title"),
        page_icon=_ICON_PATH,
        layout="wide",
    )
    hide_streamlit_chrome()
    render_sticky_footer()

    import html as _html

    # Page header — brand block (D icon + "UNALOGIX" eyebrow over
    # "DataTools" wordmark + tagline) on the left, privacy pill on
    # the right. Matches the sidebar brand chip scaled up for the
    # hero. Bottom border replaces the explicit ``st.divider`` that
    # used to sit below the caption.
    # NOTE(review): the header literal below is empty and ``privacy_label``
    # is never interpolated — the markup described above appears to be
    # missing here; confirm against the intended template.
    privacy_label = _html.escape(t("home.privacy_pill"))
    st.markdown(
        '',
        unsafe_allow_html=True,
    )

    # Source of truth for uploaded files. dict[name -> {"bytes", "size"}].
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})

    # Streamlit's file_uploader is the only path that actually receives
    # bytes from the browser, but we don't want its dropzone UI to
    # compete with the in-card "Add more files" button below. Park the
    # whole widget off-screen via the ``dt-fileuploader-offscreen``
    # CSS rule (declared in ``_DESIGN_TOKENS_CSS``) while keeping the
    # underlying input reachable to JS — the Add button
    # programmatically clicks it to open the OS file picker.
    #
    # ``on_change`` fires ONLY on user-initiated value changes (uploads
    # and the widget's built-in "✕" remove). It does NOT fire on the
    # remount-induced reset. That lets us treat the callback as ground
    # truth for both adds AND removes.
    st.markdown(
        '',
        unsafe_allow_html=True,
    )
    st.file_uploader(
        t("upload.uploader_label_multi"),
        type=["csv", "tsv", "xlsx", "xls"],
        accept_multiple_files=True,
        key="home_upload",
        help=t("upload.uploader_help"),
        on_change=_sync_uploader_to_home_uploads,
        label_visibility="collapsed",
    )

    # ``Files`` section header — count + total size on the right, or
    # "No files imported yet" when empty (mockup §section-head).
    import hashlib

    n_files = len(home_uploads)
    if n_files:
        total_bytes = sum(meta["size"] for meta in home_uploads.values())
        files_word = "file" if n_files == 1 else "files"
        meta_html = (
            f'{n_files} {files_word} · '
            f'{_html.escape(_format_size(total_bytes))} total'
        )
    else:
        meta_html = "No files imported yet"
    st.markdown(
        '\n\n'
        f'\n\nFiles\n\n'
        f'{meta_html}'
        '\n\n',
        unsafe_allow_html=True,
    )

    # Files card — always rendered. Body is file rows (if any) + the
    # in-card "Add more files" button that triggers the off-screen
    # file_uploader. Two-phase click capture for the X buttons: walk
    # all rows once, accumulate ``to_remove`` if any was clicked,
    # then mutate state + rerun ONCE after the loop.
    to_remove: str | None = None
    # NOTE(review): both SVG constants are empty strings as reviewed, and
    # ``_PLUS_SVG`` is not referenced below — confirm the icons weren't lost.
    _DOC_SVG = (
        ''
    )
    _PLUS_SVG = (
        ''
    )
    with st.container(border=True):
        for name in list(home_uploads.keys()):
            digest = hashlib.sha1(
                name.encode("utf-8"), usedforsecurity=False,
            ).hexdigest()[:10]
            # X button on the LEFT of the row per UX feedback —
            # ``✕ | filename + chip | size``.
            col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
            if col_x.button(
                "✕",
                key=f"_home_remove_{digest}",
                help=f"Remove {name}",
                type="tertiary",
            ):
                to_remove = name
            col_name.markdown(
                '\n\n'
                f'{_DOC_SVG}'
                f'{_html.escape(name)}'
                '\n\n',
                unsafe_allow_html=True,
            )
            col_size.markdown(
                f'\n\n'
                f''
                f'{_html.escape(_format_size(home_uploads[name]["size"]))}'
                '\n\n',
                unsafe_allow_html=True,
            )

        # In-card "Add more files" — clicks the (off-screen)
        # ``stFileUploaderDropzoneInput`` so the OS file picker opens.
        # Inline ``onclick`` would be cleanest but Streamlit's HTML
        # sanitizer strips event-handler attributes from
        # ``unsafe_allow_html`` content; the wiring is done from
        # ``_ADD_FILES_BUTTON_JS`` further down via ``st.iframe``.
        st.markdown(
            '',
            unsafe_allow_html=True,
        )

    # Wire the in-card "Add more files" button to the off-screen
    # ``stFileUploaderDropzoneInput`` (Streamlit strips inline
    # ``onclick`` attributes; we have to do the binding from a real
    # script element, which Streamlit only ships through component
    # iframes — same pattern as the sticky footer + Upload→Import
    # rewriter). A ``MutationObserver`` re-wires after reruns when
    # Streamlit remounts the button.
    # NOTE(review): the iframe payload is a single space as reviewed — the
    # script described above is not present here; confirm.
    st.iframe(
        """ """,
        height=1,
    )

    if to_remove is not None:
        # Drop the file plus any findings / clean results tied to it, and
        # clear the singular ``home_uploaded_*`` keys if it was the active
        # upload; the next render repopulates them from whatever file is
        # now first.
        findings_by_file_drop = st.session_state.get(
            "home_findings_by_file", {}
        )
        _forget_upload(to_remove, home_uploads, findings_by_file_drop)
        st.session_state["home_uploads"] = home_uploads
        st.session_state["home_findings_by_file"] = findings_by_file_drop
        st.rerun()

    if not home_uploads:
        # Empty state — page ends cleanly after the Files card. The
        # in-card "Add more files" button is the only affordance the
        # user needs; the old ``upload.empty_state`` info alert was
        # redundant and out of step with the mockup.
        return

    # Expose the first uploaded file via the singular ``home_uploaded_*``
    # session keys so tool pages reached via "Open " still find an
    # active upload through ``pickup_or_upload``.
    first_name = next(iter(home_uploads))
    first_meta = home_uploads[first_name]
    if (
        st.session_state.get("home_uploaded_name") != first_name
        or st.session_state.get("home_uploaded_size") != first_meta["size"]
    ):
        st.session_state["home_uploaded_name"] = first_name
        st.session_state["home_uploaded_size"] = first_meta["size"]
        st.session_state["home_uploaded_bytes"] = first_meta["bytes"]

    # Findings cache — drop entries whose underlying file is no longer
    # in the stash (e.g. user just clicked "✕").
    findings_by_file: dict = st.session_state.setdefault(
        "home_findings_by_file", {}
    )
    findings_by_file = {
        name: result
        for name, result in findings_by_file.items()
        if name in home_uploads
    }
    st.session_state["home_findings_by_file"] = findings_by_file

    pending = [name for name in home_uploads if name not in findings_by_file]

    # Action bar — Run analysis / Clear results.
    col_run, col_clear, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button(
            t("upload.run_button"),
            type="primary",
            key="home_run_analysis",
            disabled=not pending,
            width="stretch",
        )
    with col_clear:
        clear_clicked = st.button(
            t("upload.clear_results"),
            key="home_clear_results",
            disabled=not findings_by_file,
            width="stretch",
        )

    if clear_clicked:
        st.session_state["home_findings_by_file"] = {}
        st.session_state["home_clean_results"] = {}
        st.rerun()

    if run_clicked:
        from src.audit import log_event

        log_event(
            "analyze",
            f"Run analysis clicked on {len(pending)} file(s)",
            files=list(pending),
        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
            findings_by_file[name] = _run_analysis_on_upload(stashed)
            progress.progress(i / len(pending), text=name)
        st.session_state["home_findings_by_file"] = findings_by_file
        # A fresh analysis invalidates any prior one-click clean outputs.
        st.session_state["home_clean_results"] = {}
        progress.empty()
        st.rerun()

    if findings_by_file:
        st.divider()
        # Overview row before drilling into per-file detail. Mockup
        # layout (datatools_layout_redesign2.html §stats) puts a
        # 4-card summary above the findings panels so the user can
        # eyeball the run before expanding any one file.
        _render_stats_overview(findings_by_file)

        # ---- Front door: one-click recommended clean (primary path) ----
        # The analyzer has the findings; the majority case is "just fix
        # it." This primary button runs the recommended pipeline in the
        # correct order and hands back a cleaned file per upload, so the
        # user never has to decide which tool or what order. The per-file
        # findings below remain the "fix one thing at a time" path.
        if st.button(
            "✨ Clean these files for me",
            type="primary",
            key="home_clean_all",
            width="stretch",
        ):
            _run_recommended_clean(home_uploads)
        st.caption(
            "Recommended: cleans text, standardizes formats, fills blanks, "
            "and removes duplicates — in the right order — then gives you the "
            "cleaned file."
        )
        _render_clean_results()

        # ---- Manual path: per-file findings, fix one thing at a time ----
        st.markdown("###### Or fix issues one at a time")
        st.caption("Open any finding below to jump straight to the right tool.")

        # Preserve the upload-stash order so the user sees results in
        # the same order they appear in the file list above.
        for name in home_uploads:
            if name not in findings_by_file:
                continue
            findings = findings_by_file[name]
            with st.container(border=True):
                if not findings:
                    st.markdown(
                        '\n\n'
                        ''
                        f'{_html.escape(name)}'
                        '\n\n'
                        'no issues'
                        '\n\n'
                        '\n\n',
                        unsafe_allow_html=True,
                    )
                else:
                    render_findings_panel(
                        findings,
                        header=name,
                        key_namespace=name,
                    )