# datatools-dev/src/gui/_home.py

"""Home-page renderer extracted into its own module.

This used to live inside ``src/gui/app.py`` as a local function. Pulling
it out into a side-effect-free module lets the ``back_to_home_link``
helper (in ``components/_legacy.py``) import the home callable to pass
into ``st.switch_page`` — without re-running ``app.py``'s navigation
setup, which would itself blow up because tool pages have a different
"main script" context that breaks the registry's relative ``pages/…``
paths.

Keep this module imports-light: nothing that runs Streamlit commands
at module top level, nothing that triggers config loads. Just the
``_home_page`` callable.
"""

from __future__ import annotations

import streamlit as st


class _StashedUpload:
    """Duck-types Streamlit's ``UploadedFile`` so ``_run_analysis_on_upload``
    accepts entries restored from session-state without changes. Exposes
    ``.name``, ``.size``, and ``.getvalue()`` — the contract used by the
    analyzer's read path.
    """

    __slots__ = ("name", "size", "_data")

    def __init__(self, name: str, data: bytes) -> None:
        self.name = name
        self.size = len(data)
        self._data = data

    def getvalue(self) -> bytes:
        return self._data


def _format_size(n: int) -> str:
    """Human-readable byte count for file sizes shown in the GUI.

    Bytes are never displayed — the smallest unit is KB, even for sub-
    kilobyte files (e.g. ``0.5 KB`` for 512 bytes). Steps up to MB
    once the count reaches 1 MiB, then to GB at 1 GiB. Always one
    decimal place.
    """
    KB = 1024
    MB = 1024 * 1024
    GB = 1024 * 1024 * 1024
    if n < MB:
        return f"{n / KB:.1f} KB"
    if n < GB:
        return f"{n / MB:.1f} MB"
    return f"{n / GB:.1f} GB"


def _render_stats_overview(findings_by_file: dict) -> None:
    """Emit the four-card summary strip shown above the per-file findings.

    Cards, left to right: files analyzed, total findings, warnings
    (findings whose severity is ``warn`` or ``error``) and info
    (severity ``info``). The warnings and info cards receive their
    ``is-warn`` / ``is-info`` modifier class and a unit caption only
    when their count is non-zero. The markup is written through
    ``st.markdown`` with ``unsafe_allow_html=True``; label and unit
    text are HTML-escaped first.
    """
    import html as _html

    flat = [item for group in findings_by_file.values() for item in group]

    # Errors are counted together with warnings: both are things the
    # user has to act on, while ``info`` is the lower-priority pile.
    warn_total = 0
    info_total = 0
    for item in flat:
        if item.severity in ("warn", "error"):
            warn_total += 1
        if item.severity == "info":
            info_total += 1

    # One (label, value, unit, modifier-class) tuple per card.
    specs = [
        ("Files analyzed", len(findings_by_file), "", ""),
        ("Total findings", len(flat), "", ""),
        (
            "Warnings",
            warn_total,
            "to review" if warn_total else "",
            "is-warn" if warn_total else "",
        ),
        (
            "Info",
            info_total,
            "suggestions" if info_total else "",
            "is-info" if info_total else "",
        ),
    ]

    pieces = []
    for label, value, unit, kind in specs:
        css_class = f"dt-stat {kind}" if kind else "dt-stat"
        if unit:
            unit_markup = (
                f'<span class="dt-stat-unit">{_html.escape(unit)}</span>'
            )
        else:
            unit_markup = ""
        pieces.append(
            f'<div class="{css_class}">'
            f'<div class="dt-stat-label">{_html.escape(label)}</div>'
            f'<div class="dt-stat-value">{value}{unit_markup}</div>'
            "</div>"
        )

    st.markdown(
        '<div class="dt-stats">' + "".join(pieces) + "</div>",
        unsafe_allow_html=True,
    )


def _sync_uploader_to_home_uploads() -> None:
    """``on_change`` callback for the home-page file_uploader.

    Reconciles ``home_uploads`` (our persistent stash) with the widget's
    current value: adds newly-uploaded files, and drops files the user
    explicitly removed via the widget's built-in "✕" button. Per
    Streamlit semantics ``on_change`` only runs for user-initiated
    value changes, so the navigation-induced ``[]`` reset never reaches
    here — the stash survives intact across page switches.

    Removing a file also discards everything derived from it: its
    cached findings, its one-click clean output in
    ``home_clean_results``, and — when it was the active upload — the
    singular ``home_uploaded_*`` keys. Dropping the clean output matters
    because results are keyed by filename only; without it, uploading a
    different file under the same name later would surface the old
    file's cleaned CSV.
    """
    from src.audit import log_event

    widget_files = st.session_state.get("home_upload") or []
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})
    findings: dict = st.session_state.setdefault("home_findings_by_file", {})
    clean_results: dict = st.session_state.setdefault("home_clean_results", {})

    widget_names = {f.name for f in widget_files}

    # Additions: capture bytes for any file the stash has not seen yet.
    for f in widget_files:
        if f.name not in home_uploads:
            home_uploads[f.name] = {"bytes": f.getvalue(), "size": f.size}
            log_event("upload", f"Uploaded {f.name}", filename=f.name, bytes=f.size)

    # Removals: iterate over a snapshot because the dict is mutated.
    for name in list(home_uploads):
        if name in widget_names:
            continue
        del home_uploads[name]
        findings.pop(name, None)
        # Filename-keyed clean output must go too, or a later upload
        # with the same name would inherit it.
        clean_results.pop(name, None)
        log_event("upload", f"Removed {name}", filename=name)
        if st.session_state.get("home_uploaded_name") == name:
            st.session_state.pop("home_uploaded_name", None)
            st.session_state.pop("home_uploaded_size", None)
            st.session_state.pop("home_uploaded_bytes", None)

    st.session_state["home_uploads"] = home_uploads
    st.session_state["home_findings_by_file"] = findings
    st.session_state["home_clean_results"] = clean_results


def _read_upload_df(name: str, data: bytes):
    """Bytes -> DataFrame. Mirrors the Automated Workflows page reader:
    Excel by extension, else CSV with encoding fallbacks. Kept in step
    with ``9_Pipeline_Runner._read_uploaded`` so the one-click clean
    reads files exactly as the standalone orchestrator would."""
    import io as _io
    from pathlib import Path as _Path
    import pandas as pd

    suffix = _Path(name).suffix.lower()
    bio = _io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            bio.seek(0)
            sep = "\t" if suffix == ".tsv" else ","
            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
        except UnicodeDecodeError:
            continue
    bio.seek(0)
    return pd.read_csv(bio, encoding="latin-1")


def _run_recommended_clean(home_uploads: dict) -> None:
    """One-click action: clean every imported file with the recommended pipeline.

    Builds the pipeline via ``recommended_pipeline()`` (described
    upstream as Clean Text -> Standardize -> Fix Missing -> Find
    Duplicates, in that order — the same pipeline the Automated
    Workflows page uses), runs it over each stashed upload, and stores
    one result entry per filename in
    ``st.session_state["home_clean_results"]``. A successful entry holds
    the cleaned CSV as UTF-8 bytes plus the before/after row counts; a
    failed one holds ``csv=None`` and a user-facing error string, so a
    single bad file never aborts the batch. Finishes by logging the run
    and calling ``st.rerun()``.
    """
    from src.core.pipeline import recommended_pipeline, run_pipeline
    from src.core.errors import format_for_user
    from src.audit import log_event

    steps = recommended_pipeline()
    file_names = list(home_uploads)
    # Guard the denominator so an empty stash cannot divide by zero.
    denominator = max(len(file_names), 1)
    outcome: dict = {}

    bar = st.progress(0.0, text="Cleaning…")
    for done, file_name in enumerate(file_names):
        # The bar shows the fraction completed *before* this file starts.
        bar.progress(done / denominator, text=file_name)
        try:
            frame = _read_upload_df(file_name, home_uploads[file_name]["bytes"])
            run = run_pipeline(frame, steps, stop_on_error=False)
            entry = {
                "csv": run.final_df.to_csv(index=False).encode("utf-8"),
                "initial_rows": run.initial_rows,
                "final_rows": run.final_rows,
                "error": None,
            }
        except Exception as exc:  # noqa: BLE001 — surface per file, keep the batch alive
            entry = {"csv": None, "error": format_for_user(exc)}
        outcome[file_name] = entry
    bar.empty()

    log_event("tool_run", "Home one-click recommended clean", files=file_names)
    st.session_state["home_clean_results"] = outcome
    st.rerun()


def _render_clean_results() -> None:
    """Show the outcome of the last one-click clean, one file at a time.

    Reads the stash written by :func:`_run_recommended_clean`. Entries
    whose file is no longer in ``home_uploads`` are skipped, which is
    how a removed file's stale result disappears. A failed entry is
    rendered as an error box; a successful one as a download button for
    the cleaned CSV followed by a rows-kept / rows-removed caption.
    """
    import hashlib as _hashlib

    stash: dict = st.session_state.get("home_clean_results", {})
    if not stash:
        return

    live_uploads = st.session_state.get("home_uploads", {})
    visible = (
        (file_name, entry)
        for file_name, entry in stash.items()
        if file_name in live_uploads
    )
    for file_name, entry in visible:
        failure = entry.get("error")
        if failure:
            st.error(f"**Could not clean `{file_name}`**\n\n```\n{failure}\n```")
            continue

        # Short filename hash keeps the widget key unique per file.
        key_suffix = _hashlib.sha1(
            file_name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        # Text before the last dot; the whole name if there is no dot.
        base = file_name.rsplit(".", 1)[0]
        st.download_button(
            f"⬇  Download cleaned {file_name}",
            data=entry["csv"],
            file_name=f"{base}_cleaned.csv",
            mime="text/csv",
            key=f"home_clean_dl_{key_suffix}",
            width="stretch",
        )

        dropped = entry["initial_rows"] - entry["final_rows"]
        if dropped:
            tail = f" · {dropped:,} removed"
        else:
            tail = " · nothing to remove"
        st.caption(f"{entry['final_rows']:,} rows kept" + tail)


def _home_page() -> None:
    """Render the home page — multi-file upload + per-file analysis.

    Uploaded files live in ``st.session_state["home_uploads"]`` (a
    dict keyed by filename), NOT in the widget's transient state.
    Streamlit's ``st.file_uploader`` widget gets unmounted when the
    user navigates away to a tool page, and its ``UploadedFile``
    objects don't always re-attach on remount — so we capture the
    bytes into our own session-state stash on first sight and treat
    that stash as the source of truth for everything downstream
    (active-file pickup, analysis, findings rendering).

    Removing a file: per-row "✕" buttons next to each uploaded
    filename. Removal also drops that file's cached findings and its
    one-click clean output; both are keyed by filename, so leaving
    them behind would let a later same-named upload inherit stale
    results.

    Clearing results: the "Clear results" button wipes the analysis
    cache and the one-click clean outputs, not the upload stash — the
    files persist until the user explicitly removes them.
    """
    from src.gui.components import (
        hide_streamlit_chrome,
        render_findings_panel,
        render_sticky_footer,
    )
    from src.gui.components._legacy import _run_analysis_on_upload
    from src.i18n import t

    from pathlib import Path as _Path
    _ICON_PATH = str(_Path(__file__).parent / "assets" / "datatools_icon_256.png")
    st.set_page_config(
        page_title=t("home.page_title"),
        page_icon=_ICON_PATH,
        layout="wide",
    )
    hide_streamlit_chrome()
    render_sticky_footer()

    import html as _html
    # Page header — brand block (D icon + "UNALOGIX" eyebrow over
    # "DataTools" wordmark + tagline) on the left, privacy pill on
    # the right. Matches the sidebar brand chip scaled up for the
    # hero. Bottom border replaces the explicit ``st.divider`` that
    # used to sit below the caption.
    privacy_label = _html.escape(t("home.privacy_pill"))
    st.markdown(
        '<header class="dt-page-header">'
        '<div class="dt-page-brand">'
        '<div class="dt-page-brand-row">'
        '<div class="dt-page-brand-mark">D</div>'
        '<div class="dt-page-brand-words">'
        '<span class="dt-page-eyebrow">UNALOGIX</span>'
        '<h1 class="dt-page-wordmark">DataTools</h1>'
        '</div>'
        '</div>'
        f'<p class="dt-page-subtitle">{_html.escape(t("home.caption"))}</p>'
        '</div>'
        '<span class="dt-privacy-pill">'
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<rect x="4" y="11" width="16" height="10" rx="2"/>'
        '<path d="M8 11V7a4 4 0 018 0v4"/>'
        '</svg>'
        f'{privacy_label}'
        '</span>'
        '</header>',
        unsafe_allow_html=True,
    )

    # Source of truth for uploaded files. dict[name -> {"bytes", "size"}].
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})

    # Streamlit's file_uploader is the only path that actually receives
    # bytes from the browser, but we don't want its dropzone UI to
    # compete with the in-card "Add more files" button below. Park the
    # whole widget off-screen with the inline style rule injected just
    # below, while keeping the underlying ``<input type="file">``
    # reachable to JS — the Add button programmatically clicks it to
    # open the OS file picker.
    #
    # ``on_change`` fires ONLY on user-initiated value changes (uploads
    # and the widget's built-in "✕" remove). It does NOT fire on the
    # remount-induced reset. That lets us treat the callback as ground
    # truth for both adds AND removes.
    st.markdown(
        '<style>[data-testid="stFileUploader"] {'
        'position:absolute!important;left:-10000px!important;'
        'width:1px!important;height:1px!important;overflow:hidden!important;'
        'pointer-events:none!important;}</style>',
        unsafe_allow_html=True,
    )
    st.file_uploader(
        t("upload.uploader_label_multi"),
        type=["csv", "tsv", "xlsx", "xls"],
        accept_multiple_files=True,
        key="home_upload",
        help=t("upload.uploader_help"),
        on_change=_sync_uploader_to_home_uploads,
        label_visibility="collapsed",
    )

    # ``Files`` section header — count + total size on the right, or
    # "No files imported yet" when empty (mockup §section-head).
    import hashlib
    n_files = len(home_uploads)
    if n_files:
        total_bytes = sum(meta["size"] for meta in home_uploads.values())
        files_word = "file" if n_files == 1 else "files"
        meta_html = (
            f'{n_files} {files_word} · '
            f'{_html.escape(_format_size(total_bytes))} total'
        )
    else:
        meta_html = "No files imported yet"
    st.markdown(
        '<div class="dt-files-section-head">'
        '<h2>Files</h2>'
        f'<span class="dt-section-meta">{meta_html}</span>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Files card — always rendered. Body is file rows (if any) + the
    # in-card "Add more files" button that triggers the off-screen
    # file_uploader. Two-phase click capture for the X buttons: walk
    # all rows once, accumulate ``to_remove`` if any was clicked,
    # then mutate state + rerun ONCE after the loop.
    to_remove: str | None = None
    _DOC_SVG = (
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/>'
        '<path d="M14 2v6h6"/>'
        '</svg>'
    )
    _PLUS_SVG = (
        '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
        '<path d="M12 5v14M5 12h14"/>'
        '</svg>'
    )
    with st.container(border=True):
        for name in list(home_uploads.keys()):
            # Short filename hash keeps each row's button key unique.
            digest = hashlib.sha1(
                name.encode("utf-8"), usedforsecurity=False,
            ).hexdigest()[:10]
            # X button on the LEFT of the row per UX feedback —
            # ``✕ | filename + chip | size``.
            col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
            if col_x.button(
                "✕",
                key=f"_home_remove_{digest}",
                help=f"Remove {name}",
                type="tertiary",
            ):
                to_remove = name
            col_name.markdown(
                '<div class="dt-file-row">'
                f'<span class="dt-file-icon-chip">{_DOC_SVG}</span>'
                f'<span class="dt-file-name">{_html.escape(name)}</span>'
                '</div>',
                unsafe_allow_html=True,
            )
            col_size.markdown(
                '<div style="text-align:right;">'
                '<span class="dt-file-size">'
                f'{_html.escape(_format_size(home_uploads[name]["size"]))}'
                '</span></div>',
                unsafe_allow_html=True,
            )
        # In-card "Add more files" — clicks the (off-screen)
        # ``stFileUploaderDropzoneInput`` so the OS file picker opens.
        # Inline ``onclick`` would be cleanest but Streamlit's HTML
        # sanitizer strips event-handler attributes from
        # ``unsafe_allow_html`` content; the wiring is done by the
        # inline script passed to ``st.iframe`` further down.
        st.markdown(
            '<button class="dt-file-add" type="button">'
            f'{_PLUS_SVG} Add more files'
            '</button>',
            unsafe_allow_html=True,
        )

    # Wire the in-card "Add more files" button to the off-screen
    # ``stFileUploaderDropzoneInput`` (Streamlit strips inline
    # ``onclick`` attributes; we have to do the binding from a real
    # script element, which Streamlit only ships through component
    # iframes — same pattern as the sticky footer + Upload→Import
    # rewriter). A ``MutationObserver`` re-wires after reruns when
    # Streamlit remounts the button.
    st.iframe(
        """
<script>
  (function () {
    function wire(doc) {
      var btn = doc.querySelector('button.dt-file-add');
      var input = doc.querySelector('[data-testid="stFileUploaderDropzoneInput"]');
      if (!btn || !input) return;
      if (btn.dataset.dtWired === '1') return;
      btn.dataset.dtWired = '1';
      btn.addEventListener('click', function (e) {
        e.preventDefault();
        input.click();
      });
    }
    var doc;
    try { doc = window.parent.document; }
    catch (e) { doc = document; }
    wire(doc);
    var win = doc.defaultView || window.parent || window;
    if ('MutationObserver' in win) {
      var raf = 0;
      try {
        new win.MutationObserver(function () {
          if (raf) return;
          raf = win.requestAnimationFrame(function () { raf = 0; wire(doc); });
        }).observe(doc.body, { childList: true, subtree: true });
      } catch (e) {}
    }
  })();
</script>
""",
        height=1,
    )

    if to_remove is not None:
        from src.audit import log_event
        log_event(
            "upload",
            f"Removed {to_remove}",
            filename=to_remove,
        )
        del home_uploads[to_remove]
        # Drop any findings/results tied to the removed file.
        findings_by_file_drop = st.session_state.get(
            "home_findings_by_file", {}
        )
        findings_by_file_drop.pop(to_remove, None)
        # The one-click clean output is keyed by filename as well; if
        # it stayed, re-importing a different file under the same name
        # would show the old file's cleaned CSV.
        clean_results_drop = st.session_state.get("home_clean_results") or {}
        clean_results_drop.pop(to_remove, None)
        st.session_state["home_uploads"] = home_uploads
        st.session_state["home_findings_by_file"] = findings_by_file_drop
        st.session_state["home_clean_results"] = clean_results_drop
        # If we just removed the active upload, also clear the
        # singular ``home_uploaded_*`` keys so tool pages don't
        # pick up stale bytes; the next render will repopulate
        # them from whatever file is now first.
        if st.session_state.get("home_uploaded_name") == to_remove:
            st.session_state.pop("home_uploaded_name", None)
            st.session_state.pop("home_uploaded_size", None)
            st.session_state.pop("home_uploaded_bytes", None)
        st.rerun()

    if not home_uploads:
        # Empty state — page ends cleanly after the Files card. The
        # in-card "Add more files" button is the only affordance the
        # user needs; the old ``upload.empty_state`` info alert was
        # redundant and out of step with the mockup.
        return

    # Expose the first uploaded file via the singular ``home_uploaded_*``
    # session keys so tool pages reached via "Open <Tool>" still find an
    # active upload through ``pickup_or_upload``.
    first_name = next(iter(home_uploads))
    first_meta = home_uploads[first_name]
    if (
        st.session_state.get("home_uploaded_name") != first_name
        or st.session_state.get("home_uploaded_size") != first_meta["size"]
    ):
        st.session_state["home_uploaded_name"] = first_name
        st.session_state["home_uploaded_size"] = first_meta["size"]
        st.session_state["home_uploaded_bytes"] = first_meta["bytes"]

    # Findings cache — drop entries whose underlying file is no longer
    # in the stash (e.g. user just clicked "✕").
    findings_by_file: dict = st.session_state.setdefault(
        "home_findings_by_file", {}
    )
    findings_by_file = {
        name: result for name, result in findings_by_file.items()
        if name in home_uploads
    }
    st.session_state["home_findings_by_file"] = findings_by_file

    # Files that have been imported but not analyzed yet.
    pending = [name for name in home_uploads if name not in findings_by_file]

    # Action bar — Run analysis / Clear results.
    col_run, col_clear, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button(
            t("upload.run_button"),
            type="primary",
            key="home_run_analysis",
            disabled=not pending,
            width="stretch",
        )
    with col_clear:
        clear_clicked = st.button(
            t("upload.clear_results"),
            key="home_clear_results",
            disabled=not findings_by_file,
            width="stretch",
        )

    if clear_clicked:
        # Wipes both the findings and the one-click clean outputs; the
        # upload stash is left alone.
        st.session_state["home_findings_by_file"] = {}
        st.session_state["home_clean_results"] = {}
        st.rerun()

    if run_clicked:
        from src.audit import log_event
        log_event(
            "analyze",
            f"Run analysis clicked on {len(pending)} file(s)",
            files=list(pending),
        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            # Wrap the stashed bytes so the analyzer sees the same
            # interface as a live ``UploadedFile``.
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
            findings_by_file[name] = _run_analysis_on_upload(stashed)
            progress.progress(i / len(pending), text=name)
        st.session_state["home_findings_by_file"] = findings_by_file
        # A fresh analysis invalidates any prior one-click clean outputs.
        st.session_state["home_clean_results"] = {}
        progress.empty()
        st.rerun()

    if findings_by_file:
        st.divider()
        # Overview row before drilling into per-file detail. Mockup
        # layout (datatools_layout_redesign2.html §stats) puts a
        # 4-card summary above the findings panels so the user can
        # eyeball the run before expanding any one file.
        _render_stats_overview(findings_by_file)

        # ---- Front door: one-click recommended clean (primary path) ----
        # The analyzer has the findings; the majority case is "just fix
        # it." This primary button runs the recommended pipeline in the
        # correct order and hands back a cleaned file per upload, so the
        # user never has to decide which tool or what order. The per-file
        # findings below remain the "fix one thing at a time" path.
        if st.button(
            "✨  Clean these files for me",
            type="primary",
            key="home_clean_all",
            width="stretch",
        ):
            _run_recommended_clean(home_uploads)
        st.caption(
            "Recommended: cleans text, standardizes formats, fills blanks, "
            "and removes duplicates — in the right order — then gives you the "
            "cleaned file."
        )
        _render_clean_results()

        # ---- Manual path: per-file findings, fix one thing at a time ----
        st.markdown("###### Or fix issues one at a time")
        st.caption("Open any finding below to jump straight to the right tool.")
        # Preserve the upload-stash order so the user sees results in
        # the same order they appear in the file list above.
        for name in home_uploads:
            if name not in findings_by_file:
                continue
            findings = findings_by_file[name]
            with st.container(border=True):
                if not findings:
                    # Analyzed and clean: show a compact "no issues" header.
                    st.markdown(
                        '<div class="dt-finding-group-head">'
                        '<span class="dt-severity-dot success"></span>'
                        f'<span class="dt-group-filename">{_html.escape(name)}</span>'
                        '<div class="dt-group-counts">'
                        '<span class="dt-count-pill success">no issues</span>'
                        '</div>'
                        '</div>',
                        unsafe_allow_html=True,
                    )
                else:
                    render_findings_panel(
                        findings,
                        header=name,
                        key_namespace=name,
                    )