# datatools-dev/src/gui/components/_legacy.py

"""Reusable Streamlit widgets for the DataTools GUI."""

from __future__ import annotations

import io
import os
import sys
import threading
import time
from typing import Optional

import pandas as pd
import streamlit as st

from src.i18n import t as _t
from src.core.dedup import (
    Algorithm,
    ColumnMatchStrategy,
    DeduplicationResult,
    MatchResult,
    MatchStrategy,
    SurvivorRule,
)
from src.core.config import (
    ColumnStrategyConfig,
    DeduplicationConfig,
    StrategyConfig,
)
from src.core.normalizers import NormalizerType


# ---------------------------------------------------------------------------
# App chrome — hide Streamlit default UI for app-like feel
# ---------------------------------------------------------------------------

# Stylesheet injected into the page by ``hide_streamlit_chrome`` through
# ``st.markdown(..., unsafe_allow_html=True)``. It has three jobs:
#   1. neutralise Streamlit's default chrome (header, main menu, deploy
#      button, footer) while keeping the sidebar expand control usable;
#   2. scale the whole app with ``zoom: 0.85`` on ``.stApp``;
#   3. tighten Streamlit's default vertical spacing ("compact-spacing
#      layer").
# The per-rule rationale lives in the CSS comments inside the string.
_HIDE_CHROME_CSS = """
<style>
/* Make the Streamlit header transparent and out of the way, but DO NOT
   `display: none` it — the sidebar's collapsed-state expand button is
   anchored in the header region, and removing the header makes a
   collapsed sidebar impossible to reopen. */
header[data-testid="stHeader"] {
    background: transparent !important;
    height: 0 !important;
}
/* Hide main hamburger menu and deploy button explicitly (don't rely on
   hiding the whole header). */
#MainMenu,
[data-testid="stMainMenu"],
[data-testid="stAppDeployButton"] {
    display: none !important;
}
/* Keep the sidebar expand control visible and clickable above page content. */
[data-testid="stSidebarCollapsedControl"] {
    display: flex !important;
    visibility: visible !important;
    z-index: 999 !important;
}
/* Hide footer */
footer {
    display: none !important;
}
/* Reclaim top padding lost from hidden header. Slim the bottom too —
   Streamlit's default leaves several rems below the last widget. */
.stAppViewBlockContainer,
[data-testid="stAppViewBlockContainer"] {
    padding-top: 0.5rem !important;
    padding-bottom: 0.75rem !important;
}
/* Scale content to fit app window */
.stApp {
    zoom: 0.85;
}

/* ---------- Compact-spacing layer ---------- */
/* Streamlit ships generous vertical rhythm (~1rem gap between every
   block, 1.5rem+ above each heading, 1rem on dividers). For a desktop
   data app that's a lot of empty space. Tighten the gaps without
   making the layout look cramped. */

/* Gap between stacked elements inside a vertical block (the default
   container around most page content). */
[data-testid="stVerticalBlock"] {
    gap: 0.5rem !important;
}
[data-testid="stHorizontalBlock"] {
    gap: 0.5rem !important;
}

/* Headings — tighter top space + a hair less below. */
.stApp h1 { margin-top: 0.25rem !important; margin-bottom: 0.5rem !important; }
.stApp h2 { margin-top: 0.5rem  !important; margin-bottom: 0.4rem !important; }
.stApp h3 { margin-top: 0.4rem  !important; margin-bottom: 0.3rem !important; }
.stApp h4 { margin-top: 0.3rem  !important; margin-bottom: 0.25rem !important; }

/* st.divider() — Streamlit's default hr has 1rem above and below. */
[data-testid="stMarkdownContainer"] hr,
hr {
    margin-top: 0.4rem !important;
    margin-bottom: 0.4rem !important;
}

/* Markdown paragraphs — slim trailing space. */
[data-testid="stMarkdownContainer"] p {
    margin-bottom: 0.25rem;
}

/* Captions — slim trailing space. */
[data-testid="stCaption"],
[data-testid="stCaptionContainer"] {
    margin-bottom: 0.25rem;
}

/* Expander header padding — Streamlit's default is roomy. */
[data-testid="stExpander"] details > summary {
    padding-top: 0.35rem;
    padding-bottom: 0.35rem;
}

/* Button row inside columns — tighter top space. */
[data-testid="stButton"],
[data-testid="stDownloadButton"] {
    margin-top: 0;
    margin-bottom: 0;
}

/* File-uploader internal spacing. */
[data-testid="stFileUploader"] {
    margin-bottom: 0.25rem;
}

/* Metric tiles — Streamlit pads them generously inside a row. */
[data-testid="stMetric"] {
    padding-top: 0.25rem;
    padding-bottom: 0.25rem;
}
</style>
"""


def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
    """Inject CSS to hide Streamlit's default header, menu, and footer.

    Also renders the sidebar language selector, the license status
    badge and the Diagnostics expander, since every entrypoint that
    hides the default chrome wants those visible in the same place.
    Pages that want a clean chrome without them can inject
    ``_HIDE_CHROME_CSS`` themselves instead of calling this.

    The steps run in a fixed order on every call:

    1. inject ``_HIDE_CHROME_CSS``;
    2. ``log_session_start()`` (audit log);
    3. ``assert_production_safe()`` (license build check);
    4. ``render_language_selector()``;
    5. ``render_license_status_sidebar()`` and the Diagnostics expander;
    6. optionally the license gate.

    Args:
        gate_license: When True (the default) the function calls
            :func:`require_license_or_render_activation` after the
            sidebar widgets render. If no valid license is present, the
            activation form replaces the page body and the page
            short-circuits via ``st.stop()``. The Activate page itself
            passes ``False`` so it can render its own form without
            recursion.
    """
    # The stylesheet is raw <style> markup, hence unsafe_allow_html.
    st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
    # Stamp a session-start record into the audit log the first time
    # any page renders. Idempotent — subsequent calls are no-ops.
    from src.audit import log_session_start
    log_session_start()
    # Production-safe check runs first so a misconfigured shipped
    # build refuses to render anything (rather than rendering a
    # broken activation form that doesn't accept real blobs).
    # No-op in source / pytest runs.
    from src.license import assert_production_safe
    assert_production_safe()
    # Imported lazily so this module stays importable in environments
    # where the i18n packs haven't been laid out (e.g. unit tests of
    # individual legacy helpers).
    # NOTE(review): this module already imports ``t`` from ``src.i18n``
    # at import time, so the laziness here only defers resolving
    # ``render_language_selector`` — confirm the rationale still holds.
    from src.i18n import render_language_selector
    render_language_selector()
    # License chrome: sidebar status badge + inline gate.
    from .activation import (
        render_license_status_sidebar,
        require_license_or_render_activation,
    )
    render_license_status_sidebar()
    _render_diagnostics_sidebar()
    # The gate runs last so the sidebar widgets above are already on the
    # page before the gate gets a chance to stop the script.
    if gate_license:
        require_license_or_render_activation()


def _render_diagnostics_sidebar() -> None:
    """Add a collapsed "Diagnostics" expander to the sidebar.

    The expander displays the current session's audit-log path and an
    "Open log folder" button that tries to pop the OS file manager at
    the audit-log directory. Keeping it collapsed means it costs no
    screen space until needed; the intended support flow is that the
    client sends us the log file and we diagnose from it.
    """
    from src.audit import audit_log_dir, audit_log_path

    session_log = audit_log_path()
    with st.sidebar, st.expander("🩺  Diagnostics", expanded=False):
        st.caption("Audit log for this session:")
        st.code(str(session_log), language=None)

        open_requested = st.button(
            "📂  Open log folder",
            key="_diag_open_logs",
            type="secondary",
            use_container_width=True,
        )
        if not open_requested:
            return

        # Only resolve the log directory once the user actually asks for it.
        if _open_in_file_manager(audit_log_dir(), select=session_log):
            return

        # Dispatch failed: the path shown above is the manual fallback.
        st.warning(
            "Could not open the file manager from here. "
            "Path is above — paste it into your file manager."
        )


# ---------------------------------------------------------------------------
# Clean shutdown
# ---------------------------------------------------------------------------

# JavaScript (wrapped in a <script> tag) that paints the full-screen
# farewell overlay and tries to close the window. This is a *template*:
# the four tokens ``__TITLE__``, ``__SUBTITLE__``, ``__CLOSE_BTN__`` and
# ``__CLOSE_HINT__`` are substituted by ``_farewell_script()`` with
# translated strings. Every token sits inside a single-quoted JS string
# that is assigned to ``innerHTML``, so substituted values must be
# escaped for both contexts (see ``_js_html_safe``).
_FAREWELL_SCRIPT_TEMPLATE = """
<script>
  (function () {
    // Strategy: append a full-screen overlay directly to the parent's
    // document.body (Streamlit's component iframes carry
    // allow-same-origin, so cross-frame DOM access is permitted).
    //
    // Closing the tab via JavaScript only works in windows JS opened —
    // Chrome/Edge --app windows qualify; a regular browser tab does
    // NOT, and there's no way to override that from page JS (no flag,
    // no API, no keystroke injection — synthesized keydown events
    // never reach the browser chrome or the OS). When close fails we
    // navigate the window to ``about:blank`` so the user at least
    // sees a clean blank tab instead of the connection-error overlay
    // Streamlit shows when the websocket drops.
    //
    // Display-mode detection (``standalone`` for --app windows,
    // ``browser`` for regular tabs) lets us skip the futile close
    // attempt on regular tabs and route straight to the about:blank
    // fallback.
    function isStandalone(win) {
      try {
        return win.matchMedia('(display-mode: standalone)').matches
            || win.matchMedia('(display-mode: minimal-ui)').matches
            || win.matchMedia('(display-mode: fullscreen)').matches;
      } catch (e) { return false; }
    }
    function buildOverlay(doc) {
      var overlay = doc.createElement('div');
      overlay.id = 'datatools-farewell-overlay';
      overlay.style.cssText =
        'position:fixed;inset:0;background:#0f1115;color:#e8eaed;' +
        'z-index:2147483647;display:flex;align-items:center;' +
        'justify-content:center;font-family:system-ui,-apple-system,sans-serif;';
      overlay.innerHTML =
        '<div style="text-align:center;padding:32px 40px;border:1px solid #252a36;' +
        'border-radius:12px;background:#161922;max-width:480px;">' +
        '<h1 style="margin:0 0 8px 0;font-weight:600;letter-spacing:-0.01em;">' +
        '__TITLE__</h1>' +
        '<p style="opacity:0.7;margin:0 0 20px 0;">__SUBTITLE__</p>' +
        '<button id="datatools-close-btn" style="' +
        'background:#6ee7b7;color:#052e1a;font-weight:600;' +
        'padding:10px 20px;border-radius:8px;border:none;' +
        'font-size:15px;cursor:pointer;font-family:inherit;">' +
        '__CLOSE_BTN__</button>' +
        '<p id="datatools-close-hint" style="' +
        'display:none;font-size:13px;opacity:0.6;margin:14px 0 0 0;">' +
        '__CLOSE_HINT__</p>' +
        '</div>';
      return overlay;
    }
    function tryClose(win) {
      // Escalating attempts. None of these can override the browser's
      // close-restriction policy on regular tabs.
      try { win.close(); } catch (e) {}
      if (win.closed) return true;
      try {
        var w = win.open('', '_self', '');
        if (w) {
          try { w.close(); } catch (e) {}
        }
      } catch (e) {}
      if (win.closed) return true;
      try { win.top.close(); } catch (e) {}
      return win.closed;
    }
    function fallbackToBlank(win) {
      // Navigate to about:blank so the user sees a clean empty tab
      // instead of the farewell overlay frozen on a connection-error
      // page. They can still close the tab themselves (Ctrl+W /
      // ⌘W / clicking the tab's X). Done as a single fast call — no
      // history entry pollution because location.replace doesn't
      // push to history.
      try { win.location.replace('about:blank'); } catch (e) {}
    }
    function wireClose(doc, win) {
      var btn = doc.getElementById('datatools-close-btn');
      if (!btn) return;
      btn.onclick = function () {
        var standalone = isStandalone(win);
        if (tryClose(win)) return;
        // Close failed (or definitely will fail in a regular tab).
        // Surface the hint immediately, then redirect to about:blank
        // after a short delay so the user has a moment to read why.
        var hint = doc.getElementById('datatools-close-hint');
        if (hint) hint.style.display = 'block';
        setTimeout(function () {
          if (!win.closed) fallbackToBlank(win);
        }, standalone ? 250 : 1500);
      };
    }
    try {
      var doc = window.top.document;
      var win = window.top;
      if (!doc.getElementById('datatools-farewell-overlay')) {
        doc.body.appendChild(buildOverlay(doc));
      }
      wireClose(doc, win);
      // Auto-close attempt on first paint — succeeds in Chrome --app
      // windows, fails silently on regular tabs (and we don't redirect
      // automatically here; the manual button drives that path so the
      // user is in control).
      tryClose(win);
    } catch (e) {
      // Cross-origin access denied (shouldn't happen given Streamlit's
      // sandbox flags, but fall back gracefully): cover this iframe.
      document.body.appendChild(buildOverlay(document));
      wireClose(document, window);
    }
  })();
</script>
"""


def _js_html_safe(s: str) -> str:
    """Escape *s* so it can be embedded inside the farewell overlay's
    JS-single-quoted, innerHTML-bound payload.

    Order matters: backslash first (so subsequent escapes don't get
    re-escaped), then the JS string-terminator, then HTML-special chars.
    """
    return (
        s.replace("\\", "\\\\")
         .replace("'", "\\'")
         .replace("&", "&amp;")
         .replace("<", "&lt;")
         .replace(">", "&gt;")
    )


def _farewell_script() -> str:
    """Return the farewell overlay script localized for the active language.

    Each ``__PLACEHOLDER__`` token in ``_FAREWELL_SCRIPT_TEMPLATE`` is
    replaced, in the order listed below, with the matching translated
    string after it has been passed through ``_js_html_safe``.
    """
    substitutions = (
        ("__TITLE__", "quit.farewell_title"),
        ("__SUBTITLE__", "quit.farewell_subtitle"),
        ("__CLOSE_BTN__", "quit.close_window_button"),
        ("__CLOSE_HINT__", "quit.close_hint"),
    )
    script = _FAREWELL_SCRIPT_TEMPLATE
    for placeholder, message_key in substitutions:
        script = script.replace(placeholder, _js_html_safe(_t(message_key)))
    return script


def _downloads_dir() -> "Path":
    """Return the user's Downloads folder.

    Defaults to ``~/Downloads``. Overrideable via the
    ``DATATOOLS_DOWNLOADS_DIR`` environment variable so tests can write
    to a temp directory instead of polluting the developer's home.
    """
    import os
    from pathlib import Path
    override = os.environ.get("DATATOOLS_DOWNLOADS_DIR")
    if override:
        return Path(override)
    return Path.home() / "Downloads"


def _open_in_file_manager(folder: "Path", *, select: "Path | None" = None) -> bool:
    """Open the OS file manager at *folder*, optionally highlighting *select*.

    Windows
        ``explorer <folder>`` only. We deliberately do NOT use
        ``explorer /select,<file>``: when the path contains a space
        (e.g. ``C:\\Users\\Michael Dombaugh\\Downloads``), Python's
        ``subprocess.Popen`` quotes the ``/select,...`` argument as one
        unit, and Explorer's ``/select`` parser does not handle that
        form — it silently falls back to opening the user's default
        view (typically Documents). Opening the bare folder works
        reliably regardless of spaces. ``os.startfile`` is kept as a
        last-resort fallback only.
    macOS
        ``open -R <file>`` reveals the file in Finder when ``select``
        is given; otherwise just opens the folder.
    Linux / *BSD
        ``xdg-open`` on the folder. No reliable cross-distro way to
        highlight a specific file.

    Returns ``True`` if any of the dispatch attempts succeeded
    (no guarantee the window actually surfaced — the caller should
    surface a fallback path so the user can paste it manually).
    """
    import os
    import subprocess

    if sys.platform == "win32":
        try:
            subprocess.Popen(["explorer", str(folder)])
            return True
        except Exception:
            pass
        try:
            os.startfile(str(folder))  # type: ignore[attr-defined]
            return True
        except Exception:
            return False
    if sys.platform == "darwin":
        try:
            if select is not None:
                subprocess.Popen(["open", "-R", str(select)])
            else:
                subprocess.Popen(["open", str(folder)])
            return True
        except Exception:
            return False
    # Linux / *BSD / etc.
    try:
        subprocess.Popen(["xdg-open", str(folder)])
        return True
    except Exception:
        return False


def local_download_button(
    label: str,
    data: bytes,
    *,
    file_name: str,
    mime: str = "application/octet-stream",  # noqa: ARG001 — kept for API compat
    disabled: bool = False,
    help: str | None = None,
    use_container_width: bool = True,
) -> None:
    """Render a button that writes *data* into the user's Downloads folder.

    DataTools is a local Streamlit app, so the "server" and the user's
    machine are the same box: instead of a browser save dialog the
    bytes are written straight to ``<downloads dir>/<file_name>``, where
    the downloads dir comes from ``_downloads_dir()`` (``~/Downloads``
    unless overridden by environment variable).

    Flow:

    1. On click the target directory is created if needed and the file
       is written, replacing any existing file of the same name.
    2. The saved absolute path is remembered in ``st.session_state`` and
       shown in a success message on this and later reruns.
    3. An "Open Downloads folder" button is shown next to that message;
       clicking it asks the OS file manager to open the parent folder.

    Why not ``st.download_button`` or an HTML ``data:`` link?

    - ``st.download_button`` has a long-standing failure mode where only
      the first of several stacked buttons fires.
    - ``data:`` URLs grow the payload by a third (base64) and leave the
      user guessing where the browser put the file.

    Writing locally is unambiguous and independent of browser settings.

    Args:
        label: Button caption.
        data: Raw bytes to write.
        file_name: Name of the file inside the downloads directory.
        mime: Ignored; retained so call sites written against the
            previous helper signature keep working.
        disabled: Render the button disabled.
        help: Optional tooltip for the button.
        use_container_width: Stretch the button to the container width.
    """
    import hashlib
    from pathlib import Path

    # Widget/session keys are derived from the file name plus a short
    # content fingerprint: re-rendering identical content keeps its
    # "saved" banner, while new bytes get a fresh button with no stale
    # success message.
    fingerprint = hashlib.sha1(data, usedforsecurity=False).hexdigest()[:8]
    key_suffix = f"{file_name}_{fingerprint}"
    saved_key = f"_dl_saved_{key_suffix}"

    save_requested = st.button(
        label,
        key=f"_dl_btn_{key_suffix}",
        disabled=disabled,
        help=help,
        type="secondary",
        use_container_width=use_container_width,
    )

    if save_requested:
        target_dir = _downloads_dir()
        try:
            target_dir.mkdir(parents=True, exist_ok=True)
            destination = target_dir / file_name
            destination.write_bytes(data)
            st.session_state[saved_key] = str(destination)
        except Exception as e:
            # UI boundary: report the failure in-page and skip the
            # success banner below.
            st.error(
                f"Could not save **{file_name}** to `{target_dir}`: {e}"
            )
            return

    saved_path_str = st.session_state.get(saved_key)
    if not saved_path_str:
        # Nothing has been saved for this file/content pair yet.
        return

    saved_path = Path(saved_path_str)
    st.success(f"✓ Saved to `{saved_path_str}`")

    reveal_requested = st.button(
        "📂 Open Downloads folder",
        key=f"_dl_open_{key_suffix}",
        type="secondary",
    )
    if not reveal_requested:
        return

    if _open_in_file_manager(saved_path.parent, select=saved_path):
        # The launch call was dispatched, but the OS may have opened
        # the window behind the active one. Confirm that we tried.
        st.toast(f"Opening {saved_path.parent}", icon="📂")
    else:
        st.warning(
            f"Could not open the file manager from here. "
            f"The file is at:\n\n`{saved_path_str}`"
        )


# Back-compat alias: existing call sites (including ``config_panel`` in
# this module) still use the old name. It is the very same function
# object, so both names behave identically. New code should prefer
# ``local_download_button``.
html_download_button = local_download_button


def render_sticky_footer() -> None:
    """Render a slim fixed-position footer at the bottom of the viewport.

    Contains a "Back to Home" link that's always visible regardless of
    scroll position. The footer is mounted as a direct child of
    ``<body>`` via a component-iframe script so it lives OUTSIDE every
    Streamlit container — that matters because ``.stApp`` carries
    ``zoom: 0.85`` (our compact-layout scaler) and Streamlit's content
    columns add their own padding/positioning context that previously
    swallowed the in-place ``st.markdown`` footer.

    The implementation is two-pass:

    1. ``st.markdown`` injects the CSS rules into the parent document.
       The rules target the ``#datatools-sticky-footer`` id, so they
       apply as soon as a node with that id exists, wherever it lives.
    2. ``streamlit.components.v1.html`` renders a zero-height iframe
       whose JS reaches ``window.parent.document``, removes any footer
       left over from a previous run, and appends a fresh
       ``#datatools-sticky-footer`` div directly under ``<body>``.
       This bypasses every Streamlit container.

    The anchor uses ``href="home"`` (relative) so Streamlit's URL
    routing resolves it to the Home page and the link works correctly
    behind a reverse proxy or non-root mount.

    The link label is the translated ``nav.back_to_home`` string. It is
    handed to the script as a JSON string literal and assigned through
    ``textContent``, so it is never parsed as HTML.
    """
    import json as _json

    label_raw = _t("nav.back_to_home")
    # json.dumps yields a valid JS string literal but leaves "<" as-is; a
    # label containing "</script>" would then terminate the inline script
    # early. Emitting "<" as a \u003c escape keeps the literal's value
    # identical while making it inert to the HTML parser.
    label_js = _json.dumps(label_raw).replace("<", "\\u003c")

    # CSS rules live in the parent document. They are keyed on the
    # footer's id, so a re-created footer div picks them up automatically.
    st.markdown(
        """
<style>
[data-testid="stAppViewBlockContainer"] {
    padding-bottom: 4rem !important;
}
#datatools-sticky-footer {
    position: fixed !important;
    bottom: 0 !important;
    left: 0 !important;
    right: 0 !important;
    background: rgba(255, 255, 255, 0.97) !important;
    backdrop-filter: blur(8px);
    -webkit-backdrop-filter: blur(8px);
    border-top: 1px solid rgba(49, 51, 63, 0.25) !important;
    padding: 0.5rem 1.25rem !important;
    z-index: 2147483646 !important;
    display: flex !important;
    align-items: center !important;
    justify-content: flex-start !important;
    font-family: system-ui, -apple-system, sans-serif !important;
    box-sizing: border-box !important;
}
#datatools-sticky-footer a.datatools-sticky-footer-link {
    display: inline-block !important;
    color: rgb(38, 39, 48) !important;
    text-decoration: none !important;
    padding: 0.4rem 0.9rem !important;
    border-radius: 0.5rem !important;
    border: 1px solid rgba(49, 51, 63, 0.28) !important;
    background: rgb(240, 242, 246) !important;
    font-size: 14px !important;
    font-weight: 500 !important;
    line-height: 1.4 !important;
    cursor: pointer !important;
    transition: background 0.12s ease, border-color 0.12s ease;
}
#datatools-sticky-footer a.datatools-sticky-footer-link:hover {
    background: rgb(225, 228, 235) !important;
    border-color: rgba(49, 51, 63, 0.4) !important;
}
#datatools-sticky-footer a.datatools-sticky-footer-link:active {
    background: rgb(210, 214, 222) !important;
}
</style>
""",
        unsafe_allow_html=True,
    )

    # Move the footer to <body> directly via component iframe. The
    # iframe carries allow-same-origin so window.parent.document is
    # reachable; if a sandbox config ever blocks that we fall back to
    # rendering inside the iframe itself (still visible, just sized
    # to the iframe rather than the viewport).
    from streamlit.components.v1 import html as _components_html
    _components_html(
        f"""
<script>
  (function () {{
    var label = {label_js};
    function build(doc) {{
      var prev = doc.getElementById('datatools-sticky-footer');
      if (prev) prev.remove();
      var div = doc.createElement('div');
      div.id = 'datatools-sticky-footer';
      var a = doc.createElement('a');
      a.className = 'datatools-sticky-footer-link';
      a.href = 'home';
      a.target = '_self';
      a.textContent = label;
      div.appendChild(a);
      return div;
    }}
    try {{
      var doc = window.parent.document;
      doc.body.appendChild(build(doc));
    }} catch (e) {{
      document.body.appendChild(build(document));
    }}
  }})();
</script>
""",
        height=0,
    )


def back_to_home_link(*, key: str = "_back_to_home_link") -> None:
    """Render a "← Back to Home" button that navigates to the Home page.

    Intended for tool pages opened from the home findings panel, so a
    user working through several uploaded files can jump back without
    hunting through the sidebar. Call it twice per tool page: once near
    the top with the default key and once at the bottom with
    ``key="_back_to_home_link_bottom"``, so the control stays reachable
    after long results.

    Why a rebuilt ``st.Page``: under ``st.navigation``,
    ``st.switch_page`` needs either a file path to a page in ``pages/``
    or a ``StreamlitPage`` whose script identity matches one registered
    in the nav. The entry script ``app.py`` is the nav manager itself
    and cannot be switched to by filename, so the home callable is
    imported and wrapped in the same ``st.Page`` registration here.
    Streamlit identifies pages by the underlying callable's qualified
    name, so the freshly built page resolves to the registered one.

    Args:
        key: Streamlit widget key; must be unique per call on a page.
    """
    pressed = st.button(_t("nav.back_to_home"), key=key, type="secondary")
    if not pressed:
        return

    # Import from the renderer module rather than app.py: importing
    # app.py would re-run its navigation setup with the wrong "main
    # script" context and break pages/ path resolution.
    from src.gui._home import _home_page

    home = st.Page(_home_page, title="Home", icon="🧹", url_path="home")
    st.switch_page(home)


def shutdown_app() -> None:
    """Shut the Streamlit server down right away, without confirmation.

    Meant for a page whose rendering alone signals that the user wants
    to quit (e.g. the sidebar Close entry). On the first call in a
    session it arms a daemon thread that calls ``os._exit(0)`` after a
    one-second delay, giving the farewell overlay time to paint. Every
    call then injects the overlay script, shows the "shutting down"
    message and ends the page with ``st.stop``.

    Why ``os._exit``: Streamlit offers no shutdown hook, and signals
    (SIGTERM/SIGINT) are unreliable because Streamlit installs its own
    handlers and the tornado/asyncio loop swallows or defers them — the
    browser sees the websocket drop while the python process lives on.

    Under pytest the exit thread is not started, so a test that renders
    this page does not kill the test run; the overlay and message still
    render so content assertions work.
    """
    shutdown_already_armed = st.session_state.get("_app_shutting_down")
    if not shutdown_already_armed:
        st.session_state["_app_shutting_down"] = True

        running_under_pytest = "pytest" in sys.modules
        if not running_under_pytest:
            def _exit_after_delay() -> None:
                # Let the overlay reach the browser before the process dies.
                time.sleep(1.0)
                os._exit(0)

            killer = threading.Thread(target=_exit_after_delay, daemon=True)
            killer.start()

    from streamlit.components.v1 import html as _components_html

    _components_html(_farewell_script(), height=0)
    st.success(_t("quit.shutting_down"))
    st.stop()


# ---------------------------------------------------------------------------
# Config panel (advanced options)
# ---------------------------------------------------------------------------

def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander. Returns a settings dict.

    Args:
        df: The loaded data; only its column names are used, to
            populate the column pickers.

    Returns:
        A dict with these keys:
            strategies: list[MatchStrategy] | None (None = auto-detect)
            survivor_rule: SurvivorRule
            date_column: str | None (set only for "most-recent")
            merge: bool

    Session state written:
        ``loaded_config``: the ``DeduplicationConfig`` parsed from an
            uploaded JSON profile.
        ``_cfg_save_requested``: set once "Save current settings" has
            been clicked, so the download button stays on the page.
    """
    import json

    columns = list(df.columns)

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)

        with col_a:
            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]

            # Only explicit choices are recorded; "auto" and "none" leave
            # the column out of the map.
            normalize_map: dict[str, str] = {}
            if fuzzy_cols or subset_cols:
                target_cols = fuzzy_cols or subset_cols
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)

        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    data = json.loads(config_file.read())
                    loaded = DeduplicationConfig.from_dict(data)
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:
                    st.error(f"Failed to load config: {e}")

        with cfg_right:
            # ``st.button`` is True only during the single rerun caused by
            # its own click. The download helper is itself a button, so
            # rendering it inside ``if st.button(...)`` would make it
            # vanish on the rerun triggered by clicking it, and the file
            # would never be written. Persist the request instead so the
            # download button survives subsequent reruns.
            if st.button("Save current settings"):
                st.session_state["_cfg_save_requested"] = True
            if st.session_state.get("_cfg_save_requested"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                html_download_button(
                    "Download config JSON",
                    json.dumps(cfg.to_dict(), indent=2).encode("utf-8"),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }


def _build_strategies(
    subset_cols: list[str],
    key_cols: list[str],
    fuzzy_cols: list[str],
    algorithm: str,
    threshold: int,
    normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
    """Translate the GUI column selections into ``MatchStrategy`` objects.

    One combined strategy is produced for the explicitly selected columns
    (``subset_cols`` when non-empty, otherwise ``fuzzy_cols``); columns
    listed in ``fuzzy_cols`` use ``algorithm``/``threshold``, all others
    are exact matches at 100.  Each entry of ``key_cols`` additionally
    becomes its own single-column exact strategy.

    Returns ``None`` when nothing was selected, which callers treat as
    "auto-detect".
    """
    built: list[MatchStrategy] = []

    # Explicit column selection: subset wins, fuzzy list is the fallback.
    chosen_cols = subset_cols or fuzzy_cols
    if chosen_cols:
        fuzzy_lookup = set(fuzzy_cols)
        per_column: list[ColumnMatchStrategy] = []
        for name in chosen_cols:
            normalizer = (
                NormalizerType(normalize_map[name])
                if name in normalize_map
                else None
            )
            if name in fuzzy_lookup:
                col_algorithm, col_threshold = Algorithm(algorithm), float(threshold)
            else:
                col_algorithm, col_threshold = Algorithm.EXACT, 100.0
            per_column.append(
                ColumnMatchStrategy(
                    column=name,
                    algorithm=col_algorithm,
                    threshold=col_threshold,
                    normalizer=normalizer,
                )
            )
        built.append(MatchStrategy(column_strategies=per_column))

    # Strong keys: each one is an independent exact-match strategy.
    built.extend(
        MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column=key, algorithm=Algorithm.EXACT, threshold=100.0)
        ])
        for key in (key_cols or ())
    )

    return built or None


def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Snapshot the current GUI selections as a ``DeduplicationConfig``.

    The survivor rule is stored with underscores instead of hyphens, and
    empty column lists / normalizer maps are stored as ``None``.  When the
    selections yield explicit strategies (see ``_build_strategies``) they
    are mirrored into ``cfg.strategies`` using the plain-value config
    classes.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    built = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not built:
        return config

    serialisable = []
    for strategy in built:
        column_cfgs = []
        for col_strategy in strategy.column_strategies:
            normalizer = col_strategy.normalizer
            column_cfgs.append(
                ColumnStrategyConfig(
                    column=col_strategy.column,
                    algorithm=col_strategy.algorithm.value,
                    threshold=col_strategy.threshold,
                    normalizer=normalizer.value if normalizer else None,
                )
            )
        serialisable.append(StrategyConfig(columns=column_cfgs))
    config.strategies = serialisable
    return config


# ---------------------------------------------------------------------------
# Match group review card
# ---------------------------------------------------------------------------

def _find_differing_cols(
    group: MatchResult, df: pd.DataFrame, display_cols: list[str],
) -> list[str]:
    """List the display columns whose values are not identical across the group.

    Values are compared as stripped strings; a column missing from a row
    counts as the empty string.  Order follows ``display_cols``.
    """
    def _distinct_values(column) -> set:
        return {
            str(df.iloc[position].get(column, "")).strip()
            for position in group.row_indices
        }

    return [column for column in display_cols if len(_distinct_values(column)) > 1]


def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> None:
    """Render an expandable match group card with side-by-side diff.

    Users select which rows to keep via checkboxes.  When exactly one row
    is kept they can also cherry-pick column values from the other rows.

    Decision format stored in ``st.session_state["review_decisions"]``::

        {group_id: {"keep_indices": [int, ...], "overrides": {col: val}}}

    Args:
        group: The match group to review; its ``row_indices`` are used as
            positional (``iloc``) indices into *df*.
        df: The frame the group's rows are read from.
        group_num: Number shown in the expander label.

    The card has two states: once a decision exists for the group it is
    shown collapsed and read-only with an "Undo" button; otherwise it is
    shown expanded with an editable table and a "Confirm" button.
    """
    confidence = group.confidence
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)
    gid = group.group_id

    # Look up any decision already recorded for this group.
    decisions = st.session_state.get("review_decisions", {})
    has_decision = gid in decisions
    decision_dict = decisions.get(gid, {})
    keep_indices = decision_dict.get("keep_indices", []) if has_decision else []
    overrides = decision_dict.get("overrides", {}) if has_decision else {}

    # Build label — append decision status if already decided
    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )
    if has_decision:
        if len(keep_indices) == n_rows:
            label += " — Kept All"
        elif len(keep_indices) == 1:
            label += " — Merged (customized)" if overrides else " — Merged"
        else:
            label += f" — Split (kept {len(keep_indices)} of {n_rows})"

    # Decided groups collapse; undecided groups stay open
    expanded = not has_decision

    # Columns whose name starts with "_norm_" are never shown in the card.
    display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
    differing_cols = _find_differing_cols(group, df, display_cols)

    with st.expander(label, expanded=expanded):
        if has_decision:
            # --- Decided state: read-only table with diff highlighting ---
            rows_data = []
            for idx in group.row_indices:
                # "Row" is the 1-based row number shown to the user.
                row = {"Row": idx + 1}
                for col in display_cols:
                    row[col] = df.iloc[idx].get(col, "")
                rows_data.append(row)
            compare_df = pd.DataFrame(rows_data).set_index("Row")

            def _highlight_diffs(s: pd.Series) -> list[str]:
                # Per-column styler: every cell is compared (as a stripped
                # string) with the first row of the same column.
                styles = []
                first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
                for val in s:
                    val_str = str(val).strip()
                    if val_str != first_val and val_str and first_val:
                        # Both non-empty but different.
                        styles.append(
                            "background-color: rgba(245, 166, 35, 0.2)"
                        )
                    elif not val_str and first_val:
                        # Empty here although the first row has a value.
                        styles.append(
                            "background-color: rgba(240, 82, 82, 0.1)"
                        )
                    else:
                        styles.append("")
                return styles

            styled = compare_df.style.apply(_highlight_diffs, axis=0)
            st.dataframe(styled, use_container_width=True)

            # Human-readable summary of the stored decision.
            if len(keep_indices) == n_rows:
                st.info("Decision: Kept All")
            elif len(keep_indices) == 1:
                msg = "Decision: Merge"
                if overrides:
                    msg += f" ({len(overrides)} column(s) customized)"
                st.success(msg)
            else:
                kept = ", ".join(str(i + 1) for i in sorted(keep_indices))
                st.success(
                    f"Decision: Keep rows {kept} "
                    f"(removing {n_rows - len(keep_indices)})"
                )

            def _undo(g=gid):
                # ``g`` is bound as a default so the callback keeps this
                # group's id.  Drops the decision and the editor widget's
                # stored state for the group.
                st.session_state["review_decisions"].pop(g, None)
                st.session_state.pop(f"editor_{g}", None)

            st.button("Undo", key=f"undo_{gid}", on_click=_undo)

        else:
            # --- Undecided: interactive editor with inline checkboxes & dropdowns ---
            editor_rows = []
            for idx in group.row_indices:
                # Only the engine-chosen survivor starts out checked.
                row_data = {"Keep": idx == group.survivor_index, "Row": idx + 1}
                for col in display_cols:
                    row_data[col] = str(df.iloc[idx].get(col, ""))
                editor_rows.append(row_data)
            editor_df = pd.DataFrame(editor_rows)

            col_config = {
                "Keep": st.column_config.CheckboxColumn(
                    "Keep", default=True, width="small",
                ),
                "Row": st.column_config.NumberColumn("Row", width="small"),
            }
            # Differing columns become dropdowns offering each distinct
            # (stripped) value found in the group, plus the empty string.
            # NOTE(review): the cell values above are not stripped while the
            # options are — values with outer whitespace may not match an
            # option; confirm this is acceptable.
            for col in differing_cols:
                vals = []
                for idx in group.row_indices:
                    v = str(df.iloc[idx].get(col, "")).strip()
                    if v not in vals:
                        vals.append(v)
                if "" not in vals:
                    vals.append("")
                col_config[col] = st.column_config.SelectboxColumn(
                    col, options=vals, required=False,
                )

            # Everything except "Keep" and the differing columns is read-only.
            disabled_cols = ["Row"] + [
                c for c in display_cols if c not in differing_cols
            ]

            edited = st.data_editor(
                editor_df,
                column_config=col_config,
                disabled=disabled_cols,
                use_container_width=True,
                hide_index=True,
                key=f"editor_{gid}",
            )

            # Read which rows are checked
            checked = [
                idx
                for i, idx in enumerate(group.row_indices)
                if edited.iloc[i]["Keep"]
            ]

            if differing_cols:
                st.caption(
                    f"Columns with differences (editable): "
                    f"{', '.join(differing_cols)}"
                )

            # Status + surviving rows preview
            if len(checked) == 0:
                st.warning("Select at least one row to keep.")
            else:
                if len(checked) == n_rows:
                    st.caption("Keeping all rows (no duplicates removed)")
                elif len(checked) == 1:
                    st.caption(
                        f"Merging into Row {checked[0] + 1}, "
                        f"removing {n_rows - 1} row(s)"
                    )
                else:
                    st.caption(
                        f"Keeping {len(checked)} rows, "
                        f"removing {n_rows - len(checked)}"
                    )

                # Build preview of surviving rows with edits applied
                checked_positions = [
                    i for i, idx in enumerate(group.row_indices)
                    if idx in checked
                ]
                preview = edited.iloc[checked_positions].drop(
                    columns=["Keep"],
                ).reset_index(drop=True)
                st.markdown("**Surviving rows preview:**")
                st.dataframe(preview, use_container_width=True, hide_index=True)

            # Confirm
            def _on_confirm(
                g=gid, indices=list(group.row_indices),
                diff=differing_cols, surv=group.survivor_index,
            ):
                # Button callback.  The group's values are bound as defaults;
                # the user's edits are read back from the editor widget's
                # session-state entry, keyed by position in the table.
                editor_state = st.session_state.get(f"editor_{g}", {})
                ed_rows = editor_state.get("edited_rows", {})

                # Determine which rows to keep
                keep = []
                for i, idx in enumerate(indices):
                    changes = ed_rows.get(i, {})
                    # An untouched checkbox keeps its initial value, which
                    # was "checked" only for the engine's survivor row.
                    default_keep = idx == surv
                    if changes.get("Keep", default_keep):
                        keep.append(idx)
                if not keep:
                    # Nothing checked: fall back to keeping every row.
                    keep = list(indices)

                # Column overrides (single-survivor merge only)
                ovr: dict[str, str] = {}
                if len(keep) == 1:
                    surv_idx = keep[0]
                    surv_pos = indices.index(surv_idx)
                    surv_changes = ed_rows.get(surv_pos, {})
                    # NOTE(review): reads the frame from session state rather
                    # than the ``df`` argument — assumes both are the same
                    # frame; confirm against callers.
                    the_df = st.session_state["df"]
                    for c in diff:
                        if c in surv_changes:
                            new_val = (
                                str(surv_changes[c])
                                if surv_changes[c] is not None
                                else ""
                            )
                            orig = str(
                                the_df.iloc[surv_idx].get(c, "")
                            ).strip()
                            # Record only edits that actually change the value.
                            if new_val.strip() != orig:
                                ovr[c] = new_val

                st.session_state["review_decisions"][g] = {
                    "keep_indices": keep,
                    "overrides": ovr,
                }

            st.button(
                "Confirm",
                key=f"confirm_{gid}",
                type="primary",
                on_click=_on_confirm,
                disabled=(len(checked) == 0),
            )


# ---------------------------------------------------------------------------
# Results summary + downloads
# ---------------------------------------------------------------------------

def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show the headline metrics for a run and offer its CSV downloads.

    Four metrics are rendered (rows in, rows out, removed, groups),
    followed by up to three download buttons: the deduplicated data
    (always), the removed rows (only when there are any), and the match
    groups report (only when there are groups).
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)
    headline = (
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    )
    for slot, (title, value) in zip(st.columns(4), headline):
        slot.metric(title, value)

    st.divider()

    # One column per download.
    dedup_slot, removed_slot, groups_slot = st.columns(3)

    with dedup_slot:
        html_download_button(
            "Download Deduplicated CSV",
            result.deduplicated_df.to_csv(index=False).encode("utf-8-sig"),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with removed_slot:
        if not result.removed_df.empty:
            html_download_button(
                "Download Removed Rows",
                result.removed_df.to_csv(index=False).encode("utf-8-sig"),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with groups_slot:
        if result.match_groups:
            html_download_button(
                "Download Match Groups Report",
                _build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )


def apply_review_decisions(
    original_df: pd.DataFrame,
    match_groups: list[MatchResult],
    decisions: dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Apply the reviewer's per-group decisions and split the data accordingly.

    Each group falls into one of four cases:

    - **No decision**: the engine's ``survivor_index`` is the only row kept.
    - **Merge** (one row kept): that row survives, with any column
      overrides from the decision written into it.
    - **Split** (several rows kept): the chosen rows survive, the rest go.
    - **Keep all**: nothing is removed.  An empty ``keep_indices`` is
      treated the same way, so a group is never wiped out entirely.

    Row indices are positional (``iloc``).  Returns
    ``(deduplicated_df, removed_df)``, both with a fresh 0..n-1 index;
    ``removed_df`` is a bare empty frame when nothing was removed.
    """
    dropped: set[int] = set()
    overrides_by_row: dict[int, dict[str, str]] = {}

    for group in match_groups:
        decision = decisions.get(group.group_id)

        if decision is None:
            # Unreviewed group: go with the engine's pick.
            survivors = {group.survivor_index}
        else:
            survivors = (
                set(decision.get("keep_indices", group.row_indices))
                or set(group.row_indices)
            )

        dropped.update(
            idx for idx in group.row_indices if idx not in survivors
        )

        # Overrides only make sense when a single row survives.
        overrides = decision.get("overrides", {}) if decision else {}
        if overrides and len(survivors) == 1:
            (sole_survivor,) = survivors
            overrides_by_row[sole_survivor] = overrides

    kept_positions = [
        pos for pos in range(len(original_df)) if pos not in dropped
    ]

    if not overrides_by_row:
        deduped = original_df.iloc[kept_positions].copy().reset_index(drop=True)
    else:
        def _patched_row(pos: int) -> pd.Series:
            row = original_df.iloc[pos].copy()
            for column, value in overrides_by_row.get(pos, {}).items():
                if column in row.index:
                    row[column] = value
            return row

        deduped = pd.DataFrame(
            [_patched_row(pos) for pos in kept_positions]
        ).reset_index(drop=True)

    if dropped:
        removed = original_df.iloc[sorted(dropped)].copy().reset_index(drop=True)
    else:
        removed = pd.DataFrame()

    return deduped, removed


def _build_match_groups_csv(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> bytes:
    """Serialise every row of every match group into an audit CSV.

    Each output row starts with five bookkeeping columns (1-based group
    id, survivor flag, confidence, matched-on columns, 1-based original
    row number) followed by the row's values for every column whose name
    does not start with ``_norm_``.  Row indices beyond the end of
    ``original_df`` yield empty strings.  The result is UTF-8 with a BOM.
    """
    visible_cols = [
        col for col in original_df.columns
        if not str(col).startswith("_norm_")
    ]
    n_available = len(original_df)

    records = []
    for grp in result.match_groups:
        matched_on = ", ".join(grp.matched_on)
        for position in grp.row_indices:
            record = {
                "_group_id": grp.group_id + 1,
                "_is_survivor": position == grp.survivor_index,
                "_confidence": grp.confidence,
                "_matched_on": matched_on,
                "_original_row": position + 1,
            }
            in_range = position < n_available
            record.update({
                col: (original_df.iloc[position].get(col, "") if in_range else "")
                for col in visible_cols
            })
            records.append(record)

    return pd.DataFrame(records).to_csv(index=False).encode("utf-8-sig")


# ---------------------------------------------------------------------------
# Analyzer integration (upload-time data quality findings)
# ---------------------------------------------------------------------------

# Tool id -> friendly display name. Single source of truth for the GUI; the
# CLI keeps its own copy so each entrypoint stays self-contained.
# Used by ``tool_display_name`` as the fallback when the active language
# pack has no entry for the tool.
TOOL_DISPLAY_NAMES: dict[str, str] = {
    "01_deduplicator": "Find Duplicates",
    "02_text_cleaner": "Clean Text",
    "03_format_standardizer": "Standardize Formats",
    "04_missing_handler": "Fix Missing Values",
    "05_column_mapper": "Map Columns",
    "06_outlier_detector": "Find Unusual Values",
    "07_multi_file_merger": "Combine Files",
    "08_validator_reporter": "Quality Check",
    "09_pipeline_runner": "Automated Workflows",
}

# Finding severity -> icon shown in the findings summary line and in front
# of each individual finding.
_SEVERITY_ICON: dict[str, str] = {
    "info": "ℹ️",
    "warn": "⚠️",
    "error": "🛑",
}

# Finding severity -> color name used in the ``:color[...]`` markdown
# directive that wraps a finding's id.
_SEVERITY_COLOR: dict[str, str] = {
    "info": "blue",
    "warn": "orange",
    "error": "red",
}

# Map tool id to the streamlit page path under src/gui/. Skipped tools (no
# page yet) return empty string and the "Open" button is omitted.
# (The empty-string fallback is implemented by ``_tool_page_slug``.)
_TOOL_PAGE_PATHS: dict[str, str] = {
    "01_deduplicator": "pages/1_Deduplicator.py",
    "02_text_cleaner": "pages/2_Text_Cleaner.py",
    "03_format_standardizer": "pages/3_Format_Standardizer.py",
    "04_missing_handler": "pages/4_Missing_Values.py",
    "05_column_mapper": "pages/5_Column_Mapper.py",
    "06_outlier_detector": "pages/6_Outlier_Detector.py",
    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
    "08_validator_reporter": "pages/8_Validator_Reporter.py",
    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
}


def tool_display_name(tool_id: str) -> str:
    """Return the user-facing name for a tool id.

    Lookup order:

    1. An empty id yields the translated "untargeted" label.
    2. The language pack entry ``tools.<id>.name`` — a lookup that comes
       back equal to its own key is treated as "no translation".
    3. The static ``TOOL_DISPLAY_NAMES`` table, and finally the raw id.
    """
    if not tool_id:
        return _t("findings.untargeted_label")

    translation_key = f"tools.{tool_id}.name"
    label = _t(translation_key)
    if label == translation_key:
        # No language-pack entry: use the built-in English name.
        return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)
    return label


def _tool_page_slug(tool_id: str) -> str:
    """Return the page path registered for *tool_id*, or ``""`` if none."""
    if tool_id in _TOOL_PAGE_PATHS:
        return _TOOL_PAGE_PATHS[tool_id]
    return ""


def render_findings_panel(findings, *, header: str | None = None) -> None:
    """Show analyzer findings, one expander per tool.

    With no findings a single success message is shown.  Otherwise the
    panel renders a heading, a one-line count per severity (errors first),
    then an expander per tool containing its findings and — when the tool
    has a registered page — a button that switches to it.  Sections that
    contain an error start expanded.  Findings without a tool are listed
    in a final collapsed section.
    """
    from src.core.analyze import findings_by_tool  # local import to avoid cycle
    from src.core.text_clean import hidden_char_css

    title = _t("findings.header") if header is None else header

    if not findings:
        st.success(_t("findings.none"))
        return

    # Badge + sample-table styles are emitted once, ahead of any sample rows.
    st.markdown(hidden_char_css() + _SAMPLE_TABLE_CSS, unsafe_allow_html=True)

    severity_counts: dict[str, int] = {}
    for finding in findings:
        severity_counts[finding.severity] = (
            severity_counts.get(finding.severity, 0) + 1
        )

    summary_parts = []
    for level in ("error", "warn", "info"):
        if not severity_counts.get(level):
            continue
        summary_parts.append(
            _t(
                "findings.severity_summary_segment",
                icon=_SEVERITY_ICON[level],
                n=severity_counts[level],
                severity=level,
            )
        )

    st.markdown(f"### {title}")
    st.caption(" · ".join(summary_parts))

    grouped = findings_by_tool(findings)
    untargeted = [finding for finding in findings if not finding.tool]

    for tool_id in sorted(grouped):
        tool_findings = grouped[tool_id]
        tool_label = tool_display_name(tool_id)
        section_title = _t(
            "findings.tool_section_label", tool=tool_label, n=len(tool_findings),
        )
        has_error = any(item.severity == "error" for item in tool_findings)

        with st.expander(section_title, expanded=has_error):
            for item in tool_findings:
                _render_one_finding(item)

            # A primary button (instead of a low-key page link) keeps the
            # jump to the tool obvious; ``st.switch_page`` navigates
            # without a full reload.
            page_slug = _tool_page_slug(tool_id)
            if page_slug and st.button(
                _t("findings.open_tool", tool=tool_label),
                key=f"_findings_open_{tool_id}",
                type="primary",
                use_container_width=False,
            ):
                st.switch_page(page_slug)

    if not untargeted:
        return
    with st.expander(
        _t("findings.other_section_label", n=len(untargeted)),
        expanded=False,
    ):
        for item in untargeted:
            _render_one_finding(item)


# Styles for the HTML table built by ``render_hidden_aware_preview``: the
# ``.hidden-aware-preview`` table itself and its scrollable
# ``.hidden-aware-preview-wrap`` container.
_PREVIEW_TABLE_CSS = """
<style>
.hidden-aware-preview {
    width: 100%;
    border-collapse: collapse;
    font-size: 0.9em;
}
.hidden-aware-preview th,
.hidden-aware-preview td {
    padding: 4px 8px;
    border: 1px solid #eee;
    text-align: left;
    vertical-align: top;
    font-family: ui-monospace, SFMono-Regular, monospace;
    /* pre-wrap so internal ASCII whitespace and embedded newlines render
       as the user wrote them; otherwise browsers collapse adjacent spaces. */
    white-space: pre-wrap;
    word-break: break-word;
    max-width: 32em;
}
.hidden-aware-preview thead th {
    background: #f6f8fa;
    position: sticky;
    top: 0;
}
.hidden-aware-preview tbody tr:nth-child(even) { background: #fafafa; }
.hidden-aware-preview .row-num {
    color: #888;
    font-family: inherit;
    background: #f6f8fa;
    text-align: right;
}
.hidden-aware-preview-wrap {
    max-height: 26rem;
    overflow: auto;
    border: 1px solid #eee;
    border-radius: 4px;
}
</style>
"""


def render_hidden_aware_preview(
    df,
    *,
    n_rows: int = 10,
    caption: str | None = None,
) -> None:
    """Render a DataFrame preview that shows hidden characters in every cell.

    Used for the Clean Text tool's "before" and "after" previews so the user
    can actually see the leading/trailing whitespace, NBSP padding,
    zero-width characters, and smart punctuation that the cleaner is going
    to remove (or just removed). A plain ``st.dataframe`` collapses outer
    ASCII whitespace and renders invisibles as nothing, defeating the
    point of a preview in a cleanup tool.

    Headers and string cells are routed through
    :func:`visualize_hidden_html` with ``mark_outer_whitespace=True``;
    missing values render as a grey ``NaN``; everything else is
    stringified and escaped.

    Args:
        df: The frame to preview. ``None`` or an empty frame shows an
            info message instead of a table.
        n_rows: Maximum number of leading rows to show.
        caption: Optional caption rendered above the table.
    """
    import pandas as pd
    from src.core.text_clean import hidden_char_css, visualize_hidden_html

    if df is None or len(df) == 0:
        st.info("No rows to preview.")
        return

    sliced = df.head(n_rows) if len(df) > n_rows else df

    st.markdown(hidden_char_css() + _PREVIEW_TABLE_CSS, unsafe_allow_html=True)
    if caption:
        st.caption(caption)

    header_cells = "".join(
        f"<th>{visualize_hidden_html(str(c), mark_outer_whitespace=True)}</th>"
        for c in sliced.columns
    )

    def _cell_html(value) -> str:
        """Return the HTML for one cell value."""
        if isinstance(value, str):
            return visualize_hidden_html(value, mark_outer_whitespace=True)
        # ``pd.isna`` on a list/array returns an array whose truth value is
        # ambiguous, so only ask it about scalars.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return "<span style='color:#aaa'>NaN</span>"
        # Non-string values (numerics, bools, containers) just stringify;
        # they still need html-escaping.
        return visualize_hidden_html(str(value))

    # Iterate positionally with ``itertuples`` rather than ``iterrows``:
    # - label lookup (``row[col]``) returns a Series for duplicated column
    #   names, which made the NaN check raise;
    # - ``iterrows`` coerces each row to one dtype, so ints in a frame that
    #   also has float columns were displayed as ``1.0``.
    body_rows: list[str] = []
    for row_num, values in enumerate(
        sliced.itertuples(index=False, name=None), start=1,
    ):
        cells = [f"<td class='row-num'>{row_num}</td>"]
        cells.extend(f"<td>{_cell_html(value)}</td>" for value in values)
        body_rows.append("<tr>" + "".join(cells) + "</tr>")

    st.markdown(
        "<div class='hidden-aware-preview-wrap'>"
        "<table class='hidden-aware-preview'>"
        f"<thead><tr><th class='row-num'>#</th>{header_cells}</tr></thead>"
        f"<tbody>{''.join(body_rows)}</tbody>"
        "</table>"
        "</div>",
        unsafe_allow_html=True,
    )


# Styles for the ``.findings-sample-table`` HTML tables emitted per finding;
# injected once by ``render_findings_panel``.
_SAMPLE_TABLE_CSS = """
<style>
.findings-sample-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 0.9em;
}
.findings-sample-table th,
.findings-sample-table td {
    padding: 4px 8px;
    border-bottom: 1px solid #eee;
    text-align: left;
    vertical-align: top;
}
.findings-sample-table td.value {
    font-family: ui-monospace, SFMono-Regular, monospace;
    /* pre-wrap so any ASCII whitespace inside the value is preserved
       visually (browsers collapse adjacent spaces by default). */
    white-space: pre-wrap;
    word-break: break-word;
}
.findings-sample-table tbody tr:hover { background: #fafafa; }
</style>
"""


def _render_one_finding(f) -> None:
    """Render one finding: a headline plus, if present, its sample table.

    The headline shows the severity icon, the finding id in the severity
    color, the column (when the finding has one) and the description.
    Samples are ``(row, column, value)`` triples rendered as an HTML table.

    Raises:
        KeyError: if ``f.severity`` is not one of the known severities.
    """
    from html import escape

    from src.core.text_clean import visualize_hidden_html

    color = _SEVERITY_COLOR[f.severity]
    icon = _SEVERITY_ICON[f.severity]
    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )
    if not f.samples:
        return

    # Render samples as an HTML table so leading/trailing whitespace
    # and invisible characters in the value column show up as badges.
    # A plain st.dataframe collapses outer whitespace and renders
    # NBSP/ZWSP as nothing, defeating the point of the audit.
    rows_html = []
    for row, col, value in f.samples:
        rendered_value = visualize_hidden_html(
            str(value), mark_outer_whitespace=True,
        )
        rendered_col = visualize_hidden_html(
            str(col), mark_outer_whitespace=True,
        )
        if isinstance(row, int):
            # Integer rows are 0-based positions; show them 1-based.
            # NOTE(review): numpy integers are not ``int`` instances and
            # take the label branch below — confirm what the analyzer emits.
            rendered_row = str(int(row) + 1)
        else:
            # Any other row label is interpolated into raw HTML, so it must
            # be escaped like the other two cells.
            rendered_row = escape(str(row))
        rows_html.append(
            "<tr>"
            f"<td>{rendered_row}</td>"
            f"<td><code>{rendered_col}</code></td>"
            f"<td class='value'>{rendered_value}</td>"
            "</tr>"
        )
    st.markdown(
        "<table class='findings-sample-table'>"
        "<thead><tr>"
        "<th>Row</th><th>Column</th><th>Value</th>"
        "</tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody>"
        "</table>",
        unsafe_allow_html=True,
    )


def upload_and_analyze_section() -> None:
    """Home-page panel: upload a file, then run or skip the analyzer.

    The uploaded file's name, size and bytes are kept in session state
    (``home_uploaded_*``) together with the analyzer output
    (``home_findings`` / ``home_skipped``) so other pages can reuse them.
    Nothing beyond the uploader is rendered until a file is present, and
    the findings panel only appears after an analysis has been run.
    """
    state = st.session_state

    st.markdown(f"### {_t('upload.heading')}")
    st.caption(_t("upload.intro"))
    st.caption(_t("upload.limits"))

    uploaded = st.file_uploader(
        _t("upload.uploader_label"),
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
        help=_t("upload.uploader_help"),
    )
    if uploaded is None:
        return

    # A file counts as "new" when its name or size differs from the stash.
    already_stashed = (
        state.get("home_uploaded_name") == uploaded.name
        and state.get("home_uploaded_size") == uploaded.size
    )
    if not already_stashed:
        state["home_uploaded_name"] = uploaded.name
        state["home_uploaded_size"] = uploaded.size
        state["home_uploaded_bytes"] = uploaded.getvalue()
        # Results from the previous file no longer apply.
        for stale_key in ("home_findings", "home_skipped"):
            state.pop(stale_key, None)

    run_slot, skip_slot, _spacer = st.columns([1, 1, 4])
    with run_slot:
        run_clicked = st.button(
            _t("upload.run_button"), type="primary", key="home_run_analysis",
        )
    with skip_slot:
        skip_clicked = st.button(_t("upload.skip_button"), key="home_skip_analysis")

    if skip_clicked:
        state["home_findings"] = []
        state["home_skipped"] = True

    if run_clicked:
        with st.spinner(_t("upload.scanning")):
            fresh_findings = _run_analysis_on_upload(uploaded)
        state["home_findings"] = fresh_findings
        state["home_skipped"] = False

    stored_findings = state.get("home_findings")
    if stored_findings is None:
        # Neither button has been used for this file yet.
        return

    if state.get("home_skipped"):
        st.info(_t("upload.skipped_notice"))
        return

    st.divider()
    render_findings_panel(stored_findings)


def _run_analysis_on_upload(uploaded):
    """Parse an uploaded file and return the analyzer's findings for it.

    Excel files are read directly; CSV/TSV bytes first go through
    ``repair_bytes`` so the repair result can be handed to the analyzer.
    For ``.csv`` (and unknown) suffixes the delimiter is guessed from the
    first 4 KiB: tab, semicolon or pipe replaces the comma when it occurs
    more than 1.5x as often.

    Failures never propagate: an empty upload, an empty repair result, a
    pandas "no columns" error, or any other exception is logged and
    returned as a one-element list holding a synthetic error ``Finding``.
    """
    import hashlib
    from src.audit import log_event, log_exception
    from src.core.analyze import Finding, analyze
    from src.core.errors import format_for_user
    from src.core.io import repair_bytes

    name = uploaded.name
    data = uploaded.getvalue()

    # Lower-cased text after the last dot, or "" when the name has no dot.
    _stem, dot, extension = name.rpartition(".")
    suffix = extension.lower() if dot else ""

    # Short content fingerprint for the audit log (not a security hash).
    if data:
        digest = hashlib.sha1(data, usedforsecurity=False).hexdigest()[:12]
    else:
        digest = "empty"

    log_event(
        "analyze",
        f"Analyzing {name}",
        filename=name,
        bytes=len(data),
        sha1_12=digest,
        suffix=suffix,
    )

    def _failure(description: str, fid: str = "analysis_failed") -> list[Finding]:
        """Wrap *description* in a single untargeted error finding."""
        synthetic = Finding(
            id=fid,
            severity="error",
            tool="",
            count=1,
            description=description,
            confidence="high",
            fix_action="",
        )
        return [synthetic]

    if not data:
        log_event(
            "analyze",
            f"Skipping {name} — 0 bytes",
            level="warn",
            filename=name,
            outcome="empty_upload",
        )
        return _failure(
            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
            f"may not have transferred correctly from your browser.",
            fid="empty_upload",
        )

    try:
        if suffix in ("xlsx", "xls"):
            frame = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
            results = analyze(frame)
            log_event(
                "analyze",
                f"Analyzed {name} ({len(results)} findings)",
                filename=name,
                bytes=len(data),
                sha1_12=digest,
                findings=len(results),
                rows=len(frame), cols=len(frame.columns),
            )
            return results

        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
        if suffix == "tsv":
            delim = "\t"
        else:
            text_head = data[:4096].decode("utf-8", errors="replace")
            comma_bar = text_head.count(",") * 1.5
            delim = next(
                (
                    cand for cand in ("\t", ";", "|")
                    if text_head.count(cand) > comma_bar
                ),
                ",",
            )

        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
        if not repair.repaired_bytes:
            log_event(
                "analyze",
                f"Skipping {name} — empty after repair",
                level="warn",
                filename=name,
                outcome="empty_after_repair",
            )
            return _failure(
                f"`{name}` is empty after pre-parse repair "
                f"(original was {len(data)} bytes — likely all NUL "
                f"bytes or stripped during a BOM/line-ending pass). "
                f"Open the file in a text editor to confirm it has "
                f"content.",
                fid="empty_after_repair",
            )

        frame = pd.read_csv(
            io.BytesIO(repair.repaired_bytes),
            encoding="utf-8", delimiter=delim,
            dtype=str, keep_default_na=False, on_bad_lines="warn",
        )
        results = analyze(frame, repair_result=repair)
        log_event(
            "analyze",
            f"Analyzed {name} ({len(results)} findings)",
            filename=name,
            bytes=len(data),
            sha1_12=digest,
            findings=len(results),
            rows=len(frame), cols=len(frame.columns),
            delimiter=repr(delim),
        )
        return results
    except pd.errors.EmptyDataError as exc:
        log_exception(
            f"analyze({name})",
            exc,
            filename=name,
            outcome="empty_after_repair",
        )
        return _failure(
            f"`{name}` could not be parsed — pandas reports no columns "
            f"in the file. Original size was {len(data)} bytes. Open "
            f"the file in a text editor to confirm the header row is "
            f"present and uses the same delimiter as the data rows.",
            fid="empty_after_repair",
        )
    except Exception as exc:
        # Page-level boundary: log and surface as a finding, never a traceback.
        log_exception(
            f"analyze({name})",
            exc,
            filename=name,
            outcome="analysis_failed",
        )
        return _failure(
            f"`{name}` could not be analyzed: {format_for_user(exc)}",
        )


def findings_count_for_tool(tool_id: str) -> int:
    """Count the stored home-page findings whose ``tool`` is *tool_id*.

    Reads the ``"home_findings"`` entry of ``st.session_state`` and never
    re-runs the analyzer, which lets the home-page tool grid badge cards
    cheaply. A missing or empty entry yields ``0``.
    """
    stored = st.session_state.get("home_findings")
    if not stored:
        return 0
    return len([finding for finding in stored if finding.tool == tool_id])


# ---------------------------------------------------------------------------
# Cross-page upload pickup
# ---------------------------------------------------------------------------

class _StashedUpload:
    """In-memory stand-in for a Streamlit ``UploadedFile``.

    Wraps a file name together with its raw bytes and exposes the small
    surface the tool pages rely on: ``.name``, ``.size``, ``.getvalue()``
    and ``.read()``. Code written against a Streamlit upload object can
    take one of these instead without modification.

    ``read()`` is not cursor-based: every call returns the full payload.
    """

    __slots__ = ("name", "size", "_data")

    name: str    # display name of the stashed file
    size: int    # payload length in bytes
    _data: bytes  # the raw payload itself

    def __init__(self, name: str, data: bytes) -> None:
        self._data = data
        self.size = len(data)
        self.name = name

    def read(self) -> bytes:
        """Return the whole payload (same result as :meth:`getvalue`)."""
        return self.getvalue()

    def getvalue(self) -> bytes:
        """Return the whole payload as bytes."""
        return self._data


def pickup_or_upload(
    *,
    label: str,
    key: str,
    types: list[str],
    help: str | None = None,
):
    """Yield the file to work on, reusing the home-page upload if possible.

    Two modes, selected from session state:

    - **Session file.** When ``st.session_state['home_uploaded_bytes']``
      holds data and the per-widget override flag (``f"{key}__override"``)
      is not set, an info banner naming the file is shown along with a
      "use a different file" button, and a :class:`_StashedUpload`
      wrapping the stashed bytes is returned. Pressing the button sets
      the override flag and reruns the script.
    - **Page uploader.** Otherwise a regular ``st.file_uploader`` is drawn
      with *label*, *key*, *types* and *help*, and its value (a Streamlit
      ``UploadedFile`` or ``None``) is returned. While nothing has been
      uploaded, the override flag is set and a session file exists, a
      "switch back" button is offered that clears the flag and reruns.

    The override flag is deliberately left set once the user uploads a
    file here: clearing it would make the session file win again on the
    next rerun and hide the file the user just picked.

    The returned shim offers ``.name``, ``.size`` and ``.getvalue()``, so
    callers can treat both return types alike.
    """
    session = st.session_state
    override_key = f"{key}__override"
    session_has_file = session.get("home_uploaded_bytes") is not None

    if session_has_file and not session.get(override_key, False):
        display_name = session.get("home_uploaded_name") or _t("gate.default_name")
        st.info(_t("upload.using_session_file", name=display_name))
        if st.button(_t("upload.use_different_file"), key=f"{key}__pick_diff"):
            session[override_key] = True
            st.rerun()
        return _StashedUpload(display_name, session["home_uploaded_bytes"])

    # The caption is only shown for uploaders that accept tabular formats.
    if set(types).intersection(("csv", "tsv", "xlsx", "xls")):
        st.caption(_t("upload.pickup_caption"))

    picked = st.file_uploader(label, type=types, key=key, help=help)

    # Let the user return to the home-page file as long as they have not
    # uploaded a replacement on this page.
    can_switch_back = (
        picked is None and session_has_file and session.get(override_key)
    )
    if can_switch_back:
        if st.button(_t("upload.switch_back"), key=f"{key}__switch_back"):
            session[override_key] = False
            st.rerun()
    return picked