feat(gui): wire analyzer into home page with findings panel and tool badges

Home page (src/gui/app.py) gains an upload + analyze section above the tool grid: file uploader, "Run analysis" / "Skip" buttons, and a findings panel grouped by destination tool. Tool cards now carry a "N findings" badge when the active session's findings reference that tool, so the user sees at a glance which tools their just-uploaded file would benefit from.

src/gui/components.py adds the shared GUI surface:

- TOOL_DISPLAY_NAMES + tool_display_name() — single source of truth for GUI labels, keeping detector tool ids decoupled from the UI.
- render_findings_panel(findings) — severity icons, expander per tool, open-tool page link, sample-cells dataframe.
- upload_and_analyze_section() — the home-page widget; stashes file bytes and findings in session_state so future tool pages can pick up the existing upload instead of re-prompting.
- findings_count_for_tool(tool_id) — used by app.py to badge cards.

CSV/TSV uploads run through repair_bytes() before analysis, so the user also sees csv_bom_stripped / csv_smart_quotes_folded findings synthesized from the pre-parse repair pass. Excel uploads skip that step.

The Text Cleaner tool card flips from "Coming Soon" to "Ready" — that has been true since the v3.0 implementation and the home page just hadn't been updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:53:22 +00:00
parent 5c62fb6117
commit a8943f29eb
2 changed files with 248 additions and 4 deletions
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -686,3 +686,220 @@ def _build_match_groups_csv(

    groups_df = pd.DataFrame(rows)
    return groups_df.to_csv(index=False).encode("utf-8-sig")
+
+
+# ---------------------------------------------------------------------------
+# Analyzer integration (upload-time data quality findings)
+# ---------------------------------------------------------------------------
+
+# Tool id -> friendly display name. Single source of truth for the GUI; the
+# CLI keeps its own copy so each entrypoint stays self-contained.
+# tool_display_name() returns the raw id for any tool missing from this map.
+TOOL_DISPLAY_NAMES: dict[str, str] = {
+    "01_deduplicator": "Deduplicator",
+    "02_text_cleaner": "Text Cleaner",
+    "03_format_standardizer": "Format Standardizer",
+    "04_missing_handler": "Missing Value Handler",
+    "05_column_mapper": "Column Mapper",
+    "06_outlier_detector": "Outlier Detector",
+    "07_multi_file_merger": "Multi-File Merger",
+    "08_validator_reporter": "Validator & Reporter",
+    "09_pipeline_runner": "Pipeline Runner",
+}
+
+# Severity -> emoji shown in the findings summary caption and in front of
+# each rendered finding.
+_SEVERITY_ICON: dict[str, str] = {
+    "info": "ℹ️",
+    "warn": "⚠️",
+    "error": "🛑",
+}
+
+# Severity -> colour name used in the ``:colour[...]`` markdown directive that
+# wraps each finding id. Keys mirror _SEVERITY_ICON.
+_SEVERITY_COLOR: dict[str, str] = {
+    "info": "blue",
+    "warn": "orange",
+    "error": "red",
+}
+
+# Tool id -> Streamlit page script path, relative to src/gui/. A tool id
+# missing from this map resolves to "" in _tool_page_slug(), and
+# render_findings_panel() then omits the "Open ..." page link for it.
+_TOOL_PAGE_PATHS: dict[str, str] = {
+    "01_deduplicator": "pages/1_Deduplicator.py",
+    "02_text_cleaner": "pages/2_Text_Cleaner.py",
+    "03_format_standardizer": "pages/3_Format_Standardizer.py",
+    "04_missing_handler": "pages/4_Missing_Values.py",
+    "05_column_mapper": "pages/5_Column_Mapper.py",
+    "06_outlier_detector": "pages/6_Outlier_Detector.py",
+    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
+    "08_validator_reporter": "pages/8_Validator_Reporter.py",
+    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
+}
+
+
+def tool_display_name(tool_id: str) -> str:
+    """Map a stable tool id to its GUI display name; falls back to the id."""
+    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id) if tool_id else "Informational"
+
+
+def _tool_page_slug(tool_id: str) -> str:
+    return _TOOL_PAGE_PATHS.get(tool_id, "")
+
+
+def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
+    """Render a list of :class:`Finding` objects grouped by tool.
+
+    Each tool gets a header with the count, an open-tool button, and a list
+    of the findings underneath. Severity icon + count are shown inline so
+    the user can decide which tool to open first.
+    """
+    from src.core.analyze import findings_by_tool  # local import to avoid cycle
+
+    if not findings:
+        st.success("No issues detected. Open any tool below to start working.")
+        return
+
+    by_sev: dict[str, int] = {}
+    for f in findings:
+        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
+    sev_summary = " · ".join(
+        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
+        for s in ("error", "warn", "info") if by_sev.get(s)
+    )
+    st.markdown(f"### {header}")
+    st.caption(sev_summary)
+
+    grouped = findings_by_tool(findings)
+    untargeted = [f for f in findings if not f.tool]
+
+    for tool_id in sorted(grouped):
+        items = grouped[tool_id]
+        with st.expander(
+            f"{tool_display_name(tool_id)} — {len(items)} finding(s)",
+            expanded=any(f.severity == "error" for f in items),
+        ):
+            for f in items:
+                _render_one_finding(f)
+            page_slug = _tool_page_slug(tool_id)
+            if page_slug:
+                st.page_link(f"src/gui/{page_slug}", label=f"Open {tool_display_name(tool_id)} →")
+
+    if untargeted:
+        with st.expander(
+            f"Other / file-level — {len(untargeted)} finding(s)",
+            expanded=False,
+        ):
+            for f in untargeted:
+                _render_one_finding(f)
+
+
+def _render_one_finding(f) -> None:
+    color = _SEVERITY_COLOR[f.severity]
+    icon = _SEVERITY_ICON[f.severity]
+    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
+    st.markdown(
+        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
+    )
+    if f.samples:
+        sample_df = pd.DataFrame(
+            f.samples, columns=["row", "column", "value"],
+        )
+        st.dataframe(sample_df, use_container_width=True, hide_index=True)
+
+
+def upload_and_analyze_section() -> None:
+    """Render the upload + analyze panel for the home page.
+
+    Stashes the uploaded file (name + bytes) and findings in session state
+    so individual tool pages can pick them up if they want to skip their
+    own uploader. Each tool page already has its own uploader today, so
+    this is purely additive.
+    """
+    st.markdown("### 📤 Upload a file to start")
+    st.caption(
+        "Optional: scan an uploaded file for data quality issues and see "
+        "which tools can fix each one. Skip if you already know what you need."
+    )
+
+    uploaded = st.file_uploader(
+        "Upload CSV or Excel",
+        type=["csv", "tsv", "xlsx", "xls"],
+        key="home_upload",
+    )
+    if uploaded is None:
+        return
+
+    # Stash on every fresh upload so all tool pages can pick it up.
+    if (
+        st.session_state.get("home_uploaded_name") != uploaded.name
+        or st.session_state.get("home_uploaded_size") != uploaded.size
+    ):
+        st.session_state["home_uploaded_name"] = uploaded.name
+        st.session_state["home_uploaded_size"] = uploaded.size
+        st.session_state["home_uploaded_bytes"] = uploaded.getvalue()
+        # Drop stale findings on a new upload.
+        st.session_state.pop("home_findings", None)
+        st.session_state.pop("home_skipped", None)
+
+    col_run, col_skip, _ = st.columns([1, 1, 4])
+    with col_run:
+        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
+    with col_skip:
+        skip_clicked = st.button("Skip", key="home_skip_analysis")
+
+    if skip_clicked:
+        st.session_state["home_findings"] = []
+        st.session_state["home_skipped"] = True
+
+    if run_clicked:
+        with st.spinner("Scanning…"):
+            findings = _run_analysis_on_upload(uploaded)
+        st.session_state["home_findings"] = findings
+        st.session_state["home_skipped"] = False
+
+    findings = st.session_state.get("home_findings")
+    if findings is None:
+        return
+
+    if st.session_state.get("home_skipped"):
+        st.info("Analysis skipped. Open any tool below to start working.")
+        return
+
+    st.divider()
+    render_findings_panel(findings)
+
+
+def _run_analysis_on_upload(uploaded):
+    """Read the uploaded file with pre-parse repair, then analyze."""
+    from src.core.analyze import analyze
+    from src.core.io import repair_bytes
+
+    name = uploaded.name
+    data = uploaded.getvalue()
+    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+
+    if suffix in ("xlsx", "xls"):
+        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
+        return analyze(df)
+
+    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
+    text_head = data[:4096].decode("utf-8", errors="replace")
+    delim = "\t" if suffix == "tsv" else ","
+    if delim == ",":
+        for cand in ("\t", ";", "|"):
+            if text_head.count(cand) > text_head.count(",") * 1.5:
+                delim = cand
+                break
+    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
+    df = pd.read_csv(
+        io.BytesIO(repair.repaired_bytes),
+        encoding="utf-8", delimiter=delim,
+        dtype=str, keep_default_na=False, on_bad_lines="warn",
+    )
+    return analyze(df, repair_result=repair)
+
+
+def findings_count_for_tool(tool_id: str) -> int:
+    """How many findings in session state target *tool_id*; 0 when none.
+
+    Used by the home-page tool grid to badge cards that have actionable
+    findings without re-running the analyzer.
+    """
+    findings = st.session_state.get("home_findings") or []
+    return sum(1 for f in findings if f.tool == tool_id)