feat(gui): wire analyzer into home page with findings panel and tool badges

Home page (src/gui/app.py) gains an upload + analyze section above the tool grid: file uploader, "Run analysis" / "Skip" buttons, and a findings panel grouped by destination tool. Tool cards now carry a "N findings" badge when the active session's findings reference that tool, so the user sees at a glance which tools their just-uploaded file would benefit from. src/gui/components.py adds the shared GUI surface: - TOOL_DISPLAY_NAMES + tool_display_name() — single source of truth for GUI labels, keeping detector tool ids decoupled from the UI. - render_findings_panel(findings) — severity icons, expander per tool, open-tool page link, sample-cells dataframe. - upload_and_analyze_section() — the home-page widget; stashes file bytes and findings in session_state so future tool pages can pick up the existing upload instead of re-prompting. - findings_count_for_tool(tool_id) — used by app.py to badge cards. CSV/TSV uploads run through repair_bytes() before analysis, so the user also sees csv_bom_stripped / csv_smart_quotes_folded findings synthesized from the pre-parse repair pass. Excel uploads skip that step. The Text Cleaner tool card flips from "Coming Soon" to "Ready" — that has been true since the v3.0 implementation and the home page just hadn't been updated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:53:22 +00:00
parent 5c62fb6117
commit a8943f29eb
2 changed files with 248 additions and 4 deletions
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -21,7 +21,11 @@ if str(_project_root) not in sys.path:
 # Page config
 # ---------------------------------------------------------------------------

-from src.gui.components import hide_streamlit_chrome
+from src.gui.components import (
+    findings_count_for_tool,
+    hide_streamlit_chrome,
+    upload_and_analyze_section,
+)

 st.set_page_config(
    page_title="DataTools — Data Cleaning Mastery",
@@ -41,6 +45,14 @@ st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular d

 st.divider()

+# ---------------------------------------------------------------------------
+# Upload & analyze (optional onboarding step)
+# ---------------------------------------------------------------------------
+
+upload_and_analyze_section()
+
+st.divider()
+
 # ---------------------------------------------------------------------------
 # Tool cards
 # ---------------------------------------------------------------------------
@@ -52,13 +64,15 @@ TOOLS = [
        "description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
        "status": "Ready",
        "page": "1_Deduplicator",
+        "tool_id": "01_deduplicator",
    },
    {
        "icon": "✂️",
        "name": "Text Cleaner",
        "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
-        "status": "Coming Soon",
+        "status": "Ready",
        "page": "2_Text_Cleaner",
+        "tool_id": "02_text_cleaner",
    },
    {
        "icon": "📐",
@@ -66,6 +80,7 @@ TOOLS = [
        "description": "Standardize dates, currencies, names, phone numbers, and addresses.",
        "status": "Coming Soon",
        "page": "3_Format_Standardizer",
+        "tool_id": "03_format_standardizer",
    },
    {
        "icon": "🕳️",
@@ -73,6 +88,7 @@ TOOLS = [
        "description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
        "status": "Coming Soon",
        "page": "4_Missing_Values",
+        "tool_id": "04_missing_handler",
    },
    {
        "icon": "🗂️",
@@ -80,6 +96,7 @@ TOOLS = [
        "description": "Rename columns, enforce a target schema, and coerce types.",
        "status": "Coming Soon",
        "page": "5_Column_Mapper",
+        "tool_id": "05_column_mapper",
    },
    {
        "icon": "📊",
@@ -87,6 +104,7 @@ TOOLS = [
        "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
        "status": "Coming Soon",
        "page": "6_Outlier_Detector",
+        "tool_id": "06_outlier_detector",
    },
    {
        "icon": "📎",
@@ -94,6 +112,7 @@ TOOLS = [
        "description": "Combine multiple CSV/Excel files with schema alignment.",
        "status": "Coming Soon",
        "page": "7_Multi_File_Merger",
+        "tool_id": "07_multi_file_merger",
    },
    {
        "icon": "✅",
@@ -101,6 +120,7 @@ TOOLS = [
        "description": "Validate against rules and generate PDF/Excel quality reports.",
        "status": "Coming Soon",
        "page": "8_Validator_Reporter",
+        "tool_id": "08_validator_reporter",
    },
    {
        "icon": "⚙️",
@@ -108,10 +128,13 @@ TOOLS = [
        "description": "Chain tools in recommended order and pass output between steps.",
        "status": "Coming Soon",
        "page": "9_Pipeline_Runner",
+        "tool_id": "09_pipeline_runner",
    },
 ]

-# Render tool cards in a 3-column grid
+# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
+# coloured "N findings" badge so the user can see at a glance which tools
+# would help with the just-uploaded file.
 for row_start in range(0, len(TOOLS), 3):
    cols = st.columns(3)
    for i, col in enumerate(cols):
@@ -121,8 +144,12 @@ for row_start in range(0, len(TOOLS), 3):
        tool = TOOLS[idx]
        with col:
            status_color = "green" if tool["status"] == "Ready" else "orange"
+            badge = ""
+            n = findings_count_for_tool(tool.get("tool_id", ""))
+            if n:
+                badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
            st.markdown(
-                f"### {tool['icon']} {tool['name']}\n\n"
+                f"### {tool['icon']} {tool['name']}{badge}\n\n"
                f"{tool['description']}\n\n"
                f":{status_color}[**{tool['status']}**]"
            )
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -686,3 +686,220 @@ def _build_match_groups_csv(

    groups_df = pd.DataFrame(rows)
    return groups_df.to_csv(index=False).encode("utf-8-sig")
+
+
+# ---------------------------------------------------------------------------
+# Analyzer integration (upload-time data quality findings)
+# ---------------------------------------------------------------------------
+
+# Tool id -> friendly display name. Single source of truth for the GUI; the
+# CLI keeps its own copy so each entrypoint stays self-contained.
+TOOL_DISPLAY_NAMES: dict[str, str] = {
+    "01_deduplicator": "Deduplicator",
+    "02_text_cleaner": "Text Cleaner",
+    "03_format_standardizer": "Format Standardizer",
+    "04_missing_handler": "Missing Value Handler",
+    "05_column_mapper": "Column Mapper",
+    "06_outlier_detector": "Outlier Detector",
+    "07_multi_file_merger": "Multi-File Merger",
+    "08_validator_reporter": "Validator & Reporter",
+    "09_pipeline_runner": "Pipeline Runner",
+}
+
+_SEVERITY_ICON: dict[str, str] = {  # indexed with a finding's .severity
+    "info": "ℹ️",
+    "warn": "⚠️",
+    "error": "🛑",
+}
+
+_SEVERITY_COLOR: dict[str, str] = {  # used as a Streamlit markdown colour directive
+    "info": "blue",
+    "warn": "orange",
+    "error": "red",
+}
+
+# Map tool id to the streamlit page path under src/gui/. Ids missing from
+# this map resolve to "" in _tool_page_slug() and get no "Open" page link.
+_TOOL_PAGE_PATHS: dict[str, str] = {
+    "01_deduplicator": "pages/1_Deduplicator.py",
+    "02_text_cleaner": "pages/2_Text_Cleaner.py",
+    "03_format_standardizer": "pages/3_Format_Standardizer.py",
+    "04_missing_handler": "pages/4_Missing_Values.py",
+    "05_column_mapper": "pages/5_Column_Mapper.py",
+    "06_outlier_detector": "pages/6_Outlier_Detector.py",
+    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
+    "08_validator_reporter": "pages/8_Validator_Reporter.py",
+    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
+}
+
+
+def tool_display_name(tool_id: str) -> str:
+    """Return the GUI name for a tool id (unknown id -> the id; falsy id -> 'Informational')."""
+    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id) if tool_id else "Informational"
+
+
+def _tool_page_slug(tool_id: str) -> str:  # mapped page path, or "" when the id has none
+    return _TOOL_PAGE_PATHS.get(tool_id, "")
+
+
+def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
+    """Render a list of :class:`Finding` objects grouped by tool.
+
+    Shows a per-severity summary caption, then one expander per tool id with
+    its findings and a page link to the tool when a page path is mapped.
+    Findings without a tool are listed last under "Other / file-level".
+    """
+    from src.core.analyze import findings_by_tool  # local import to avoid cycle
+
+    if not findings:
+        st.success("No issues detected. Open any tool below to start working.")
+        return
+
+    by_sev: dict[str, int] = {}
+    for f in findings:
+        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
+    sev_summary = " · ".join(
+        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
+        for s in ("error", "warn", "info") if by_sev.get(s)
+    )
+    st.markdown(f"### {header}")
+    st.caption(sev_summary)
+
+    grouped = findings_by_tool(findings)
+    untargeted = [f for f in findings if not f.tool]
+
+    for tool_id in sorted(grouped):
+        items = grouped[tool_id]
+        with st.expander(
+            f"{tool_display_name(tool_id)} — {len(items)} finding(s)",
+            expanded=any(f.severity == "error" for f in items),
+        ):
+            for f in items:
+                _render_one_finding(f)
+            page_slug = _tool_page_slug(tool_id)
+            if page_slug:
+                # st.page_link takes a path relative to the main script (the home
+                # page, src/gui/app.py), so the mapped "pages/..." path is passed as-is.
+                st.page_link(page_slug, label=f"Open {tool_display_name(tool_id)} →")
+
+    # Findings with no destination tool go in one trailing, collapsed expander.
+    if untargeted:
+        with st.expander(f"Other / file-level — {len(untargeted)} finding(s)", expanded=False):
+            for f in untargeted:
+                _render_one_finding(f)
+
+
+def _render_one_finding(f) -> None:  # one finding: markdown headline plus optional samples table
+    color = _SEVERITY_COLOR[f.severity]  # KeyError for a severity other than info/warn/error
+    icon = _SEVERITY_ICON[f.severity]
+    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""  # column is optional
+    st.markdown(
+        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
+    )
+    if f.samples:
+        sample_df = pd.DataFrame(  # assumes each sample is a (row, column, value) triple — TODO confirm
+            f.samples, columns=["row", "column", "value"],
+        )
+        st.dataframe(sample_df, use_container_width=True, hide_index=True)
+
+
+def upload_and_analyze_section() -> None:
+    """Render the upload + analyze panel for the home page.
+
+    Stashes the upload (``home_uploaded_name`` / ``_size`` / ``_bytes``) and
+    the analysis state (``home_findings``, ``home_skipped``) in session state
+    so individual tool pages can pick them up instead of re-prompting. Each
+    tool page already has its own uploader today, so this is purely additive.
+    """
+    st.markdown("### 📤 Upload a file to start")
+    st.caption(
+        "Optional: scan an uploaded file for data quality issues and see "
+        "which tools can fix each one. Skip if you already know what you need."
+    )
+
+    uploaded = st.file_uploader(
+        "Upload CSV or Excel",
+        type=["csv", "tsv", "xlsx", "xls"],
+        key="home_upload",
+    )
+    if uploaded is None:  # no file selected: buttons and findings are not rendered
+        return
+
+    # A change in name or size marks a fresh upload; re-stash it for tool pages.
+    if (
+        st.session_state.get("home_uploaded_name") != uploaded.name
+        or st.session_state.get("home_uploaded_size") != uploaded.size
+    ):
+        st.session_state["home_uploaded_name"] = uploaded.name
+        st.session_state["home_uploaded_size"] = uploaded.size
+        st.session_state["home_uploaded_bytes"] = uploaded.getvalue()
+        # Drop stale findings on a new upload.
+        st.session_state.pop("home_findings", None)
+        st.session_state.pop("home_skipped", None)
+
+    col_run, col_skip, _ = st.columns([1, 1, 4])  # third column is an unused spacer
+    with col_run:
+        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
+    with col_skip:
+        skip_clicked = st.button("Skip", key="home_skip_analysis")
+
+    if skip_clicked:
+        st.session_state["home_findings"] = []  # [] rather than None: a choice has been made
+        st.session_state["home_skipped"] = True
+
+    if run_clicked:
+        with st.spinner("Scanning…"):
+            findings = _run_analysis_on_upload(uploaded)
+        st.session_state["home_findings"] = findings
+        st.session_state["home_skipped"] = False
+
+    findings = st.session_state.get("home_findings")
+    if findings is None:  # neither button clicked yet for the current upload
+        return
+
+    if st.session_state.get("home_skipped"):
+        st.info("Analysis skipped. Open any tool below to start working.")
+        return
+
+    st.divider()
+    render_findings_panel(findings)
+
+
+def _run_analysis_on_upload(uploaded):
+    """Parse the upload (Excel directly, CSV/TSV via repair_bytes) and return analyze()'s result."""
+    from src.core.analyze import analyze
+    from src.core.io import repair_bytes
+
+    name = uploaded.name
+    data = uploaded.getvalue()
+    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""  # extension without the dot
+
+    if suffix in ("xlsx", "xls"):
+        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
+        return analyze(df)  # Excel path: no pre-parse repair, so no repair_result is passed
+
+    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
+    text_head = data[:4096].decode("utf-8", errors="replace")  # only the first 4096 bytes are sniffed
+    delim = "\t" if suffix == "tsv" else ","
+    if delim == ",":  # non-.tsv only: first candidate outnumbering commas by more than 1.5x wins
+        for cand in ("\t", ";", "|"):
+            if text_head.count(cand) > text_head.count(",") * 1.5:
+                delim = cand
+                break
+    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)  # NOTE(review): encoding is hard-coded — confirm non-UTF-8 uploads are handled
+    df = pd.read_csv(
+        io.BytesIO(repair.repaired_bytes),
+        encoding="utf-8", delimiter=delim,
+        dtype=str, keep_default_na=False, on_bad_lines="warn",
+    )
+    return analyze(df, repair_result=repair)
+
+
+def findings_count_for_tool(tool_id: str) -> int:
+    """How many findings in session state target *tool_id*; 0 when none.
+
+    Used by the home-page tool grid to badge cards that have actionable
+    findings without re-running the analyzer.
+    """
+    findings = st.session_state.get("home_findings") or []  # missing, None and [] all count as no findings
+    return sum(1 for f in findings if f.tool == tool_id)