diff --git a/src/gui/app.py b/src/gui/app.py index 02aa625..7943e6e 100644 --- a/src/gui/app.py +++ b/src/gui/app.py @@ -21,7 +21,11 @@ if str(_project_root) not in sys.path: # Page config # --------------------------------------------------------------------------- -from src.gui.components import hide_streamlit_chrome +from src.gui.components import ( + findings_count_for_tool, + hide_streamlit_chrome, + upload_and_analyze_section, +) st.set_page_config( page_title="DataTools β€” Data Cleaning Mastery", @@ -41,6 +45,14 @@ st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular d st.divider() +# --------------------------------------------------------------------------- +# Upload & analyze (optional onboarding step) +# --------------------------------------------------------------------------- + +upload_and_analyze_section() + +st.divider() + # --------------------------------------------------------------------------- # Tool cards # --------------------------------------------------------------------------- @@ -52,13 +64,15 @@ TOOLS = [ "description": "Fuzzy matching, normalization, survivor selection, and interactive review.", "status": "Ready", "page": "1_Deduplicator", + "tool_id": "01_deduplicator", }, { "icon": "βœ‚οΈ", "name": "Text Cleaner", "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.", - "status": "Coming Soon", + "status": "Ready", "page": "2_Text_Cleaner", + "tool_id": "02_text_cleaner", }, { "icon": "πŸ“", @@ -66,6 +80,7 @@ TOOLS = [ "description": "Standardize dates, currencies, names, phone numbers, and addresses.", "status": "Coming Soon", "page": "3_Format_Standardizer", + "tool_id": "03_format_standardizer", }, { "icon": "πŸ•³οΈ", @@ -73,6 +88,7 @@ TOOLS = [ "description": "Detect disguised nulls, missingness analysis, and imputation strategies.", "status": "Coming Soon", "page": "4_Missing_Values", + "tool_id": "04_missing_handler", }, { "icon": "πŸ—‚οΈ", @@ 
-80,6 +96,7 @@ TOOLS = [ "description": "Rename columns, enforce a target schema, and coerce types.", "status": "Coming Soon", "page": "5_Column_Mapper", + "tool_id": "05_column_mapper", }, { "icon": "πŸ“Š", @@ -87,6 +104,7 @@ TOOLS = [ "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.", "status": "Coming Soon", "page": "6_Outlier_Detector", + "tool_id": "06_outlier_detector", }, { "icon": "πŸ“Ž", @@ -94,6 +112,7 @@ TOOLS = [ "description": "Combine multiple CSV/Excel files with schema alignment.", "status": "Coming Soon", "page": "7_Multi_File_Merger", + "tool_id": "07_multi_file_merger", }, { "icon": "βœ…", @@ -101,6 +120,7 @@ TOOLS = [ "description": "Validate against rules and generate PDF/Excel quality reports.", "status": "Coming Soon", "page": "8_Validator_Reporter", + "tool_id": "08_validator_reporter", }, { "icon": "βš™οΈ", @@ -108,10 +128,13 @@ TOOLS = [ "description": "Chain tools in recommended order and pass output between steps.", "status": "Coming Soon", "page": "9_Pipeline_Runner", + "tool_id": "09_pipeline_runner", }, ] -# Render tool cards in a 3-column grid +# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a +# coloured "N findings" badge so the user can see at a glance which tools +# would help with the just-uploaded file. 
for row_start in range(0, len(TOOLS), 3): cols = st.columns(3) for i, col in enumerate(cols): @@ -121,8 +144,12 @@ for row_start in range(0, len(TOOLS), 3): tool = TOOLS[idx] with col: status_color = "green" if tool["status"] == "Ready" else "orange" + badge = "" + n = findings_count_for_tool(tool.get("tool_id", "")) + if n: + badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]" st.markdown( - f"### {tool['icon']} {tool['name']}\n\n" + f"### {tool['icon']} {tool['name']}{badge}\n\n" f"{tool['description']}\n\n" f":{status_color}[**{tool['status']}**]" ) diff --git a/src/gui/components.py b/src/gui/components.py index 79b32a1..10d7c38 100644 --- a/src/gui/components.py +++ b/src/gui/components.py @@ -686,3 +686,220 @@ def _build_match_groups_csv( groups_df = pd.DataFrame(rows) return groups_df.to_csv(index=False).encode("utf-8-sig") + + +# --------------------------------------------------------------------------- +# Analyzer integration (upload-time data quality findings) +# --------------------------------------------------------------------------- + +# Tool id -> friendly display name. Single source of truth for the GUI; the +# CLI keeps its own copy so each entrypoint stays self-contained. +TOOL_DISPLAY_NAMES: dict[str, str] = { + "01_deduplicator": "Deduplicator", + "02_text_cleaner": "Text Cleaner", + "03_format_standardizer": "Format Standardizer", + "04_missing_handler": "Missing Value Handler", + "05_column_mapper": "Column Mapper", + "06_outlier_detector": "Outlier Detector", + "07_multi_file_merger": "Multi-File Merger", + "08_validator_reporter": "Validator & Reporter", + "09_pipeline_runner": "Pipeline Runner", +} + +_SEVERITY_ICON: dict[str, str] = { + "info": "ℹ️", + "warn": "⚠️", + "error": "πŸ›‘", +} + +_SEVERITY_COLOR: dict[str, str] = { + "info": "blue", + "warn": "orange", + "error": "red", +} + +# Map tool id to the streamlit page path under src/gui/. 
# Tool ids missing from this table resolve to "" and get no "Open" link.
# Paths are relative to the Streamlit main script (src/gui/app.py), which is
# the form ``st.page_link`` expects.
_TOOL_PAGE_PATHS: dict[str, str] = {
    "01_deduplicator": "pages/1_Deduplicator.py",
    "02_text_cleaner": "pages/2_Text_Cleaner.py",
    "03_format_standardizer": "pages/3_Format_Standardizer.py",
    "04_missing_handler": "pages/4_Missing_Values.py",
    "05_column_mapper": "pages/5_Column_Mapper.py",
    "06_outlier_detector": "pages/6_Outlier_Detector.py",
    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
    "08_validator_reporter": "pages/8_Validator_Reporter.py",
    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
}

# Delimiters considered (in tie-break order) when commas look too scarce to
# be the real separator of a non-.tsv text file.
_ALT_DELIMITERS: tuple[str, ...] = ("\t", ";", "|")

# Fallbacks used when a finding carries a severity the lookup tables lack.
_FALLBACK_SEVERITY_COLOR = "gray"
_FALLBACK_SEVERITY_ICON = "•"


def tool_display_name(tool_id: str) -> str:
    """Return the GUI display name for *tool_id*.

    An id that is not in ``TOOL_DISPLAY_NAMES`` is returned unchanged. An
    empty id (a finding that targets no tool) yields ``"Informational"``.
    """
    if not tool_id:
        return "Informational"
    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)


def _tool_page_slug(tool_id: str) -> str:
    """Return the page path registered for *tool_id*, or ``""`` if none."""
    return _TOOL_PAGE_PATHS.get(tool_id, "")


def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
    """Render a list of :class:`Finding` objects grouped by tool.

    Shows a severity summary, then one expander per tool containing that
    tool's findings and, when a page is registered for the tool, a link to
    open it. Findings without a tool are listed in a final
    "Other / file-level" expander. An empty *findings* list renders a
    success message instead.

    Args:
        findings: Iterable of finding objects; this function reads their
            ``severity`` and ``tool`` attributes.
        header: Heading text shown above the summary.
    """
    from src.core.analyze import findings_by_tool  # local import to avoid cycle

    if not findings:
        st.success("No issues detected. Open any tool below to start working.")
        return

    by_sev: dict[str, int] = {}
    for f in findings:
        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
    # Only the three known severities are summarised, most severe first.
    sev_summary = " · ".join(
        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
        for s in ("error", "warn", "info")
        if by_sev.get(s)
    )
    st.markdown(f"### {header}")
    st.caption(sev_summary)

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Falsy keys are skipped here: those findings are rendered once in the
    # "Other / file-level" expander below, and mixing ``None`` with string
    # keys would make ``sorted`` raise.
    # NOTE(review): whether findings_by_tool ever emits a falsy key is not
    # visible from this module.
    for tool_id in sorted(key for key in grouped if key):
        items = grouped[tool_id]
        display_name = tool_display_name(tool_id)
        with st.expander(
            f"{display_name} — {len(items)} finding(s)",
            expanded=any(f.severity == "error" for f in items),
        ):
            for f in items:
                _render_one_finding(f)
            page_path = _tool_page_slug(tool_id)
            if page_path:
                # st.page_link resolves the path relative to the main script,
                # so no "src/gui/" prefix is added.
                st.page_link(page_path, label=f"Open {display_name} →")

    if untargeted:
        with st.expander(
            f"Other / file-level — {len(untargeted)} finding(s)",
            expanded=False,
        ):
            for f in untargeted:
                _render_one_finding(f)


def _render_one_finding(f) -> None:
    """Render one finding as a markdown line plus an optional sample table.

    The line shows the severity icon, the coloured finding id, the column
    (when the finding has one) and the description. A severity missing from
    the lookup tables falls back to a neutral colour and bullet instead of
    raising ``KeyError``.
    """
    color = _SEVERITY_COLOR.get(f.severity, _FALLBACK_SEVERITY_COLOR)
    icon = _SEVERITY_ICON.get(f.severity, _FALLBACK_SEVERITY_ICON)
    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )
    if f.samples:
        sample_df = pd.DataFrame(
            f.samples, columns=["row", "column", "value"],
        )
        st.dataframe(sample_df, use_container_width=True, hide_index=True)


def upload_and_analyze_section() -> None:
    """Render the upload + analyze panel for the home page.

    Stashes the uploaded file (name, size, bytes) and the findings in
    ``st.session_state`` under ``home_*`` keys so individual tool pages can
    pick them up if they want to skip their own uploader.
    """
    st.markdown("### 📤 Upload a file to start")
    st.caption(
        "Optional: scan an uploaded file for data quality issues and see "
        "which tools can fix each one. Skip if you already know what you need."
    )

    uploaded = st.file_uploader(
        "Upload CSV or Excel",
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
    )
    if uploaded is None:
        # Session state from an earlier upload is left untouched here.
        return

    # A change in name or size is treated as a fresh upload: restash the file
    # and drop findings that belong to the previous one.
    state = st.session_state
    is_new_upload = (
        state.get("home_uploaded_name") != uploaded.name
        or state.get("home_uploaded_size") != uploaded.size
    )
    if is_new_upload:
        state["home_uploaded_name"] = uploaded.name
        state["home_uploaded_size"] = uploaded.size
        state["home_uploaded_bytes"] = uploaded.getvalue()
        state.pop("home_findings", None)
        state.pop("home_skipped", None)

    col_run, col_skip, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
    with col_skip:
        skip_clicked = st.button("Skip", key="home_skip_analysis")

    if skip_clicked:
        state["home_findings"] = []
        state["home_skipped"] = True

    if run_clicked:
        with st.spinner("Scanning…"):
            findings = _run_analysis_on_upload(uploaded)
        state["home_findings"] = findings
        state["home_skipped"] = False

    # ``None`` means neither button has been used for this upload yet.
    findings = state.get("home_findings")
    if findings is None:
        return

    if state.get("home_skipped"):
        st.info("Analysis skipped. Open any tool below to start working.")
        return

    st.divider()
    render_findings_panel(findings)


def _sniff_delimiter(text_head: str, suffix: str) -> str:
    """Guess the field delimiter from a decoded sample of the file.

    ``.tsv`` files are always tab-separated. Otherwise comma is assumed
    unless an alternative occurs more than 1.5x as often as the comma; among
    qualifying alternatives the most frequent one wins, so a stray tab in a
    semicolon-separated file does not hijack the choice.
    """
    if suffix == "tsv":
        return "\t"
    threshold = text_head.count(",") * 1.5
    best, best_count = ",", 0
    for cand in _ALT_DELIMITERS:
        count = text_head.count(cand)
        if count > threshold and count > best_count:
            best, best_count = cand, count
    return best


def _run_analysis_on_upload(uploaded):
    """Read the uploaded file with pre-parse repair, then analyze.

    Excel files (``.xlsx``/``.xls``) are read directly. Everything else is
    treated as delimited text: the delimiter is sniffed from the first 4 KiB,
    the bytes go through ``repair_bytes``, and the repair result is passed to
    ``analyze`` together with the parsed frame.

    Returns:
        Whatever ``src.core.analyze.analyze`` returns for the parsed frame.
    """
    import io  # local import: a file-level ``import io`` is not guaranteed

    from src.core.analyze import analyze
    from src.core.io import repair_bytes

    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""

    if suffix in ("xlsx", "xls"):
        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
        return analyze(df)

    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
    text_head = data[:4096].decode("utf-8", errors="replace")
    delim = _sniff_delimiter(text_head, suffix)
    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8",
        delimiter=delim,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return analyze(df, repair_result=repair)


def findings_count_for_tool(tool_id: str) -> int:
    """How many findings in session state target *tool_id*; 0 when none.

    Used by the home-page tool grid to badge cards that have actionable
    findings without re-running the analyzer. An empty *tool_id* always
    yields 0, so a card without an id is never badged with findings whose
    tool is the empty string.
    """
    if not tool_id:
        return 0
    findings = st.session_state.get("home_findings") or []
    return sum(1 for f in findings if f.tool == tool_id)