feat(gui): wire analyzer into home page with findings panel and tool badges
Home page (src/gui/app.py) gains an upload + analyze section above the tool
grid: file uploader, "Run analysis" / "Skip" buttons, and a findings panel
grouped by destination tool. Tool cards now carry a "N findings" badge
when the active session's findings reference that tool, so the user sees
at a glance which tools their just-uploaded file would benefit from.
src/gui/components.py adds the shared GUI surface:
- TOOL_DISPLAY_NAMES + tool_display_name() — single source of truth for
GUI labels, keeping detector tool ids decoupled from the UI.
- render_findings_panel(findings) — severity icons, expander per tool,
open-tool page link, sample-cells dataframe.
- upload_and_analyze_section() — the home-page widget; stashes file
bytes and findings in session_state so future tool pages can pick up
the existing upload instead of re-prompting.
- findings_count_for_tool(tool_id) — used by app.py to badge cards.
CSV/TSV uploads run through repair_bytes() before analysis, so the user
also sees csv_bom_stripped / csv_smart_quotes_folded findings synthesized
from the pre-parse repair pass. Excel uploads skip that step.
The Text Cleaner tool card flips from "Coming Soon" to "Ready" — the tool
itself has been ready since the v3.0 implementation; the home page just
hadn't been updated to reflect it.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -686,3 +686,220 @@ def _build_match_groups_csv(
|
||||
|
||||
groups_df = pd.DataFrame(rows)
|
||||
return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analyzer integration (upload-time data quality findings)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Tool id -> friendly display name. Single source of truth for the GUI; the
# CLI keeps its own copy so each entrypoint stays self-contained.
# Read through tool_display_name(), which shows an id verbatim when it has no
# entry here.
TOOL_DISPLAY_NAMES: dict[str, str] = {
    "01_deduplicator": "Deduplicator",
    "02_text_cleaner": "Text Cleaner",
    "03_format_standardizer": "Format Standardizer",
    "04_missing_handler": "Missing Value Handler",
    "05_column_mapper": "Column Mapper",
    "06_outlier_detector": "Outlier Detector",
    "07_multi_file_merger": "Multi-File Merger",
    "08_validator_reporter": "Validator & Reporter",
    "09_pipeline_runner": "Pipeline Runner",
}
|
||||
|
||||
# Severity string -> emoji. Used both in the per-severity summary caption of
# the findings panel and as the leading marker of each rendered finding.
_SEVERITY_ICON: dict[str, str] = {
    "info": "ℹ️",
    "warn": "⚠️",
    "error": "🛑",
}
|
||||
|
||||
# Severity string -> colour name. Interpolated into the ``:color[text]``
# markdown directive that wraps the finding id when a finding is rendered.
_SEVERITY_COLOR: dict[str, str] = {
    "info": "blue",
    "warn": "orange",
    "error": "red",
}
|
||||
|
||||
# Map tool id to the Streamlit page path under src/gui/. A tool id with no
# entry here (no page yet) resolves to an empty string in _tool_page_slug(),
# and the findings panel then omits the "Open" link for that tool.
_TOOL_PAGE_PATHS: dict[str, str] = {
    "01_deduplicator": "pages/1_Deduplicator.py",
    "02_text_cleaner": "pages/2_Text_Cleaner.py",
    "03_format_standardizer": "pages/3_Format_Standardizer.py",
    "04_missing_handler": "pages/4_Missing_Values.py",
    "05_column_mapper": "pages/5_Column_Mapper.py",
    "06_outlier_detector": "pages/6_Outlier_Detector.py",
    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
    "08_validator_reporter": "pages/8_Validator_Reporter.py",
    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
}
|
||||
|
||||
|
||||
def tool_display_name(tool_id: str) -> str:
    """Return the GUI label for a stable tool id.

    A falsy id (empty string or ``None``) is labelled ``"Informational"``.
    A known id maps to its ``TOOL_DISPLAY_NAMES`` entry; an id that is not
    in the table is returned unchanged.
    """
    if not tool_id:
        return "Informational"
    try:
        return TOOL_DISPLAY_NAMES[tool_id]
    except KeyError:
        # Unknown tool: show the raw id rather than failing.
        return tool_id
|
||||
|
||||
|
||||
def _tool_page_slug(tool_id: str) -> str:
    """Return the page path registered for *tool_id*, or ``""`` when none is."""
    try:
        return _TOOL_PAGE_PATHS[tool_id]
    except KeyError:
        return ""
|
||||
|
||||
|
||||
def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
    """Show *findings* on the page, grouped into one expander per tool.

    With no findings a success message is shown and nothing else. Otherwise
    the panel consists of:

    * the *header* as a level-3 heading, followed by a caption that counts
      findings per severity (error, warn, info — in that order, skipping
      severities with no findings);
    * one expander per tool id returned by ``findings_by_tool``, in sorted
      order, titled with the tool's display name and finding count and
      opened by default when any of its findings is an error. Each expander
      lists its findings and, when the tool has a registered page, a link
      to open it;
    * a final collapsed "Other / file-level" expander for findings whose
      ``tool`` attribute is falsy.
    """
    from src.core.analyze import findings_by_tool  # local import to avoid cycle

    if not findings:
        st.success("No issues detected. Open any tool below to start working.")
        return

    # Tally findings per severity for the summary caption.
    tally: dict[str, int] = {}
    for item in findings:
        tally.setdefault(item.severity, 0)
        tally[item.severity] += 1

    summary_parts = []
    for level in ("error", "warn", "info"):
        if tally.get(level):
            summary_parts.append(f"{_SEVERITY_ICON[level]} {tally[level]} {level}")

    st.markdown(f"### {header}")
    st.caption(" · ".join(summary_parts))

    per_tool = findings_by_tool(findings)
    file_level = [item for item in findings if not item.tool]

    for tool_id in sorted(per_tool):
        tool_findings = per_tool[tool_id]
        label = tool_display_name(tool_id)
        has_error = any(item.severity == "error" for item in tool_findings)
        with st.expander(
            f"{label} — {len(tool_findings)} finding(s)",
            expanded=has_error,
        ):
            for item in tool_findings:
                _render_one_finding(item)
            page_path = _tool_page_slug(tool_id)
            if page_path:
                # NOTE(review): st.page_link resolves file paths relative to
                # the app's main script — confirm the "src/gui/" prefix is
                # right for how the app is launched.
                st.page_link(f"src/gui/{page_path}", label=f"Open {label} →")

    if not file_level:
        return
    with st.expander(
        f"Other / file-level — {len(file_level)} finding(s)",
        expanded=False,
    ):
        for item in file_level:
            _render_one_finding(item)
|
||||
|
||||
|
||||
def _render_one_finding(f) -> None:
    """Render one finding as a markdown line plus an optional samples table.

    The line shows the severity icon, the finding id (bold, coloured by
    severity), the affected column when the finding has one, and the
    description. When the finding carries samples they are displayed
    underneath as a dataframe with ``row`` / ``column`` / ``value`` columns.

    A severity that has no entry in the icon/colour tables is rendered with
    a neutral bullet in gray instead of raising ``KeyError``, so a single
    unexpected finding cannot abort rendering of the whole panel. This
    matches the panel summary, which already ignores unknown severities.
    """
    severity = f.severity
    color = _SEVERITY_COLOR.get(severity, "gray")
    icon = _SEVERITY_ICON.get(severity, "•")

    column = getattr(f, "column", None)
    column_part = f" in `{column}`" if column else ""
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )

    # Read defensively, like ``column``: a finding without samples simply
    # gets no table.
    samples = getattr(f, "samples", None)
    if samples:
        sample_df = pd.DataFrame(
            samples, columns=["row", "column", "value"],
        )
        st.dataframe(sample_df, use_container_width=True, hide_index=True)
|
||||
|
||||
|
||||
def upload_and_analyze_section() -> None:
    """Render the upload + analyze panel for the home page.

    Stashes the uploaded file (name, size, bytes) and findings in session
    state so individual tool pages can pick them up if they want to skip
    their own uploader. Each tool page already has its own uploader today,
    so this is purely additive.

    Session-state keys written here: ``home_uploaded_name``,
    ``home_uploaded_size``, ``home_uploaded_bytes``, ``home_findings`` and
    ``home_skipped``.

    If analysing the upload fails (for example the file cannot be parsed),
    the error is shown in the page via ``st.error`` and any previously
    stored findings are dropped, instead of letting the exception abort the
    page render.
    """
    st.markdown("### 📤 Upload a file to start")
    st.caption(
        "Optional: scan an uploaded file for data quality issues and see "
        "which tools can fix each one. Skip if you already know what you need."
    )

    uploaded = st.file_uploader(
        "Upload CSV or Excel",
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
    )
    if uploaded is None:
        return

    state = st.session_state

    # Stash on every fresh upload so all tool pages can pick it up. A change
    # in name or size is what counts as "fresh".
    is_fresh_upload = (
        state.get("home_uploaded_name") != uploaded.name
        or state.get("home_uploaded_size") != uploaded.size
    )
    if is_fresh_upload:
        state["home_uploaded_name"] = uploaded.name
        state["home_uploaded_size"] = uploaded.size
        state["home_uploaded_bytes"] = uploaded.getvalue()
        # Drop stale findings on a new upload.
        state.pop("home_findings", None)
        state.pop("home_skipped", None)

    col_run, col_skip, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
    with col_skip:
        skip_clicked = st.button("Skip", key="home_skip_analysis")

    if skip_clicked:
        state["home_findings"] = []
        state["home_skipped"] = True

    if run_clicked:
        try:
            with st.spinner("Scanning…"):
                findings = _run_analysis_on_upload(uploaded)
        except Exception as exc:  # UI boundary: report the failure, don't crash the page
            # Whatever was stored before no longer describes a successful
            # scan of this upload, so make sure no badge or panel shows it.
            state.pop("home_findings", None)
            state.pop("home_skipped", None)
            st.error(
                f"Could not analyze {uploaded.name}: "
                f"{type(exc).__name__}: {exc}"
            )
            return
        state["home_findings"] = findings
        state["home_skipped"] = False

    findings = state.get("home_findings")
    if findings is None:
        # Neither button has been used yet for this upload.
        return

    if state.get("home_skipped"):
        st.info("Analysis skipped. Open any tool below to start working.")
        return

    st.divider()
    render_findings_panel(findings)
|
||||
|
||||
|
||||
def _run_analysis_on_upload(uploaded):
    """Parse the uploaded file into a DataFrame and return the analyzer output.

    ``.xlsx`` / ``.xls`` uploads are read with ``pd.read_excel`` and analyzed
    directly. Every other upload is treated as delimited UTF-8 text: the
    bytes go through ``repair_bytes`` first, the repaired bytes are parsed
    with ``pd.read_csv``, and the repair result is handed to ``analyze``
    alongside the frame. All cells are read as strings with no default NA
    conversion.

    The delimiter is a tab for ``.tsv``. Otherwise it is a comma, unless one
    of tab, semicolon or pipe (checked in that order) occurs more than 1.5
    times as often as the comma within the first 4096 bytes.
    """
    from src.core.analyze import analyze
    from src.core.io import repair_bytes

    filename = uploaded.name
    raw = uploaded.getvalue()

    # Text after the last dot, lower-cased; empty when the name has no dot.
    _, dot, extension = filename.rpartition(".")
    suffix = extension.lower() if dot else ""

    if suffix in ("xlsx", "xls"):
        frame = pd.read_excel(io.BytesIO(raw), dtype=str, keep_default_na=False)
        return analyze(frame)

    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
    if suffix == "tsv":
        delimiter = "\t"
    else:
        head = raw[:4096].decode("utf-8", errors="replace")
        comma_threshold = head.count(",") * 1.5
        delimiter = next(
            (cand for cand in ("\t", ";", "|") if head.count(cand) > comma_threshold),
            ",",
        )

    repaired = repair_bytes(raw, encoding="utf-8", delimiter=delimiter)
    frame = pd.read_csv(
        io.BytesIO(repaired.repaired_bytes),
        encoding="utf-8",
        delimiter=delimiter,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return analyze(frame, repair_result=repaired)
|
||||
|
||||
|
||||
def findings_count_for_tool(tool_id: str) -> int:
    """Count the findings in session state whose ``tool`` equals *tool_id*.

    Reads ``st.session_state["home_findings"]``; returns 0 when that key is
    missing or empty. Used by the home-page tool grid to badge cards that
    have actionable findings without re-running the analyzer.
    """
    stored = st.session_state.get("home_findings")
    if not stored:
        return 0
    return len([item for item in stored if item.tool == tool_id])
|
||||
|
||||
Reference in New Issue
Block a user