feat(gui): wire analyzer into home page with findings panel and tool badges

Home page (src/gui/app.py) gains an upload + analyze section above the tool
grid: file uploader, "Run analysis" / "Skip" buttons, and a findings panel
grouped by destination tool. Tool cards now carry a "N findings" badge
when the active session's findings reference that tool, so the user sees
at a glance which tools their just-uploaded file would benefit from.

src/gui/components.py adds the shared GUI surface:
  - TOOL_DISPLAY_NAMES + tool_display_name() — single source of truth for
    GUI labels, keeping detector tool ids decoupled from the UI.
  - render_findings_panel(findings) — severity icons, expander per tool,
    open-tool page link, sample-cells dataframe.
  - upload_and_analyze_section() — the home-page widget; stashes file
    bytes and findings in session_state so future tool pages can pick up
    the existing upload instead of re-prompting.
  - findings_count_for_tool(tool_id) — used by app.py to badge cards.

CSV/TSV uploads run through repair_bytes() before analysis, so the user
also sees csv_bom_stripped / csv_smart_quotes_folded findings synthesized
from the pre-parse repair pass. Excel uploads skip that step.

The Text Cleaner tool card flips from "Coming Soon" to "Ready" — that has
been true since the v3.0 implementation and the home page just hadn't been
updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:53:22 +00:00
parent 5c62fb6117
commit a8943f29eb
2 changed files with 248 additions and 4 deletions

View File

@@ -21,7 +21,11 @@ if str(_project_root) not in sys.path:
# Page config
# ---------------------------------------------------------------------------
from src.gui.components import hide_streamlit_chrome
from src.gui.components import (
findings_count_for_tool,
hide_streamlit_chrome,
upload_and_analyze_section,
)
st.set_page_config(
page_title="DataTools — Data Cleaning Mastery",
@@ -41,6 +45,14 @@ st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular d
st.divider()
# ---------------------------------------------------------------------------
# Upload & analyze (optional onboarding step)
# ---------------------------------------------------------------------------
upload_and_analyze_section()
st.divider()
# ---------------------------------------------------------------------------
# Tool cards
# ---------------------------------------------------------------------------
@@ -52,13 +64,15 @@ TOOLS = [
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
"status": "Ready",
"page": "1_Deduplicator",
"tool_id": "01_deduplicator",
},
{
"icon": "✂️",
"name": "Text Cleaner",
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
"status": "Coming Soon",
"status": "Ready",
"page": "2_Text_Cleaner",
"tool_id": "02_text_cleaner",
},
{
"icon": "📐",
@@ -66,6 +80,7 @@ TOOLS = [
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
"status": "Coming Soon",
"page": "3_Format_Standardizer",
"tool_id": "03_format_standardizer",
},
{
"icon": "🕳️",
@@ -73,6 +88,7 @@ TOOLS = [
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
"status": "Coming Soon",
"page": "4_Missing_Values",
"tool_id": "04_missing_handler",
},
{
"icon": "🗂️",
@@ -80,6 +96,7 @@ TOOLS = [
"description": "Rename columns, enforce a target schema, and coerce types.",
"status": "Coming Soon",
"page": "5_Column_Mapper",
"tool_id": "05_column_mapper",
},
{
"icon": "📊",
@@ -87,6 +104,7 @@ TOOLS = [
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
"status": "Coming Soon",
"page": "6_Outlier_Detector",
"tool_id": "06_outlier_detector",
},
{
"icon": "📎",
@@ -94,6 +112,7 @@ TOOLS = [
"description": "Combine multiple CSV/Excel files with schema alignment.",
"status": "Coming Soon",
"page": "7_Multi_File_Merger",
"tool_id": "07_multi_file_merger",
},
{
"icon": "",
@@ -101,6 +120,7 @@ TOOLS = [
"description": "Validate against rules and generate PDF/Excel quality reports.",
"status": "Coming Soon",
"page": "8_Validator_Reporter",
"tool_id": "08_validator_reporter",
},
{
"icon": "⚙️",
@@ -108,10 +128,13 @@ TOOLS = [
"description": "Chain tools in recommended order and pass output between steps.",
"status": "Coming Soon",
"page": "9_Pipeline_Runner",
"tool_id": "09_pipeline_runner",
},
]
# Render tool cards in a 3-column grid
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
# coloured "N findings" badge so the user can see at a glance which tools
# would help with the just-uploaded file.
for row_start in range(0, len(TOOLS), 3):
cols = st.columns(3)
for i, col in enumerate(cols):
@@ -121,8 +144,12 @@ for row_start in range(0, len(TOOLS), 3):
tool = TOOLS[idx]
with col:
status_color = "green" if tool["status"] == "Ready" else "orange"
badge = ""
n = findings_count_for_tool(tool.get("tool_id", ""))
if n:
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
st.markdown(
f"### {tool['icon']} {tool['name']}\n\n"
f"### {tool['icon']} {tool['name']}{badge}\n\n"
f"{tool['description']}\n\n"
f":{status_color}[**{tool['status']}**]"
)

View File

@@ -686,3 +686,220 @@ def _build_match_groups_csv(
groups_df = pd.DataFrame(rows)
return groups_df.to_csv(index=False).encode("utf-8-sig")
# ---------------------------------------------------------------------------
# Analyzer integration (upload-time data quality findings)
# ---------------------------------------------------------------------------
# Tool id -> friendly display name. Single source of truth for the GUI; the
# CLI keeps its own copy so each entrypoint stays self-contained.
# Keys are the stable tool ids; values are the labels shown to the user.
# ``tool_display_name()`` falls back to the raw id for anything missing here.
TOOL_DISPLAY_NAMES: dict[str, str] = {
    "01_deduplicator": "Deduplicator",
    "02_text_cleaner": "Text Cleaner",
    "03_format_standardizer": "Format Standardizer",
    "04_missing_handler": "Missing Value Handler",
    "05_column_mapper": "Column Mapper",
    "06_outlier_detector": "Outlier Detector",
    "07_multi_file_merger": "Multi-File Merger",
    "08_validator_reporter": "Validator & Reporter",
    "09_pipeline_runner": "Pipeline Runner",
}
# Severity -> emoji shown in the findings summary line and next to each
# finding. Every severity carries a non-empty glyph so "info" entries do not
# render with a stray leading space where the icon should be.
_SEVERITY_ICON: dict[str, str] = {
    "info": "ℹ️",
    "warn": "⚠️",
    "error": "🛑",
}
# Severity -> Streamlit markdown colour name, used as ``:{color}[...]`` when
# rendering a finding's id.
_SEVERITY_COLOR: dict[str, str] = {
    "info": "blue",
    "warn": "orange",
    "error": "red",
}
# Map tool id to the Streamlit page script path under src/gui/. Tool ids that
# are missing from this table resolve to an empty string in
# ``_tool_page_slug()``, and the findings panel then omits the "Open ..." link.
_TOOL_PAGE_PATHS: dict[str, str] = {
    "01_deduplicator": "pages/1_Deduplicator.py",
    "02_text_cleaner": "pages/2_Text_Cleaner.py",
    "03_format_standardizer": "pages/3_Format_Standardizer.py",
    "04_missing_handler": "pages/4_Missing_Values.py",
    "05_column_mapper": "pages/5_Column_Mapper.py",
    "06_outlier_detector": "pages/6_Outlier_Detector.py",
    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
    "08_validator_reporter": "pages/8_Validator_Reporter.py",
    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
}
def tool_display_name(tool_id: str) -> str:
    """Return the GUI label for *tool_id*.

    An empty/falsy id yields ``"Informational"``; an id that is not in
    ``TOOL_DISPLAY_NAMES`` is returned unchanged.
    """
    if not tool_id:
        return "Informational"
    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)
def _tool_page_slug(tool_id: str) -> str:
    """Return the page script path registered for *tool_id*, or ``""`` if none."""
    try:
        return _TOOL_PAGE_PATHS[tool_id]
    except KeyError:
        return ""
def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
    """Render a list of :class:`Finding` objects grouped by tool.

    Shows a success message when *findings* is empty. Otherwise renders
    *header*, a one-line per-severity count summary, and one expander per
    tool containing that tool's findings plus a page link to open the tool
    (when a page is registered for it). Findings without a ``tool`` are
    listed in a separate "Other / file-level" expander. Expanders holding at
    least one ``error`` finding start expanded.

    Args:
        findings: Iterable of finding objects exposing ``severity``, ``tool``
            and the attributes read by ``_render_one_finding``.
        header: Markdown heading text shown above the panel.
    """
    from collections import Counter

    from src.core.analyze import findings_by_tool  # local import to avoid cycle

    if not findings:
        st.success("No issues detected. Open any tool below to start working.")
        return

    by_sev = Counter(f.severity for f in findings)
    sev_summary = " · ".join(
        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
        for s in ("error", "warn", "info")
        if by_sev.get(s)
    )
    st.markdown(f"### {header}")
    st.caption(sev_summary)

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Untargeted findings are rendered in their own expander below, so any
    # falsy key is skipped here to avoid listing them twice (and to keep
    # sorted() from comparing None with str).
    for tool_id in sorted(t for t in grouped if t):
        items = grouped[tool_id]
        display_name = tool_display_name(tool_id)
        with st.expander(
            f"{display_name} — {len(items)} finding(s)",
            expanded=any(f.severity == "error" for f in items),
        ):
            for f in items:
                _render_one_finding(f)
            page_slug = _tool_page_slug(tool_id)
            if page_slug:
                # st.page_link resolves string paths relative to the directory
                # of the entrypoint script, so the slug ("pages/<name>.py") is
                # passed as-is rather than prefixed with the package path.
                st.page_link(page_slug, label=f"Open {display_name}")

    if untargeted:
        with st.expander(
            f"Other / file-level — {len(untargeted)} finding(s)",
            expanded=False,
        ):
            for f in untargeted:
                _render_one_finding(f)
def _render_one_finding(f) -> None:
    """Render a single finding: icon, coloured id, optional column, description.

    When the finding carries ``samples`` they are shown underneath as a
    three-column (``row``, ``column``, ``value``) dataframe.

    Args:
        f: Finding object exposing ``severity``, ``id``, ``description``,
            ``samples`` and optionally ``column``.
    """
    # Fall back to neutral styling for a severity this module does not know
    # about, instead of failing the whole panel with a KeyError.
    color = _SEVERITY_COLOR.get(f.severity, "gray")
    icon = _SEVERITY_ICON.get(f.severity, "")
    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
    # The dash keeps the id/column visually separate from the description.
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )
    if f.samples:
        sample_df = pd.DataFrame(
            f.samples,
            columns=["row", "column", "value"],
        )
        st.dataframe(sample_df, use_container_width=True, hide_index=True)
def upload_and_analyze_section() -> None:
    """Render the upload + analyze panel for the home page.

    Stashes the uploaded file (name + bytes) and findings in session state
    so individual tool pages can pick them up if they want to skip their
    own uploader. Each tool page already has its own uploader today, so
    this is purely additive.

    Session-state keys written: ``home_uploaded_name``,
    ``home_uploaded_size``, ``home_uploaded_bytes``, ``home_findings`` and
    ``home_skipped``.
    """
    st.markdown("### 📤 Upload a file to start")
    st.caption(
        "Optional: scan an uploaded file for data quality issues and see "
        "which tools can fix each one. Skip if you already know what you need."
    )
    uploaded = st.file_uploader(
        "Upload CSV or Excel",
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
    )
    if uploaded is None:
        return

    # Stash on every fresh upload so all tool pages can pick it up.
    # NOTE: "fresh" is decided by name + size only, so a different file with
    # the same name and byte length is not detected as a new upload.
    if (
        st.session_state.get("home_uploaded_name") != uploaded.name
        or st.session_state.get("home_uploaded_size") != uploaded.size
    ):
        st.session_state["home_uploaded_name"] = uploaded.name
        st.session_state["home_uploaded_size"] = uploaded.size
        st.session_state["home_uploaded_bytes"] = uploaded.getvalue()
        # Drop stale findings on a new upload.
        st.session_state.pop("home_findings", None)
        st.session_state.pop("home_skipped", None)

    col_run, col_skip, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
    with col_skip:
        skip_clicked = st.button("Skip", key="home_skip_analysis")

    if skip_clicked:
        st.session_state["home_findings"] = []
        st.session_state["home_skipped"] = True

    if run_clicked:
        try:
            with st.spinner("Scanning…"):
                findings = _run_analysis_on_upload(uploaded)
        except Exception as exc:  # noqa: BLE001 - UI boundary
            # A malformed or undecodable upload must not abort the script run:
            # that would also prevent the tool grid below this section from
            # rendering. Clear any previous result and report the problem.
            st.session_state.pop("home_findings", None)
            st.session_state.pop("home_skipped", None)
            st.error(f"Could not analyze {uploaded.name}: {exc}")
            return
        st.session_state["home_findings"] = findings
        st.session_state["home_skipped"] = False

    findings = st.session_state.get("home_findings")
    if findings is None:
        # Neither button has been used for this upload yet.
        return
    if st.session_state.get("home_skipped"):
        st.info("Analysis skipped. Open any tool below to start working.")
        return

    st.divider()
    render_findings_panel(findings)
def _sniff_delimiter(text_head: str, suffix: str) -> str:
    """Guess the field delimiter from the file suffix and a decoded head sample.

    ``tsv`` files always use a tab. Otherwise a comma is assumed unless the
    most frequent of tab / semicolon / pipe occurs more than 1.5x as often as
    the comma in *text_head*.
    """
    if suffix == "tsv":
        return "\t"
    # max() keeps the first candidate on ties, preserving tab > ; > | priority.
    best = max(("\t", ";", "|"), key=text_head.count)
    if text_head.count(best) > text_head.count(",") * 1.5:
        return best
    return ","


def _run_analysis_on_upload(uploaded):
    """Read the uploaded file with pre-parse repair, then analyze.

    Excel uploads (``xlsx`` / ``xls``) are read directly and analyzed. Any
    other upload is treated as delimited text: the bytes go through
    ``repair_bytes`` first and the repair result is handed to ``analyze`` so
    the repair pass can contribute its own findings.

    Args:
        uploaded: Uploaded-file object exposing ``name`` and ``getvalue()``.

    Returns:
        Whatever ``analyze`` returns for the parsed frame.
    """
    # Function-scope import keeps this block self-contained regardless of
    # what the module imports at the top.
    import io

    from src.core.analyze import analyze
    from src.core.io import repair_bytes

    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""

    if suffix in ("xlsx", "xls"):
        # Read everything as text so the analyzer sees the raw cell values.
        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
        return analyze(df)

    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
    text_head = data[:4096].decode("utf-8", errors="replace")
    delim = _sniff_delimiter(text_head, suffix)

    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8",
        delimiter=delim,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return analyze(df, repair_result=repair)
def findings_count_for_tool(tool_id: str) -> int:
    """How many findings in session state target *tool_id*; 0 when none.

    Used by the home-page tool grid to badge cards that have actionable
    findings without re-running the analyzer.

    An empty *tool_id* always yields 0: otherwise it would compare equal to
    the ``tool`` of untargeted findings and badge a card that has no tool id.
    """
    if not tool_id:
        return 0
    # "home_findings" may be absent or None before any analysis has run.
    findings = st.session_state.get("home_findings") or []
    return sum(1 for f in findings if f.tool == tool_id)