feat(gui): wire analyzer into home page with findings panel and tool badges
Home page (src/gui/app.py) gains an upload + analyze section above the tool
grid: file uploader, "Run analysis" / "Skip" buttons, and a findings panel
grouped by destination tool. Tool cards now carry a "N findings" badge
when the active session's findings reference that tool, so the user sees
at a glance which tools their just-uploaded file would benefit from.
src/gui/components.py adds the shared GUI surface:
- TOOL_DISPLAY_NAMES + tool_display_name() — single source of truth for
GUI labels, keeping detector tool ids decoupled from the UI.
- render_findings_panel(findings) — severity icons, expander per tool,
open-tool page link, sample-cells dataframe.
- upload_and_analyze_section() — the home-page widget; stashes file
bytes and findings in session_state so future tool pages can pick up
the existing upload instead of re-prompting.
- findings_count_for_tool(tool_id) — used by app.py to badge cards.
CSV/TSV uploads run through repair_bytes() before analysis, so the user
also sees csv_bom_stripped / csv_smart_quotes_folded findings synthesized
from the pre-parse repair pass. Excel uploads skip that step.
The Text Cleaner tool card flips from "Coming Soon" to "Ready": the tool has
been usable since the v3.0 implementation, but the home page had not been
updated to reflect that.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,7 +21,11 @@ if str(_project_root) not in sys.path:
|
||||
# Page config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import (
|
||||
findings_count_for_tool,
|
||||
hide_streamlit_chrome,
|
||||
upload_and_analyze_section,
|
||||
)
|
||||
|
||||
st.set_page_config(
|
||||
page_title="DataTools — Data Cleaning Mastery",
|
||||
@@ -41,6 +45,14 @@ st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular d
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Upload & analyze (optional onboarding step)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
upload_and_analyze_section()
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool cards
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -52,13 +64,15 @@ TOOLS = [
|
||||
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
|
||||
"status": "Ready",
|
||||
"page": "1_Deduplicator",
|
||||
"tool_id": "01_deduplicator",
|
||||
},
|
||||
{
|
||||
"icon": "✂️",
|
||||
"name": "Text Cleaner",
|
||||
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
|
||||
"status": "Coming Soon",
|
||||
"status": "Ready",
|
||||
"page": "2_Text_Cleaner",
|
||||
"tool_id": "02_text_cleaner",
|
||||
},
|
||||
{
|
||||
"icon": "📐",
|
||||
@@ -66,6 +80,7 @@ TOOLS = [
|
||||
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
|
||||
"status": "Coming Soon",
|
||||
"page": "3_Format_Standardizer",
|
||||
"tool_id": "03_format_standardizer",
|
||||
},
|
||||
{
|
||||
"icon": "🕳️",
|
||||
@@ -73,6 +88,7 @@ TOOLS = [
|
||||
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
|
||||
"status": "Coming Soon",
|
||||
"page": "4_Missing_Values",
|
||||
"tool_id": "04_missing_handler",
|
||||
},
|
||||
{
|
||||
"icon": "🗂️",
|
||||
@@ -80,6 +96,7 @@ TOOLS = [
|
||||
"description": "Rename columns, enforce a target schema, and coerce types.",
|
||||
"status": "Coming Soon",
|
||||
"page": "5_Column_Mapper",
|
||||
"tool_id": "05_column_mapper",
|
||||
},
|
||||
{
|
||||
"icon": "📊",
|
||||
@@ -87,6 +104,7 @@ TOOLS = [
|
||||
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
|
||||
"status": "Coming Soon",
|
||||
"page": "6_Outlier_Detector",
|
||||
"tool_id": "06_outlier_detector",
|
||||
},
|
||||
{
|
||||
"icon": "📎",
|
||||
@@ -94,6 +112,7 @@ TOOLS = [
|
||||
"description": "Combine multiple CSV/Excel files with schema alignment.",
|
||||
"status": "Coming Soon",
|
||||
"page": "7_Multi_File_Merger",
|
||||
"tool_id": "07_multi_file_merger",
|
||||
},
|
||||
{
|
||||
"icon": "✅",
|
||||
@@ -101,6 +120,7 @@ TOOLS = [
|
||||
"description": "Validate against rules and generate PDF/Excel quality reports.",
|
||||
"status": "Coming Soon",
|
||||
"page": "8_Validator_Reporter",
|
||||
"tool_id": "08_validator_reporter",
|
||||
},
|
||||
{
|
||||
"icon": "⚙️",
|
||||
@@ -108,10 +128,13 @@ TOOLS = [
|
||||
"description": "Chain tools in recommended order and pass output between steps.",
|
||||
"status": "Coming Soon",
|
||||
"page": "9_Pipeline_Runner",
|
||||
"tool_id": "09_pipeline_runner",
|
||||
},
|
||||
]
|
||||
|
||||
# Render tool cards in a 3-column grid
|
||||
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
|
||||
# coloured "N findings" badge so the user can see at a glance which tools
|
||||
# would help with the just-uploaded file.
|
||||
for row_start in range(0, len(TOOLS), 3):
|
||||
cols = st.columns(3)
|
||||
for i, col in enumerate(cols):
|
||||
@@ -121,8 +144,12 @@ for row_start in range(0, len(TOOLS), 3):
|
||||
tool = TOOLS[idx]
|
||||
with col:
|
||||
status_color = "green" if tool["status"] == "Ready" else "orange"
|
||||
badge = ""
|
||||
n = findings_count_for_tool(tool.get("tool_id", ""))
|
||||
if n:
|
||||
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
|
||||
st.markdown(
|
||||
f"### {tool['icon']} {tool['name']}\n\n"
|
||||
f"### {tool['icon']} {tool['name']}{badge}\n\n"
|
||||
f"{tool['description']}\n\n"
|
||||
f":{status_color}[**{tool['status']}**]"
|
||||
)
|
||||
|
||||
@@ -686,3 +686,220 @@ def _build_match_groups_csv(
|
||||
|
||||
groups_df = pd.DataFrame(rows)
|
||||
return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analyzer integration (upload-time data quality findings)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# One row per tool: (stable tool id, GUI display name, page file stem).
# The two public-facing lookups below are derived from this table so the id,
# label, and page file for a tool are always declared side by side.
_TOOL_REGISTRY: tuple[tuple[str, str, str], ...] = (
    ("01_deduplicator", "Deduplicator", "1_Deduplicator"),
    ("02_text_cleaner", "Text Cleaner", "2_Text_Cleaner"),
    ("03_format_standardizer", "Format Standardizer", "3_Format_Standardizer"),
    ("04_missing_handler", "Missing Value Handler", "4_Missing_Values"),
    ("05_column_mapper", "Column Mapper", "5_Column_Mapper"),
    ("06_outlier_detector", "Outlier Detector", "6_Outlier_Detector"),
    ("07_multi_file_merger", "Multi-File Merger", "7_Multi_File_Merger"),
    ("08_validator_reporter", "Validator & Reporter", "8_Validator_Reporter"),
    ("09_pipeline_runner", "Pipeline Runner", "9_Pipeline_Runner"),
)

# Tool id -> friendly display name. Single source of truth for the GUI; the
# CLI keeps its own copy so each entrypoint stays self-contained.
TOOL_DISPLAY_NAMES: dict[str, str] = {
    tool_id: label for tool_id, label, _ in _TOOL_REGISTRY
}

# Tool id -> Streamlit page path ("pages/<stem>.py"). Ids missing from this
# mapping resolve to "" in _tool_page_slug, and the page link is then omitted.
_TOOL_PAGE_PATHS: dict[str, str] = {
    tool_id: f"pages/{stem}.py" for tool_id, _, stem in _TOOL_REGISTRY
}

# One row per severity level: (severity, icon, markdown colour name).
_SEVERITY_STYLE: tuple[tuple[str, str, str], ...] = (
    ("info", "ℹ️", "blue"),
    ("warn", "⚠️", "orange"),
    ("error", "🛑", "red"),
)

_SEVERITY_ICON: dict[str, str] = {sev: icon for sev, icon, _ in _SEVERITY_STYLE}

_SEVERITY_COLOR: dict[str, str] = {sev: color for sev, _, color in _SEVERITY_STYLE}
|
||||
|
||||
|
||||
def tool_display_name(tool_id: str) -> str:
    """Return the GUI label for *tool_id*.

    An empty (falsy) id yields ``"Informational"``. An id that is not present
    in ``TOOL_DISPLAY_NAMES`` is returned unchanged.
    """
    if not tool_id:
        return "Informational"
    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)
|
||||
|
||||
|
||||
def _tool_page_slug(tool_id: str) -> str:
    """Return the page path registered for *tool_id*, or ``""`` when unmapped."""
    try:
        return _TOOL_PAGE_PATHS[tool_id]
    except KeyError:
        return ""
|
||||
|
||||
|
||||
def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
    """Render a list of :class:`Finding` objects grouped by tool.

    Shows a heading and a per-severity count summary, then one expander per
    tool containing that tool's findings and a link to the tool's page (when
    one is registered). Findings with no tool are listed in a final
    "Other / file-level" expander.

    Args:
        findings: Sequence of finding objects. This function reads each
            item's ``severity`` and ``tool``; everything else is consumed by
            ``_render_one_finding``.
        header: Heading text rendered above the severity summary.
    """
    from src.core.analyze import findings_by_tool  # local import to avoid cycle

    if not findings:
        st.success("No issues detected. Open any tool below to start working.")
        return

    by_sev: dict[str, int] = {}
    for f in findings:
        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
    # Most severe first; severities with a zero count are left out.
    sev_summary = " · ".join(
        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
        for s in ("error", "warn", "info")
        if by_sev.get(s)
    )
    st.markdown(f"### {header}")
    st.caption(sev_summary)

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Findings without a tool get their own section below. Skip any falsy key
    # the grouping helper may emit so they are not rendered twice (and so a
    # None key cannot break the sort).
    for tool_id in sorted(t for t in grouped if t):
        items = grouped[tool_id]
        label = tool_display_name(tool_id)
        with st.expander(
            f"{label} — {len(items)} finding(s)",
            # Open the groups that contain errors so they are seen first.
            expanded=any(f.severity == "error" for f in items),
        ):
            for f in items:
                _render_one_finding(f)
            page_slug = _tool_page_slug(tool_id)
            if page_slug:
                # st.page_link resolves string paths relative to the main
                # script's directory, and the registered slugs are already in
                # that form ("pages/<name>.py"). Prefixing "src/gui/" pointed
                # at src/gui/src/gui/pages/..., which does not exist.
                # NOTE(review): assumes the app is launched from
                # src/gui/app.py — confirm.
                st.page_link(page_slug, label=f"Open {label} →")

    if untargeted:
        with st.expander(
            f"Other / file-level — {len(untargeted)} finding(s)",
            expanded=False,
        ):
            for f in untargeted:
                _render_one_finding(f)
|
||||
|
||||
|
||||
def _render_one_finding(f) -> None:
    """Render a single finding: a markdown headline plus an optional samples table.

    Args:
        f: Finding object. Reads ``severity``, ``id``, ``description`` and
            ``samples``; ``column`` is optional and only shown when truthy.
    """
    # An unrecognised severity falls back to the "info" styling instead of
    # raising KeyError and aborting the whole page render.
    color = _SEVERITY_COLOR.get(f.severity, _SEVERITY_COLOR["info"])
    icon = _SEVERITY_ICON.get(f.severity, _SEVERITY_ICON["info"])
    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )
    if f.samples:
        # presumably each sample is a (row, column, value) triple — verify
        # against the analyzer's Finding definition.
        sample_df = pd.DataFrame(
            f.samples,
            columns=["row", "column", "value"],
        )
        st.dataframe(sample_df, use_container_width=True, hide_index=True)
|
||||
|
||||
|
||||
def upload_and_analyze_section() -> None:
    """Render the upload + analyze panel for the home page.

    Stashes the uploaded file (name, size, bytes) and the analysis findings
    in ``st.session_state`` so individual tool pages can pick them up if they
    want to skip their own uploader. Each tool page already has its own
    uploader today, so this is purely additive.

    Session keys written: ``home_uploaded_name``, ``home_uploaded_size``,
    ``home_uploaded_bytes``, ``home_findings``, ``home_skipped``.
    """
    st.markdown("### 📤 Upload a file to start")
    st.caption(
        "Optional: scan an uploaded file for data quality issues and see "
        "which tools can fix each one. Skip if you already know what you need."
    )

    uploaded = st.file_uploader(
        "Upload CSV or Excel",
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
    )
    if uploaded is None:
        return

    # Stash on every fresh upload so all tool pages can pick it up. A change
    # of name or size is what counts as "fresh" here.
    if (
        st.session_state.get("home_uploaded_name") != uploaded.name
        or st.session_state.get("home_uploaded_size") != uploaded.size
    ):
        st.session_state["home_uploaded_name"] = uploaded.name
        st.session_state["home_uploaded_size"] = uploaded.size
        st.session_state["home_uploaded_bytes"] = uploaded.getvalue()
        # Drop stale findings on a new upload.
        st.session_state.pop("home_findings", None)
        st.session_state.pop("home_skipped", None)

    col_run, col_skip, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
    with col_skip:
        skip_clicked = st.button("Skip", key="home_skip_analysis")

    if skip_clicked:
        # An empty list (rather than None) records that the user made a choice.
        st.session_state["home_findings"] = []
        st.session_state["home_skipped"] = True

    if run_clicked:
        try:
            with st.spinner("Scanning…"):
                findings = _run_analysis_on_upload(uploaded)
        except Exception as exc:  # UI boundary: the file is user-supplied
            # A malformed or unreadable upload must not abort the rest of the
            # home page (the tool grid renders after this section). Report it
            # and clear results left over from a previous run.
            st.session_state.pop("home_findings", None)
            st.session_state.pop("home_skipped", None)
            st.error(f"Could not analyze {uploaded.name}: {exc}")
            return
        st.session_state["home_findings"] = findings
        st.session_state["home_skipped"] = False

    findings = st.session_state.get("home_findings")
    if findings is None:
        # Neither button has been used for this upload yet.
        return

    if st.session_state.get("home_skipped"):
        st.info("Analysis skipped. Open any tool below to start working.")
        return

    st.divider()
    render_findings_panel(findings)
|
||||
|
||||
|
||||
def _run_analysis_on_upload(uploaded):
    """Read the uploaded file with pre-parse repair, then analyze.

    Excel uploads (``.xlsx`` / ``.xls``) are read directly and analyzed.
    Everything else is treated as delimited text: the delimiter is guessed
    from the first 4 KiB, the bytes go through ``repair_bytes``, and the
    repair result is handed to ``analyze`` alongside the parsed frame.

    Args:
        uploaded: Uploaded-file object exposing ``name`` and ``getvalue()``.

    Returns:
        Whatever ``analyze`` returns for the parsed DataFrame.
    """
    from src.core.analyze import analyze
    from src.core.io import repair_bytes

    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""

    if suffix in ("xlsx", "xls"):
        # Read every cell as text and keep empty strings as-is, matching the
        # CSV path below.
        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
        return analyze(df)

    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
    text_head = data[:4096].decode("utf-8", errors="replace")
    delim = "\t" if suffix == "tsv" else ","
    if delim == ",":
        # Pick the *most frequent* alternative delimiter, then require it to
        # clearly outnumber commas. Taking the first candidate that passed the
        # threshold mis-detected e.g. a ';'-delimited file containing a single
        # stray tab and no commas as tab-delimited. max() keeps the earliest
        # candidate on ties, preserving the original priority order.
        best = max(("\t", ";", "|"), key=text_head.count)
        if text_head.count(best) > text_head.count(",") * 1.5:
            delim = best

    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8",
        delimiter=delim,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    return analyze(df, repair_result=repair)
|
||||
|
||||
|
||||
def findings_count_for_tool(tool_id: str) -> int:
    """How many findings in session state target *tool_id*; 0 when none.

    Used by the home-page tool grid to badge cards that have actionable
    findings without re-running the analyzer. Reads the ``home_findings``
    entry of ``st.session_state``.

    An empty *tool_id* always yields 0: otherwise a card without a tool id
    would match findings whose ``tool`` is an empty string and be badged with
    file-level findings that do not belong to it.
    """
    if not tool_id:
        return 0
    findings = st.session_state.get("home_findings") or []
    return sum(1 for f in findings if f.tool == tool_id)
|
||||
|
||||
Reference in New Issue
Block a user