From f891c6116d16c1ceffb2a1b61fb69fc47075d094 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 29 Apr 2026 20:56:21 +0000 Subject: [PATCH] refactor(gui): tool registry + components package for per-tool builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two low-risk seam moves to enable selling per-tool subsets without breaking the existing all-in-one bundle. Behaviour identical; every existing import still resolves; full pytest suite + every page returns HTTP 200. 1. **Tool registry** (src/gui/tools_registry.py) — replaces the inline dict-of-dicts in app.py with a Tool dataclass and a TOOLS list. Adds a tier field ("core" today, "pro" / "enterprise" later) and tools_for_tier() / tool_by_id() / display_name() helpers. A per-tool build slices TOOLS at import time without code changes. 2. **components package** (src/gui/components/) — converts the former single components.py into a package with: _legacy.py — original file, unchanged. __init__.py — re-exports the legacy surface; existing "from src.gui.components import …" calls continue to work. shared.py — hide_streamlit_chrome, pickup_or_upload (every build needs these). gate.py — require_normalization_gate (Pro / Suite SKUs). findings.py — analyzer-finding widgets (drops out of a standalone-Dedup build). dedup_review.py — match-group cards + apply pipeline (drops out of a non-dedup build). The seam modules are narrow re-exports today. As code migrates out of _legacy.py into the focused modules, the public import path stays stable via the shim. E2E: 765 passed, 17 xfailed (unchanged); home page + all 9 tool pages + Review page render HTTP 200; full pipeline (analyze → auto_fix → apply_decisions → output bytes) round-trips on the kitchen-sink fixture with zero high-confidence findings remaining post-fix. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/app.py | 85 +--------- src/gui/components/__init__.py | 57 +++++++ .../{components.py => components/_legacy.py} | 0 src/gui/components/dedup_review.py | 24 +++ src/gui/components/findings.py | 25 +++ src/gui/components/gate.py | 16 ++ src/gui/components/shared.py | 14 ++ src/gui/tools_registry.py | 152 ++++++++++++++++++ 8 files changed, 294 insertions(+), 79 deletions(-) create mode 100644 src/gui/components/__init__.py rename src/gui/{components.py => components/_legacy.py} (100%) create mode 100644 src/gui/components/dedup_review.py create mode 100644 src/gui/components/findings.py create mode 100644 src/gui/components/gate.py create mode 100644 src/gui/components/shared.py create mode 100644 src/gui/tools_registry.py diff --git a/src/gui/app.py b/src/gui/app.py index 7943e6e..2ace7bf 100644 --- a/src/gui/app.py +++ b/src/gui/app.py @@ -57,80 +57,7 @@ st.divider() # Tool cards # --------------------------------------------------------------------------- -TOOLS = [ - { - "icon": "🔍", - "name": "Deduplicator", - "description": "Fuzzy matching, normalization, survivor selection, and interactive review.", - "status": "Ready", - "page": "1_Deduplicator", - "tool_id": "01_deduplicator", - }, - { - "icon": "✂️", - "name": "Text Cleaner", - "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.", - "status": "Ready", - "page": "2_Text_Cleaner", - "tool_id": "02_text_cleaner", - }, - { - "icon": "📐", - "name": "Format Standardizer", - "description": "Standardize dates, currencies, names, phone numbers, and addresses.", - "status": "Coming Soon", - "page": "3_Format_Standardizer", - "tool_id": "03_format_standardizer", - }, - { - "icon": "🕳️", - "name": "Missing Value Handler", - "description": "Detect disguised nulls, missingness analysis, and imputation strategies.", - "status": "Coming Soon", - "page": "4_Missing_Values", - "tool_id": "04_missing_handler", - }, 
- { - "icon": "🗂️", - "name": "Column Mapper", - "description": "Rename columns, enforce a target schema, and coerce types.", - "status": "Coming Soon", - "page": "5_Column_Mapper", - "tool_id": "05_column_mapper", - }, - { - "icon": "📊", - "name": "Outlier Detector", - "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.", - "status": "Coming Soon", - "page": "6_Outlier_Detector", - "tool_id": "06_outlier_detector", - }, - { - "icon": "📎", - "name": "Multi-File Merger", - "description": "Combine multiple CSV/Excel files with schema alignment.", - "status": "Coming Soon", - "page": "7_Multi_File_Merger", - "tool_id": "07_multi_file_merger", - }, - { - "icon": "✅", - "name": "Validator & Reporter", - "description": "Validate against rules and generate PDF/Excel quality reports.", - "status": "Coming Soon", - "page": "8_Validator_Reporter", - "tool_id": "08_validator_reporter", - }, - { - "icon": "⚙️", - "name": "Pipeline Runner", - "description": "Chain tools in recommended order and pass output between steps.", - "status": "Coming Soon", - "page": "9_Pipeline_Runner", - "tool_id": "09_pipeline_runner", - }, -] +from src.gui.tools_registry import TOOLS # Render tool cards in a 3-column grid. 
Cards picked up by the analyzer get a # coloured "N findings" badge so the user can see at a glance which tools @@ -143,15 +70,15 @@ for row_start in range(0, len(TOOLS), 3): break tool = TOOLS[idx] with col: - status_color = "green" if tool["status"] == "Ready" else "orange" + status_color = "green" if tool.status == "Ready" else "orange" badge = "" - n = findings_count_for_tool(tool.get("tool_id", "")) + n = findings_count_for_tool(tool.tool_id) if n: badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]" st.markdown( - f"### {tool['icon']} {tool['name']}{badge}\n\n" - f"{tool['description']}\n\n" - f":{status_color}[**{tool['status']}**]" + f"### {tool.icon} {tool.name}{badge}\n\n" + f"{tool.description}\n\n" + f":{status_color}[**{tool.status}**]" ) diff --git a/src/gui/components/__init__.py b/src/gui/components/__init__.py new file mode 100644 index 0000000..0b7c26a --- /dev/null +++ b/src/gui/components/__init__.py @@ -0,0 +1,57 @@ +"""Reusable Streamlit widgets for the DataTools GUI. + +This package replaces the former single ``components.py`` module. Public +behaviour is identical — every name that used to be importable from +``src.gui.components`` is still importable from the same path because +this ``__init__`` re-exports the legacy surface in full. + +The package layout exists so per-tool builds can ship only the seams +they need without dragging the entire kitchen-sink module: + + components/ + __init__.py ← compatibility shim (this file) + _legacy.py ← original components.py, unchanged + gate.py ← gate-only seam (require_normalization_gate) + findings.py ← analyzer-finding rendering seam + dedup_review.py ← dedup match-group cards + review pipeline + shared.py ← chrome / file-pickup helpers used by every tool + +A standalone Deduplicator build will eventually ship without +``findings.py`` and ``gate.py`` (the analyzer / gate seams); today every +seam, and this ``__init__``, still imports ``_legacy`` in full. 
+ +Adding new tooling: drop new helpers into the appropriate seam module. +Add their names to its ``__all__`` and to this file's ``__all__`` if +they should remain importable from ``src.gui.components`` directly. +""" + +from __future__ import annotations + +# Re-export the full legacy surface so existing pages continue to +# import unchanged. Once individual tool packages start consuming +# the focused seam modules directly, names can migrate out of +# _legacy.py without breaking those imports — this shim is what +# absorbs the move. +from ._legacy import * # noqa: F401,F403 +from . import _legacy as _legacy # noqa: F401 (keep for direct access) + +# Names exported from _legacy.py that pages currently use. Kept here as +# the canonical public list so a removal from _legacy is a visible +# breaking change instead of a silent drop. +__all__ = [ + # Shared chrome / pickup / gate + "hide_streamlit_chrome", + "pickup_or_upload", + "require_normalization_gate", + # Dedup widgets + "config_panel", + "match_group_card", + "results_summary", + "apply_review_decisions", + # Analyzer widgets + "tool_display_name", + "render_findings_panel", + "render_hidden_aware_preview", + "upload_and_analyze_section", + "findings_count_for_tool", +] diff --git a/src/gui/components.py b/src/gui/components/_legacy.py similarity index 100% rename from src/gui/components.py rename to src/gui/components/_legacy.py diff --git a/src/gui/components/dedup_review.py b/src/gui/components/dedup_review.py new file mode 100644 index 0000000..50b543c --- /dev/null +++ b/src/gui/components/dedup_review.py @@ -0,0 +1,24 @@ +"""Dedup match-group cards and review pipeline. + +The interactive dedup-review surface — config panel, match-group cards, +results summary, and apply-decisions glue. This is the largest single +chunk of the GUI by line count; isolating it in a seam module means a +non-dedup SKU never has to import it (and never has to drag in +``src.core.dedup`` along the way). 
+""" + +from __future__ import annotations + +from ._legacy import ( + apply_review_decisions, + config_panel, + match_group_card, + results_summary, +) + +__all__ = [ + "apply_review_decisions", + "config_panel", + "match_group_card", + "results_summary", +] diff --git a/src/gui/components/findings.py b/src/gui/components/findings.py new file mode 100644 index 0000000..3031bad --- /dev/null +++ b/src/gui/components/findings.py @@ -0,0 +1,25 @@ +"""Analyzer-finding widgets. + +Surfaces for the analyzer's home-page section, the per-tool findings +panel, and the hidden-character-aware preview table. A build that +doesn't ship the analyzer (e.g. standalone Dedup-only) does not need +this module — its import would drag ``src.core.analyze`` along with it. +""" + +from __future__ import annotations + +from ._legacy import ( + findings_count_for_tool, + render_findings_panel, + render_hidden_aware_preview, + tool_display_name, + upload_and_analyze_section, +) + +__all__ = [ + "findings_count_for_tool", + "render_findings_panel", + "render_hidden_aware_preview", + "tool_display_name", + "upload_and_analyze_section", +] diff --git a/src/gui/components/gate.py b/src/gui/components/gate.py new file mode 100644 index 0000000..5b710f5 --- /dev/null +++ b/src/gui/components/gate.py @@ -0,0 +1,16 @@ +"""Normalization-gate guard for tool pages. + +``require_normalization_gate`` short-circuits a tool page when the +current upload has not yet passed the gate, redirecting the user to the +Review & Normalize page. Pulled into its own seam module so: + +* A build that includes the gate (Pro / Suite SKUs) imports this. +* A standalone single-tool build that bypasses the gate can omit this + module entirely without removing the helper from a shared file. 
+""" + +from __future__ import annotations + +from ._legacy import require_normalization_gate + +__all__ = ["require_normalization_gate"] diff --git a/src/gui/components/shared.py b/src/gui/components/shared.py new file mode 100644 index 0000000..da4c603 --- /dev/null +++ b/src/gui/components/shared.py @@ -0,0 +1,14 @@ +"""Chrome and file-pickup helpers — every build needs these. + +This is the smallest seam: any DataTools build, regardless of which +tools it bundles, needs ``hide_streamlit_chrome`` (the app-like UI +polish) and ``pickup_or_upload`` (lets a tool page reuse the home-page +upload). Today it only re-exports from ``_legacy`` (still loaded by the +package ``__init__``); the import graph tightens once code migrates here. +""" + +from __future__ import annotations + +from ._legacy import hide_streamlit_chrome, pickup_or_upload + +__all__ = ["hide_streamlit_chrome", "pickup_or_upload"] diff --git a/src/gui/tools_registry.py b/src/gui/tools_registry.py new file mode 100644 index 0000000..5f5ccaa --- /dev/null +++ b/src/gui/tools_registry.py @@ -0,0 +1,152 @@ +"""Per-tool manifest registry. + +Single source of truth for what tools exist, their display strings, and +the tier (which controls whether a tool ships in a given build SKU). The +home-page tool grid consumes this list; future per-tool packaging will +filter it via the ``tier`` field. + +Adding a tool: append one ``Tool`` entry. Page filenames must match the +``page_slug`` so Streamlit's automatic page discovery picks them up. + +Selling subsets: builds can filter ``TOOLS`` by tier or tool_id at +import time — no other code changes required, since pages key off +``tool_id`` for findings badges and the home grid renders whatever's in +the filtered list. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + + +Tier = Literal["core", "pro", "enterprise"] +Status = Literal["Ready", "Coming Soon"] + + +@dataclass(frozen=True) +class Tool: + """One tool's manifest entry.""" + + tool_id: str # Stable identifier matching the analyzer's tool field. + icon: str # Single-glyph icon for the home grid card. + name: str # Display name (sidebar + card title). + description: str # One-sentence card body. + page_slug: str # Streamlit page filename without ".py" (e.g. "1_Deduplicator"). + status: Status # "Ready" or "Coming Soon" — drives the card badge color. + tier: Tier = "core" # Build-time gating hook; every tool is "core" today. + + +TOOLS: list[Tool] = [ + Tool( + tool_id="01_deduplicator", + icon="🔍", + name="Deduplicator", + description=( + "Fuzzy matching, normalization, survivor selection, and " + "interactive review." + ), + page_slug="1_Deduplicator", + status="Ready", + ), + Tool( + tool_id="02_text_cleaner", + icon="✂️", + name="Text Cleaner", + description=( + "Whitespace trim, multi-space collapse, Unicode normalization, " + "BOM and line-ending handling." + ), + page_slug="2_Text_Cleaner", + status="Ready", + ), + Tool( + tool_id="03_format_standardizer", + icon="📐", + name="Format Standardizer", + description=( + "Standardize dates, currencies, names, phone numbers, and addresses." + ), + page_slug="3_Format_Standardizer", + status="Coming Soon", + ), + Tool( + tool_id="04_missing_handler", + icon="🕳️", + name="Missing Value Handler", + description=( + "Detect disguised nulls, missingness analysis, and imputation strategies." 
+ ), + page_slug="4_Missing_Values", + status="Coming Soon", + ), + Tool( + tool_id="05_column_mapper", + icon="🗂️", + name="Column Mapper", + description="Rename columns, enforce a target schema, and coerce types.", + page_slug="5_Column_Mapper", + status="Coming Soon", + ), + Tool( + tool_id="06_outlier_detector", + icon="📊", + name="Outlier Detector", + description=( + "Z-score, IQR, and MAD detection with domain-rule violations and " + "winsorization." + ), + page_slug="6_Outlier_Detector", + status="Coming Soon", + ), + Tool( + tool_id="07_multi_file_merger", + icon="📎", + name="Multi-File Merger", + description="Combine multiple CSV/Excel files with schema alignment.", + page_slug="7_Multi_File_Merger", + status="Coming Soon", + ), + Tool( + tool_id="08_validator_reporter", + icon="✅", + name="Validator & Reporter", + description=( + "Validate against rules and generate PDF/Excel quality reports." + ), + page_slug="8_Validator_Reporter", + status="Coming Soon", + ), + Tool( + tool_id="09_pipeline_runner", + icon="⚙️", + name="Pipeline Runner", + description=( + "Chain tools in recommended order and pass output between steps." + ), + page_slug="9_Pipeline_Runner", + status="Coming Soon", + ), +] + + +def tools_for_tier(*tiers: Tier) -> list[Tool]: + """Subset filter for build-time slicing. + + Empty *tiers* returns every tool. Used by per-tool packaging to ship + only the relevant subset of pages and home-grid cards. + """ + if not tiers: + return list(TOOLS) + keep = set(tiers) + return [t for t in TOOLS if t.tier in keep] + + +def tool_by_id(tool_id: str) -> Tool | None: + return next((t for t in TOOLS if t.tool_id == tool_id), None) + + +def display_name(tool_id: str) -> str: + """Return the human-readable name; fall back to the id when unknown.""" + t = tool_by_id(tool_id) + return t.name if t else tool_id