refactor(gui): tool registry + components package for per-tool builds
Two low-risk seam moves to enable selling per-tool subsets without
breaking the existing all-in-one bundle. Behaviour identical; every
existing import still resolves; the full pytest suite passes and every page returns
HTTP 200.
1. **Tool registry** (src/gui/tools_registry.py) — replaces the
inline list of dicts in app.py with a Tool dataclass and a TOOLS
list. Adds a tier field ("core" today, "pro" / "enterprise" later)
and tools_for_tier() / tool_by_id() / display_name() helpers. A
per-tool build slices TOOLS at import time without code changes.
2. **components package** (src/gui/components/) — converts the former
single components.py into a package with:
_legacy.py — original file, unchanged.
__init__.py — re-exports the legacy surface; existing
"from src.gui.components import …" calls
continue to work.
shared.py — hide_streamlit_chrome, pickup_or_upload
(every build needs these).
gate.py — require_normalization_gate (Pro / Suite SKUs).
findings.py — analyzer-finding widgets (drops out of a
standalone-Dedup build).
dedup_review.py — match-group cards + apply pipeline (drops out
of a non-dedup build).
The seam modules are narrow re-exports today. As code migrates out
of _legacy.py into the focused modules, the public import path
stays stable via the shim.
E2E: 765 passed, 17 xfailed (unchanged); home page + all 9 tool pages
+ Review page render HTTP 200; full pipeline (analyze → auto_fix →
apply_decisions → output bytes) round-trips on the kitchen-sink
fixture with zero high-confidence findings remaining post-fix.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -57,80 +57,7 @@ st.divider()
|
|||||||
# Tool cards
|
# Tool cards
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
TOOLS = [
|
from src.gui.tools_registry import TOOLS
|
||||||
{
|
|
||||||
"icon": "🔍",
|
|
||||||
"name": "Deduplicator",
|
|
||||||
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
|
|
||||||
"status": "Ready",
|
|
||||||
"page": "1_Deduplicator",
|
|
||||||
"tool_id": "01_deduplicator",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "✂️",
|
|
||||||
"name": "Text Cleaner",
|
|
||||||
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
|
|
||||||
"status": "Ready",
|
|
||||||
"page": "2_Text_Cleaner",
|
|
||||||
"tool_id": "02_text_cleaner",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "📐",
|
|
||||||
"name": "Format Standardizer",
|
|
||||||
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "3_Format_Standardizer",
|
|
||||||
"tool_id": "03_format_standardizer",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "🕳️",
|
|
||||||
"name": "Missing Value Handler",
|
|
||||||
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "4_Missing_Values",
|
|
||||||
"tool_id": "04_missing_handler",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "🗂️",
|
|
||||||
"name": "Column Mapper",
|
|
||||||
"description": "Rename columns, enforce a target schema, and coerce types.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "5_Column_Mapper",
|
|
||||||
"tool_id": "05_column_mapper",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "📊",
|
|
||||||
"name": "Outlier Detector",
|
|
||||||
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "6_Outlier_Detector",
|
|
||||||
"tool_id": "06_outlier_detector",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "📎",
|
|
||||||
"name": "Multi-File Merger",
|
|
||||||
"description": "Combine multiple CSV/Excel files with schema alignment.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "7_Multi_File_Merger",
|
|
||||||
"tool_id": "07_multi_file_merger",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "✅",
|
|
||||||
"name": "Validator & Reporter",
|
|
||||||
"description": "Validate against rules and generate PDF/Excel quality reports.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "8_Validator_Reporter",
|
|
||||||
"tool_id": "08_validator_reporter",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"icon": "⚙️",
|
|
||||||
"name": "Pipeline Runner",
|
|
||||||
"description": "Chain tools in recommended order and pass output between steps.",
|
|
||||||
"status": "Coming Soon",
|
|
||||||
"page": "9_Pipeline_Runner",
|
|
||||||
"tool_id": "09_pipeline_runner",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
|
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
|
||||||
# coloured "N findings" badge so the user can see at a glance which tools
|
# coloured "N findings" badge so the user can see at a glance which tools
|
||||||
@@ -143,15 +70,15 @@ for row_start in range(0, len(TOOLS), 3):
|
|||||||
break
|
break
|
||||||
tool = TOOLS[idx]
|
tool = TOOLS[idx]
|
||||||
with col:
|
with col:
|
||||||
status_color = "green" if tool["status"] == "Ready" else "orange"
|
status_color = "green" if tool.status == "Ready" else "orange"
|
||||||
badge = ""
|
badge = ""
|
||||||
n = findings_count_for_tool(tool.get("tool_id", ""))
|
n = findings_count_for_tool(tool.tool_id)
|
||||||
if n:
|
if n:
|
||||||
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
|
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
|
||||||
st.markdown(
|
st.markdown(
|
||||||
f"### {tool['icon']} {tool['name']}{badge}\n\n"
|
f"### {tool.icon} {tool.name}{badge}\n\n"
|
||||||
f"{tool['description']}\n\n"
|
f"{tool.description}\n\n"
|
||||||
f":{status_color}[**{tool['status']}**]"
|
f":{status_color}[**{tool.status}**]"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
57
src/gui/components/__init__.py
Normal file
57
src/gui/components/__init__.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
"""Reusable Streamlit widgets for the DataTools GUI.
|
||||||
|
|
||||||
|
This package replaces the former single ``components.py`` module. Public
|
||||||
|
behaviour is identical — every name that used to be importable from
|
||||||
|
``src.gui.components`` is still importable from the same path because
|
||||||
|
this ``__init__`` re-exports the legacy surface in full.
|
||||||
|
|
||||||
|
The package layout exists so per-tool builds can ship only the seams
|
||||||
|
they need without dragging the entire kitchen-sink module:
|
||||||
|
|
||||||
|
components/
|
||||||
|
__init__.py ← compatibility shim (this file)
|
||||||
|
_legacy.py ← original components.py, unchanged
|
||||||
|
gate.py ← gate-only seam (require_normalization_gate)
|
||||||
|
findings.py ← analyzer-finding rendering seam
|
||||||
|
dedup_review.py ← dedup match-group cards + review pipeline
|
||||||
|
shared.py ← chrome / file-pickup helpers used by every tool
|
||||||
|
|
||||||
|
A standalone Deduplicator build, for example, can ship without
|
||||||
|
``findings.py`` and ``gate.py`` — those modules import the analyzer /
|
||||||
|
gate code that the Lite SKU does not include.
|
||||||
|
|
||||||
|
Adding new tooling: drop new helpers into the appropriate seam module.
|
||||||
|
Add their names to its ``__all__`` and to this file's ``__all__`` if
|
||||||
|
they should remain importable from ``src.gui.components`` directly.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Re-export the full legacy surface so existing pages continue to
|
||||||
|
# import unchanged. Once individual tool packages start consuming
|
||||||
|
# the focused seam modules directly, names can migrate out of
|
||||||
|
# _legacy.py without breaking those imports — this shim is what
|
||||||
|
# absorbs the move.
|
||||||
|
from ._legacy import * # noqa: F401,F403
|
||||||
|
from . import _legacy as _legacy # noqa: F401 (keep for direct access)
|
||||||
|
|
||||||
|
# Names exported from _legacy.py that pages currently use. Kept here as
|
||||||
|
# the canonical public list so a removal from _legacy is a visible
|
||||||
|
# breaking change instead of a silent drop.
|
||||||
|
__all__ = [
|
||||||
|
# Shared chrome / pickup / gate
|
||||||
|
"hide_streamlit_chrome",
|
||||||
|
"pickup_or_upload",
|
||||||
|
"require_normalization_gate",
|
||||||
|
# Dedup widgets
|
||||||
|
"config_panel",
|
||||||
|
"match_group_card",
|
||||||
|
"results_summary",
|
||||||
|
"apply_review_decisions",
|
||||||
|
# Analyzer widgets
|
||||||
|
"tool_display_name",
|
||||||
|
"render_findings_panel",
|
||||||
|
"render_hidden_aware_preview",
|
||||||
|
"upload_and_analyze_section",
|
||||||
|
"findings_count_for_tool",
|
||||||
|
]
|
||||||
24
src/gui/components/dedup_review.py
Normal file
24
src/gui/components/dedup_review.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
"""Dedup match-group cards and review pipeline.
|
||||||
|
|
||||||
|
The interactive dedup-review surface — config panel, match-group cards,
|
||||||
|
results summary, and apply-decisions glue. This is the largest single
|
||||||
|
chunk of the GUI by line count; isolating it in a seam module means a
|
||||||
|
non-dedup SKU never has to import it (and never has to drag in
|
||||||
|
``src.core.dedup`` along the way).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ._legacy import (
|
||||||
|
apply_review_decisions,
|
||||||
|
config_panel,
|
||||||
|
match_group_card,
|
||||||
|
results_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"apply_review_decisions",
|
||||||
|
"config_panel",
|
||||||
|
"match_group_card",
|
||||||
|
"results_summary",
|
||||||
|
]
|
||||||
25
src/gui/components/findings.py
Normal file
25
src/gui/components/findings.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
"""Analyzer-finding widgets.
|
||||||
|
|
||||||
|
Surfaces for the analyzer's home-page section, the per-tool findings
|
||||||
|
panel, and the hidden-character-aware preview table. A build that
|
||||||
|
doesn't ship the analyzer (e.g. standalone Dedup-only) does not need
|
||||||
|
this module — its import would drag ``src.core.analyze`` along with it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ._legacy import (
|
||||||
|
findings_count_for_tool,
|
||||||
|
render_findings_panel,
|
||||||
|
render_hidden_aware_preview,
|
||||||
|
tool_display_name,
|
||||||
|
upload_and_analyze_section,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"findings_count_for_tool",
|
||||||
|
"render_findings_panel",
|
||||||
|
"render_hidden_aware_preview",
|
||||||
|
"tool_display_name",
|
||||||
|
"upload_and_analyze_section",
|
||||||
|
]
|
||||||
16
src/gui/components/gate.py
Normal file
16
src/gui/components/gate.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
"""Normalization-gate guard for tool pages.
|
||||||
|
|
||||||
|
``require_normalization_gate`` short-circuits a tool page when the
|
||||||
|
current upload has not yet passed the gate, redirecting the user to the
|
||||||
|
Review & Normalize page. Pulled into its own seam module so:
|
||||||
|
|
||||||
|
* A build that includes the gate (Pro / Suite SKUs) imports this.
|
||||||
|
* A standalone single-tool build that bypasses the gate can omit this
|
||||||
|
module entirely without removing the helper from a shared file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ._legacy import require_normalization_gate
|
||||||
|
|
||||||
|
__all__ = ["require_normalization_gate"]
|
||||||
14
src/gui/components/shared.py
Normal file
14
src/gui/components/shared.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
"""Chrome and file-pickup helpers — every build needs these.
|
||||||
|
|
||||||
|
This is the smallest seam: any DataTools build, regardless of which
|
||||||
|
tools it bundles, needs ``hide_streamlit_chrome`` (the app-like UI
|
||||||
|
polish) and ``pickup_or_upload`` (lets a tool page reuse the home-page
|
||||||
|
upload). Importing from here instead of the kitchen-sink ``components``
|
||||||
|
package is intended to keep a Lite build's dependency graph tight (today it still loads ``_legacy`` through the package ``__init__``).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ._legacy import hide_streamlit_chrome, pickup_or_upload
|
||||||
|
|
||||||
|
__all__ = ["hide_streamlit_chrome", "pickup_or_upload"]
|
||||||
152
src/gui/tools_registry.py
Normal file
152
src/gui/tools_registry.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""Per-tool manifest registry.
|
||||||
|
|
||||||
|
Single source of truth for what tools exist, their display strings, and
|
||||||
|
the tier (which controls whether a tool ships in a given build SKU). The
|
||||||
|
home-page sidebar consumes this list; future per-tool packaging will
|
||||||
|
filter it via the ``tier`` field.
|
||||||
|
|
||||||
|
Adding a tool: append one ``Tool`` entry. Page filenames must match the
|
||||||
|
``page_slug`` so Streamlit's automatic page discovery picks them up.
|
||||||
|
|
||||||
|
Selling subsets: builds can filter ``TOOLS`` by tier or tool_id at
|
||||||
|
import time — no other code changes required, since pages key off
|
||||||
|
``tool_id`` for findings badges and the home grid renders whatever's in
|
||||||
|
the filtered list.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
|
||||||
|
Tier = Literal["core", "pro", "enterprise"]
|
||||||
|
Status = Literal["Ready", "Coming Soon"]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Tool:
    """Immutable manifest record describing a single tool.

    Attributes:
        tool_id: Stable identifier matching the analyzer's tool field.
        icon: Single-glyph icon for the home grid card.
        name: Display name (sidebar + card title).
        description: One-sentence card body.
        page_slug: Streamlit page filename without ".py"
            (e.g. "1_Deduplicator").
        status: "Ready" or "Coming Soon" — drives the card badge color.
        tier: Build-time gating hook; every tool is "core" today.
    """

    tool_id: str
    icon: str
    name: str
    description: str
    page_slug: str
    status: Status
    tier: Tier = "core"
|
||||||
|
|
||||||
|
|
||||||
|
# Ordered manifest of every tool. The list order is the order in which the
# home grid renders cards. No entry sets ``tier``, so each one takes the
# dataclass default.
TOOLS: list[Tool] = [
    Tool(
        tool_id="01_deduplicator",
        name="Deduplicator",
        icon="🔍",
        page_slug="1_Deduplicator",
        status="Ready",
        description=(
            "Fuzzy matching, normalization, survivor selection, "
            "and interactive review."
        ),
    ),
    Tool(
        tool_id="02_text_cleaner",
        name="Text Cleaner",
        icon="✂️",
        page_slug="2_Text_Cleaner",
        status="Ready",
        description=(
            "Whitespace trim, multi-space collapse, Unicode normalization, BOM "
            "and line-ending handling."
        ),
    ),
    Tool(
        tool_id="03_format_standardizer",
        name="Format Standardizer",
        icon="📐",
        page_slug="3_Format_Standardizer",
        status="Coming Soon",
        description=(
            "Standardize dates, currencies, names, phone numbers, "
            "and addresses."
        ),
    ),
    Tool(
        tool_id="04_missing_handler",
        name="Missing Value Handler",
        icon="🕳️",
        page_slug="4_Missing_Values",
        status="Coming Soon",
        description=(
            "Detect disguised nulls, missingness analysis, "
            "and imputation strategies."
        ),
    ),
    Tool(
        tool_id="05_column_mapper",
        name="Column Mapper",
        icon="🗂️",
        page_slug="5_Column_Mapper",
        status="Coming Soon",
        description=(
            "Rename columns, enforce a target schema, "
            "and coerce types."
        ),
    ),
    Tool(
        tool_id="06_outlier_detector",
        name="Outlier Detector",
        icon="📊",
        page_slug="6_Outlier_Detector",
        status="Coming Soon",
        description=(
            "Z-score, IQR, and MAD detection with domain-rule violations "
            "and winsorization."
        ),
    ),
    Tool(
        tool_id="07_multi_file_merger",
        name="Multi-File Merger",
        icon="📎",
        page_slug="7_Multi_File_Merger",
        status="Coming Soon",
        description=(
            "Combine multiple CSV/Excel files "
            "with schema alignment."
        ),
    ),
    Tool(
        tool_id="08_validator_reporter",
        name="Validator & Reporter",
        icon="✅",
        page_slug="8_Validator_Reporter",
        status="Coming Soon",
        description=(
            "Validate against rules and generate "
            "PDF/Excel quality reports."
        ),
    ),
    Tool(
        tool_id="09_pipeline_runner",
        name="Pipeline Runner",
        icon="⚙️",
        page_slug="9_Pipeline_Runner",
        status="Coming Soon",
        description=(
            "Chain tools in recommended order "
            "and pass output between steps."
        ),
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
def tools_for_tier(*tiers: Tier) -> list[Tool]:
    """Return the entries of ``TOOLS`` whose ``tier`` is one of *tiers*.

    Called with no arguments, this returns a new list holding every tool,
    so callers can mutate the result without touching ``TOOLS``. Manifest
    order is preserved in both cases. Intended for per-tool packaging that
    ships only a subset of pages and home-grid cards.
    """
    if tiers:
        wanted = set(tiers)
        return [tool for tool in TOOLS if tool.tier in wanted]
    return list(TOOLS)
|
||||||
|
|
||||||
|
|
||||||
|
def tool_by_id(tool_id: str) -> Tool | None:
    """Return the first ``TOOLS`` entry whose ``tool_id`` equals *tool_id*.

    Returns ``None`` when no entry matches.
    """
    for candidate in TOOLS:
        if candidate.tool_id == tool_id:
            return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def display_name(tool_id: str) -> str:
    """Return the display name registered for *tool_id*.

    An id with no matching registry entry is returned unchanged, so callers
    always get a printable string.
    """
    match = tool_by_id(tool_id)
    if not match:
        return tool_id
    return match.name
|
||||||
Reference in New Issue
Block a user