Files
datatools-dev/src/gui/tools_registry.py
Michael 4adeb5c7f3 feat(format): per-cell standardizers + 199-row buyer corpus
Adds src/core/format_standardize.py — a per-cell standardizer for dates,
phones, emails, addresses, names, currencies, booleans — wired through
StandardizeOptions / standardize_dataframe with FieldType registry.

Includes:
- Date parser handles ISO/US/EU/longform/excel-serial/unix-timestamp/
  partial-precision/quarter notation; opt-in French/German/Spanish month
  dictionaries via month_locales.
- Phone via libphonenumber with extension preservation (;ext=N), 001
  international prefix handling, error sentinels for placeholders /
  multi-number cells.
- Email lowercase/trim/mailto/angle-bracket strip with optional
  --gmail-canonical mode.
- Address USPS abbreviation expansion or compression (expand=False per
  corpus § 6.3), state-name → 2-letter conversion, multi-line collapse,
  PO Box normalization, state-code preservation regardless of input case.
- Name handler: Mc/Mac/O'/D' inner caps, hyphen segments, particle
  lowercasing (von/van/de/da), comma-format reversal, period stripping
  for titles/suffixes/initials, PhD/MD acronym preservation, conservative
  mode for mixed-case input.
- Currency: auto-detect EU vs US separators, space-thousands, Swiss
  apostrophe, accounting parens, optional ISO code preservation, error
  sentinels for percentages/ranges/word-values/ambiguous separators.
- Per-domain error_policy ("passthrough" | "sentinel") for surfacing
  malformed values as <error: reason> per corpus § 0.3.

Test corpus from Business/DataTools/test-cases-format-cleaner copied to
test-cases/format-cleaner-corpus/ — 7 fixtures plus FORMATS-CASES.md.
tests/test_format_standardize_corpus.py drives all 199 rows through the
per-cell standardizers; 0 xfailed.

Wires the GUI page (3_Format_Standardizer.py) to "Ready" status.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 02:11:24 +00:00

153 lines
4.7 KiB
Python

"""Per-tool manifest registry.
Single source of truth for what tools exist, their display strings, and
the tier (which controls whether a tool ships in a given build SKU). The
home-page sidebar consumes this list; future per-tool packaging will
filter it via the ``tier`` field.
Adding a tool: append one ``Tool`` entry. Page filenames must match the
``page_slug`` so Streamlit's automatic page discovery picks them up.
Selling subsets: builds can filter ``TOOLS`` by tier or tool_id at
import time — no other code changes required, since pages key off
``tool_id`` for findings badges and the home grid renders whatever's in
the filtered list.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
# Packaging tier of a tool; per the module docstring this controls whether a
# tool ships in a given build SKU.
Tier = Literal["core", "pro", "enterprise"]
# Readiness label of a tool; ``Tool.status`` uses it to drive the card badge.
Status = Literal["Ready", "Coming Soon"]
@dataclass(frozen=True)
class Tool:
    """Immutable manifest record describing a single tool.

    Attributes:
        tool_id: Stable identifier matching the analyzer's tool field.
        icon: Single-glyph icon shown on the home grid card.
        name: Display name, used for the sidebar and the card title.
        description: One-sentence body text for the card.
        page_slug: Streamlit page filename without the ".py" suffix
            (e.g. "1_Deduplicator").
        status: "Ready" or "Coming Soon"; drives the card badge color.
        tier: Build-time gating hook. Defaults to "core", which is what
            every tool uses today.
    """

    tool_id: str
    icon: str
    name: str
    description: str
    page_slug: str
    status: Status
    tier: Tier = "core"
# Ordered manifest of every tool. The home page consumes this list, so the
# order here is the order entries are encountered by any consumer iterating it.
# Each ``page_slug`` must match a Streamlit page filename (without ".py").
TOOLS: list[Tool] = [
    Tool(
        tool_id="01_deduplicator",
        icon="🔍",
        name="Deduplicator",
        description=(
            "Fuzzy matching, normalization, survivor selection, and "
            "interactive review."
        ),
        page_slug="1_Deduplicator",
        status="Ready",
    ),
    Tool(
        tool_id="02_text_cleaner",
        icon="✂️",
        name="Text Cleaner",
        description=(
            "Whitespace trim, multi-space collapse, Unicode normalization, "
            "BOM and line-ending handling."
        ),
        page_slug="2_Text_Cleaner",
        status="Ready",
    ),
    Tool(
        tool_id="03_format_standardizer",
        icon="📐",
        name="Format Standardizer",
        description=(
            "Standardize dates, currencies, names, phone numbers, and addresses."
        ),
        page_slug="3_Format_Standardizer",
        status="Ready",
    ),
    Tool(
        tool_id="04_missing_handler",
        icon="🕳️",
        name="Missing Value Handler",
        description=(
            "Detect disguised nulls, missingness analysis, and imputation strategies."
        ),
        page_slug="4_Missing_Values",
        status="Coming Soon",
    ),
    Tool(
        tool_id="05_column_mapper",
        icon="🗂️",
        name="Column Mapper",
        description="Rename columns, enforce a target schema, and coerce types.",
        page_slug="5_Column_Mapper",
        status="Coming Soon",
    ),
    Tool(
        tool_id="06_outlier_detector",
        icon="📊",
        name="Outlier Detector",
        description=(
            "Z-score, IQR, and MAD detection with domain-rule violations and "
            "winsorization."
        ),
        page_slug="6_Outlier_Detector",
        status="Coming Soon",
    ),
    Tool(
        tool_id="07_multi_file_merger",
        icon="📎",
        name="Multi-File Merger",
        description="Combine multiple CSV/Excel files with schema alignment.",
        page_slug="7_Multi_File_Merger",
        status="Coming Soon",
    ),
    Tool(
        tool_id="08_validator_reporter",
        # NOTE(review): this icon was an empty string, which breaks the
        # single-glyph contract every other entry follows. The glyph below
        # is a stand-in; confirm the intended icon.
        icon="✅",
        name="Validator & Reporter",
        description=(
            "Validate against rules and generate PDF/Excel quality reports."
        ),
        page_slug="8_Validator_Reporter",
        status="Coming Soon",
    ),
    Tool(
        tool_id="09_pipeline_runner",
        icon="⚙️",
        name="Pipeline Runner",
        description=(
            "Chain tools in recommended order and pass output between steps."
        ),
        page_slug="9_Pipeline_Runner",
        status="Coming Soon",
    ),
]
def tools_for_tier(*tiers: Tier) -> list[Tool]:
    """Return the tools whose ``tier`` is one of *tiers*.

    Intended for build-time slicing, so per-tool packaging can ship only
    the relevant pages and home-grid cards. Calling it with no arguments
    yields a fresh list containing every registered tool.
    """
    wanted = frozenset(tiers)
    if not wanted:
        # No filter requested: hand back a copy so callers cannot mutate
        # the registry list itself.
        return TOOLS.copy()
    return [tool for tool in TOOLS if tool.tier in wanted]
def tool_by_id(tool_id: str) -> Tool | None:
    """Look up a manifest entry by its ``tool_id``.

    Returns the first entry in ``TOOLS`` whose ``tool_id`` equals the
    argument, or ``None`` when no entry matches.
    """
    for candidate in TOOLS:
        if candidate.tool_id == tool_id:
            return candidate
    return None
def display_name(tool_id: str) -> str:
    """Resolve *tool_id* to its human-readable name.

    Unknown ids are returned unchanged so callers always get a printable
    string.
    """
    entry = tool_by_id(tool_id)
    if entry is None:
        return tool_id
    return entry.name