feat: refactor GUI to multi-page Streamlit app with 9 tool pages

Convert the single-page deduplicator into a multi-page suite. The home page shows a grid of tool cards. The Deduplicator is extracted to its own page (fully working). 8 stub pages are added for Text Cleaner, Format Standardizer, Missing Values, Column Mapper, Outlier Detector, Multi-File Merger, Validator & Reporter, and Pipeline Runner — each with functional file upload and a coming-soon UI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-29 01:16:12 +00:00
parent 9ec371a85f
commit f2fdc10af7
10 changed files with 1175 additions and 330 deletions
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -1,4 +1,4 @@
-"""DataTools Deduplicator — Streamlit GUI.
+"""DataTools — Data Cleaning Mastery Suite.

 Launch:
    streamlit run src/gui/app.py
@@ -6,11 +6,9 @@ Launch:

 from __future__ import annotations

-import io
 import sys
 from pathlib import Path

-import pandas as pd
 import streamlit as st

 # Ensure project root is on sys.path so `src.core` imports work
@@ -18,24 +16,14 @@ _project_root = Path(__file__).resolve().parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
-from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
-from src.core.config import DeduplicationConfig
-from src.gui.components import (
-    apply_review_decisions,
-    config_panel,
-    match_group_card,
-    results_summary,
-)
-

 # ---------------------------------------------------------------------------
 # Page config
 # ---------------------------------------------------------------------------

 st.set_page_config(
-    page_title="DataTools Deduplicator",
-    page_icon="🔍",
+    page_title="DataTools — Data Cleaning Mastery",
+    page_icon="🧹",
    layout="wide",
 )

@@ -45,331 +33,101 @@ st.markdown(
    unsafe_allow_html=True,
 )

-# ---------------------------------------------------------------------------
-# Session state defaults
-# ---------------------------------------------------------------------------
-
-_DEFAULTS = {
-    "df": None,
-    "result": None,
-    "review_decisions": {},
-    "config": None,
-    "file_name": "",
-    "sheet_names": [],
-    "detected_delimiter": ",",
-}
-for key, default in _DEFAULTS.items():
-    if key not in st.session_state:
-        st.session_state[key] = default
-

 # ---------------------------------------------------------------------------
-# Header
+# Home page
 # ---------------------------------------------------------------------------

-st.title("DataTools Deduplicator")
-st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.")
+st.title("🧹 DataTools — Data Cleaning Mastery")
+st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.")

+st.divider()

 # ---------------------------------------------------------------------------
-# File upload
+# Tool cards
 # ---------------------------------------------------------------------------

-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
-)
+TOOLS = [
+    {
+        "icon": "🔍",
+        "name": "Deduplicator",
+        "description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
+        "status": "Ready",
+        "page": "1_Deduplicator",
+    },
+    {
+        "icon": "✂️",
+        "name": "Text Cleaner",
+        "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
+        "status": "Coming Soon",
+        "page": "2_Text_Cleaner",
+    },
+    {
+        "icon": "📐",
+        "name": "Format Standardizer",
+        "description": "Standardize dates, currencies, names, phone numbers, and addresses.",
+        "status": "Coming Soon",
+        "page": "3_Format_Standardizer",
+    },
+    {
+        "icon": "🕳️",
+        "name": "Missing Value Handler",
+        "description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
+        "status": "Coming Soon",
+        "page": "4_Missing_Values",
+    },
+    {
+        "icon": "🗂️",
+        "name": "Column Mapper",
+        "description": "Rename columns, enforce a target schema, and coerce types.",
+        "status": "Coming Soon",
+        "page": "5_Column_Mapper",
+    },
+    {
+        "icon": "📊",
+        "name": "Outlier Detector",
+        "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
+        "status": "Coming Soon",
+        "page": "6_Outlier_Detector",
+    },
+    {
+        "icon": "📎",
+        "name": "Multi-File Merger",
+        "description": "Combine multiple CSV/Excel files with schema alignment.",
+        "status": "Coming Soon",
+        "page": "7_Multi_File_Merger",
+    },
+    {
+        "icon": "✅",
+        "name": "Validator & Reporter",
+        "description": "Validate against rules and generate PDF/Excel quality reports.",
+        "status": "Coming Soon",
+        "page": "8_Validator_Reporter",
+    },
+    {
+        "icon": "⚙️",
+        "name": "Pipeline Runner",
+        "description": "Chain tools in recommended order and pass output between steps.",
+        "status": "Coming Soon",
+        "page": "9_Pipeline_Runner",
+    },
+]

-if uploaded is not None:
-    # Detect if file changed
-    if uploaded.name != st.session_state["file_name"]:
-        st.session_state["file_name"] = uploaded.name
-        st.session_state["result"] = None
-        st.session_state["review_decisions"] = {}
-
-        # Read the file
-        try:
-            # Write to a temp file for read_file() which needs a path
-            import tempfile
-            suffix = Path(uploaded.name).suffix
-            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-                tmp.write(uploaded.getvalue())
-                tmp_path = Path(tmp.name)
-
-            # Check for Excel sheets / detect delimiter
-            if suffix.lower() in (".xlsx", ".xls"):
-                st.session_state["sheet_names"] = list_sheets(tmp_path)
-                st.session_state["detected_delimiter"] = ","
-            else:
-                st.session_state["sheet_names"] = []
-                enc = detect_encoding(tmp_path)
-                st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
-
-            df = read_file(tmp_path)
-            if not isinstance(df, pd.DataFrame):
-                df = pd.concat(list(df), ignore_index=True)
-
-            st.session_state["df"] = df
-
-            # Clean up temp file
-            tmp_path.unlink(missing_ok=True)
-
-        except Exception as e:
-            st.error(f"Failed to read file: {e}")
-            st.session_state["df"] = None
-
-    df = st.session_state["df"]
-
-    if df is not None:
-        # Sheet selector for Excel files
-        if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
-            sheet = st.selectbox(
-                "Select sheet",
-                st.session_state["sheet_names"],
+# Render tool cards in a 3-column grid
+for row_start in range(0, len(TOOLS), 3):
+    cols = st.columns(3)
+    for i, col in enumerate(cols):
+        idx = row_start + i
+        if idx >= len(TOOLS):
+            break
+        tool = TOOLS[idx]
+        with col:
+            status_color = "green" if tool["status"] == "Ready" else "orange"
+            st.markdown(
+                f"### {tool['icon']} {tool['name']}\n\n"
+                f"{tool['description']}\n\n"
+                f":{status_color}[**{tool['status']}**]"
            )
-            if sheet != st.session_state.get("_current_sheet"):
-                st.session_state["_current_sheet"] = sheet
-                suffix = Path(uploaded.name).suffix
-                import tempfile
-                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-                    tmp.write(uploaded.getvalue())
-                    tmp_path = Path(tmp.name)
-                df = read_file(tmp_path, sheet_name=sheet)
-                if not isinstance(df, pd.DataFrame):
-                    df = pd.concat(list(df), ignore_index=True)
-                st.session_state["df"] = df
-                st.session_state["result"] = None
-                st.session_state["review_decisions"] = {}
-                tmp_path.unlink(missing_ok=True)
-
-        # Delimiter selector for CSV/TSV files
-        is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
-        if is_csv:
-            _DELIMITERS = {
-                "Comma (,)": ",",
-                "Tab (\\t)": "\t",
-                "Semicolon (;)": ";",
-                "Pipe (|)": "|",
-                "Other": None,
-            }
-            _DELIM_LABELS = list(_DELIMITERS.keys())
-            _DELIM_VALUES = list(_DELIMITERS.values())
-            detected = st.session_state.get("detected_delimiter", ",")
-            default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
-            chosen_label = st.selectbox(
-                "Delimiter",
-                _DELIM_LABELS,
-                index=default_idx,
-                help="Auto-detected on upload. Change if the preview looks wrong.",
-            )
-            if chosen_label == "Other":
-                custom_delim = st.text_input(
-                    "Enter delimiter character",
-                    max_chars=5,
-                    help="Enter the character(s) used to separate fields.",
-                )
-                chosen_delim = custom_delim if custom_delim else ","
-            else:
-                chosen_delim = _DELIMITERS[chosen_label]
-            if chosen_delim != st.session_state.get("_current_delimiter"):
-                st.session_state["_current_delimiter"] = chosen_delim
-                import tempfile
-                suffix = Path(uploaded.name).suffix
-                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-                    tmp.write(uploaded.getvalue())
-                    tmp_path = Path(tmp.name)
-                df = read_file(tmp_path, delimiter=chosen_delim)
-                if not isinstance(df, pd.DataFrame):
-                    df = pd.concat(list(df), ignore_index=True)
-                st.session_state["df"] = df
-                st.session_state["result"] = None
-                st.session_state["review_decisions"] = {}
-                tmp_path.unlink(missing_ok=True)
-
-        # Preview
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
-
-        # Advanced options
-        settings = config_panel(df)
-
-        # Apply loaded config if present
-        loaded_cfg = st.session_state.get("loaded_config")
-        if loaded_cfg is not None:
-            settings["strategies"] = loaded_cfg.to_strategies()
-            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
-            settings["date_column"] = loaded_cfg.date_column
-            settings["merge"] = loaded_cfg.merge
-            # Clear so it doesn't override on every rerun
-            del st.session_state["loaded_config"]
-
-        # ---------------------------------------------------------------------------
-        # Find Duplicates button
-        # ---------------------------------------------------------------------------
-
-        st.divider()
-
-        if st.button("Find Duplicates", type="primary", use_container_width=True):
-            progress_bar = st.progress(0, text="Comparing rows...")
-
-            def _gui_progress(current: int, total: int) -> None:
-                if total > 0:
-                    pct = min(current / total, 1.0)
-                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
-
-            with st.spinner("Running deduplication..."):
-                result = deduplicate(
-                    df,
-                    strategies=settings["strategies"],
-                    survivor_rule=settings["survivor_rule"],
-                    date_column=settings["date_column"],
-                    merge=settings["merge"],
-                    preview=False,
-                    progress_callback=_gui_progress,
-                )
-
-            progress_bar.empty()
-            st.session_state["result"] = result
-            st.session_state["review_decisions"] = {}
-
-        # ---------------------------------------------------------------------------
-        # Results
-        # ---------------------------------------------------------------------------
-
-        result: DeduplicationResult | None = st.session_state["result"]
-
-        if result is not None:
-            st.divider()
-            st.subheader("Results")
-
-            # Summary + download buttons
-            results_summary(result, df)
-
-            # Match group review
-            if result.match_groups:
-                st.divider()
-                st.subheader("Match Groups")
-
-                # Batch actions
-                def _accept_all():
-                    for g in result.match_groups:
-                        st.session_state["review_decisions"][g.group_id] = {
-                            "keep_indices": [g.survivor_index],
-                            "overrides": {},
-                        }
-
-                def _reject_all():
-                    for g in result.match_groups:
-                        st.session_state["review_decisions"][g.group_id] = {
-                            "keep_indices": list(g.row_indices),
-                            "overrides": {},
-                        }
-
-                def _clear_all():
-                    st.session_state["review_decisions"] = {}
-                    for k in list(st.session_state):
-                        if k.startswith("editor_"):
-                            del st.session_state[k]
-
-                action_left, action_mid, action_right = st.columns(3)
-                with action_left:
-                    st.button("Accept All", on_click=_accept_all)
-                with action_mid:
-                    st.button("Reject All", on_click=_reject_all)
-                with action_right:
-                    st.button("Clear Decisions", on_click=_clear_all)
-
-                # Individual group cards
-                decisions = st.session_state["review_decisions"]
-                for i, group in enumerate(result.match_groups):
-                    match_group_card(group, df, group_num=i + 1)
-
-                # Show decision summary
-                if decisions:
-                    st.divider()
-                    merged = 0
-                    customized = 0
-                    split = 0
-                    kept_all = 0
-                    for v in decisions.values():
-                        if not isinstance(v, dict):
-                            continue
-                        ki = v.get("keep_indices", [])
-                        # Find the matching group size
-                        gid_for_v = next(
-                            (gid for gid, d in decisions.items() if d is v),
-                            None,
-                        )
-                        group_size = next(
-                            (len(g.row_indices) for g in result.match_groups
-                             if g.group_id == gid_for_v),
-                            0,
-                        )
-                        if len(ki) == group_size:
-                            kept_all += 1
-                        elif len(ki) == 1:
-                            if v.get("overrides"):
-                                customized += 1
-                            else:
-                                merged += 1
-                        else:
-                            split += 1
-
-                    pending = len(result.match_groups) - len(decisions)
-                    parts = []
-                    if merged:
-                        parts.append(f"{merged} merged")
-                    if customized:
-                        parts.append(f"{customized} customized")
-                    if split:
-                        parts.append(f"{split} split")
-                    if kept_all:
-                        parts.append(f"{kept_all} kept all")
-                    parts.append(f"{pending} pending")
-                    st.caption("Decisions: " + ", ".join(parts))
-
-                    # Apply decisions and offer download
-                    if st.button(
-                        "Apply Review Decisions & Download",
-                        type="primary",
-                        use_container_width=True,
-                    ):
-                        reviewed_df, reviewed_removed = apply_review_decisions(
-                            df, result.match_groups, decisions,
-                        )
-
-                        csv_bytes = reviewed_df.to_csv(
-                            index=False
-                        ).encode("utf-8-sig")
-                        st.download_button(
-                            "Download Reviewed & Deduplicated CSV",
-                            data=csv_bytes,
-                            file_name="deduplicated_reviewed.csv",
-                            mime="text/csv",
-                            key="reviewed_download",
-                        )
-                        if not reviewed_removed.empty:
-                            removed_bytes = reviewed_removed.to_csv(
-                                index=False
-                            ).encode("utf-8-sig")
-                            st.download_button(
-                                "Download Reviewed Removed Rows",
-                                data=removed_bytes,
-                                file_name="removed_reviewed.csv",
-                                mime="text/csv",
-                                key="reviewed_removed_download",
-                            )
-
-            # Log entries
-            if result.log_entries:
-                with st.expander("Processing Log"):
-                    st.code("\n".join(result.log_entries))
-
-else:
-    # No file uploaded — show placeholder
-    st.info("Upload a file to get started.")


 # ---------------------------------------------------------------------------
@@ -379,5 +137,5 @@ else:
 st.divider()
 st.caption(
    "Runs locally. Your data never leaves this computer. "
-    "| DataTools Deduplicator v3.0"
+    "| DataTools v3.0"
 )