feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready): 04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI 05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI 09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI with soft tool-dependency graph (recommended, not enforced) and JSON save/load for repeatable weekly cleanups. Format Standardizer reworked for 1 GB international files: • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email • Per-row country / address columns drive parsing • Audit cap (default 10 k rows, ~50 MB RAM) • standardize_file(): chunked streaming entry point (~165 k rows/sec) • currency_decimal="auto" for EU comma-decimal locales • R$ / kr / zł multi-char currency prefixes • cli_format.py with auto-stream above 100 MB inputs Encoding detection arbiter + language-aware probe: Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM) via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes. Distribution-readiness assets: • streamlit_app.py — Streamlit Community Cloud entry shim • src/gui/app_demo.py — single-page demo, ?p=<persona> routing, 100-row cap + watermark, free-vs-paid boundary enforced at surface • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS, deploy.py URL-substitution script, auto-generated robots.txt + sitemap.xml + 404.html + favicon • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md — full strategy + measurement + deployment + master checklist Test counts: before: 1,520 passed · 4 skipped · 17 xfailed after: 1,729 passed · 0 skipped · 0 xfailed Tier-1 corpora added: • missing-corpus 3 use cases + 16 edge cases • column-mapper-corpus 3 use cases + 5 edge cases • format-cleaner intl 20-row 13-country stress fixture Engine hardening flushed out by the corpora: • interpolate guards against object-dtype columns • mean/median skip all-NaN columns 
(silences numpy warning) • fillna runs under future.no_silent_downcasting (silences pandas warning) • mojibake test no longer skips when ftfy installed (monkeypatch path) • drop-row threshold semantics: strict-greater (consistent across rows / cols) • currency_decimal validator allow-set updated for "auto" Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/src/gui/app_demo.py
+++ b/src/gui/app_demo.py
@@ -0,0 +1,468 @@
+"""DataTools — public demo app (deploys to Streamlit Community Cloud).
+
+This is a SEPARATE entry point from the main GUI (``src/gui/app.py``).
+The full GUI is the paid product surface; this demo is the marketing
+surface — a single page that runs one of three persona-specific
+pipelines on a preloaded sample file, shows the BEFORE / AFTER
+side-by-side, and converts the visitor to a Gumroad purchase.
+
+Launch:
+    streamlit run src/gui/app_demo.py
+
+URL routing:
+    https://demo.datatools.app/?p=shopify-pet   (Shopify operator)
+    https://demo.datatools.app/?p=bookkeeper    (Bookkeeper)
+    https://demo.datatools.app/?p=revops        (RevOps agency)
+
+Free / paid boundary (per docs/DEMO-PLAN.md §6):
+    - input rows capped at ``DEMO_ROW_CAP``
+    - input file size capped at ``DEMO_FILE_CAP_MB``
+    - download CSV gets a single trailing watermark row
+    - the pipeline editor is read-only — visitor sees it but can't change it
+    - no audit-log download (paid feature)
+    - no save-pipeline-JSON (paid feature)
+
+The demo runs the *same engine* as the paid product. Caps are applied
+at the surface layer only — when the buyer downloads and runs the paid
+build, every cap disappears.
+"""
+
+from __future__ import annotations
+
+import io
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import streamlit as st
+
+
+# Ensure project root is on sys.path so `src.core` imports work
+_project_root = Path(__file__).resolve().parent.parent.parent
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
+
+from src.core.pipeline import Pipeline, run_pipeline
+
+
+# ---------------------------------------------------------------------------
+# Free / paid boundary constants
+# ---------------------------------------------------------------------------
+
+DEMO_ROW_CAP: int = 100
+DEMO_FILE_CAP_MB: int = 5
+GUMROAD_BASE: str = "https://gumroad.com/l/datatools"
+
+
+# ---------------------------------------------------------------------------
+# Persona registry — single source of truth
+# ---------------------------------------------------------------------------
+
+DEMO_DIR = _project_root / "samples" / "demo"
+
+
+PERSONAS: dict[str, dict[str, Any]] = {
+    "shopify-pet": {
+        "label": "Shopify pet operator",
+        "icon": "🛍️",
+        "h1": "Klaviyo-import-ready customer lists. **In 30 seconds. Locally.**",
+        "sub": (
+            "Your Shopify customer export has duplicates Excel can't catch, "
+            "international phones Excel can't parse, and disguised nulls "
+            "(`N/A`, `(blank)`, `?`) that break Klaviyo's import. "
+            "DataTools fixes all of it in one pass — and your data never "
+            "leaves your computer."
+        ),
+        "data_file":     "shopify_pet_customers.csv",
+        "pipeline_file": "shopify_pet_pipeline.json",
+        "cta":           "Get DataTools for Shopify — $49 →",
+        "landing":       "https://datatools.app/shopify/",
+    },
+    "bookkeeper": {
+        "label": "Bookkeeper / freelance accountant",
+        "icon": "📒",
+        "h1": "Reconcile messy bank exports. **Hand your client an audit trail.**",
+        "sub": (
+            "The Jan and Feb exports overlap; the same transaction posts twice. "
+            "Vendor names are *Amazon* / *amazon.com* / *AMAZON.COM*4F2X9* in "
+            "three rows. DataTools dedups on Date + Amount + fuzzy Vendor, "
+            "produces ISO dates and numeric amounts, and gives you a row-level "
+            "audit log to hand the client."
+        ),
+        "data_file":     "bookkeeper_bank_reconcile.csv",
+        "pipeline_file": "bookkeeper_bank_pipeline.json",
+        "cta":           "Get DataTools for Bookkeepers — $49 →",
+        "landing":       "https://datatools.app/bookkeeper/",
+    },
+    "revops": {
+        "label": "Marketing / RevOps agency",
+        "icon": "🪢",
+        "h1": "Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — **locally.**",
+        "sub": (
+            "The same prospect shows up in HubSpot as `alice@acme.com`, in "
+            "LinkedIn as `Alice.Johnson@acme.com`, and in your VA's manual "
+            "scrape as `alice@acme.com` again. Country is `USA` / `US` / "
+            "`United States`. DataTools fuzzy-matches across sources, "
+            "normalizes phones for 50+ countries, and merges survivors "
+            "with their most-complete fields — without uploading anything."
+        ),
+        "data_file":     "agency_combined_leads.csv",
+        "pipeline_file": "agency_leads_pipeline.json",
+        "cta":           "Get DataTools for RevOps — $49 →",
+        "landing":       "https://datatools.app/revops/",
+    },
+}
+
+DEFAULT_PERSONA = "shopify-pet"
+
+
+# ---------------------------------------------------------------------------
+# Page config + routing
+# ---------------------------------------------------------------------------
+
+st.set_page_config(
+    page_title="DataTools — try it live",
+    page_icon="🧹",
+    layout="wide",
+    initial_sidebar_state="collapsed",
+)
+
+# Strip Streamlit chrome that breaks the iframe-embed look on the
+# landing pages.
+st.markdown("""
+<style>
+#MainMenu, footer, header { visibility: hidden; }
+.block-container { padding-top: 1.2rem; padding-bottom: 1rem; max-width: 1200px; }
+[data-testid="stSidebarNav"] { display: none; }
+section[data-testid="stSidebar"] { display: none; }
+.stApp { background: #0f1115; color: #e8eaed; }
+h1, h2, h3 { color: #e8eaed; letter-spacing: -0.01em; }
+hr { border-color: #252a36; }
+.demo-card {
+  background: #161922;
+  border: 1px solid #252a36;
+  border-radius: 12px;
+  padding: 18px;
+}
+.cta-block {
+  background: linear-gradient(135deg, #161922 0%, #1d212b 100%);
+  border: 1px solid #6ee7b7;
+  border-radius: 12px;
+  padding: 24px;
+  text-align: center;
+}
+.cta-block a {
+  display: inline-block;
+  background: #6ee7b7; color: #052e1a;
+  font-weight: 600; padding: 12px 22px;
+  border-radius: 8px; text-decoration: none;
+  font-size: 17px; margin-top: 12px;
+}
+.metric-pill {
+  display: inline-block;
+  background: #1d212b; border: 1px solid #252a36;
+  padding: 4px 10px; border-radius: 999px;
+  font-family: ui-monospace, monospace; font-size: 13px;
+  color: #6ee7b7; margin-right: 6px; margin-bottom: 4px;
+}
+</style>
+""", unsafe_allow_html=True)
+
+
+def _resolve_persona() -> str:
+    """Return the persona key requested via ``?p=<persona>``.
+
+    Any value that is not a key of ``PERSONAS`` (including a missing
+    parameter) resolves to ``DEFAULT_PERSONA``.
+    """
+    try:
+        requested = st.query_params.get("p", DEFAULT_PERSONA)
+    except AttributeError:
+        # Older Streamlit versions: fall back to the experimental accessor
+        # and unwrap the value if it comes back as a list.
+        legacy_params = st.experimental_get_query_params()
+        requested = legacy_params.get("p", [DEFAULT_PERSONA])
+        if isinstance(requested, list):
+            requested = requested[0]
+    return requested if requested in PERSONAS else DEFAULT_PERSONA
+
+
+persona_key = _resolve_persona()
+persona = PERSONAS[persona_key]
+
+
+# ---------------------------------------------------------------------------
+# Header + persona switch
+# ---------------------------------------------------------------------------
+
+col_brand, col_switch = st.columns([3, 2])
+with col_brand:
+    st.markdown(f"### 🧹 DataTools / for {persona['label']}")
+with col_switch:
+    # Quick-switch dropdown for visitors landing on the wrong persona
+    new_choice = st.selectbox(
+        "Try a different demo",
+        options=list(PERSONAS),
+        format_func=lambda k: f"{PERSONAS[k]['icon']} {PERSONAS[k]['label']}",
+        index=list(PERSONAS).index(persona_key),
+        key="persona_switch",
+        label_visibility="collapsed",
+    )
+    if new_choice != persona_key:
+        st.query_params["p"] = new_choice
+        st.rerun()
+
+st.markdown(f"## {persona['h1']}")
+st.markdown(persona["sub"])
+
+st.markdown("---")
+
+
+# ---------------------------------------------------------------------------
+# Load preloaded sample data + pipeline
+# ---------------------------------------------------------------------------
+
+@st.cache_data(show_spinner=False)
+def _load_demo(data_file: str, pipeline_file: str) -> tuple[pd.DataFrame, Pipeline]:
+    """Load a persona's sample CSV and its saved pipeline from ``DEMO_DIR``.
+
+    The CSV is read with every column typed as ``str`` and with pandas'
+    default NA parsing switched off.
+    """
+    csv_path = DEMO_DIR / data_file
+    pipeline_path = DEMO_DIR / pipeline_file
+    frame = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
+    return frame, Pipeline.from_file(pipeline_path)
+
+
+sample_df, sample_pipeline = _load_demo(persona["data_file"], persona["pipeline_file"])
+
+
+def _read_uploaded(uploaded_file) -> tuple[pd.DataFrame, list[str]]:
+    """Decode an uploaded file into a demo-sized DataFrame.
+
+    Returns ``(df, warnings)``. ``warnings`` collects the visitor-facing
+    messages explaining any cap or fallback that was applied. When the
+    upload is larger than ``DEMO_FILE_CAP_MB`` or cannot be parsed, a copy
+    of the preloaded sample dataset is returned instead of the upload.
+    Uploads with more than ``DEMO_ROW_CAP`` rows are truncated to the
+    first ``DEMO_ROW_CAP`` rows.
+    """
+    warnings: list[str] = []
+    raw = uploaded_file.getvalue()
+    size_mb = len(raw) / 1024 / 1024
+    if size_mb > DEMO_FILE_CAP_MB:
+        warnings.append(
+            f"Uploaded file is {size_mb:.1f} MB — demo capped at "
+            f"{DEMO_FILE_CAP_MB} MB. The paid product has no size limit."
+        )
+        return sample_df.copy(), warnings
+    suffix = Path(uploaded_file.name).suffix.lower()
+    bio = io.BytesIO(raw)
+    try:
+        if suffix in (".xlsx", ".xls"):
+            df = pd.read_excel(bio, dtype=str, keep_default_na=False)
+        else:
+            sep = "\t" if suffix == ".tsv" else ","
+            for enc in ("utf-8", "utf-8-sig"):
+                try:
+                    bio.seek(0)
+                    df = pd.read_csv(
+                        bio, dtype=str, keep_default_na=False,
+                        encoding=enc, sep=sep, on_bad_lines="warn",
+                    )
+                    break
+                except UnicodeDecodeError:
+                    continue
+            else:
+                # latin-1 maps every byte value, so this last attempt
+                # cannot fail on decoding. It uses the same separator and
+                # bad-line handling as the attempts above.
+                bio.seek(0)
+                df = pd.read_csv(
+                    bio, dtype=str, keep_default_na=False,
+                    encoding="latin-1", sep=sep, on_bad_lines="warn",
+                )
+    except Exception as e:
+        # Deliberate best-effort boundary: any parse failure falls back to
+        # the sample dataset rather than crashing the demo page.
+        warnings.append(f"Could not read your file ({type(e).__name__}). "
+                        "Demo will run on the sample dataset.")
+        return sample_df.copy(), warnings
+    if len(df) > DEMO_ROW_CAP:
+        warnings.append(
+            f"Demo capped at {DEMO_ROW_CAP} rows — your file has {len(df):,}. "
+            f"Running on the first {DEMO_ROW_CAP} rows. The paid product has no row limit."
+        )
+        df = df.head(DEMO_ROW_CAP)
+    return df, warnings
+
+
+# ---------------------------------------------------------------------------
+# File source: preloaded sample (default) or user upload
+# ---------------------------------------------------------------------------
+
+st.markdown(f"#### Sample dataset preloaded · `{persona['data_file']}`")
+
+# Optional upload. The help text states how each demo cap is enforced by
+# ``_read_uploaded``: extra rows are truncated, oversize files are not read.
+with st.expander(
+    "Or replace with your own file (capped at "
+    f"{DEMO_ROW_CAP} rows / {DEMO_FILE_CAP_MB} MB for the demo)",
+    expanded=False,
+):
+    uploaded = st.file_uploader(
+        "Your file",
+        type=["csv", "tsv", "xlsx", "xls"],
+        key="demo_user_file",
+        label_visibility="collapsed",
+        help=(
+            f"Files with more than {DEMO_ROW_CAP} rows are accepted, but only "
+            f"the first {DEMO_ROW_CAP} rows are processed. Files over "
+            f"{DEMO_FILE_CAP_MB} MB are not read — the demo stays on the "
+            "sample dataset. The paid build runs on 1 GB+ files via streaming."
+        ),
+    )
+
+if uploaded is not None:
+    df_in, upload_warnings = _read_uploaded(uploaded)
+    for w in upload_warnings:
+        st.info(w)
+    using_sample = False
+else:
+    df_in = sample_df.copy()
+    using_sample = True
+
+
+# ---------------------------------------------------------------------------
+# BEFORE preview
+# ---------------------------------------------------------------------------
+
+st.markdown(f"#### BEFORE — {len(df_in)} rows, {len(df_in.columns)} columns")
+st.dataframe(df_in.head(10), use_container_width=True, hide_index=True)
+
+st.markdown("---")
+
+
+# ---------------------------------------------------------------------------
+# Pipeline (read-only)
+# ---------------------------------------------------------------------------
+
+st.markdown("#### Pipeline (saved — paid version is editable)")
+pipe_summary = " → ".join(
+    f"**{i + 1}.** {step.tool}"
+    for i, step in enumerate(sample_pipeline.steps)
+)
+st.markdown(pipe_summary)
+
+
+# ---------------------------------------------------------------------------
+# Run
+# ---------------------------------------------------------------------------
+
+run_clicked = st.button(
+    "▶ Run pipeline",
+    type="primary",
+    use_container_width=True,
+    key="demo_run_button",
+)
+
+# Identity of the input currently on screen: the persona plus, when a file
+# is uploaded, that file's name and size. A stored result is only shown
+# while this still matches the input it was computed from.
+input_sig = (
+    persona_key,
+    None if uploaded is None else (uploaded.name, uploaded.size),
+)
+
+if run_clicked:
+    with st.spinner("Running…"):
+        t0 = time.perf_counter()
+        try:
+            result = run_pipeline(df_in, sample_pipeline, stop_on_error=False)
+        except Exception as e:
+            from src.core.errors import format_for_user
+            st.error(f"Demo halted: {format_for_user(e)}")
+            st.stop()
+        elapsed = time.perf_counter() - t0
+    st.session_state["demo_result"] = result
+    st.session_state["demo_elapsed"] = elapsed
+    st.session_state["demo_persona"] = persona_key
+    st.session_state["demo_input_sig"] = input_sig
+
+result = st.session_state.get("demo_result")
+elapsed = st.session_state.get("demo_elapsed", 0.0)
+result_persona = st.session_state.get("demo_persona")
+
+# Reset the cached result when the persona switches or the input file
+# changes; otherwise AFTER would describe a different dataset than BEFORE.
+if result is not None and (
+    result_persona != persona_key
+    or st.session_state.get("demo_input_sig") != input_sig
+):
+    result = None
+    st.session_state.pop("demo_result", None)
+
+# ---------------------------------------------------------------------------
+# AFTER + metrics + CTA
+# ---------------------------------------------------------------------------
+
+if result is not None:
+    st.markdown("---")
+    st.markdown(
+        f"#### AFTER — {len(df_in)} → {len(result.final_df)} rows · "
+        f"finished in {elapsed*1000:.0f} ms"
+    )
+
+    # Per-step metric pills: one pill per executed step. Each summary
+    # counter that is present and non-zero contributes "<count> <noun>".
+    pill_fields = (
+        ("cells_changed", "cells"),
+        ("sentinels_standardized", "sentinels"),
+        ("duplicates_removed", "dupes merged"),
+        ("columns_renamed", "renamed"),
+    )
+    pills_html: list[str] = []
+    for sr in result.step_results:
+        if sr.skipped:
+            continue
+        if sr.error:
+            pill = (
+                f'<span class="metric-pill" style="color:#fbbf24">'
+                f'{sr.step.tool}: error</span>'
+            )
+        else:
+            s = sr.summary
+            bits = [
+                f"{s[key]} {noun}"
+                for key, noun in pill_fields
+                if key in s and s[key]
+            ]
+            label = ", ".join(bits) if bits else "no-op"
+            pill = f'<span class="metric-pill">{sr.step.tool}: {label}</span>'
+        pills_html.append(pill)
+    st.markdown("".join(pills_html), unsafe_allow_html=True)
+
+    st.dataframe(result.final_df.head(10), use_container_width=True, hide_index=True)
+
+    # ----- Download with watermark row -----
+    watermark_row = pd.DataFrame([{
+        col: f"DataTools demo — buy at {persona['landing']}"
+        if i == 0 else ""
+        for i, col in enumerate(result.final_df.columns)
+    }])
+    out_df = pd.concat([result.final_df, watermark_row], ignore_index=True)
+    csv_bytes = out_df.to_csv(index=False).encode("utf-8-sig")
+
+    col_dl, col_cta = st.columns([1, 2])
+    with col_dl:
+        st.download_button(
+            "Download cleaned CSV (sample · watermarked)",
+            data=csv_bytes,
+            file_name=Path(persona["data_file"]).stem + "_cleaned_demo.csv",
+            mime="text/csv",
+            use_container_width=True,
+        )
+    with col_cta:
+        st.markdown(
+            f"""
+<div class="cta-block">
+  <strong style="font-size: 18px;">Like what you see?</strong><br/>
+  Run this on YOUR full file — locally. No upload. No row limit. No watermark.<br/>
+  <a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
+</div>
+""",
+            unsafe_allow_html=True,
+        )
+else:
+    # Pre-run state — show the buy block at the bottom anyway so the
+    # CTA is reachable even when the visitor never runs the pipeline.
+    st.markdown(
+        f"""
+<div class="cta-block" style="margin-top: 24px;">
+  <strong style="font-size: 18px;">Already convinced?</strong><br/>
+  Skip the demo and grab the full version. One-time payment, no subscription.<br/>
+  <a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
+</div>
+""",
+        unsafe_allow_html=True,
+    )
+
+# ---------------------------------------------------------------------------
+# Footer trust block
+# ---------------------------------------------------------------------------
+
+st.markdown("---")
+col_t1, col_t2, col_t3 = st.columns(3)
+with col_t1:
+    st.markdown("**🔒 Runs locally**\n\nThe paid product is desktop-only. Your data never leaves your computer.")
+with col_t2:
+    st.markdown("**📋 Audit trail**\n\nEvery cell change row-logged with old / new / which rule fired.")
+with col_t3:
+    st.markdown("**💰 One-time $49**\n\nNo subscription. Mac · Windows · Linux. Free updates for v1.x.")
+
+st.caption(
+    f"Demo capped at {DEMO_ROW_CAP} rows · output watermarked with one trailing row · "
+    "running on free hosting. The paid product is uncapped and runs offline."
+)
--- a/src/gui/pages/4_Missing_Values.py
+++ b/src/gui/pages/4_Missing_Values.py
@@ -1,111 +1,368 @@
-"""DataTools Missing Value Handler — stub page."""
+"""DataTools Missing Value Handler — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome, require_normalization_gate
+from src.gui.components import (
+    hide_streamlit_chrome,
+    pickup_or_upload,
+    require_normalization_gate,
+)
+from src.core.missing import (
+    DEFAULT_SENTINELS,
+    MissingOptions,
+    PRESETS,
+    handle_missing,
+    profile_missing,
+)

 hide_streamlit_chrome()
 require_normalization_gate()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("🕳️ Missing Value Handler")
-st.caption("Detect, analyze, and handle missing values in your data.")
+st.caption(
+    "Detect disguised nulls, profile missingness, and apply imputation or "
+    "drop strategies. Runs locally — your data never leaves this computer."
+)

-st.info("This tool is under development.")

 # ---------------------------------------------------------------------------
-# What this tool will do
+# File upload
 # ---------------------------------------------------------------------------

-st.markdown("""
-**Features:**
- Detect disguised nulls (empty strings, "N/A", "n/a", "-", "NULL", "None", etc.)
- Missingness analysis: per-column counts, percentages, and patterns
- Visualize missing data heatmap
- Imputation strategies: drop rows/columns, fill with mean/median/mode, forward-fill, backward-fill
- Custom sentinel value replacement
- Before/after comparison
-""")
+uploaded = pickup_or_upload(
+    label="Upload CSV or Excel file",
+    key="missing_file_upload",
+    types=["csv", "tsv", "xlsx", "xls"],
+)
+
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()
+
+
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Read the uploaded bytes into a DataFrame.
+
+    Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing-
+    value handling is more useful when numeric columns are typed correctly
+    (so mean / median / interpolate work without manual coercion).
+    Sentinel strings are still detected because they survive in object
+    columns where any cell is non-numeric.
+
+    ``.xlsx`` / ``.xls`` names are read with ``pd.read_excel``. Anything
+    else is read as delimited text (tab-separated for ``.tsv``, comma
+    otherwise), trying UTF-8, then UTF-8 with BOM, then latin-1.
+    """
+    suffix = Path(name).suffix.lower()
+    bio = io.BytesIO(data)
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(bio)
+    sep = "\t" if suffix == ".tsv" else ","
+    for enc in ("utf-8", "utf-8-sig"):
+        try:
+            bio.seek(0)
+            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
+        except UnicodeDecodeError:
+            continue
+    # latin-1 maps every byte value, so this final attempt cannot fail on
+    # decoding; it keeps the same separator and bad-line handling.
+    bio.seek(0)
+    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
+
+
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as e:
+    from src.core.errors import format_for_user
+    st.error(
+        f"**Could not read `{uploaded.name}`**\n\n"
+        f"```\n{format_for_user(e)}\n```"
+    )
+    st.stop()
+
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+st.dataframe(df.head(10), use_container_width=True)

 st.divider()

 # ---------------------------------------------------------------------------
-# File upload (functional)
+# Initial profile (read-only)
 # ---------------------------------------------------------------------------

-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
-    key="missing_file_upload",
-)
+st.subheader("Missingness profile")

-if uploaded is not None:
-    import pandas as pd
-    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
-    except Exception as e:
-        from src.core.errors import format_for_user
-        st.error(
-            f"**Could not read `{uploaded.name}`**\n\n"
-            f"```\n{format_for_user(e)}\n```"
+initial_profile = profile_missing(df, MissingOptions())
+prof_df = initial_profile.to_dataframe()
+
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Rows", initial_profile.rows_total)
+m2.metric("Cells missing", initial_profile.cells_missing)
+m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
+m4.metric("Complete rows", initial_profile.rows_complete)
+
+st.dataframe(prof_df, use_container_width=True, hide_index=True)
+
+if initial_profile.cells_missing == 0:
+    st.success("No missing values or disguised nulls detected. Nothing to handle.")
+
+st.divider()
+
+# ---------------------------------------------------------------------------
+# Options
+# ---------------------------------------------------------------------------
+
+st.subheader("Strategy")
+
+preset_label = st.radio(
+    "Preset",
+    [
+        "detect-only (standardize sentinels to NaN, no fill or drop)",
+        "safe-fill (numeric → median, categorical → mode)",
+        "drop-incomplete (drop any row with missing)",
+    ],
+    index=0,
+    help=(
+        "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
+        "safe-fill: also fill — numeric columns with median, others with mode. "
+        "drop-incomplete: also drop every row that has any missing cell."
+    ),
+)
+preset_key = preset_label.split(" ", 1)[0]
+options = MissingOptions.from_preset(preset_key)
+
+with st.expander("Advanced options"):
+    col_a, col_b = st.columns(2)
+
+    with col_a:
+        st.markdown("**Detection**")
+        options.standardize_sentinels = st.checkbox(
+            "Standardize disguised nulls to NaN",
+            value=options.standardize_sentinels,
+            help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
+        )
+        sentinels_text = st.text_input(
+            "Sentinel values (comma-separated)",
+            value=", ".join(options.sentinels),
+            disabled=not options.standardize_sentinels,
+            help="Matched case-insensitively after stripping whitespace.",
+        )
+        options.sentinels = [
+            s.strip() for s in sentinels_text.split(",") if s.strip()
+        ]
+
+    with col_b:
+        st.markdown("**Strategy override**")
+        strat_options = [
+            "(use preset)",
+            "none", "drop_row", "drop_col", "drop_both",
+            "mean", "median", "mode", "constant",
+            "ffill", "bfill", "interpolate",
+        ]
+        strat_choice = st.selectbox(
+            "Global strategy",
+            strat_options,
+            index=0,
+            help=(
+                "drop_row / drop_col use the thresholds below. "
+                "mean / median / interpolate are numeric only — non-numeric "
+                "columns fall back to the categorical strategy."
+            ),
+        )
+        if strat_choice != "(use preset)":
+            options.strategy = strat_choice  # type: ignore[assignment]
+
+        cat_strat = st.selectbox(
+            "Categorical fallback (for non-numeric columns)",
+            ["mode", "constant", "ffill", "bfill", "none"],
+            index=0,
+        )
+        options.categorical_strategy = cat_strat  # type: ignore[assignment]
+
+        if options.strategy == "constant" or cat_strat == "constant":
+            fill_val = st.text_input(
+                "Constant fill value",
+                value="",
+                help="Used when strategy = constant. Leave blank to fill with empty string.",
+            )
+            options.fill_value = fill_val
+
+    st.markdown("**Drop thresholds**")
+    col_c, col_d = st.columns(2)
+    # Labels say "more than": the drop thresholds use strict-greater
+    # semantics, so a row/column exactly at the threshold is kept.
+    with col_c:
+        options.row_drop_threshold = st.slider(
+            "Row drop threshold (drop rows with more than this fraction missing across selected cols)",
+            0.0, 1.0, options.row_drop_threshold, 0.05,
+        )
+    with col_d:
+        options.col_drop_threshold = st.slider(
+            "Column drop threshold (drop columns with more than this fraction missing)",
+            0.0, 1.0, options.col_drop_threshold, 0.05,
         )

-# ---------------------------------------------------------------------------
-# Placeholder options
-# ---------------------------------------------------------------------------
+    st.markdown("**Scope**")
+    selected_cols = st.multiselect(
+        "Columns to handle (default: all)",
+        options=list(df.columns),
+        default=list(df.columns),
+    )
+    skip_cols = st.multiselect(
+        "Columns to skip",
+        options=list(df.columns),
+        default=[],
+    )
+    options.columns = selected_cols if selected_cols else None
+    options.skip_columns = list(skip_cols)

-st.subheader("Detection Settings")
-
-st.text_input(
-    "Null patterns (comma-separated)",
-    value="N/A, n/a, NA, -, NULL, None, empty, .",
-    disabled=True,
-    help="Values to treat as missing.",
-)
-
-st.subheader("Handling Strategy")
-
-st.selectbox("Strategy", [
-    "Drop rows with any missing",
-    "Drop rows above threshold",
-    "Fill with mean (numeric)",
-    "Fill with median (numeric)",
-    "Fill with mode (categorical)",
-    "Forward-fill",
-    "Backward-fill",
-    "Custom value",
-], disabled=True)
-
-st.slider("Drop threshold (%)", 0, 100, 50, disabled=True, help="Drop rows missing more than this % of columns.")
-
-st.divider()
-st.button("Handle Missing Values", type="primary", use_container_width=True, disabled=True)
+    st.markdown("**Per-column strategy overrides** (optional)")
+    st.caption(
+        "Set a different strategy for specific columns. Leave any row blank to "
+        "use the global strategy."
+    )
+    per_col_overrides: dict[str, str] = {}
+    only_missing_cols = [
+        r.column for r in initial_profile.columns if r.has_missing
+    ]
+    if only_missing_cols:
+        edit_df = pd.DataFrame({
+            "column": only_missing_cols,
+            "strategy": ["" for _ in only_missing_cols],
+        })
+        edited = st.data_editor(
+            edit_df,
+            use_container_width=True,
+            hide_index=True,
+            column_config={
+                "column": st.column_config.TextColumn("Column", disabled=True),
+                "strategy": st.column_config.SelectboxColumn(
+                    "Override",
+                    options=[
+                        "", "drop_row", "drop_col",
+                        "mean", "median", "mode", "constant",
+                        "ffill", "bfill", "interpolate",
+                    ],
+                ),
+            },
+            key="missing_per_col_editor",
+        )
+        for _, row in edited.iterrows():
+            if row["strategy"]:
+                per_col_overrides[row["column"]] = row["strategy"]
+        options.column_strategies = per_col_overrides  # type: ignore[assignment]

 # ---------------------------------------------------------------------------
-# Footer
+# Run
 # ---------------------------------------------------------------------------

 st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
-)
+
+# Run the handler on click and keep the result in session state so it
+# survives the reruns triggered by the download buttons below.
+if st.button("Handle Missing Values", type="primary", use_container_width=True):
+    with st.spinner("Handling..."):
+        try:
+            result = handle_missing(df, options)
+        except (ValueError, OSError) as e:
+            from src.core.errors import format_for_user
+            st.error(format_for_user(e))
+            st.stop()
+    st.session_state["missing_result"] = result
+    st.session_state["missing_input_name"] = uploaded.name
+    st.session_state["missing_options"] = options.to_dict()
+
+# A result computed for a previously loaded file must not be shown under
+# the current file's preview, so discard it when the file name differs.
+if st.session_state.get("missing_input_name") != uploaded.name:
+    st.session_state.pop("missing_result", None)
+
+result = st.session_state.get("missing_result")
+if result is None:
+    st.info("Choose a strategy and click **Handle Missing Values** to run.")
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Sentinels → NaN", result.sentinels_standardized)
+m2.metric("Cells filled", result.cells_filled)
+m3.metric("Rows dropped", result.rows_dropped)
+m4.metric("Columns dropped", len(result.columns_dropped))
+
+if result.columns_dropped:
+    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
+
+st.markdown("**Missingness — before vs. after**")
+before = result.profile_before.to_dataframe().set_index("column")[
+    ["missing", "missing_pct"]
+].rename(columns={"missing": "before_missing", "missing_pct": "before_pct"})
+after = result.profile_after.to_dataframe().set_index("column")[
+    ["missing", "missing_pct"]
+].rename(columns={"missing": "after_missing", "missing_pct": "after_pct"})
+combined = before.join(after, how="outer").fillna(0)
+st.dataframe(combined, use_container_width=True)
+
+if result.strategy_per_column:
+    st.markdown("**Strategy applied per column**")
+    strat_df = pd.DataFrame(
+        [{"column": c, "strategy": s} for c, s in result.strategy_per_column.items()]
+    )
+    st.dataframe(strat_df, use_container_width=True, hide_index=True)
+
+if not result.changes.empty:
+    st.markdown("**Audit (first 50 changes)**")
+    audit_view = result.changes.head(50).copy()
+    audit_view["row"] = audit_view["row"].apply(lambda x: "—" if x == -1 else x + 1)
+    st.dataframe(audit_view, use_container_width=True, hide_index=True)
+    if len(result.changes) > 50:
+        st.caption(f"… and {len(result.changes) - 50} more (download the full audit below).")
+
+st.markdown("**Handled preview (first 10 rows)**")
+st.dataframe(result.handled_df.head(10), use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
+# ---------------------------------------------------------------------------
+
+st.divider()
+stem = Path(st.session_state.get("missing_input_name", "input")).stem
+
+dl_a, dl_b, dl_c = st.columns(3)
+with dl_a:
+    handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
+    st.download_button(
+        "Download handled CSV",
+        data=handled_bytes,
+        file_name=f"{stem}_missing.csv",
+        mime="text/csv",
+    )
+with dl_b:
+    if not result.changes.empty:
+        changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
+        st.download_button(
+            "Download changes audit",
+            data=changes_bytes,
+            file_name=f"{stem}_missing_changes.csv",
+            mime="text/csv",
+        )
+with dl_c:
+    config_bytes = json.dumps(
+        st.session_state.get("missing_options", {}), indent=2, default=str,
+    ).encode("utf-8")
+    st.download_button(
+        "Download config JSON",
+        data=config_bytes,
+        file_name="missing_config.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
--- a/src/gui/pages/5_Column_Mapper.py
+++ b/src/gui/pages/5_Column_Mapper.py
@@ -1,102 +1,413 @@
-"""DataTools Column Mapper — stub page."""
+"""DataTools Column Mapper — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome, require_normalization_gate
+from src.gui.components import (
+    hide_streamlit_chrome,
+    pickup_or_upload,
+    require_normalization_gate,
+)
+from src.core.column_mapper import (
+    MapOptions,
+    PRESETS,
+    TargetField,
+    TargetSchema,
+    infer_mapping,
+    map_columns,
+)

 hide_streamlit_chrome()
 require_normalization_gate()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("🗂️ Column Mapper")
-st.caption("Rename columns, enforce a target schema, and coerce types.")
+st.caption(
+    "Rename columns, enforce a target schema, and coerce types. Runs locally — "
+    "your data never leaves this computer."
+)

-st.info("This tool is under development.")

 # ---------------------------------------------------------------------------
-# What this tool will do
+# File upload
 # ---------------------------------------------------------------------------

-st.markdown("""
-**Features:**
- Rename columns via interactive mapping table
- Load a target schema (JSON/CSV) to auto-map columns
- Fuzzy column name matching for automatic suggestions
- Type coercion (string → int, string → date, etc.)
- Drop unmapped columns or keep as-is
- Reorder columns to match target schema
-""")
+uploaded = pickup_or_upload(
+    label="Upload CSV or Excel file",
+    key="colmap_file_upload",
+    types=["csv", "tsv", "xlsx", "xls"],
+)
+
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()
+
+
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Parse uploaded bytes into a DataFrame, choosing the reader by extension."""
+    suffix = Path(name).suffix.lower()
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(io.BytesIO(data))
+    sep = "\t" if suffix == ".tsv" else ","
+    # latin-1 maps every byte value, so the last attempt can never raise
+    # UnicodeDecodeError; only the two UTF-8 variants need a retry.
+    for enc in ("utf-8", "utf-8-sig"):
+        try:
+            return pd.read_csv(io.BytesIO(data), encoding=enc, sep=sep, on_bad_lines="warn")
+        except UnicodeDecodeError:
+            continue
+    return pd.read_csv(io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn")
+
+
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as e:
+    from src.core.errors import format_for_user
+    st.error(
+        f"**Could not read `{uploaded.name}`**\n\n"
+        f"```\n{format_for_user(e)}\n```"
+    )
+    st.stop()
+
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+st.dataframe(df.head(10), use_container_width=True)
+st.divider()
+
+# ---------------------------------------------------------------------------
+# Schema input
+# ---------------------------------------------------------------------------
+
+st.subheader("Target schema")
+
+schema_mode = st.radio(
+    "How would you like to define the target schema?",
+    [
+        "Build interactively (start from current columns)",
+        "Upload schema JSON",
+        "Skip (rename / coerce only — no schema)",
+    ],
+    index=0,
+    help=(
+        "An interactive build is fastest for one-off cleanup. Upload a JSON "
+        "when you have a fixed contract (a CRM import format, db schema). "
+        "Skip when you only want to rename or coerce specific columns."
+    ),
+)
+
+schema: TargetSchema | None = None
+
+if schema_mode.startswith("Upload"):
+    schema_file = st.file_uploader(
+        "Schema JSON",
+        type=["json"],
+        key="colmap_schema_upload",
+        help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
+    )
+    if schema_file is not None:
+        try:
+            schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
+            st.success(f"Loaded {len(schema.fields)} target field(s).")
+        except Exception as e:
+            from src.core.errors import format_for_user
+            st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
+
+elif schema_mode.startswith("Build"):
+    st.caption(
+        "Edit the table to define your target schema. Add rows for fields the "
+        "input doesn't have yet (with a default), or remove rows for columns "
+        "you want to drop."
+    )
+    initial = pd.DataFrame({
+        "name": list(df.columns),
+        "dtype": ["auto"] * len(df.columns),
+        "required": [False] * len(df.columns),
+        "default": [""] * len(df.columns),
+        "aliases": [""] * len(df.columns),
+    })
+    edited = st.data_editor(
+        initial,
+        use_container_width=True,
+        num_rows="dynamic",
+        column_config={
+            "name": st.column_config.TextColumn("Target name"),
+            "dtype": st.column_config.SelectboxColumn(
+                "Type",
+                options=[
+                    "auto", "string", "integer", "float",
+                    "boolean", "date", "datetime", "category",
+                ],
+            ),
+            "required": st.column_config.CheckboxColumn("Required"),
+            "default": st.column_config.TextColumn("Default (for added cols)"),
+            "aliases": st.column_config.TextColumn(
+                "Aliases (comma-sep, helps fuzzy-match)",
+            ),
+        },
+        key="colmap_schema_editor",
+    )
+    fields: list[TargetField] = []
+    for _, row in edited.iterrows():
+        # Rows added in the editor hold None/NaN in every cell the user has
+        # not filled in yet. str() would turn those into the literal text
+        # "None"/"nan" (a bogus field name, dtype or alias), so normalise
+        # every missing cell to "" before reading it.
+        cells = {k: ("" if pd.isna(v) else v) for k, v in row.items()}
+        name = str(cells.get("name", "")).strip()
+        if not name:
+            continue
+        aliases = [
+            a.strip() for a in str(cells.get("aliases", "")).split(",")
+            if a.strip()
+        ]
+        # An empty (or missing) default means "no default".
+        default_val = cells.get("default", "")
+        if default_val == "":
+            default_val = None
+        fields.append(TargetField(
+            name=name,
+            dtype=str(cells.get("dtype") or "auto"),  # type: ignore[arg-type]
+            # Missing cells are "" by now, so bool() gives False (bool(NaN) is True).
+            required=bool(cells.get("required", False)),
+            aliases=aliases,
+            default=default_val,
+        ))
+    if fields:
+        schema = TargetSchema(fields=fields)

 st.divider()

 # ---------------------------------------------------------------------------
-# File upload (functional)
+# Strategy
 # ---------------------------------------------------------------------------

-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
-    key="colmap_file_upload",
+st.subheader("Strategy")
+
+preset_label = st.radio(
+    "Preset",
+    [
+        "rename-only (just rename, leave types alone, keep extras)",
+        "lenient-schema (rename + coerce + reorder, keep extras)",
+        "strict-schema (rename + coerce + reorder, drop extras)",
+    ],
+    index=0,
 )
+preset_key = preset_label.split(" ", 1)[0]
+options = MapOptions.from_preset(preset_key)
+options.schema = schema

-if uploaded is not None:
-    import pandas as pd
-    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
-
-        st.subheader("Column Mapping")
-        st.caption("Map source columns to target names. (Interactive mapping coming soon.)")
-        mapping_data = pd.DataFrame({
-            "Source Column": df.columns.tolist(),
-            "Target Column": df.columns.tolist(),
-            "Type": ["auto"] * len(df.columns),
-        })
-        st.dataframe(mapping_data, use_container_width=True, hide_index=True)
-    except Exception as e:
-        from src.core.errors import format_for_user
-        st.error(
-            f"**Could not read `{uploaded.name}`**\n\n"
-            f"```\n{format_for_user(e)}\n```"
+with st.expander("Advanced options"):
+    col_a, col_b = st.columns(2)
+    with col_a:
+        options.unmapped = st.selectbox(  # type: ignore[assignment]
+            "Unmapped source columns",
+            ["keep", "drop", "error"],
+            index=["keep", "drop", "error"].index(options.unmapped),
+        )
+        options.coerce_types = st.checkbox(
+            "Coerce types per schema", value=options.coerce_types,
+        )
+        options.reorder_to_schema = st.checkbox(
+            "Reorder to schema order", value=options.reorder_to_schema,
+        )
+    with col_b:
+        options.auto_infer = st.checkbox(
+            "Auto-infer mapping (fuzzy match)", value=options.auto_infer,
+        )
+        options.fuzzy_threshold = st.slider(
+            "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
+        )
+        options.enforce_required = st.checkbox(
+            "Enforce required fields", value=options.enforce_required,
        )

 # ---------------------------------------------------------------------------
-# Placeholder options
+# Mapping editor — show inferred and let user override
 # ---------------------------------------------------------------------------

-st.subheader("Schema Options")
+st.subheader("Mapping")

-st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema")
-st.checkbox("Drop unmapped columns", value=False, disabled=True)
-st.checkbox("Reorder to match schema", value=True, disabled=True)
-
-st.divider()
-st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True)
+if schema is None:
+    st.caption(
+        "No schema — define explicit renames below (left blank means keep "
+        "the source name)."
+    )
+    rename_initial = pd.DataFrame({
+        "source": list(df.columns),
+        "target": list(df.columns),
+    })
+    rename_edited = st.data_editor(
+        rename_initial,
+        use_container_width=True,
+        column_config={
+            "source": st.column_config.TextColumn("Source", disabled=True),
+            "target": st.column_config.TextColumn("Target"),
+        },
+        hide_index=True,
+        key="colmap_rename_only_editor",
+    )
+    # A cleared cell comes back as None/NaN; blank means "keep the source name",
+    # so skip it instead of renaming the column to the literal "None"/"nan".
+    explicit_mapping: dict[str, str] = {}
+    for src, tgt in zip(rename_edited["source"], rename_edited["target"]):
+        if not pd.isna(tgt) and str(tgt).strip() not in ("", str(src)):
+            explicit_mapping[str(src)] = str(tgt).strip()
+    options.mapping = explicit_mapping
+else:
+    inferred = (
+        infer_mapping(df, schema, threshold=options.fuzzy_threshold)
+        if options.auto_infer else {}
+    )
+    target_options = ["(unmapped)"] + schema.field_names()
+    map_initial = pd.DataFrame({
+        "source": list(df.columns),
+        "target": [inferred.get(c, "(unmapped)") for c in df.columns],
+        "auto": [c in inferred for c in df.columns],
+    })
+    map_edited = st.data_editor(
+        map_initial,
+        use_container_width=True,
+        column_config={
+            "source": st.column_config.TextColumn("Source", disabled=True),
+            "target": st.column_config.SelectboxColumn(
+                "Target", options=target_options,
+            ),
+            "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
+        },
+        hide_index=True,
+        key="colmap_schema_mapping_editor",
+    )
+    # A cleared selectbox cell comes back as None/NaN; treat it like
+    # "(unmapped)" instead of mapping the column to a field named "None".
+    explicit_mapping = {}
+    for src, tgt in zip(map_edited["source"], map_edited["target"]):
+        if not pd.isna(tgt) and str(tgt) not in ("", "(unmapped)"):
+            explicit_mapping[str(src)] = str(tgt)
+    options.mapping = explicit_mapping
+    # Disable auto-infer for the actual run since the editor already shows
+    # the user's resolved choices (they can manually re-select to add).
+    options.auto_infer = False

 # ---------------------------------------------------------------------------
-# Footer
+# Run
 # ---------------------------------------------------------------------------

 st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
+
+if st.button("Apply Column Mapping", type="primary", use_container_width=True):
+    with st.spinner("Mapping..."):
+        try:
+            result = map_columns(df, options)
+        except (ValueError, OSError) as e:
+            from src.core.errors import format_for_user
+            st.error(format_for_user(e))
+            st.stop()
+    st.session_state["colmap_result"] = result
+    st.session_state["colmap_input_name"] = uploaded.name
+    st.session_state["colmap_options"] = options.to_dict()
+
+result = st.session_state.get("colmap_result")
+if result is None:
+    st.info("Configure a mapping and click **Apply Column Mapping** to run.")
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Renamed", result.columns_renamed)
+m2.metric("Dropped", len(result.columns_dropped))
+m3.metric("Added", len(result.columns_added))
+m4.metric(
+    "Coerce fails",
+    sum(result.coercion_failures.values()) if result.coercion_failures else 0,
 )
+
+if result.columns_dropped:
+    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
+if result.columns_added:
+    st.info(f"Added (with defaults): {', '.join(result.columns_added)}")
+if result.coercion_failures:
+    st.warning(
+        "Some cells could not be coerced and were left as NaN: "
+        + ", ".join(f"{c} ({n})" for c, n in result.coercion_failures.items())
+    )
+
+if result.mapping:
+    st.markdown("**Resolved mapping**")
+    map_df = pd.DataFrame(
+        [
+            {"source": s, "target": t, "auto": s in result.inferred_pairs}
+            for s, t in result.mapping.items()
+        ],
+    )
+    st.dataframe(map_df, use_container_width=True, hide_index=True)
+
+st.markdown("**Mapped preview (first 10 rows)**")
+st.dataframe(result.mapped_df.head(10), use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
+# ---------------------------------------------------------------------------
+
+st.divider()
+stem = Path(st.session_state.get("colmap_input_name", "input")).stem
+
+dl_a, dl_b, dl_c = st.columns(3)
+with dl_a:
+    mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
+    st.download_button(
+        "Download mapped CSV",
+        data=mapped_bytes,
+        file_name=f"{stem}_mapped.csv",
+        mime="text/csv",
+    )
+with dl_b:
+    audit_bytes = json.dumps({
+        "mapping": result.mapping,
+        "inferred_pairs": result.inferred_pairs,
+        "columns_renamed": result.columns_renamed,
+        "columns_dropped": result.columns_dropped,
+        "columns_added": result.columns_added,
+        "coercion_failures": result.coercion_failures,
+        "unmapped_kept": result.unmapped_kept,
+        "missing_required_targets": result.missing_required_targets,
+    }, indent=2, default=str).encode("utf-8")
+    st.download_button(
+        "Download mapping audit",
+        data=audit_bytes,
+        file_name=f"{stem}_mapping.json",
+        mime="application/json",
+    )
+with dl_c:
+    config_bytes = json.dumps(
+        st.session_state.get("colmap_options", {}), indent=2, default=str,
+    ).encode("utf-8")
+    st.download_button(
+        "Download config JSON",
+        data=config_bytes,
+        file_name="column_map_config.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
--- a/src/gui/pages/9_Pipeline_Runner.py
+++ b/src/gui/pages/9_Pipeline_Runner.py
@@ -1,104 +1,370 @@
-"""DataTools Pipeline Runner — stub page."""
+"""DataTools Pipeline Runner — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome, require_normalization_gate
+from src.gui.components import (
+    hide_streamlit_chrome,
+    pickup_or_upload,
+    require_normalization_gate,
+)
+from src.core.pipeline import (
+    Pipeline,
+    SOFT_DEPENDENCIES,
+    Step,
+    TOOL_NAMES,
+    recommended_pipeline,
+    run_pipeline,
+    validate_pipeline,
+)

 hide_streamlit_chrome()
 require_normalization_gate()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("⚙️ Pipeline Runner")
-st.caption("Chain tools in sequence and pass output between steps automatically.")
-
-st.info("This tool is under development.")
-
-# ---------------------------------------------------------------------------
-# What this tool will do
-# ---------------------------------------------------------------------------
-
-st.markdown("""
-**Features:**
- Select tools to run in sequence
- Recommended order: Text Cleaner → Format Standardizer → Missing Values → Deduplicator → Validator
- Each step's output feeds into the next step's input
- Per-step configuration overrides
- Progress tracking across all steps
- Final combined report
-""")
-
-st.divider()
-
-# ---------------------------------------------------------------------------
-# File upload (functional)
-# ---------------------------------------------------------------------------
-
-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
-    key="pipeline_file_upload",
+st.caption(
+    "Chain DataTools cleaning steps into one repeatable workflow. The "
+    "pipeline recommends an order; you stay in control."
 )

-if uploaded is not None:
-    import pandas as pd
+
+# ---------------------------------------------------------------------------
+# File upload
+# ---------------------------------------------------------------------------
+
+uploaded = pickup_or_upload(
+    label="Upload CSV or Excel file",
+    key="pipeline_file_upload",
+    types=["csv", "tsv", "xlsx", "xls"],
+)
+
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()
+
+
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Parse uploaded bytes into a DataFrame, choosing the reader by extension."""
+    suffix = Path(name).suffix.lower()
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(io.BytesIO(data))
+    sep = "\t" if suffix == ".tsv" else ","
+    # latin-1 maps every byte value, so the last attempt can never raise
+    # UnicodeDecodeError; only the two UTF-8 variants need a retry.
+    for enc in ("utf-8", "utf-8-sig"):
+        try:
+            return pd.read_csv(io.BytesIO(data), encoding=enc, sep=sep, on_bad_lines="warn")
+        except UnicodeDecodeError:
+            continue
+    return pd.read_csv(io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn")
+
+
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as e:
+    from src.core.errors import format_for_user
+    st.error(
+        f"**Could not read `{uploaded.name}`**\n\n"
+        f"```\n{format_for_user(e)}\n```"
+    )
+    st.stop()
+
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+st.dataframe(df.head(10), use_container_width=True)
+st.divider()
+
+
+# ---------------------------------------------------------------------------
+# Pipeline builder
+# ---------------------------------------------------------------------------
+
+st.subheader("Pipeline")
+
+mode = st.radio(
+    "How would you like to define the pipeline?",
+    [
+        "Use the recommended default (text-clean → format → missing → dedup)",
+        "Build interactively",
+        "Upload a saved pipeline JSON",
+    ],
+    index=0,
+)
+
+if "pipeline_rows" not in st.session_state:
+    default = recommended_pipeline()
+    st.session_state["pipeline_rows"] = pd.DataFrame([
+        {
+            "tool": s.tool, "enabled": s.enabled,
+            "options_json": json.dumps(s.options),
+        }
+        for s in default.steps
+    ])
+
+if mode.startswith("Use the recommended"):
+    default = recommended_pipeline()
+    st.session_state["pipeline_rows"] = pd.DataFrame([
+        {
+            "tool": s.tool, "enabled": s.enabled,
+            "options_json": json.dumps(s.options),
+        }
+        for s in default.steps
+    ])
+elif mode.startswith("Upload"):
+    pipeline_file = st.file_uploader(
+        "Pipeline JSON", type=["json"], key="pipeline_upload",
+    )
+    if pipeline_file is not None:
+        try:
+            data = json.loads(pipeline_file.getvalue())
+            uploaded_pipe = Pipeline.from_dict(data)
+            st.session_state["pipeline_rows"] = pd.DataFrame([
+                {
+                    "tool": s.tool, "enabled": s.enabled,
+                    "options_json": json.dumps(s.options),
+                }
+                for s in uploaded_pipe.steps
+            ])
+            st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
+        except Exception as e:
+            from src.core.errors import format_for_user
+            st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
+
+st.caption(
+    "Edit the table to add, remove, reorder (drag the row index), enable, "
+    "or configure each step. Tool order is recommended, not enforced — "
+    "violations surface as warnings below the table."
+)
+edited = st.data_editor(
+    st.session_state["pipeline_rows"],
+    use_container_width=True,
+    num_rows="dynamic",
+    column_config={
+        "tool": st.column_config.SelectboxColumn(
+            "Tool", options=TOOL_NAMES, required=True,
+        ),
+        "enabled": st.column_config.CheckboxColumn("Enabled"),
+        "options_json": st.column_config.TextColumn(
+            "Options (JSON)",
+            help='e.g. {"column_types": {"phone": "phone"}}',
+        ),
+    },
+    key="pipeline_editor",
+)
+st.session_state["pipeline_rows"] = edited
+
+# Build a Pipeline object from the editor state.
+steps_list: list[Step] = []
+parse_errors: list[str] = []
+for i, row in edited.iterrows():
+    tool = row.get("tool")
+    if not tool or pd.isna(tool):
+        continue
+    raw_opts = row.get("options_json") or "{}"
+    if pd.isna(raw_opts):
+        raw_opts = "{}"
    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
+        opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
+        if not isinstance(opts, dict):
+            raise ValueError("options must be a JSON object")
    except Exception as e:
-        from src.core.errors import format_for_user
-        st.error(
-            f"**Could not read `{uploaded.name}`**\n\n"
-            f"```\n{format_for_user(e)}\n```"
+        parse_errors.append(f"Step {i + 1}: {e}")
+        continue
+    try:
+        steps_list.append(Step(
+            tool=str(tool),
+            options=opts,
+            enabled=bool(row.get("enabled", True)),
+        ))
+    except Exception as e:
+        parse_errors.append(f"Step {i + 1}: {e}")
+
+if parse_errors:
+    for err in parse_errors:
+        st.error(err)
+
+current_pipeline = Pipeline(steps=steps_list) if steps_list else None
+
+if current_pipeline is not None:
+    warnings = validate_pipeline(current_pipeline)
+    if warnings:
+        st.warning(
+            "Pipeline is out of recommended order:\n\n"
+            + "\n".join(f"- {w}" for w in warnings)
+            + "\n\nThe pipeline will still run — these are recommendations only."
        )

-# ---------------------------------------------------------------------------
-# Pipeline steps (checklist)
-# ---------------------------------------------------------------------------
-
-st.subheader("Pipeline Steps")
-st.caption("Select tools to include in the pipeline (recommended order):")
-
-st.checkbox("1. Text Cleaner", value=True, disabled=True)
-st.checkbox("2. Format Standardizer", value=True, disabled=True)
-st.checkbox("3. Missing Value Handler", value=True, disabled=True)
-st.checkbox("4. Column Mapper", value=False, disabled=True)
-st.checkbox("5. Outlier Detector", value=False, disabled=True)
-st.checkbox("6. Deduplicator", value=True, disabled=True)
-st.checkbox("7. Multi-File Merger", value=False, disabled=True)
-st.checkbox("8. Validator & Reporter", value=True, disabled=True)
-
-st.subheader("Pipeline Configuration")
-
-st.selectbox("On error", ["Stop pipeline", "Skip step and continue", "Prompt for decision"], disabled=True)
-st.checkbox("Generate combined report at end", value=True, disabled=True)
+with st.expander("Recommended tool order — why each step belongs where it does"):
+    st.markdown(
+        "\n".join(
+            f"- **{e}** before **{l}** — {why}"
+            for e, l, why in SOFT_DEPENDENCIES
+        )
+    )

 st.divider()
-st.button("Run Pipeline", type="primary", use_container_width=True, disabled=True)

 # ---------------------------------------------------------------------------
-# Footer
+# Run
+# ---------------------------------------------------------------------------
+
+run_disabled = current_pipeline is None or not current_pipeline.steps
+
+if st.button(
+    "Run Pipeline",
+    type="primary",
+    use_container_width=True,
+    disabled=run_disabled,
+):
+    progress = st.progress(0.0, text="Starting...")
+    log_box = st.empty()
+    log_lines: list[str] = []
+    total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
+    completed = [0]
+
+    def _on_step(sr) -> None:
+        """Log one finished step and advance the bar (clamped to 100%)."""
+        # total_enabled excludes disabled steps; presumably those are the skipped ones.
+        completed[0] += 0 if sr.skipped else 1
+        name = sr.step.display_name()
+        if sr.skipped:
+            log_lines.append(f"○ {name} (skipped)")
+        elif sr.error:
+            log_lines.append(f"✗ {name} — {sr.error.splitlines()[0]}")
+        else:
+            log_lines.append(f"✓ {name} — {sr.elapsed_seconds*1000:.0f} ms")
+        # Two trailing spaces force a Markdown line break between entries.
+        log_box.markdown("  \n".join(log_lines))
+        progress.progress(
+            min(completed[0] / max(total_enabled, 1), 1.0),
+            text=f"Step {min(completed[0], total_enabled)}/{total_enabled}",
+        )
+
+    try:
+        result = run_pipeline(
+            df, current_pipeline,
+            on_step_complete=_on_step,
+            stop_on_error=False,
+        )
+    except Exception as e:
+        from src.core.errors import format_for_user
+        st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
+        st.stop()
+
+    progress.progress(1.0, text="Done")
+    st.session_state["pipeline_result"] = result
+    st.session_state["pipeline_input_name"] = uploaded.name
+
+result = st.session_state.get("pipeline_result")
+if result is None:
+    st.info(
+        "Configure the pipeline above and click **Run Pipeline** to "
+        "execute it on your file."
+    )
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Initial rows", result.initial_rows)
+m2.metric("Final rows", result.final_rows)
+m3.metric("Steps run", sum(1 for s in result.step_results if not s.skipped))
+m4.metric("Elapsed", f"{result.total_elapsed:.2f} s")
+
+st.markdown("**Per-step summary**")
+step_df = pd.DataFrame([
+    {
+        "step": sr.step.display_name(),
+        "status": (
+            "skipped" if sr.skipped
+            else "error" if sr.error
+            else "ok"
+        ),
+        "elapsed_ms": int(sr.elapsed_seconds * 1000),
+        "summary": json.dumps(sr.summary, default=str)[:200],
+        "error": sr.error or "",
+    }
+    for sr in result.step_results
+])
+st.dataframe(step_df, use_container_width=True, hide_index=True)
+
+st.markdown("**Output preview (first 10 rows)**")
+st.dataframe(result.final_df.head(10), use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
 # ---------------------------------------------------------------------------

 st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
-)
+stem = Path(st.session_state.get("pipeline_input_name", "input")).stem
+
+dl_a, dl_b, dl_c = st.columns(3)
+with dl_a:
+    bytes_csv = result.final_df.to_csv(index=False).encode("utf-8-sig")
+    st.download_button(
+        "Download cleaned CSV",
+        data=bytes_csv,
+        file_name=f"{stem}_pipeline.csv",
+        mime="text/csv",
+    )
+with dl_b:
+    pipeline_bytes = json.dumps(
+        current_pipeline.to_dict() if current_pipeline else {"steps": []},
+        indent=2, default=str,
+    ).encode("utf-8")
+    st.download_button(
+        "Download pipeline JSON",
+        data=pipeline_bytes,
+        file_name="pipeline.json",
+        mime="application/json",
+        help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
+    )
+with dl_c:
+    audit_bytes = json.dumps({
+        "warnings": result.warnings,
+        "initial_rows": result.initial_rows,
+        "final_rows": result.final_rows,
+        "total_elapsed_seconds": result.total_elapsed,
+        "steps": [
+            {
+                "tool": sr.step.tool,
+                "name": sr.step.display_name(),
+                "enabled": sr.step.enabled,
+                "skipped": sr.skipped,
+                "elapsed_seconds": sr.elapsed_seconds,
+                "summary": sr.summary,
+                "error": sr.error,
+            }
+            for sr in result.step_results
+        ],
+    }, indent=2, default=str).encode("utf-8")
+    st.download_button(
+        "Download run audit",
+        data=audit_bytes,
+        file_name=f"{stem}_pipeline_audit.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
--- a/src/gui/tools_registry.py
+++ b/src/gui/tools_registry.py
@@ -78,7 +78,7 @@ TOOLS: list[Tool] = [
            "Detect disguised nulls, missingness analysis, and imputation strategies."
        ),
        page_slug="4_Missing_Values",
-        status="Coming Soon",
+        status="Ready",
    ),
    Tool(
        tool_id="05_column_mapper",
@@ -86,7 +86,7 @@ TOOLS: list[Tool] = [
        name="Column Mapper",
        description="Rename columns, enforce a target schema, and coerce types.",
        page_slug="5_Column_Mapper",
-        status="Coming Soon",
+        status="Ready",
    ),
    Tool(
        tool_id="06_outlier_detector",
@@ -125,7 +125,7 @@ TOOLS: list[Tool] = [
            "Chain tools in recommended order and pass output between steps."
        ),
        page_slug="9_Pipeline_Runner",
-        status="Coming Soon",
+        status="Ready",
    ),
 ]