feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler  src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper          src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner        src/core/pipeline.py + cli_pipeline.py + GUI
     with soft tool-dependency graph (recommended, not enforced) and JSON
     save/load for repeatable weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing, 100-row cap
    + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS,
    deploy.py URL-substitution script, auto-generated robots.txt +
    sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md —
    full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0 xfailed

Tier-1 corpora added:
  • missing-corpus 3 use cases + 16 edge cases
  • column-mapper-corpus 3 use cases + 5 edge cases
  • format-cleaner intl 20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/src/gui/pages/9_Pipeline_Runner.py
+++ b/src/gui/pages/9_Pipeline_Runner.py
@@ -1,104 +1,370 @@
-"""DataTools Pipeline Runner — stub page."""
+"""DataTools Pipeline Runner — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome, require_normalization_gate
+from src.gui.components import (
+    hide_streamlit_chrome,
+    pickup_or_upload,
+    require_normalization_gate,
+)
+from src.core.pipeline import (
+    Pipeline,
+    SOFT_DEPENDENCIES,
+    Step,
+    TOOL_NAMES,
+    recommended_pipeline,
+    run_pipeline,
+    validate_pipeline,
+)

 hide_streamlit_chrome()
 require_normalization_gate()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("⚙️ Pipeline Runner")
-st.caption("Chain tools in sequence and pass output between steps automatically.")
-
-st.info("This tool is under development.")
-
-# ---------------------------------------------------------------------------
-# What this tool will do
-# ---------------------------------------------------------------------------
-
-st.markdown("""
-**Features:**
- Select tools to run in sequence
- Recommended order: Text Cleaner → Format Standardizer → Missing Values → Deduplicator → Validator
- Each step's output feeds into the next step's input
- Per-step configuration overrides
- Progress tracking across all steps
- Final combined report
-""")
-
-st.divider()
-
-# ---------------------------------------------------------------------------
-# File upload (functional)
-# ---------------------------------------------------------------------------
-
-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
-    key="pipeline_file_upload",
+st.caption(
+    "Chain DataTools cleaning steps into one repeatable workflow. The "
+    "pipeline recommends an order; you stay in control."
 )

-if uploaded is not None:
-    import pandas as pd
+
+# ---------------------------------------------------------------------------
+# File upload
+# ---------------------------------------------------------------------------
+
+uploaded = pickup_or_upload(
+    label="Upload CSV or Excel file",
+    key="pipeline_file_upload",
+    types=["csv", "tsv", "xlsx", "xls"],
+)
+
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()
+
+
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Parse an uploaded file's bytes into a DataFrame.
+
+    The reader is chosen from the extension of ``name``: ``.xlsx``/``.xls``
+    go through ``pd.read_excel``; anything else is read as delimited text,
+    tab-separated for ``.tsv`` and comma-separated otherwise.
+
+    Text is decoded as UTF-8 first (``utf-8-sig``, so a leading BOM is
+    dropped) and falls back to Latin-1 when the bytes are not valid UTF-8.
+    Latin-1 assigns a character to every byte value, so the fallback cannot
+    raise ``UnicodeDecodeError`` and no further encodings are attempted.
+
+    Args:
+        name: Original file name; only its suffix is inspected.
+        data: Raw file contents.
+
+    Returns:
+        The parsed table. Parse errors raised by pandas propagate to the
+        caller.
+    """
+    suffix = Path(name).suffix.lower()
+    bio = io.BytesIO(data)
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(bio)
+
+    # The delimiter depends only on the extension, so compute it once and use
+    # it for every decoding attempt (including the fallback).
+    sep = "\t" if suffix == ".tsv" else ","
+    try:
+        return pd.read_csv(
+            bio, encoding="utf-8-sig", sep=sep, on_bad_lines="warn",
+        )
+    except UnicodeDecodeError:
+        # Decoding may have failed part-way through; rewind before retrying.
+        bio.seek(0)
+        return pd.read_csv(
+            bio, encoding="latin-1", sep=sep, on_bad_lines="warn",
+        )
+
+
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as e:
+    from src.core.errors import format_for_user
+    st.error(
+        f"**Could not read `{uploaded.name}`**\n\n"
+        f"```\n{format_for_user(e)}\n```"
+    )
+    st.stop()
+
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+st.dataframe(df.head(10), use_container_width=True)
+st.divider()
+
+
+# ---------------------------------------------------------------------------
+# Pipeline builder
+# ---------------------------------------------------------------------------
+
+st.subheader("Pipeline")
+
+mode = st.radio(
+    "How would you like to define the pipeline?",
+    [
+        "Use the recommended default (text-clean → format → missing → dedup)",
+        "Build interactively",
+        "Upload a saved pipeline JSON",
+    ],
+    index=0,
+)
+
+# Column order of the step-editor table. Passing it explicitly keeps the
+# columns present even when a pipeline has no steps; an empty list of rows
+# would otherwise produce a DataFrame with no columns at all.
+_EDITOR_COLUMNS = ["tool", "enabled", "options_json"]
+
+
+def _steps_to_rows(steps: list[Step]) -> pd.DataFrame:
+    """Flatten pipeline steps into the table shown in the step editor.
+
+    Each step becomes one row holding its tool name, its enabled flag, and
+    its options serialised as a JSON string.
+    """
+    return pd.DataFrame(
+        [
+            {
+                "tool": s.tool, "enabled": s.enabled,
+                "options_json": json.dumps(s.options),
+            }
+            for s in steps
+        ],
+        columns=_EDITOR_COLUMNS,
+    )
+
+
+# Recommended mode resets the table to the default on every run; the other
+# modes only need a default the first time the page runs in this session.
+if (
+    mode.startswith("Use the recommended")
+    or "pipeline_rows" not in st.session_state
+):
+    st.session_state["pipeline_rows"] = _steps_to_rows(
+        recommended_pipeline().steps
+    )
+
+if mode.startswith("Upload"):
+    pipeline_file = st.file_uploader(
+        "Pipeline JSON", type=["json"], key="pipeline_upload",
+    )
+    if pipeline_file is not None:
+        # Broad catch on purpose: this is the UI boundary, and both malformed
+        # JSON and a structurally invalid pipeline should surface as a
+        # message on the page rather than a traceback.
+        try:
+            data = json.loads(pipeline_file.getvalue())
+            uploaded_pipe = Pipeline.from_dict(data)
+            st.session_state["pipeline_rows"] = _steps_to_rows(
+                uploaded_pipe.steps
+            )
+            st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
+        except Exception as e:
+            from src.core.errors import format_for_user
+            st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
+
+st.caption(
+    "Edit the table to add, remove, reorder (drag the row index), enable, "
+    "or configure each step. Tool order is recommended, not enforced — "
+    "violations surface as warnings below the table."
+)
+# Step editor: one row per step. ``num_rows="dynamic"`` lets the user add and
+# delete rows; the tool column is restricted to the known tool names.
+edited = st.data_editor(
+    st.session_state["pipeline_rows"],
+    use_container_width=True,
+    num_rows="dynamic",
+    column_config={
+        "tool": st.column_config.SelectboxColumn(
+            "Tool", options=TOOL_NAMES, required=True,
+        ),
+        "enabled": st.column_config.CheckboxColumn("Enabled"),
+        "options_json": st.column_config.TextColumn(
+            "Options (JSON)",
+            help='e.g. {"column_types": {"phone": "phone"}}',
+        ),
+    },
+    key="pipeline_editor",
+)
+# ``edited`` is deliberately NOT written back to
+# ``st.session_state["pipeline_rows"]``. That key is the editor's *input*:
+# feeding the editor's output back into it changes the input data on the next
+# rerun, which can reset the widget's edit state and drop the edit that
+# triggered the rerun. The user's edits already persist in the widget's own
+# state under the "pipeline_editor" key, so the seed table can stay stable.
+# NOTE(review): nothing else on this page reads "pipeline_rows"; confirm no
+# other page relies on it holding the edited table.
+
+# Build a Pipeline object from the editor state.
+steps_list: list[Step] = []
+parse_errors: list[str] = []
+for i, row in edited.iterrows():
+    tool = row.get("tool")
+    if not tool or pd.isna(tool):
+        continue
+    raw_opts = row.get("options_json") or "{}"
+    if pd.isna(raw_opts):
+        raw_opts = "{}"
    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
+        opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
+        if not isinstance(opts, dict):
+            raise ValueError("options must be a JSON object")
    except Exception as e:
-        from src.core.errors import format_for_user
-        st.error(
-            f"**Could not read `{uploaded.name}`**\n\n"
-            f"```\n{format_for_user(e)}\n```"
+        parse_errors.append(f"Step {i + 1}: {e}")
+        continue
+    try:
+        steps_list.append(Step(
+            tool=str(tool),
+            options=opts,
+            enabled=bool(row.get("enabled", True)),
+        ))
+    except Exception as e:
+        parse_errors.append(f"Step {i + 1}: {e}")
+
+if parse_errors:
+    for err in parse_errors:
+        st.error(err)
+
+current_pipeline = Pipeline(steps=steps_list) if steps_list else None
+
+if current_pipeline is not None:
+    warnings = validate_pipeline(current_pipeline)
+    if warnings:
+        st.warning(
+            "Pipeline is out of recommended order:\n\n"
+            + "\n".join(f"- {w}" for w in warnings)
+            + "\n\nThe pipeline will still run — these are recommendations only."
        )

-# ---------------------------------------------------------------------------
-# Pipeline steps (checklist)
-# ---------------------------------------------------------------------------
-
-st.subheader("Pipeline Steps")
-st.caption("Select tools to include in the pipeline (recommended order):")
-
-st.checkbox("1. Text Cleaner", value=True, disabled=True)
-st.checkbox("2. Format Standardizer", value=True, disabled=True)
-st.checkbox("3. Missing Value Handler", value=True, disabled=True)
-st.checkbox("4. Column Mapper", value=False, disabled=True)
-st.checkbox("5. Outlier Detector", value=False, disabled=True)
-st.checkbox("6. Deduplicator", value=True, disabled=True)
-st.checkbox("7. Multi-File Merger", value=False, disabled=True)
-st.checkbox("8. Validator & Reporter", value=True, disabled=True)
-
-st.subheader("Pipeline Configuration")
-
-st.selectbox("On error", ["Stop pipeline", "Skip step and continue", "Prompt for decision"], disabled=True)
-st.checkbox("Generate combined report at end", value=True, disabled=True)
+with st.expander("Recommended tool order — why each step belongs where it does"):
+    st.markdown(
+        "\n".join(
+            f"- **{e}** before **{l}** — {why}"
+            for e, l, why in SOFT_DEPENDENCIES
+        )
+    )

 st.divider()
-st.button("Run Pipeline", type="primary", use_container_width=True, disabled=True)

 # ---------------------------------------------------------------------------
-# Footer
+# Run
+# ---------------------------------------------------------------------------
+
+# The button is disabled until at least one valid step has been parsed.
+run_disabled = current_pipeline is None or not current_pipeline.steps
+
+if st.button(
+    "Run Pipeline",
+    type="primary",
+    use_container_width=True,
+    disabled=run_disabled,
+):
+    progress = st.progress(0.0, text="Starting...")
+    log_box = st.empty()
+    log_lines: list[str] = []
+    total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
+    # One-element list so the callback below can mutate the counter.
+    completed = [0]
+
+    def _on_step(sr) -> None:
+        """Append one log line for a finished step and advance the bar.
+
+        ``sr`` is the step result handed over by ``run_pipeline``; this
+        callback reads its ``step``, ``skipped``, ``error`` and
+        ``elapsed_seconds`` attributes.
+        """
+        name = sr.step.display_name()
+        if sr.skipped:
+            log_lines.append(f"○ {name} (skipped)")
+        else:
+            # Only executed steps advance the counter, because the
+            # denominator counts enabled steps only.
+            # NOTE(review): presumably disabled steps are the ones reported
+            # as skipped — confirm against run_pipeline.
+            completed[0] += 1
+            if sr.error:
+                log_lines.append(f"✗ {name} — {sr.error.splitlines()[0]}")
+            else:
+                log_lines.append(
+                    f"✓ {name} — {sr.elapsed_seconds*1000:.0f} ms"
+                )
+        # Two trailing spaces force a Markdown line break; a bare "\n" would
+        # fold all log lines into a single paragraph.
+        log_box.markdown("  \n".join(log_lines))
+        # Cap at the total so the fraction passed to the progress bar can
+        # never exceed 1.0, whatever the callback is invoked for.
+        done = min(completed[0], total_enabled)
+        progress.progress(
+            done / max(total_enabled, 1),
+            text=f"Step {done}/{total_enabled}",
+        )
+
+    # stop_on_error=False: failing steps are reported per step instead of
+    # aborting the run; anything that still escapes is shown as a halt.
+    try:
+        result = run_pipeline(
+            df, current_pipeline,
+            on_step_complete=_on_step,
+            stop_on_error=False,
+        )
+    except Exception as e:
+        from src.core.errors import format_for_user
+        st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
+        st.stop()
+
+    progress.progress(1.0, text="Done")
+    # Keep the result across reruns so the downloads below survive clicks.
+    st.session_state["pipeline_result"] = result
+    st.session_state["pipeline_input_name"] = uploaded.name
+
+result = st.session_state.get("pipeline_result")
+if result is None:
+    st.info(
+        "Configure the pipeline above and click **Run Pipeline** to "
+        "execute it on your file."
+    )
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+# Headline figures, laid out left to right in four equal columns.
+executed = [sr for sr in result.step_results if not sr.skipped]
+headline = (
+    ("Initial rows", result.initial_rows),
+    ("Final rows", result.final_rows),
+    ("Steps run", len(executed)),
+    ("Elapsed", f"{result.total_elapsed:.2f} s"),
+)
+for tile, (label, value) in zip(st.columns(4), headline):
+    tile.metric(label, value)
+
+
+def _status_of(sr) -> str:
+    """Label a step result: skipped takes precedence over error, else ok."""
+    if sr.skipped:
+        return "skipped"
+    if sr.error:
+        return "error"
+    return "ok"
+
+
+st.markdown("**Per-step summary**")
+summary_rows = []
+for sr in result.step_results:
+    summary_rows.append({
+        "step": sr.step.display_name(),
+        "status": _status_of(sr),
+        "elapsed_ms": int(sr.elapsed_seconds * 1000),
+        # Summaries are truncated to 200 characters to keep the table narrow.
+        "summary": json.dumps(sr.summary, default=str)[:200],
+        "error": sr.error or "",
+    })
+st.dataframe(
+    pd.DataFrame(summary_rows), use_container_width=True, hide_index=True,
+)
+
+st.markdown("**Output preview (first 10 rows)**")
+preview = result.final_df.head(10)
+st.dataframe(preview, use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
 # ---------------------------------------------------------------------------

 st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
-)
+# Download file names are derived from the input name recorded at run time.
+input_name = st.session_state.get("pipeline_input_name", "input")
+stem = Path(input_name).stem
+
+col_csv, col_pipeline, col_audit = st.columns(3)
+
+with col_csv:
+    # utf-8-sig writes a BOM at the start of the CSV bytes.
+    st.download_button(
+        "Download cleaned CSV",
+        data=result.final_df.to_csv(index=False).encode("utf-8-sig"),
+        file_name=f"{stem}_pipeline.csv",
+        mime="text/csv",
+    )
+
+with col_pipeline:
+    # Exports the pipeline as currently defined in the editor; an empty
+    # pipeline is written when no valid step exists.
+    if current_pipeline:
+        pipeline_payload = current_pipeline.to_dict()
+    else:
+        pipeline_payload = {"steps": []}
+    st.download_button(
+        "Download pipeline JSON",
+        data=json.dumps(pipeline_payload, indent=2, default=str).encode("utf-8"),
+        file_name="pipeline.json",
+        mime="application/json",
+        help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
+    )
+
+with col_audit:
+    # One record per step result, in execution order.
+    step_records = []
+    for sr in result.step_results:
+        step_records.append({
+            "tool": sr.step.tool,
+            "name": sr.step.display_name(),
+            "enabled": sr.step.enabled,
+            "skipped": sr.skipped,
+            "elapsed_seconds": sr.elapsed_seconds,
+            "summary": sr.summary,
+            "error": sr.error,
+        })
+    audit_payload = {
+        "warnings": result.warnings,
+        "initial_rows": result.initial_rows,
+        "final_rows": result.final_rows,
+        "total_elapsed_seconds": result.total_elapsed,
+        "steps": step_records,
+    }
+    st.download_button(
+        "Download run audit",
+        data=json.dumps(audit_payload, indent=2, default=str).encode("utf-8"),
+        file_name=f"{stem}_pipeline_audit.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")