feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,104 +1,370 @@
|
||||
"""DataTools Pipeline Runner — stub page."""
|
||||
"""DataTools Pipeline Runner — Streamlit page."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
from src.gui.components import (
|
||||
hide_streamlit_chrome,
|
||||
pickup_or_upload,
|
||||
require_normalization_gate,
|
||||
)
|
||||
from src.core.pipeline import (
|
||||
Pipeline,
|
||||
SOFT_DEPENDENCIES,
|
||||
Step,
|
||||
TOOL_NAMES,
|
||||
recommended_pipeline,
|
||||
run_pipeline,
|
||||
validate_pipeline,
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("⚙️ Pipeline Runner")
|
||||
st.caption("Chain tools in sequence and pass output between steps automatically.")
|
||||
|
||||
st.info("This tool is under development.")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# What this tool will do
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.markdown("""
|
||||
**Features:**
|
||||
- Select tools to run in sequence
|
||||
- Recommended order: Text Cleaner → Format Standardizer → Missing Values → Deduplicator → Validator
|
||||
- Each step's output feeds into the next step's input
|
||||
- Per-step configuration overrides
|
||||
- Progress tracking across all steps
|
||||
- Final combined report
|
||||
""")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload (functional)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Upload a file to preview. Processing is not yet available.",
|
||||
key="pipeline_file_upload",
|
||||
st.caption(
|
||||
"Chain DataTools cleaning steps into one repeatable workflow. The "
|
||||
"pipeline recommends an order; you stay in control."
|
||||
)
|
||||
|
||||
if uploaded is not None:
|
||||
import pandas as pd
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = pickup_or_upload(
|
||||
label="Upload CSV or Excel file",
|
||||
key="pipeline_file_upload",
|
||||
types=["csv", "tsv", "xlsx", "xls"],
|
||||
)
|
||||
|
||||
if uploaded is None:
|
||||
st.info("Upload a CSV, TSV, or Excel file to begin.")
|
||||
st.stop()
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse the raw bytes of an uploaded file into a DataFrame.

    Args:
        name: Original file name; only its (lower-cased) extension is used
            to choose the parser and the delimiter.
        data: Full file content as bytes.

    Returns:
        The parsed table. ``.xlsx`` / ``.xls`` go through ``pd.read_excel``;
        everything else is read as delimited text (tab for ``.tsv``, comma
        otherwise).

    Raises:
        Whatever pandas raises for unreadable content other than a
        ``UnicodeDecodeError`` (those trigger the next encoding attempt).
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    # The delimiter depends only on the extension, so decide it once instead
    # of on every encoding attempt.
    sep = "\t" if suffix == ".tsv" else ","

    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            # Rewind: a failed attempt may have consumed part of the buffer.
            bio.seek(0)
            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
        except UnicodeDecodeError:
            continue

    # Defensive last resort. latin-1 maps every byte value, so the loop above
    # is expected to have returned already; if this is ever reached, parse
    # with the same delimiter and bad-line policy as the normal path rather
    # than silently falling back to comma-separated / strict parsing.
    bio.seek(0)
    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
|
||||
|
||||
|
||||
try:
|
||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||
except Exception as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(
|
||||
f"**Could not read `{uploaded.name}`**\n\n"
|
||||
f"```\n{format_for_user(e)}\n```"
|
||||
)
|
||||
st.stop()
|
||||
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
st.divider()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline builder
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Pipeline")
|
||||
|
||||
mode = st.radio(
|
||||
"How would you like to define the pipeline?",
|
||||
[
|
||||
"Use the recommended default (text-clean → format → missing → dedup)",
|
||||
"Build interactively",
|
||||
"Upload a saved pipeline JSON",
|
||||
],
|
||||
index=0,
|
||||
)
|
||||
|
||||
def _rows_from_pipeline(pipe: Pipeline) -> pd.DataFrame:
    """Convert a Pipeline into the row layout used by the step editor.

    One row per step with the columns ``tool``, ``enabled`` and
    ``options_json`` (the step's options serialised as a JSON string so they
    can be edited in a plain text cell).
    """
    return pd.DataFrame([
        {
            "tool": s.tool, "enabled": s.enabled,
            "options_json": json.dumps(s.options),
        }
        for s in pipe.steps
    ])


# First visit in this session: seed the editor with the recommended pipeline.
if "pipeline_rows" not in st.session_state:
    st.session_state["pipeline_rows"] = _rows_from_pipeline(recommended_pipeline())

if mode.startswith("Use the recommended"):
    # Selecting the default mode always resets the editor rows to the
    # recommended pipeline.
    st.session_state["pipeline_rows"] = _rows_from_pipeline(recommended_pipeline())
elif mode.startswith("Upload"):
    pipeline_file = st.file_uploader(
        "Pipeline JSON", type=["json"], key="pipeline_upload",
    )
    if pipeline_file is not None:
        try:
            data = json.loads(pipeline_file.getvalue())
            uploaded_pipe = Pipeline.from_dict(data)
            st.session_state["pipeline_rows"] = _rows_from_pipeline(uploaded_pipe)
            st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
        except Exception as e:
            # UI boundary: any parse/validation failure is shown to the user
            # instead of crashing the page.
            from src.core.errors import format_for_user
            st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
# "Build interactively" needs no seeding here: the rows already held in
# session state are used as they are.
|
||||
|
||||
st.caption(
|
||||
"Edit the table to add, remove, reorder (drag the row index), enable, "
|
||||
"or configure each step. Tool order is recommended, not enforced — "
|
||||
"violations surface as warnings below the table."
|
||||
)
|
||||
edited = st.data_editor(
|
||||
st.session_state["pipeline_rows"],
|
||||
use_container_width=True,
|
||||
num_rows="dynamic",
|
||||
column_config={
|
||||
"tool": st.column_config.SelectboxColumn(
|
||||
"Tool", options=TOOL_NAMES, required=True,
|
||||
),
|
||||
"enabled": st.column_config.CheckboxColumn("Enabled"),
|
||||
"options_json": st.column_config.TextColumn(
|
||||
"Options (JSON)",
|
||||
help='e.g. {"column_types": {"phone": "phone"}}',
|
||||
),
|
||||
},
|
||||
key="pipeline_editor",
|
||||
)
|
||||
st.session_state["pipeline_rows"] = edited
|
||||
|
||||
# Build a Pipeline object from the editor state.
|
||||
steps_list: list[Step] = []
|
||||
parse_errors: list[str] = []
|
||||
for i, row in edited.iterrows():
|
||||
tool = row.get("tool")
|
||||
if not tool or pd.isna(tool):
|
||||
continue
|
||||
raw_opts = row.get("options_json") or "{}"
|
||||
if pd.isna(raw_opts):
|
||||
raw_opts = "{}"
|
||||
try:
|
||||
if uploaded.name.endswith((".xlsx", ".xls")):
|
||||
df = pd.read_excel(uploaded)
|
||||
else:
|
||||
df = pd.read_csv(uploaded)
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
|
||||
if not isinstance(opts, dict):
|
||||
raise ValueError("options must be a JSON object")
|
||||
except Exception as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(
|
||||
f"**Could not read `{uploaded.name}`**\n\n"
|
||||
f"```\n{format_for_user(e)}\n```"
|
||||
parse_errors.append(f"Step {i + 1}: {e}")
|
||||
continue
|
||||
try:
|
||||
steps_list.append(Step(
|
||||
tool=str(tool),
|
||||
options=opts,
|
||||
enabled=bool(row.get("enabled", True)),
|
||||
))
|
||||
except Exception as e:
|
||||
parse_errors.append(f"Step {i + 1}: {e}")
|
||||
|
||||
if parse_errors:
|
||||
for err in parse_errors:
|
||||
st.error(err)
|
||||
|
||||
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
||||
|
||||
if current_pipeline is not None:
|
||||
warnings = validate_pipeline(current_pipeline)
|
||||
if warnings:
|
||||
st.warning(
|
||||
"Pipeline is out of recommended order:\n\n"
|
||||
+ "\n".join(f"- {w}" for w in warnings)
|
||||
+ "\n\nThe pipeline will still run — these are recommendations only."
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pipeline steps (checklist)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Pipeline Steps")
|
||||
st.caption("Select tools to include in the pipeline (recommended order):")
|
||||
|
||||
st.checkbox("1. Text Cleaner", value=True, disabled=True)
|
||||
st.checkbox("2. Format Standardizer", value=True, disabled=True)
|
||||
st.checkbox("3. Missing Value Handler", value=True, disabled=True)
|
||||
st.checkbox("4. Column Mapper", value=False, disabled=True)
|
||||
st.checkbox("5. Outlier Detector", value=False, disabled=True)
|
||||
st.checkbox("6. Deduplicator", value=True, disabled=True)
|
||||
st.checkbox("7. Multi-File Merger", value=False, disabled=True)
|
||||
st.checkbox("8. Validator & Reporter", value=True, disabled=True)
|
||||
|
||||
st.subheader("Pipeline Configuration")
|
||||
|
||||
st.selectbox("On error", ["Stop pipeline", "Skip step and continue", "Prompt for decision"], disabled=True)
|
||||
st.checkbox("Generate combined report at end", value=True, disabled=True)
|
||||
with st.expander("Recommended tool order — why each step belongs where it does"):
|
||||
st.markdown(
|
||||
"\n".join(
|
||||
f"- **{e}** before **{l}** — {why}"
|
||||
for e, l, why in SOFT_DEPENDENCIES
|
||||
)
|
||||
)
|
||||
|
||||
st.divider()
|
||||
st.button("Run Pipeline", type="primary", use_container_width=True, disabled=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Footer
|
||||
# Run
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
run_disabled = current_pipeline is None or not current_pipeline.steps
|
||||
|
||||
if st.button(
|
||||
"Run Pipeline",
|
||||
type="primary",
|
||||
use_container_width=True,
|
||||
disabled=run_disabled,
|
||||
):
|
||||
progress = st.progress(0.0, text="Starting...")
|
||||
log_box = st.empty()
|
||||
log_lines: list[str] = []
|
||||
total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
|
||||
completed = [0]
|
||||
|
||||
def _on_step(sr) -> None:
    """Per-step callback handed to ``run_pipeline`` as ``on_step_complete``.

    Appends one line for the step to the on-page log and advances the
    progress bar.

    Args:
        sr: The step result. This function reads ``sr.skipped``, ``sr.error``,
            ``sr.elapsed_seconds`` and ``sr.step.display_name()``.
    """
    label = sr.step.display_name()
    if sr.skipped:
        # Skipped steps are logged but do not advance the counter: the
        # denominator (total_enabled) only counts enabled steps, so counting
        # skipped ones too could push the fraction above 1.0.
        log_lines.append(f"○ {label} (skipped)")
    else:
        completed[0] += 1
        if sr.error:
            # Only the first line of the error is shown in the compact log.
            log_lines.append(f"✗ {label} — {sr.error.splitlines()[0]}")
        else:
            log_lines.append(f"✓ {label} — {sr.elapsed_seconds*1000:.0f} ms")

    log_box.markdown("\n".join(log_lines))

    # st.progress only accepts floats in [0.0, 1.0]; clamp as a guard in case
    # more steps report completion than were counted as enabled.
    fraction = min(completed[0] / max(total_enabled, 1), 1.0)
    progress.progress(
        fraction,
        text=f"Step {completed[0]}/{total_enabled}",
    )
|
||||
|
||||
try:
|
||||
result = run_pipeline(
|
||||
df, current_pipeline,
|
||||
on_step_complete=_on_step,
|
||||
stop_on_error=False,
|
||||
)
|
||||
except Exception as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
|
||||
st.stop()
|
||||
|
||||
progress.progress(1.0, text="Done")
|
||||
st.session_state["pipeline_result"] = result
|
||||
st.session_state["pipeline_input_name"] = uploaded.name
|
||||
|
||||
result = st.session_state.get("pipeline_result")
|
||||
if result is None:
|
||||
st.info(
|
||||
"Configure the pipeline above and click **Run Pipeline** to "
|
||||
"execute it on your file."
|
||||
)
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Results")
|
||||
|
||||
m1, m2, m3, m4 = st.columns(4)
|
||||
m1.metric("Initial rows", result.initial_rows)
|
||||
m2.metric("Final rows", result.final_rows)
|
||||
m3.metric("Steps run", sum(1 for s in result.step_results if not s.skipped))
|
||||
m4.metric("Elapsed", f"{result.total_elapsed:.2f} s")
|
||||
|
||||
st.markdown("**Per-step summary**")
|
||||
step_df = pd.DataFrame([
|
||||
{
|
||||
"step": sr.step.display_name(),
|
||||
"status": (
|
||||
"skipped" if sr.skipped
|
||||
else "error" if sr.error
|
||||
else "ok"
|
||||
),
|
||||
"elapsed_ms": int(sr.elapsed_seconds * 1000),
|
||||
"summary": json.dumps(sr.summary, default=str)[:200],
|
||||
"error": sr.error or "",
|
||||
}
|
||||
for sr in result.step_results
|
||||
])
|
||||
st.dataframe(step_df, use_container_width=True, hide_index=True)
|
||||
|
||||
st.markdown("**Output preview (first 10 rows)**")
|
||||
st.dataframe(result.final_df.head(10), use_container_width=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
st.caption(
|
||||
"Runs locally. Your data never leaves this computer. "
|
||||
"| DataTools v3.0"
|
||||
)
|
||||
stem = Path(st.session_state.get("pipeline_input_name", "input")).stem
|
||||
|
||||
dl_a, dl_b, dl_c = st.columns(3)
|
||||
with dl_a:
|
||||
bytes_csv = result.final_df.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download cleaned CSV",
|
||||
data=bytes_csv,
|
||||
file_name=f"{stem}_pipeline.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_b:
|
||||
pipeline_bytes = json.dumps(
|
||||
current_pipeline.to_dict() if current_pipeline else {"steps": []},
|
||||
indent=2, default=str,
|
||||
).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download pipeline JSON",
|
||||
data=pipeline_bytes,
|
||||
file_name="pipeline.json",
|
||||
mime="application/json",
|
||||
help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
|
||||
)
|
||||
with dl_c:
|
||||
audit_bytes = json.dumps({
|
||||
"warnings": result.warnings,
|
||||
"initial_rows": result.initial_rows,
|
||||
"final_rows": result.final_rows,
|
||||
"total_elapsed_seconds": result.total_elapsed,
|
||||
"steps": [
|
||||
{
|
||||
"tool": sr.step.tool,
|
||||
"name": sr.step.display_name(),
|
||||
"enabled": sr.step.enabled,
|
||||
"skipped": sr.skipped,
|
||||
"elapsed_seconds": sr.elapsed_seconds,
|
||||
"summary": sr.summary,
|
||||
"error": sr.error,
|
||||
}
|
||||
for sr in result.step_results
|
||||
],
|
||||
}, indent=2, default=str).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download run audit",
|
||||
data=audit_bytes,
|
||||
file_name=f"{stem}_pipeline_audit.json",
|
||||
mime="application/json",
|
||||
)
|
||||
|
||||
st.divider()
|
||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||
|
||||
Reference in New Issue
Block a user