feat(pipeline): visual module-card builder for Automated Workflows
Replaces the raw options_json data-editor table with a per-step "module card" builder matching the locked design mockup (layout-review/09_pipeline_runner.html): each step shows a friendly name + caption, an enable toggle, ▲/▼/✕ reorder/remove controls, and a Configure expander that renders that tool's own controls in plain language. Raw JSON is demoted to an Advanced import/export section. New src/gui/components/pipeline_modules.py holds the adapter-key→tool_id friendly-name bridge, one plain-language config renderer per tool (text_clean, format_standardize, missing, column_map, dedup — emitting the exact JSON option shapes the core adapters accept), and render_step_card. Steps live in session state as an ordered list with stable ids so widget keys survive reorder/remove. Reorder is ▲/▼ buttons (no JS drag dependency). The on-disk/CLI pipeline JSON format is unchanged — CLI and src/core untouched. Adds tests/gui/test_pipeline_builder.py (AppTest) covering seed, configure panels, toggle/add/remove, and a full run. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
376
src/gui/components/pipeline_modules.py
Normal file
376
src/gui/components/pipeline_modules.py
Normal file
@@ -0,0 +1,376 @@
|
|||||||
|
"""Visual pipeline builder — per-step "module" cards + plain-language config panels.
|
||||||
|
|
||||||
|
The Automated Workflows page (``9_Pipeline_Runner.py``) used to configure each
|
||||||
|
step through a raw ``options_json`` text column. This module replaces that with
|
||||||
|
one **module card** per step: a friendly name + caption, an enable toggle,
|
||||||
|
reorder/remove controls, and a **Configure** expander that renders that tool's
|
||||||
|
own controls in plain language (no JSON). Raw JSON survives only as the page's
|
||||||
|
Advanced import/export surface.
|
||||||
|
|
||||||
|
Each config renderer takes the step's current ``options`` dict, renders the
|
||||||
|
curated controls from the design mockup (``layout-review/09_pipeline_runner.html``),
|
||||||
|
and returns an updated **JSON-serialisable** options dict — the same shape the
|
||||||
|
``TOOL_ADAPTERS`` in ``src/core/pipeline.py`` consume via ``Options.from_dict``.
|
||||||
|
|
||||||
|
Two hard Streamlit constraints shaped this:
|
||||||
|
* No nested expanders — the per-step Configure expander means config renderers
|
||||||
|
here must NOT open their own expander, and the page must not wrap the card
|
||||||
|
stack in an outer expander.
|
||||||
|
* Widget identity must be stable across reorder/remove — every widget key is
|
||||||
|
derived from a step's stable ``id``, never its list position.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from src.gui.tools_registry import tool_name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Adapter-key → registry tool_id bridge
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Pipeline steps are keyed by adapter name (``text_clean``); the tools registry
|
||||||
|
# and i18n packs are keyed by tool_id (``02_text_cleaner``). The registry has no
|
||||||
|
# reverse lookup, so we keep the bridge here. ``step_label`` resolves the
|
||||||
|
# localized friendly name; ``step_caption`` returns a short, plain-English "what
|
||||||
|
# this step does" line for the card body.
|
||||||
|
|
||||||
|
PIPELINE_TOOL_META: dict[str, str] = {
|
||||||
|
"text_clean": "02_text_cleaner",
|
||||||
|
"format_standardize": "03_format_standardizer",
|
||||||
|
"missing": "04_missing_handler",
|
||||||
|
"column_map": "05_column_mapper",
|
||||||
|
"dedup": "01_deduplicator",
|
||||||
|
}
|
||||||
|
|
||||||
|
_STEP_CAPTIONS: dict[str, str] = {
|
||||||
|
"text_clean": "Trim spaces, collapse repeats, strip invisible characters.",
|
||||||
|
"format_standardize": "Canonicalize phones, dates, currency, names per column.",
|
||||||
|
"missing": "Flag, fill, or drop blank cells (and disguised blanks).",
|
||||||
|
"column_map": "Rename source columns onto your target column names.",
|
||||||
|
"dedup": "Find duplicate rows and keep one survivor per group.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def step_label(tool: str) -> str:
|
||||||
|
"""Friendly, localized name for a pipeline adapter key (falls back to the key)."""
|
||||||
|
tool_id = PIPELINE_TOOL_META.get(tool)
|
||||||
|
return tool_name(tool_id) if tool_id else tool
|
||||||
|
|
||||||
|
|
||||||
|
def step_caption(tool: str) -> str:
|
||||||
|
return _STEP_CAPTIONS.get(tool, "")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-tool config renderers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Uniform signature: ``render_<tool>_config(df, options, kp) -> options``.
|
||||||
|
# * ``df`` — the uploaded DataFrame (for column lists / type hints).
|
||||||
|
# * ``options`` — the step's current options dict (seed widget defaults).
|
||||||
|
# * ``kp`` — key prefix, unique per step (``f"{tool}_{id}"``).
|
||||||
|
# Returns a JSON-serialisable options dict. Renderers must not open expanders.
|
||||||
|
|
||||||
|
|
||||||
|
_CASE_LABELS: list[tuple[str, Optional[str]]] = [
|
||||||
|
("Leave as-is", None),
|
||||||
|
("UPPERCASE", "upper"),
|
||||||
|
("lowercase", "lower"),
|
||||||
|
("Title Case", "title"),
|
||||||
|
("Sentence case", "sentence"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def render_text_clean_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
|
||||||
|
trim = st.checkbox(
|
||||||
|
"Trim leading & trailing whitespace",
|
||||||
|
value=bool(options.get("trim", True)), key=f"{kp}_trim",
|
||||||
|
)
|
||||||
|
collapse = st.checkbox(
|
||||||
|
"Collapse repeated spaces to one",
|
||||||
|
value=bool(options.get("collapse_whitespace", True)), key=f"{kp}_collapse",
|
||||||
|
)
|
||||||
|
fold = st.checkbox(
|
||||||
|
"Normalize smart quotes & dashes to plain ASCII",
|
||||||
|
value=bool(options.get("fold_smart_chars", True)), key=f"{kp}_fold",
|
||||||
|
)
|
||||||
|
strip_zw = st.checkbox(
|
||||||
|
"Strip zero-width / invisible characters",
|
||||||
|
value=bool(options.get("strip_zero_width", True)), key=f"{kp}_zw",
|
||||||
|
)
|
||||||
|
|
||||||
|
cur_case = options.get("case")
|
||||||
|
case_idx = next((i for i, (_, v) in enumerate(_CASE_LABELS) if v == cur_case), 0)
|
||||||
|
case_choice = st.selectbox(
|
||||||
|
"Letter case",
|
||||||
|
[lbl for lbl, _ in _CASE_LABELS],
|
||||||
|
index=case_idx, key=f"{kp}_case",
|
||||||
|
)
|
||||||
|
case_val = next(v for lbl, v in _CASE_LABELS if lbl == case_choice)
|
||||||
|
|
||||||
|
out: dict[str, Any] = {
|
||||||
|
"trim": trim,
|
||||||
|
"collapse_whitespace": collapse,
|
||||||
|
"fold_smart_chars": fold,
|
||||||
|
"strip_zero_width": strip_zw,
|
||||||
|
}
|
||||||
|
if case_val is not None:
|
||||||
|
out["case"] = case_val
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
_FORMAT_LABELS: list[tuple[str, Optional[str]]] = [
|
||||||
|
("Leave as-is", None),
|
||||||
|
("Date", "date"),
|
||||||
|
("Phone number", "phone"),
|
||||||
|
("Currency", "currency"),
|
||||||
|
("Name", "name"),
|
||||||
|
("Address", "address"),
|
||||||
|
("Email", "email"),
|
||||||
|
("Boolean (yes/no)", "boolean"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def render_format_standardize_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
|
||||||
|
st.caption(
|
||||||
|
"Pick a target format for each column. Columns left as “Leave as-is” "
|
||||||
|
"are untouched."
|
||||||
|
)
|
||||||
|
current = dict(options.get("column_types", {}))
|
||||||
|
labels = [lbl for lbl, _ in _FORMAT_LABELS]
|
||||||
|
column_types: dict[str, str] = {}
|
||||||
|
for col in df.columns:
|
||||||
|
cur_val = current.get(col)
|
||||||
|
idx = next((i for i, (_, v) in enumerate(_FORMAT_LABELS) if v == cur_val), 0)
|
||||||
|
choice = st.selectbox(
|
||||||
|
str(col), labels, index=idx, key=f"{kp}_fmt__{col}",
|
||||||
|
)
|
||||||
|
val = next(v for lbl, v in _FORMAT_LABELS if lbl == choice)
|
||||||
|
if val is not None:
|
||||||
|
column_types[str(col)] = val
|
||||||
|
return {"column_types": column_types}
|
||||||
|
|
||||||
|
|
||||||
|
# Plain-language blank-handling choices → core strategy values. "fill" is a UI
|
||||||
|
# token expanded to numeric median + categorical mode (MissingOptions handles
|
||||||
|
# the per-dtype split via ``categorical_strategy``).
|
||||||
|
_MISSING_CHOICES: list[tuple[str, str]] = [
|
||||||
|
("Flag them (mark blanks, change nothing)", "flag"),
|
||||||
|
("Fill them in (numbers → median, text → most common)", "fill"),
|
||||||
|
("Drop rows that have any blank", "drop"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _missing_mode_from_strategy(strategy: Optional[str]) -> str:
|
||||||
|
if strategy in ("drop_row", "drop_col", "drop_both"):
|
||||||
|
return "drop"
|
||||||
|
if strategy in ("mean", "median", "mode", "constant", "ffill", "bfill", "interpolate"):
|
||||||
|
return "fill"
|
||||||
|
return "flag"
|
||||||
|
|
||||||
|
|
||||||
|
def render_missing_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
|
||||||
|
from src.core.missing import DEFAULT_SENTINELS
|
||||||
|
|
||||||
|
cur_mode = _missing_mode_from_strategy(options.get("strategy"))
|
||||||
|
mode_idx = next((i for i, (_, v) in enumerate(_MISSING_CHOICES) if v == cur_mode), 0)
|
||||||
|
mode_choice = st.radio(
|
||||||
|
"What should happen to blank cells?",
|
||||||
|
[lbl for lbl, _ in _MISSING_CHOICES],
|
||||||
|
index=mode_idx, key=f"{kp}_strategy",
|
||||||
|
)
|
||||||
|
mode = next(v for lbl, v in _MISSING_CHOICES if lbl == mode_choice)
|
||||||
|
|
||||||
|
seed_sentinels = options.get("sentinels") or list(DEFAULT_SENTINELS)
|
||||||
|
sent_text = st.text_input(
|
||||||
|
"Treat these as blank (comma-separated)",
|
||||||
|
value=", ".join(seed_sentinels), key=f"{kp}_sentinels",
|
||||||
|
help="Matched case-insensitively after stripping whitespace.",
|
||||||
|
)
|
||||||
|
sentinels = [s.strip() for s in sent_text.split(",") if s.strip()]
|
||||||
|
|
||||||
|
out: dict[str, Any] = {
|
||||||
|
"standardize_sentinels": True,
|
||||||
|
"sentinels": sentinels,
|
||||||
|
}
|
||||||
|
if mode == "flag":
|
||||||
|
out["strategy"] = "none"
|
||||||
|
elif mode == "fill":
|
||||||
|
out["strategy"] = "median"
|
||||||
|
out["categorical_strategy"] = "mode"
|
||||||
|
else: # drop
|
||||||
|
out["strategy"] = "drop_row"
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
_UNMAPPED_CHOICES = ["keep", "drop", "error"]
|
||||||
|
|
||||||
|
|
||||||
|
def render_column_map_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
|
||||||
|
st.caption(
|
||||||
|
"Type the target name each source column should become. Leave a target "
|
||||||
|
"blank to keep that column's name unchanged."
|
||||||
|
)
|
||||||
|
current = dict(options.get("mapping", {}))
|
||||||
|
table = pd.DataFrame(
|
||||||
|
{
|
||||||
|
"source": [str(c) for c in df.columns],
|
||||||
|
"target": [current.get(str(c), "") for c in df.columns],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
edited = st.data_editor(
|
||||||
|
table,
|
||||||
|
width="stretch",
|
||||||
|
hide_index=True,
|
||||||
|
disabled=["source"],
|
||||||
|
column_config={
|
||||||
|
"source": st.column_config.TextColumn("Source column"),
|
||||||
|
"target": st.column_config.TextColumn("Rename to"),
|
||||||
|
},
|
||||||
|
key=f"{kp}_mapping",
|
||||||
|
)
|
||||||
|
mapping = {
|
||||||
|
str(r["source"]): str(r["target"]).strip()
|
||||||
|
for _, r in edited.iterrows()
|
||||||
|
if str(r.get("target") or "").strip()
|
||||||
|
}
|
||||||
|
|
||||||
|
c1, c2 = st.columns(2)
|
||||||
|
with c1:
|
||||||
|
unmapped = st.selectbox(
|
||||||
|
"Columns with no rename",
|
||||||
|
_UNMAPPED_CHOICES,
|
||||||
|
index=_UNMAPPED_CHOICES.index(options.get("unmapped", "keep"))
|
||||||
|
if options.get("unmapped") in _UNMAPPED_CHOICES else 0,
|
||||||
|
key=f"{kp}_unmapped",
|
||||||
|
help="keep: leave them in place · drop: remove them · error: stop the run.",
|
||||||
|
)
|
||||||
|
with c2:
|
||||||
|
coerce = st.checkbox(
|
||||||
|
"Coerce values to target types",
|
||||||
|
value=bool(options.get("coerce_types", False)), key=f"{kp}_coerce",
|
||||||
|
)
|
||||||
|
return {"mapping": mapping, "unmapped": unmapped, "coerce_types": coerce}
|
||||||
|
|
||||||
|
|
||||||
|
_SURVIVOR_LABELS: list[tuple[str, str]] = [
|
||||||
|
("Keep the most complete row", "most_complete"),
|
||||||
|
("Keep the first seen", "first"),
|
||||||
|
("Keep the last seen", "last"),
|
||||||
|
("Keep the most recent (by date)", "most_recent"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def render_dedup_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
|
||||||
|
cur_rule = options.get("survivor_rule", "first")
|
||||||
|
rule_idx = next((i for i, (_, v) in enumerate(_SURVIVOR_LABELS) if v == cur_rule), 0)
|
||||||
|
rule_choice = st.selectbox(
|
||||||
|
"When rows match, which one survives?",
|
||||||
|
[lbl for lbl, _ in _SURVIVOR_LABELS],
|
||||||
|
index=rule_idx, key=f"{kp}_survivor",
|
||||||
|
)
|
||||||
|
survivor_rule = next(v for lbl, v in _SURVIVOR_LABELS if lbl == rule_choice)
|
||||||
|
|
||||||
|
merge = st.checkbox(
|
||||||
|
"Merge matched rows (fill each survivor's blanks from its duplicates)",
|
||||||
|
value=bool(options.get("merge", False)), key=f"{kp}_merge",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Recover the previously-selected match columns from the stored strategies
|
||||||
|
# (a single exact-match strategy over the chosen columns).
|
||||||
|
prev_cols: list[str] = []
|
||||||
|
for strat in options.get("strategies", []) or []:
|
||||||
|
for c in strat.get("columns", []):
|
||||||
|
if c.get("column"):
|
||||||
|
prev_cols.append(c["column"])
|
||||||
|
all_cols = [str(c) for c in df.columns]
|
||||||
|
match_cols = st.multiselect(
|
||||||
|
"Match on these columns",
|
||||||
|
all_cols,
|
||||||
|
default=[c for c in prev_cols if c in all_cols],
|
||||||
|
key=f"{kp}_matchcols",
|
||||||
|
help="Rows are duplicates when these columns all match. Leave empty to auto-detect.",
|
||||||
|
)
|
||||||
|
|
||||||
|
out: dict[str, Any] = {"survivor_rule": survivor_rule, "merge": merge}
|
||||||
|
if match_cols:
|
||||||
|
out["strategies"] = [
|
||||||
|
{"columns": [
|
||||||
|
{"column": c, "algorithm": "exact", "threshold": 100}
|
||||||
|
for c in match_cols
|
||||||
|
]}
|
||||||
|
]
|
||||||
|
if survivor_rule == "most_recent":
|
||||||
|
date_default = options.get("date_column")
|
||||||
|
date_idx = all_cols.index(date_default) if date_default in all_cols else 0
|
||||||
|
out["date_column"] = st.selectbox(
|
||||||
|
"Date column (for most-recent)",
|
||||||
|
all_cols, index=date_idx, key=f"{kp}_datecol",
|
||||||
|
) if all_cols else None
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
CONFIG_RENDERERS: dict[str, Callable[[pd.DataFrame, dict, str], dict]] = {
|
||||||
|
"text_clean": render_text_clean_config,
|
||||||
|
"format_standardize": render_format_standardize_config,
|
||||||
|
"missing": render_missing_config,
|
||||||
|
"column_map": render_column_map_config,
|
||||||
|
"dedup": render_dedup_config,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module card
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def render_step_card(
|
||||||
|
df: pd.DataFrame, step: dict, idx: int, total: int,
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Render one pipeline step as a module card.
|
||||||
|
|
||||||
|
Mutates ``step`` in place (``enabled`` toggle, ``options`` from the Configure
|
||||||
|
panel). Returns an action string (``"up"`` / ``"down"`` / ``"remove"``) when
|
||||||
|
the user clicks a reorder/remove control, else ``None`` — the caller applies
|
||||||
|
the action to the step list and reruns.
|
||||||
|
"""
|
||||||
|
sid = step["id"]
|
||||||
|
kp = f"{step['tool']}_{sid}"
|
||||||
|
action: Optional[str] = None
|
||||||
|
|
||||||
|
with st.container(border=True):
|
||||||
|
head, toggle, up, down, rm = st.columns([0.66, 0.12, 0.07, 0.07, 0.08])
|
||||||
|
with head:
|
||||||
|
st.markdown(f"**{idx + 1}. {step_label(step['tool'])}**")
|
||||||
|
st.caption(step_caption(step["tool"]))
|
||||||
|
with toggle:
|
||||||
|
step["enabled"] = st.toggle(
|
||||||
|
"On", value=step.get("enabled", True), key=f"{kp}_enabled",
|
||||||
|
help="Disabled steps are kept in the pipeline but skipped at run time.",
|
||||||
|
)
|
||||||
|
with up:
|
||||||
|
if st.button("▲", key=f"{kp}_up", disabled=idx == 0,
|
||||||
|
help="Move up", width="stretch"):
|
||||||
|
action = "up"
|
||||||
|
with down:
|
||||||
|
if st.button("▼", key=f"{kp}_down", disabled=idx == total - 1,
|
||||||
|
help="Move down", width="stretch"):
|
||||||
|
action = "down"
|
||||||
|
with rm:
|
||||||
|
if st.button("✕", key=f"{kp}_rm", help="Remove step", width="stretch"):
|
||||||
|
action = "remove"
|
||||||
|
|
||||||
|
renderer = CONFIG_RENDERERS.get(step["tool"])
|
||||||
|
with st.expander(f"Configure: {step_label(step['tool'])}"):
|
||||||
|
if renderer is None:
|
||||||
|
st.caption("This step has no options.")
|
||||||
|
else:
|
||||||
|
step["options"] = renderer(df, step.get("options", {}) or {}, kp)
|
||||||
|
|
||||||
|
return action
|
||||||
@@ -32,6 +32,7 @@ from src.core.pipeline import (
|
|||||||
run_pipeline,
|
run_pipeline,
|
||||||
validate_pipeline,
|
validate_pipeline,
|
||||||
)
|
)
|
||||||
|
from src.gui.components.pipeline_modules import render_step_card, step_label
|
||||||
from src.license import FeatureFlag
|
from src.license import FeatureFlag
|
||||||
|
|
||||||
hide_streamlit_chrome()
|
hide_streamlit_chrome()
|
||||||
@@ -104,120 +105,148 @@ st.divider()
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pipeline builder
|
# Pipeline builder — visual module cards
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
# Wrapped in an outer expander whose default state mirrors the preview
|
# Each step is a "module" card (src/gui/components/pipeline_modules.py) with a
|
||||||
# expander above: open before a result exists, folded once the user has
|
# plain-language Configure panel — no raw JSON. Steps live in session state as
|
||||||
# clicked Run Pipeline. The pipeline editor is this page's "Options"
|
# an ordered list of dicts, each carrying a STABLE integer id so widget keys
|
||||||
# section — structurally analogous to Text Cleaner's options block.
|
# survive reorder/remove. Raw JSON is import/export only, under Advanced.
|
||||||
|
#
|
||||||
|
# NB: the builder is NOT wrapped in an outer expander — per-step Configure
|
||||||
|
# panels are expanders, and Streamlit forbids nesting expanders.
|
||||||
|
|
||||||
with st.expander("Options", expanded=not _has_result):
|
|
||||||
mode = st.radio(
|
def _seed_steps_from(pipeline) -> None:
|
||||||
|
"""Replace the session step list from a Pipeline, assigning fresh ids."""
|
||||||
|
seq = st.session_state.get("pipeline_step_seq", 0)
|
||||||
|
steps: list[dict] = []
|
||||||
|
for s in pipeline.steps:
|
||||||
|
steps.append({
|
||||||
|
"id": seq, "tool": s.tool,
|
||||||
|
"enabled": s.enabled, "options": dict(s.options),
|
||||||
|
})
|
||||||
|
seq += 1
|
||||||
|
st.session_state["pipeline_steps"] = steps
|
||||||
|
st.session_state["pipeline_step_seq"] = seq
|
||||||
|
|
||||||
|
|
||||||
|
if "pipeline_steps" not in st.session_state:
|
||||||
|
_seed_steps_from(recommended_pipeline())
|
||||||
|
|
||||||
|
st.subheader("Build your pipeline")
|
||||||
|
|
||||||
|
mode = st.radio(
|
||||||
"How would you like to define the pipeline?",
|
"How would you like to define the pipeline?",
|
||||||
[
|
[
|
||||||
"Use the recommended default (text-clean → format → missing → dedup)",
|
"Use the recommended default (Clean Text → Standardize → Fix Missing → Find Duplicates)",
|
||||||
"Build interactively",
|
"Build interactively",
|
||||||
"Import a saved pipeline JSON",
|
"Import a saved pipeline JSON",
|
||||||
],
|
],
|
||||||
index=0,
|
index=0,
|
||||||
|
key="pipeline_mode",
|
||||||
|
)
|
||||||
|
|
||||||
|
if mode.startswith("Use the recommended"):
|
||||||
|
# Only reseed on an explicit click that lands here while the steps already
|
||||||
|
# diverge — otherwise every rerun would wipe edits. We detect "user just
|
||||||
|
# selected this mode" by comparing against the recommended default and
|
||||||
|
# offering a one-click restore rather than silently discarding.
|
||||||
|
rec_dict = recommended_pipeline().to_dict()
|
||||||
|
cur_dict = {
|
||||||
|
"steps": [
|
||||||
|
{"tool": s["tool"], "options": s["options"],
|
||||||
|
"enabled": s["enabled"], "name": None}
|
||||||
|
for s in st.session_state["pipeline_steps"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
if cur_dict != rec_dict:
|
||||||
|
st.info(
|
||||||
|
"You've edited the recommended steps, so they're now yours to "
|
||||||
|
"change — you're effectively in **Build interactively** mode. "
|
||||||
|
"Restore the suggested steps to discard your edits."
|
||||||
)
|
)
|
||||||
|
if st.button("↺ Restore recommended steps"):
|
||||||
if "pipeline_rows" not in st.session_state:
|
_seed_steps_from(recommended_pipeline())
|
||||||
default = recommended_pipeline()
|
st.rerun()
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
elif mode.startswith("Import"):
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
|
|
||||||
if mode.startswith("Use the recommended"):
|
|
||||||
default = recommended_pipeline()
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
elif mode.startswith("Import"):
|
|
||||||
pipeline_file = st.file_uploader(
|
pipeline_file = st.file_uploader(
|
||||||
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
||||||
)
|
)
|
||||||
if pipeline_file is not None:
|
if pipeline_file is not None:
|
||||||
try:
|
try:
|
||||||
data = json.loads(pipeline_file.getvalue())
|
data = json.loads(pipeline_file.getvalue())
|
||||||
uploaded_pipe = Pipeline.from_dict(data)
|
_seed_steps_from(Pipeline.from_dict(data))
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
st.success(
|
||||||
{
|
f"Loaded {len(st.session_state['pipeline_steps'])} step(s). "
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
"Switch to **Build interactively** to tweak them."
|
||||||
"options_json": json.dumps(s.options),
|
)
|
||||||
}
|
|
||||||
for s in uploaded_pipe.steps
|
|
||||||
])
|
|
||||||
st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
from src.core.errors import format_for_user
|
from src.core.errors import format_for_user
|
||||||
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
||||||
|
|
||||||
st.caption(
|
st.caption(
|
||||||
"Edit the table to add, remove, reorder (drag the row index), enable, "
|
"Each step is a module: toggle it on/off, reorder with ▲ ▼, remove with ✕, "
|
||||||
"or configure each step. Tool order is recommended, not enforced — "
|
"and open **Configure** to set its options in plain language. Tool order is "
|
||||||
"violations surface as warnings below the table."
|
"recommended, not enforced — violations surface as warnings below."
|
||||||
)
|
)
|
||||||
edited = st.data_editor(
|
|
||||||
st.session_state["pipeline_rows"],
|
|
||||||
width="stretch",
|
|
||||||
num_rows="dynamic",
|
|
||||||
column_config={
|
|
||||||
"tool": st.column_config.SelectboxColumn(
|
|
||||||
"Tool", options=TOOL_NAMES, required=True,
|
|
||||||
),
|
|
||||||
"enabled": st.column_config.CheckboxColumn("Enabled"),
|
|
||||||
"options_json": st.column_config.TextColumn(
|
|
||||||
"Options (JSON)",
|
|
||||||
help='e.g. {"column_types": {"phone": "phone"}}',
|
|
||||||
),
|
|
||||||
},
|
|
||||||
key="pipeline_editor",
|
|
||||||
)
|
|
||||||
st.session_state["pipeline_rows"] = edited
|
|
||||||
|
|
||||||
# Build a Pipeline object from the editor state.
|
# Render the module stack. A reorder/remove action mutates the list and reruns.
|
||||||
steps_list: list[Step] = []
|
steps = st.session_state["pipeline_steps"]
|
||||||
parse_errors: list[str] = []
|
total = len(steps)
|
||||||
for i, row in edited.iterrows():
|
pending_action: tuple[str, int] | None = None
|
||||||
tool = row.get("tool")
|
for i, step in enumerate(steps):
|
||||||
if not tool or pd.isna(tool):
|
act = render_step_card(df, step, i, total)
|
||||||
continue
|
if act is not None:
|
||||||
raw_opts = row.get("options_json") or "{}"
|
pending_action = (act, i)
|
||||||
if pd.isna(raw_opts):
|
|
||||||
raw_opts = "{}"
|
if pending_action is not None:
|
||||||
try:
|
act, i = pending_action
|
||||||
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
|
if act == "remove":
|
||||||
if not isinstance(opts, dict):
|
steps.pop(i)
|
||||||
raise ValueError("options must be a JSON object")
|
elif act == "up" and i > 0:
|
||||||
except Exception as e:
|
steps[i - 1], steps[i] = steps[i], steps[i - 1]
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
elif act == "down" and i < total - 1:
|
||||||
continue
|
steps[i + 1], steps[i] = steps[i], steps[i + 1]
|
||||||
|
st.session_state["pipeline_steps"] = steps
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Add-step control.
|
||||||
|
add_col, btn_col = st.columns([0.7, 0.3])
|
||||||
|
with add_col:
|
||||||
|
add_tool = st.selectbox(
|
||||||
|
"Add a step",
|
||||||
|
TOOL_NAMES,
|
||||||
|
format_func=step_label,
|
||||||
|
key="pipeline_add_tool",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
with btn_col:
|
||||||
|
if st.button("➕ Add step", width="stretch"):
|
||||||
|
seq = st.session_state.get("pipeline_step_seq", 0)
|
||||||
|
steps.append({"id": seq, "tool": add_tool, "enabled": True, "options": {}})
|
||||||
|
st.session_state["pipeline_step_seq"] = seq + 1
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Build a Pipeline object from the step list.
|
||||||
|
steps_list: list[Step] = []
|
||||||
|
parse_errors: list[str] = []
|
||||||
|
for i, step in enumerate(steps):
|
||||||
try:
|
try:
|
||||||
steps_list.append(Step(
|
steps_list.append(Step(
|
||||||
tool=str(tool),
|
tool=str(step["tool"]),
|
||||||
options=opts,
|
options=dict(step.get("options") or {}),
|
||||||
enabled=bool(row.get("enabled", True)),
|
enabled=bool(step.get("enabled", True)),
|
||||||
))
|
))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
parse_errors.append(f"Step {i + 1} ({step.get('tool')}): {e}")
|
||||||
|
|
||||||
if parse_errors:
|
for err in parse_errors:
|
||||||
for err in parse_errors:
|
|
||||||
st.error(err)
|
st.error(err)
|
||||||
|
|
||||||
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
||||||
|
|
||||||
if current_pipeline is not None:
|
if current_pipeline is not None:
|
||||||
warnings = validate_pipeline(current_pipeline)
|
warnings = validate_pipeline(current_pipeline)
|
||||||
if warnings:
|
if warnings:
|
||||||
st.warning(
|
st.warning(
|
||||||
@@ -226,14 +255,37 @@ with st.expander("Options", expanded=not _has_result):
|
|||||||
+ "\n\nThe pipeline will still run — these are recommendations only."
|
+ "\n\nThe pipeline will still run — these are recommendations only."
|
||||||
)
|
)
|
||||||
|
|
||||||
with st.expander("Recommended tool order — why each step belongs where it does"):
|
with st.expander("Recommended tool order — why each step belongs where it does"):
|
||||||
st.markdown(
|
st.markdown(
|
||||||
"\n".join(
|
"\n".join(
|
||||||
f"- **{e}** before **{l}** — {why}"
|
f"- **{step_label(e)}** before **{step_label(l)}** — {why}"
|
||||||
for e, l, why in SOFT_DEPENDENCIES
|
for e, l, why in SOFT_DEPENDENCIES
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with st.expander("Advanced — import / export pipeline as JSON"):
|
||||||
|
st.caption(
|
||||||
|
"For sharing or version control. Editing is done in the step panels "
|
||||||
|
"above — this is just the saved form of the same settings. The same "
|
||||||
|
"JSON runs in the CLI via `--pipeline pipeline.json`."
|
||||||
|
)
|
||||||
|
export_json = json.dumps(
|
||||||
|
current_pipeline.to_dict() if current_pipeline else {"steps": []},
|
||||||
|
indent=2, default=str,
|
||||||
|
)
|
||||||
|
st.code(export_json, language="json")
|
||||||
|
adv_paste = st.text_area(
|
||||||
|
"Paste pipeline JSON to load it", key="pipeline_json_paste", height=140,
|
||||||
|
)
|
||||||
|
if st.button("Load pasted JSON", disabled=not adv_paste.strip()):
|
||||||
|
try:
|
||||||
|
_seed_steps_from(Pipeline.from_dict(json.loads(adv_paste)))
|
||||||
|
st.success("Loaded. Scroll up to see the steps.")
|
||||||
|
st.rerun()
|
||||||
|
except Exception as e:
|
||||||
|
from src.core.errors import format_for_user
|
||||||
|
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -257,14 +309,14 @@ if st.button(
|
|||||||
def _on_step(sr) -> None:
|
def _on_step(sr) -> None:
|
||||||
completed[0] += 1
|
completed[0] += 1
|
||||||
if sr.skipped:
|
if sr.skipped:
|
||||||
log_lines.append(f"○ {sr.step.display_name()} (skipped)")
|
log_lines.append(f"○ {step_label(sr.step.tool)} (skipped)")
|
||||||
elif sr.error:
|
elif sr.error:
|
||||||
log_lines.append(
|
log_lines.append(
|
||||||
f"✗ {sr.step.display_name()} — {sr.error.splitlines()[0]}"
|
f"✗ {step_label(sr.step.tool)} — {sr.error.splitlines()[0]}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
log_lines.append(
|
log_lines.append(
|
||||||
f"✓ {sr.step.display_name()} — {sr.elapsed_seconds*1000:.0f} ms"
|
f"✓ {step_label(sr.step.tool)} — {sr.elapsed_seconds*1000:.0f} ms"
|
||||||
)
|
)
|
||||||
log_box.markdown("\n".join(log_lines))
|
log_box.markdown("\n".join(log_lines))
|
||||||
progress.progress(
|
progress.progress(
|
||||||
@@ -330,11 +382,11 @@ m4.metric("Elapsed", f"{result.total_elapsed:.2f} s")
|
|||||||
st.markdown("**Per-step summary**")
|
st.markdown("**Per-step summary**")
|
||||||
step_df = pd.DataFrame([
|
step_df = pd.DataFrame([
|
||||||
{
|
{
|
||||||
"step": sr.step.display_name(),
|
"step": step_label(sr.step.tool),
|
||||||
"status": (
|
"status": (
|
||||||
"skipped" if sr.skipped
|
"⏭ skipped" if sr.skipped
|
||||||
else "error" if sr.error
|
else "✗ error" if sr.error
|
||||||
else "ok"
|
else "✓ ok"
|
||||||
),
|
),
|
||||||
"elapsed_ms": int(sr.elapsed_seconds * 1000),
|
"elapsed_ms": int(sr.elapsed_seconds * 1000),
|
||||||
"summary": json.dumps(sr.summary, default=str)[:200],
|
"summary": json.dumps(sr.summary, default=str)[:200],
|
||||||
|
|||||||
91
tests/gui/test_pipeline_builder.py
Normal file
91
tests/gui/test_pipeline_builder.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
"""Pipeline Runner — visual module-card builder contract (AppTest).
|
||||||
|
|
||||||
|
Pins the behaviors the JSON-table → module-card rewrite introduced:
|
||||||
|
recommended steps seed as cards with friendly names, each step exposes a
|
||||||
|
plain-language Configure panel (no raw per-row JSON), steps can be toggled /
|
||||||
|
added / removed, JSON lives only under Advanced, and a run produces results
|
||||||
|
with friendly step names. The page's bare initial-render contract across junk
|
||||||
|
files is covered separately in ``tests/test_junk_corpus_tool_pages.py``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from streamlit.testing.v1 import AppTest
|
||||||
|
|
||||||
|
_PAGE = (
|
||||||
|
Path(__file__).resolve().parent.parent.parent
|
||||||
|
/ "src" / "gui" / "pages" / "9_Pipeline_Runner.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
_CSV = (
|
||||||
|
b"name,email,phone,signup_date\n"
|
||||||
|
b" Jane Doe ,jane@acme.io,512-555-0190,2024-01-04\n"
|
||||||
|
b"jane doe,JANE@ACME.IO,(512) 555-0190,01/04/2024\n"
|
||||||
|
b"Bob Smith,bob@globex.com,720.555.7781,2024-02-11\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _app() -> AppTest:
|
||||||
|
at = AppTest.from_file(str(_PAGE), default_timeout=30)
|
||||||
|
at.session_state["home_uploaded_bytes"] = _CSV
|
||||||
|
at.session_state["home_uploaded_name"] = "customers.csv"
|
||||||
|
at.session_state["home_uploaded_size"] = len(_CSV)
|
||||||
|
return at.run()
|
||||||
|
|
||||||
|
|
||||||
|
def test_recommended_steps_seed_as_named_cards():
|
||||||
|
at = _app()
|
||||||
|
assert not at.exception
|
||||||
|
tools = [s["tool"] for s in at.session_state["pipeline_steps"]]
|
||||||
|
assert tools == ["text_clean", "format_standardize", "missing", "dedup"]
|
||||||
|
md = " ".join(m.value for m in at.markdown)
|
||||||
|
for friendly in ("Clean Text", "Standardize Formats",
|
||||||
|
"Fix Missing Values", "Find Duplicates"):
|
||||||
|
assert friendly in md
|
||||||
|
|
||||||
|
|
||||||
|
def test_each_step_has_a_configure_panel_and_json_is_advanced_only():
|
||||||
|
at = _app()
|
||||||
|
labels = [e.label for e in at.get("expander")]
|
||||||
|
assert any(l.startswith("Configure: Clean Text") for l in labels)
|
||||||
|
assert any(l.startswith("Configure: Find Duplicates") for l in labels)
|
||||||
|
# Raw JSON is import/export only — never a per-step editing surface.
|
||||||
|
assert any("Advanced — import / export" in l for l in labels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toggle_disables_step_and_persists():
|
||||||
|
at = _app()
|
||||||
|
at.toggle[0].set_value(False).run()
|
||||||
|
assert at.session_state["pipeline_steps"][0]["enabled"] is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_step_appends_a_working_config_panel():
|
||||||
|
at = _app()
|
||||||
|
[s for s in at.selectbox if s.key == "pipeline_add_tool"][0].set_value("column_map").run()
|
||||||
|
[b for b in at.button if "Add step" in b.label][0].click().run()
|
||||||
|
assert not at.exception
|
||||||
|
assert at.session_state["pipeline_steps"][-1]["tool"] == "column_map"
|
||||||
|
labels = [e.label for e in at.get("expander")]
|
||||||
|
assert any(l.startswith("Configure: Map Columns") for l in labels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_step_drops_it():
|
||||||
|
at = _app()
|
||||||
|
before = len(at.session_state["pipeline_steps"])
|
||||||
|
# The first ✕ remove button in the card stack.
|
||||||
|
[b for b in at.button if b.label == "✕"][0].click().run()
|
||||||
|
assert not at.exception
|
||||||
|
assert len(at.session_state["pipeline_steps"]) == before - 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_produces_results_with_friendly_names():
|
||||||
|
at = _app()
|
||||||
|
[b for b in at.button if b.label == "Run Pipeline"][0].click().run()
|
||||||
|
assert not at.exception, at.exception
|
||||||
|
assert "pipeline_result" in at.session_state
|
||||||
|
res = at.session_state["pipeline_result"]
|
||||||
|
assert res.initial_rows == 3 and res.final_rows == 2 # the two Jane rows merge
|
||||||
|
assert all(sr.error is None for sr in res.step_results)
|
||||||
Reference in New Issue
Block a user