"""DataTools Automated Workflows — Streamlit page.""" from __future__ import annotations import io import json import sys from pathlib import Path import pandas as pd import streamlit as st _project_root = Path(__file__).resolve().parent.parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.gui.components import ( back_to_home_link, render_sticky_footer, render_tool_header, hide_streamlit_chrome, html_download_button, pickup_or_upload, require_feature_or_render_upgrade, ) from src.core.pipeline import ( Pipeline, SOFT_DEPENDENCIES, Step, TOOL_NAMES, recommended_pipeline, run_pipeline, validate_pipeline, ) from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() back_to_home_link() from src.audit import log_page_open log_page_open("9_Pipeline_Runner") require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER) # --------------------------------------------------------------------------- # Header # --------------------------------------------------------------------------- render_tool_header("09_pipeline_runner") # --------------------------------------------------------------------------- # File upload # --------------------------------------------------------------------------- uploaded = pickup_or_upload( label="Import CSV or Excel file", key="pipeline_file_upload", types=["csv", "tsv", "xlsx", "xls"], ) if uploaded is None: st.info("Import a CSV, TSV, or Excel file to begin.") st.stop() @st.cache_data(show_spinner=False) def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: suffix = Path(name).suffix.lower() bio = io.BytesIO(data) if suffix in (".xlsx", ".xls"): return pd.read_excel(bio) for enc in ("utf-8", "utf-8-sig", "latin-1"): try: bio.seek(0) sep = "\t" if suffix == ".tsv" else "," return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn") except UnicodeDecodeError: continue bio.seek(0) return pd.read_csv(bio, encoding="latin-1") try: df = 
_read_uploaded(uploaded.name, uploaded.getvalue()) except Exception as e: from src.core.errors import format_for_user st.error( f"**Could not read `{uploaded.name}`**\n\n" f"```\n{format_for_user(e)}\n```" ) st.stop() # Collapse the input preview and pipeline editor once the user has clicked # Run Pipeline so the Results section below is the primary visual focus. # The user can re-expand either expander to re-inspect or adjust. _has_result = st.session_state.get("pipeline_result") is not None with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), width="stretch") st.divider() # --------------------------------------------------------------------------- # Pipeline builder # --------------------------------------------------------------------------- # # Wrapped in an outer expander whose default state mirrors the preview # expander above: open before a result exists, folded once the user has # clicked Run Pipeline. The pipeline editor is this page's "Options" # section — structurally analogous to Text Cleaner's options block. 
def _pipeline_to_rows(pipeline: Pipeline) -> pd.DataFrame:
    """Flatten *pipeline* into the editor table, one row per step.

    Each row carries the step's ``tool``, its ``enabled`` flag, and its
    options dict serialised to a JSON string under ``options_json``.
    """
    return pd.DataFrame([
        {
            "tool": s.tool,
            "enabled": s.enabled,
            "options_json": json.dumps(s.options),
        }
        for s in pipeline.steps
    ])


# "Options" expander: pipeline source selection, the step editor, and the
# validation feedback for the pipeline built from the editor.
with st.expander("Options", expanded=not _has_result):
    mode = st.radio(
        "How would you like to define the pipeline?",
        [
            "Use the recommended default (text-clean → format → missing → dedup)",
            "Build interactively",
            "Import a saved pipeline JSON",
        ],
        index=0,
    )

    # Seed the editor table the first time the page runs in this session.
    if "pipeline_rows" not in st.session_state:
        default = recommended_pipeline()
        st.session_state["pipeline_rows"] = _pipeline_to_rows(default)

    if mode.startswith("Use the recommended"):
        # Recommended mode re-applies the default table on every script pass.
        default = recommended_pipeline()
        st.session_state["pipeline_rows"] = _pipeline_to_rows(default)
    elif mode.startswith("Import"):
        pipeline_file = st.file_uploader(
            "Pipeline JSON", type=["json"], key="pipeline_upload",
        )
        if pipeline_file is not None:
            try:
                data = json.loads(pipeline_file.getvalue())
                uploaded_pipe = Pipeline.from_dict(data)
                st.session_state["pipeline_rows"] = _pipeline_to_rows(uploaded_pipe)
                st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
            except Exception as e:
                from src.core.errors import format_for_user
                st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")

    st.caption(
        "Edit the table to add, remove, reorder (drag the row index), enable, "
        "or configure each step. Tool order is recommended, not enforced — "
        "violations surface as warnings below the table."
    )

    edited = st.data_editor(
        st.session_state["pipeline_rows"],
        width="stretch",
        num_rows="dynamic",
        column_config={
            "tool": st.column_config.SelectboxColumn(
                "Tool", options=TOOL_NAMES, required=True,
            ),
            "enabled": st.column_config.CheckboxColumn("Enabled"),
            "options_json": st.column_config.TextColumn(
                "Options (JSON)",
                help='e.g. {"column_types": {"phone": "phone"}}',
            ),
        },
        key="pipeline_editor",
    )
    st.session_state["pipeline_rows"] = edited

    # Build a Pipeline object from the editor state.
    steps_list: list[Step] = []
    parse_errors: list[str] = []
    # Number steps by their position in the table rather than by index
    # label, so the number in an error message matches the row the user
    # sees even if the index is non-contiguous or non-integer.
    for step_no, (_, row) in enumerate(edited.iterrows(), start=1):
        tool = row.get("tool")
        if not tool or pd.isna(tool):
            # Rows without a tool selected are ignored.
            continue
        raw_opts = row.get("options_json") or "{}"
        if pd.isna(raw_opts):
            raw_opts = "{}"
        try:
            opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
            if not isinstance(opts, dict):
                raise ValueError("options must be a JSON object")
        except Exception as e:
            parse_errors.append(f"Step {step_no}: {e}")
            continue
        try:
            steps_list.append(Step(
                tool=str(tool),
                options=opts,
                enabled=bool(row.get("enabled", True)),
            ))
        except Exception as e:
            parse_errors.append(f"Step {step_no}: {e}")

    for err in parse_errors:
        st.error(err)

    current_pipeline = Pipeline(steps=steps_list) if steps_list else None

    if current_pipeline is not None:
        warnings = validate_pipeline(current_pipeline)
        if warnings:
            st.warning(
                "Pipeline is out of recommended order:\n\n"
                + "\n".join(f"- {w}" for w in warnings)
                + "\n\nThe pipeline will still run — these are recommendations only."
            )

    with st.expander("Recommended tool order — why each step belongs where it does"):
        st.markdown(
            "\n".join(
                f"- **{earlier}** before **{later}** — {why}"
                for earlier, later, why in SOFT_DEPENDENCIES
            )
        )

st.divider()

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

run_disabled = current_pipeline is None or not current_pipeline.steps
if st.button(
    "Run Pipeline", type="primary", width="stretch", disabled=run_disabled,
):
    progress = st.progress(0.0, text="Starting...")
    log_box = st.empty()
    log_lines: list[str] = []
    total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
    # One-element list so the callback can update the counter in place.
    completed = [0]

    def _on_step(sr) -> None:
        """Log one finished step and advance the progress bar.

        Passed to ``run_pipeline`` as ``on_step_complete``. ``sr`` is the
        step result; this callback reads its ``skipped``, ``error``,
        ``elapsed_seconds`` and ``step`` attributes.
        """
        if sr.skipped:
            # Skipped steps are logged but do not count towards progress:
            # the denominator only counts enabled steps.
            # NOTE(review): presumably disabled steps arrive here as
            # skipped — confirm against run_pipeline.
            log_lines.append(f"○ {sr.step.display_name()} (skipped)")
        else:
            completed[0] += 1
            if sr.error:
                log_lines.append(
                    f"✗ {sr.step.display_name()} — {sr.error.splitlines()[0]}"
                )
            else:
                log_lines.append(
                    f"✓ {sr.step.display_name()} — {sr.elapsed_seconds*1000:.0f} ms"
                )
        # Two trailing spaces make a Markdown hard line break; a bare "\n"
        # would be a soft break and merge the log into one paragraph.
        log_box.markdown("  \n".join(log_lines))
        # st.progress rejects float values above 1.0, so clamp defensively.
        progress.progress(
            min(completed[0] / max(total_enabled, 1), 1.0),
            text=f"Step {completed[0]}/{total_enabled}",
        )

    try:
        result = run_pipeline(
            df, current_pipeline,
            on_step_complete=_on_step,
            stop_on_error=False,
        )
    except Exception as e:
        from src.core.errors import format_for_user
        st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
        st.stop()

    progress.progress(1.0, text="Done")
    st.session_state["pipeline_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Automated Workflows run", page="9_Pipeline_Runner")
    st.session_state["pipeline_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet at end of file).
    st.session_state["_pipeline_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    # Without this they stay expanded until the user touches any
    # other widget.
    st.rerun()

result = st.session_state.get("pipeline_result")
if result is None:
    st.info(
        "Configure the pipeline above and click **Run Pipeline** to "
        "execute it on your file."
    )
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

# Anchor target for the auto-scroll snippet at the end of this block.
# A bare ``