# datatools-dev/src/gui/pages/9_Pipeline_Runner.py

"""DataTools Automated Workflows — Streamlit page."""

from __future__ import annotations

import io
import json
import sys
from pathlib import Path

import pandas as pd
import streamlit as st

_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.gui.components import (
    back_to_home_link,
    render_sticky_footer,
    hide_streamlit_chrome,
    html_download_button,
    pickup_or_upload,
    require_feature_or_render_upgrade,
)
from src.i18n import t
from src.core.pipeline import (
    Pipeline,
    SOFT_DEPENDENCIES,
    Step,
    TOOL_NAMES,
    recommended_pipeline,
    run_pipeline,
    validate_pipeline,
)
from src.license import FeatureFlag

# Page bootstrap: these calls run at the top of every script pass, before
# any page content below is rendered. The three helpers come from
# ``src.gui.components``; what each one draws is defined there.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()
# Audit hook, tagged with this page's identifier.
# NOTE(review): presumably records a "page opened" event — confirm in src.audit.
from src.audit import log_page_open
log_page_open("9_Pipeline_Runner")
# Feature gate keyed on the PIPELINE_RUNNER flag.
# NOTE(review): the name suggests it renders an upgrade prompt and halts
# the script when the feature is unavailable — confirm in src.gui.components.
require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER)


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

# Title and caption text are looked up by key through the i18n helper
# ``t`` instead of being hard-coded on this page.
st.title(t("tools.09_pipeline_runner.page_title"))
st.caption(t("tools.09_pipeline_runner.page_caption"))


# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

# Obtain the input file through the shared ``pickup_or_upload`` component.
# NOTE(review): the name suggests it can also pick up a file handed over
# from elsewhere in the app, not only a fresh upload — confirm in
# src.gui.components.
uploaded = pickup_or_upload(
    label="Upload CSV or Excel file",
    key="pipeline_file_upload",
    types=["csv", "tsv", "xlsx", "xls"],
)

# Everything below needs a file: show a hint and end this script pass.
if uploaded is None:
    st.info("Upload a CSV, TSV, or Excel file to begin.")
    st.stop()


@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse the raw bytes of an uploaded file into a DataFrame.

    The reader is chosen from the extension of *name*: ``.xlsx`` / ``.xls``
    go to ``pd.read_excel``; anything else is parsed as delimited text,
    tab-separated for ``.tsv`` and comma-separated otherwise.

    Text is decoded as ``utf-8-sig`` first, which accepts UTF-8 with or
    without a leading byte-order mark. Only when the bytes are not valid
    UTF-8 is the file re-read as Latin-1, which can decode any byte
    sequence and therefore never raises ``UnicodeDecodeError``. Both
    attempts use the same separator and ``on_bad_lines="warn"`` so the
    fallback parses the file the same way as the first attempt.

    Args:
        name: Original file name; only its suffix is inspected.
        data: Complete file contents.

    Returns:
        The parsed table.

    Raises:
        Any non-decoding error raised by pandas (for example a parser
        error or a missing Excel engine) propagates to the caller.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data))

    sep = "\t" if suffix == ".tsv" else ","
    # A fresh buffer per attempt: a parse that failed midway must not
    # influence where the retry starts reading.
    try:
        return pd.read_csv(
            io.BytesIO(data), encoding="utf-8-sig", sep=sep, on_bad_lines="warn",
        )
    except UnicodeDecodeError:
        return pd.read_csv(
            io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn",
        )


# Parse the upload; on any failure show a formatted error naming the
# file and end this script pass.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as read_error:
    from src.core.errors import format_for_user

    failure_details = format_for_user(read_error)
    st.error(f"**Could not read `{uploaded.name}`**\n\n```\n{failure_details}\n```")
    st.stop()

# Once a pipeline result is stored in session state, the input preview
# and the pipeline editor start collapsed so the Results section further
# down is what the user sees first. Both expanders can still be opened
# by hand to re-inspect or adjust.
_has_result = st.session_state.get("pipeline_result") is not None

preview_panel = st.expander(f"Preview: {uploaded.name}", expanded=not _has_result)
with preview_panel:
    row_count, column_count = len(df), len(df.columns)
    st.caption(f"{row_count} rows, {column_count} columns")
    st.dataframe(df.head(10), use_container_width=True)

st.divider()


# ---------------------------------------------------------------------------
# Pipeline builder
# ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Run Pipeline. The pipeline editor is this page's "Options"
# section — structurally analogous to Text Cleaner's options block.

def _pipeline_to_rows(pipeline: Pipeline) -> pd.DataFrame:
    """Flatten *pipeline* into the table shown in the step editor.

    Each step becomes one row holding its tool name, its enabled flag and
    its options serialised to a JSON string. The column list is passed
    explicitly so a pipeline with no steps still yields a frame with the
    three expected (empty) columns rather than a column-less one.
    """
    return pd.DataFrame(
        [
            {
                "tool": s.tool,
                "enabled": s.enabled,
                "options_json": json.dumps(s.options),
            }
            for s in pipeline.steps
        ],
        columns=["tool", "enabled", "options_json"],
    )


with st.expander("Options", expanded=not _has_result):
    mode = st.radio(
        "How would you like to define the pipeline?",
        [
            "Use the recommended default (text-clean → format → missing → dedup)",
            "Build interactively",
            "Upload a saved pipeline JSON",
        ],
        index=0,
    )

    # First visit in this session: seed the editor with the recommended steps.
    if "pipeline_rows" not in st.session_state:
        st.session_state["pipeline_rows"] = _pipeline_to_rows(recommended_pipeline())

    if mode.startswith("Use the recommended"):
        # While this mode is selected the table is reset to the
        # recommended steps on every script pass.
        st.session_state["pipeline_rows"] = _pipeline_to_rows(recommended_pipeline())
    elif mode.startswith("Upload"):
        pipeline_file = st.file_uploader(
            "Pipeline JSON", type=["json"], key="pipeline_upload",
        )
        if pipeline_file is not None:
            try:
                data = json.loads(pipeline_file.getvalue())
                uploaded_pipe = Pipeline.from_dict(data)
                st.session_state["pipeline_rows"] = _pipeline_to_rows(uploaded_pipe)
                st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
            except Exception as e:
                from src.core.errors import format_for_user
                st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")

    st.caption(
        "Edit the table to add, remove, reorder (drag the row index), enable, "
        "or configure each step. Tool order is recommended, not enforced — "
        "violations surface as warnings below the table."
    )
    edited = st.data_editor(
        st.session_state["pipeline_rows"],
        use_container_width=True,
        num_rows="dynamic",
        column_config={
            "tool": st.column_config.SelectboxColumn(
                "Tool", options=TOOL_NAMES, required=True,
            ),
            "enabled": st.column_config.CheckboxColumn("Enabled"),
            "options_json": st.column_config.TextColumn(
                "Options (JSON)",
                help='e.g. {"column_types": {"phone": "phone"}}',
            ),
        },
        key="pipeline_editor",
    )
    st.session_state["pipeline_rows"] = edited

    # Build a Pipeline object from the editor state.
    steps_list: list[Step] = []
    parse_errors: list[str] = []
    # Steps are numbered by their position in the table, not by the
    # DataFrame index label: with ``num_rows="dynamic"`` the labels need
    # not be the contiguous integers 0..n-1, so ``label + 1`` could point
    # at the wrong row or fail outright for a non-integer label.
    for step_no, (_, row) in enumerate(edited.iterrows(), start=1):
        tool = row.get("tool")
        if not tool or pd.isna(tool):
            # Row without a tool selected: nothing to run.
            continue
        raw_opts = row.get("options_json") or "{}"
        if pd.isna(raw_opts):
            raw_opts = "{}"
        try:
            opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
            if not isinstance(opts, dict):
                raise ValueError("options must be a JSON object")
        except Exception as e:
            parse_errors.append(f"Step {step_no}: {e}")
            continue
        try:
            enabled = row.get("enabled", True)
            # A blank checkbox cell counts as unchecked. Plain ``bool()``
            # is inconsistent for blanks: bool(None) is False but
            # bool(float("nan")) is True.
            enabled = False if pd.isna(enabled) else bool(enabled)
            steps_list.append(Step(
                tool=str(tool),
                options=opts,
                enabled=enabled,
            ))
        except Exception as e:
            parse_errors.append(f"Step {step_no}: {e}")

    for err in parse_errors:
        st.error(err)

    # ``None`` when the table yields no usable step; the Run and Downloads
    # sections below check for that.
    current_pipeline = Pipeline(steps=steps_list) if steps_list else None

    if current_pipeline is not None:
        order_warnings = validate_pipeline(current_pipeline)
        if order_warnings:
            st.warning(
                "Pipeline is out of recommended order:\n\n"
                + "\n".join(f"- {w}" for w in order_warnings)
                + "\n\nThe pipeline will still run — these are recommendations only."
            )

    with st.expander("Recommended tool order — why each step belongs where it does"):
        st.markdown(
            "\n".join(
                f"- **{earlier}** before **{later}** — {why}"
                for earlier, later, why in SOFT_DEPENDENCIES
            )
        )

st.divider()

st.divider()

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

# The button is disabled while the editor yields no runnable pipeline.
run_disabled = current_pipeline is None or not current_pipeline.steps

if st.button(
    "Run Pipeline",
    type="primary",
    use_container_width=True,
    disabled=run_disabled,
):
    progress = st.progress(0.0, text="Starting...")
    log_box = st.empty()
    log_lines: list[str] = []
    # Denominator of the progress bar: only enabled steps count.
    total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
    # One-element list so the callback can bump the counter in place.
    completed = [0]

    def _on_step(sr) -> None:
        """Append one log line for *sr* and advance the progress bar.

        Passed to ``run_pipeline`` as ``on_step_complete``. Skipped steps
        are logged but do not advance the counter: the denominator only
        counts enabled steps, so counting skipped ones too could push the
        fraction past 1.0, which ``st.progress`` rejects. The fraction is
        additionally clamped to 1.0 as a safeguard.
        """
        if sr.skipped:
            log_lines.append(f"○ {sr.step.display_name()} (skipped)")
        else:
            completed[0] += 1
            if sr.error:
                log_lines.append(
                    f"✗ {sr.step.display_name()} — {sr.error.splitlines()[0]}"
                )
            else:
                log_lines.append(
                    f"✓ {sr.step.display_name()} — {sr.elapsed_seconds*1000:.0f} ms"
                )
        # Two trailing spaces force a Markdown hard line break; a bare
        # newline would merge all entries into a single paragraph.
        log_box.markdown("  \n".join(log_lines))
        progress.progress(
            min(completed[0] / max(total_enabled, 1), 1.0),
            text=f"Step {completed[0]}/{total_enabled}",
        )

    try:
        result = run_pipeline(
            df, current_pipeline,
            on_step_complete=_on_step,
            stop_on_error=False,
        )
    except Exception as e:
        from src.core.errors import format_for_user
        st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
        st.stop()

    progress.progress(1.0, text="Done")
    st.session_state["pipeline_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Automated Workflows run", page="9_Pipeline_Runner")
    st.session_state["pipeline_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet at end of file).
    st.session_state["_pipeline_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    # Without this they stay expanded until the user touches any
    # other widget.
    st.rerun()

# Read the result back from session state: the Run branch above stores it
# there and then calls ``st.rerun()``, so the pass that renders the
# Results section gets the value from session state, not from a local.
result = st.session_state.get("pipeline_result")
# No run yet in this session: show a hint and end this script pass so
# nothing below is rendered.
if result is None:
    st.info(
        "Configure the pipeline above and click **Run Pipeline** to "
        "execute it on your file."
    )
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

# Anchor target for the auto-scroll snippet at the end of this block.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. Placed before the subheader so the scrolled-to viewport
# starts a few pixels above the section heading rather than below it.
st.markdown(
    '<div id="pipeline-results-anchor" style="height:1px"></div>',
    unsafe_allow_html=True,
)

st.subheader("Results")

# Headline numbers, one metric per column, left to right.
steps_run = sum(1 for sr in result.step_results if not sr.skipped)
headline_metrics = [
    ("Initial rows", result.initial_rows),
    ("Final rows", result.final_rows),
    ("Steps run", steps_run),
    ("Elapsed", f"{result.total_elapsed:.2f} s"),
]
for metric_column, (label, value) in zip(st.columns(4), headline_metrics):
    metric_column.metric(label, value)

st.markdown("**Per-step summary**")
# One table row per step result; the JSON summary is cut to 200
# characters to keep the table readable.
summary_rows = []
for sr in result.step_results:
    if sr.skipped:
        status = "skipped"
    elif sr.error:
        status = "error"
    else:
        status = "ok"
    summary_rows.append({
        "step": sr.step.display_name(),
        "status": status,
        "elapsed_ms": int(sr.elapsed_seconds * 1000),
        "summary": json.dumps(sr.summary, default=str)[:200],
        "error": sr.error or "",
    })
step_df = pd.DataFrame(summary_rows)
st.dataframe(step_df, use_container_width=True, hide_index=True)

st.markdown("**Output preview (first 10 rows)**")
st.dataframe(result.final_df.head(10), use_container_width=True)

# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
#
# All three byte buffers are prepared up front (outside the columns) so
# each download button sees stable ``data`` across reruns. The original
# rationale also called for an explicit ``key`` on every
# ``st.download_button``: without stable data and a key, Streamlit's
# auto-derived widget IDs can collide for multiple download buttons in
# adjacent columns, and only the first one actually fires on click.
# NOTE(review): the calls below go through ``html_download_button`` and
# pass no ``key`` — confirm whether the widget-ID concern still applies.
# The pipeline-JSON button renders unconditionally (disabled when no
# pipeline is defined) so the layout stays steady.

st.divider()
input_name = st.session_state.get("pipeline_input_name", "input")
stem = Path(input_name).stem

# Cleaned data as CSV, encoded as UTF-8 with a byte-order mark.
cleaned_bytes = result.final_df.to_csv(index=False).encode("utf-8-sig")

# Pipeline definition taken from the current editor state; an empty step
# list stands in when no pipeline is defined.
if current_pipeline:
    pipeline_payload = current_pipeline.to_dict()
else:
    pipeline_payload = {"steps": []}
pipeline_bytes = json.dumps(pipeline_payload, indent=2, default=str).encode("utf-8")

# Run audit: overall figures plus one record per step result.
step_records = []
for sr in result.step_results:
    step_records.append({
        "tool": sr.step.tool,
        "name": sr.step.display_name(),
        "enabled": sr.step.enabled,
        "skipped": sr.skipped,
        "elapsed_seconds": sr.elapsed_seconds,
        "summary": sr.summary,
        "error": sr.error,
    })
audit_payload = {
    "warnings": result.warnings,
    "initial_rows": result.initial_rows,
    "final_rows": result.final_rows,
    "total_elapsed_seconds": result.total_elapsed,
    "steps": step_records,
}
audit_bytes = json.dumps(audit_payload, indent=2, default=str).encode("utf-8")

_pipeline_empty = current_pipeline is None or not current_pipeline.steps
if _pipeline_empty:
    pipeline_help = "No pipeline defined."
else:
    pipeline_help = "Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file."

csv_column, pipeline_column, audit_column = st.columns(3)
with csv_column:
    html_download_button(
        "Download cleaned CSV",
        cleaned_bytes,
        file_name=f"{stem}_pipeline.csv",
        mime="text/csv",
    )
with pipeline_column:
    html_download_button(
        "Download pipeline JSON",
        pipeline_bytes,
        file_name="pipeline.json",
        mime="application/json",
        disabled=_pipeline_empty,
        help=pipeline_help,
    )
with audit_column:
    html_download_button(
        "Download run audit",
        audit_bytes,
        file_name=f"{stem}_pipeline_audit.json",
        mime="application/json",
    )


# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Run Pipeline, the preview + options collapse but
# Streamlit by itself doesn't scroll — the Results section is at the
# bottom of a tall script so the user has to find it. Inject a tiny
# component-html iframe that calls ``scrollIntoView`` on the parent's
# Results anchor. Streamlit's main page is same-origin with component
# iframes so ``window.parent.document`` access is allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport
# back to the top of Results.
if st.session_state.pop("_pipeline_scroll_to_results", False):
    from streamlit.components.v1 import html as _components_html

    # Script run inside a zero-height component iframe: it reaches into
    # the parent document and scrolls the Results anchor into view when
    # that element exists.
    _scroll_snippet = """
        <script>
          const doc = window.parent.document;
          const target = doc.getElementById('pipeline-results-anchor');
          if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
        </script>
        """
    _components_html(_scroll_snippet, height=0)