# datatools-dev/tests/gui/test_pipeline_builder.py

"""Pipeline Runner — visual module-card builder contract (AppTest).

Pins the behaviors the JSON-table → module-card rewrite introduced:
recommended steps seed as cards with friendly names, each step exposes a
plain-language Configure panel (no raw per-row JSON), steps can be toggled /
added / removed, JSON lives only under Advanced, and a run produces results
with friendly step names. The page's bare initial-render contract across junk
files is covered separately in ``tests/test_junk_corpus_tool_pages.py``.
"""

from __future__ import annotations

from pathlib import Path

import pytest
from streamlit.testing.v1 import AppTest

# Streamlit page under test, located relative to this file: three directory
# levels up from the resolved test path, then src/gui/pages.
_PAGE = Path(__file__).resolve().parent.parent.parent.joinpath(
    "src", "gui", "pages", "9_Pipeline_Runner.py"
)

# Fixture upload: a header plus three customer rows. The first two data rows
# spell the same customer differently (case, padding, phone and date format).
_CSV = b"".join((
    b"name,email,phone,signup_date\n",
    b"  Jane Doe ,jane@acme.io,512-555-0190,2024-01-04\n",
    b"jane doe,JANE@ACME.IO,(512) 555-0190,01/04/2024\n",
    b"Bob Smith,bob@globex.com,720.555.7781,2024-02-11\n",
))


def _app() -> AppTest:
    """Return the Pipeline Runner page after its first run with a CSV preloaded.

    The ``home_uploaded_*`` session keys are filled in before the run so the
    page starts with ``customers.csv`` already present in session state.
    """
    preloaded = {
        "home_uploaded_bytes": _CSV,
        "home_uploaded_name": "customers.csv",
        "home_uploaded_size": len(_CSV),
    }
    harness = AppTest.from_file(str(_PAGE), default_timeout=30)
    for key, value in preloaded.items():
        harness.session_state[key] = value
    return harness.run()


def test_recommended_steps_seed_as_named_cards():
    """First render seeds the recommended steps and shows their friendly names."""
    app = _app()
    assert not app.exception

    # Internal tool ids, in the order they are stored in session state.
    seeded = [step["tool"] for step in app.session_state["pipeline_steps"]]
    assert seeded == ["text_clean", "format_standardize", "missing", "dedup"]

    # Friendly names are looked up in the concatenated markdown output.
    rendered = " ".join(element.value for element in app.markdown)
    friendly_names = (
        "Clean Text",
        "Standardize Formats",
        "Fix Missing Values",
        "Find Duplicates",
    )
    absent = [name for name in friendly_names if name not in rendered]
    assert not absent


def test_each_step_has_a_configure_panel_and_json_is_advanced_only():
    """Steps expose ``Configure: <name>`` expanders; JSON has an Advanced one.

    Checks expander labels only: a Configure panel for the first and last
    seeded steps, plus the Advanced import/export expander.
    """
    at = _app()
    # Fail on a script error first (as the sibling tests do); otherwise a
    # crashed render would surface as a misleading "expander missing" failure.
    assert not at.exception, at.exception
    labels = [e.label for e in at.get("expander")]
    assert any(label.startswith("Configure: Clean Text") for label in labels)
    assert any(label.startswith("Configure: Find Duplicates") for label in labels)
    # Raw JSON is import/export only — never a per-step editing surface.
    # NOTE(review): only the presence of the Advanced expander is asserted
    # here; the absence of per-step JSON editors is not checked.
    assert any("Advanced — import / export" in label for label in labels)


def test_toggle_disables_step_and_persists():
    """Turning the first toggle off records ``enabled=False`` on the first step.

    Assumes ``at.toggle[0]`` is the enable switch of the first pipeline card —
    TODO confirm against the page layout if other toggles are added above it.
    """
    at = _app()
    at.toggle[0].set_value(False).run()
    # Match the other interactive tests: a script error during the rerun
    # should be reported as such, not as a stale-state mismatch below.
    assert not at.exception, at.exception
    assert at.session_state["pipeline_steps"][0]["enabled"] is False


def test_add_step_appends_a_working_config_panel():
    """Picking a tool and pressing "Add step" appends it with a Configure panel."""
    app = _app()

    # Choose the tool in the add-step selectbox, then rerun.
    pickers = [box for box in app.selectbox if box.key == "pipeline_add_tool"]
    pickers[0].set_value("column_map").run()

    # Press the button whose label contains "Add step", then rerun.
    add_buttons = [btn for btn in app.button if "Add step" in btn.label]
    add_buttons[0].click().run()

    assert not app.exception
    newest_step = app.session_state["pipeline_steps"][-1]
    assert newest_step["tool"] == "column_map"

    expander_labels = [exp.label for exp in app.get("expander")]
    assert any(
        text.startswith("Configure: Map Columns") for text in expander_labels
    )


def test_remove_step_drops_it():
    """Clicking the first "✕" button shortens the step list by exactly one."""
    app = _app()
    steps_before = len(app.session_state["pipeline_steps"])

    # Buttons labelled exactly "✕"; the first one found is clicked.
    remove_buttons = [btn for btn in app.button if btn.label == "✕"]
    remove_buttons[0].click().run()

    assert not app.exception
    steps_after = len(app.session_state["pipeline_steps"])
    assert steps_after == steps_before - 1


def test_run_produces_results_with_friendly_names():
    """Clicking "Run Pipeline" stores an error-free result in session state.

    NOTE(review): despite the test name, nothing here asserts on the friendly
    step names shown with the result.
    """
    app = _app()
    run_buttons = [btn for btn in app.button if btn.label == "Run Pipeline"]
    run_buttons[0].click().run()

    assert not app.exception, app.exception
    assert "pipeline_result" in app.session_state

    outcome = app.session_state["pipeline_result"]
    # the two Jane rows merge
    assert outcome.initial_rows == 3
    assert outcome.final_rows == 2
    for step_result in outcome.step_results:
        assert step_result.error is None


def test_step_phrase_is_plain_english_not_json():
    """Pin the exact wording of ``step_phrase`` and the levels of ``step_status``."""
    from src.gui.components.pipeline_modules import step_phrase, step_status

    # dedup: wording is pinned verbatim, including the row-count arrow.
    dedup_stats = {
        "input_rows": 18442,
        "output_rows": 18130,
        "duplicates_removed": 312,
        "groups": 147,
    }
    dedup_phrase = step_phrase("dedup", dedup_stats)
    assert dedup_phrase == "312 duplicates removed across 147 groups (18,442 → 18,130 rows)"

    # text_clean: affected columns are joined in prose; counts get separators.
    clean_stats = {"cells_changed": 1204, "columns_processed": ["name", "city"]}
    assert step_phrase("text_clean", clean_stats) == "1,204 cells cleaned in name & city"

    # missing: a count of one uses the singular noun.
    missing_stats = {"rows_dropped": 1, "columns_dropped": ["x"]}
    assert step_phrase("missing", missing_stats) == "1 row dropped, 1 column dropped"

    # Unparseable cells turn the status into a warning with a non-empty detail.
    warn_stats = {"cells_changed": 100, "cells_unparseable": 141}
    label, level, detail = step_status("format_standardize", warn_stats)
    assert level == "warn"
    assert "141 skipped" in label
    assert detail

    # A clean step reports level "ok" (its detail is not asserted here).
    clean_status = step_status("text_clean", {"cells_changed": 5})
    assert clean_status[1] == "ok"