# datatools-dev/src/gui/app_demo.py

"""DataTools — public demo app (deploys to Streamlit Community Cloud).

This is a SEPARATE entry point from the main GUI (``src/gui/app.py``).
The full GUI is the paid product surface; this demo is the marketing
surface — a single page that runs one of three persona-specific
pipelines on a preloaded sample file, shows the BEFORE / AFTER
side-by-side, and converts the visitor to a Gumroad purchase.

Launch:
    streamlit run src/gui/app_demo.py

URL routing:
    https://demo.datatools.app/?p=shopify-pet   (Shopify operator)
    https://demo.datatools.app/?p=bookkeeper    (Bookkeeper)
    https://demo.datatools.app/?p=revops        (RevOps agency)

Free / paid boundary (per docs/DEMO-PLAN.md §6):
    - input rows capped at ``DEMO_ROW_CAP``
    - input file size capped at ``DEMO_FILE_CAP_MB``
    - download CSV gets a single trailing watermark row
    - the pipeline editor is read-only — visitor sees it but can't change it
    - no audit-log download (paid feature)
    - no save-pipeline-JSON (paid feature)

The demo runs the *same engine* as the paid product. Caps are applied
at the surface layer only — when the buyer downloads and runs the paid
build, every cap disappears.
"""

from __future__ import annotations

import io
import json
import sys
import time
from pathlib import Path
from typing import Any

import pandas as pd
import streamlit as st


# Ensure project root is on sys.path so `src.core` imports work
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.core.pipeline import Pipeline, run_pipeline


# ---------------------------------------------------------------------------
# Free / paid boundary constants
# ---------------------------------------------------------------------------

# Maximum number of input rows the demo processes; longer uploads are
# truncated to this many rows before the pipeline runs.
DEMO_ROW_CAP: int = 100
# Maximum upload size in megabytes; larger uploads are not parsed and the
# sample dataset is used instead.
DEMO_FILE_CAP_MB: int = 5
# Base purchase URL; the CTA links append ``?from=<persona key>`` to it.
GUMROAD_BASE: str = "https://gumroad.com/l/datatools"


# ---------------------------------------------------------------------------
# Persona registry — single source of truth
# ---------------------------------------------------------------------------

# Folder holding the bundled sample CSVs and saved pipeline JSON files.
DEMO_DIR = _project_root.joinpath("samples", "demo")


# Registry of demo personas, keyed by the value of the ``?p=`` query
# parameter. Each entry carries:
#   label / icon   - shown in the header and the persona switcher
#   h1 / sub       - Markdown headline and sub-headline
#   data_file      - sample CSV file name inside DEMO_DIR
#   pipeline_file  - saved pipeline JSON file name inside DEMO_DIR
#   cta            - text of the buy button
#   landing        - landing-page URL printed in the download watermark row
PERSONAS: dict[str, dict[str, Any]] = {
    "shopify-pet": {
        "label": "Shopify pet operator",
        "icon": "🛍️",
        "h1": "Klaviyo-import-ready customer lists. **In 30 seconds. Locally.**",
        "sub": (
            "Your Shopify customer export has duplicates Excel can't catch, "
            "international phones Excel can't parse, and disguised nulls "
            "(`N/A`, `(blank)`, `?`) that break Klaviyo's import. "
            "DataTools fixes all of it in one pass — and your data never "
            "leaves your computer."
        ),
        "data_file":     "shopify_pet_customers.csv",
        "pipeline_file": "shopify_pet_pipeline.json",
        "cta":           "Get DataTools for Shopify — $49 →",
        "landing":       "https://datatools.app/shopify/",
    },
    "bookkeeper": {
        "label": "Bookkeeper / freelance accountant",
        "icon": "📒",
        "h1": "Reconcile messy bank exports. **Hand your client an audit trail.**",
        "sub": (
            "The Jan and Feb exports overlap; the same transaction posts twice. "
            # The asterisk inside the third vendor name is backslash-escaped:
            # unescaped, Markdown treats it as the closing emphasis marker and
            # the name renders as italic "AMAZON.COM" followed by "4F2X9*".
            "Vendor names are *Amazon* / *amazon.com* / *AMAZON.COM\\*4F2X9* in "
            "three rows. DataTools dedups on Date + Amount + fuzzy Vendor, "
            "produces ISO dates and numeric amounts, and gives you a row-level "
            "audit log to hand the client."
        ),
        "data_file":     "bookkeeper_bank_reconcile.csv",
        "pipeline_file": "bookkeeper_bank_pipeline.json",
        "cta":           "Get DataTools for Bookkeepers — $49 →",
        "landing":       "https://datatools.app/bookkeeper/",
    },
    "revops": {
        "label": "Marketing / RevOps agency",
        "icon": "🪢",
        "h1": "Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — **locally.**",
        "sub": (
            "The same prospect shows up in HubSpot as `alice@acme.com`, in "
            "LinkedIn as `Alice.Johnson@acme.com`, and in your VA's manual "
            "scrape as `alice@acme.com` again. Country is `USA` / `US` / "
            "`United States`. DataTools fuzzy-matches across sources, "
            "normalizes phones for 50+ countries, and merges survivors "
            "with their most-complete fields — without uploading anything."
        ),
        "data_file":     "agency_combined_leads.csv",
        "pipeline_file": "agency_leads_pipeline.json",
        "cta":           "Get DataTools for RevOps — $49 →",
        "landing":       "https://datatools.app/revops/",
    },
}

# Persona used when ``?p=`` is missing or names an unknown persona.
DEFAULT_PERSONA = "shopify-pet"


# ---------------------------------------------------------------------------
# Page config + routing
# ---------------------------------------------------------------------------

# Page-level settings: browser tab title/icon, wide layout, sidebar collapsed.
# NOTE(review): this is the first ``st.*`` call in the script; Streamlit has
# historically required ``set_page_config`` to come first — keep it here.
st.set_page_config(
    page_title="DataTools — try it live",
    page_icon="🧹",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Strip Streamlit chrome that breaks the iframe-embed look on the
# landing pages. The same stylesheet hides the sidebar, applies the dark
# colour scheme, and defines the ``.demo-card``, ``.cta-block`` and
# ``.metric-pill`` classes; the latter two are used by the raw-HTML snippets
# rendered further down the page.
st.markdown("""
<style>
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1.2rem; padding-bottom: 1rem; max-width: 1200px; }
[data-testid="stSidebarNav"] { display: none; }
section[data-testid="stSidebar"] { display: none; }
.stApp { background: #0f1115; color: #e8eaed; }
h1, h2, h3 { color: #e8eaed; letter-spacing: -0.01em; }
hr { border-color: #252a36; }
.demo-card {
  background: #161922;
  border: 1px solid #252a36;
  border-radius: 12px;
  padding: 18px;
}
.cta-block {
  background: linear-gradient(135deg, #161922 0%, #1d212b 100%);
  border: 1px solid #6ee7b7;
  border-radius: 12px;
  padding: 24px;
  text-align: center;
}
.cta-block a {
  display: inline-block;
  background: #6ee7b7; color: #052e1a;
  font-weight: 600; padding: 12px 22px;
  border-radius: 8px; text-decoration: none;
  font-size: 17px; margin-top: 12px;
}
.metric-pill {
  display: inline-block;
  background: #1d212b; border: 1px solid #252a36;
  padding: 4px 10px; border-radius: 999px;
  font-family: ui-monospace, monospace; font-size: 13px;
  color: #6ee7b7; margin-right: 6px; margin-bottom: 4px;
}
</style>
""", unsafe_allow_html=True)


def _resolve_persona() -> str:
    """Return the persona key requested via the ``?p=`` query parameter.

    A missing parameter, or one that is not a key of ``PERSONAS``, resolves
    to ``DEFAULT_PERSONA``.
    """
    try:
        requested = st.query_params.get("p", DEFAULT_PERSONA)
    except AttributeError:
        # Streamlit builds without ``st.query_params``: the legacy getter
        # returns a list of values per key, so unwrap the first one.
        legacy_params = st.experimental_get_query_params()
        requested = legacy_params.get("p", [DEFAULT_PERSONA])
        if isinstance(requested, list):
            requested = requested[0]
    return requested if requested in PERSONAS else DEFAULT_PERSONA


# Persona for this page view, taken from the URL (or the default).
persona_key = _resolve_persona()
persona = PERSONAS[persona_key]


# ---------------------------------------------------------------------------
# Header + persona switch
# ---------------------------------------------------------------------------

col_brand, col_switch = st.columns([3, 2])
with col_brand:
    st.markdown(f"### 🧹 DataTools / for {persona['label']}")
with col_switch:
    # Quick-switch dropdown for visitors landing on the wrong persona
    new_choice = st.selectbox(
        "Try a different demo",
        options=list(PERSONAS),
        format_func=lambda k: f"{PERSONAS[k]['icon']} {PERSONAS[k]['label']}",
        index=list(PERSONAS).index(persona_key),
        key="persona_switch",
        label_visibility="collapsed",
    )
    if new_choice != persona_key:
        # Write the choice back to the URL and restart the script so the
        # whole page re-renders for the new persona. The fallbacks mirror
        # ``_resolve_persona``, which also supports Streamlit builds that
        # predate ``st.query_params`` / ``st.rerun``.
        try:
            st.query_params["p"] = new_choice
        except AttributeError:
            # Older Streamlit versions
            st.experimental_set_query_params(p=new_choice)
        _rerun = getattr(st, "rerun", None) or st.experimental_rerun
        _rerun()

st.markdown(f"## {persona['h1']}")
st.markdown(persona["sub"])

st.markdown("---")


# ---------------------------------------------------------------------------
# Load preloaded sample data + pipeline
# ---------------------------------------------------------------------------

@st.cache_data(show_spinner=False)
def _load_demo(data_file: str, pipeline_file: str) -> tuple[pd.DataFrame, Pipeline]:
    """Load one persona's bundled sample CSV and saved pipeline.

    Both file names are resolved relative to ``DEMO_DIR``. The CSV is read
    with every column as text and without pandas' default NA detection, so
    cells such as ``N/A`` arrive as the literal strings in the file.

    Returns:
        ``(sample dataframe, pipeline)``; cached by Streamlit per argument
        pair.
    """
    csv_path = DEMO_DIR / data_file
    pipeline_path = DEMO_DIR / pipeline_file
    frame = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    return frame, Pipeline.from_file(pipeline_path)


# Sample data and saved pipeline for the active persona.
sample_df, sample_pipeline = _load_demo(
    persona["data_file"],
    persona["pipeline_file"],
)


def _read_uploaded(uploaded_file) -> tuple[pd.DataFrame, list[str]]:
    """Decode an uploaded file into a text-only DataFrame for the demo.

    Args:
        uploaded_file: Upload object exposing ``getvalue()`` (the raw bytes)
            and ``name`` (only its extension is used, to pick the parser).

    Returns:
        ``(df, warnings)``. ``df`` is the parsed upload truncated to
        ``DEMO_ROW_CAP`` rows, or a copy of the sample dataset when the file
        is larger than ``DEMO_FILE_CAP_MB`` or cannot be parsed.
        ``warnings`` holds the user-facing notes explaining any cap or
        fallback that was applied.
    """
    warnings: list[str] = []
    raw = uploaded_file.getvalue()
    size_mb = len(raw) / 1024 / 1024
    if size_mb > DEMO_FILE_CAP_MB:
        warnings.append(
            f"Uploaded file is {size_mb:.1f} MB — demo capped at "
            f"{DEMO_FILE_CAP_MB} MB. The paid product has no size limit."
        )
        return sample_df.copy(), warnings

    suffix = Path(uploaded_file.name).suffix.lower()
    bio = io.BytesIO(raw)
    try:
        if suffix in (".xlsx", ".xls"):
            df = pd.read_excel(bio, dtype=str, keep_default_na=False)
        else:
            # Everything that is not Excel is parsed as delimited text:
            # tab-separated for ``.tsv``, comma-separated otherwise.
            csv_options = dict(
                dtype=str,
                keep_default_na=False,
                sep="\t" if suffix == ".tsv" else ",",
                on_bad_lines="warn",
            )
            try:
                # ``utf-8-sig`` decodes plain UTF-8 as well as UTF-8 with a
                # BOM, so a single attempt covers both.
                df = pd.read_csv(bio, encoding="utf-8-sig", **csv_options)
            except UnicodeDecodeError:
                # latin-1 maps every byte to a character, so this retry
                # cannot fail on decoding; it is the last-resort encoding.
                bio.seek(0)
                df = pd.read_csv(bio, encoding="latin-1", **csv_options)
    except Exception as e:
        # Deliberate best-effort boundary: any parse failure falls back to
        # the sample dataset instead of crashing the demo page.
        warnings.append(f"Could not read your file ({type(e).__name__}). "
                        "Demo will run on the sample dataset.")
        return sample_df.copy(), warnings

    if len(df) > DEMO_ROW_CAP:
        warnings.append(
            f"Demo capped at {DEMO_ROW_CAP} rows — your file has {len(df):,}. "
            f"Running on the first {DEMO_ROW_CAP} rows. The paid product has no row limit."
        )
        df = df.head(DEMO_ROW_CAP)
    return df, warnings


# ---------------------------------------------------------------------------
# File source: preloaded sample (default) or user upload
# ---------------------------------------------------------------------------

st.markdown(f"#### Sample dataset preloaded · `{persona['data_file']}`")

with st.expander(
    "Or replace with your own file (capped at "
    f"{DEMO_ROW_CAP} rows / {DEMO_FILE_CAP_MB} MB for the demo)",
    expanded=False,
):
    uploaded = st.file_uploader(
        "Your file",
        type=["csv", "tsv", "xlsx", "xls"],
        key="demo_user_file",
        label_visibility="collapsed",
        # The help text spells out both caps separately: over the row cap
        # the file is truncated, over the size cap it is not read at all.
        help=(
            f"Files with more than {DEMO_ROW_CAP} rows are accepted but only "
            f"the first {DEMO_ROW_CAP} rows are processed. Files over "
            f"{DEMO_FILE_CAP_MB} MB are not read — the demo runs on the "
            "sample dataset instead. The paid build runs on 1 GB+ files "
            "via streaming."
        ),
    )

if uploaded is not None:
    df_in, upload_warnings = _read_uploaded(uploaded)
    for w in upload_warnings:
        st.info(w)
    # NOTE: this only records that an upload was provided.
    # ``_read_uploaded`` may still have fallen back to the sample dataset
    # (oversized or unreadable file); the warnings shown above say so.
    using_sample = False
else:
    # No upload: work on a private copy of the sample dataset.
    df_in = sample_df.copy()
    using_sample = True


# ---------------------------------------------------------------------------
# BEFORE preview
# ---------------------------------------------------------------------------

# Headline with the input's dimensions, then a peek at its first ten rows.
_n_rows, _n_cols = df_in.shape
st.markdown(f"#### BEFORE — {_n_rows} rows, {_n_cols} columns")
_before_preview = df_in.head(10)
st.dataframe(_before_preview, use_container_width=True, hide_index=True)

st.markdown("---")


# ---------------------------------------------------------------------------
# Pipeline (read-only)
# ---------------------------------------------------------------------------

st.markdown("#### Pipeline (saved — paid version is editable)")
# One numbered label per pipeline step, chained with arrows on a single line.
_step_labels = [
    f"**{position}.** {step.tool}"
    for position, step in enumerate(sample_pipeline.steps, start=1)
]
pipe_summary = " → ".join(_step_labels)
st.markdown(pipe_summary)


# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

run_clicked = st.button(
    "▶ Run pipeline",
    type="primary",
    use_container_width=True,
    key="demo_run_button",
)

# Signature of the current input (``None`` for the preloaded sample). It is
# stored next to the result so that a result computed from one input is
# never displayed beside a different one after the visitor uploads,
# replaces or removes a file.
_input_sig = (
    (uploaded.name, uploaded.size, getattr(uploaded, "file_id", None))
    if uploaded is not None
    else None
)

if run_clicked:
    with st.spinner("Running…"):
        t0 = time.perf_counter()
        try:
            result = run_pipeline(df_in, sample_pipeline, stop_on_error=False)
        except Exception as e:
            # Imported lazily: only needed on the failure path.
            from src.core.errors import format_for_user
            st.error(f"Demo halted: {format_for_user(e)}")
            st.stop()
        elapsed = time.perf_counter() - t0
    # Persist the outcome: Streamlit reruns the whole script on every
    # interaction, and the button only reads True on the rerun it triggers.
    st.session_state["demo_result"] = result
    st.session_state["demo_elapsed"] = elapsed
    st.session_state["demo_persona"] = persona_key
    st.session_state["demo_input_sig"] = _input_sig

result = st.session_state.get("demo_result")
elapsed = st.session_state.get("demo_elapsed", 0.0)
result_persona = st.session_state.get("demo_persona")

# Reset cached result when the persona switches or the input file changes;
# otherwise the AFTER section would describe data the visitor no longer sees
# in the BEFORE table.
if result is not None and (
    result_persona != persona_key
    or st.session_state.get("demo_input_sig") != _input_sig
):
    result = None
    for _stale_key in ("demo_result", "demo_elapsed", "demo_persona", "demo_input_sig"):
        st.session_state.pop(_stale_key, None)


# ---------------------------------------------------------------------------
# AFTER + metrics + CTA
# ---------------------------------------------------------------------------

if result is None:
    # Pre-run state — show the buy block at the bottom anyway so the
    # CTA is always visible above the fold once the visitor scrolls.
    st.markdown(
        f"""
<div class="cta-block" style="margin-top: 24px;">
  <strong style="font-size: 18px;">Already convinced?</strong><br/>
  Skip the demo and grab the full version. One-time payment, no subscription.<br/>
  <a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
</div>
""",
        unsafe_allow_html=True,
    )
else:
    st.markdown("---")
    rows_before = len(df_in)
    rows_after = len(result.final_df)
    st.markdown(
        f"#### AFTER — {rows_before} → {rows_after} rows · "
        f"finished in {elapsed*1000:.0f} ms"
    )

    # Per-step metric pills. Each (summary key, wording) pair contributes
    # one fragment to a step's pill when the step reports a non-zero value.
    summary_wording = (
        ("cells_changed", "cells"),
        ("sentinels_standardized", "sentinels"),
        ("duplicates_removed", "dupes merged"),
        ("columns_renamed", "renamed"),
    )
    pills_html: list[str] = []
    for sr in result.step_results:
        if sr.skipped:
            # Skipped steps get no pill at all.
            continue
        if sr.error:
            # Failed steps get an amber "error" pill.
            pill = (
                f'<span class="metric-pill" style="color:#fbbf24">'
                f'{sr.step.tool}: error</span>'
            )
        else:
            s = sr.summary
            bits = [
                f"{s[key]} {wording}"
                for key, wording in summary_wording
                if key in s and s[key]
            ]
            label = ", ".join(bits) or "no-op"
            pill = f'<span class="metric-pill">{sr.step.tool}: {label}</span>'
        pills_html.append(pill)
    st.markdown("".join(pills_html), unsafe_allow_html=True)

    after_preview = result.final_df.head(10)
    st.dataframe(after_preview, use_container_width=True, hide_index=True)

    # ----- Download with watermark row -----
    # One extra trailing row: the buy message in the first column, empty
    # strings in every other column.
    watermark_text = f"DataTools demo — buy at {persona['landing']}"
    watermark_row = pd.DataFrame([{
        col: (watermark_text if position == 0 else "")
        for position, col in enumerate(result.final_df.columns)
    }])
    out_df = pd.concat([result.final_df, watermark_row], ignore_index=True)
    # Encoded with a UTF-8 byte-order mark (``utf-8-sig``).
    csv_bytes = out_df.to_csv(index=False).encode("utf-8-sig")
    download_name = Path(persona["data_file"]).stem + "_cleaned_demo.csv"

    col_dl, col_cta = st.columns([1, 2])
    with col_dl:
        st.download_button(
            "Download cleaned CSV (sample · watermarked)",
            data=csv_bytes,
            file_name=download_name,
            mime="text/csv",
            use_container_width=True,
        )
    with col_cta:
        st.markdown(
            f"""
<div class="cta-block">
  <strong style="font-size: 18px;">Like what you see?</strong><br/>
  Run this on YOUR full file — locally. No upload. No row limit. No watermark.<br/>
  <a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
</div>
""",
            unsafe_allow_html=True,
        )

# ---------------------------------------------------------------------------
# Footer trust block
# ---------------------------------------------------------------------------

st.markdown("---")
# Three equal-width trust statements, rendered left to right.
_trust_points = (
    "**🔒 Runs locally**\n\nThe paid product is desktop-only. Your data never leaves your computer.",
    "**📋 Audit trail**\n\nEvery cell change row-logged with old / new / which rule fired.",
    "**💰 One-time $49**\n\nNo subscription. Mac · Windows · Linux. Free updates for v1.x.",
)
for _trust_col, _trust_text in zip(st.columns(3), _trust_points):
    with _trust_col:
        st.markdown(_trust_text)

# Small-print reminder of the demo limits.
st.caption(
    f"Demo capped at {DEMO_ROW_CAP} rows · output watermarked with one trailing row · "
    "running on free hosting. The paid product is uncapped and runs offline."
)