"""DataTools — public demo app (deploys to Streamlit Community Cloud). This is a SEPARATE entry point from the main GUI (``src/gui/app.py``). The full GUI is the paid product surface; this demo is the marketing surface — a single page that runs one of three persona-specific pipelines on a preloaded sample file, shows the BEFORE / AFTER side-by-side, and converts the visitor to a Gumroad purchase. Launch: streamlit run src/gui/app_demo.py URL routing: https://demo.datatools.app/?p=shopify-pet (Shopify operator) https://demo.datatools.app/?p=bookkeeper (Bookkeeper) https://demo.datatools.app/?p=revops (RevOps agency) Free / paid boundary (per docs/DEMO-PLAN.md §6): - input rows capped at ``DEMO_ROW_CAP`` - input file size capped at ``DEMO_FILE_CAP_MB`` - download CSV gets a single trailing watermark row - the pipeline editor is read-only — visitor sees it but can't change it - no audit-log download (paid feature) - no save-pipeline-JSON (paid feature) The demo runs the *same engine* as the paid product. Caps are applied at the surface layer only — when the buyer downloads and runs the paid build, every cap disappears. 
""" from __future__ import annotations import io import json import sys import time from pathlib import Path from typing import Any import pandas as pd import streamlit as st # Ensure project root is on sys.path so `src.core` imports work _project_root = Path(__file__).resolve().parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.core.pipeline import Pipeline, run_pipeline # --------------------------------------------------------------------------- # Free / paid boundary constants # --------------------------------------------------------------------------- DEMO_ROW_CAP: int = 100 DEMO_FILE_CAP_MB: int = 5 GUMROAD_BASE: str = "https://gumroad.com/l/datatools" # --------------------------------------------------------------------------- # Persona registry — single source of truth # --------------------------------------------------------------------------- DEMO_DIR = _project_root / "samples" / "demo" PERSONAS: dict[str, dict[str, Any]] = { "shopify-pet": { "label": "Shopify pet operator", "icon": "🛍️", "h1": "Klaviyo-import-ready customer lists. **In 30 seconds. Locally.**", "sub": ( "Your Shopify customer export has duplicates Excel can't catch, " "international phones Excel can't parse, and disguised nulls " "(`N/A`, `(blank)`, `?`) that break Klaviyo's import. " "DataTools fixes all of it in one pass — and your data never " "leaves your computer." ), "data_file": "shopify_pet_customers.csv", "pipeline_file": "shopify_pet_pipeline.json", "cta": "Get DataTools for Shopify — $49 →", "landing": "https://datatools.app/shopify/", }, "bookkeeper": { "label": "Bookkeeper / freelance accountant", "icon": "📒", "h1": "Reconcile messy bank exports. **Hand your client an audit trail.**", "sub": ( "The Jan and Feb exports overlap; the same transaction posts twice. " "Vendor names are *Amazon* / *amazon.com* / *AMAZON.COM*4F2X9* in " "three rows. 
DataTools dedups on Date + Amount + fuzzy Vendor, " "produces ISO dates and numeric amounts, and gives you a row-level " "audit log to hand the client." ), "data_file": "bookkeeper_bank_reconcile.csv", "pipeline_file": "bookkeeper_bank_pipeline.json", "cta": "Get DataTools for Bookkeepers — $49 →", "landing": "https://datatools.app/bookkeeper/", }, "revops": { "label": "Marketing / RevOps agency", "icon": "🪢", "h1": "Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — **locally.**", "sub": ( "The same prospect shows up in HubSpot as `alice@acme.com`, in " "LinkedIn as `Alice.Johnson@acme.com`, and in your VA's manual " "scrape as `alice@acme.com` again. Country is `USA` / `US` / " "`United States`. DataTools fuzzy-matches across sources, " "normalizes phones for 50+ countries, and merges survivors " "with their most-complete fields — without uploading anything." ), "data_file": "agency_combined_leads.csv", "pipeline_file": "agency_leads_pipeline.json", "cta": "Get DataTools for RevOps — $49 →", "landing": "https://datatools.app/revops/", }, } DEFAULT_PERSONA = "shopify-pet" # --------------------------------------------------------------------------- # Page config + routing # --------------------------------------------------------------------------- st.set_page_config( page_title="DataTools — try it live", page_icon="🧹", layout="wide", initial_sidebar_state="collapsed", ) # Strip Streamlit chrome that breaks the iframe-embed look on the # landing pages. 
# NOTE(review): the injected string is whitespace-only in this copy of the
# file, so this call currently styles nothing — confirm the intended markup.
st.markdown(""" """, unsafe_allow_html=True)


def _resolve_persona() -> str:
    """Return the persona key requested via ``?p=``, or the default.

    Reads the query string through ``st.query_params`` and falls back to the
    legacy ``st.experimental_get_query_params`` when that attribute is
    missing. Any value that is not a key of ``PERSONAS`` resolves to
    ``DEFAULT_PERSONA``.
    """
    try:
        raw = st.query_params.get("p", DEFAULT_PERSONA)
    except AttributeError:  # Older Streamlit versions
        raw = st.experimental_get_query_params().get("p", [DEFAULT_PERSONA])
    # The legacy API hands back a list per key; an empty list must not raise.
    if isinstance(raw, list):
        raw = raw[0] if raw else DEFAULT_PERSONA
    return raw if raw in PERSONAS else DEFAULT_PERSONA


def _switch_persona(key: str) -> None:
    """Point ``?p=`` at *key* and rerun the script.

    Mirrors the version fallback in ``_resolve_persona`` so the switcher does
    not crash on a Streamlit build that lacks ``st.query_params`` /
    ``st.rerun``.
    """
    try:
        st.query_params["p"] = key
    except AttributeError:  # Older Streamlit versions
        st.experimental_set_query_params(p=key)
    # `or` short-circuits, so the legacy name is only touched when needed.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


persona_key = _resolve_persona()
persona = PERSONAS[persona_key]

# ---------------------------------------------------------------------------
# Header + persona switch
# ---------------------------------------------------------------------------

col_brand, col_switch = st.columns([3, 2])
with col_brand:
    st.markdown(f"### 🧹 DataTools / for {persona['label']}")
with col_switch:
    # Quick-switch dropdown for visitors landing on the wrong persona
    new_choice = st.selectbox(
        "Try a different demo",
        options=list(PERSONAS),
        format_func=lambda k: f"{PERSONAS[k]['icon']} {PERSONAS[k]['label']}",
        index=list(PERSONAS).index(persona_key),
        key="persona_switch",
        label_visibility="collapsed",
    )
    if new_choice != persona_key:
        _switch_persona(new_choice)

st.markdown(f"## {persona['h1']}")
st.markdown(persona["sub"])
st.markdown("---")

# ---------------------------------------------------------------------------
# Load preloaded sample data + pipeline
# ---------------------------------------------------------------------------


@st.cache_data(show_spinner=False)
def _load_demo(data_file: str, pipeline_file: str) -> tuple[pd.DataFrame, Pipeline]:
    """Load a persona's sample CSV and saved pipeline from ``DEMO_DIR``.

    The CSV is read with every column as ``str`` and without pandas' default
    NA conversion, so cell text reaches the pipeline exactly as written in
    the file. The result is cached by Streamlit per argument pair.
    """
    df = pd.read_csv(DEMO_DIR / data_file, dtype=str, keep_default_na=False)
    pipe = Pipeline.from_file(DEMO_DIR / pipeline_file)
    return df, pipe


sample_df, sample_pipeline = _load_demo(persona["data_file"], persona["pipeline_file"])


def _read_uploaded(uploaded_file) -> tuple[pd.DataFrame, list[str]]:
    """Decode an uploaded file, applying the demo's size and row caps.

    Args:
        uploaded_file: object returned by ``st.file_uploader``; only its
            ``getvalue()`` and ``name`` members are used.

    Returns:
        ``(df, warnings)``. ``df`` is the parsed upload truncated to
        ``DEMO_ROW_CAP`` rows, or a copy of the sample dataset when the file
        exceeds ``DEMO_FILE_CAP_MB`` or cannot be parsed. ``warnings`` holds
        the user-facing messages explaining any cap or fallback.
    """
    warnings: list[str] = []
    raw = uploaded_file.getvalue()
    size_mb = len(raw) / 1024 / 1024
    if size_mb > DEMO_FILE_CAP_MB:
        warnings.append(
            f"Uploaded file is {size_mb:.1f} MB — demo capped at "
            f"{DEMO_FILE_CAP_MB} MB. The paid product has no size limit."
        )
        return sample_df.copy(), warnings

    suffix = Path(uploaded_file.name).suffix.lower()
    bio = io.BytesIO(raw)
    try:
        if suffix in (".xlsx", ".xls"):
            df = pd.read_excel(bio, dtype=str, keep_default_na=False)
        else:
            sep = "\t" if suffix == ".tsv" else ","
            # utf-8-sig accepts UTF-8 with or without a BOM, so a separate
            # plain utf-8 attempt adds nothing. latin-1 maps every byte, so it
            # cannot raise UnicodeDecodeError: the loop always ends by
            # assigning `df` or by raising a non-decoding error, which the
            # outer handler turns into the sample fallback.
            for enc in ("utf-8-sig", "latin-1"):
                bio.seek(0)
                try:
                    df = pd.read_csv(
                        bio,
                        dtype=str,
                        keep_default_na=False,
                        encoding=enc,
                        sep=sep,
                        on_bad_lines="warn",
                    )
                except UnicodeDecodeError:
                    continue
                break
    except Exception as e:
        # Deliberate best-effort boundary: any unreadable upload degrades to
        # the sample dataset instead of crashing the demo page.
        warnings.append(f"Could not read your file ({type(e).__name__}). "
                        "Demo will run on the sample dataset.")
        return sample_df.copy(), warnings

    if len(df) > DEMO_ROW_CAP:
        warnings.append(
            f"Demo capped at {DEMO_ROW_CAP} rows — your file has {len(df):,}. "
            f"Running on the first {DEMO_ROW_CAP} rows. The paid product has no row limit."
        )
        df = df.head(DEMO_ROW_CAP)
    return df, warnings


# ---------------------------------------------------------------------------
# File source: preloaded sample (default) or user upload
# ---------------------------------------------------------------------------

st.markdown(f"#### Sample dataset preloaded · `{persona['data_file']}`")

with st.expander(
    "Or replace with your own file (capped at "
    f"{DEMO_ROW_CAP} rows / {DEMO_FILE_CAP_MB} MB for the demo)",
    expanded=False,
):
    uploaded = st.file_uploader(
        "Your file",
        type=["csv", "tsv", "xlsx", "xls"],
        key="demo_user_file",
        label_visibility="collapsed",
        help=(
            "Files larger than the cap are accepted but only the first "
            f"{DEMO_ROW_CAP} rows are processed. The paid build runs on "
            "1 GB+ files via streaming."
        ),
    )

# `input_sig` identifies what the BEFORE table currently shows, so a result
# computed on a different input can be recognised as stale further down.
# The bytes hash only needs to be stable within this server process, which
# is where the session state lives.
if uploaded is not None:
    df_in, upload_warnings = _read_uploaded(uploaded)
    for w in upload_warnings:
        st.info(w)
    using_sample = False
    input_sig = ("upload", uploaded.name, hash(uploaded.getvalue()))
else:
    df_in = sample_df.copy()
    using_sample = True
    input_sig = ("sample", persona_key)

# ---------------------------------------------------------------------------
# BEFORE preview
# ---------------------------------------------------------------------------

st.markdown(f"#### BEFORE — {len(df_in)} rows, {len(df_in.columns)} columns")
st.dataframe(df_in.head(10), use_container_width=True, hide_index=True)

st.markdown("---")

# ---------------------------------------------------------------------------
# Pipeline (read-only)
# ---------------------------------------------------------------------------

st.markdown("#### Pipeline (saved — paid version is editable)")
pipe_summary = " → ".join(
    f"**{i + 1}.** {step.tool}"
    for i, step in enumerate(sample_pipeline.steps)
)
st.markdown(pipe_summary)

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

run_clicked = st.button(
    "▶ Run pipeline",
    type="primary",
    use_container_width=True,
    key="demo_run_button",
)

if run_clicked:
    with st.spinner("Running…"):
        t0 = time.perf_counter()
        try:
            result = run_pipeline(df_in, sample_pipeline, stop_on_error=False)
        except Exception as e:
            from src.core.errors import format_for_user
            st.error(f"Demo halted: {format_for_user(e)}")
            st.stop()
        elapsed = time.perf_counter() - t0
    # Persist across reruns: the button reads True only on the click rerun.
    st.session_state["demo_result"] = result
    st.session_state["demo_elapsed"] = elapsed
    st.session_state["demo_persona"] = persona_key
    st.session_state["demo_input_sig"] = input_sig

result = st.session_state.get("demo_result")
elapsed = st.session_state.get("demo_elapsed", 0.0)
result_persona = st.session_state.get("demo_persona")

# Reset cached result when the persona OR the input changes. Without the
# input check, uploading (or removing) a file after a run would leave the
# previous AFTER table on screen next to a different BEFORE table.
if result is not None and (
    result_persona != persona_key
    or st.session_state.get("demo_input_sig") != input_sig
):
    result = None
    st.session_state.pop("demo_result", None)

# ---------------------------------------------------------------------------
# AFTER + metrics + CTA
# ---------------------------------------------------------------------------

if result is not None:
    final_df = result.final_df

    st.markdown("---")
    st.markdown(
        f"#### AFTER — {len(df_in)} → {len(final_df)} rows · "
        f"finished in {elapsed*1000:.0f} ms"
    )

    # Per-step metric pills. Each (summary key, unit) pair contributes
    # "<count> <unit>" when the step reported a truthy value for that key.
    # NOTE(review): these strings carry no markup in this copy of the file
    # although they are rendered with unsafe_allow_html=True — confirm.
    pill_fields = (
        ("cells_changed", "cells"),
        ("sentinels_standardized", "sentinels"),
        ("duplicates_removed", "dupes merged"),
        ("columns_renamed", "renamed"),
    )
    pills_html: list[str] = []
    for sr in result.step_results:
        if sr.skipped:
            continue
        if sr.error:
            pills_html.append(f"{sr.step.tool}: error")
            continue
        s = sr.summary
        bits = [
            f"{s[key]} {unit}"
            for key, unit in pill_fields
            if key in s and s[key]
        ]
        label = ", ".join(bits) if bits else "no-op"
        pills_html.append(f"{sr.step.tool}: {label}")
    st.markdown("".join(pills_html), unsafe_allow_html=True)

    st.dataframe(final_df.head(10), use_container_width=True, hide_index=True)

    # ----- Download with watermark row -----
    # The row is built positionally and then labelled with the frame's own
    # columns. Keying a dict by column name would let a later column with the
    # same name as the first overwrite the watermark text with "".
    n_cols = len(final_df.columns)
    watermark_cells = (
        [f"DataTools demo — buy at {persona['landing']}"] + [""] * (n_cols - 1)
        if n_cols
        else []
    )
    watermark_row = pd.DataFrame([watermark_cells], columns=final_df.columns)
    out_df = pd.concat([final_df, watermark_row], ignore_index=True)
    # utf-8-sig writes a BOM at the start of the downloaded CSV.
    csv_bytes = out_df.to_csv(index=False).encode("utf-8-sig")

    col_dl, col_cta = st.columns([1, 2])
    with col_dl:
        st.download_button(
            "Download cleaned CSV (sample · watermarked)",
            data=csv_bytes,
            file_name=Path(persona["data_file"]).stem + "_cleaned_demo.csv",
            mime="text/csv",
            use_container_width=True,
        )
    with col_cta:
        st.markdown(
            f"""
Like what you see?
Run this on YOUR full file — locally. No upload. No row limit. No watermark.
{persona['cta']}
""",
            unsafe_allow_html=True,
        )
else:
    # Pre-run state — show the buy block at the bottom anyway so the
    # CTA is always visible above the fold once the visitor scrolls.
    st.markdown(
        f"""
Already convinced?
Skip the demo and grab the full version. One-time payment, no subscription.
{persona['cta']}
""",
        unsafe_allow_html=True,
    )

# ---------------------------------------------------------------------------
# Footer trust block
# ---------------------------------------------------------------------------

st.markdown("---")
col_t1, col_t2, col_t3 = st.columns(3)
with col_t1:
    st.markdown("**🔒 Runs locally**\n\nThe paid product is desktop-only. Your data never leaves your computer.")
with col_t2:
    st.markdown("**📋 Audit trail**\n\nEvery cell change row-logged with old / new / which rule fired.")
with col_t3:
    st.markdown("**💰 One-time $49**\n\nNo subscription. Mac · Windows · Linux. Free updates for v1.x.")

st.caption(
    f"Demo capped at {DEMO_ROW_CAP} rows · output watermarked with one trailing row · "
    "running on free hosting. The paid product is uncapped and runs offline."
)