Files
datatools-dev/tests/test_demo_pipelines.py
Michael 6df726e69e demo: reconstruct sales demos for an accounting audience
Replaces the Shopify / RevOps / Bookkeeper demo trio with three accounting
personas that share one buyer, each entering through a workflow where a
messy export costs money — all running the same saved 4-step pipeline:

- bank_reconciliation.csv (Bookkeeper): 26 -> 20 rows, 6 double-posted
  transactions caught after date+amount standardization.
- vendor_1099.csv (AP / 1099): 24 records -> 8 vendors, 7 missing EINs
  recovered via dedup merge — the 1099-complete story.
- ar_open_invoices.csv (AR): 26 -> 21 rows, 5 double-entered invoices
  removed, blank status backfilled from the twin row.

Every number is validated against the live engine and pinned by
tests/test_demo_pipelines.py (read path mirrors app_demo._load_demo:
dtype=str, keep_default_na=False). Rewires src/gui/app_demo.py PERSONAS
(keys bookkeeper / ap-1099 / ar-aging, accounting H1/sub/CTA) and rewrites
docs/DEMO-PLAN.md sections 3/4/7 with the validated outcomes.

(Repo hygiene forced by a partial-clone gap: finalizes the already-deleted,
unreferenced samples/messy_text.csv whose blob was unrecoverable.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-22 18:52:39 +00:00

72 lines
3.1 KiB
Python

"""Demo pipelines must keep showing value (accounting personas).
Each persona's preloaded dataset + saved pipeline is the marketing surface
driven by ``src/gui/app_demo.py``. These tests pin that every demo loads,
runs clean, and produces its headline value (duplicate rows removed, clean
parse, disguised nulls caught) — so a stale dataset or an engine change can't
silently gut the sales demo. The read path mirrors ``app_demo._load_demo``
exactly (``dtype=str, keep_default_na=False`` so every disguised null survives
to the pipeline).
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.pipeline import Pipeline, run_pipeline
# Repository root: this module lives one directory below it, in tests/.
_REPO = Path(__file__).resolve().parent.parent
# Folder holding each persona's preloaded dataset and its saved pipeline.
_DEMO = _REPO.joinpath("samples", "demo")

# One entry per accounting persona in app_demo.PERSONAS, laid out as
# (data_file, pipeline_file, min_duplicates_removed). The last field is a
# floor on the dedup step's reported "duplicates_removed" count.
# NOTE(review): the documented vendor_1099 outcome is 24 records -> 8 vendors,
# i.e. 16 rows dropped, yet its floor is 8 (the surviving vendor count). The
# other two floors equal the rows removed (26 -> 20 and 26 -> 21). Confirm
# what "duplicates_removed" counts for a merge dedup and tighten if it is rows.
_DEMOS = [
    (
        "bank_reconciliation.csv",
        "bank_reconciliation_pipeline.json",
        6,
    ),
    (
        "vendor_1099.csv",
        "vendor_1099_pipeline.json",
        8,
    ),
    (
        "ar_open_invoices.csv",
        "ar_open_invoices_pipeline.json",
        5,
    ),
]
def _first_step_result(step_results, tool):
    """Return the first entry of ``step_results`` whose step ran ``tool``.

    Fails with a readable assertion (listing the tools that did run) instead
    of a bare ``StopIteration`` when the saved pipeline has no such step.
    """
    found = next((sr for sr in step_results if sr.step.tool == tool), None)
    assert found is not None, (
        f"no {tool!r} step in pipeline results; "
        f"tools present: {[sr.step.tool for sr in step_results]}"
    )
    return found


@pytest.mark.parametrize("data_file,pipeline_file,min_dupes", _DEMOS)
def test_demo_runs_clean_and_shows_value(data_file, pipeline_file, min_dupes):
    """Run one persona's saved pipeline over its dataset and pin the outcome.

    Args:
        data_file: CSV file name under the demo samples directory.
        pipeline_file: Saved pipeline JSON file name in the same directory.
        min_dupes: Floor on the dedup step's ``duplicates_removed`` count.
    """
    # Read everything as strings with NA detection off, matching the demo
    # app's loader, so disguised nulls reach the pipeline untouched.
    df = pd.read_csv(_DEMO / data_file, dtype=str, keep_default_na=False)
    pipe = Pipeline.from_file(_DEMO / pipeline_file)
    res = run_pipeline(df, pipe, stop_on_error=True)

    # 1. Nothing errored — the demo never shows a visitor a red banner.
    assert all(sr.error is None for sr in res.step_results), [
        (sr.step.tool, sr.error) for sr in res.step_results
    ]

    # 2. Dedup removed the designed duplicate rows (the headline value).
    assert res.final_rows < res.initial_rows, (
        f"row count did not shrink: {res.initial_rows} -> {res.final_rows}"
    )
    dedup = _first_step_result(res.step_results, "dedup")
    assert dedup.summary["duplicates_removed"] >= min_dupes

    # 3. Standardization parsed every typed cell — a demo with unparseable
    #    cells reads as "the tool choked," which kills the pitch.
    fmt = _first_step_result(res.step_results, "format_standardize")
    assert fmt.summary["cells_unparseable"] == 0
    assert fmt.summary["cells_changed"] > 0

    # 4. The disguised nulls (—, (blank), TBD, …) were caught.
    miss = _first_step_result(res.step_results, "missing")
    assert miss.summary["sentinels_standardized"] > 0
def test_app_demo_references_each_demo_file():
    """Each pinned demo file is named in app_demo.py and present on disk.

    For every dataset/pipeline pair listed in ``_DEMOS``, the file name must
    appear somewhere in the demo app's source text and the file must exist in
    the demo samples directory, so a rename on either side fails here.
    """
    app_source_path = _REPO.joinpath("src", "gui", "app_demo.py")
    app_source = app_source_path.read_text(encoding="utf-8")

    for data_file, pipeline_file, _floor in _DEMOS:
        names = (data_file, pipeline_file)
        # Reference checks run first for both files, then existence checks,
        # keeping the dataset ahead of the pipeline in each pass.
        for name in names:
            assert name in app_source, f"{name} not referenced in app_demo.py"
        for name in names:
            assert (_DEMO / name).exists(), f"missing {name}"