diff --git a/build/datatools.spec b/build/datatools.spec
index b5d3268..1469fea 100644
--- a/build/datatools.spec
+++ b/build/datatools.spec
@@ -58,15 +58,12 @@ hidden_imports += collect_submodules("charset_normalizer")
 hidden_imports += collect_submodules("openpyxl")
 hidden_imports += collect_submodules("loguru")
 
-# PDF Extractor stack. ``streamlit_drawable_canvas`` and
-# ``pypdfium2`` both have their own PyInstaller hooks under
-# ``build/hooks/`` that pull in the native binary + frontend
-# assets — keep the ``collect_submodules`` calls here for
-# belt-and-braces.
+# PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook
+# under ``build/hooks/`` that pulls in the native PDFium binary —
+# keep the ``collect_submodules`` calls here for belt-and-braces.
 hidden_imports += collect_submodules("pdfplumber")
 hidden_imports += collect_submodules("pdfminer")
 hidden_imports += collect_submodules("pypdfium2")
-hidden_imports += collect_submodules("streamlit_drawable_canvas")
 hidden_imports += collect_submodules("PIL")
 hidden_imports += collect_submodules("pytesseract")
 
@@ -91,13 +88,10 @@ datas += collect_data_files("phonenumbers", include_py_files=False)
 
 # PDF Extractor data files. ``pypdfium2`` ships a native PDFium
 # shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
-# dir; ``streamlit-drawable-canvas`` ships a built JS bundle that
-# Streamlit serves from the package dir at runtime; pdfminer ships
-# the Adobe CMap tables it uses for character mapping. Hooks
-# under ``build/hooks/`` mirror these calls for explicit
-# documentation and survive ``collect_data_files`` regressions.
+# dir; ``pdfminer`` ships the Adobe CMap tables it uses for
+# character mapping. The drawable-canvas frontend bundle is no
+# longer collected because the visual picker was removed.
 datas += collect_data_files("pypdfium2", include_py_files=False)
-datas += collect_data_files("streamlit_drawable_canvas")
 datas += collect_data_files("pdfminer", include_py_files=False)
 
 # Our application files. PyInstaller's bundler treats source as code
diff --git a/build/hooks/hook-streamlit_drawable_canvas.py b/build/hooks/hook-streamlit_drawable_canvas.py
deleted file mode 100644
index 17483ab..0000000
--- a/build/hooks/hook-streamlit_drawable_canvas.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""PyInstaller hook for streamlit-drawable-canvas.
-
-Streamlit components are Python packages that also ship a built
-JavaScript/CSS bundle Streamlit serves from disk at component-
-render time. Without those assets in the bundle the canvas
-iframe loads blank — the user sees the page render fine but the
-visual picker shows no image and no drawing controls.
-
-``collect_data_files`` covers the frontend bundle directory
-(named ``frontend`` or ``frontend/build`` depending on the
-component version). Hidden imports are picked up by the main
-spec's ``collect_submodules`` call, repeated here for the same
-belt-and-braces reason as ``hook-pypdfium2.py``.
-"""
-
-from PyInstaller.utils.hooks import collect_data_files, collect_submodules
-
-datas = collect_data_files("streamlit_drawable_canvas")
-hiddenimports = collect_submodules("streamlit_drawable_canvas")
diff --git a/requirements.txt b/requirements.txt
index 226b13d..26e662c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,10 +10,14 @@ phonenumbers>=8.13,<9
 streamlit>=1.35,<2
 cryptography>=41,<49
 # PDF Extractor stack — pinned to exact tested versions so a future
-# upstream release can't change the visual picker's coordinate model
-# or pdfplumber's word-position behavior mid-build. Bump these
+# upstream release can't quietly change pdfplumber's word-position
+# behavior or pypdfium2's OCR rendering mid-build. Bump these
 # explicitly when re-testing against a new release.
+#
+# ``pypdfium2`` is here for the OCR fallback path only (rasterizing
+# pages to images for Tesseract). The drawable-canvas dependency
+# was dropped together with the visual picker — the scanner is
+# purely heuristic now, with no coordinate UI.
 pdfplumber==0.11.9
 pypdfium2==5.8.0
 pytesseract==0.3.13
-streamlit-drawable-canvas==0.9.3
diff --git a/src/gui/_drawable_canvas_compat.py b/src/gui/_drawable_canvas_compat.py
deleted file mode 100644
index e6b2258..0000000
--- a/src/gui/_drawable_canvas_compat.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""Compatibility shim for streamlit-drawable-canvas on modern Streamlit.
-
-``streamlit-drawable-canvas`` 0.9.3 (last release 2023) calls
-``streamlit.elements.image.image_to_url(image, width, clamp,
-channels, output_format, image_id)``. Streamlit ~1.30+ moved this
-helper out of ``streamlit.elements.image`` and changed its
-signature so the second positional argument is now a
-``LayoutConfig`` dataclass instead of a plain ``int`` width.
-
-The canvas package hasn't been updated, so on modern Streamlit
-its very first call fails with::
-
-    AttributeError: module 'streamlit.elements.image'
-        has no attribute 'image_to_url'
-
-This module re-attaches a wrapper at the old import path that
-adapts the old call shape to the new function. Import it once
-before any ``st_canvas`` call; idempotent.
-
-The shim is opt-in (not auto-installed at module import) so the
-audit log of "I patched a third-party internal" is visible in
-``grep`` rather than silently happening on every page load.
-"""
-
-from __future__ import annotations
-
-
-_PATCHED = False
-
-
-def install() -> None:
-    """Install the ``image_to_url`` compatibility shim.
-
-    Idempotent — safe to call multiple times. Returns silently
-    if the canvas package or Streamlit can't be imported (lets
-    the caller handle the "PDF deps missing" path on its own).
-    """
-    global _PATCHED
-    if _PATCHED:
-        return
-
-    try:
-        import streamlit.elements.image as _old_image_module
-    except ImportError:
-        return
-
-    # Already present (old Streamlit, or already shimmed) — bail.
-    if hasattr(_old_image_module, "image_to_url"):
-        _PATCHED = True
-        return
-
-    try:
-        from streamlit.elements.lib.image_utils import (
-            image_to_url as _new_image_to_url,
-        )
-        from streamlit.elements.lib.layout_utils import LayoutConfig
-    except ImportError:
-        # ``image_to_url`` is in some other location we don't know
-        # about yet — let the canvas surface its own error so we
-        # learn where to look. Don't fail silently.
-        return
-
-    def _shim(
-        image,
-        width,
-        clamp,
-        channels,
-        output_format,
-        image_id,
-    ) -> str:
-        """Old API → new API. The old ``width=-1`` sentinel meant
-        "use the image's natural width", which is also the new
-        function's default behavior when ``LayoutConfig`` is left
-        unconfigured."""
-        layout = LayoutConfig()
-        return _new_image_to_url(
-            image,
-            layout,
-            clamp,
-            channels,
-            output_format,
-            image_id,
-        )
-
-    _old_image_module.image_to_url = _shim
-    _PATCHED = True
diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index aefb50f..e268ddb 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -1,23 +1,13 @@
-"""PDF Extractor — extract bank-statement transactions to CSV.
+"""PDF to CSV — heuristic transaction scanner.
 
-Three modes:
-
-- **Extract** (daily workflow): pick a saved template, upload a
-  PDF, get a CSV preview + download.
-- **Build template**: upload a sample PDF, configure how the
-  table is identified, save the template for reuse.
-- **Manage templates**: list / rename / delete / export / import.
-
-The expensive step is ``extract_pages_auto`` (PDF I/O + word
-extraction + optional OCR). It runs only on explicit user action
-("Extract" / "Preview"), and results are stashed in session_state
-so re-renders from form-field edits don't re-parse the PDF. Heavy
-work off Streamlit's rerun-on-every-widget path.
+Upload one or more bank-statement PDFs, scan for transaction-like
+rows ([date] [description] [amount]), uncheck the rows you don't
+want, download as CSV. No templates, no per-bank configuration,
+no coordinate picking.
 """
 
 from __future__ import annotations
 
-import io
 import sys
 from datetime import datetime
 from pathlib import Path
@@ -30,26 +20,17 @@ if str(_project_root) not in sys.path:
     sys.path.insert(0, str(_project_root))
 
 from src.audit import log_event, log_page_open
-from src.gui._drawable_canvas_compat import install as _install_canvas_compat
 from src.gui.components import hide_streamlit_chrome, render_sticky_footer
 from src.pdf_extract import (
     PdfDependencyMissing,
-    apply_template,
-    extract_pages_auto,
     ocr_available,
-    render_page_image,
+    scan_pdf_for_transactions,
 )
 
-# streamlit-drawable-canvas 0.9.3 calls a Streamlit internal
-# (``image_to_url``) that was relocated in Streamlit ~1.30+. The
-# shim re-attaches the old import path with a signature adapter.
-# See ``src/gui/_drawable_canvas_compat.py`` for the why.
-_install_canvas_compat()
-
 
 def _pdf_deps_status() -> tuple[bool, list[str]]:
-    """Probe each runtime PDF dep without forcing the user to hit the
-    extract button. Returns ``(ok, missing_names)``."""
+    """Probe each runtime PDF dep without forcing the user to hit
+    the Scan button. Returns ``(ok, missing_names)``."""
     missing: list[str] = []
     for name in ("pdfplumber", "pypdfium2"):
         try:
@@ -59,20 +40,6 @@ def _pdf_deps_status() -> tuple[bool, list[str]]:
     return (not missing), missing
 
 
-from src.pdf_templates import (
-    SCHEMA_VERSION,
-    VALID_TARGETS,
-    delete_template,
-    list_templates,
-    load_template,
-    new_template,
-    save_template,
-    slugify,
-    template_from_json,
-    template_to_json,
-    validate_template,
-)
-
 log_page_open("10_PDF_Extractor")
 
 _ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png")
@@ -84,36 +51,25 @@ st.set_page_config(
 hide_streamlit_chrome()
 render_sticky_footer()
 
-
 # ---------------------------------------------------------------------------
-# Session-state keys (centralized so the build / extract flows agree on names)
+# Session-state keys
 # ---------------------------------------------------------------------------
 
-K_MODE = "pdf_mode"
-K_CURRENT_TEMPLATE = "pdf_tpl_current"
-K_SAMPLE_BYTES = "pdf_tpl_sample_bytes"
-K_SAMPLE_NAME = "pdf_tpl_sample_name"
-K_SAMPLE_PAGES = "pdf_tpl_sample_pages"
-K_EXTRACT_DF = "pdf_extract_df"
-K_EXTRACT_WARNINGS = "pdf_extract_warnings"
-K_EXTRACT_FILES = "pdf_extract_files"
-
-
-def _get_or_init(key: str, default):
-    if key not in st.session_state:
-        st.session_state[key] = default
-    return st.session_state[key]
+K_ROWS = "pdf_scan_rows"
+K_WARNINGS = "pdf_scan_warnings"
+K_SOURCE_COUNT = "pdf_scan_source_count"
 
 
 # ---------------------------------------------------------------------------
-# Page header + mode selector
+# Header + dep guard
 # ---------------------------------------------------------------------------
 
 st.markdown("# PDF to CSV")
 st.caption(
-    "Extract transaction tables from bank-statement PDFs. Build one "
-    "template per source (bank + account type), then reuse it for "
-    "every statement that follows the same layout."
+    "Scan bank-statement PDFs for transaction rows "
+    "(``[date] [description] [amount]``). Review the table, uncheck "
+    "rows you don't want, edit any cell that needs fixing, then "
+    "download as CSV. No per-bank setup."
 )
 
 _pdf_ok, _pdf_missing = _pdf_deps_status()
@@ -122,1011 +78,218 @@ if not _pdf_ok:
         "**PDF dependencies are not installed.** "
         f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
         "Install them into the same Python that launches DataTools:\n\n"
-        "```\npip install pdfplumber pypdfium2 "
-        "streamlit-drawable-canvas pytesseract\n```\n\n"
-        "Then **fully restart the launcher** to pick up the new modules. "
-        "(Templates you've already saved are unaffected.)"
+        "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n"
+        "Then **fully restart the launcher** to pick up the new modules."
     )
     st.stop()
 
-_ocr_ok, _ocr_reason = ocr_available()
-c_mode, c_ocr = st.columns([3, 2])
-with c_mode:
-    mode = st.radio(
-        "Mode",
-        ["Extract", "Build template", "Manage templates"],
-        horizontal=True,
-        key=K_MODE,
-        label_visibility="collapsed",
-    )
-with c_ocr:
-    if _ocr_ok:
-        st.caption("**OCR:** ready · scanned pages will be transcribed.")
-    else:
-        import platform as _platform
-        _os_name = _platform.system()
-        with st.expander("**OCR:** unavailable", expanded=False):
-            st.markdown(
-                f"**Reason:** {_ocr_reason or 'unknown'}.\n\n"
-                "Scanned (image-based) statements will fall through "
-                "with warnings. Most modern bank statements are text-"
-                "based and don't need OCR — only install Tesseract if "
-                "your statements actually come through as images."
-            )
-            if _os_name == "Windows":
-                st.markdown(
-                    "**Install on Windows:**\n"
-                    "1. Download the installer from "
-                    "[UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) "
-                    "(look for ``tesseract-ocr-w64-setup-…``).\n"
-                    "2. Run it. Keep the **\"Add tesseract to system "
-                    "PATH\"** checkbox on during setup.\n"
-                    "3. Restart the DataTools launcher.\n\n"
-                    "If you installed without PATH and don't want to "
-                    "reinstall, point DataTools at the binary directly "
-                    "by setting the ``DATATOOLS_TESSERACT_PATH`` env "
-                    "var to ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` "
-                    "before launching."
-                )
-            elif _os_name == "Darwin":
-                st.markdown(
-                    "**Install on macOS:** ``brew install tesseract`` "
-                    "(requires [Homebrew](https://brew.sh)). Restart "
-                    "the DataTools launcher afterward."
-                )
-            else:
-                st.markdown(
-                    "**Install on Linux:** ``sudo apt install "
-                    "tesseract-ocr`` (Debian/Ubuntu) or your distro's "
-                    "equivalent (``dnf``, ``pacman``, …). Restart the "
-                    "DataTools launcher afterward."
-                )
 
-st.divider()
+# ---------------------------------------------------------------------------
+# Options + upload
+# ---------------------------------------------------------------------------
 
-
-# ===========================================================================
-# Extract mode
-# ===========================================================================
-
-
-def _render_extract_mode() -> None:
-    templates = list_templates()
-    if not templates:
-        st.info(
-            "No templates yet. Switch to **Build template** to create your "
-            "first one — you'll need a sample PDF from the source bank."
-        )
-        return
-
-    options = {f"{t['name']}  ·  {t['slug']}": t["slug"] for t in templates}
-    label = st.selectbox("Template", list(options.keys()))
-    slug = options[label]
-
-    uploads = st.file_uploader(
-        "Statement PDF(s)",
-        type=["pdf"],
-        accept_multiple_files=True,
-        help=(
-            "Drop one or more statements from the same source. Rows from "
-            "every file are combined into a single CSV, tagged with the "
-            "source filename."
-        ),
-    )
-
-    c1, c2, c3 = st.columns(3)
-    sort_by_date = c1.checkbox(
-        "Sort combined output by date",
+with st.expander("Scan options", expanded=False):
+    c1, c2 = st.columns(2)
+    negative_in_parens = c1.checkbox(
+        "Treat (4.50) as negative",
         value=True,
         help=(
-            "Sorts the combined CSV ascending by the ``date`` column "
-            "after extraction. Off → preserve per-PDF order."
+            "Bank statements commonly show withdrawals as ``(4.50)``. "
+            "Off if your statements use a different convention."
         ),
     )
-    output_shape = c2.radio(
-        "Output",
-        ["Combined CSV", "ZIP of per-PDF CSVs"],
-        horizontal=True,
-        help=(
-            "Combined: one CSV with a ``source_file`` column. "
-            "ZIP: one CSV per source PDF, useful when feeding files "
-            "back into separate ledgers."
-        ),
-    )
-    use_ocr = c3.checkbox(
+    _ocr_ok, _ocr_reason = ocr_available()
+    use_ocr = c2.checkbox(
         "Use OCR for scanned pages",
         value=_ocr_ok,
         disabled=not _ocr_ok,
         help=(
-            "When a page has no extractable text (typically a scan), "
-            "OCR it with Tesseract. Disabled when OCR isn't installed."
+            f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. "
+            "Most modern bank PDFs are text-based and don't need OCR — "
+            "only enable for image-based scans."
         ),
     )
 
-    run = st.button("Extract", type="primary", disabled=not uploads)
-    if run and uploads:
-        try:
-            tpl = load_template(slug)
-        except Exception as e:
-            st.error(f"Couldn't load template {slug!r}: {e}")
-            return
+uploads = st.file_uploader(
+    "PDF file(s)",
+    type=["pdf"],
+    accept_multiple_files=True,
+    help="Drop one or more bank-statement PDFs. Multi-file batches "
+    "are merged into a single table with a ``source_file`` column.",
+)
 
-        per_file_frames: list[pd.DataFrame] = []
-        all_warnings: list[str] = []
-        files_meta: list[dict] = []
-        with st.status(
-            f"Extracting {len(uploads)} file(s)…",
-            expanded=True,
-        ) as status:
-            for i, up in enumerate(uploads, start=1):
-                st.write(f"**{i}/{len(uploads)}** · {up.name}")
-                try:
-                    pdf_bytes = up.read()
-                    pages, warns = extract_pages_auto(
-                        pdf_bytes, allow_ocr=use_ocr,
-                    )
-                    df = apply_template(pages, tpl)
-                    df.insert(0, "source_file", up.name)
-                    per_file_frames.append(df)
-                    files_meta.append({
-                        "file": up.name,
-                        "pages": len(pages),
-                        "rows": len(df),
-                        "warnings": len(warns),
-                        "status": "ok" if len(df) else "no rows",
-                    })
-                    for w in warns:
-                        all_warnings.append(f"[{up.name}] {w}")
-                except Exception as e:
-                    all_warnings.append(
-                        f"[{up.name}] extraction failed: "
-                        f"{type(e).__name__}: {e}"
-                    )
-                    files_meta.append({
-                        "file": up.name,
-                        "pages": 0,
-                        "rows": 0,
-                        "warnings": 1,
-                        "status": f"error: {type(e).__name__}",
-                    })
-            ok_count = sum(1 for m in files_meta if m["status"] == "ok")
-            status.update(
-                label=f"Done · {ok_count}/{len(uploads)} extracted",
-                state="complete" if ok_count == len(uploads) else "error",
-                expanded=False,
-            )
+scan_clicked = st.button(
+    "Scan", type="primary", disabled=not uploads,
+)
 
-        if per_file_frames:
-            combined = pd.concat(per_file_frames, ignore_index=True)
-            if sort_by_date and "date" in combined.columns:
-                combined = combined.sort_values(
-                    by=["date", "source_file"],
-                    kind="mergesort",
-                    na_position="last",
-                ).reset_index(drop=True)
-        else:
-            combined = pd.DataFrame()
-        st.session_state[K_EXTRACT_DF] = combined
-        st.session_state[K_EXTRACT_WARNINGS] = all_warnings
-        st.session_state[K_EXTRACT_FILES] = files_meta
-        st.session_state["pdf_extract_output_shape"] = output_shape
-        st.session_state["pdf_extract_per_file"] = [
-            (m["file"], per_file_frames[i])
-            for i, m in enumerate(files_meta)
-            if m["status"] == "ok"
-        ]
 
-        log_event(
-            "tool_run",
-            "PDF Extractor run",
-            page="10_PDF_Extractor",
-            template=slug,
-            files=len(uploads),
-            rows=len(combined),
-            output_shape=output_shape,
-        )
+# ---------------------------------------------------------------------------
+# Scan
+# ---------------------------------------------------------------------------
 
-    df = st.session_state.get(K_EXTRACT_DF)
-    if isinstance(df, pd.DataFrame):
-        warnings = st.session_state.get(K_EXTRACT_WARNINGS, []) or []
-        files_meta = st.session_state.get(K_EXTRACT_FILES, []) or []
-        if files_meta:
-            st.markdown("#### Per-file summary")
-            st.dataframe(
-                pd.DataFrame(files_meta),
-                hide_index=True,
-                use_container_width=True,
-            )
-        if warnings:
-            with st.expander(f"Warnings ({len(warnings)})", expanded=False):
-                for w in warnings:
-                    st.warning(w)
-
-        if df.empty:
-            st.info(
-                "No rows were extracted. Re-check the template's header "
-                "text, column boundaries, and end markers in **Build "
-                "template** mode against a sample PDF."
-            )
-        else:
-            st.markdown(f"#### Extracted rows ({len(df):,})")
-            st.dataframe(df, hide_index=True, use_container_width=True)
-            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
-            output_shape = st.session_state.get(
-                "pdf_extract_output_shape", "Combined CSV",
-            )
-            if output_shape == "ZIP of per-PDF CSVs":
-                import zipfile
-                per_file = st.session_state.get("pdf_extract_per_file") or []
-                if not per_file:
-                    st.warning("No per-file CSVs to bundle.")
-                else:
-                    buf = io.BytesIO()
-                    with zipfile.ZipFile(
-                        buf, "w", zipfile.ZIP_DEFLATED,
-                    ) as zf:
-                        for name, sub_df in per_file:
-                            stem = Path(name).stem or "transactions"
-                            zf.writestr(
-                                f"{stem}.csv",
-                                sub_df.to_csv(index=False),
-                            )
-                    st.download_button(
-                        f"Download ZIP ({len(per_file)} files)",
-                        data=buf.getvalue(),
-                        file_name=f"transactions-{slug}-{ts}.zip",
-                        mime="application/zip",
-                        type="primary",
-                    )
-            else:
-                csv_bytes = df.to_csv(index=False).encode("utf-8")
-                st.download_button(
-                    "Download CSV",
-                    data=csv_bytes,
-                    file_name=f"transactions-{slug}-{ts}.csv",
-                    mime="text/csv",
-                    type="primary",
+if scan_clicked and uploads:
+    all_rows: list[dict] = []
+    all_warnings: list[str] = []
+    with st.status(
+        f"Scanning {len(uploads)} file(s)…",
+        expanded=True,
+    ) as status:
+        for i, up in enumerate(uploads, start=1):
+            st.write(f"**{i}/{len(uploads)}** · {up.name}")
+            try:
+                rows, warns = scan_pdf_for_transactions(
+                    up.read(),
+                    negative_in_parens=negative_in_parens,
+                    allow_ocr=use_ocr,
                 )
-
-
-# ===========================================================================
-# Build-template mode
-# ===========================================================================
-
-
-def _ensure_sample_loaded() -> bool:
-    """Side-bar uploader for the sample PDF. Returns True if a sample
-    is loaded and parsed (pages cached in session_state)."""
-    up = st.file_uploader(
-        "Sample statement",
-        type=["pdf"],
-        help=(
-            "Used to drive the live preview while you build the "
-            "template — pick a representative statement from this "
-            "source."
-        ),
-        key="pdf_tpl_sample_uploader",
-    )
-    if up is not None and up.name != st.session_state.get(K_SAMPLE_NAME):
-        pdf_bytes = up.read()
-        try:
-            pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
-        except Exception as e:
-            st.error(f"Couldn't read PDF: {type(e).__name__}: {e}")
-            return False
-        st.session_state[K_SAMPLE_BYTES] = pdf_bytes
-        st.session_state[K_SAMPLE_NAME] = up.name
-        st.session_state[K_SAMPLE_PAGES] = pages
-        for w in warns:
-            st.info(w)
-    return bool(st.session_state.get(K_SAMPLE_PAGES))
-
-
-def _render_columns_editor(tpl: dict) -> None:
-    """Edit the column mapping (source index → target field) and the
-    boundary x-positions in one place."""
-    st.markdown("##### Columns")
-    boundaries = list(tpl["table"].get("column_boundaries") or [])
-    bounds_text = st.text_input(
-        "Column boundaries (x-positions, comma-separated)",
-        value=", ".join(str(int(b)) for b in boundaries),
-        help=(
-            "N boundaries create N+1 columns. The visual picker in "
-            "the next phase will set these for you — until then you "
-            "can read x-positions from the page-preview hover tip "
-            "below, or trial-and-error against the live preview."
-        ),
-    )
-    try:
-        tpl["table"]["column_boundaries"] = sorted(
-            float(x.strip()) for x in bounds_text.split(",") if x.strip()
-        )
-    except ValueError:
-        st.warning("Column boundaries must be numbers.")
-
-    n_cols = len(tpl["table"]["column_boundaries"]) + 1
-    st.caption(f"{n_cols} source column(s) defined.")
-
-    # Column mapping: one row per output column the user wants.
-    columns_state = tpl.get("columns") or []
-    if not columns_state:
-        # Seed a reasonable default the first time.
-        columns_state = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "description"},
-            {"source": 2, "target": "amount"},
-        ][:n_cols]
-
-    targets = ["date", "description", "amount", "amount_debit",
-               "amount_credit", "balance", "type"]
-    new_columns: list[dict] = []
-    for i, col in enumerate(columns_state):
-        c1, c2, c3 = st.columns([2, 3, 1])
-        src = c1.number_input(
-            f"Source #{i + 1}",
-            min_value=0,
-            max_value=max(n_cols - 1, 0),
-            value=min(int(col.get("source", 0)), max(n_cols - 1, 0)),
-            step=1,
-            key=f"src_{i}",
-        )
-        tgt_default = col.get("target", "")
-        if tgt_default not in targets:
-            targets_ext = targets + [tgt_default] if tgt_default else targets
-        else:
-            targets_ext = targets
-        tgt = c2.selectbox(
-            f"Target #{i + 1}",
-            targets_ext,
-            index=(targets_ext.index(tgt_default) if tgt_default in targets_ext else 0),
-            key=f"tgt_{i}",
-        )
-        keep = c3.checkbox("Keep", value=True, key=f"keep_{i}")
-        if keep:
-            new_columns.append({"source": int(src), "target": tgt})
-
-    if st.button("+ Add column", key="add_col"):
-        new_columns.append({"source": n_cols - 1 if n_cols else 0, "target": ""})
-        st.rerun()
-    tpl["columns"] = new_columns
-
-
-def _render_visual_picker(tpl: dict) -> None:
-    """Drawable-canvas overlay on a rendered sample page.
-
-    The user draws (mostly) vertical lines where columns should
-    split. We harvest each line's x-midpoint and write that into
-    ``tpl["table"]["column_boundaries"]`` (in PDF point space). An
-    optional rectangle becomes ``tpl["visual"]["table_bbox"]`` (in
-    PDF points), preserved for round-trip but not yet used by
-    extraction — the header/end-marker pair is enough to slice
-    the row band in practice.
-    """
-    from streamlit_drawable_canvas import st_canvas
-
-    pdf_bytes = st.session_state.get(K_SAMPLE_BYTES)
-    pages = st.session_state.get(K_SAMPLE_PAGES) or []
-    if not pdf_bytes or not pages:
-        st.info("Upload a sample PDF above to use the visual picker.")
-        return
-
-    max_page = len(pages)
-    sample_page = int(tpl.get("visual", {}).get("sample_page", 1))
-    sample_page = st.number_input(
-        "Sample page",
-        min_value=1,
-        max_value=max_page,
-        value=min(sample_page, max_page),
-        step=1,
-        help="Pick a page that contains the transactions table.",
-    )
-    tpl.setdefault("visual", {})["sample_page"] = int(sample_page)
-
-    try:
-        pil_image, scale = render_page_image(pdf_bytes, int(sample_page))
-    except Exception as e:
-        st.error(f"Couldn't render page {sample_page}: {type(e).__name__}: {e}")
-        return
-
-    tpl["visual"]["page_width"] = pil_image.width / scale
-    tpl["visual"]["page_height"] = pil_image.height / scale
-
-    c_left, c_right = st.columns([2, 1])
-    with c_right:
-        st.markdown("**How to use**")
-        st.caption(
-            "• **Lines** mode: drag short vertical strokes where you "
-            "want columns to split. Each stroke contributes one "
-            "x-boundary.\n"
-            "• **Rect** mode: drag a box around the transactions "
-            "table to crop the working region.\n"
-            "• Use the trash icon (top-right of the canvas) to "
-            "remove the last shape, or the X to clear all."
-        )
-        drawing_mode = st.radio(
-            "Draw",
-            ["line", "rect", "transform"],
-            horizontal=True,
-            help=(
-                "transform lets you move/resize already-drawn shapes."
+                for r in rows:
+                    r["source_file"] = up.name
+                all_rows.extend(rows)
+                all_warnings.extend(f"[{up.name}] {w}" for w in warns)
+            except PdfDependencyMissing as e:
+                all_warnings.append(f"[{up.name}] {e}")
+            except Exception as e:
+                all_warnings.append(
+                    f"[{up.name}] scan failed: {type(e).__name__}: {e}"
+                )
+        status.update(
+            label=(
+                f"Found {len(all_rows):,} candidate transactions "
+                f"across {len(uploads)} file(s)"
             ),
+            state="complete",
+            expanded=False,
         )
 
-    initial_objects = _boundaries_to_canvas_lines(
-        tpl["table"].get("column_boundaries", []),
-        scale=scale,
-        image_height=pil_image.height,
+    st.session_state[K_ROWS] = all_rows
+    st.session_state[K_WARNINGS] = all_warnings
+    st.session_state[K_SOURCE_COUNT] = len(uploads)
+
+    log_event(
+        "tool_run",
+        "PDF scan",
+        page="10_PDF_Extractor",
+        files=len(uploads),
+        rows=len(all_rows),
+        warnings=len(all_warnings),
     )
-    bbox = tpl["visual"].get("table_bbox")
-    if bbox:
-        initial_objects.append(_bbox_to_canvas_rect(bbox, scale))
-
-    with c_left:
-        canvas_state = st_canvas(
-            fill_color="rgba(255, 165, 0, 0.15)",
-            stroke_width=2,
-            stroke_color="#d62728",
-            background_image=pil_image,
-            update_streamlit=True,
-            height=pil_image.height,
-            width=pil_image.width,
-            drawing_mode=drawing_mode,
-            initial_drawing={"version": "4.4.0", "objects": initial_objects},
-            key=f"pdf_canvas_p{sample_page}",
-        )
-
-    new_bounds, new_bbox = _harvest_canvas(canvas_state, scale)
-    if new_bounds is not None:
-        tpl["table"]["column_boundaries"] = new_bounds
-    if new_bbox is not None:
-        tpl["visual"]["table_bbox"] = new_bbox
-
-    if tpl["table"].get("column_boundaries"):
-        st.caption(
-            "Boundaries (PDF pts): "
-            + ", ".join(
-                f"{b:.0f}" for b in tpl["table"]["column_boundaries"]
-            )
-        )
 
 
-def _boundaries_to_canvas_lines(
-    boundaries: list[float],
-    *,
-    scale: float,
-    image_height: int,
-) -> list[dict]:
-    """Seed the canvas with full-height vertical lines for any
-    boundaries already on the template, so the user sees their
-    saved state when re-entering build mode."""
-    out: list[dict] = []
-    for b in boundaries:
-        x_px = float(b) * scale
-        out.append({
-            "type": "line",
-            "left": x_px,
-            "top": 0,
-            "width": 0,
-            "height": image_height,
-            "x1": 0, "y1": 0,
-            "x2": 0, "y2": image_height,
-            "stroke": "#1f77b4",
-            "strokeWidth": 2,
-            "fill": "#1f77b4",
-            "selectable": True,
-        })
-    return out
+# ---------------------------------------------------------------------------
+# Results — editable table + download
+# ---------------------------------------------------------------------------
 
+rows = st.session_state.get(K_ROWS)
+warnings = st.session_state.get(K_WARNINGS) or []
+source_count = st.session_state.get(K_SOURCE_COUNT, 0)
 
-def _bbox_to_canvas_rect(bbox: list[float], scale: float) -> dict:
-    x0, top, x1, bottom = bbox
-    return {
-        "type": "rect",
-        "left": x0 * scale,
-        "top": top * scale,
-        "width": (x1 - x0) * scale,
-        "height": (bottom - top) * scale,
-        "stroke": "#d62728",
-        "strokeWidth": 1,
-        "fill": "rgba(255, 165, 0, 0.10)",
+if warnings:
+    with st.expander(f"Warnings ({len(warnings)})", expanded=False):
+        for w in warnings:
+            st.warning(w)
+
+if rows is None:
+    if uploads:
+        st.info("Click **Scan** to detect transactions.")
+    else:
+        st.info("Upload one or more PDF files to begin.")
+
+elif not rows:
+    st.info(
+        "No transaction rows detected. The scanner looks for lines "
+        "containing a date and at least one amount. Check the "
+        "warnings expander above for clues — most often the PDF is "
+        "scanned (image-only) and OCR isn't available."
+    )
+
+else:
+    df = pd.DataFrame(rows)
+
+    # Order columns so the user-facing fields are leftmost; raw +
+    # internals are last and easy to scroll past or unselect at
+    # download time.
+    front = ["date", "description"]
+    amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
+    tail = ["source_file", "page", "raw"]
+    ordered = [c for c in front + amount_cols + tail if c in df.columns]
+    extras = [c for c in df.columns if c not in ordered]
+    df = df[ordered + extras]
+
+    # Prepend the include checkbox.
+    df.insert(0, "Include", True)
+
+    st.markdown(
+        f"#### {len(df):,} candidate transaction(s) "
+        f"from {source_count} file(s)"
+    )
+    st.caption(
+        "Uncheck rows to exclude. Edit any cell to fix a value the "
+        "scanner got wrong. The ``raw`` column shows the original "
+        "PDF text for that row."
+    )
+
+    column_config = {
+        "Include": st.column_config.CheckboxColumn(
+            "Include",
+            default=True,
+            help="Uncheck to drop this row from the CSV.",
+        ),
+        "raw": st.column_config.TextColumn(
+            "raw",
+            help="Original text line from the PDF (read-only reference).",
+            disabled=True,
+            width="large",
+        ),
+        "page": st.column_config.NumberColumn(
+            "page", disabled=True, width="small",
+        ),
     }
-
-
-def _harvest_canvas(canvas_state, scale: float):
-    """Pull boundaries + bbox out of a ``st_canvas`` return value.
-
-    Returns ``(boundaries_or_None, bbox_or_None)`` where ``None``
-    means "no change" (so the existing template values stay put)."""
-    if canvas_state is None or canvas_state.json_data is None:
-        return None, None
-    objects = canvas_state.json_data.get("objects") or []
-
-    bounds: list[float] = []
-    bbox: list[float] | None = None
-    for obj in objects:
-        kind = obj.get("type")
-        left = float(obj.get("left", 0))
-        width = float(obj.get("width", 0))
-        if kind == "line":
-            # Take the line's x-midpoint as the boundary x-position.
-            bounds.append((left + width / 2) / scale)
-        elif kind == "rect":
-            top = float(obj.get("top", 0))
-            height = float(obj.get("height", 0))
-            bbox = [
-                left / scale,
-                top / scale,
-                (left + width) / scale,
-                (top + height) / scale,
-            ]
-    return sorted(bounds), bbox
-
-
-def _render_build_form(tpl: dict) -> None:
-    """Mode-aware editor.
-
-    Default mode (``row_heuristic``) presents simple form fields
-    for the parsing rules and a live preview of detected
-    transactions. The visual picker only shows when the user
-    explicitly switches to ``column_visual`` in the Advanced tab —
-    most users never need to go there.
-    """
-    # Header: name + mode switcher
-    c_name, c_mode = st.columns([3, 2])
-    with c_name:
-        tpl["name"] = st.text_input(
-            "Template name",
-            value=tpl.get("name", ""),
-            help="What this source is called, e.g. 'Chase Personal Checking'.",
+    if "source_file" in df.columns:
+        column_config["source_file"] = st.column_config.TextColumn(
+            "source_file", disabled=True,
         )
-        tpl["slug"] = slugify(tpl["name"])
-    with c_mode:
-        current_mode = tpl.get("mode", "row_heuristic")
-        new_mode = st.radio(
-            "Detection mode",
-            ["row_heuristic", "column_visual"],
-            index=["row_heuristic", "column_visual"].index(current_mode),
-            format_func=lambda m: {
-                "row_heuristic": "Auto-detect (recommended)",
-                "column_visual": "Visual columns (advanced)",
-            }.get(m, m),
-            help=(
-                "Auto-detect finds rows by date+amount patterns — no "
-                "coordinates needed; survives layout changes between "
-                "statements. Visual columns uses x-position boundaries "
-                "you draw — useful only when auto-detect fails to find "
-                "the table."
-            ),
-            horizontal=False,
-        )
-        if new_mode != current_mode:
-            tpl["mode"] = new_mode
-            st.rerun()
 
-    if tpl.get("mode", "row_heuristic") == "row_heuristic":
-        _render_build_form_row_heuristic(tpl)
-    else:
-        _render_build_form_column_visual(tpl)
-
-
-def _render_build_form_row_heuristic(tpl: dict) -> None:
-    """Simple form for the row-heuristic mode."""
-    tab_amount, tab_filters, tab_save = st.tabs(
-        ["Amount layout", "Filters & date", "Save"]
+    edited = st.data_editor(
+        df,
+        hide_index=True,
+        use_container_width=True,
+        column_config=column_config,
+        num_rows="fixed",
+        key="pdf_results_editor",
     )
 
-    tpl.setdefault("row_detection", {})
-    tpl.setdefault("amounts", {})
-    tpl.setdefault("date", {})
-    tpl.setdefault("pages", {})
+    selected = edited[edited["Include"]].drop(columns=["Include"])
 
-    with tab_amount:
-        st.caption(
-            "Tell us how many amount columns each transaction row has, "
-            "and how negatives are written. The detector handles the "
-            "rest — no x-positions needed."
-        )
-        shape_labels = {
-            "single": "One amount per row (sign in the number)",
-            "txn_balance": "Two amounts: transaction + running balance",
-            "debit_credit": "Two columns: separate debit and credit",
-            "debit_credit_balance": "Three: debit, credit, balance",
-        }
-        current_shape = tpl["amounts"].get("shape", "single")
-        shape = st.selectbox(
-            "Amount layout",
-            list(shape_labels.keys()),
-            index=list(shape_labels.keys()).index(
-                current_shape if current_shape in shape_labels else "single"
-            ),
-            format_func=lambda s: shape_labels[s],
-        )
-        tpl["amounts"]["shape"] = shape
-
-        c1, c2 = st.columns(2)
-        with c1:
-            tpl["amounts"]["negative_in_parens"] = st.checkbox(
-                "Parens (4.50) = negative",
-                value=bool(tpl["amounts"].get("negative_in_parens", True)),
-            )
-            tpl["amounts"]["currency_strip"] = st.text_input(
-                "Currency symbols to strip",
-                value=tpl["amounts"].get("currency_strip", "$"),
-                max_chars=4,
-            )
-        with c2:
-            tpl["amounts"]["decimal_separator"] = st.text_input(
-                "Decimal separator",
-                value=tpl["amounts"].get("decimal_separator", "."),
-                max_chars=1,
-            )
-            tpl["amounts"]["thousands_separator"] = st.text_input(
-                "Thousands separator",
-                value=tpl["amounts"].get("thousands_separator", ","),
-                max_chars=1,
-            )
-
-        # Auto-derive min/max amounts from the chosen shape unless
-        # user has set non-default values explicitly.
-        shape_to_min_max = {
-            "single": (1, 1),
-            "txn_balance": (2, 2),
-            "debit_credit": (1, 2),
-            "debit_credit_balance": (2, 3),
-        }
-        cur_min = tpl["row_detection"].get("min_amounts_per_row")
-        cur_max = tpl["row_detection"].get("max_amounts_per_row")
-        derived_min, derived_max = shape_to_min_max.get(shape, (1, 3))
-        if cur_min is None or cur_max is None:
-            tpl["row_detection"]["min_amounts_per_row"] = derived_min
-            tpl["row_detection"]["max_amounts_per_row"] = derived_max
-
-        with st.expander("Advanced: tune amount-count range", expanded=False):
-            tpl["row_detection"]["min_amounts_per_row"] = st.number_input(
-                "Minimum amounts per transaction row",
-                min_value=1, max_value=10,
-                value=int(tpl["row_detection"].get("min_amounts_per_row", derived_min)),
-                step=1,
-            )
-            tpl["row_detection"]["max_amounts_per_row"] = st.number_input(
-                "Maximum amounts per transaction row",
-                min_value=1, max_value=10,
-                value=int(tpl["row_detection"].get("max_amounts_per_row", derived_max)),
-                step=1,
-            )
-
-    with tab_filters:
-        c1, c2 = st.columns(2)
-        with c1:
-            tpl["date"]["format"] = st.text_input(
-                "Date format",
-                value=tpl["date"].get("format", "%m/%d/%Y"),
-                help=(
-                    "Python strftime format. Common: %m/%d/%Y (US), "
-                    "%d/%m/%Y (EU), %Y-%m-%d (ISO). Leave default to "
-                    "try common formats automatically."
-                ),
-            )
-            tpl["pages"]["range"] = st.text_input(
-                "Pages",
-                value=tpl["pages"].get("range", "all"),
-                help='"all", "1-3", "2,4", "3-" all work.',
-            )
-        with c2:
-            tpl["row_detection"]["merge_multiline_description"] = st.checkbox(
-                "Merge multi-line descriptions",
-                value=bool(
-                    tpl["row_detection"].get("merge_multiline_description", True)
-                ),
-                help=(
-                    "Lines without a date attach to the previous "
-                    "row's description — handles wrapped vendor names."
-                ),
-            )
-            tpl["row_detection"]["y_tolerance"] = st.number_input(
-                "Row y-tolerance (pts)",
-                min_value=0.5,
-                max_value=20.0,
-                value=float(tpl["row_detection"].get("y_tolerance", 3.0)),
-                step=0.5,
-                help=(
-                    "How close two words' y-positions must be to be on "
-                    "the same row. Adjust if rows are splitting or merging."
-                ),
-            )
-
-        skips = "\n".join(tpl["row_detection"].get("skip_rows_matching") or [])
-        new_skips = st.text_area(
-            "Skip rows matching (one regex per line, optional)",
-            value=skips,
-            help=(
-                "Lines whose text matches any of these regexes are "
-                'excluded. Common: "Total", "Subtotal", "^Page ".'
-            ),
-            height=80,
-        )
-        tpl["row_detection"]["skip_rows_matching"] = [
-            line.strip() for line in new_skips.splitlines() if line.strip()
-        ]
-
-    with tab_save:
-        tpl["notes"] = st.text_area(
-            "Notes (optional)", value=tpl.get("notes", ""), height=70,
-        )
-        ok, errors = validate_template(tpl)
-        if errors:
-            for err in errors:
-                st.error(err)
-        c1, c2 = st.columns([1, 3])
-        with c1:
-            save_btn = st.button(
-                "Save template", type="primary", disabled=not ok,
-            )
-        with c2:
-            st.caption(
-                f"Will save as: ``{tpl.get('slug') or '—'}``  "
-                f"(folder: ``~/.datatools/pdf_templates/``)"
-            )
-        if save_btn:
-            try:
-                slug = save_template(tpl)
-                st.success(
-                    f"Saved as **{slug}**. Switch to Extract mode to use it."
-                )
-                log_event(
-                    "tool_run",
-                    "PDF Extractor template saved",
-                    page="10_PDF_Extractor",
-                    template=slug,
-                    mode=tpl.get("mode"),
-                )
-            except Exception as e:
-                st.error(f"Save failed: {e}")
-
-
-def _render_build_form_column_visual(tpl: dict) -> None:
-    """Legacy column-visual editor. Reached via the Detection mode
-    radio when the user opts into the advanced flow."""
-    st.warning(
-        "**Advanced mode.** Column-x-position templates depend on "
-        "every statement from this source having identical layout. "
-        "If your statements drift between months, switch back to "
-        "Auto-detect."
-    )
-
-    t0, t1, t2, t3, t4 = st.tabs(
-        ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"]
-    )
-
-    tpl.setdefault("table", {})
-    tpl.setdefault("parse", {})
-    tpl.setdefault("pages", {})
-    tpl.setdefault("columns", [])
-
-    with t0:
-        _render_visual_picker(tpl)
-
-    with t1:
-        c1, c2 = st.columns(2)
-        with c1:
-            tpl["notes"] = st.text_area(
-                "Notes", value=tpl.get("notes", ""), height=70,
-            )
-            tpl["pages"]["range"] = st.text_input(
-                "Pages",
-                value=tpl["pages"].get("range", "all"),
-                help='"all", "1-3", "2,4", "3-" all work.',
-            )
-            tpl["pages"]["skip_matching"] = st.text_input(
-                "Skip pages matching (regex, optional)",
-                value=tpl["pages"].get("skip_matching", ""),
-            )
-        with c2:
-            tpl["table"]["header_text"] = st.text_input(
-                "Header text",
-                value=tpl["table"].get("header_text", ""),
-            )
-            ends = "\n".join(tpl["table"].get("end_markers") or [])
-            new_ends = st.text_area(
-                "End markers (one regex per line)",
-                value=ends,
-                height=80,
-            )
-            tpl["table"]["end_markers"] = [
-                line.strip() for line in new_ends.splitlines() if line.strip()
-            ]
-            skips = "\n".join(tpl["table"].get("skip_rows_matching") or [])
-            new_skips = st.text_area(
-                "Skip rows matching (one regex per line)",
-                value=skips,
-                height=80,
-            )
-            tpl["table"]["skip_rows_matching"] = [
-                line.strip() for line in new_skips.splitlines() if line.strip()
-            ]
-
-    with t2:
-        _render_columns_editor(tpl)
-
-    with t3:
-        tpl["parse"]["date_format"] = st.text_input(
-            "Date format",
-            value=tpl["parse"].get("date_format", "%m/%d/%Y"),
-        )
-        tpl["parse"]["currency_strip"] = st.text_input(
-            "Currency symbols", value=tpl["parse"].get("currency_strip", "$"),
-        )
-        tpl["parse"]["amount_negative_in_parens"] = st.checkbox(
-            "Parens = negative",
-            value=bool(tpl["parse"].get("amount_negative_in_parens", True)),
-        )
-        tpl["parse"]["merge_multiline_description"] = st.checkbox(
-            "Merge multi-line descriptions",
-            value=bool(tpl["parse"].get("merge_multiline_description", True)),
-        )
-
-    with t4:
-        ok, errors = validate_template(tpl)
-        if errors:
-            for err in errors:
-                st.error(err)
-        save_btn = st.button(
-            "Save template", type="primary", disabled=not ok, key="cv_save",
-        )
-        if save_btn:
-            try:
-                slug = save_template(tpl)
-                st.success(f"Saved as **{slug}**.")
-                log_event(
-                    "tool_run",
-                    "PDF Extractor template saved",
-                    page="10_PDF_Extractor",
-                    template=slug,
-                    mode=tpl.get("mode"),
-                )
-            except Exception as e:
-                st.error(f"Save failed: {e}")
-
-
-def _render_preview(tpl: dict) -> None:
-    """Below-the-fold live preview against the cached sample pages."""
-    pages = st.session_state.get(K_SAMPLE_PAGES)
-    if not pages:
-        return
-    st.divider()
-    st.markdown("##### Live preview")
-    try:
-        df = apply_template(pages, tpl)
-    except Exception as e:
-        st.error(f"Preview failed: {type(e).__name__}: {e}")
-        return
-    mode = tpl.get("mode", "row_heuristic")
-    if df.empty:
-        if mode == "row_heuristic":
-            st.info(
-                "No transaction rows detected yet. Check that the date "
-                "format matches your statements, and try widening the "
-                "amount-count range under \"Advanced\" if your rows have "
-                "balance or extra columns."
-            )
+    c_dl, c_meta = st.columns([2, 3])
+    with c_dl:
+        if selected.empty:
+            st.button("Download CSV", disabled=True)
         else:
-            st.info(
-                "Template doesn't match any rows yet. Tighten the header "
-                "text, add an end marker, or adjust column boundaries."
+            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+            # Default: drop the internal columns from the download.
+            keep_default = [
+                c for c in selected.columns
+                if c not in ("page", "raw")
+            ]
+            with c_meta:
+                keep = st.multiselect(
+                    "Columns to include in CSV",
+                    options=list(selected.columns),
+                    default=keep_default,
+                    help="``page`` and ``raw`` are kept off by default; "
+                    "tick them if you want them in the file.",
+                )
+            export = selected[keep] if keep else selected
+            csv_bytes = export.to_csv(index=False).encode("utf-8")
+            st.download_button(
+                f"Download {len(export):,} rows as CSV",
+                data=csv_bytes,
+                file_name=f"transactions-{ts}.csv",
+                mime="text/csv",
+                type="primary",
             )
-    else:
+
+    if not selected.empty:
         st.caption(
-            f"{len(df)} row(s) from {len(pages)} page(s) "
-            f"· mode: {mode}"
+            f"{len(selected):,} of {len(df):,} rows selected."
         )
-        st.dataframe(df.head(100), hide_index=True, use_container_width=True)
-
-
-def _render_build_mode() -> None:
-    # Optionally load an existing template into the form
-    templates = list_templates()
-    c1, c2, c3 = st.columns([2, 2, 1])
-    with c1:
-        existing_label = "— start from scratch —"
-        choices = [existing_label] + [
-            f"{t['name']}  ·  {t['slug']}" for t in templates
-        ]
-        picked = st.selectbox("Load existing", choices, key="build_load_pick")
-    with c2:
-        if st.button("Load", disabled=picked == existing_label, key="build_load_btn"):
-            slug = picked.split("  ·  ")[-1]
-            try:
-                st.session_state[K_CURRENT_TEMPLATE] = load_template(slug)
-                st.rerun()
-            except Exception as e:
-                st.error(f"Load failed: {e}")
-    with c3:
-        if st.button("New", key="build_new_btn"):
-            st.session_state[K_CURRENT_TEMPLATE] = new_template("New template")
-            st.rerun()
-
-    tpl = _get_or_init(K_CURRENT_TEMPLATE, new_template("New template"))
-
-    if not _ensure_sample_loaded():
-        st.info(
-            "Upload a sample statement from this source to drive the live "
-            "preview. Your template is built against the sample's layout."
-        )
-        return
-
-    _render_build_form(tpl)
-    _render_preview(tpl)
-
-
-# ===========================================================================
-# Manage-templates mode
-# ===========================================================================
-
-
-def _render_manage_mode() -> None:
-    templates = list_templates()
-
-    st.markdown("##### Import a template")
-    up = st.file_uploader(
-        "Template JSON",
-        type=["json"],
-        key="manage_import_uploader",
-        help="Paste a colleague's exported JSON file here to add it to your library.",
-    )
-    if up is not None:
-        try:
-            imported = template_from_json(up.read().decode("utf-8"))
-            save_template(imported)
-            st.success(f"Imported **{imported['name']}** (slug `{imported['slug']}`).")
-            st.rerun()
-        except Exception as e:
-            st.error(f"Import failed: {e}")
-
-    st.divider()
-    st.markdown("##### Existing templates")
-    if not templates:
-        st.caption("No templates yet — build one in **Build template** mode.")
-        return
-
-    for t in templates:
-        slug = t["slug"]
-        with st.container(border=True):
-            c1, c2, c3, c4 = st.columns([3, 3, 2, 2])
-            with c1:
-                st.markdown(f"**{t['name']}**")
-                st.caption(f"`{slug}`")
-            with c2:
-                st.caption(f"Updated: {t.get('updated_at', '—')}")
-                if t.get("notes"):
-                    st.caption(t["notes"])
-            with c3:
-                try:
-                    full = load_template(slug)
-                    payload = template_to_json(full)
-                    st.download_button(
-                        "Export",
-                        data=payload.encode("utf-8"),
-                        file_name=f"{slug}.json",
-                        mime="application/json",
-                        key=f"export_{slug}",
-                    )
-                except Exception as e:
-                    st.error(f"Read failed: {e}")
-            with c4:
-                if st.button("Delete", key=f"del_{slug}"):
-                    delete_template(slug)
-                    st.success(f"Deleted `{slug}`.")
-                    st.rerun()
-
-
-# ===========================================================================
-# Dispatch
-# ===========================================================================
-
-
-if mode == "Extract":
-    _render_extract_mode()
-elif mode == "Build template":
-    _render_build_mode()
-elif mode == "Manage templates":
-    _render_manage_mode()
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index cd35143..4c375e4 100644
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -1,64 +1,51 @@
-"""PDF transaction extraction.
+"""Heuristic PDF transaction scanner.
 
-Pure module — no Streamlit, no user-config I/O. Reads PDF bytes,
-produces a ``pandas.DataFrame`` of rows according to a template
-dict. The accountant-facing use case is extracting transaction
-tables from bank statements (different banks = different
-templates, reused across statements that share a format).
+Single public entry point: ``scan_pdf_for_transactions(pdf_bytes)``
+returns a list of dicts shaped like ``[date] [description] [amount]``,
+plus a list of warning strings. The GUI renders those rows in an
+editable table and lets the user pick which to keep before
+exporting to CSV.
 
-Strategy:
+There are no templates, no per-bank configuration files, and no
+coordinate dependencies. A transaction row is "any extracted text
+line containing a date pattern AND at least one amount pattern."
+Multi-amount rows surface every detected amount as ``amount_1``,
+``amount_2``, ... — the user labels and reshapes in their CSV
+editor of choice.
 
-- ``pdfplumber`` for text + word positions. Bank-statement tables
-  rarely have visible cell borders, so we don't rely on table-line
-  detection — instead the template carries explicit column
-  x-position boundaries (set by the visual picker UI).
-- Rows are detected by clustering word ``top`` (y-position) values
-  within a small tolerance — words on the same baseline.
-- Multi-line descriptions: rows whose first column (date) is empty
-  are merged into the previous row's description column.
-- Signed amounts: parenthesized values (``(123.45)``) parse negative.
-  Single signed amount column passes through. Separate debit/credit
-  columns are combined into one signed amount column with credits
-  positive and debits negative (accounting register convention —
-  matches QuickBooks/Xero import expectations).
-- Optional OCR: pages with no extractable text fall through to
-  ``pytesseract`` IF the binding + Tesseract binary are both
-  available. Otherwise the page is skipped with a warning row.
-
-The template is a plain dict matching the schema documented in
-``src/pdf_templates.py``. This module reads it; ``pdf_templates``
-manages its persistence and validation.
+Optional OCR fallback for scanned PDFs via ``pytesseract`` +
+``pypdfium2``. Robust to missing system Tesseract — returns a
+clear reason string instead of raising.
 """
 
 from __future__ import annotations
 
 import io
+import os
+import platform
 import re
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import Any
 
-import pandas as pd
 
-
-# Lazy imports for the heavy PDF deps so a fresh ``pip`` that hasn't
-# picked up the new ``requirements.txt`` lines yet doesn't crash the
-# module-import path. The GUI page surfaces a friendly install message
-# when these come back missing instead of throwing an ImportError
-# traceback over the whole tool. Pure helpers (parse_amount, parse_date,
-# cluster_rows, …) keep working with no PDF dep installed.
+# ---------------------------------------------------------------------------
+# Dependency guards
+# ---------------------------------------------------------------------------
 
 
 class PdfDependencyMissing(ImportError):
-    """Raised when a runtime PDF dependency is missing.
+    """A runtime PDF dependency is missing.
 
-    Carries an actionable ``hint`` for the GUI to show to the user."""
+    Carries an actionable install hint that the GUI surfaces.
+    """
 
     def __init__(self, missing: str, hint: str = ""):
         self.missing = missing
         self.hint = hint or (
-            f"Install the PDF dependencies: ``pip install "
-            f"pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract``"
+            "Install the PDF dependencies: ``pip install "
+            "pdfplumber pypdfium2 pytesseract``"
         )
         super().__init__(f"{missing} is not installed. {self.hint}")
 
@@ -99,24 +86,123 @@ class WordBox:
 @dataclass
 class Page:
     """One PDF page's text + word positions."""
-    page_no: int  # 1-indexed
+    page_no: int
     width: float
     height: float
     text: str
     words: list[WordBox] = field(default_factory=list)
 
 
+# ---------------------------------------------------------------------------
+# Value parsing
+# ---------------------------------------------------------------------------
+
+
+_DATE_RES = [
+    re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"),
+    re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"),
+    re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
+    re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2},?\s+\d{2,4})\b"),
+    re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"),
+]
+
+_DATE_FORMATS_FALLBACK = [
+    "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d/%m/%Y", "%d/%m/%y",
+    "%b %d %Y", "%b %d, %Y", "%d %b %Y", "%d-%b-%Y",
+    "%m-%d-%Y", "%m-%d-%y",
+]
+
+# Amount tokens: optional parens, $/€/£ and minus sign, then 1-3
+# digits with optional ",ddd" groups and 1-4 decimals; a trailing
+# minus is captured. Ungrouped 4+ digit runs ("1234.56") never match.
+_AMOUNT_RE = re.compile(
+    r"(?<![\w.])"
+    r"(\(?-?[\$€£]?-?\d{1,3}(?:,\d{3})*(?:\.\d{1,4})?\)?-?)"
+    r"(?![\w.])"
+)
+
+
+def parse_amount(
+    text: str,
+    *,
+    negative_in_parens: bool = True,
+    decimal: str = ".",
+    thousands: str = ",",
+    currency_strip: str = "$€£",
+) -> float | None:
+    """Parse a money string to a signed float, or ``None`` if it
+    doesn't parse.
+
+    Handles: currency prefixes (configurable), thousands separators,
+    parenthesized negatives, trailing minus ("123.45-"), a minus on
+    either side of the currency symbol ("-$5", "$-5"), bare blanks.
+    """
+    s = "" if text is None else str(text).strip()
+    if not s:
+        return None
+
+    negative = False
+    if negative_in_parens and s.startswith("(") and s.endswith(")"):
+        negative = True
+        s = s[1:-1].strip()
+    if s.endswith("-"):
+        negative = True
+        s = s[:-1].strip()
+    if s.startswith("-"):
+        negative = True
+        s = s[1:].strip()
+    for ch in currency_strip:
+        s = s.replace(ch, "")
+    s = s.replace(" ", "")
+    # "$-5.00": the minus only surfaces once the symbol is stripped.
+    if s.startswith("-"):
+        negative, s = True, s[1:]
+    if thousands:
+        s = s.replace(thousands, "")
+    if decimal != ".":
+        s = s.replace(decimal, ".")
+
+    if not s or not re.match(r"^\d+(\.\d+)?$", s):
+        return None
+    return -float(s) if negative else float(s)
+
+
+def parse_date(
+    text: str,
+    formats: list[str] | None = None,
+) -> str | None:
+    """Normalise a date string to ISO ``YYYY-MM-DD``.
+
+    Caller-supplied *formats* are attempted before the module's
+    common fallback formats; the first successful parse wins and
+    ``None`` comes back when nothing fits. Callers should keep the
+    raw text beside the result so users can fix mis-detections.
+    """
+    cleaned = "" if text is None else str(text).strip()
+    if not cleaned:
+        return None
+    # Explicit formats take precedence over the built-in fallbacks.
+    candidates = [*(formats or []), *_DATE_FORMATS_FALLBACK]
+    for pattern in candidates:
+        try:
+            parsed = datetime.strptime(cleaned, pattern)
+            return parsed.strftime("%Y-%m-%d")
+        except ValueError:
+            pass
+    return None
+
+
 # ---------------------------------------------------------------------------
 # PDF reading
 # ---------------------------------------------------------------------------
 
 
 def extract_pages(pdf_bytes: bytes) -> list[Page]:
-    """Parse a PDF blob into our internal ``Page`` representation.
+    """Parse a PDF blob into ``Page`` records with word positions.
 
-    Each page carries every word's bounding box; downstream code
-    groups them into rows by ``top`` clustering and into columns
-    by template-defined x-boundaries.
+    Word positions are kept so the row clusterer can group by
+    y-coordinate, but no x-position information is used downstream
+    — the detector only looks at text content.
     """
     pdfplumber = _require_pdfplumber()
     out: list[Page] = []
@@ -149,102 +235,15 @@ def extract_pages(pdf_bytes: bytes) -> list[Page]:
     return out
 
 
-# ---------------------------------------------------------------------------
-# Value parsing
-# ---------------------------------------------------------------------------
-
-
-_AMOUNT_DEFAULTS = {
-    "decimal_separator": ".",
-    "thousands_separator": ",",
-    "currency_strip": "$",
-    "negative_in_parens": True,
-}
-
-_DATE_FORMATS_FALLBACK = [
-    "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d/%m/%Y", "%d/%m/%y",
-    "%b %d %Y", "%d %b %Y", "%d-%b-%Y", "%m-%d-%Y", "%m-%d-%y",
-]
-
-
-def parse_amount(text: str, opts: dict[str, Any] | None = None) -> float | None:
-    """Parse a money string to a signed float, or ``None`` if it doesn't parse.
-
-    Handles: currency prefixes, thousands separators, parenthesized
-    negatives, trailing minus signs ("123.45-"), and bare blanks.
-    """
-    if text is None:
-        return None
-    s = text.strip()
-    if not s:
-        return None
-    o = {**_AMOUNT_DEFAULTS, **(opts or {})}
-
-    negative = False
-    if o["negative_in_parens"] and s.startswith("(") and s.endswith(")"):
-        negative = True
-        s = s[1:-1].strip()
-    if s.endswith("-"):
-        negative = True
-        s = s[:-1].strip()
-    if s.startswith("-"):
-        negative = True
-        s = s[1:].strip()
-    currency = o.get("currency_strip") or ""
-    if currency:
-        for ch in currency:
-            s = s.replace(ch, "")
-    s = s.replace(" ", "")
-    if o["thousands_separator"]:
-        s = s.replace(o["thousands_separator"], "")
-    if o["decimal_separator"] != ".":
-        s = s.replace(o["decimal_separator"], ".")
-
-    if not s or not re.match(r"^\d+(\.\d+)?$", s):
-        return None
-    val = float(s)
-    return -val if negative else val
-
-
-def parse_date(
-    text: str,
-    formats: list[str] | None = None,
-) -> str | None:
-    """Parse a date string against the provided formats and return ISO ``YYYY-MM-DD``.
-
-    Falls back to a list of common formats if *formats* is empty.
-    Returns ``None`` if no format matches.
-    """
-    if text is None:
-        return None
-    s = text.strip()
-    if not s:
-        return None
-    tries = list(formats or []) + _DATE_FORMATS_FALLBACK
-    for fmt in tries:
-        try:
-            dt = datetime.strptime(s, fmt)
-            return dt.strftime("%Y-%m-%d")
-        except ValueError:
-            continue
-    return None
-
-
-# ---------------------------------------------------------------------------
-# Row + column structure
-# ---------------------------------------------------------------------------
-
-
 def cluster_rows(
     words: list[WordBox],
     y_tolerance: float = 3.0,
 ) -> list[list[WordBox]]:
-    """Group word boxes into rows by ``top`` coordinate.
+    """Group word boxes into visual rows by ``top`` coordinate.
 
-    Words whose ``top`` is within *y_tolerance* of an existing row's
-    median are added to that row. Otherwise a new row is started.
-    Output rows are sorted top-to-bottom; within a row, words are
-    sorted left-to-right.
+    Words whose ``top`` is within *y_tolerance* of the current
+    cluster's first word join that cluster. Output rows are sorted
+    top-to-bottom and words within a row are sorted left-to-right.
     """
     if not words:
         return []
@@ -263,679 +262,6 @@ def cluster_rows(
     return rows
 
 
-def assign_columns(
-    row_words: list[WordBox],
-    boundaries: list[float],
-) -> list[str]:
-    """Bucket the words of a single row into columns.
-
-    ``boundaries`` are the *interior* x-positions between adjacent
-    columns. N boundaries → N+1 columns. A word's column is decided
-    by its horizontal midpoint; words within a column are joined
-    with single spaces in left-to-right order.
-    """
-    n_cols = len(boundaries) + 1
-    buckets: list[list[WordBox]] = [[] for _ in range(n_cols)]
-    sorted_bounds = sorted(boundaries)
-    for w in row_words:
-        mid = (w.x0 + w.x1) / 2
-        col = 0
-        for i, b in enumerate(sorted_bounds):
-            if mid >= b:
-                col = i + 1
-        buckets[col].append(w)
-    return [
-        " ".join(w.text for w in sorted(bucket, key=lambda w: w.x0))
-        for bucket in buckets
-    ]
-
-
-# ---------------------------------------------------------------------------
-# Template application
-# ---------------------------------------------------------------------------
-
-
-def _pages_in_range(pages: list[Page], range_spec: str) -> list[Page]:
-    """Filter *pages* by a range spec like ``"all"``, ``"2-"``, ``"1,3-5"``.
-
-    Empty / ``"all"`` returns all pages. Bad specs return all pages
-    (template author can fix at preview time)."""
-    s = (range_spec or "").strip().lower()
-    if not s or s == "all":
-        return pages
-    keep: set[int] = set()
-    for chunk in s.split(","):
-        chunk = chunk.strip()
-        if not chunk:
-            continue
-        if "-" in chunk:
-            a, b = chunk.split("-", 1)
-            a_i = int(a) if a.strip().isdigit() else 1
-            b_i = int(b) if b.strip().isdigit() else len(pages)
-            for i in range(a_i, b_i + 1):
-                keep.add(i)
-        elif chunk.isdigit():
-            keep.add(int(chunk))
-    return [p for p in pages if p.page_no in keep] if keep else pages
-
-
-def _within_table_window(
-    rows: list[list[WordBox]],
-    header_text: str,
-    end_markers: list[str],
-) -> list[list[WordBox]]:
-    """Slice *rows* to the band between the header line and the end marker.
-
-    Header match: the first row whose joined text contains every word
-    of ``header_text`` (case-insensitive). The header row itself is
-    excluded. End match: any row whose joined text matches one of the
-    ``end_markers`` regex patterns; that row and below are excluded.
-
-    Empty ``header_text`` keeps from the first row; empty
-    ``end_markers`` keeps through the last row.
-    """
-    if not rows:
-        return []
-    needle_words = [w.lower() for w in (header_text or "").split() if w]
-    end_res = [re.compile(p, re.IGNORECASE) for p in end_markers if p]
-
-    start = 0
-    if needle_words:
-        start = -1
-        for i, row in enumerate(rows):
-            joined = " ".join(w.text for w in row).lower()
-            if all(nw in joined for nw in needle_words):
-                start = i + 1
-                break
-        if start == -1:
-            return []
-
-    end = len(rows)
-    for i in range(start, len(rows)):
-        joined = " ".join(w.text for w in rows[i])
-        if any(rx.search(joined) for rx in end_res):
-            end = i
-            break
-    return rows[start:end]
-
-
-def _row_is_continuation(cells: list[str]) -> bool:
-    """A row whose first column is empty is treated as a continuation
-    of the previous row's description (multi-line wrap)."""
-    return bool(cells) and not cells[0].strip()
-
-
-def _coerce_amount_columns(
-    record: dict[str, str],
-    column_map: list[dict[str, Any]],
-    parse_opts: dict[str, Any],
-) -> dict[str, Any]:
-    """Convert source-column text into typed output fields.
-
-    Supports three amount shapes:
-
-    1. A single column mapped to ``amount`` — passes through with sign.
-    2. Two columns mapped to ``amount_debit`` + ``amount_credit`` —
-       combined into a signed ``amount`` (credit positive, debit
-       negative — accounting register convention).
-    3. A column mapped to ``balance`` — parsed as signed number.
-
-    The ``date`` target is parsed against the template's date format.
-    Other targets pass through as text.
-    """
-    out: dict[str, Any] = {}
-    debit_val: float | None = None
-    credit_val: float | None = None
-
-    for col in column_map:
-        target = col.get("target", "")
-        source_text = record.get(target, "") if target else ""
-        if target == "date":
-            iso = parse_date(source_text, parse_opts.get("date_formats") or [])
-            out["date"] = iso or source_text
-        elif target == "description":
-            out["description"] = source_text
-        elif target == "amount":
-            out["amount"] = parse_amount(source_text, parse_opts)
-        elif target == "amount_debit":
-            debit_val = parse_amount(source_text, parse_opts)
-        elif target == "amount_credit":
-            credit_val = parse_amount(source_text, parse_opts)
-        elif target == "balance":
-            out["balance"] = parse_amount(source_text, parse_opts)
-        elif target:
-            out[target] = source_text
-
-    if "amount" not in out and (debit_val is not None or credit_val is not None):
-        amt = 0.0
-        if credit_val:
-            amt += credit_val
-        if debit_val:
-            amt -= debit_val
-        out["amount"] = amt
-        out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "")
-    return out
-
-
-# ---------------------------------------------------------------------------
-# Row-heuristic extraction (mode = "row_heuristic", default for new templates)
-# ---------------------------------------------------------------------------
-
-
-_DATE_RES = [
-    re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"),
-    re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"),
-    re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
-    re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2},?\s+\d{2,4})\b"),
-    re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"),
-    # Short month-day (e.g. "Jan 15") — sometimes used when year is
-    # implied by the statement period. Lower-priority match.
-    re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2})\b"),
-]
-
-# Amount tokens: optional $/€/£, optional leading -, optional parens,
-# 1-3 digits before grouping with comma-thousand groups, optional
-# decimal portion. Trailing minus also captured.
-_AMOUNT_RE = re.compile(
-    r"(?<![\w.])"                                  # no preceding letter/dot
-    r"(\(?-?[\$€£]?-?\d{1,3}(?:,\d{3})*(?:\.\d{1,4})?\)?-?)"
-    r"(?![\w.])"
-)
-
-
-def _looks_like_amount(token: str) -> bool:
-    """Reject tokens that match the amount regex but are obviously
-    not money — e.g. a bare year or a page number. Real amounts
-    have at least one of: currency symbol, decimal point, parens,
-    minus sign, or a thousand separator."""
-    if not token:
-        return False
-    return bool(re.search(r"[\$€£.,()\-]", token))
-
-
-def _find_dates_in_words(
-    row_words: list[WordBox],
-) -> list[tuple[int, str]]:
-    """Find the FIRST date-like substring on this row.
-
-    Returns ``[(word_index, date_text)]`` or empty list. Searches
-    word-by-word so we can identify which word(s) constitute the
-    date and exclude them from the description."""
-    for i, w in enumerate(row_words):
-        # Stitch the next few words together — some date formats
-        # like "Jan 15, 2026" span 3 word tokens.
-        for window in (3, 2, 1):
-            chunk = " ".join(x.text for x in row_words[i : i + window])
-            for rx in _DATE_RES:
-                m = rx.search(chunk)
-                if m:
-                    return [(i, m.group(1))]
-    return []
-
-
-def _find_amount_tokens(
-    row_words: list[WordBox],
-) -> list[tuple[int, WordBox, str]]:
-    """Find amount-shaped tokens on this row, keeping their position.
-
-    Returns ``[(word_index, wordbox, normalized_text)]``. The
-    word_index lets the caller exclude these from description text;
-    the wordbox preserves the x-position so we can cluster amount
-    columns later without templated coordinates."""
-    out: list[tuple[int, WordBox, str]] = []
-    for i, w in enumerate(row_words):
-        # Each word might contain multiple amount tokens if the PDF
-        # extractor merged things, but in practice one match per word.
-        m = _AMOUNT_RE.search(w.text)
-        if m and _looks_like_amount(m.group(1)):
-            out.append((i, w, m.group(1)))
-    return out
-
-
-def _row_is_transaction(
-    row_words: list[WordBox],
-    *,
-    min_amounts: int,
-    max_amounts: int,
-) -> bool:
-    """A transaction line has at least one date AND enough amount
-    tokens to satisfy the configured shape."""
-    if not _find_dates_in_words(row_words):
-        return False
-    amounts = _find_amount_tokens(row_words)
-    return min_amounts <= len(amounts) <= max_amounts
-
-
-def _description_from_row(
-    row_words: list[WordBox],
-    date_idx: int,
-    amount_idxs: set[int],
-) -> str:
-    """Stitch the row's description: everything between the date
-    word and the first amount token, plus anything after the last
-    amount that isn't itself an amount."""
-    keep: list[str] = []
-    seen_first_amount = False
-    last_amount_idx = max(amount_idxs) if amount_idxs else -1
-    for i, w in enumerate(row_words):
-        if i == date_idx:
-            continue
-        if i in amount_idxs:
-            seen_first_amount = True
-            continue
-        # After the last amount, trailing tokens are usually a
-        # check number or memo — keep them too.
-        if seen_first_amount and i < last_amount_idx:
-            continue
-        keep.append(w.text)
-    return " ".join(keep).strip()
-
-
-def _assign_amounts_by_shape(
-    amount_tokens: list[tuple[int, WordBox, str]],
-    shape: str,
-    parse_opts: dict[str, Any],
-    column_centers: list[float] | None = None,
-) -> dict[str, Any]:
-    """Map raw amount tokens to typed CSV fields per the shape.
-
-    Shapes:
-      ``single``  → first amount is ``amount`` (sign in value)
-      ``txn_balance`` → leftmost is ``amount``, rightmost is
-        ``balance``
-      ``debit_credit`` → if one token, assign to debit or credit by
-        x-position (uses ``column_centers``); if two, leftmost is
-        debit, next is credit. Combine into signed ``amount``.
-      ``debit_credit_balance`` → leftmost is debit, middle is
-        credit, rightmost is balance.
-    """
-    out: dict[str, Any] = {}
-    if not amount_tokens:
-        return out
-    txt = [t[2] for t in amount_tokens]
-    boxes = [t[1] for t in amount_tokens]
-
-    if shape == "single":
-        out["amount"] = parse_amount(txt[0], parse_opts)
-
-    elif shape == "txn_balance":
-        out["amount"] = parse_amount(txt[0], parse_opts)
-        if len(txt) >= 2:
-            out["balance"] = parse_amount(txt[-1], parse_opts)
-
-    elif shape == "debit_credit":
-        debit_val: float | None = None
-        credit_val: float | None = None
-        if len(txt) == 1 and column_centers and len(column_centers) >= 2:
-            # Decide debit vs credit by which column-center the token's
-            # midpoint is closest to.
-            mid = (boxes[0].x0 + boxes[0].x1) / 2
-            distances = [abs(mid - c) for c in column_centers[:2]]
-            if distances[0] <= distances[1]:
-                debit_val = parse_amount(txt[0], parse_opts)
-            else:
-                credit_val = parse_amount(txt[0], parse_opts)
-        else:
-            # Two tokens: leftmost = debit, rightmost = credit.
-            if len(txt) >= 1:
-                debit_val = parse_amount(txt[0], parse_opts)
-            if len(txt) >= 2:
-                credit_val = parse_amount(txt[1], parse_opts)
-        amt = 0.0
-        if credit_val:
-            amt += credit_val
-        if debit_val:
-            amt -= debit_val
-        out["amount"] = amt
-        out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "")
-
-    elif shape == "debit_credit_balance":
-        debit_val = None
-        credit_val = None
-        if len(txt) == 2 and column_centers and len(column_centers) >= 3:
-            # Two tokens but the shape expects three — fall through
-            # to x-position assignment using the configured columns.
-            mids = [(b.x0 + b.x1) / 2 for b in boxes]
-            assigned: list[int | None] = [None, None, None]
-            for k, m in enumerate(mids):
-                col = min(
-                    range(3),
-                    key=lambda c, m=m: abs(m - column_centers[c]),
-                )
-                assigned[col] = k
-            if assigned[0] is not None:
-                debit_val = parse_amount(txt[assigned[0]], parse_opts)
-            if assigned[1] is not None:
-                credit_val = parse_amount(txt[assigned[1]], parse_opts)
-            if assigned[2] is not None:
-                out["balance"] = parse_amount(txt[assigned[2]], parse_opts)
-        else:
-            if len(txt) >= 1:
-                debit_val = parse_amount(txt[0], parse_opts)
-            if len(txt) >= 2:
-                credit_val = parse_amount(txt[1], parse_opts)
-            if len(txt) >= 3:
-                out["balance"] = parse_amount(txt[2], parse_opts)
-        amt = 0.0
-        if credit_val:
-            amt += credit_val
-        if debit_val:
-            amt -= debit_val
-        out["amount"] = amt
-        out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "")
-    else:
-        # Unknown shape — fall back to the simplest interpretation.
-        out["amount"] = parse_amount(txt[0], parse_opts)
-    return out
-
-
-def _infer_amount_column_centers(
-    rows: list[list[WordBox]],
-    *,
-    expected: int,
-    min_amounts: int,
-    max_amounts: int,
-) -> list[float]:
-    """Cluster amount-token x-midpoints across all transaction rows
-    to find natural column centers. Returns up to *expected* centers
-    sorted left-to-right.
-
-    Avoids re-introducing user-drawn coordinates: the columns are
-    inferred from the data itself. We can't run k-means without
-    scikit-learn, so use a simple sorted-midpoints + greedy bucket
-    by proximity tolerance approach.
-    """
-    midpoints: list[float] = []
-    for row_words in rows:
-        if not _row_is_transaction(
-            row_words, min_amounts=min_amounts, max_amounts=max_amounts,
-        ):
-            continue
-        for _, w, _ in _find_amount_tokens(row_words):
-            midpoints.append((w.x0 + w.x1) / 2)
-    if not midpoints:
-        return []
-    midpoints.sort()
-    # Bucket by adjacency: any gap > 30pt starts a new bucket.
-    # 30pt ≈ 4x the typical inter-column spacing on bank statements.
-    buckets: list[list[float]] = [[midpoints[0]]]
-    for m in midpoints[1:]:
-        if m - buckets[-1][-1] <= 30:
-            buckets[-1].append(m)
-        else:
-            buckets.append([m])
-    centers = [sum(b) / len(b) for b in buckets]
-    if len(centers) <= expected:
-        return centers
-    # More buckets than expected — keep the *expected* most-populated.
-    by_pop = sorted(
-        zip(centers, (len(b) for b in buckets)),
-        key=lambda x: x[1],
-        reverse=True,
-    )[:expected]
-    return sorted(c for c, _ in by_pop)
-
-
-def find_transaction_rows(
-    pages: list[Page],
-    template: dict[str, Any],
-) -> list[dict[str, Any]]:
-    """Heuristic row detector. Returns a list of preview records
-    suitable for rendering in the build-mode preview table.
-
-    Each record carries the raw text + parsed fields; the GUI
-    surfaces these so the user can confirm or tune the template
-    before extraction commits to disk.
-    """
-    rd = template.get("row_detection", {}) or {}
-    amt_cfg = template.get("amounts", {}) or {}
-    date_cfg = template.get("date", {}) or {}
-    pages_cfg = template.get("pages", {}) or {}
-
-    pages_used = _pages_in_range(pages, pages_cfg.get("range", "all"))
-    skip_pages_re = pages_cfg.get("skip_matching") or ""
-    if skip_pages_re:
-        skip_re = re.compile(skip_pages_re, re.IGNORECASE)
-        pages_used = [p for p in pages_used if not skip_re.search(p.text)]
-
-    min_amounts = int(rd.get("min_amounts_per_row", 1))
-    max_amounts = int(rd.get("max_amounts_per_row", 3))
-    skip_row_res = [
-        re.compile(p, re.IGNORECASE)
-        for p in (rd.get("skip_rows_matching") or [])
-    ]
-    shape = amt_cfg.get("shape", "single")
-    expected_amount_cols = {
-        "single": 1,
-        "txn_balance": 2,
-        "debit_credit": 2,
-        "debit_credit_balance": 3,
-    }.get(shape, 1)
-
-    parse_opts = {
-        "decimal_separator": amt_cfg.get("decimal_separator", "."),
-        "thousands_separator": amt_cfg.get("thousands_separator", ","),
-        "currency_strip": amt_cfg.get("currency_strip", "$"),
-        "negative_in_parens": amt_cfg.get("negative_in_parens", True),
-    }
-    date_formats: list[str] = list(date_cfg.get("formats_fallback") or [])
-    if date_cfg.get("format"):
-        date_formats = [date_cfg["format"]] + date_formats
-
-    # First pass per page: gather rows so we can also infer amount
-    # column centers across the whole document.
-    all_rows: list[tuple[Page, list[list[WordBox]]]] = []
-    for page in pages_used:
-        rows = cluster_rows(
-            page.words,
-            y_tolerance=float(rd.get("y_tolerance", 3.0)),
-        )
-        all_rows.append((page, rows))
-
-    flat_rows = [r for _, rows in all_rows for r in rows]
-    column_centers = _infer_amount_column_centers(
-        flat_rows,
-        expected=expected_amount_cols,
-        min_amounts=min_amounts,
-        max_amounts=max_amounts,
-    )
-
-    out: list[dict[str, Any]] = []
-    merge_multi = bool(rd.get("merge_multiline_description", True))
-    prev: dict[str, Any] | None = None
-
-    for page, rows in all_rows:
-        for row_words in rows:
-            line = " ".join(w.text for w in row_words)
-            if not line.strip():
-                continue
-            if any(rx.search(line) for rx in skip_row_res):
-                continue
-
-            dates = _find_dates_in_words(row_words)
-            amount_tokens = _find_amount_tokens(row_words)
-
-            is_txn = bool(dates) and (
-                min_amounts <= len(amount_tokens) <= max_amounts
-            )
-
-            if not is_txn:
-                # Possible multi-line description continuation —
-                # a no-date, no-amount line directly following a
-                # transaction.
-                if (
-                    merge_multi
-                    and prev is not None
-                    and not amount_tokens
-                    and not dates
-                ):
-                    prev["description"] = (
-                        (prev.get("description") or "") + " " + line
-                    ).strip()
-                continue
-
-            date_idx, date_text = dates[0]
-            amount_idxs = {idx for idx, _, _ in amount_tokens}
-            desc = _description_from_row(row_words, date_idx, amount_idxs)
-
-            record: dict[str, Any] = {
-                "date": parse_date(date_text, date_formats) or date_text,
-                "description": desc,
-                "_page": page.page_no,
-                "_raw_line": line,
-            }
-            record.update(_assign_amounts_by_shape(
-                amount_tokens, shape, parse_opts, column_centers,
-            ))
-            out.append(record)
-            prev = record
-
-    return out
-
-
-def apply_template_row_heuristic(
-    pages: list[Page],
-    template: dict[str, Any],
-) -> pd.DataFrame:
-    """Row-heuristic counterpart to ``apply_template``. Same return
-    shape (a DataFrame) so callers don't care which mode produced it."""
-    rows = find_transaction_rows(pages, template)
-    if not rows:
-        return pd.DataFrame()
-    df = pd.DataFrame(rows)
-    # Drop internal helper columns from the user-facing output.
-    if "_raw_line" in df.columns:
-        df = df.drop(columns=["_raw_line"])
-    preferred = ["date", "description", "amount", "type", "balance"]
-    cols = [c for c in preferred if c in df.columns]
-    extras = [c for c in df.columns if c not in cols and c != "_page"]
-    df = df[cols + extras + (["_page"] if "_page" in df.columns else [])]
-    return df
-
-
-def apply_template(
-    pages: list[Page],
-    template: dict[str, Any],
-) -> pd.DataFrame:
-    """Dispatch by template mode and return the extracted DataFrame.
-
-    ``mode="row_heuristic"`` (default for new templates): no
-    coordinates needed — finds transaction lines by date+amount
-    pattern matching. Robust to layout drift between statements.
-
-    ``mode="column_visual"`` (legacy): uses x-position boundaries
-    from the visual picker. Kept for templates saved before the
-    row-heuristic shift.
-
-    Templates without a mode key default to ``column_visual`` for
-    backward compatibility with schema_version=1 templates.
-    """
-    mode = template.get("mode", "column_visual")
-    if mode == "row_heuristic":
-        return apply_template_row_heuristic(pages, template)
-    return _apply_template_column_visual(pages, template)
-
-
-def _apply_template_column_visual(
-    pages: list[Page],
-    template: dict[str, Any],
-) -> pd.DataFrame:
-    """Original column-x-position pipeline. Now the legacy code
-    path; kept for any v1 templates and as the manual-override
-    advanced mode in the build UI."""
-    pages_cfg = template.get("pages", {}) or {}
-    table_cfg = template.get("table", {}) or {}
-    columns_cfg = template.get("columns", []) or []
-    parse_cfg = template.get("parse", {}) or {}
-
-    pages_used = _pages_in_range(pages, pages_cfg.get("range", "all"))
-    skip_pages_re = pages_cfg.get("skip_matching") or ""
-    if skip_pages_re:
-        skip_re = re.compile(skip_pages_re, re.IGNORECASE)
-        pages_used = [p for p in pages_used if not skip_re.search(p.text)]
-
-    boundaries = list(table_cfg.get("column_boundaries", []) or [])
-    header_text = table_cfg.get("header_text", "") or ""
-    end_markers = list(table_cfg.get("end_markers", []) or [])
-    skip_rows_res = [
-        re.compile(p, re.IGNORECASE)
-        for p in (table_cfg.get("skip_rows_matching") or [])
-    ]
-    merge_multiline = bool(parse_cfg.get("merge_multiline_description", True))
-
-    target_names = [c.get("target") for c in columns_cfg if c.get("target")]
-    if not target_names:
-        target_names = [f"col_{i}" for i in range(len(boundaries) + 1)]
-
-    parse_opts = {
-        "decimal_separator": parse_cfg.get("decimal_separator", "."),
-        "thousands_separator": parse_cfg.get("thousands_separator", ","),
-        "currency_strip": parse_cfg.get("currency_strip", "$"),
-        "negative_in_parens": parse_cfg.get("amount_negative_in_parens", True),
-        "date_formats": parse_cfg.get("date_formats")
-            or ([parse_cfg["date_format"]] if parse_cfg.get("date_format") else []),
-    }
-
-    out_rows: list[dict[str, Any]] = []
-    for page in pages_used:
-        rows = cluster_rows(
-            page.words,
-            y_tolerance=float(table_cfg.get("y_tolerance", 3.0)),
-        )
-        rows = _within_table_window(rows, header_text, end_markers)
-
-        prev_record: dict[str, Any] | None = None
-        for row_words in rows:
-            if not boundaries:
-                cells = [" ".join(w.text for w in row_words)]
-            else:
-                cells = assign_columns(row_words, boundaries)
-            joined = " ".join(c.strip() for c in cells if c.strip())
-            if not joined:
-                continue
-            if any(rx.search(joined) for rx in skip_rows_res):
-                continue
-
-            if merge_multiline and _row_is_continuation(cells) and prev_record:
-                # Glue the non-empty columns into the previous record's
-                # description (the natural sink for wrapped text).
-                extra = " ".join(c.strip() for c in cells if c.strip())
-                if extra:
-                    prev_record["description"] = (
-                        (prev_record.get("description") or "")
-                        + " "
-                        + extra
-                    ).strip()
-                continue
-
-            record_src: dict[str, str] = {}
-            for col_cfg in columns_cfg:
-                src_idx = col_cfg.get("source")
-                tgt = col_cfg.get("target")
-                if (
-                    isinstance(src_idx, int)
-                    and 0 <= src_idx < len(cells)
-                    and tgt
-                ):
-                    record_src[tgt] = cells[src_idx]
-
-            record_src.setdefault("_page", str(page.page_no))
-            record = _coerce_amount_columns(record_src, columns_cfg, parse_opts)
-            record["_page"] = page.page_no
-            out_rows.append(record)
-            prev_record = record
-
-    if not out_rows:
-        return pd.DataFrame()
-    df = pd.DataFrame(out_rows)
-
-    preferred = ["date", "description", "amount", "type", "balance"]
-    cols = [c for c in preferred if c in df.columns]
-    extras = [c for c in df.columns if c not in cols and c != "_page"]
-    df = df[cols + extras + (["_page"] if "_page" in df.columns else [])]
-    return df
-
-
 # ---------------------------------------------------------------------------
 # OCR fallback (optional)
 # ---------------------------------------------------------------------------
@@ -943,38 +269,25 @@ def _apply_template_column_visual(
 
 def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
     """Heuristic: a scanned page typically yields zero or near-zero
-    words. ``min_words`` of 5 catches title/logo-only pages too."""
+    words. ``min_words=5`` catches title/logo-only pages too."""
     return len(page.words) >= min_words
 
 
 def _autodetect_tesseract_path() -> str | None:
-    """Probe well-known install locations for ``tesseract.exe``.
-
-    UB-Mannheim's Windows installer drops Tesseract at one of two
-    paths by default. Auto-detecting them lets ``ocr_available``
-    succeed even when the user (or their installer) skipped the
-    "Add to PATH" step — the most common Windows install
-    snag based on real user reports.
-
-    No-op on non-Windows: macOS/Linux package managers
-    always put ``tesseract`` on PATH, so PATH-based discovery is
-    sufficient.
-    """
-    import os as _os
-    import platform as _platform
-    from pathlib import Path as _Path
-
-    if _platform.system() != "Windows":
+    """Probe well-known install locations for ``tesseract.exe`` on
+    Windows. No-op on macOS/Linux where Tesseract is on PATH via
+    the system package manager."""
+    if platform.system() != "Windows":
         return None
     candidates = [
         r"C:\Program Files\Tesseract-OCR\tesseract.exe",
         r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
-        _os.path.expandvars(
+        os.path.expandvars(
             r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe"
         ),
     ]
     for p in candidates:
-        if p and _Path(p).exists():
+        if p and Path(p).exists():
             return p
     return None
 
@@ -982,27 +295,16 @@ def _autodetect_tesseract_path() -> str | None:
 def ocr_available() -> tuple[bool, str]:
     """Return ``(available, reason)`` — is OCR usable right now?
 
-    Checks both the Python binding (``pytesseract``) and the
-    Tesseract binary. The reason string is suitable for surfacing
-    to the user when OCR is unavailable.
-
-    Discovery order for the Tesseract binary:
-
-    1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override,
-       wins over everything else. Useful for portable installs.
-    2. Whatever's on PATH (``pytesseract``'s default).
-    3. ``_autodetect_tesseract_path`` — known Windows install
-       locations. Sets ``pytesseract.pytesseract.tesseract_cmd``
-       so subsequent ``image_to_data`` calls use the same binary.
+    Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
+    then PATH-based lookup, then well-known Windows install
+    locations.
     """
-    import os as _os
-
     try:
-        import pytesseract  # noqa: F401, PLC0415
+        import pytesseract  # noqa: PLC0415
     except ImportError:
         return False, "pytesseract is not installed."
 
-    override = _os.environ.get("DATATOOLS_TESSERACT_PATH")
+    override = os.environ.get("DATATOOLS_TESSERACT_PATH")
     if override:
         pytesseract.pytesseract.tesseract_cmd = override
 
@@ -1010,7 +312,6 @@ def ocr_available() -> tuple[bool, str]:
         pytesseract.get_tesseract_version()
         return True, ""
     except Exception as e_path:
-        # Fallback: probe known install locations.
         candidate = _autodetect_tesseract_path()
         if candidate:
             pytesseract.pytesseract.tesseract_cmd = candidate
@@ -1025,56 +326,18 @@ def ocr_available() -> tuple[bool, str]:
         return False, f"Tesseract binary not found on PATH: {e_path}"
 
 
-def render_page_image(
-    pdf_bytes: bytes,
-    page_no: int,
-    *,
-    target_width: int = 900,
-) -> tuple["Any", float]:
-    """Rasterize one page of *pdf_bytes* (1-indexed) to a PIL image.
-
-    Returns ``(pil_image, scale)`` where ``scale`` is the
-    pixels-per-PDF-point factor. The caller uses ``scale`` to map
-    canvas coordinates (pixels) back to PDF coordinates (points).
-
-    ``target_width`` caps the rendered width so the image is a
-    sensible size for the visual picker — bank statements at 100%
-    can be 800–1200 pts wide; we want ~900px on screen.
-    """
-    pdfium = _require_pdfium()
-
-    pdf = pdfium.PdfDocument(pdf_bytes)
-    try:
-        idx = max(0, min(page_no - 1, len(pdf) - 1))
-        page = pdf[idx]
-        # Width in PDF points → pixels-per-point scale.
-        pdf_width = page.get_width()
-        scale = target_width / pdf_width if pdf_width else 2.0
-        # Cap scale so big A3-style scans don't blow up.
-        scale = min(scale, 3.0)
-        bitmap = page.render(scale=scale)
-        return bitmap.to_pil(), scale
-    finally:
-        pdf.close()
-
-
 def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]:
-    """Run Tesseract over each page of *pdf_bytes* and return a
-    word-position-rich ``Page`` list, parallel to ``extract_pages``.
+    """OCR every page of *pdf_bytes* and return word-position-rich
+    ``Page`` records, parallel to ``extract_pages``.
 
-    Caller is responsible for first checking ``ocr_available()``.
-    Uses pypdfium2 to rasterize and pytesseract's ``image_to_data``
-    to recover per-word bounding boxes so the same column-assignment
-    pipeline keeps working.
+    Caller must check ``ocr_available()`` first.
     """
     pdfium = _require_pdfium()
     import pytesseract  # noqa: PLC0415
-    from PIL import Image  # noqa: F401, PLC0415  (transitively required)
 
     pages: list[Page] = []
     pdf = pdfium.PdfDocument(pdf_bytes)
     try:
-        # PDF points-per-inch is 72; scale renders into pixels.
         scale = dpi / 72.0
         for i in range(len(pdf)):
             pil_image = pdf[i].render(scale=scale).to_pil()
@@ -1091,9 +354,6 @@ def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]:
                 top = float(data["top"][j])
                 width = float(data["width"][j])
                 height = float(data["height"][j])
-                # Convert pixel coords back to PDF points so column
-                # boundaries from the template (in PDF points) keep
-                # working when an OCR page is mixed with text pages.
                 words.append(WordBox(
                     x0=left / scale,
                     top=top / scale,
@@ -1119,11 +379,10 @@ def extract_pages_auto(
     *,
     allow_ocr: bool = True,
 ) -> tuple[list[Page], list[str]]:
-    """Try text extraction first; OCR the pages that come back empty.
+    """Text extraction first; OCR the pages that come back empty.
 
-    Returns ``(pages, warnings)``. ``warnings`` is a list of human-
-    readable strings — e.g. "Pages 3, 4 had no text and OCR is
-    unavailable; they were skipped." Caller surfaces these in the UI.
+    Returns ``(pages, warnings)`` — human-readable warning strings
+    the caller surfaces in the UI.
     """
     warnings: list[str] = []
     pages = extract_pages(pdf_bytes)
@@ -1146,7 +405,6 @@ def extract_pages_auto(
         return pages, warnings
 
     ocr_pages = ocr_pdf_to_pages(pdf_bytes)
-    # Splice OCR results into the original list for the blank pages.
     by_no = {p.page_no: p for p in ocr_pages}
     merged: list[Page] = []
     for p in pages:
@@ -1160,3 +418,175 @@ def extract_pages_auto(
         f"OCR was used for {len(blank)} page(s) with no extractable text."
     )
     return merged, warnings
+
+
+# ---------------------------------------------------------------------------
+# Row detection (the only thing the GUI actually calls)
+# ---------------------------------------------------------------------------
+
+
+def _find_dates_in_words(row_words: list[WordBox]) -> list[tuple[int, str]]:
+    """Return ``[(word_index, date_text)]`` for the first date-like
+    substring on this row, or ``[]`` if none.
+
+    Up to three adjacent words are stitched before matching so formats
+    like ``Jan 15, 2026`` are found. ``word_index`` is the word the
+    match starts in; the caller drops it from the description.
+    """
+    for i in range(len(row_words)):
+        for window in (3, 2, 1):
+            chunk = " ".join(x.text for x in row_words[i : i + window])
+            for rx in _DATE_RES:
+                m = rx.search(chunk)
+                if m:
+                    # Spaces before the match = words skipped in chunk.
+                    skipped = chunk.count(" ", 0, m.start(1))
+                    return [(i + skipped, m.group(1))]
+    return []
+
+
+def _find_amount_tokens(
+    row_words: list[WordBox],
+) -> list[tuple[int, WordBox, str]]:
+    """Collect the amount-shaped tokens on a row, left-to-right.
+
+    Each entry is ``(word_index, wordbox, token)`` where ``token`` is
+    group 1 of the ``_AMOUNT_RE`` match on the word's text. Matches
+    with none of the marker characters ``$ € £ . , ( ) -`` are
+    skipped, which keeps bare years and page numbers out.
+    """
+    found: list[tuple[int, WordBox, str]] = []
+    for idx, word in enumerate(row_words):
+        hit = _AMOUNT_RE.search(word.text)
+        if hit is None:
+            continue
+        token = hit.group(1)
+        # Require at least one money marker in the captured text.
+        if re.search(r"[\$€£.,()\-]", token):
+            found.append((idx, word, token))
+    return found
+
+
+def _description_from_row(
+    row_words: list[WordBox],
+    date_idx: int,
+    amount_idxs: set[int],
+) -> str:
+    """Build the description text for one transaction row.
+
+    Leaves out the word at ``date_idx`` and every word in
+    ``amount_idxs``. Of the rest, words before the first amount and
+    after the last amount are kept; words between amounts are dropped.
+    """
+    last_amount = max(amount_idxs, default=-1)
+    # A word that is both date and amount does not open the gap.
+    openers = [j for j in amount_idxs if j != date_idx and j >= 0]
+    first_amount = min(openers, default=len(row_words))
+    kept = [
+        word.text
+        for pos, word in enumerate(row_words)
+        if pos != date_idx and pos not in amount_idxs
+        and not first_amount < pos < last_amount
+    ]
+    return " ".join(kept).strip()
+
+
+def scan_pdf_for_transactions(
+    pdf_bytes: bytes,
+    *,
+    negative_in_parens: bool = True,
+    allow_ocr: bool = True,
+    date_formats: list[str] | None = None,
+    y_tolerance: float = 3.0,
+    merge_multiline_descriptions: bool = True,
+) -> tuple[list[dict[str, Any]], list[str]]:
+    """Find the transaction-like rows in *pdf_bytes*.
+
+    Pages come from ``extract_pages_auto`` (``allow_ocr`` is passed
+    through) and each page's words are grouped into rows by
+    ``cluster_rows`` using ``y_tolerance``. A row becomes a record
+    only when it has both a date and at least one amount::
+
+        {
+          "date": "2026-01-15",   # parse_date result, else raw text
+          "description": "...",
+          "page": 1,
+          "raw": "01/15/2026 Coffee $4.50",
+          "amount_1": 4.50,       # one key per amount token found
+          "amount_2": 1000.00,    # parse_amount result, else raw text
+        }
+
+    A non-empty row with neither a date nor an amount is appended to
+    the previous record's description when
+    ``merge_multiline_descriptions`` is true; a row with only one of
+    the two is ignored.
+
+    Returns ``(records, warnings)``; ``warnings`` are the strings
+    reported by ``extract_pages_auto``.
+    """
+    pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
+
+    records: list[dict[str, Any]] = []
+    latest: dict[str, Any] | None = None
+
+    for page in pages:
+        for row_words in cluster_rows(page.words, y_tolerance=y_tolerance):
+            raw_line = " ".join(w.text for w in row_words).strip()
+            if not raw_line:
+                continue
+
+            dates = _find_dates_in_words(row_words)
+            amount_tokens = _find_amount_tokens(row_words)
+
+            if not (dates and amount_tokens):
+                # Not a transaction row. With neither a date nor an
+                # amount it is treated as wrapped description text of
+                # the most recent record.
+                is_continuation = not dates and not amount_tokens
+                if (
+                    merge_multiline_descriptions
+                    and latest is not None
+                    and is_continuation
+                ):
+                    latest["description"] = " ".join(
+                        (latest["description"], raw_line)
+                    ).strip()
+                continue
+
+            first_date_idx, date_text = dates[0]
+            amount_positions = {pos for pos, _, _ in amount_tokens}
+            description = _description_from_row(
+                row_words, first_date_idx, amount_positions,
+            )
+
+            record: dict[str, Any] = {
+                "date": parse_date(date_text, date_formats) or date_text,
+                "description": description,
+                "page": page.page_no,
+                "raw": raw_line,
+            }
+            for n, (_, _, token) in enumerate(amount_tokens, start=1):
+                value = parse_amount(
+                    token, negative_in_parens=negative_in_parens,
+                )
+                # Keep the raw token when parsing fails so the user
+                # has something to correct instead of a missing value.
+                record[f"amount_{n}"] = token if value is None else value
+            records.append(record)
+            latest = record
+
+    return records, warnings
+
+
+__all__ = [  # public API; underscore-prefixed helpers are not exported
+    "PdfDependencyMissing",
+    "Page",
+    "WordBox",
+    "cluster_rows",
+    "extract_pages",
+    "extract_pages_auto",
+    "ocr_available",
+    "parse_amount",
+    "parse_date",
+    "scan_pdf_for_transactions",  # the entry point the GUI calls
+]
diff --git a/src/pdf_templates.py b/src/pdf_templates.py
deleted file mode 100644
index 0339c6d..0000000
--- a/src/pdf_templates.py
+++ /dev/null
@@ -1,508 +0,0 @@
-"""PDF extract template storage.
-
-Templates encode "how to read this bank's statements" — page
-range, table window markers, column x-positions, target field
-mapping, amount/date parse options. They live as JSON files in
-``~/.datatools/pdf_templates/`` so an accountant can build one
-per source and reuse it for every statement that follows the
-same layout. Templates are portable: the ``export`` / ``import``
-flow is just a file copy of the JSON.
-
-The schema is intentionally a plain dict (not a frozen dataclass)
-because the GUI mutates it incrementally during the build flow.
-``validate_template`` enforces the contract at save time.
-
-Schema (``schema_version: 1``)::
-
-    {
-      "schema_version": 1,
-      "slug": "chase-personal-checking",
-      "name": "Chase Personal Checking",
-      "notes": "",
-      "created_at": "<iso8601>",
-      "updated_at": "<iso8601>",
-      "pages": {
-        "range": "all" | "1-3" | "2,4,6-",
-        "skip_matching": "<regex>"
-      },
-      "table": {
-        "header_text": "<text containing all header words>",
-        "end_markers": ["<regex>", ...],
-        "column_boundaries": [x0, x1, ...],
-        "y_tolerance": 3.0,
-        "skip_rows_matching": ["<regex>", ...]
-      },
-      "columns": [
-        {"source": 0, "target": "date"},
-        ...
-        # ``target`` is one of: date | description | amount |
-        # amount_debit | amount_credit | balance | <free text>
-      ],
-      "parse": {
-        "date_format": "%m/%d/%Y",
-        "date_formats": [],
-        "decimal_separator": ".",
-        "thousands_separator": ",",
-        "currency_strip": "$",
-        "amount_negative_in_parens": true,
-        "merge_multiline_description": true
-      },
-      "visual": {
-        "page_width": 612.0,
-        "page_height": 792.0,
-        "sample_page": 1,
-        "table_bbox": [x0, top, x1, bottom] | null
-      }
-    }
-
-The ``visual`` block is preserved across save/load so the build
-UI can round-trip the user's last visual-picker state.
-"""
-
-from __future__ import annotations
-
-import json
-import os
-import re
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any
-
-
-SCHEMA_VERSION = 2
-
-# Backward-compatible versions ``load_template`` will accept.
-# v1 templates predate the row-heuristic shift and are loaded as
-# ``mode="column_visual"``; they're not auto-migrated on disk, so
-# the user keeps their canonical original until they re-save.
-_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2})
-
-# Extraction modes. ``row_heuristic`` is the default for new
-# templates — finds transactions by date+amount pattern matching
-# with no coordinate dependency. ``column_visual`` is the legacy
-# x-position-boundary approach, kept for old templates and for
-# the "Advanced" build-mode fallback when the heuristic misfires.
-VALID_MODES = frozenset({"row_heuristic", "column_visual"})
-
-# Amount shapes for row_heuristic mode. The GUI offers these as a
-# dropdown; the parser uses them to assign amount tokens to fields.
-VALID_AMOUNT_SHAPES = frozenset({
-    "single",
-    "txn_balance",
-    "debit_credit",
-    "debit_credit_balance",
-})
-
-VALID_TARGETS = frozenset({
-    "date",
-    "description",
-    "amount",
-    "amount_debit",
-    "amount_credit",
-    "balance",
-    "type",
-})
-
-
-# ---------------------------------------------------------------------------
-# Filesystem layout
-# ---------------------------------------------------------------------------
-
-
-def templates_dir() -> Path:
-    """Return ``~/.datatools/pdf_templates/``. Override via the
-    ``DATATOOLS_PDF_TEMPLATES_DIR`` env var (used by tests)."""
-    override = os.environ.get("DATATOOLS_PDF_TEMPLATES_DIR")
-    if override:
-        return Path(override)
-    try:
-        return Path.home() / ".datatools" / "pdf_templates"
-    except Exception:
-        return Path(tempfile.gettempdir()) / "datatools-pdf-templates"
-
-
-def template_path(slug: str) -> Path:
-    """Resolve *slug* to its on-disk JSON path."""
-    return templates_dir() / f"{slug}.json"
-
-
-# ---------------------------------------------------------------------------
-# Slugify
-# ---------------------------------------------------------------------------
-
-
-_SLUG_STRIP = re.compile(r"[^a-z0-9]+")
-
-
-def slugify(name: str) -> str:
-    """Make a filesystem-safe slug from a human-friendly name."""
-    s = (name or "").strip().lower()
-    s = _SLUG_STRIP.sub("-", s).strip("-")
-    return s or "untitled"
-
-
-# ---------------------------------------------------------------------------
-# Construction + defaults
-# ---------------------------------------------------------------------------
-
-
-def new_template(name: str) -> dict[str, Any]:
-    """Build a blank template with sensible defaults.
-
-    Defaults to ``mode="row_heuristic"`` — the simpler, more
-    robust approach. The GUI's build flow lets the user switch to
-    ``mode="column_visual"`` if the heuristic doesn't fit their
-    statement layout.
-    """
-    now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
-    slug = slugify(name)
-    return {
-        "schema_version": SCHEMA_VERSION,
-        "slug": slug,
-        "name": name or slug,
-        "notes": "",
-        "mode": "row_heuristic",
-        "created_at": now,
-        "updated_at": now,
-        "pages": {
-            "range": "all",
-            "skip_matching": "",
-        },
-        # Row-heuristic config (primary path).
-        "row_detection": {
-            "min_amounts_per_row": 1,
-            "max_amounts_per_row": 3,
-            "y_tolerance": 3.0,
-            "skip_rows_matching": [],
-            "merge_multiline_description": True,
-        },
-        "amounts": {
-            "shape": "single",
-            "negative_in_parens": True,
-            "decimal_separator": ".",
-            "thousands_separator": ",",
-            "currency_strip": "$",
-        },
-        "date": {
-            "format": "%m/%d/%Y",
-            "formats_fallback": [],
-        },
-        # Column-visual config (legacy / Advanced fallback). Empty
-        # placeholders so the GUI can populate when the user
-        # switches modes without inserting keys at runtime.
-        "table": {
-            "header_text": "",
-            "end_markers": [],
-            "column_boundaries": [],
-            "y_tolerance": 3.0,
-            "skip_rows_matching": [],
-        },
-        "columns": [],
-        "parse": {
-            "date_format": "%m/%d/%Y",
-            "date_formats": [],
-            "decimal_separator": ".",
-            "thousands_separator": ",",
-            "currency_strip": "$",
-            "amount_negative_in_parens": True,
-            "merge_multiline_description": True,
-        },
-        "visual": {
-            "page_width": 612.0,
-            "page_height": 792.0,
-            "sample_page": 1,
-            "table_bbox": None,
-        },
-    }
-
-
-# ---------------------------------------------------------------------------
-# Validation
-# ---------------------------------------------------------------------------
-
-
-def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
-    """Check the template before saving. Returns ``(ok, errors)``.
-
-    Mode-aware: row-heuristic templates and column-visual
-    templates have different required fields. The GUI shows the
-    errors next to the Save button; nothing silent here."""
-    errors: list[str] = []
-    if not isinstance(template, dict):
-        return False, ["Template must be a JSON object."]
-
-    sv = template.get("schema_version")
-    if sv != SCHEMA_VERSION:
-        errors.append(
-            f"Unsupported schema_version {sv!r} (expected {SCHEMA_VERSION})."
-        )
-
-    name = template.get("name", "")
-    if not isinstance(name, str) or not name.strip():
-        errors.append("name is required.")
-
-    slug = template.get("slug") or slugify(name)
-    if not re.match(r"^[a-z0-9][a-z0-9-]{0,63}$", slug or ""):
-        errors.append(
-            "slug must be lowercase alphanumeric + hyphens, "
-            "1–64 chars, starting with a letter or digit."
-        )
-
-    mode = template.get("mode", "row_heuristic")
-    if mode not in VALID_MODES:
-        errors.append(
-            f"mode {mode!r} must be one of: {sorted(VALID_MODES)}."
-        )
-
-    if mode == "row_heuristic":
-        amounts = template.get("amounts", {}) or {}
-        shape = amounts.get("shape", "single")
-        if shape not in VALID_AMOUNT_SHAPES:
-            errors.append(
-                f"amounts.shape {shape!r} must be one of: "
-                f"{sorted(VALID_AMOUNT_SHAPES)}."
-            )
-        rd = template.get("row_detection", {}) or {}
-        min_a = rd.get("min_amounts_per_row", 1)
-        max_a = rd.get("max_amounts_per_row", 3)
-        if not (isinstance(min_a, int) and isinstance(max_a, int)):
-            errors.append(
-                "row_detection.min_amounts_per_row and "
-                "max_amounts_per_row must be integers."
-            )
-        elif min_a < 1 or max_a < min_a:
-            errors.append(
-                "row_detection.min_amounts_per_row must be ≥1 and ≤ "
-                "max_amounts_per_row."
-            )
-
-    elif mode == "column_visual":
-        columns = template.get("columns", [])
-        if not isinstance(columns, list) or len(columns) < 2:
-            errors.append(
-                "column_visual mode: at least two output columns "
-                "are required."
-            )
-        else:
-            seen_targets: list[str] = []
-            for i, col in enumerate(columns):
-                if not isinstance(col, dict):
-                    errors.append(f"columns[{i}] must be an object.")
-                    continue
-                src = col.get("source")
-                tgt = col.get("target")
-                if not isinstance(src, int) or src < 0:
-                    errors.append(
-                        f"columns[{i}].source must be a non-negative "
-                        f"integer."
-                    )
-                if not isinstance(tgt, str) or not tgt:
-                    errors.append(
-                        f"columns[{i}].target must be a non-empty string."
-                    )
-                else:
-                    seen_targets.append(tgt)
-            if "date" not in seen_targets:
-                errors.append(
-                    "column_visual mode: at least one column must map "
-                    "to 'date'."
-                )
-            if (
-                "amount" not in seen_targets
-                and not (
-                    "amount_debit" in seen_targets
-                    and "amount_credit" in seen_targets
-                )
-            ):
-                errors.append(
-                    "column_visual mode: either an 'amount' column or "
-                    "both 'amount_debit' + 'amount_credit' columns "
-                    "are required."
-                )
-
-        table = template.get("table", {}) or {}
-        boundaries = table.get("column_boundaries", [])
-        if not isinstance(boundaries, list):
-            errors.append("table.column_boundaries must be a list.")
-
-    return (not errors), errors
-
-
-# ---------------------------------------------------------------------------
-# Persistence
-# ---------------------------------------------------------------------------
-
-
-def _atomic_write(path: Path, payload: str) -> None:
-    """Write *payload* to *path* via a temp file + rename.
-
-    Avoids leaving a half-written JSON if the process dies mid-save —
-    the GUI saves on every visual-picker change, and a corrupt
-    template file would be hostile to recover from.
-    """
-    path.parent.mkdir(parents=True, exist_ok=True)
-    fd, tmp_path = tempfile.mkstemp(
-        prefix=f".{path.name}.",
-        suffix=".tmp",
-        dir=str(path.parent),
-    )
-    try:
-        with os.fdopen(fd, "w", encoding="utf-8") as f:
-            f.write(payload)
-        os.replace(tmp_path, path)
-    except Exception:
-        try:
-            os.unlink(tmp_path)
-        except FileNotFoundError:
-            pass
-        raise
-
-
-def save_template(template: dict[str, Any]) -> str:
-    """Persist *template* to disk; return the slug it was saved as.
-
-    Stamps ``updated_at``. Atomic via temp-file + rename.
-    Raises ``ValueError`` with a multi-line error list if validation
-    fails — caller should surface that to the user.
-    """
-    ok, errors = validate_template(template)
-    if not ok:
-        raise ValueError("\n".join(errors))
-    template = dict(template)
-    template["updated_at"] = datetime.now(tz=timezone.utc).isoformat(
-        timespec="seconds"
-    )
-    slug = template["slug"]
-    payload = json.dumps(template, indent=2, ensure_ascii=False)
-    _atomic_write(template_path(slug), payload)
-    return slug
-
-
-def load_template(slug: str) -> dict[str, Any]:
-    """Read the template at *slug*. Raises ``FileNotFoundError`` if
-    missing, ``ValueError`` if the JSON is corrupt or the schema
-    version is unsupported.
-
-    v1 templates (pre row-heuristic) are accepted and migrated
-    in-memory to v2 shape with ``mode="column_visual"``. The file
-    on disk is NOT rewritten — the user's canonical original stays
-    intact until they explicitly re-save, so a buggy migration
-    can't silently corrupt their template library.
-    """
-    p = template_path(slug)
-    try:
-        raw = p.read_text(encoding="utf-8")
-    except FileNotFoundError:
-        raise
-    try:
-        data = json.loads(raw)
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Corrupt template {slug!r}: {e}") from e
-    sv = data.get("schema_version")
-    if sv not in _LOAD_SUPPORTED_VERSIONS:
-        raise ValueError(
-            f"Template {slug!r} has unsupported schema_version {sv!r}; "
-            f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}."
-        )
-    return _migrate_to_current(data)
-
-
-def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]:
-    """In-memory migration of older schemas to the current shape.
-
-    v1 → v2 adds a ``mode`` key defaulting to ``"column_visual"``
-    (since v1 was the column-x-position approach) and stamps
-    ``schema_version`` to the current value. All v1 keys keep
-    their original meaning."""
-    if data.get("schema_version") == 1:
-        data = dict(data)
-        data["schema_version"] = SCHEMA_VERSION
-        data.setdefault("mode", "column_visual")
-    return data
-
-
-def delete_template(slug: str) -> bool:
-    """Remove the template file; returns ``True`` if it existed."""
-    p = template_path(slug)
-    try:
-        p.unlink()
-        return True
-    except FileNotFoundError:
-        return False
-
-
-def list_templates() -> list[dict[str, Any]]:
-    """Return a sorted list of ``{slug, name, updated_at}`` summaries.
-
-    Skips files that fail to parse — surfaces them in the manage UI
-    as warnings rather than crashing the list view.
-    """
-    d = templates_dir()
-    if not d.exists():
-        return []
-    out: list[dict[str, Any]] = []
-    for p in sorted(d.glob("*.json")):
-        try:
-            data = json.loads(p.read_text(encoding="utf-8"))
-        except Exception:
-            continue
-        if not isinstance(data, dict):
-            continue
-        out.append({
-            "slug": data.get("slug") or p.stem,
-            "name": data.get("name") or p.stem,
-            "updated_at": data.get("updated_at", ""),
-            "notes": data.get("notes", ""),
-        })
-    out.sort(key=lambda r: r["updated_at"] or r["name"], reverse=True)
-    return out
-
-
-# ---------------------------------------------------------------------------
-# Import / export
-# ---------------------------------------------------------------------------
-
-
-def template_to_json(template: dict[str, Any]) -> str:
-    """Serialize a template for download. Pretty-printed for human
-    inspection / diffing."""
-    return json.dumps(template, indent=2, ensure_ascii=False)
-
-
-def template_from_json(payload: str) -> dict[str, Any]:
-    """Deserialize uploaded template JSON. Validates schema version
-    but does NOT save — caller decides whether to ``save_template``
-    or merge into the current build.
-
-    Raises ``ValueError`` on malformed input."""
-    try:
-        data = json.loads(payload)
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Not valid JSON: {e}") from e
-    if not isinstance(data, dict):
-        raise ValueError("Top-level JSON must be an object.")
-    sv = data.get("schema_version")
-    if sv != SCHEMA_VERSION:
-        raise ValueError(
-            f"Imported template has schema_version {sv!r}; "
-            f"this build expects {SCHEMA_VERSION}."
-        )
-    return data
-
-
-__all__ = [
-    "SCHEMA_VERSION",
-    "VALID_TARGETS",
-    "delete_template",
-    "list_templates",
-    "load_template",
-    "new_template",
-    "save_template",
-    "slugify",
-    "template_from_json",
-    "template_path",
-    "template_to_json",
-    "templates_dir",
-    "validate_template",
-]
diff --git a/tests/test_drawable_canvas_compat.py b/tests/test_drawable_canvas_compat.py
deleted file mode 100644
index 22d90ba..0000000
--- a/tests/test_drawable_canvas_compat.py
+++ /dev/null
@@ -1,116 +0,0 @@
-"""Tests for the streamlit-drawable-canvas compatibility shim.
-
-The shim re-attaches ``image_to_url`` to ``streamlit.elements.image``
-on modern Streamlit where the helper was relocated to
-``streamlit.elements.lib.image_utils`` and given a new signature
-(takes a ``LayoutConfig`` dataclass instead of a plain ``int``
-width).
-
-If this test ever fails on a Streamlit upgrade, it almost
-certainly means the ``image_to_url`` function moved AGAIN — the
-shim's fallback message points to where to look. Update
-``_drawable_canvas_compat.py`` to find the new location.
-"""
-
-from __future__ import annotations
-
-import sys
-import types
-
-
-def test_shim_attaches_image_to_url():
-    """After ``install()`` the old import path resolves to a
-    callable, even on modern Streamlit where the original was
-    relocated."""
-    # Force a fresh import so the module-level _PATCHED guard
-    # doesn't short-circuit between tests.
-    sys.modules.pop("src.gui._drawable_canvas_compat", None)
-    from src.gui._drawable_canvas_compat import install
-    install()
-    import streamlit.elements.image as old_loc
-    assert hasattr(old_loc, "image_to_url")
-    assert callable(old_loc.image_to_url)
-
-
-def test_shim_is_idempotent():
-    """Calling ``install()`` twice doesn't double-wrap or break
-    anything — important because the page module imports + calls
-    it once, and a Streamlit script-rerun re-executes the page
-    module top-to-bottom."""
-    sys.modules.pop("src.gui._drawable_canvas_compat", None)
-    from src.gui._drawable_canvas_compat import install
-    install()
-    import streamlit.elements.image as old_loc
-    first = old_loc.image_to_url
-    install()
-    second = old_loc.image_to_url
-    assert first is second
-
-
-def test_shim_no_op_when_image_to_url_already_present():
-    """If a future Streamlit restores ``image_to_url`` at the old
-    location, the shim must not overwrite it — leave the upstream
-    function in place so the canvas package gets the official
-    version, not our compatibility wrapper."""
-    sys.modules.pop("src.gui._drawable_canvas_compat", None)
-    import streamlit.elements.image as old_loc
-
-    sentinel = lambda *a, **kw: "sentinel-url"  # noqa: E731
-    old_loc.image_to_url = sentinel
-    try:
-        from src.gui._drawable_canvas_compat import install
-        install()
-        assert old_loc.image_to_url is sentinel, (
-            "Shim must not clobber an existing image_to_url."
-        )
-    finally:
-        # Tidy up so subsequent tests see a clean module.
-        delattr(old_loc, "image_to_url")
-        sys.modules.pop("src.gui._drawable_canvas_compat", None)
-
-
-def test_shim_calls_new_function_with_layout_config():
-    """The shim's wrapper must translate the old ``(image, width,
-    clamp, channels, output_format, image_id)`` call into the new
-    ``(image, layout_config, …)`` signature without breaking."""
-    sys.modules.pop("src.gui._drawable_canvas_compat", None)
-    import streamlit.elements.image as old_loc
-    if hasattr(old_loc, "image_to_url"):
-        delattr(old_loc, "image_to_url")
-
-    # Replace the new function with a recorder so we can inspect
-    # what arguments the shim passed through.
-    from streamlit.elements.lib import image_utils
-    captured: dict = {}
-    original = image_utils.image_to_url
-
-    def recorder(image, layout_config, clamp, channels, output_format, image_id):
-        captured["image"] = image
-        captured["layout_config"] = layout_config
-        captured["clamp"] = clamp
-        captured["channels"] = channels
-        captured["output_format"] = output_format
-        captured["image_id"] = image_id
-        return "fake-url"
-
-    image_utils.image_to_url = recorder
-    try:
-        from src.gui._drawable_canvas_compat import install
-        install()
-        result = old_loc.image_to_url(
-            "fake-image", -1, False, "RGB", "PNG", "test-id",
-        )
-        assert result == "fake-url"
-        assert captured["image"] == "fake-image"
-        assert captured["clamp"] is False
-        assert captured["channels"] == "RGB"
-        assert captured["output_format"] == "PNG"
-        assert captured["image_id"] == "test-id"
-        # The shim wraps the int width into a LayoutConfig.
-        from streamlit.elements.lib.layout_utils import LayoutConfig
-        assert isinstance(captured["layout_config"], LayoutConfig)
-    finally:
-        image_utils.image_to_url = original
-        if hasattr(old_loc, "image_to_url"):
-            delattr(old_loc, "image_to_url")
-        sys.modules.pop("src.gui._drawable_canvas_compat", None)
diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py
index 0f72aed..93abb2f 100644
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -1,36 +1,33 @@
-"""Tests for the pure PDF-extraction pipeline.
+"""Tests for the minimal PDF transaction scanner.
 
-Real PDF parsing (``extract_pages``) is a thin wrapper around
-``pdfplumber`` and is exercised by hand on real bank statements.
-These tests pin the meaty bits — value parsing, row clustering,
-column assignment, template-driven extraction — against synthetic
-``WordBox`` data so they run fast and have no PDF dependency.
+The public API is one function: ``scan_pdf_for_transactions``.
+These tests cover the value-parsing helpers, the row clusterer,
+and the date/amount token finders against synthetic ``WordBox``
+data with no real PDF involved.
+
+End-to-end-on-a-real-PDF coverage lives in
+``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
+a fixture statement at test time.
 """
 
 from __future__ import annotations
 
-import pandas as pd
-
 from src.pdf_extract import (
     Page,
     WordBox,
-    apply_template,
-    assign_columns,
+    _find_amount_tokens,
+    _find_dates_in_words,
     cluster_rows,
     parse_amount,
     parse_date,
-    _pages_in_range,
-    _within_table_window,
 )
 
 
 def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
-    """Convenience constructor — heights and exact x1 don't matter
-    for the tests we write."""
     return WordBox(
         x0=x0,
         top=top,
-        x1=x1 if x1 is not None else x0 + 10 * len(text),
+        x1=x1 if x1 is not None else x0 + 8 * len(text),
         bottom=top + 10,
         text=text,
     )
@@ -61,13 +58,18 @@ class TestParseAmount:
         assert parse_amount("not a number") is None
 
     def test_european_decimal(self):
-        opts = {
-            "decimal_separator": ",",
-            "thousands_separator": ".",
-            "currency_strip": "€",
-            "negative_in_parens": True,
-        }
-        assert parse_amount("€1.234,56", opts) == 1234.56
+        assert parse_amount(
+            "€1.234,56",
+            decimal=",",
+            thousands=".",
+            currency_strip="€",
+        ) == 1234.56
+
+    def test_parens_off_disables_paren_negative(self):
+        # With parens off, "(4.50)" is not treated as negative, and
+        # it is not a plain number either, so it does not parse at
+        # all. Verify the result is None, not a sign-flipped value.
+        assert parse_amount("(4.50)", negative_in_parens=False) is None
 
 
 class TestParseDate:
@@ -78,7 +80,7 @@ class TestParseDate:
         assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15"
 
     def test_fallback_format(self):
-        # Not in the supplied list — should still parse via fallback.
+        # Not in supplied list — should still parse via fallback.
         assert parse_date("01/15/26") == "2026-01-15"
 
     def test_invalid(self):
@@ -88,199 +90,74 @@ class TestParseDate:
 class TestClusterRows:
     def test_groups_close_y(self):
         words = [
-            _w("A", x0=0, top=100),
-            _w("B", x0=20, top=101),
-            _w("C", x0=40, top=102),
+            _w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102),
         ]
-        rows = cluster_rows(words, y_tolerance=3.0)
+        rows = cluster_rows(words)
         assert len(rows) == 1
         assert [w.text for w in rows[0]] == ["A", "B", "C"]
 
     def test_separates_far_y(self):
-        words = [
-            _w("A", x0=0, top=100),
-            _w("B", x0=0, top=120),
-        ]
-        rows = cluster_rows(words, y_tolerance=3.0)
-        assert [[w.text for w in r] for r in rows] == [["A"], ["B"]]
+        words = [_w("A", 0, 100), _w("B", 0, 120)]
+        assert [
+            [w.text for w in r] for r in cluster_rows(words)
+        ] == [["A"], ["B"]]
 
     def test_sorts_left_to_right_within_row(self):
-        words = [
-            _w("C", x0=40, top=100),
-            _w("A", x0=0, top=100),
-            _w("B", x0=20, top=100),
-        ]
-        rows = cluster_rows(words)
-        assert [w.text for w in rows[0]] == ["A", "B", "C"]
+        words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
+        assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"]
 
     def test_empty(self):
         assert cluster_rows([]) == []
 
 
-class TestAssignColumns:
-    def test_three_columns(self):
-        # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
-        row = [
-            _w("Jan", x0=10, top=0, x1=40),       # col 0
-            _w("1", x0=45, top=0, x1=55),         # col 0
-            _w("Deposit", x0=110, top=0, x1=180), # col 1
-            _w("250.00", x0=210, top=0, x1=260),  # col 2
-        ]
-        cells = assign_columns(row, [100, 200])
-        assert cells[0] == "Jan 1"
-        assert cells[1] == "Deposit"
-        assert cells[2] == "250.00"
+class TestFindDatesInWords:
+    def test_us_slash(self):
+        row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
+        assert _find_dates_in_words(row) == [(0, "01/15/2026")]
 
-    def test_no_boundaries_one_column(self):
-        row = [_w("A", 0, 0), _w("B", 20, 0)]
-        cells = assign_columns(row, [])
-        assert cells == ["A B"]
+    def test_two_digit_year(self):
+        row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
+        result = _find_dates_in_words(row)
+        assert result and result[0][1] == "01/15/26"
+
+    def test_iso(self):
+        row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
+        assert _find_dates_in_words(row) == [(0, "2026-01-15")]
+
+    def test_month_name(self):
+        row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
+        result = _find_dates_in_words(row)
+        assert result and "Jan 15" in result[0][1]
+
+    def test_no_date(self):
+        row = [_w("Just", 0, 0), _w("text", 50, 0)]
+        assert _find_dates_in_words(row) == []
 
 
-class TestPagesInRange:
-    def _mk(self, n):
-        return [Page(page_no=i + 1, width=600, height=800, text="", words=[]) for i in range(n)]
+class TestFindAmountTokens:
+    def test_currency_format(self):
+        row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
+        out = _find_amount_tokens(row)
+        assert len(out) == 1
+        assert out[0][2] == "$4.50"
 
-    def test_all(self):
-        pages = self._mk(5)
-        assert len(_pages_in_range(pages, "all")) == 5
-        assert len(_pages_in_range(pages, "")) == 5
+    def test_parens_negative(self):
+        row = [_w("(123.45)", 0, 0)]
+        out = _find_amount_tokens(row)
+        assert out and out[0][2] == "(123.45)"
 
-    def test_explicit_list(self):
-        pages = self._mk(5)
-        got = [p.page_no for p in _pages_in_range(pages, "1,3,5")]
-        assert got == [1, 3, 5]
+    def test_no_amount_on_pure_text(self):
+        row = [_w("Hello", 0, 0), _w("World", 50, 0)]
+        assert _find_amount_tokens(row) == []
 
-    def test_range(self):
-        pages = self._mk(5)
-        got = [p.page_no for p in _pages_in_range(pages, "2-4")]
-        assert got == [2, 3, 4]
-
-    def test_open_ended(self):
-        pages = self._mk(5)
-        got = [p.page_no for p in _pages_in_range(pages, "3-")]
-        assert got == [3, 4, 5]
+    def test_rejects_bare_year(self):
+        # A bare 4-digit year matches the digit pattern but lacks
+        # any money marker — should be filtered out.
+        row = [_w("2026", 0, 0)]
+        assert _find_amount_tokens(row) == []
 
 
-class TestWithinTableWindow:
-    def test_header_skipped_end_excluded(self):
-        rows = [
-            [_w("STATEMENT", 0, 0)],
-            [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)],
-            [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)],
-            [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)],
-            [_w("Closing", 0, 80), _w("balance", 50, 80)],
-            [_w("Page", 0, 100), _w("1", 50, 100)],
-        ]
-        out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])
-        # Should keep just the two transaction rows.
-        assert len(out) == 2
-        assert out[0][0].text == "01/15"
-        assert out[1][0].text == "01/16"
-
-    def test_no_header_returns_empty_when_required(self):
-        rows = [[_w("foo", 0, 0)]]
-        assert _within_table_window(rows, "Date Description Amount", []) == []
-
-    def test_blank_header_passes_through(self):
-        rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
-        assert _within_table_window(rows, "", []) == rows
-
-
-class TestApplyTemplate:
-    """End-to-end on synthetic ``Page`` objects."""
-
-    def _statement_page(self) -> Page:
-        # Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
-        words = [
-            _w("STATEMENT", 0, 0),
-            # Header
-            _w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20),
-            # Row 1
-            _w("01/15/2026", 5, 40), _w("Coffee", 105, 40),
-            _w("Shop", 140, 40), _w("(4.50)", 205, 40),
-            # Row 2
-            _w("01/16/2026", 5, 60), _w("Refund", 105, 60), _w("$12.00", 205, 60),
-            # Continuation row (no date) — should merge into row 2
-            _w("from", 105, 80), _w("vendor", 140, 80),
-            # End marker
-            _w("Closing", 5, 100), _w("balance", 105, 100), _w("$1,000.00", 205, 100),
-        ]
-        return Page(page_no=1, width=300, height=120, text="", words=words)
-
-    def _template(self) -> dict:
-        return {
-            "pages": {"range": "all"},
-            "table": {
-                "header_text": "Date Description Amount",
-                "end_markers": ["Closing balance"],
-                "column_boundaries": [100, 200],
-                "y_tolerance": 3.0,
-                "skip_rows_matching": [],
-            },
-            "columns": [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "description"},
-                {"source": 2, "target": "amount"},
-            ],
-            "parse": {
-                "date_format": "%m/%d/%Y",
-                "amount_negative_in_parens": True,
-                "merge_multiline_description": True,
-            },
-        }
-
-    def test_basic_extraction(self):
-        df = apply_template([self._statement_page()], self._template())
-        assert isinstance(df, pd.DataFrame)
-        assert len(df) == 2
-        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
-        # Parens-negative
-        assert df.iloc[0]["amount"] == -4.50
-        # Plain positive with currency strip
-        assert df.iloc[1]["amount"] == 12.00
-        # Multi-line description merged
-        assert "from vendor" in df.iloc[1]["description"]
-
-    def test_debit_credit_split_columns(self):
-        # Layout: date | description | debit | credit columns
-        page = Page(
-            page_no=1, width=400, height=80, text="",
-            words=[
-                _w("Date", 5, 0), _w("Desc", 105, 0),
-                _w("Debit", 205, 0), _w("Credit", 305, 0),
-                _w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20),
-                _w("01/16/2026", 5, 40), _w("Refund", 105, 40),
-                _w("", 205, 40),  # no debit
-                _w("12.00", 305, 40),
-            ],
-        )
-        tpl = {
-            "table": {
-                "header_text": "Date Desc Debit Credit",
-                "column_boundaries": [100, 200, 300],
-            },
-            "columns": [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "description"},
-                {"source": 2, "target": "amount_debit"},
-                {"source": 3, "target": "amount_credit"},
-            ],
-            "parse": {"date_format": "%m/%d/%Y"},
-        }
-        df = apply_template([page], tpl)
-        assert list(df["amount"]) == [-4.50, 12.00]
-        assert list(df["type"]) == ["debit", "credit"]
-
-    def test_skip_rows_matching(self):
-        page = self._statement_page()
-        tpl = self._template()
-        tpl["table"]["skip_rows_matching"] = ["Refund"]
-        df = apply_template([page], tpl)
-        # Refund row is dropped — only one transaction left
-        assert len(df) == 1
-        assert df.iloc[0]["amount"] == -4.50
-
-    def test_empty_pages_returns_empty_df(self):
-        df = apply_template([], self._template())
-        assert df.empty
+# End-to-end tests against synthetic Page objects are in the smoke
+# test module — they need ``scan_pdf_for_transactions`` which in
+# turn uses ``extract_pages_auto``. The unit-test layer here pins
+# the building blocks; smoke tests pin the wiring.
diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py
index f6c4004..f648871 100644
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -1,55 +1,43 @@
-"""End-to-end smoke tests for the PDF extraction stack.
+"""End-to-end smoke tests for the PDF transaction scanner.
 
-These tests run real ``pdfplumber`` + ``pypdfium2`` calls against
-a small PDF generated in-memory with ``fpdf2``. They exist to
-catch the failure mode the user hit on first install — a missing
-or mismatched native dependency that doesn't show up until the
-extractor actually tries to open a PDF.
+These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
+calls against a small statement-shaped PDF generated in memory
+with ``fpdf2``. They catch the failure modes most likely to bite
+an end-user installer build: missing native lib, broken hook
+bundling, pin/installed mismatch.
 
-Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py``
-covers the parsing logic on synthetic ``WordBox`` data with no
-PDF dep involved. This file is the layer above: it confirms the
-deps themselves work, that hooks bundled them correctly (the
-versions pinned in ``requirements.txt`` matter here), and that
-the extractor's pipeline survives a round-trip through real
-``pdfplumber.extract_words`` and real ``pypdfium2.render``.
-
-Generation note: ``fpdf2`` is a test-only dep listed in
+Generation note: ``fpdf2`` is a test-only dep in
 ``requirements-dev.txt``. We don't ship it.
 """
 
 from __future__ import annotations
 
-import io
-
 import pytest
 
 
 def _build_tiny_statement_pdf() -> bytes:
-    """Render a one-page PDF that looks roughly like the simplest
-    possible bank statement: a header line + three transaction
-    rows + a closing-balance footer. Word positions are stable
-    enough that the parser can identify columns by x-position."""
+    """One-page PDF: header line + three transaction rows + a
+    closing-balance footer. The scanner should pick up exactly the
+    three transactions."""
     from fpdf import FPDF
 
     pdf = FPDF(orientation="P", unit="pt", format="letter")
     pdf.add_page()
     pdf.set_font("Helvetica", size=12)
-    # Header
     pdf.set_xy(40, 50)
     pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
-    # Transaction-table header row
+    # Header row (not a transaction — no amount)
     pdf.set_xy(40, 100)
     pdf.cell(120, 14, "Date")
     pdf.set_xy(160, 100)
     pdf.cell(200, 14, "Description")
     pdf.set_xy(360, 100)
     pdf.cell(80, 14, "Amount")
-    # Three rows
+    # Three transactions
     rows = [
-        ("01/15/2026", "Coffee Shop",   "(4.50)"),
-        ("01/16/2026", "Refund Vendor", "$12.00"),
-        ("01/17/2026", "ATM Withdrawal","(40.00)"),
+        ("01/15/2026", "Coffee Shop",     "(4.50)"),
+        ("01/16/2026", "Refund Vendor",   "$12.00"),
+        ("01/17/2026", "ATM Withdrawal",  "(40.00)"),
     ]
     y = 130
     for date, desc, amt in rows:
@@ -60,7 +48,7 @@ def _build_tiny_statement_pdf() -> bytes:
         pdf.set_xy(360, y)
         pdf.cell(80, 14, amt)
         y += 20
-    # Closing-balance footer
+    # Footer — has an amount but no date on the line, so not a txn
     pdf.set_xy(40, y + 20)
     pdf.cell(0, 14, "Closing balance: $1,000.00")
     return bytes(pdf.output())
@@ -72,12 +60,8 @@ def _build_tiny_statement_pdf() -> bytes:
 
 
 class TestDependencyImports:
-    """Each runtime PDF dep must be importable.
-
-    These tests will fail fast on a stripped/broken install — most
-    valuable as a CI gate when the requirements.txt pins are
-    bumped, so we know the new pin still installs cleanly across
-    the matrix."""
+    """Each runtime PDF dep must be importable. Fails fast on a
+    stripped install or a missing CI pin."""
 
     def test_pdfplumber(self):
         import pdfplumber  # noqa: F401
@@ -85,130 +69,135 @@ class TestDependencyImports:
     def test_pypdfium2(self):
         import pypdfium2  # noqa: F401
 
-    def test_streamlit_drawable_canvas(self):
-        # Don't instantiate the canvas — that needs a Streamlit
-        # script-run context. Just confirm the module loads.
-        import streamlit_drawable_canvas  # noqa: F401
-
     def test_pytesseract(self):
-        # The Python binding must import even when the Tesseract
-        # binary isn't installed — the OCR availability check
-        # handles binary absence separately.
         import pytesseract  # noqa: F401
 
     def test_PIL(self):
-        # Transitively required by pdfplumber + pypdfium2 + canvas.
-        # Pinning explicit confirms hooks pull it through.
         from PIL import Image  # noqa: F401
 
 
 # ---------------------------------------------------------------------------
-# Real-PDF round-trip
+# End-to-end against a real PDF
 # ---------------------------------------------------------------------------
 
 
-class TestRealPdfRoundTrip:
-    """``extract_pages`` + ``apply_template`` against a real PDF."""
-
+class TestScanPdfForTransactions:
     @pytest.fixture
     def pdf_bytes(self) -> bytes:
         return _build_tiny_statement_pdf()
 
-    def test_extract_pages_returns_words(self, pdf_bytes):
-        from src.pdf_extract import extract_pages
-        pages = extract_pages(pdf_bytes)
-        assert len(pages) == 1
-        assert pages[0].width > 0 and pages[0].height > 0
-        # At minimum we should have the words from the header and
-        # one transaction row — proves pdfplumber wired up.
-        all_text = " ".join(w.text for w in pages[0].words)
-        assert "ACME" in all_text
-        assert "Coffee" in all_text
-        assert "01/15/2026" in all_text
+    def test_finds_three_transactions(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, warnings = scan_pdf_for_transactions(pdf_bytes)
+        # The PDF has 3 transactions plus a header and a closing-
+        # balance footer. Header has no amount; closing-balance has
+        # no date in the same line — neither qualifies as a txn.
+        assert len(rows) == 3, (
+            f"expected 3 rows, got {len(rows)}:\n"
+            f"{[r.get('raw') for r in rows]}"
+        )
 
-    def test_apply_template_extracts_three_rows(self, pdf_bytes):
-        from src.pdf_extract import apply_template, extract_pages
-        # The template's column boundaries are tuned to fpdf2's
-        # x-coordinates above (40 / 160 / 360 pt).
-        tpl = {
-            "pages": {"range": "all"},
-            "table": {
-                "header_text": "Date Description Amount",
-                "end_markers": ["Closing balance"],
-                "column_boundaries": [150, 350],
-                "y_tolerance": 3.0,
-            },
-            "columns": [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "description"},
-                {"source": 2, "target": "amount"},
-            ],
-            "parse": {
-                "date_format": "%m/%d/%Y",
-                "amount_negative_in_parens": True,
-                "merge_multiline_description": True,
-            },
-        }
-        pages = extract_pages(pdf_bytes)
-        df = apply_template(pages, tpl)
-        assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}"
-        assert list(df["date"]) == [
+    def test_parses_dates_to_iso(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        assert [r["date"] for r in rows] == [
             "2026-01-15", "2026-01-16", "2026-01-17",
         ]
-        # Parens-negative + currency-positive both round-trip
-        assert df.iloc[0]["amount"] == -4.50
-        assert df.iloc[1]["amount"] == 12.00
-        assert df.iloc[2]["amount"] == -40.00
+
+    def test_parses_amounts_with_signs(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        assert rows[0]["amount_1"] == -4.50
+        assert rows[1]["amount_1"] == 12.00
+        assert rows[2]["amount_1"] == -40.00
+
+    def test_preserves_raw_line(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # Raw line lets the user verify what was matched.
+        assert all("raw" in r and r["raw"] for r in rows)
+        assert "Coffee" in rows[0]["raw"]
+
+    def test_page_tagged(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        assert all(r["page"] == 1 for r in rows)
+
+    def test_negative_in_parens_off(self, pdf_bytes):
+        """With parens-negative off, the parser can't decode
+        ``(4.50)`` and falls back to the raw text — the row still
+        surfaces, just with the unparsed string in the amount slot
+        so the user can see and fix it in the editor."""
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(
+            pdf_bytes, negative_in_parens=False,
+        )
+        # Row 0 had "(4.50)" — without parens-negative, parse_amount
+        # returns None and the scanner keeps the raw token.
+        assert rows[0]["amount_1"] == "(4.50)"
+        # Row 1 had "$12.00" — still parses to positive.
+        assert rows[1]["amount_1"] == 12.00
 
 
 # ---------------------------------------------------------------------------
-# pypdfium2 rendering (powers the visual picker)
+# Multi-line description merging
 # ---------------------------------------------------------------------------
 
 
-class TestRenderPageImage:
-    """``render_page_image`` is what feeds the drawable canvas.
+class TestMultilineDescription:
+    def test_continuation_line_merges(self):
+        """A line with no date and no amount, sitting between two
+        transaction rows, attaches to the previous transaction's
+        description."""
+        from src.pdf_extract import (
+            Page,
+            WordBox,
+            scan_pdf_for_transactions,
+        )
+        # No real PDF is needed to cover the merge behavior: swap
+        # ``extract_pages_auto`` on the module for a fake that returns
+        # one synthetic page, then run the public scanner against it.
+        # The original function is restored in the ``finally`` below.
+        from src import pdf_extract as mod
 
-    Catches the most common installer-bug: native PDFium .dll/.so
-    missing from the bundle. If this test crashes with a
-    ``FileNotFoundError`` it almost always means the
-    ``hook-pypdfium2.py`` didn't pick up the shared lib."""
+        original = mod.extract_pages_auto
 
-    def test_renders_a_real_pil_image(self):
-        from src.pdf_extract import render_page_image
-        pdf_bytes = _build_tiny_statement_pdf()
-        image, scale = render_page_image(pdf_bytes, page_no=1)
-        # Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide.
-        assert image.width > 800
-        assert image.height > 800
-        assert scale > 0
-        # PIL Image is duck-typed; check the attrs we depend on.
-        assert hasattr(image, "save")
-        assert hasattr(image, "tobytes")
+        def fake(_pdf_bytes, *, allow_ocr=True):
+            words = [
+                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
+                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
+                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
+                # Continuation: no date, no amount
+                WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
+                WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
+                # Next transaction
+                WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
+                WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
+                WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
+            ]
+            return [Page(
+                page_no=1, width=300, height=100, text="", words=words,
+            )], []
 
-    def test_invalid_page_number_clamps(self):
-        from src.pdf_extract import render_page_image
-        pdf_bytes = _build_tiny_statement_pdf()
-        # PDF has 1 page; page_no=99 should clamp, not raise.
-        image, scale = render_page_image(pdf_bytes, page_no=99)
-        assert image.width > 0
+        mod.extract_pages_auto = fake
+        try:
+            rows, _ = scan_pdf_for_transactions(b"")
+        finally:
+            mod.extract_pages_auto = original
+
+        assert len(rows) == 2
+        assert "Vendor memo" in rows[0]["description"]
+        assert rows[1]["description"] == "Other"
 
 
 # ---------------------------------------------------------------------------
-# Graceful-fallback behavior
+# Graceful fallback when deps absent
 # ---------------------------------------------------------------------------
 
 
 class TestPdfDependencyMissing:
-    """The page should see a clean exception when a dep is absent,
-    not a raw ``ImportError`` that leaks into the Streamlit traceback."""
-
     def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
         from src import pdf_extract
-        # Simulate "pdfplumber not installed" without uninstalling.
-        # ``_require_pdfplumber`` does its own ``import pdfplumber``
-        # at call time; patch ``__import__`` to throw for that one
-        # name only.
         import builtins
         real_import = builtins.__import__
 
@@ -218,10 +207,10 @@ class TestPdfDependencyMissing:
             return real_import(name, *a, **kw)
 
         monkeypatch.setattr(builtins, "__import__", fake_import)
-        with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info:
+        with pytest.raises(pdf_extract.PdfDependencyMissing) as exc:
             pdf_extract._require_pdfplumber()
-        assert "pdfplumber" in str(exc_info.value)
-        assert exc_info.value.hint  # actionable hint must be populated
+        assert "pdfplumber" in str(exc.value)
+        assert exc.value.hint
 
     def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
         from src import pdf_extract
@@ -239,17 +228,13 @@ class TestPdfDependencyMissing:
 
 
 # ---------------------------------------------------------------------------
-# Requirements-pin consistency
+# Requirements pin consistency
 # ---------------------------------------------------------------------------
 
 
 class TestPinnedVersionsMatchInstalled:
     """If someone bumps the pin in ``requirements.txt`` without
-    actually reinstalling, this test points it out before CI does.
-
-    Uses ``importlib.metadata`` rather than each library's
-    ``__version__`` attribute because not every PDF dep exposes
-    one (``pypdfium2`` keeps version info on a submodule)."""
+    actually reinstalling, this test points it out before CI does."""
 
     def _parse_pins(self) -> dict[str, str]:
         from pathlib import Path
@@ -266,21 +251,17 @@ class TestPinnedVersionsMatchInstalled:
                 pins[name.strip()] = version.strip()
         return pins
 
-    def _installed(self, dist_name: str) -> str:
-        import importlib.metadata as md
-        return md.version(dist_name)
-
     @pytest.mark.parametrize("dist_name", [
         "pdfplumber",
         "pypdfium2",
         "pytesseract",
-        "streamlit-drawable-canvas",
     ])
     def test_pin_matches_installed(self, dist_name):
+        import importlib.metadata as md
         pins = self._parse_pins()
         if dist_name not in pins:
             pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
-        installed = self._installed(dist_name)
+        installed = md.version(dist_name)
         assert installed == pins[dist_name], (
             f"installed {dist_name}=={installed} but requirements.txt "
             f"pins {pins[dist_name]} — bump the pin, or reinstall."
@@ -288,79 +269,52 @@ class TestPinnedVersionsMatchInstalled:
 
 
 # ---------------------------------------------------------------------------
-# OCR availability runtime probe
+# OCR availability
 # ---------------------------------------------------------------------------
 
 
 class TestOcrAvailability:
-    """``ocr_available`` is the linchpin of the UI's OCR banner.
-    Returns ``(bool, str)`` — both branches must round-trip."""
-
     def test_returns_a_tuple(self):
         from src.pdf_extract import ocr_available
         result = ocr_available()
-        assert isinstance(result, tuple)
-        assert len(result) == 2
+        assert isinstance(result, tuple) and len(result) == 2
         ok, reason = result
         assert isinstance(ok, bool)
         assert isinstance(reason, str)
 
     def test_extract_pages_auto_skips_ocr_when_disabled(self):
         from src.pdf_extract import extract_pages_auto
-        # With allow_ocr=False, no OCR even if pages are blank.
         pdf_bytes = _build_tiny_statement_pdf()
         pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False)
         assert len(pages) == 1
-        # No OCR-disabled warning on a text PDF, since pages have text.
         assert not any("OCR is disabled" in w for w in warnings)
 
 
 class TestTesseractDiscovery:
-    """Windows install paths + env-var override are how a real user
-    (no PATH munging) gets OCR working. Cover the discovery logic
-    even on Linux/macOS test runners by mocking out the OS check
-    and ``Path.exists``."""
-
     def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
         from src import pdf_extract
-        monkeypatch.setattr(
-            "platform.system",
-            lambda: "Linux",
-        )
+        monkeypatch.setattr("platform.system", lambda: "Linux")
         assert pdf_extract._autodetect_tesseract_path() is None
 
     def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
         from src import pdf_extract
         monkeypatch.setattr("platform.system", lambda: "Windows")
-
         target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
 
         def fake_exists(self):
             return str(self) == target
 
-        monkeypatch.setattr(
-            "pathlib.Path.exists",
-            fake_exists,
-        )
+        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
         assert pdf_extract._autodetect_tesseract_path() == target
 
-    def test_autodetect_returns_none_when_nothing_installed(
-        self, monkeypatch,
-    ):
+    def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
         from src import pdf_extract
         monkeypatch.setattr("platform.system", lambda: "Windows")
         monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
         assert pdf_extract._autodetect_tesseract_path() is None
 
     def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
-        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
-        portable install at a non-default path works without
-        relying on PATH."""
         from src import pdf_extract
-        # Point the override at a path that doesn't exist —
-        # ocr_available will try it and report the failure, but
-        # importantly the cmd attribute is set BEFORE the call,
-        # which is what we're verifying.
         fake_bin = str(tmp_path / "fake-tesseract.exe")
         monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
         pdf_extract.ocr_available()
diff --git a/tests/test_pdf_row_heuristic.py b/tests/test_pdf_row_heuristic.py
deleted file mode 100644
index c9f06b9..0000000
--- a/tests/test_pdf_row_heuristic.py
+++ /dev/null
@@ -1,280 +0,0 @@
-"""Tests for the row-heuristic extraction pipeline.
-
-This is now the primary extraction mode — uses date + amount
-pattern matching to find transaction lines, with no dependency
-on x-position column boundaries. Robust to layout drift across
-statements from the same bank.
-
-The legacy column-visual pipeline keeps its own tests in
-``test_pdf_extract.py``.
-"""
-
-from __future__ import annotations
-
-import pandas as pd
-
-from src.pdf_extract import (
-    Page,
-    WordBox,
-    apply_template,
-    apply_template_row_heuristic,
-    find_transaction_rows,
-    _find_amount_tokens,
-    _find_dates_in_words,
-    _infer_amount_column_centers,
-)
-
-
-def _w(text: str, x0: float, top: float) -> WordBox:
-    return WordBox(
-        x0=x0,
-        top=top,
-        x1=x0 + 8 * len(text),
-        bottom=top + 10,
-        text=text,
-    )
-
-
-class TestFindDatesInRow:
-    def test_us_slash(self):
-        row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
-        assert _find_dates_in_words(row) == [(0, "01/15/2026")]
-
-    def test_two_digit_year(self):
-        row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
-        result = _find_dates_in_words(row)
-        assert result and result[0][1] == "01/15/26"
-
-    def test_iso(self):
-        row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
-        assert _find_dates_in_words(row) == [(0, "2026-01-15")]
-
-    def test_month_name(self):
-        # "Jan 15, 2026" — three word tokens, should stitch.
-        row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
-        result = _find_dates_in_words(row)
-        assert result, "Multi-word month-day-year should match"
-        assert "Jan 15" in result[0][1]
-
-    def test_no_date(self):
-        row = [_w("Just", 0, 0), _w("text", 50, 0)]
-        assert _find_dates_in_words(row) == []
-
-
-class TestFindAmountTokens:
-    def test_currency_format(self):
-        row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
-        out = _find_amount_tokens(row)
-        assert len(out) == 1
-        assert out[0][2] == "$4.50"
-
-    def test_parens_negative(self):
-        row = [_w("(123.45)", 0, 0)]
-        out = _find_amount_tokens(row)
-        assert out and out[0][2] == "(123.45)"
-
-    def test_no_amount_on_pure_text(self):
-        row = [_w("Hello", 0, 0), _w("World", 50, 0)]
-        assert _find_amount_tokens(row) == []
-
-    def test_rejects_bare_year(self):
-        # "2026" matches the digit pattern but lacks $/decimal/etc.,
-        # so the looks-like-amount filter should drop it.
-        row = [_w("2026", 0, 0)]
-        # Bare integer can pass the regex but not the heuristic.
-        out = _find_amount_tokens(row)
-        # Either filtered out OR included — both are defensible.
-        # If included, it'd be missed-amount territory not a false-
-        # positive. Pin the conservative behavior: NO match.
-        assert out == [], "Bare 4-digit year should not register as amount"
-
-
-class TestInferAmountColumnCenters:
-    def test_two_clear_columns(self):
-        # 5 rows, each with two amounts at roughly x=300 and x=450.
-        rows = []
-        for top in range(0, 100, 20):
-            rows.append([
-                _w("01/15/2026", 20, top),
-                _w("Item", 100, top),
-                _w("$10.00", 300, top),
-                _w("$1,000.00", 450, top),
-            ])
-        centers = _infer_amount_column_centers(
-            rows, expected=2, min_amounts=2, max_amounts=2,
-        )
-        assert len(centers) == 2
-        # Left center ≈ 300 + 8*len("$10.00")/2 = 300+24 = 324
-        assert 310 < centers[0] < 340
-        assert 460 < centers[1] < 490
-
-    def test_no_transactions_returns_empty(self):
-        rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
-        assert _infer_amount_column_centers(
-            rows, expected=2, min_amounts=1, max_amounts=3,
-        ) == []
-
-
-class TestRowHeuristicEndToEnd:
-    """Synthetic ``Page`` objects exercise the full row-heuristic
-    pipeline end-to-end without a real PDF."""
-
-    def _page_single_amount(self) -> Page:
-        words = [
-            _w("ACME BANK STATEMENT", 20, 0),
-            _w("01/15/2026", 20, 30), _w("Coffee", 100, 30),
-            _w("Shop", 150, 30), _w("$4.50", 400, 30),
-            _w("01/16/2026", 20, 50), _w("Refund", 100, 50),
-            _w("from", 100, 70), _w("vendor", 140, 70),  # continuation
-            _w("Vendor", 140, 50), _w("$12.00", 400, 50),
-            _w("Page", 20, 90), _w("1", 60, 90),  # not a txn
-        ]
-        return Page(page_no=1, width=600, height=120, text="", words=words)
-
-    def test_extracts_two_rows_single_amount(self):
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {
-                "min_amounts_per_row": 1,
-                "max_amounts_per_row": 1,
-                "merge_multiline_description": True,
-            },
-            "amounts": {"shape": "single", "negative_in_parens": True},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df = apply_template_row_heuristic([self._page_single_amount()], tpl)
-        assert len(df) == 2
-        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
-        # Multi-line description merged
-        assert "from vendor" in df.iloc[1]["description"]
-
-    def test_dispatches_through_apply_template(self):
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
-            "amounts": {"shape": "single"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df = apply_template([self._page_single_amount()], tpl)
-        assert isinstance(df, pd.DataFrame)
-        assert len(df) == 2
-
-    def test_txn_balance_shape(self):
-        page = Page(
-            page_no=1, width=600, height=100, text="", words=[
-                _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
-                _w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
-                _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
-                _w("12.00", 300, 20), _w("1,012.00", 450, 20),
-            ],
-        )
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 2},
-            "amounts": {"shape": "txn_balance", "negative_in_parens": True},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df = apply_template([page], tpl)
-        assert len(df) == 2
-        assert df.iloc[0]["amount"] == -4.50
-        assert df.iloc[0]["balance"] == 1000.00
-        assert df.iloc[1]["amount"] == 12.00
-        assert df.iloc[1]["balance"] == 1012.00
-
-    def test_debit_credit_balance_shape(self):
-        page = Page(
-            page_no=1, width=600, height=100, text="", words=[
-                _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
-                _w("4.50", 300, 0), _w("1,000.00", 450, 0),
-                _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
-                _w("12.00", 380, 20), _w("1,012.00", 450, 20),
-            ],
-        )
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 3},
-            "amounts": {"shape": "debit_credit_balance"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df = apply_template([page], tpl)
-        assert len(df) == 2
-        # Row 0: amount at x=300 (debit column) → debit, balance at 450
-        assert df.iloc[0]["amount"] == -4.50
-        assert df.iloc[0]["type"] == "debit"
-        # Row 1: amount at x=380 (credit column) → credit, balance at 450
-        assert df.iloc[1]["amount"] == 12.00
-        assert df.iloc[1]["type"] == "credit"
-
-    def test_skip_rows_matching(self):
-        page = self._page_single_amount()
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {
-                "min_amounts_per_row": 1,
-                "max_amounts_per_row": 1,
-                "skip_rows_matching": ["Refund"],
-            },
-            "amounts": {"shape": "single"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df = apply_template_row_heuristic([page], tpl)
-        assert len(df) == 1
-        assert df.iloc[0]["date"] == "2026-01-15"
-
-    def test_layout_drift_doesnt_matter(self):
-        """The whole point of row-heuristic: same template works
-        on pages of different sizes / different column x-positions."""
-        # Page A: amounts at x=400
-        page_a = Page(
-            page_no=1, width=600, height=80, text="", words=[
-                _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
-                _w("$4.50", 400, 0),
-            ],
-        )
-        # Page B: amounts shifted to x=520 (different layout)
-        page_b = Page(
-            page_no=1, width=720, height=80, text="", words=[
-                _w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
-                _w("$4.50", 520, 0),
-            ],
-        )
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
-            "amounts": {"shape": "single"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        df_a = apply_template([page_a], tpl)
-        df_b = apply_template([page_b], tpl)
-        # Both should extract — proves no coordinate dependency.
-        assert len(df_a) == 1
-        assert len(df_b) == 1
-        assert df_a.iloc[0]["amount"] == df_b.iloc[0]["amount"] == 4.50
-
-
-class TestFindTransactionRows:
-    """The pre-DataFrame stage — returns dict records the build UI
-    uses to render a preview before the user commits."""
-
-    def test_returns_records(self):
-        page = Page(
-            page_no=1, width=600, height=80, text="", words=[
-                _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
-                _w("$4.50", 400, 0),
-            ],
-        )
-        tpl = {
-            "mode": "row_heuristic",
-            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
-            "amounts": {"shape": "single"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-        rows = find_transaction_rows([page], tpl)
-        assert len(rows) == 1
-        r = rows[0]
-        assert r["date"] == "2026-01-15"
-        assert r["description"] == "Coffee"
-        assert r["amount"] == 4.50
-        assert r["_page"] == 1
-        # Raw line is preserved so the GUI can show "what we saw"
-        assert "_raw_line" in r
diff --git a/tests/test_pdf_templates.py b/tests/test_pdf_templates.py
deleted file mode 100644
index 551dab6..0000000
--- a/tests/test_pdf_templates.py
+++ /dev/null
@@ -1,316 +0,0 @@
-"""Tests for the PDF template storage layer."""
-
-from __future__ import annotations
-
-import json
-
-import pytest
-
-from src.pdf_templates import (
-    SCHEMA_VERSION,
-    delete_template,
-    list_templates,
-    load_template,
-    new_template,
-    save_template,
-    slugify,
-    template_from_json,
-    template_path,
-    templates_dir,
-    template_to_json,
-    validate_template,
-)
-
-
-@pytest.fixture
-def isolated_templates(monkeypatch, tmp_path):
-    """Redirect the templates directory into ``tmp_path``."""
-    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path))
-    yield tmp_path
-
-
-class TestSlugify:
-    def test_basic(self):
-        assert slugify("Chase Personal Checking") == "chase-personal-checking"
-
-    def test_strips_punctuation(self):
-        assert slugify("BofA: Business (USD)") == "bofa-business-usd"
-
-    def test_empty_falls_back(self):
-        assert slugify("") == "untitled"
-        assert slugify("   ") == "untitled"
-
-
-class TestNewTemplate:
-    def test_has_schema_version(self):
-        t = new_template("Sample")
-        assert t["schema_version"] == SCHEMA_VERSION
-
-    def test_slug_derived_from_name(self):
-        t = new_template("Sample Bank")
-        assert t["slug"] == "sample-bank"
-        assert t["name"] == "Sample Bank"
-
-    def test_timestamps_present(self):
-        t = new_template("X")
-        assert t["created_at"]
-        assert t["updated_at"]
-
-
-class TestValidateTemplateRowHeuristic:
-    """Row-heuristic mode is the v2 default."""
-
-    def _valid(self) -> dict:
-        return {
-            "schema_version": SCHEMA_VERSION,
-            "slug": "x",
-            "name": "X",
-            "mode": "row_heuristic",
-            "row_detection": {
-                "min_amounts_per_row": 1,
-                "max_amounts_per_row": 3,
-            },
-            "amounts": {"shape": "single"},
-            "date": {"format": "%m/%d/%Y"},
-        }
-
-    def test_valid_passes(self):
-        ok, errs = validate_template(self._valid())
-        assert ok, errs
-
-    def test_missing_name_fails(self):
-        t = self._valid()
-        t["name"] = ""
-        ok, errs = validate_template(t)
-        assert not ok
-
-    def test_bad_mode_fails(self):
-        t = self._valid()
-        t["mode"] = "magic"
-        ok, errs = validate_template(t)
-        assert not ok
-        assert any("mode" in e for e in errs)
-
-    def test_bad_shape_fails(self):
-        t = self._valid()
-        t["amounts"]["shape"] = "telepathic"
-        ok, errs = validate_template(t)
-        assert not ok
-        assert any("shape" in e for e in errs)
-
-    def test_inverted_amount_range_fails(self):
-        t = self._valid()
-        t["row_detection"]["min_amounts_per_row"] = 5
-        t["row_detection"]["max_amounts_per_row"] = 2
-        ok, errs = validate_template(t)
-        assert not ok
-
-    def test_does_not_require_columns_in_row_mode(self):
-        """Key point: row mode doesn't need ``columns`` populated.
-        That's what makes the GUI's primary path simpler than v1."""
-        t = self._valid()
-        # No columns key at all.
-        ok, errs = validate_template(t)
-        assert ok, errs
-
-
-class TestValidateTemplateColumnVisual:
-    """Legacy column-visual mode keeps its own contract."""
-
-    def _valid(self) -> dict:
-        return {
-            "schema_version": SCHEMA_VERSION,
-            "slug": "x",
-            "name": "X",
-            "mode": "column_visual",
-            "pages": {"range": "all"},
-            "table": {"column_boundaries": [100, 200]},
-            "columns": [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "description"},
-                {"source": 2, "target": "amount"},
-            ],
-            "parse": {},
-        }
-
-    def test_valid_passes(self):
-        ok, errs = validate_template(self._valid())
-        assert ok, errs
-
-    def test_requires_date_column(self):
-        t = self._valid()
-        t["columns"] = [
-            {"source": 0, "target": "description"},
-            {"source": 1, "target": "amount"},
-        ]
-        ok, errs = validate_template(t)
-        assert not ok
-        assert any("date" in e for e in errs)
-
-    def test_requires_amount_or_debit_credit(self):
-        t = self._valid()
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "description"},
-        ]
-        ok, errs = validate_template(t)
-        assert not ok
-        assert any("amount" in e for e in errs)
-
-    def test_debit_credit_pair_is_valid(self):
-        t = self._valid()
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "description"},
-            {"source": 2, "target": "amount_debit"},
-            {"source": 3, "target": "amount_credit"},
-        ]
-        t["table"]["column_boundaries"] = [100, 200, 300]
-        ok, errs = validate_template(t)
-        assert ok, errs
-
-
-class TestV1Migration:
-    """v1 templates load with mode='column_visual' auto-injected;
-    the file on disk stays v1 until the user re-saves."""
-
-    def test_loads_v1_template(self, isolated_templates, tmp_path):
-        import json
-        v1_payload = {
-            "schema_version": 1,
-            "slug": "legacy",
-            "name": "Legacy Bank",
-            "pages": {"range": "all"},
-            "table": {"column_boundaries": [100, 200]},
-            "columns": [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "description"},
-                {"source": 2, "target": "amount"},
-            ],
-            "parse": {},
-        }
-        (tmp_path / "legacy.json").write_text(
-            json.dumps(v1_payload), encoding="utf-8",
-        )
-        loaded = load_template("legacy")
-        # In-memory migration adds mode + bumps schema_version
-        assert loaded["mode"] == "column_visual"
-        assert loaded["schema_version"] == SCHEMA_VERSION
-        # Original keys still intact
-        assert loaded["columns"][0]["target"] == "date"
-
-
-class TestPersistence:
-    def test_round_trip(self, isolated_templates):
-        t = new_template("Round Trip Bank")
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "description"},
-            {"source": 2, "target": "amount"},
-        ]
-        t["table"]["column_boundaries"] = [100, 200]
-        slug = save_template(t)
-        assert slug == "round-trip-bank"
-
-        path = template_path(slug)
-        assert path.exists()
-        loaded = load_template(slug)
-        assert loaded["name"] == "Round Trip Bank"
-        assert loaded["columns"][0]["target"] == "date"
-
-    def test_save_rejects_invalid(self, isolated_templates):
-        with pytest.raises(ValueError):
-            save_template({"schema_version": 1, "name": ""})
-
-    def test_load_missing_raises(self, isolated_templates):
-        with pytest.raises(FileNotFoundError):
-            load_template("does-not-exist")
-
-    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
-        bad = tmp_path / "bad.json"
-        bad.write_text("not json", encoding="utf-8")
-        with pytest.raises(ValueError):
-            load_template("bad")
-
-    def test_delete(self, isolated_templates):
-        t = new_template("To Delete")
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "amount"},
-        ]
-        t["table"]["column_boundaries"] = [100]
-        save_template(t)
-        assert delete_template("to-delete") is True
-        assert delete_template("to-delete") is False
-
-    def test_list_returns_summaries(self, isolated_templates):
-        for name in ["Alpha", "Bravo"]:
-            t = new_template(name)
-            t["columns"] = [
-                {"source": 0, "target": "date"},
-                {"source": 1, "target": "amount"},
-            ]
-            t["table"]["column_boundaries"] = [100]
-            save_template(t)
-        rows = list_templates()
-        assert {r["slug"] for r in rows} == {"alpha", "bravo"}
-
-    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
-        (tmp_path / "broken.json").write_text("nope", encoding="utf-8")
-        # Even with a broken file present, list still returns []
-        rows = list_templates()
-        assert rows == []
-
-    def test_atomic_save_no_partial_file_on_failure(
-        self, isolated_templates, monkeypatch
-    ):
-        """If the write step fails mid-way, no half-written JSON survives
-        at the target path. Tests the temp-file-rename safety pattern."""
-        t = new_template("Atomic")
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "amount"},
-        ]
-        t["table"]["column_boundaries"] = [100]
-
-        # Make json.dumps blow up to simulate a failure during write.
-        # save_template already validated before this step, so the
-        # crash is "after validation, during write".
-        import src.pdf_templates as mod
-        original_dumps = mod.json.dumps
-
-        def boom(*a, **kw):
-            raise IOError("disk full")
-
-        monkeypatch.setattr(mod.json, "dumps", boom)
-        with pytest.raises(IOError):
-            save_template(t)
-        monkeypatch.setattr(mod.json, "dumps", original_dumps)
-
-        assert not template_path("atomic").exists()
-
-
-class TestImportExport:
-    def test_round_trip_via_json(self):
-        t = new_template("Exported")
-        t["columns"] = [
-            {"source": 0, "target": "date"},
-            {"source": 1, "target": "amount"},
-        ]
-        payload = template_to_json(t)
-        loaded = template_from_json(payload)
-        assert loaded["name"] == "Exported"
-
-    def test_import_rejects_bad_schema(self):
-        bad = json.dumps({"schema_version": 999, "name": "X"})
-        with pytest.raises(ValueError):
-            template_from_json(bad)
-
-    def test_import_rejects_non_object(self):
-        with pytest.raises(ValueError):
-            template_from_json('["not", "an", "object"]')
-
-
-def test_templates_dir_env_override(monkeypatch, tmp_path):
-    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path))
-    assert templates_dir() == tmp_path