feat(pdf): UI rework — Auto-detect is the default build flow

Pulls the user's primary mental model away from "draw column boundaries" toward "tell me what shape your amounts have, see detected rows, save." The visual picker that wasn't working for multi-statement workflows is still reachable but is no longer the default.

**Build mode header** now has a mode radio:

- "Auto-detect (recommended)" — row_heuristic. Tabs: Amount layout · Filters & date · Save. Three small forms; no coordinate UI anywhere. The Amount-layout tab's dropdown picks one of single / txn+balance / debit+credit / debit+credit+balance and auto-derives the min/max amount-count range (overridable under an expander).
- "Visual columns (advanced)" — column_visual. Five tabs (the original Visual picker / Pages & table / Columns / Parsing / Save). A yellow warning panel up top reminds the user that column-x templates only work when the statement layout is stable.

Switching modes triggers a rerun so the right tab set renders immediately. The template object preserves both modes' config trees side by side so a user can flip between them without losing work.

**Live preview** below the form runs ``apply_template`` against the cached sample pages (already cached in session_state, so this re-renders cheaply on every form edit). The "no rows yet" message is mode-aware — it points users at the right tuning knobs for whichever mode they're in. The preview caption notes which mode produced the rows so the user can correlate decisions to output.

The visual picker bug the user reported — "a single box stays in the same location regardless of page" — is sidestepped rather than fixed: in row_heuristic mode there's no canvas to confuse, and for the rare column_visual user the canvas is still imperfect but no longer their first interaction with the tool. Cleaning up the column_visual canvas state bugs is a separate follow-up if real users still hit the Advanced mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 23:46:27 +00:00
parent 48cd9e8249
commit 60969c0770
1 changed files with 296 additions and 96 deletions
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -663,123 +663,205 @@ def _harvest_canvas(canvas_state, scale: float):


 def _render_build_form(tpl: dict) -> None:
-    """Render every editable field on the template, in tabs."""
-    t0, t1, t2, t3, t4 = st.tabs(
-        ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"]
+    """Render the name field, the mode radio, and the selected editor.
+
+    A missing ``tpl["mode"]`` is treated as ``row_heuristic``. An
+    unrecognised stored value is shown as ``row_heuristic`` and replaced
+    on ``tpl`` rather than raising ``ValueError`` in the radio's index
+    lookup. Picking the other mode stores it on ``tpl`` and reruns.
+    """
+    # Header: name + mode switcher
+    modes = ["row_heuristic", "column_visual"]
+    c_name, c_mode = st.columns([3, 2])
+    with c_name:
+        tpl["name"] = st.text_input(
+            "Template name",
+            value=tpl.get("name", ""),
+            help="What this source is called, e.g. 'Chase Personal Checking'.",
+        )
+        tpl["slug"] = slugify(tpl["name"])
+    with c_mode:
+        stored_mode = tpl.get("mode", "row_heuristic")
+        new_mode = st.radio(
+            "Detection mode",
+            modes,
+            index=modes.index(stored_mode) if stored_mode in modes else 0,
+            format_func=lambda m: {
+                "row_heuristic": "Auto-detect (recommended)",
+                "column_visual": "Visual columns (advanced)",
+            }.get(m, m),
+            help=(
+                "Auto-detect finds rows by date+amount patterns — no "
+                "coordinates needed; survives layout changes between "
+                "statements. Visual columns uses x-position boundaries "
+                "you draw — useful only when auto-detect fails to find "
+                "the table."
+            ),
+            horizontal=False,
+        )
+        if new_mode != stored_mode:
+            tpl["mode"] = new_mode
+            st.rerun()
+
+    if tpl.get("mode", "row_heuristic") == "row_heuristic":
+        _render_build_form_row_heuristic(tpl)
+    else:
+        _render_build_form_column_visual(tpl)
+
+
+def _render_build_form_row_heuristic(tpl: dict) -> None:
+    """Simple form for the row-heuristic mode."""
+    tab_amount, tab_filters, tab_save = st.tabs(
+        ["Amount layout", "Filters & date", "Save"]
    )

-    with t0:
-        _render_visual_picker(tpl)
+    tpl.setdefault("row_detection", {})
+    tpl.setdefault("amounts", {})
+    tpl.setdefault("date", {})
+    tpl.setdefault("pages", {})
+
+    with tab_amount:
+        st.caption(
+            "Tell us how many amount columns each transaction row has, "
+            "and how negatives are written. The detector handles the "
+            "rest — no x-positions needed."
+        )
+        shape_labels = {
+            "single": "One amount per row (sign in the number)",
+            "txn_balance": "Two amounts: transaction + running balance",
+            "debit_credit": "Two columns: separate debit and credit",
+            "debit_credit_balance": "Three: debit, credit, balance",
+        }
+        current_shape = tpl["amounts"].get("shape", "single")
+        shape = st.selectbox(
+            "Amount layout",
+            list(shape_labels.keys()),
+            index=list(shape_labels.keys()).index(
+                current_shape if current_shape in shape_labels else "single"
+            ),
+            format_func=lambda s: shape_labels[s],
+        )
+        tpl["amounts"]["shape"] = shape

-    with t1:
        c1, c2 = st.columns(2)
        with c1:
-            tpl["name"] = st.text_input("Template name", value=tpl.get("name", ""))
-            tpl["slug"] = slugify(tpl["name"])
-            tpl["notes"] = st.text_area("Notes", value=tpl.get("notes", ""), height=70)
+            tpl["amounts"]["negative_in_parens"] = st.checkbox(
+                "Parens (4.50) = negative",
+                value=bool(tpl["amounts"].get("negative_in_parens", True)),
+            )
+            tpl["amounts"]["currency_strip"] = st.text_input(
+                "Currency symbols to strip",
+                value=tpl["amounts"].get("currency_strip", "$"),
+                max_chars=4,
+            )
+        with c2:
+            tpl["amounts"]["decimal_separator"] = st.text_input(
+                "Decimal separator",
+                value=tpl["amounts"].get("decimal_separator", "."),
+                max_chars=1,
+            )
+            tpl["amounts"]["thousands_separator"] = st.text_input(
+                "Thousands separator",
+                value=tpl["amounts"].get("thousands_separator", ","),
+                max_chars=1,
+            )
+
+        # Auto-derive min/max amounts from the chosen shape unless
+        # user has set non-default values explicitly.
+        shape_to_min_max = {
+            "single": (1, 1),
+            "txn_balance": (2, 2),
+            "debit_credit": (1, 2),
+            "debit_credit_balance": (2, 3),
+        }
+        cur_min = tpl["row_detection"].get("min_amounts_per_row")
+        cur_max = tpl["row_detection"].get("max_amounts_per_row")
+        derived_min, derived_max = shape_to_min_max.get(shape, (1, 3))
+        if cur_min is None or cur_max is None:
+            tpl["row_detection"]["min_amounts_per_row"] = derived_min
+            tpl["row_detection"]["max_amounts_per_row"] = derived_max
+
+        with st.expander("Advanced: tune amount-count range", expanded=False):
+            tpl["row_detection"]["min_amounts_per_row"] = st.number_input(
+                "Minimum amounts per transaction row",
+                min_value=1, max_value=10,
+                value=int(tpl["row_detection"].get("min_amounts_per_row", derived_min)),
+                step=1,
+            )
+            tpl["row_detection"]["max_amounts_per_row"] = st.number_input(
+                "Maximum amounts per transaction row",
+                min_value=1, max_value=10,
+                value=int(tpl["row_detection"].get("max_amounts_per_row", derived_max)),
+                step=1,
+            )
+
+    with tab_filters:
+        c1, c2 = st.columns(2)
+        with c1:
+            tpl["date"]["format"] = st.text_input(
+                "Date format",
+                value=tpl["date"].get("format", "%m/%d/%Y"),
+                help=(
+                    "Python strftime format. Common: %m/%d/%Y (US), "
+                    "%d/%m/%Y (EU), %Y-%m-%d (ISO). Leave default to "
+                    "try common formats automatically."
+                ),
+            )
            tpl["pages"]["range"] = st.text_input(
                "Pages",
                value=tpl["pages"].get("range", "all"),
                help='"all", "1-3", "2,4", "3-" all work.',
            )
-            tpl["pages"]["skip_matching"] = st.text_input(
-                "Skip pages matching (regex, optional)",
-                value=tpl["pages"].get("skip_matching", ""),
-                help='e.g. "Page \\d+ of" to skip cover pages.',
-            )
        with c2:
-            tpl["table"]["header_text"] = st.text_input(
-                "Header text (transactions table)",
-                value=tpl["table"].get("header_text", ""),
+            tpl["row_detection"]["merge_multiline_description"] = st.checkbox(
+                "Merge multi-line descriptions",
+                value=bool(
+                    tpl["row_detection"].get("merge_multiline_description", True)
+                ),
                help=(
-                    "Words from the header row of the transactions table, "
-                    "e.g. \"Date Description Amount Balance\". Extraction "
-                    "starts on the row AFTER this match."
+                    "Lines without a date attach to the previous "
+                    "row's description — handles wrapped vendor names."
                ),
            )
-            ends = "\n".join(tpl["table"].get("end_markers") or [])
-            new_ends = st.text_area(
-                "End markers (one regex per line)",
-                value=ends,
-                help='e.g. "Closing balance", "Page \\d+ of".',
-                height=80,
-            )
-            tpl["table"]["end_markers"] = [
-                line.strip() for line in new_ends.splitlines() if line.strip()
-            ]
-            skips = "\n".join(tpl["table"].get("skip_rows_matching") or [])
-            new_skips = st.text_area(
-                "Skip rows matching (one regex per line, optional)",
-                value=skips,
-                help='Common entries: "Total", "Subtotal", "^Page ".',
-                height=80,
-            )
-            tpl["table"]["skip_rows_matching"] = [
-                line.strip() for line in new_skips.splitlines() if line.strip()
-            ]
-            tpl["table"]["y_tolerance"] = st.number_input(
+            tpl["row_detection"]["y_tolerance"] = st.number_input(
                "Row y-tolerance (pts)",
                min_value=0.5,
                max_value=20.0,
-                value=float(tpl["table"].get("y_tolerance", 3.0)),
+                value=float(tpl["row_detection"].get("y_tolerance", 3.0)),
                step=0.5,
                help=(
-                    "How close two words' y-positions must be to be on the "
-                    "same row. Bump up if rows are getting split, down if "
-                    "rows are merging."
+                    "How close two words' y-positions must be to be on "
+                    "the same row. Adjust if rows are splitting or merging."
                ),
            )

-    with t2:
-        _render_columns_editor(tpl)
+        skips = "\n".join(tpl["row_detection"].get("skip_rows_matching") or [])
+        new_skips = st.text_area(
+            "Skip rows matching (one regex per line, optional)",
+            value=skips,
+            help=(
+                "Lines whose text matches any of these regexes are "
+                'excluded. Common: "Total", "Subtotal", "^Page ".'
+            ),
+            height=80,
+        )
+        tpl["row_detection"]["skip_rows_matching"] = [
+            line.strip() for line in new_skips.splitlines() if line.strip()
+        ]

-    with t3:
-        c1, c2 = st.columns(2)
-        with c1:
-            tpl["parse"]["date_format"] = st.text_input(
-                "Date format",
-                value=tpl["parse"].get("date_format", "%m/%d/%Y"),
-                help=(
-                    "Python strftime format. Common: %m/%d/%Y (US), "
-                    "%d/%m/%Y (EU), %Y-%m-%d (ISO)."
-                ),
-            )
-            tpl["parse"]["currency_strip"] = st.text_input(
-                "Currency symbols to strip",
-                value=tpl["parse"].get("currency_strip", "$"),
-            )
-            tpl["parse"]["decimal_separator"] = st.text_input(
-                "Decimal separator",
-                value=tpl["parse"].get("decimal_separator", "."),
-                max_chars=1,
-            )
-            tpl["parse"]["thousands_separator"] = st.text_input(
-                "Thousands separator",
-                value=tpl["parse"].get("thousands_separator", ","),
-                max_chars=1,
-            )
-        with c2:
-            tpl["parse"]["amount_negative_in_parens"] = st.checkbox(
-                "Parens = negative amount",
-                value=bool(tpl["parse"].get("amount_negative_in_parens", True)),
-            )
-            tpl["parse"]["merge_multiline_description"] = st.checkbox(
-                "Merge multi-line descriptions",
-                value=bool(tpl["parse"].get("merge_multiline_description", True)),
-                help=(
-                    "Rows with no date attach to the previous row's "
-                    "description — handles wrapped vendor names."
-                ),
-            )
-
-    with t4:
+    with tab_save:
+        tpl["notes"] = st.text_area(
+            "Notes (optional)", value=tpl.get("notes", ""), height=70,
+        )
        ok, errors = validate_template(tpl)
        if errors:
            for err in errors:
                st.error(err)
        c1, c2 = st.columns([1, 3])
        with c1:
-            save_btn = st.button("Save template", type="primary", disabled=not ok)
+            save_btn = st.button(
+                "Save template", type="primary", disabled=not ok,
+            )
        with c2:
            st.caption(
                f"Will save as: ``{tpl.get('slug') or '—'}``  "
@@ -788,12 +870,119 @@ def _render_build_form(tpl: dict) -> None:
        if save_btn:
            try:
                slug = save_template(tpl)
-                st.success(f"Saved as **{slug}**. Switch to Extract mode to use it.")
+                st.success(
+                    f"Saved as **{slug}**. Switch to Extract mode to use it."
+                )
                log_event(
                    "tool_run",
                    "PDF Extractor template saved",
                    page="10_PDF_Extractor",
                    template=slug,
+                    mode=tpl.get("mode"),
+                )
+            except Exception as e:
+                st.error(f"Save failed: {e}")
+
+
+def _render_build_form_column_visual(tpl: dict) -> None:
+    """Render the column-visual ("advanced") template editor.
+
+    Shows a warning banner and five tabs (Visual picker, Pages & table,
+    Columns, Parsing, Save). Values entered in the fields rendered here
+    are written back into ``tpl``; the Save tab validates the template
+    and saves it on request.
+
+    NOTE(review): the editor this one replaces also exposed the decimal
+    and thousands separators and the row y-tolerance. They are not
+    editable here — confirm that is intentional.
+    """
+    st.warning(
+        "**Advanced mode.** Column-x-position templates depend on "
+        "every statement from this source having identical layout. "
+        "If your statements drift between months, switch back to "
+        "Auto-detect."
+    )
+
+    tab_names = ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"]
+    tab_pick, tab_table, tab_cols, tab_parse, tab_save = st.tabs(tab_names)
+
+    # Make sure every sub-tree subscripted below exists.
+    for section, empty in (
+        ("table", {}), ("parse", {}), ("pages", {}), ("columns", []),
+    ):
+        tpl.setdefault(section, empty)
+
+    with tab_pick:
+        _render_visual_picker(tpl)
+
+    with tab_table:
+        # Bound only now, in case the picker swapped these dicts out.
+        pages_cfg, table_cfg = tpl["pages"], tpl["table"]
+        col_left, col_right = st.columns(2)
+        with col_left:
+            notes_now = tpl.get("notes", "")
+            tpl["notes"] = st.text_area("Notes", value=notes_now, height=70)
+            pages_cfg["range"] = st.text_input(
+                "Pages",
+                value=pages_cfg.get("range", "all"),
+                help='"all", "1-3", "2,4", "3-" all work.',
+            )
+            pages_cfg["skip_matching"] = st.text_input(
+                "Skip pages matching (regex, optional)",
+                value=pages_cfg.get("skip_matching", ""),
+            )
+        with col_right:
+            table_cfg["header_text"] = st.text_input(
+                "Header text", value=table_cfg.get("header_text", ""),
+            )
+            # Both regex lists are edited one pattern per line; each line
+            # is stripped and blank lines are dropped.
+            for key, label in (
+                ("end_markers", "End markers (one regex per line)"),
+                ("skip_rows_matching", "Skip rows matching (one regex per line)"),
+            ):
+                joined = "\n".join(table_cfg.get(key) or [])
+                edited = st.text_area(label, value=joined, height=80)
+                table_cfg[key] = [
+                    ln.strip() for ln in edited.splitlines() if ln.strip()
+                ]
+
+    with tab_cols:
+        _render_columns_editor(tpl)
+
+    with tab_parse:
+        parse_cfg = tpl["parse"]
+        parse_cfg["date_format"] = st.text_input(
+            "Date format", value=parse_cfg.get("date_format", "%m/%d/%Y"),
+        )
+        parse_cfg["currency_strip"] = st.text_input(
+            "Currency symbols", value=parse_cfg.get("currency_strip", "$"),
+        )
+        # The two flags share one shape: default True, coerced to bool.
+        for key, label in (
+            ("amount_negative_in_parens", "Parens = negative"),
+            ("merge_multiline_description", "Merge multi-line descriptions"),
+        ):
+            parse_cfg[key] = st.checkbox(
+                label, value=bool(parse_cfg.get(key, True)),
+            )
+
+    with tab_save:
+        ok, errors = validate_template(tpl)
+        for problem in errors or ():
+            st.error(problem)
+        if st.button(
+            "Save template", type="primary", disabled=not ok, key="cv_save",
+        ):
+            try:
+                saved_slug = save_template(tpl)
+                st.success(f"Saved as **{saved_slug}**.")
+                log_event(
+                    "tool_run",
+                    "PDF Extractor template saved",
+                    page="10_PDF_Extractor",
+                    template=saved_slug,
+                    mode=tpl.get("mode"),
                 )
             except Exception as e:
                 st.error(f"Save failed: {e}")
@@ -811,15 +1000,26 @@ def _render_preview(tpl: dict) -> None:
    except Exception as e:
        st.error(f"Preview failed: {type(e).__name__}: {e}")
        return
+    mode = tpl.get("mode", "row_heuristic")
    if df.empty:
-        st.info(
-            "Template doesn't match any rows yet. Common fixes: tighten "
-            "the header text, add an end marker, adjust column "
-            "boundaries."
-        )
+        if mode == "row_heuristic":
+            st.info(
+                "No transaction rows detected yet. Check that the date "
+                "format matches your statements, and try widening the "
+                "amount-count range under \"Advanced\" if your rows have "
+                "balance or extra columns."
+            )
+        else:
+            st.info(
+                "Template doesn't match any rows yet. Tighten the header "
+                "text, add an end marker, or adjust column boundaries."
+            )
    else:
-        st.caption(f"{len(df)} row(s) from {len(pages)} page(s)")
-        st.dataframe(df.head(50), hide_index=True, use_container_width=True)
+        st.caption(
+            f"{len(df)} row(s) from {len(pages)} page(s) "
+            f"· mode: {mode}"
+        )
+        st.dataframe(df.head(100), hide_index=True, use_container_width=True)


 def _render_build_mode() -> None: