feat(pdf): UI rework — Auto-detect is the default build flow

Pulls the user's primary mental model away from "draw column
boundaries" toward "tell me what shape your amounts have, see
detected rows, save." The visual picker that wasn't working for
multi-statement workflows is still reachable but is no longer the
default.

**Build mode header** now has a mode radio:

- "Auto-detect (recommended)" — row_heuristic. Tabs: Amount
  layout · Filters & date · Save. Three small forms; no
  coordinate UI anywhere. The Amount-layout tab's dropdown picks
  one of single / txn+balance / debit+credit / debit+credit+balance
  and auto-derives the min/max amount-count range (overridable
  under an expander).
- "Visual columns (advanced)" — column_visual. Five tabs (the
  original Visual picker / Pages & table / Columns / Parsing /
  Save). A yellow warning panel up top reminds the user that
  column-x templates only work when statement layout is stable.

Switching modes triggers a rerun so the right tab set renders
immediately. The template object preserves both modes' config
trees side-by-side so a user can flip between them without
losing work.

**Live preview** below the form runs ``apply_template`` against
the cached sample pages (already cached in session_state so this
re-renders cheaply on every form edit). The "no rows yet"
message is mode-aware — points users at the right tuning knobs
for whichever mode they're in. The preview caption notes which
mode produced the rows so the user can correlate decisions to
output.

The visual picker bug the user reported — "a single box stays in
the same location regardless of page" — is sidestepped rather
than fixed: in row_heuristic mode there's no canvas to confuse,
and for the rare column_visual user the canvas is still
imperfect but no longer their first interaction with the tool.
Cleaning up the column_visual canvas state bugs is a separate
follow-up if real users still hit the Advanced mode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 23:46:27 +00:00
parent 48cd9e8249
commit 60969c0770

View File

@@ -663,123 +663,205 @@ def _harvest_canvas(canvas_state, scale: float):
def _render_build_form(tpl: dict) -> None:
    """Render the mode-aware template editor.

    The header row holds the template name and a "Detection mode"
    radio. The default mode (``row_heuristic``) shows the simple
    form-based editor; the column-visual editor is rendered only when
    the user explicitly selects ``column_visual`` on that radio.

    Args:
        tpl: Template dict, edited in place. ``name`` and ``slug`` are
            rewritten on every run; ``mode`` is rewritten when the user
            changes the radio or when the stored value is not a
            recognised mode.

    Changing the radio stores the new mode and calls ``st.rerun()`` so
    the matching tab set is drawn immediately.
    """
    modes = ["row_heuristic", "column_visual"]
    mode_labels = {
        "row_heuristic": "Auto-detect (recommended)",
        "column_visual": "Visual columns (advanced)",
    }

    # Header: name + mode switcher
    c_name, c_mode = st.columns([3, 2])
    with c_name:
        tpl["name"] = st.text_input(
            "Template name",
            value=tpl.get("name", ""),
            help="What this source is called, e.g. 'Chase Personal Checking'.",
        )
        tpl["slug"] = slugify(tpl["name"])
    with c_mode:
        current_mode = tpl.get("mode", "row_heuristic")
        if current_mode not in modes:
            # An unrecognised stored mode would make ``modes.index``
            # raise ValueError and take the whole form down. Fall back
            # to the default and persist it so the dispatch below and
            # any later reader of ``tpl["mode"]`` see a valid value.
            current_mode = "row_heuristic"
            tpl["mode"] = current_mode
        new_mode = st.radio(
            "Detection mode",
            modes,
            index=modes.index(current_mode),
            format_func=lambda m: mode_labels.get(m, m),
            help=(
                "Auto-detect finds rows by date+amount patterns — no "
                "coordinates needed; survives layout changes between "
                "statements. Visual columns uses x-position boundaries "
                "you draw — useful only when auto-detect fails to find "
                "the table."
            ),
            horizontal=False,
        )
        if new_mode != current_mode:
            tpl["mode"] = new_mode
            st.rerun()
    if tpl.get("mode", "row_heuristic") == "row_heuristic":
        _render_build_form_row_heuristic(tpl)
    else:
        _render_build_form_column_visual(tpl)
def _render_build_form_row_heuristic(tpl: dict) -> None:
"""Simple form for the row-heuristic mode."""
tab_amount, tab_filters, tab_save = st.tabs(
["Amount layout", "Filters & date", "Save"]
)
with t0:
_render_visual_picker(tpl)
tpl.setdefault("row_detection", {})
tpl.setdefault("amounts", {})
tpl.setdefault("date", {})
tpl.setdefault("pages", {})
with tab_amount:
st.caption(
"Tell us how many amount columns each transaction row has, "
"and how negatives are written. The detector handles the "
"rest — no x-positions needed."
)
shape_labels = {
"single": "One amount per row (sign in the number)",
"txn_balance": "Two amounts: transaction + running balance",
"debit_credit": "Two columns: separate debit and credit",
"debit_credit_balance": "Three: debit, credit, balance",
}
current_shape = tpl["amounts"].get("shape", "single")
shape = st.selectbox(
"Amount layout",
list(shape_labels.keys()),
index=list(shape_labels.keys()).index(
current_shape if current_shape in shape_labels else "single"
),
format_func=lambda s: shape_labels[s],
)
tpl["amounts"]["shape"] = shape
with t1:
c1, c2 = st.columns(2)
with c1:
tpl["name"] = st.text_input("Template name", value=tpl.get("name", ""))
tpl["slug"] = slugify(tpl["name"])
tpl["notes"] = st.text_area("Notes", value=tpl.get("notes", ""), height=70)
tpl["amounts"]["negative_in_parens"] = st.checkbox(
"Parens (4.50) = negative",
value=bool(tpl["amounts"].get("negative_in_parens", True)),
)
tpl["amounts"]["currency_strip"] = st.text_input(
"Currency symbols to strip",
value=tpl["amounts"].get("currency_strip", "$"),
max_chars=4,
)
with c2:
tpl["amounts"]["decimal_separator"] = st.text_input(
"Decimal separator",
value=tpl["amounts"].get("decimal_separator", "."),
max_chars=1,
)
tpl["amounts"]["thousands_separator"] = st.text_input(
"Thousands separator",
value=tpl["amounts"].get("thousands_separator", ","),
max_chars=1,
)
# Auto-derive min/max amounts from the chosen shape unless
# user has set non-default values explicitly.
shape_to_min_max = {
"single": (1, 1),
"txn_balance": (2, 2),
"debit_credit": (1, 2),
"debit_credit_balance": (2, 3),
}
cur_min = tpl["row_detection"].get("min_amounts_per_row")
cur_max = tpl["row_detection"].get("max_amounts_per_row")
derived_min, derived_max = shape_to_min_max.get(shape, (1, 3))
if cur_min is None or cur_max is None:
tpl["row_detection"]["min_amounts_per_row"] = derived_min
tpl["row_detection"]["max_amounts_per_row"] = derived_max
with st.expander("Advanced: tune amount-count range", expanded=False):
tpl["row_detection"]["min_amounts_per_row"] = st.number_input(
"Minimum amounts per transaction row",
min_value=1, max_value=10,
value=int(tpl["row_detection"].get("min_amounts_per_row", derived_min)),
step=1,
)
tpl["row_detection"]["max_amounts_per_row"] = st.number_input(
"Maximum amounts per transaction row",
min_value=1, max_value=10,
value=int(tpl["row_detection"].get("max_amounts_per_row", derived_max)),
step=1,
)
with tab_filters:
c1, c2 = st.columns(2)
with c1:
tpl["date"]["format"] = st.text_input(
"Date format",
value=tpl["date"].get("format", "%m/%d/%Y"),
help=(
"Python strftime format. Common: %m/%d/%Y (US), "
"%d/%m/%Y (EU), %Y-%m-%d (ISO). Leave default to "
"try common formats automatically."
),
)
tpl["pages"]["range"] = st.text_input(
"Pages",
value=tpl["pages"].get("range", "all"),
help='"all", "1-3", "2,4", "3-" all work.',
)
tpl["pages"]["skip_matching"] = st.text_input(
"Skip pages matching (regex, optional)",
value=tpl["pages"].get("skip_matching", ""),
help='e.g. "Page \\d+ of" to skip cover pages.',
)
with c2:
tpl["table"]["header_text"] = st.text_input(
"Header text (transactions table)",
value=tpl["table"].get("header_text", ""),
tpl["row_detection"]["merge_multiline_description"] = st.checkbox(
"Merge multi-line descriptions",
value=bool(
tpl["row_detection"].get("merge_multiline_description", True)
),
help=(
"Words from the header row of the transactions table, "
"e.g. \"Date Description Amount Balance\". Extraction "
"starts on the row AFTER this match."
"Lines without a date attach to the previous "
"row's description — handles wrapped vendor names."
),
)
ends = "\n".join(tpl["table"].get("end_markers") or [])
new_ends = st.text_area(
"End markers (one regex per line)",
value=ends,
help='e.g. "Closing balance", "Page \\d+ of".',
height=80,
)
tpl["table"]["end_markers"] = [
line.strip() for line in new_ends.splitlines() if line.strip()
]
skips = "\n".join(tpl["table"].get("skip_rows_matching") or [])
new_skips = st.text_area(
"Skip rows matching (one regex per line, optional)",
value=skips,
help='Common entries: "Total", "Subtotal", "^Page ".',
height=80,
)
tpl["table"]["skip_rows_matching"] = [
line.strip() for line in new_skips.splitlines() if line.strip()
]
tpl["table"]["y_tolerance"] = st.number_input(
tpl["row_detection"]["y_tolerance"] = st.number_input(
"Row y-tolerance (pts)",
min_value=0.5,
max_value=20.0,
value=float(tpl["table"].get("y_tolerance", 3.0)),
value=float(tpl["row_detection"].get("y_tolerance", 3.0)),
step=0.5,
help=(
"How close two words' y-positions must be to be on the "
"same row. Bump up if rows are getting split, down if "
"rows are merging."
"How close two words' y-positions must be to be on "
"the same row. Adjust if rows are splitting or merging."
),
)
with t2:
_render_columns_editor(tpl)
skips = "\n".join(tpl["row_detection"].get("skip_rows_matching") or [])
new_skips = st.text_area(
"Skip rows matching (one regex per line, optional)",
value=skips,
help=(
"Lines whose text matches any of these regexes are "
'excluded. Common: "Total", "Subtotal", "^Page ".'
),
height=80,
)
tpl["row_detection"]["skip_rows_matching"] = [
line.strip() for line in new_skips.splitlines() if line.strip()
]
with t3:
c1, c2 = st.columns(2)
with c1:
tpl["parse"]["date_format"] = st.text_input(
"Date format",
value=tpl["parse"].get("date_format", "%m/%d/%Y"),
help=(
"Python strftime format. Common: %m/%d/%Y (US), "
"%d/%m/%Y (EU), %Y-%m-%d (ISO)."
),
)
tpl["parse"]["currency_strip"] = st.text_input(
"Currency symbols to strip",
value=tpl["parse"].get("currency_strip", "$"),
)
tpl["parse"]["decimal_separator"] = st.text_input(
"Decimal separator",
value=tpl["parse"].get("decimal_separator", "."),
max_chars=1,
)
tpl["parse"]["thousands_separator"] = st.text_input(
"Thousands separator",
value=tpl["parse"].get("thousands_separator", ","),
max_chars=1,
)
with c2:
tpl["parse"]["amount_negative_in_parens"] = st.checkbox(
"Parens = negative amount",
value=bool(tpl["parse"].get("amount_negative_in_parens", True)),
)
tpl["parse"]["merge_multiline_description"] = st.checkbox(
"Merge multi-line descriptions",
value=bool(tpl["parse"].get("merge_multiline_description", True)),
help=(
"Rows with no date attach to the previous row's "
"description — handles wrapped vendor names."
),
)
with t4:
with tab_save:
tpl["notes"] = st.text_area(
"Notes (optional)", value=tpl.get("notes", ""), height=70,
)
ok, errors = validate_template(tpl)
if errors:
for err in errors:
st.error(err)
c1, c2 = st.columns([1, 3])
with c1:
save_btn = st.button("Save template", type="primary", disabled=not ok)
save_btn = st.button(
"Save template", type="primary", disabled=not ok,
)
with c2:
st.caption(
f"Will save as: ``{tpl.get('slug') or ''}`` "
@@ -788,12 +870,119 @@ def _render_build_form(tpl: dict) -> None:
if save_btn:
try:
slug = save_template(tpl)
st.success(f"Saved as **{slug}**. Switch to Extract mode to use it.")
st.success(
f"Saved as **{slug}**. Switch to Extract mode to use it."
)
log_event(
"tool_run",
"PDF Extractor template saved",
page="10_PDF_Extractor",
template=slug,
mode=tpl.get("mode"),
)
except Exception as e:
st.error(f"Save failed: {e}")
def _render_build_form_column_visual(tpl: dict) -> None:
    """Render the legacy column-visual template editor.

    Shown when the "Detection mode" radio selects the advanced flow.
    Draws a warning banner followed by five tabs (Visual picker,
    Pages & table, Columns, Parsing, Save). Each widget writes its
    value straight back into ``tpl``; the Save tab validates the
    template and, when the button is pressed, saves it and logs the
    event.
    """
    st.warning(
        "**Advanced mode.** Column-x-position templates depend on "
        "every statement from this source having identical layout. "
        "If your statements drift between months, switch back to "
        "Auto-detect."
    )
    tab_picker, tab_pages, tab_columns, tab_parsing, tab_save = st.tabs(
        ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"]
    )

    # Make sure every sub-tree the widgets below index into exists.
    for section, empty in (
        ("table", {}),
        ("parse", {}),
        ("pages", {}),
        ("columns", []),
    ):
        tpl.setdefault(section, empty)

    def _regex_list_area(label, patterns):
        # One pattern per line in the text area; blank lines are dropped
        # and surrounding whitespace is trimmed.
        raw = st.text_area(label, value="\n".join(patterns or []), height=80)
        return [entry for entry in map(str.strip, raw.splitlines()) if entry]

    with tab_picker:
        _render_visual_picker(tpl)

    with tab_pages:
        pages_cfg = tpl["pages"]
        table_cfg = tpl["table"]
        left, right = st.columns(2)
        with left:
            tpl["notes"] = st.text_area(
                "Notes", value=tpl.get("notes", ""), height=70,
            )
            pages_cfg["range"] = st.text_input(
                "Pages",
                value=pages_cfg.get("range", "all"),
                help='"all", "1-3", "2,4", "3-" all work.',
            )
            pages_cfg["skip_matching"] = st.text_input(
                "Skip pages matching (regex, optional)",
                value=pages_cfg.get("skip_matching", ""),
            )
        with right:
            table_cfg["header_text"] = st.text_input(
                "Header text",
                value=table_cfg.get("header_text", ""),
            )
            table_cfg["end_markers"] = _regex_list_area(
                "End markers (one regex per line)",
                table_cfg.get("end_markers"),
            )
            table_cfg["skip_rows_matching"] = _regex_list_area(
                "Skip rows matching (one regex per line)",
                table_cfg.get("skip_rows_matching"),
            )

    with tab_columns:
        _render_columns_editor(tpl)

    with tab_parsing:
        parse_cfg = tpl["parse"]
        parse_cfg["date_format"] = st.text_input(
            "Date format",
            value=parse_cfg.get("date_format", "%m/%d/%Y"),
        )
        parse_cfg["currency_strip"] = st.text_input(
            "Currency symbols", value=parse_cfg.get("currency_strip", "$"),
        )
        parse_cfg["amount_negative_in_parens"] = st.checkbox(
            "Parens = negative",
            value=bool(parse_cfg.get("amount_negative_in_parens", True)),
        )
        parse_cfg["merge_multiline_description"] = st.checkbox(
            "Merge multi-line descriptions",
            value=bool(parse_cfg.get("merge_multiline_description", True)),
        )

    with tab_save:
        is_valid, problems = validate_template(tpl)
        for problem in problems or ():
            st.error(problem)
        clicked = st.button(
            "Save template", type="primary", disabled=not is_valid, key="cv_save",
        )
        if not clicked:
            return
        try:
            slug = save_template(tpl)
            st.success(f"Saved as **{slug}**.")
            log_event(
                "tool_run",
                "PDF Extractor template saved",
                page="10_PDF_Extractor",
                template=slug,
                mode=tpl.get("mode"),
            )
        except Exception as exc:
            st.error(f"Save failed: {exc}")
@@ -811,15 +1000,26 @@ def _render_preview(tpl: dict) -> None:
except Exception as e:
st.error(f"Preview failed: {type(e).__name__}: {e}")
return
mode = tpl.get("mode", "row_heuristic")
if df.empty:
st.info(
"Template doesn't match any rows yet. Common fixes: tighten "
"the header text, add an end marker, adjust column "
"boundaries."
)
if mode == "row_heuristic":
st.info(
"No transaction rows detected yet. Check that the date "
"format matches your statements, and try widening the "
"amount-count range under \"Advanced\" if your rows have "
"balance or extra columns."
)
else:
st.info(
"Template doesn't match any rows yet. Tighten the header "
"text, add an end marker, or adjust column boundaries."
)
else:
st.caption(f"{len(df)} row(s) from {len(pages)} page(s)")
st.dataframe(df.head(50), hide_index=True, use_container_width=True)
st.caption(
f"{len(df)} row(s) from {len(pages)} page(s) "
f"· mode: {mode}"
)
st.dataframe(df.head(100), hide_index=True, use_container_width=True)
def _render_build_mode() -> None: