def _render_build_form(tpl: dict) -> None:
    """Render the template editor that matches the template's detection mode.

    A header row holds the template name (which also drives ``tpl["slug"]``)
    and a "Detection mode" radio. Below the header, ``row_heuristic``
    templates get the simple form from ``_render_build_form_row_heuristic``;
    ``column_visual`` templates get the legacy editor from
    ``_render_build_form_column_visual``.

    Args:
        tpl: Template dict being edited. Mutated in place: ``name``, ``slug``
            and ``mode`` are written here; the mode-specific editors write
            the remaining keys.
    """
    mode_labels = {
        "row_heuristic": "Auto-detect (recommended)",
        "column_visual": "Visual columns (advanced)",
    }
    modes = list(mode_labels)

    # Header: name + mode switcher
    c_name, c_mode = st.columns([3, 2])
    with c_name:
        tpl["name"] = st.text_input(
            "Template name",
            value=tpl.get("name", ""),
            help="What this source is called, e.g. 'Chase Personal Checking'.",
        )
        tpl["slug"] = slugify(tpl["name"])
    with c_mode:
        current_mode = tpl.get("mode", "row_heuristic")
        if current_mode not in mode_labels:
            # An unknown or null stored mode would make ``modes.index`` raise
            # ValueError; fall back to the default mode instead.
            current_mode = "row_heuristic"
        # Store the effective mode even when the user never touches the
        # radio, so the saved template and the save log carry a real value.
        tpl["mode"] = current_mode
        new_mode = st.radio(
            "Detection mode",
            modes,
            index=modes.index(current_mode),
            format_func=lambda m: mode_labels.get(m, m),
            help=(
                "Auto-detect finds rows by date+amount patterns — no "
                "coordinates needed; survives layout changes between "
                "statements. Visual columns uses x-position boundaries "
                "you draw — useful only when auto-detect fails to find "
                "the table."
            ),
            horizontal=False,
        )
        if new_mode != current_mode:
            tpl["mode"] = new_mode
            # Rerun right away so the editor below is built for the new mode.
            st.rerun()

    if tpl["mode"] == "row_heuristic":
        _render_build_form_row_heuristic(tpl)
    else:
        _render_build_form_column_visual(tpl)
"Decimal separator", + value=tpl["amounts"].get("decimal_separator", "."), + max_chars=1, + ) + tpl["amounts"]["thousands_separator"] = st.text_input( + "Thousands separator", + value=tpl["amounts"].get("thousands_separator", ","), + max_chars=1, + ) + + # Auto-derive min/max amounts from the chosen shape unless + # user has set non-default values explicitly. + shape_to_min_max = { + "single": (1, 1), + "txn_balance": (2, 2), + "debit_credit": (1, 2), + "debit_credit_balance": (2, 3), + } + cur_min = tpl["row_detection"].get("min_amounts_per_row") + cur_max = tpl["row_detection"].get("max_amounts_per_row") + derived_min, derived_max = shape_to_min_max.get(shape, (1, 3)) + if cur_min is None or cur_max is None: + tpl["row_detection"]["min_amounts_per_row"] = derived_min + tpl["row_detection"]["max_amounts_per_row"] = derived_max + + with st.expander("Advanced: tune amount-count range", expanded=False): + tpl["row_detection"]["min_amounts_per_row"] = st.number_input( + "Minimum amounts per transaction row", + min_value=1, max_value=10, + value=int(tpl["row_detection"].get("min_amounts_per_row", derived_min)), + step=1, + ) + tpl["row_detection"]["max_amounts_per_row"] = st.number_input( + "Maximum amounts per transaction row", + min_value=1, max_value=10, + value=int(tpl["row_detection"].get("max_amounts_per_row", derived_max)), + step=1, + ) + + with tab_filters: + c1, c2 = st.columns(2) + with c1: + tpl["date"]["format"] = st.text_input( + "Date format", + value=tpl["date"].get("format", "%m/%d/%Y"), + help=( + "Python strftime format. Common: %m/%d/%Y (US), " + "%d/%m/%Y (EU), %Y-%m-%d (ISO). Leave default to " + "try common formats automatically." + ), + ) tpl["pages"]["range"] = st.text_input( "Pages", value=tpl["pages"].get("range", "all"), help='"all", "1-3", "2,4", "3-" all work.', ) - tpl["pages"]["skip_matching"] = st.text_input( - "Skip pages matching (regex, optional)", - value=tpl["pages"].get("skip_matching", ""), - help='e.g. 
"Page \\d+ of" to skip cover pages.', - ) with c2: - tpl["table"]["header_text"] = st.text_input( - "Header text (transactions table)", - value=tpl["table"].get("header_text", ""), + tpl["row_detection"]["merge_multiline_description"] = st.checkbox( + "Merge multi-line descriptions", + value=bool( + tpl["row_detection"].get("merge_multiline_description", True) + ), help=( - "Words from the header row of the transactions table, " - "e.g. \"Date Description Amount Balance\". Extraction " - "starts on the row AFTER this match." + "Lines without a date attach to the previous " + "row's description — handles wrapped vendor names." ), ) - ends = "\n".join(tpl["table"].get("end_markers") or []) - new_ends = st.text_area( - "End markers (one regex per line)", - value=ends, - help='e.g. "Closing balance", "Page \\d+ of".', - height=80, - ) - tpl["table"]["end_markers"] = [ - line.strip() for line in new_ends.splitlines() if line.strip() - ] - skips = "\n".join(tpl["table"].get("skip_rows_matching") or []) - new_skips = st.text_area( - "Skip rows matching (one regex per line, optional)", - value=skips, - help='Common entries: "Total", "Subtotal", "^Page ".', - height=80, - ) - tpl["table"]["skip_rows_matching"] = [ - line.strip() for line in new_skips.splitlines() if line.strip() - ] - tpl["table"]["y_tolerance"] = st.number_input( + tpl["row_detection"]["y_tolerance"] = st.number_input( "Row y-tolerance (pts)", min_value=0.5, max_value=20.0, - value=float(tpl["table"].get("y_tolerance", 3.0)), + value=float(tpl["row_detection"].get("y_tolerance", 3.0)), step=0.5, help=( - "How close two words' y-positions must be to be on the " - "same row. Bump up if rows are getting split, down if " - "rows are merging." + "How close two words' y-positions must be to be on " + "the same row. Adjust if rows are splitting or merging." 
), ) - with t2: - _render_columns_editor(tpl) + skips = "\n".join(tpl["row_detection"].get("skip_rows_matching") or []) + new_skips = st.text_area( + "Skip rows matching (one regex per line, optional)", + value=skips, + help=( + "Lines whose text matches any of these regexes are " + 'excluded. Common: "Total", "Subtotal", "^Page ".' + ), + height=80, + ) + tpl["row_detection"]["skip_rows_matching"] = [ + line.strip() for line in new_skips.splitlines() if line.strip() + ] - with t3: - c1, c2 = st.columns(2) - with c1: - tpl["parse"]["date_format"] = st.text_input( - "Date format", - value=tpl["parse"].get("date_format", "%m/%d/%Y"), - help=( - "Python strftime format. Common: %m/%d/%Y (US), " - "%d/%m/%Y (EU), %Y-%m-%d (ISO)." - ), - ) - tpl["parse"]["currency_strip"] = st.text_input( - "Currency symbols to strip", - value=tpl["parse"].get("currency_strip", "$"), - ) - tpl["parse"]["decimal_separator"] = st.text_input( - "Decimal separator", - value=tpl["parse"].get("decimal_separator", "."), - max_chars=1, - ) - tpl["parse"]["thousands_separator"] = st.text_input( - "Thousands separator", - value=tpl["parse"].get("thousands_separator", ","), - max_chars=1, - ) - with c2: - tpl["parse"]["amount_negative_in_parens"] = st.checkbox( - "Parens = negative amount", - value=bool(tpl["parse"].get("amount_negative_in_parens", True)), - ) - tpl["parse"]["merge_multiline_description"] = st.checkbox( - "Merge multi-line descriptions", - value=bool(tpl["parse"].get("merge_multiline_description", True)), - help=( - "Rows with no date attach to the previous row's " - "description — handles wrapped vendor names." 
def _render_build_form_column_visual(tpl: dict) -> None:
    """Render the legacy column-visual (x-position based) template editor.

    Shows a warning banner followed by five tabs: "Visual picker",
    "Pages & table", "Columns", "Parsing" and "Save". Widget values are
    written straight back into ``tpl``. The "Save" tab validates the
    template and keeps the save button disabled while validation fails.

    Args:
        tpl: Template dict being edited. Mutated in place; the ``table``,
            ``parse``, ``pages`` and ``columns`` keys are created when
            missing.
    """
    st.warning(
        "**Advanced mode.** Column-x-position templates depend on "
        "every statement from this source having identical layout. "
        "If your statements drift between months, switch back to "
        "Auto-detect."
    )

    t0, t1, t2, t3, t4 = st.tabs(
        ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"]
    )

    tpl.setdefault("table", {})
    tpl.setdefault("parse", {})
    tpl.setdefault("pages", {})
    tpl.setdefault("columns", [])

    with t0:
        _render_visual_picker(tpl)

    with t1:
        c1, c2 = st.columns(2)
        with c1:
            tpl["notes"] = st.text_area(
                "Notes", value=tpl.get("notes", ""), height=70,
            )
            tpl["pages"]["range"] = st.text_input(
                "Pages",
                value=tpl["pages"].get("range", "all"),
                help='"all", "1-3", "2,4", "3-" all work.',
            )
            tpl["pages"]["skip_matching"] = st.text_input(
                "Skip pages matching (regex, optional)",
                value=tpl["pages"].get("skip_matching", ""),
            )
        with c2:
            tpl["table"]["header_text"] = st.text_input(
                "Header text",
                value=tpl["table"].get("header_text", ""),
            )
            ends = "\n".join(tpl["table"].get("end_markers") or [])
            new_ends = st.text_area(
                "End markers (one regex per line)",
                value=ends,
                height=80,
            )
            # One pattern per non-blank line; surrounding whitespace dropped.
            tpl["table"]["end_markers"] = [
                line.strip() for line in new_ends.splitlines() if line.strip()
            ]
            skips = "\n".join(tpl["table"].get("skip_rows_matching") or [])
            new_skips = st.text_area(
                "Skip rows matching (one regex per line)",
                value=skips,
                height=80,
            )
            tpl["table"]["skip_rows_matching"] = [
                line.strip() for line in new_skips.splitlines() if line.strip()
            ]
            # Row tolerance stays editable in this mode, with the same
            # default and bounds the editor used before the mode split.
            # NOTE(review): assumes the column-visual extractor still reads
            # ``table.y_tolerance`` — confirm against the extraction code.
            tpl["table"]["y_tolerance"] = st.number_input(
                "Row y-tolerance (pts)",
                min_value=0.5,
                max_value=20.0,
                value=float(tpl["table"].get("y_tolerance", 3.0)),
                step=0.5,
                help=(
                    "How close two words' y-positions must be to be on the "
                    "same row. Bump up if rows are getting split, down if "
                    "rows are merging."
                ),
            )

    with t2:
        _render_columns_editor(tpl)

    with t3:
        tpl["parse"]["date_format"] = st.text_input(
            "Date format",
            value=tpl["parse"].get("date_format", "%m/%d/%Y"),
        )
        tpl["parse"]["currency_strip"] = st.text_input(
            "Currency symbols", value=tpl["parse"].get("currency_strip", "$"),
        )
        # Number separators stay editable so statements that do not use
        # "." / "," can still be configured in this mode.
        # NOTE(review): assumes the column-visual extractor still reads
        # these ``parse`` keys — confirm against the extraction code.
        tpl["parse"]["decimal_separator"] = st.text_input(
            "Decimal separator",
            value=tpl["parse"].get("decimal_separator", "."),
            max_chars=1,
        )
        tpl["parse"]["thousands_separator"] = st.text_input(
            "Thousands separator",
            value=tpl["parse"].get("thousands_separator", ","),
            max_chars=1,
        )
        tpl["parse"]["amount_negative_in_parens"] = st.checkbox(
            "Parens = negative",
            value=bool(tpl["parse"].get("amount_negative_in_parens", True)),
        )
        tpl["parse"]["merge_multiline_description"] = st.checkbox(
            "Merge multi-line descriptions",
            value=bool(tpl["parse"].get("merge_multiline_description", True)),
        )

    with t4:
        ok, errors = validate_template(tpl)
        if errors:
            for err in errors:
                st.error(err)
        # Explicit key keeps this button distinct from the other mode's
        # identically labelled save button.
        save_btn = st.button(
            "Save template", type="primary", disabled=not ok, key="cv_save",
        )
        if save_btn:
            try:
                slug = save_template(tpl)
                st.success(f"Saved as **{slug}**.")
                log_event(
                    "tool_run",
                    "PDF Extractor template saved",
                    page="10_PDF_Extractor",
                    template=slug,
                    mode=tpl.get("mode"),
                )
            except Exception as e:
                # UI boundary: report any save failure instead of crashing
                # the page.
                st.error(f"Save failed: {e}")