From b86828d791db545cfb6d5cd0eef18036205d1f1e Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 19 May 2026 22:52:54 +0000 Subject: [PATCH] feat(pdf): visual region picker on rendered sample page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5/6. Adds a "Visual picker" tab as the first stop in the template-build flow. The sample PDF page is rasterized with ``pypdfium2`` (capped at ~900px wide for sensible display), and ``streamlit-drawable-canvas`` overlays drawing tools on top. UX: - **Line mode** — drag short (roughly vertical) strokes where you want columns to split. Each stroke's x-midpoint becomes one boundary in PDF point coordinates. - **Rect mode** — drag a rectangle around the transactions table; the bbox is preserved on the template as ``visual.table_bbox`` for round-trip and for future use as a hard crop region. - **Transform mode** — move/resize already-drawn shapes after the fact. Round-trip: re-entering Build mode with an existing template seeds the canvas with full-height vertical lines for every boundary already on the template, plus the saved bbox if any, so editing-after-save matches the user's mental model. Coordinate translation: the canvas reports pixel positions; we divide by the renderer's pixels-per-PDF-point scale to get back to PDF coordinates that ``apply_template`` already expects. No template-schema change required — the boundaries the picker writes are the same list the text-input editor wrote in commit 3, just sourced visually. New helper in the extraction module: - ``render_page_image(pdf_bytes, page_no, *, target_width=900)`` — rasterize a single 1-indexed page to a PIL image; returns ``(image, scale)`` for coordinate translation. ``target_width`` is keyword-only. The text-input boundary editor in the Columns tab remains as a fallback for power users / keyboard-only workflows and for copy-paste from spreadsheet-derived x-positions. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 180 +++++++++++++++++++++++++++++- src/pdf_extract.py | 33 ++++++ 2 files changed, 211 insertions(+), 2 deletions(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 9fa42ae..611f2c0 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -31,7 +31,7 @@ if str(_project_root) not in sys.path: from src.audit import log_event, log_page_open from src.gui.components import hide_streamlit_chrome, render_sticky_footer -from src.pdf_extract import apply_template, extract_pages_auto +from src.pdf_extract import apply_template, extract_pages_auto, render_page_image from src.pdf_templates import ( SCHEMA_VERSION, VALID_TARGETS, @@ -393,9 +393,185 @@ def _render_columns_editor(tpl: dict) -> None: tpl["columns"] = new_columns +def _render_visual_picker(tpl: dict) -> None: + """Drawable-canvas overlay on a rendered sample page. + + The user draws (mostly) vertical lines where columns should + split. We harvest each line's x-midpoint and write that into + ``tpl["table"]["column_boundaries"]`` (in PDF point space). An + optional rectangle becomes ``tpl["visual"]["table_bbox"]`` (in + PDF points), preserved for round-trip but not yet used by + extraction — the header/end-marker pair is enough to slice + the row band in practice. 
+ """ + from streamlit_drawable_canvas import st_canvas + + pdf_bytes = st.session_state.get(K_SAMPLE_BYTES) + pages = st.session_state.get(K_SAMPLE_PAGES) or [] + if not pdf_bytes or not pages: + st.info("Upload a sample PDF above to use the visual picker.") + return + + max_page = len(pages) + sample_page = int(tpl.get("visual", {}).get("sample_page", 1)) + sample_page = st.number_input( + "Sample page", + min_value=1, + max_value=max_page, + value=min(sample_page, max_page), + step=1, + help="Pick a page that contains the transactions table.", + ) + tpl.setdefault("visual", {})["sample_page"] = int(sample_page) + + try: + pil_image, scale = render_page_image(pdf_bytes, int(sample_page)) + except Exception as e: + st.error(f"Couldn't render page {sample_page}: {type(e).__name__}: {e}") + return + + tpl["visual"]["page_width"] = pil_image.width / scale + tpl["visual"]["page_height"] = pil_image.height / scale + + c_left, c_right = st.columns([2, 1]) + with c_right: + st.markdown("**How to use**") + st.caption( + "• **Lines** mode: drag short vertical strokes where you " + "want columns to split. Each stroke contributes one " + "x-boundary.\n" + "• **Rect** mode: drag a box around the transactions " + "table to crop the working region.\n" + "• Use the trash icon (top-right of the canvas) to " + "remove the last shape, or the X to clear all." + ) + drawing_mode = st.radio( + "Draw", + ["line", "rect", "transform"], + horizontal=True, + help=( + "transform lets you move/resize already-drawn shapes." 
+ ), + ) + + initial_objects = _boundaries_to_canvas_lines( + tpl["table"].get("column_boundaries", []), + scale=scale, + image_height=pil_image.height, + ) + bbox = tpl["visual"].get("table_bbox") + if bbox: + initial_objects.append(_bbox_to_canvas_rect(bbox, scale)) + + with c_left: + canvas_state = st_canvas( + fill_color="rgba(255, 165, 0, 0.15)", + stroke_width=2, + stroke_color="#d62728", + background_image=pil_image, + update_streamlit=True, + height=pil_image.height, + width=pil_image.width, + drawing_mode=drawing_mode, + initial_drawing={"version": "4.4.0", "objects": initial_objects}, + key=f"pdf_canvas_p{sample_page}", + ) + + new_bounds, new_bbox = _harvest_canvas(canvas_state, scale) + if new_bounds is not None: + tpl["table"]["column_boundaries"] = new_bounds + if new_bbox is not None: + tpl["visual"]["table_bbox"] = new_bbox + + if tpl["table"].get("column_boundaries"): + st.caption( + "Boundaries (PDF pts): " + + ", ".join( + f"{b:.0f}" for b in tpl["table"]["column_boundaries"] + ) + ) + + +def _boundaries_to_canvas_lines( + boundaries: list[float], + *, + scale: float, + image_height: int, +) -> list[dict]: + """Seed the canvas with full-height vertical lines for any + boundaries already on the template, so the user sees their + saved state when re-entering build mode.""" + out: list[dict] = [] + for b in boundaries: + x_px = float(b) * scale + out.append({ + "type": "line", + "left": x_px, + "top": 0, + "width": 0, + "height": image_height, + "x1": 0, "y1": 0, + "x2": 0, "y2": image_height, + "stroke": "#1f77b4", + "strokeWidth": 2, + "fill": "#1f77b4", + "selectable": True, + }) + return out + + +def _bbox_to_canvas_rect(bbox: list[float], scale: float) -> dict: + x0, top, x1, bottom = bbox + return { + "type": "rect", + "left": x0 * scale, + "top": top * scale, + "width": (x1 - x0) * scale, + "height": (bottom - top) * scale, + "stroke": "#d62728", + "strokeWidth": 1, + "fill": "rgba(255, 165, 0, 0.10)", + } + + +def 
_harvest_canvas(canvas_state, scale: float): + """Pull boundaries + bbox out of a ``st_canvas`` return value. + + Returns ``(boundaries_or_None, bbox_or_None)`` where ``None`` + means "no change" (so the existing template values stay put).""" + if canvas_state is None or canvas_state.json_data is None: + return None, None + objects = canvas_state.json_data.get("objects") or [] + + bounds: list[float] = [] + bbox: list[float] | None = None + for obj in objects: + kind = obj.get("type") + left = float(obj.get("left", 0)) + width = float(obj.get("width", 0)) + if kind == "line": + # Take the line's x-midpoint as the boundary x-position. + bounds.append((left + width / 2) / scale) + elif kind == "rect": + top = float(obj.get("top", 0)) + height = float(obj.get("height", 0)) + bbox = [ + left / scale, + top / scale, + (left + width) / scale, + (top + height) / scale, + ] + return sorted(bounds), bbox + + def _render_build_form(tpl: dict) -> None: """Render every editable field on the template, in tabs.""" - t1, t2, t3, t4 = st.tabs(["Pages & table", "Columns", "Parsing", "Save"]) + t0, t1, t2, t3, t4 = st.tabs( + ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"] + ) + + with t0: + _render_visual_picker(tpl) with t1: c1, c2 = st.columns(2) diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 43ba1d5..059b706 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -512,6 +512,39 @@ def ocr_available() -> tuple[bool, str]: return True, "" +def render_page_image( + pdf_bytes: bytes, + page_no: int, + *, + target_width: int = 900, +) -> tuple["Any", float]: + """Rasterize one page of *pdf_bytes* (1-indexed) to a PIL image. + + Returns ``(pil_image, scale)`` where ``scale`` is the + pixels-per-PDF-point factor. The caller uses ``scale`` to map + canvas coordinates (pixels) back to PDF coordinates (points). 
+ + ``target_width`` caps the rendered width so the image is a + sensible size for the visual picker — bank statements at 100% + can be 800–1200 pts wide; we want ~900px on screen. + """ + import pypdfium2 as pdfium + + pdf = pdfium.PdfDocument(pdf_bytes) + try: + idx = max(0, min(page_no - 1, len(pdf) - 1)) + page = pdf[idx] + # Width in PDF points → pixels-per-point scale. + pdf_width = page.get_width() + scale = target_width / pdf_width if pdf_width else 2.0 + # Cap scale so big A3-style scans don't blow up. + scale = min(scale, 3.0) + bitmap = page.render(scale=scale) + return bitmap.to_pil(), scale + finally: + pdf.close() + + def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]: """Run Tesseract over each page of *pdf_bytes* and return a word-position-rich ``Page`` list, parallel to ``extract_pages``.