From 967d3f6a11aeeb58e8f6f3a4763a81bd673b074e Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 19 May 2026 22:54:11 +0000 Subject: [PATCH] feat(pdf): OCR availability banner + per-run toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 6/6. Final polish layer on top of the OCR pipeline that ``extract_pages_auto`` has carried since commit 1. - **OCR status banner** at the top of the page next to the mode selector. Ready: a one-liner caption confirming OCR will run on scanned pages. Unavailable: a collapsed expander explaining the missing piece (``pytesseract`` binding vs. Tesseract binary) with install pointers for Windows, macOS, and Linux. The expander explicitly notes that modern text-based bank statements don't need OCR — most users will never expand it. - **"Use OCR for scanned pages" toggle** in Extract mode, defaulting to the runtime availability. Disabled (greyed out) when Tesseract isn't usable, so the user can't accidentally set themselves up for confusing warnings. Passes through as ``allow_ocr`` to ``extract_pages_auto``. - Build mode's sample-loading path continues to call ``extract_pages_auto(..., allow_ocr=True)`` — sample preview always uses OCR if available, since the user is actively diagnosing template fit. No schema change. OCR's structural support is in commits 1 + 3; this commit just makes it discoverable + opt-out. Rolling up the 6-commit feature: b8aff86 Phase 1 — pure pdf_extract module + tests aea520d Phase 2 — template storage layer + tests 2f349e8 Phase 3 — Extract/Build/Manage page + nav + i18n 5a8e2ec Phase 4 — batch polish (ZIP, sort, status block) b86828d Phase 5 — visual region picker (drawable canvas) THIS Phase 6 — OCR banner + toggle Each commit is independently revertable; rolling all the way back to ``c16e2a5`` is ``git revert THIS b86828d 5a8e2ec 2f349e8 aea520d b8aff86`` (newest first, where ``THIS`` is this commit's hash; or just ``git reset --hard c16e2a5`` on a clean branch). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 53 +++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 611f2c0..cd81750 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -31,7 +31,12 @@ if str(_project_root) not in sys.path: from src.audit import log_event, log_page_open from src.gui.components import hide_streamlit_chrome, render_sticky_footer -from src.pdf_extract import apply_template, extract_pages_auto, render_page_image +from src.pdf_extract import ( + apply_template, + extract_pages_auto, + ocr_available, + render_page_image, +) from src.pdf_templates import ( SCHEMA_VERSION, VALID_TARGETS, @@ -89,12 +94,31 @@ st.caption( "every statement that follows the same layout." ) -mode = st.radio( - "Mode", - ["Extract", "Build template", "Manage templates"], - horizontal=True, - key=K_MODE, -) +_ocr_ok, _ocr_reason = ocr_available() +c_mode, c_ocr = st.columns([3, 2]) +with c_mode: + mode = st.radio( + "Mode", + ["Extract", "Build template", "Manage templates"], + horizontal=True, + key=K_MODE, + label_visibility="collapsed", + ) +with c_ocr: + if _ocr_ok: + st.caption("**OCR:** ready · scanned pages will be transcribed.") + else: + with st.expander("**OCR:** unavailable", expanded=False): + st.caption( + f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) " + "statements will fall through with warnings. " + "To enable OCR, install Tesseract on this machine — " + "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · " + "macOS: ``brew install tesseract`` · " + "Linux: ``apt install tesseract-ocr``. " + "Modern text-based statements don't need OCR." 
+ ) + st.divider() @@ -127,7 +151,7 @@ def _render_extract_mode() -> None: ), ) - c1, c2 = st.columns(2) + c1, c2, c3 = st.columns(3) sort_by_date = c1.checkbox( "Sort combined output by date", value=True, @@ -146,6 +170,15 @@ def _render_extract_mode() -> None: "back into separate ledgers." ), ) + use_ocr = c3.checkbox( + "Use OCR for scanned pages", + value=_ocr_ok, + disabled=not _ocr_ok, + help=( + "When a page has no extractable text (typically a scan), " + "OCR it with Tesseract. Disabled when OCR isn't installed." + ), + ) run = st.button("Extract", type="primary", disabled=not uploads) if run and uploads: @@ -166,7 +199,9 @@ def _render_extract_mode() -> None: st.write(f"**{i}/{len(uploads)}** · {up.name}") try: pdf_bytes = up.read() - pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) + pages, warns = extract_pages_auto( + pdf_bytes, allow_ocr=use_ocr, + ) df = apply_template(pages, tpl) df.insert(0, "source_file", up.name) per_file_frames.append(df)