feat(pdf): OCR availability banner + per-run toggle

Phase 6/6. Final polish layer on top of the OCR pipeline that ``extract_pages_auto`` has carried since commit 1.

- **OCR status banner** at the top of the page next to the mode selector. Ready: a one-liner caption confirming OCR will run on scanned pages. Unavailable: a collapsed expander explaining the missing piece (``pytesseract`` binding vs. Tesseract binary) with install pointers for Windows, macOS, and Linux. The expander explicitly notes that modern text-based bank statements don't need OCR — most users will never expand it.
- **"Use OCR for scanned pages" toggle** in Extract mode, defaulting to the runtime availability. Disabled (greyed out) when Tesseract isn't usable, so the user can't accidentally set themselves up for confusing warnings. Passes through as ``allow_ocr`` to ``extract_pages_auto``.
- Build mode's sample-loading path continues to call ``extract_pages_auto(..., allow_ocr=True)`` — sample preview always uses OCR if available, since the user is actively diagnosing template fit.

No schema change. OCR's structural support is in commits 1 + 3; this commit just makes it discoverable + opt-out.

Rolling up the 6-commit feature:

    b8aff86  Phase 1 — pure pdf_extract module + tests
    aea520d  Phase 2 — template storage layer + tests
    2f349e8  Phase 3 — Extract/Build/Manage page + nav + i18n
    5a8e2ec  Phase 4 — batch polish (ZIP, sort, status block)
    b86828d  Phase 5 — visual region picker (drawable canvas)
    THIS     Phase 6 — OCR banner + toggle

Each commit is independently revertable; rolling all the way back to ``c16e2a5`` is ``git revert <this> b86828d 5a8e2ec 2f349e8 aea520d b8aff86`` (newest first, so each revert applies cleanly on top of the previous one), or just ``git reset --hard c16e2a5`` on a clean branch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 22:54:11 +00:00
parent b86828d791
commit 967d3f6a11
1 changed files with 44 additions and 9 deletions
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -31,7 +31,12 @@ if str(_project_root) not in sys.path:

 from src.audit import log_event, log_page_open
 from src.gui.components import hide_streamlit_chrome, render_sticky_footer
-from src.pdf_extract import apply_template, extract_pages_auto, render_page_image
+from src.pdf_extract import (
+    apply_template,
+    extract_pages_auto,
+    ocr_available,
+    render_page_image,
+)
 from src.pdf_templates import (
    SCHEMA_VERSION,
    VALID_TARGETS,
@@ -89,12 +94,31 @@ st.caption(
    "every statement that follows the same layout."
 )

-mode = st.radio(
-    "Mode",
-    ["Extract", "Build template", "Manage templates"],
-    horizontal=True,
-    key=K_MODE,
-)
+_ocr_ok, _ocr_reason = ocr_available()
+c_mode, c_ocr = st.columns([3, 2])
+with c_mode:
+    mode = st.radio(
+        "Mode",
+        ["Extract", "Build template", "Manage templates"],
+        horizontal=True,
+        key=K_MODE,
+        label_visibility="collapsed",
+    )
+with c_ocr:
+    if _ocr_ok:
+        st.caption("**OCR:** ready · scanned pages will be transcribed.")
+    else:
+        with st.expander("**OCR:** unavailable", expanded=False):
+            st.caption(
+                f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) "
+                "statements will fall through with warnings. "
+                "To enable OCR, install Tesseract on this machine — "
+                "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · "
+                "macOS: ``brew install tesseract`` · "
+                "Linux: ``apt install tesseract-ocr``. "
+                "Modern text-based statements don't need OCR."
+            )
+
 st.divider()


@@ -127,7 +151,7 @@ def _render_extract_mode() -> None:
        ),
    )

-    c1, c2 = st.columns(2)
+    c1, c2, c3 = st.columns(3)
    sort_by_date = c1.checkbox(
        "Sort combined output by date",
        value=True,
@@ -146,6 +170,15 @@ def _render_extract_mode() -> None:
            "back into separate ledgers."
        ),
    )
+    use_ocr = c3.checkbox(
+        "Use OCR for scanned pages",
+        value=_ocr_ok,
+        disabled=not _ocr_ok,
+        help=(
+            "When a page has no extractable text (typically a scan), "
+            "OCR it with Tesseract. Disabled when OCR isn't installed."
+        ),
+    )

    run = st.button("Extract", type="primary", disabled=not uploads)
    if run and uploads:
@@ -166,7 +199,9 @@ def _render_extract_mode() -> None:
                st.write(f"**{i}/{len(uploads)}** · {up.name}")
                try:
                    pdf_bytes = up.read()
-                    pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
+                    pages, warns = extract_pages_auto(
+                        pdf_bytes, allow_ocr=use_ocr,
+                    )
                    df = apply_template(pages, tpl)
                    df.insert(0, "source_file", up.name)
                    per_file_frames.append(df)