From 967d3f6a11aeeb58e8f6f3a4763a81bd673b074e Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Tue, 19 May 2026 22:54:11 +0000
Subject: [PATCH] feat(pdf): OCR availability banner + per-run toggle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 6/6. Final polish layer on top of the OCR pipeline that
``extract_pages_auto`` has carried since commit 1.

- **OCR status banner** at the top of the page next to the mode
  selector. Ready: a one-liner caption confirming OCR will run
  on scanned pages. Unavailable: a collapsed expander explaining
  the missing piece (``pytesseract`` binding vs. Tesseract
  binary) with install pointers for Windows, macOS, and Linux.
  The expander explicitly notes that modern text-based bank
  statements don't need OCR — most users will never expand it.
- **"Use OCR for scanned pages" toggle** in Extract mode,
  defaulting to the runtime availability. Disabled (greyed out)
  when Tesseract isn't usable, so the user can't accidentally
  set themselves up for confusing warnings. Passes through as
  ``allow_ocr`` to ``extract_pages_auto``.
- Build mode's sample-loading path continues to call
  ``extract_pages_auto(..., allow_ocr=True)`` — sample preview
  always uses OCR if available, since the user is actively
  diagnosing template fit.

No schema change. OCR's structural support is in commits 1 + 3;
this commit just makes it discoverable + opt-out.

Rolling up the 6-commit feature:

  b8aff86  Phase 1 — pure pdf_extract module + tests
  aea520d  Phase 2 — template storage layer + tests
  2f349e8  Phase 3 — Extract/Build/Manage page + nav + i18n
  5a8e2ec  Phase 4 — batch polish (ZIP, sort, status block)
  b86828d  Phase 5 — visual region picker (drawable canvas)
  THIS     Phase 6 — OCR banner + toggle

Each commit is independently revertable; rolling all the way
back to ``c16e2a5`` is ``git revert <this> b86828d 5a8e2ec
2f349e8 aea520d b8aff86`` (newest first, so each revert applies
cleanly), or just ``git reset --hard c16e2a5`` on a clean branch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/10_PDF_Extractor.py | 53 +++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)
diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index 611f2c0..cd81750 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -31,7 +31,12 @@ if str(_project_root) not in sys.path:
 
 from src.audit import log_event, log_page_open
 from src.gui.components import hide_streamlit_chrome, render_sticky_footer
-from src.pdf_extract import apply_template, extract_pages_auto, render_page_image
+from src.pdf_extract import (
+    apply_template,
+    extract_pages_auto,
+    ocr_available,
+    render_page_image,
+)
 from src.pdf_templates import (
     SCHEMA_VERSION,
     VALID_TARGETS,
@@ -89,12 +94,31 @@ st.caption(
     "every statement that follows the same layout."
 )
 
-mode = st.radio(
-    "Mode",
-    ["Extract", "Build template", "Manage templates"],
-    horizontal=True,
-    key=K_MODE,
-)
+_ocr_ok, _ocr_reason = ocr_available()
+c_mode, c_ocr = st.columns([3, 2])
+with c_mode:
+    mode = st.radio(
+        "Mode",
+        ["Extract", "Build template", "Manage templates"],
+        horizontal=True,
+        key=K_MODE,
+        label_visibility="collapsed",
+    )
+with c_ocr:
+    if _ocr_ok:
+        st.caption("**OCR:** ready · scanned pages will be transcribed.")
+    else:
+        with st.expander("**OCR:** unavailable", expanded=False):
+            st.caption(
+                f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) "
+                "statements will fall through with warnings. "
+                "To enable OCR, install Tesseract on this machine — "
+                "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · "
+                "macOS: ``brew install tesseract`` · "
+                "Linux: ``apt install tesseract-ocr``. "
+                "Modern text-based statements don't need OCR."
+            )
+
 st.divider()
 
 
@@ -127,7 +151,7 @@ def _render_extract_mode() -> None:
         ),
     )
 
-    c1, c2 = st.columns(2)
+    c1, c2, c3 = st.columns(3)
     sort_by_date = c1.checkbox(
         "Sort combined output by date",
         value=True,
@@ -146,6 +170,15 @@ def _render_extract_mode() -> None:
             "back into separate ledgers."
         ),
     )
+    use_ocr = c3.checkbox(
+        "Use OCR for scanned pages",
+        value=_ocr_ok,
+        disabled=not _ocr_ok,
+        help=(
+            "When a page has no extractable text (typically a scan), "
+            "OCR it with Tesseract. Disabled when OCR isn't installed."
+        ),
+    )
 
     run = st.button("Extract", type="primary", disabled=not uploads)
     if run and uploads:
@@ -166,7 +199,9 @@ def _render_extract_mode() -> None:
                 st.write(f"**{i}/{len(uploads)}** · {up.name}")
                 try:
                     pdf_bytes = up.read()
-                    pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
+                    pages, warns = extract_pages_auto(
+                        pdf_bytes, allow_ocr=use_ocr,
+                    )
                     df = apply_template(pages, tpl)
                     df.insert(0, "source_file", up.name)
                     per_file_frames.append(df)