feat(pdf): OCR availability banner + per-run toggle
Phase 6/6. Final polish layer on top of the OCR pipeline that ``extract_pages_auto`` has carried since commit 1.

- **OCR status banner** at the top of the page next to the mode selector. Ready: a one-liner caption confirming OCR will run on scanned pages. Unavailable: a collapsed expander explaining the missing piece (``pytesseract`` binding vs. Tesseract binary) with install pointers for Windows, macOS, and Linux. The expander explicitly notes that modern text-based bank statements don't need OCR — most users will never expand it.
- **"Use OCR for scanned pages" toggle** in Extract mode, defaulting to the runtime availability. Disabled (greyed out) when Tesseract isn't usable, so the user can't accidentally set themselves up for confusing warnings. Passes through as ``allow_ocr`` to ``extract_pages_auto``.
- Build mode's sample-loading path continues to call ``extract_pages_auto(..., allow_ocr=True)`` — sample preview always uses OCR if available, since the user is actively diagnosing template fit.

No schema change. OCR's structural support is in commits 1 + 3; this commit just makes it discoverable + opt-out.

Rolling up the 6-commit feature:

- b8aff86 Phase 1 — pure pdf_extract module + tests
- aea520d Phase 2 — template storage layer + tests
- 2f349e8 Phase 3 — Extract/Build/Manage page + nav + i18n
- 5a8e2ec Phase 4 — batch polish (ZIP, sort, status block)
- b86828d Phase 5 — visual region picker (drawable canvas)
- THIS Phase 6 — OCR banner + toggle

Each commit is independently revertable; rolling all the way back to ``c16e2a5`` is ``git revert b86828d 5a8e2ec 2f349e8 aea520d b8aff86 <this>`` (or just ``git reset --hard c16e2a5`` on a clean branch).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -31,7 +31,12 @@ if str(_project_root) not in sys.path:
|
|||||||
|
|
||||||
from src.audit import log_event, log_page_open
|
from src.audit import log_event, log_page_open
|
||||||
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||||
from src.pdf_extract import apply_template, extract_pages_auto, render_page_image
|
from src.pdf_extract import (
|
||||||
|
apply_template,
|
||||||
|
extract_pages_auto,
|
||||||
|
ocr_available,
|
||||||
|
render_page_image,
|
||||||
|
)
|
||||||
from src.pdf_templates import (
|
from src.pdf_templates import (
|
||||||
SCHEMA_VERSION,
|
SCHEMA_VERSION,
|
||||||
VALID_TARGETS,
|
VALID_TARGETS,
|
||||||
@@ -89,12 +94,31 @@ st.caption(
|
|||||||
"every statement that follows the same layout."
|
"every statement that follows the same layout."
|
||||||
)
|
)
|
||||||
|
|
||||||
mode = st.radio(
|
_ocr_ok, _ocr_reason = ocr_available()
|
||||||
|
c_mode, c_ocr = st.columns([3, 2])
|
||||||
|
with c_mode:
|
||||||
|
mode = st.radio(
|
||||||
"Mode",
|
"Mode",
|
||||||
["Extract", "Build template", "Manage templates"],
|
["Extract", "Build template", "Manage templates"],
|
||||||
horizontal=True,
|
horizontal=True,
|
||||||
key=K_MODE,
|
key=K_MODE,
|
||||||
)
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
with c_ocr:
|
||||||
|
if _ocr_ok:
|
||||||
|
st.caption("**OCR:** ready · scanned pages will be transcribed.")
|
||||||
|
else:
|
||||||
|
with st.expander("**OCR:** unavailable", expanded=False):
|
||||||
|
st.caption(
|
||||||
|
f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) "
|
||||||
|
"statements will fall through with warnings. "
|
||||||
|
"To enable OCR, install Tesseract on this machine — "
|
||||||
|
"[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · "
|
||||||
|
"macOS: ``brew install tesseract`` · "
|
||||||
|
"Linux: ``apt install tesseract-ocr``. "
|
||||||
|
"Modern text-based statements don't need OCR."
|
||||||
|
)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
|
|
||||||
@@ -127,7 +151,7 @@ def _render_extract_mode() -> None:
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
c1, c2 = st.columns(2)
|
c1, c2, c3 = st.columns(3)
|
||||||
sort_by_date = c1.checkbox(
|
sort_by_date = c1.checkbox(
|
||||||
"Sort combined output by date",
|
"Sort combined output by date",
|
||||||
value=True,
|
value=True,
|
||||||
@@ -146,6 +170,15 @@ def _render_extract_mode() -> None:
|
|||||||
"back into separate ledgers."
|
"back into separate ledgers."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
use_ocr = c3.checkbox(
|
||||||
|
"Use OCR for scanned pages",
|
||||||
|
value=_ocr_ok,
|
||||||
|
disabled=not _ocr_ok,
|
||||||
|
help=(
|
||||||
|
"When a page has no extractable text (typically a scan), "
|
||||||
|
"OCR it with Tesseract. Disabled when OCR isn't installed."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
run = st.button("Extract", type="primary", disabled=not uploads)
|
run = st.button("Extract", type="primary", disabled=not uploads)
|
||||||
if run and uploads:
|
if run and uploads:
|
||||||
@@ -166,7 +199,9 @@ def _render_extract_mode() -> None:
|
|||||||
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
||||||
try:
|
try:
|
||||||
pdf_bytes = up.read()
|
pdf_bytes = up.read()
|
||||||
pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
|
pages, warns = extract_pages_auto(
|
||||||
|
pdf_bytes, allow_ocr=use_ocr,
|
||||||
|
)
|
||||||
df = apply_template(pages, tpl)
|
df = apply_template(pages, tpl)
|
||||||
df.insert(0, "source_file", up.name)
|
df.insert(0, "source_file", up.name)
|
||||||
per_file_frames.append(df)
|
per_file_frames.append(df)
|
||||||
|
|||||||
Reference in New Issue
Block a user