diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 611f2c0..cd81750 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -31,7 +31,12 @@ if str(_project_root) not in sys.path: from src.audit import log_event, log_page_open from src.gui.components import hide_streamlit_chrome, render_sticky_footer -from src.pdf_extract import apply_template, extract_pages_auto, render_page_image +from src.pdf_extract import ( + apply_template, + extract_pages_auto, + ocr_available, + render_page_image, +) from src.pdf_templates import ( SCHEMA_VERSION, VALID_TARGETS, @@ -89,12 +94,31 @@ st.caption( "every statement that follows the same layout." ) -mode = st.radio( - "Mode", - ["Extract", "Build template", "Manage templates"], - horizontal=True, - key=K_MODE, -) +_ocr_ok, _ocr_reason = ocr_available() +c_mode, c_ocr = st.columns([3, 2]) +with c_mode: + mode = st.radio( + "Mode", + ["Extract", "Build template", "Manage templates"], + horizontal=True, + key=K_MODE, + label_visibility="collapsed", + ) +with c_ocr: + if _ocr_ok: + st.caption("**OCR:** ready · scanned pages will be transcribed.") + else: + with st.expander("**OCR:** unavailable", expanded=False): + st.caption( + f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) " + "statements will fall through with warnings. " + "To enable OCR, install Tesseract on this machine — " + "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · " + "macOS: ``brew install tesseract`` · " + "Linux: ``apt install tesseract-ocr``. " + "Modern text-based statements don't need OCR." + ) + st.divider() @@ -127,7 +151,7 @@ def _render_extract_mode() -> None: ), ) - c1, c2 = st.columns(2) + c1, c2, c3 = st.columns(3) sort_by_date = c1.checkbox( "Sort combined output by date", value=True, @@ -146,6 +170,15 @@ def _render_extract_mode() -> None: "back into separate ledgers." ), ) + use_ocr = c3.checkbox( + "Use OCR for scanned pages", + value=_ocr_ok, + disabled=not _ocr_ok, + help=( + "When a page has no extractable text (typically a scan), " + "OCR it with Tesseract. Disabled when OCR isn't installed." + ), + ) run = st.button("Extract", type="primary", disabled=not uploads) if run and uploads: @@ -166,7 +199,9 @@ def _render_extract_mode() -> None: st.write(f"**{i}/{len(uploads)}** · {up.name}") try: pdf_bytes = up.read() - pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) + pages, warns = extract_pages_auto( + pdf_bytes, allow_ocr=use_ocr, + ) df = apply_template(pages, tpl) df.insert(0, "source_file", up.name) per_file_frames.append(df)