def _pdf_deps_status(
    modules: tuple[str, ...] = ("pdfplumber", "pypdfium2"),
) -> tuple[bool, list[str]]:
    """Probe the runtime PDF dependencies without running an extraction.

    Each name in ``modules`` is imported in turn. Names whose import raises
    ``ImportError`` are collected, so the caller can report every missing
    module at once instead of failing on the first one when the user hits
    the extract button.

    Args:
        modules: Importable module names to probe. Defaults to the runtime
            PDF modules ``pdfplumber`` and ``pypdfium2``.

    Returns:
        ``(ok, missing_names)``: ``ok`` is ``True`` only when every module
        imported cleanly; ``missing_names`` lists the failures in probe
        order (empty when ``ok`` is ``True``).
    """
    # Function-scope import keeps the helper self-contained; importlib is
    # the documented replacement for calling the ``__import__`` dunder.
    import importlib

    missing: list[str] = []
    for name in modules:
        try:
            importlib.import_module(name)
        except ImportError:
            # ModuleNotFoundError is a subclass, so absent packages land here.
            missing.append(name)
    return (not missing), missing
# When a runtime PDF dependency is missing, the GUI page surfaces a friendly
# install message instead of throwing an ImportError traceback over the whole
# tool. Pure helpers (parse_amount, parse_date, cluster_rows, …) keep working
# with no PDF dep installed.


class PdfDependencyMissing(ImportError):
    """Raised when a runtime PDF dependency is missing.

    Carries an actionable ``hint`` for the GUI to show to the user.

    Attributes:
        missing: Name of the module that could not be imported.
        hint: Install instruction; a default ``pip install`` line is used
            when the caller passes an empty string.
        name: Standard ``ImportError`` attribute, set to ``missing`` so
            handlers written for plain ``ImportError`` see the module name.
    """

    def __init__(self, missing: str, hint: str = ""):
        self.missing = missing
        # Plain string literals: there is nothing to interpolate here.
        self.hint = hint or (
            "Install the PDF dependencies: ``pip install "
            "pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract``"
        )
        super().__init__(
            f"{missing} is not installed. {self.hint}", name=missing
        )


def _require_pdfplumber():
    """Import and return ``pdfplumber``.

    Raises:
        PdfDependencyMissing: If ``pdfplumber`` cannot be imported; the
            original ``ImportError`` is chained as the cause.
    """
    try:
        import pdfplumber  # noqa: PLC0415
    except ImportError as e:
        raise PdfDependencyMissing("pdfplumber") from e
    return pdfplumber


def _require_pdfium():
    """Import and return ``pypdfium2``.

    Raises:
        PdfDependencyMissing: If ``pypdfium2`` cannot be imported; the
            original ``ImportError`` is chained as the cause.
    """
    try:
        import pypdfium2  # noqa: PLC0415
    except ImportError as e:
        raise PdfDependencyMissing("pypdfium2") from e
    return pypdfium2
""" - import pypdfium2 as pdfium - import pytesseract - from PIL import Image # noqa: F401 (transitively required) + pdfium = _require_pdfium() + import pytesseract # noqa: PLC0415 + from PIL import Image # noqa: F401, PLC0415 (transitively required) pages: list[Page] = [] pdf = pdfium.PdfDocument(pdf_bytes)