fix(pdf): graceful fallback when PDF dependencies aren't installed

A user hit a hard ImportError when opening the PDF→CSV tool because
their environment had not been reinstalled from ``requirements.txt``
after the ``pdfplumber`` / ``pypdfium2`` lines were added. Streamlit
surfaces that as an unfiltered traceback; it is friendlier to show a
clear install-required panel inside the tool instead.

Two changes:

1. ``src/pdf_extract.py`` lazy-imports the PDF deps via
   ``_require_pdfplumber()`` / ``_require_pdfium()`` helpers that
   raise a new ``PdfDependencyMissing`` (subclass of ImportError)
   with an actionable ``hint`` field. Pure helpers
   (``parse_amount``, ``parse_date``, ``cluster_rows``, etc.)
   keep working with no PDF dep installed — useful for tests and
   for keeping module-import paths cheap.

2. The tool page probes both deps at render time via
   ``_pdf_deps_status()``; if anything's missing it shows a
   ``st.error`` panel with the exact pip command and a
   "restart the launcher" reminder, then ``st.stop()``s before
   touching any PDF code path.

The page itself loads cleanly without the deps installed, so the
sidebar nav doesn't 500 — the user just sees the install panel
on click.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 22:59:20 +00:00
parent 967d3f6a11
commit 2d927bc95f
2 changed files with 71 additions and 5 deletions

View File

@@ -39,7 +39,44 @@ from datetime import datetime
from typing import Any
import pandas as pd
import pdfplumber
# Lazy imports for the heavy PDF deps, so that an environment which has
# not yet been reinstalled from the updated ``requirements.txt`` doesn't
# crash on the module-import path. When a dependency is missing, the GUI
# page shows a friendly install message instead of an ImportError
# traceback taking over the whole tool. Pure helpers (parse_amount,
# parse_date, cluster_rows, …) keep working with no PDF dep installed.
class PdfDependencyMissing(ImportError):
    """Raised when a runtime PDF dependency is missing.

    Carries an actionable ``hint`` for the GUI to show to the user.

    Attributes:
        missing: Name of the package that failed to import.
        hint: Install instruction; when the caller passes an empty hint,
            a default ``pip install`` command is used instead.
        name: The standard ``ImportError.name`` attribute, set to
            ``missing`` so generic ``except ImportError`` handlers can
            identify the module as well.
    """

    def __init__(self, missing: str, hint: str = ""):
        self.missing = missing
        # An empty hint falls back to the default install command.
        self.hint = hint or (
            "Install the PDF dependencies: ``pip install "
            "pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract``"
        )
        # ``name=`` populates ImportError.name; the message is unchanged.
        super().__init__(
            f"{missing} is not installed. {self.hint}", name=missing
        )
def _require_pdfplumber():
    """Import ``pdfplumber`` on demand and return the module.

    Raises:
        PdfDependencyMissing: if ``pdfplumber`` cannot be imported; the
            original ``ImportError`` is chained as the cause.
    """
    try:
        import pdfplumber as module  # noqa: PLC0415
    except ImportError as exc:
        raise PdfDependencyMissing("pdfplumber") from exc
    return module
def _require_pdfium():
    """Import ``pypdfium2`` on demand and return the module.

    Raises:
        PdfDependencyMissing: if ``pypdfium2`` cannot be imported; the
            original ``ImportError`` is chained as the cause.
    """
    try:
        import pypdfium2 as module  # noqa: PLC0415
    except ImportError as exc:
        raise PdfDependencyMissing("pypdfium2") from exc
    return module
# ---------------------------------------------------------------------------
@@ -81,6 +118,7 @@ def extract_pages(pdf_bytes: bytes) -> list[Page]:
groups them into rows by ``top`` clustering and into columns
by template-defined x-boundaries.
"""
pdfplumber = _require_pdfplumber()
out: list[Page] = []
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
for i, page in enumerate(pdf.pages, start=1):
@@ -528,7 +566,7 @@ def render_page_image(
sensible size for the visual picker — bank statements at 100%
can be 800–1200 pts wide; we want ~900px on screen.
"""
import pypdfium2 as pdfium
pdfium = _require_pdfium()
pdf = pdfium.PdfDocument(pdf_bytes)
try:
@@ -554,9 +592,9 @@ def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]:
to recover per-word bounding boxes so the same column-assignment
pipeline keeps working.
"""
import pypdfium2 as pdfium
import pytesseract
from PIL import Image # noqa: F401 (transitively required)
pdfium = _require_pdfium()
import pytesseract # noqa: PLC0415
from PIL import Image # noqa: F401, PLC0415 (transitively required)
pages: list[Page] = []
pdf = pdfium.PdfDocument(pdf_bytes)