fix(pdf): graceful fallback when PDF dependencies aren't installed
A user hit a hard ImportError on opening the PDF→CSV tool because ``pip install -r requirements.txt`` hadn't picked up the new ``pdfplumber`` / ``pypdfium2`` lines yet. Streamlit surfaces that as an unfiltered traceback; it is friendlier to show a clear install-required panel inside the tool instead.

Two changes:

1. ``src/pdf_extract.py`` lazy-imports the PDF deps via ``_require_pdfplumber()`` / ``_require_pdfium()`` helpers that raise a new ``PdfDependencyMissing`` (subclass of ImportError) with an actionable ``hint`` field. Pure helpers (``parse_amount``, ``parse_date``, ``cluster_rows``, etc.) keep working with no PDF dep installed, which is useful for tests and for keeping module-import paths cheap.

2. The tool page probes both deps at render time via ``_pdf_deps_status()``; if anything is missing it shows an ``st.error`` panel with the exact pip command and a "restart the launcher" reminder, then calls ``st.stop()`` before touching any PDF code path. The page itself loads cleanly without the deps installed, so the sidebar nav doesn't 500; the user just sees the install panel on click.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,11 +32,26 @@ if str(_project_root) not in sys.path:
|
|||||||
from src.audit import log_event, log_page_open
|
from src.audit import log_event, log_page_open
|
||||||
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||||
from src.pdf_extract import (
|
from src.pdf_extract import (
|
||||||
|
PdfDependencyMissing,
|
||||||
apply_template,
|
apply_template,
|
||||||
extract_pages_auto,
|
extract_pages_auto,
|
||||||
ocr_available,
|
ocr_available,
|
||||||
render_page_image,
|
render_page_image,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_deps_status(
    names: tuple[str, ...] = ("pdfplumber", "pypdfium2"),
) -> tuple[bool, list[str]]:
    """Probe each runtime PDF dep without forcing the user to hit the
    extract button.

    Args:
        names: Importable module names to probe. Defaults to the two PDF
            dependencies this page needs.

    Returns:
        ``(ok, missing_names)`` where ``missing_names`` lists, in probe
        order, every module whose import raised ``ImportError`` and
        ``ok`` is ``True`` only when that list is empty.
    """
    # Local import keeps this helper self-contained; ``import_module`` is
    # the documented replacement for calling ``__import__`` directly.
    import importlib  # noqa: PLC0415

    missing: list[str] = []
    for name in names:
        try:
            importlib.import_module(name)
        except ImportError:
            missing.append(name)
    return not missing, missing
|
||||||
|
|
||||||
|
|
||||||
from src.pdf_templates import (
|
from src.pdf_templates import (
|
||||||
SCHEMA_VERSION,
|
SCHEMA_VERSION,
|
||||||
VALID_TARGETS,
|
VALID_TARGETS,
|
||||||
@@ -94,6 +109,19 @@ st.caption(
|
|||||||
"every statement that follows the same layout."
|
"every statement that follows the same layout."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Gate the page on the PDF dependencies: when any probe fails, show the
# install instructions and halt this script run before any PDF code path.
_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    _pdf_install_msg = (
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\npip install pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract\n```\n\n"
        "Then **fully restart the launcher** to pick up the new modules. "
        "(Templates you've already saved are unaffected.)"
    )
    st.error(_pdf_install_msg)
    st.stop()
|
||||||
|
|
||||||
_ocr_ok, _ocr_reason = ocr_available()
|
_ocr_ok, _ocr_reason = ocr_available()
|
||||||
c_mode, c_ocr = st.columns([3, 2])
|
c_mode, c_ocr = st.columns([3, 2])
|
||||||
with c_mode:
|
with c_mode:
|
||||||
|
|||||||
@@ -39,7 +39,44 @@ from datetime import datetime
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pdfplumber
|
|
||||||
|
|
||||||
|
# The heavy PDF deps are imported lazily so that an environment which has
# not yet installed the new ``requirements.txt`` lines doesn't crash on
# the module-import path. When they are missing, the GUI page shows a
# friendly install message instead of an ImportError traceback over the
# whole tool. Pure helpers (parse_amount, parse_date, cluster_rows, …)
# keep working with no PDF dep installed.
|
||||||
|
|
||||||
|
|
||||||
|
class PdfDependencyMissing(ImportError):
    """Raised when a runtime PDF dependency is missing.

    Carries an actionable ``hint`` for the GUI to show to the user.

    Attributes:
        missing: Name of the module that could not be imported.
        hint: Install instruction; falls back to the pip command for the
            PDF dependency set when the caller passes an empty string.
        name: The standard ``ImportError.name`` attribute, set to the same
            value as ``missing``.
    """

    def __init__(self, missing: str, hint: str = "") -> None:
        self.missing = missing
        self.hint = hint or (
            "Install the PDF dependencies: ``pip install "
            "pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract``"
        )
        # Also populate ImportError's own ``name`` field so handlers that
        # only know about plain ImportError can still see which module
        # was missing.
        super().__init__(
            f"{missing} is not installed. {self.hint}", name=missing
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _require_pdfplumber():
    """Import ``pdfplumber`` on demand and return the module.

    Raises:
        PdfDependencyMissing: If the import raises ``ImportError``; the
            original error is chained as the cause.
    """
    try:
        import pdfplumber as plumber_mod  # noqa: PLC0415
    except ImportError as import_err:
        raise PdfDependencyMissing("pdfplumber") from import_err
    return plumber_mod
|
||||||
|
|
||||||
|
|
||||||
|
def _require_pdfium():
    """Import ``pypdfium2`` on demand and return the module.

    Raises:
        PdfDependencyMissing: If the import raises ``ImportError``; the
            original error is chained as the cause.
    """
    try:
        import pypdfium2 as pdfium_mod  # noqa: PLC0415
    except ImportError as import_err:
        raise PdfDependencyMissing("pypdfium2") from import_err
    return pdfium_mod
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -81,6 +118,7 @@ def extract_pages(pdf_bytes: bytes) -> list[Page]:
|
|||||||
groups them into rows by ``top`` clustering and into columns
|
groups them into rows by ``top`` clustering and into columns
|
||||||
by template-defined x-boundaries.
|
by template-defined x-boundaries.
|
||||||
"""
|
"""
|
||||||
|
pdfplumber = _require_pdfplumber()
|
||||||
out: list[Page] = []
|
out: list[Page] = []
|
||||||
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
|
||||||
for i, page in enumerate(pdf.pages, start=1):
|
for i, page in enumerate(pdf.pages, start=1):
|
||||||
@@ -528,7 +566,7 @@ def render_page_image(
|
|||||||
sensible size for the visual picker — bank statements at 100%
|
sensible size for the visual picker — bank statements at 100%
|
||||||
can be 800–1200 pts wide; we want ~900px on screen.
|
can be 800–1200 pts wide; we want ~900px on screen.
|
||||||
"""
|
"""
|
||||||
import pypdfium2 as pdfium
|
pdfium = _require_pdfium()
|
||||||
|
|
||||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||||
try:
|
try:
|
||||||
@@ -554,9 +592,9 @@ def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]:
|
|||||||
to recover per-word bounding boxes so the same column-assignment
|
to recover per-word bounding boxes so the same column-assignment
|
||||||
pipeline keeps working.
|
pipeline keeps working.
|
||||||
"""
|
"""
|
||||||
import pypdfium2 as pdfium
|
pdfium = _require_pdfium()
|
||||||
import pytesseract
|
import pytesseract # noqa: PLC0415
|
||||||
from PIL import Image # noqa: F401 (transitively required)
|
from PIL import Image # noqa: F401, PLC0415 (transitively required)
|
||||||
|
|
||||||
pages: list[Page] = []
|
pages: list[Page] = []
|
||||||
pdf = pdfium.PdfDocument(pdf_bytes)
|
pdf = pdfium.PdfDocument(pdf_bytes)
|
||||||
|
|||||||
Reference in New Issue
Block a user