fix(pdf): graceful fallback when PDF dependencies aren't installed
User hit a hard ImportError on opening the PDF→CSV tool because ``pip install -r requirements.txt`` hadn't picked up the new ``pdfplumber`` / ``pypdfium2`` lines yet. Streamlit surfaces that as an unfiltered traceback — it is friendlier to show a clear install-required panel inside the tool instead.

Two changes:

1. ``src/pdf_extract.py`` lazy-imports the PDF deps via ``_require_pdfplumber()`` / ``_require_pdfium()`` helpers that raise a new ``PdfDependencyMissing`` (subclass of ImportError) with an actionable ``hint`` field. Pure helpers (``parse_amount``, ``parse_date``, ``cluster_rows``, etc.) keep working with no PDF dep installed — useful for tests and for keeping module-import paths cheap.

2. The tool page probes both deps at render time via ``_pdf_deps_status()``; if anything's missing it shows a ``st.error`` panel with the exact pip command and a "restart the launcher" reminder, then ``st.stop()``s before touching any PDF code path.

The page itself loads cleanly without the deps installed, so the sidebar nav doesn't 500 — the user just sees the install panel on click.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,11 +32,26 @@ if str(_project_root) not in sys.path:
|
||||
from src.audit import log_event, log_page_open
|
||||
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||
from src.pdf_extract import (
|
||||
PdfDependencyMissing,
|
||||
apply_template,
|
||||
extract_pages_auto,
|
||||
ocr_available,
|
||||
render_page_image,
|
||||
)
|
||||
|
||||
|
||||
def _pdf_deps_status() -> tuple[bool, list[str]]:
    """Probe the runtime PDF dependencies without running an extraction.

    Attempts to import ``pdfplumber`` and ``pypdfium2``, in that order, so a
    missing install can be reported without the user having to hit the
    extract button first.

    Returns:
        ``(ok, missing_names)`` -- ``ok`` is ``True`` only when every probe
        import succeeds; ``missing_names`` lists, in probe order, the modules
        whose import raised ``ImportError``.
    """
    # Function-scope import keeps this helper self-contained.
    from importlib import import_module

    missing: list[str] = []
    for name in ("pdfplumber", "pypdfium2"):
        try:
            # import_module is the documented replacement for calling
            # __import__ directly.
            import_module(name)
        except ImportError:
            missing.append(name)
    return (not missing), missing
|
||||
|
||||
|
||||
from src.pdf_templates import (
|
||||
SCHEMA_VERSION,
|
||||
VALID_TARGETS,
|
||||
@@ -94,6 +109,19 @@ st.caption(
|
||||
"every statement that follows the same layout."
|
||||
)
|
||||
|
||||
# Check the PDF dependencies up front: when any is missing, show the install
# instructions and stop this script run before any PDF code path is reached.
_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    st.error(
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\n"
        "pip install pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract\n"
        "```\n\n"
        "Then **fully restart the launcher** to pick up the new modules. "
        "(Templates you've already saved are unaffected.)"
    )
    st.stop()
|
||||
|
||||
_ocr_ok, _ocr_reason = ocr_available()
|
||||
c_mode, c_ocr = st.columns([3, 2])
|
||||
with c_mode:
|
||||
|
||||
Reference in New Issue
Block a user