User tried ``brew install tesseract`` in PowerShell after seeing all three OSes listed inline in the OCR banner — an easy mistake when the install commands are crammed onto one line with ``·`` separators. Two changes pre-empt this:

**OS-aware OCR banner.** The expander now detects the user's platform via ``platform.system()`` and shows only the relevant install instructions:

- **Windows**: UB-Mannheim installer link, numbered steps, explicit "keep the Add to PATH checkbox on" callout, plus a fallback paragraph telling the user how to set ``DATATOOLS_TESSERACT_PATH`` if they already installed without PATH and don't want to reinstall.
- **macOS**: ``brew install tesseract`` with a Homebrew link.
- **Linux**: ``apt install tesseract-ocr`` with an "or your distro's equivalent" hedge.

**Robust binary discovery in ``ocr_available()``.** Three-stage:

1. Honor the ``DATATOOLS_TESSERACT_PATH`` env var if set — explicit override for portable installs or non-default locations.
2. Try ``pytesseract``'s default PATH-based lookup.
3. If PATH lookup fails, probe known Windows install paths (``C:\Program Files\Tesseract-OCR\tesseract.exe``, the x86 variant, and ``%LOCALAPPDATA%\Programs\Tesseract-OCR\``) via the new ``_autodetect_tesseract_path``. On hit, set ``pytesseract.pytesseract.tesseract_cmd`` so all subsequent ``image_to_data`` calls use the same binary without re-discovering.

This means a user who runs the UB-Mannheim installer with default options but forgets the PATH checkbox will still get OCR working after a launcher restart, without env-var gymnastics.

Tests (4 new, 85 total in the suite):

- Auto-detect returns None on non-Windows (no false positives on dev laptops).
- Auto-detect finds the binary at a mocked ``C:\Program Files\Tesseract-OCR\tesseract.exe``.
- Auto-detect returns None when no candidate exists.
- ``DATATOOLS_TESSERACT_PATH`` env var beats both PATH lookup and auto-detect (sets ``tesseract_cmd`` even when the path doesn't resolve, so a real binary at a custom location works).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
369 lines
14 KiB
Python
369 lines
14 KiB
Python
"""End-to-end smoke tests for the PDF extraction stack.
|
|
|
|
These tests run real ``pdfplumber`` + ``pypdfium2`` calls against
|
|
a small PDF generated in-memory with ``fpdf2``. They exist to
|
|
catch the failure mode the user hit on first install — a missing
|
|
or mismatched native dependency that doesn't show up until the
|
|
extractor actually tries to open a PDF.
|
|
|
|
Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py``
|
|
covers the parsing logic on synthetic ``WordBox`` data with no
|
|
PDF dep involved. This file is the layer above: it confirms the
|
|
deps themselves work, that hooks bundled them correctly (the
|
|
versions pinned in ``requirements.txt`` matter here), and that
|
|
the extractor's pipeline survives a round-trip through real
|
|
``pdfplumber.extract_words`` and real ``pypdfium2.render``.
|
|
|
|
Generation note: ``fpdf2`` is a test-only dep listed in
|
|
``requirements-dev.txt``. We don't ship it.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
|
|
import pytest
|
|
|
|
|
|
def _build_tiny_statement_pdf() -> bytes:
    """Build a one-page, letter-size PDF shaped like a minimal bank statement.

    Page contents, top to bottom (coordinates in points):

    * a title line at (40, 50);
    * a table header row at y=100 with cells starting at x=40 / 160 / 360;
    * three transaction rows starting at y=130, 20pt apart;
    * a closing-balance footer 20pt below the slot after the last row.

    The fixed x-origins are what let a column-boundary based parser tell
    the date, description and amount cells apart.

    Returns:
        The serialized PDF document as ``bytes``.
    """
    # Imported lazily: fpdf2 is only needed when a test actually builds a PDF.
    from fpdf import FPDF

    # (x-origin, cell width) for the date / description / amount columns.
    column_layout = ((40, 120), (160, 200), (360, 80))
    cell_height = 14
    row_pitch = 20

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    def draw_row(top: int, texts: tuple[str, str, str]) -> None:
        # One cell per column, each positioned explicitly so the x-origin
        # never depends on the width of the previous cell's text.
        for (left, width), text in zip(column_layout, texts):
            doc.set_xy(left, top)
            doc.cell(width, cell_height, text)

    # Title line.
    doc.set_xy(40, 50)
    doc.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")

    # Table header row.
    draw_row(100, ("Date", "Description", "Amount"))

    # Transaction rows: one parenthesised negative, one currency-prefixed
    # positive, and a second parenthesised negative.
    transactions = (
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    )
    top = 130
    for transaction in transactions:
        draw_row(top, transaction)
        top += row_pitch

    # Footer, one extra row-height below the slot after the last row.
    doc.set_xy(40, top + 20)
    doc.cell(0, 14, "Closing balance: $1,000.00")

    return bytes(doc.output())
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dependency import smoke
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDependencyImports:
    """Import-only smoke checks, one per runtime PDF dependency.

    A stripped or broken install fails here immediately, which makes
    these useful as a CI gate after the ``requirements.txt`` pins are
    bumped: a failure means the new pin no longer installs cleanly
    somewhere in the matrix.
    """

    @staticmethod
    def _assert_importable(module_name: str) -> None:
        """Import ``module_name``; any import error fails the calling test."""
        import importlib

        importlib.import_module(module_name)

    def test_pdfplumber(self):
        self._assert_importable("pdfplumber")

    def test_pypdfium2(self):
        self._assert_importable("pypdfium2")

    def test_streamlit_drawable_canvas(self):
        # Module load only. Creating an actual canvas requires a running
        # Streamlit script context, which a unit test does not have.
        self._assert_importable("streamlit_drawable_canvas")

    def test_pytesseract(self):
        # Only the Python binding is checked here. A missing Tesseract
        # binary is a separate concern covered by the OCR availability
        # probe, so this import must succeed even without the binary.
        self._assert_importable("pytesseract")

    def test_PIL(self):
        # Pillow arrives transitively via pdfplumber, pypdfium2 and the
        # canvas component; importing it explicitly confirms it really
        # was pulled through.
        self._assert_importable("PIL.Image")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Real-PDF round-trip
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestRealPdfRoundTrip:
    """Drive ``extract_pages`` and ``apply_template`` with a genuine PDF."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """A freshly generated one-page statement PDF."""
        statement = _build_tiny_statement_pdf()
        return statement

    def test_extract_pages_returns_words(self, pdf_bytes):
        from src.pdf_extract import extract_pages

        extracted = extract_pages(pdf_bytes)
        assert len(extracted) == 1

        first_page = extracted[0]
        assert first_page.width > 0 and first_page.height > 0

        # Seeing a title word plus both ends of a transaction row is
        # enough to show the word extractor is wired up.
        joined = " ".join(word.text for word in first_page.words)
        for fragment in ("ACME", "Coffee", "01/15/2026"):
            assert fragment in joined

    def test_apply_template_extracts_three_rows(self, pdf_bytes):
        from src.pdf_extract import apply_template, extract_pages

        # Boundaries at 150 / 350 sit between the 40 / 160 / 360 pt
        # x-origins the PDF builder places its three columns at.
        table_spec = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [150, 350],
            "y_tolerance": 3.0,
        }
        column_map = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "description", "amount"))
        ]
        parse_options = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        template = {
            "pages": {"range": "all"},
            "table": table_spec,
            "columns": column_map,
            "parse": parse_options,
        }

        frame = apply_template(extract_pages(pdf_bytes), template)

        row_count = len(frame)
        assert row_count == 3, f"expected 3 rows, got {row_count}:\n{frame}"
        assert list(frame["date"]) == [
            "2026-01-15", "2026-01-16", "2026-01-17",
        ]
        # Covers both amount spellings in the fixture: "(4.50)" must come
        # back negative and "$12.00" positive.
        for position, expected in enumerate((-4.50, 12.00, -40.00)):
            assert frame.iloc[position]["amount"] == expected
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# pypdfium2 rendering (powers the visual picker)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestRenderPageImage:
    """Checks on ``render_page_image``, the source of the canvas bitmap.

    This is where a bundle that is missing the native PDFium shared
    library (.dll/.so) tends to surface. A ``FileNotFoundError`` raised
    from these tests almost always means ``hook-pypdfium2.py`` failed to
    collect that library.
    """

    def test_renders_a_real_pil_image(self):
        from src.pdf_extract import render_page_image

        rendered, zoom = render_page_image(
            _build_tiny_statement_pdf(), page_no=1,
        )

        # A letter page is 612pt wide; with a scale of roughly 900/612
        # (about 1.47) the bitmap should come out near 900px wide.
        assert rendered.width > 800
        assert rendered.height > 800
        assert zoom > 0

        # Duck-type check: only the image attributes relied upon
        # downstream are required.
        for attribute in ("save", "tobytes"):
            assert hasattr(rendered, attribute)

    def test_invalid_page_number_clamps(self):
        from src.pdf_extract import render_page_image

        # The fixture has a single page, so page 99 is out of range; the
        # renderer is expected to clamp rather than raise.
        rendered, _zoom = render_page_image(
            _build_tiny_statement_pdf(), page_no=99,
        )
        assert rendered.width > 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Graceful-fallback behavior
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestPdfDependencyMissing:
    """The page should see a clean, typed exception when a dep is absent,
    not a raw ``ImportError`` that leaks into the Streamlit traceback."""

    @staticmethod
    def _hide_package(monkeypatch, package: str) -> None:
        """Make importing ``package`` (or any submodule of it) fail.

        Simulates "not installed" without uninstalling anything by
        swapping ``builtins.__import__`` for a shim that raises
        ``ImportError`` for the named package and delegates every other
        import to the real implementation. ``monkeypatch`` restores the
        real ``__import__`` at test teardown.

        Args:
            monkeypatch: pytest's ``monkeypatch`` fixture.
            package: Top-level import name to hide, e.g. ``"pdfplumber"``.
        """
        import builtins

        real_import = builtins.__import__
        # Match "pkg.sub" as well as "pkg". The trailing dot keeps sibling
        # distributions that merely share the prefix importable.
        submodule_prefix = package + "."

        def fake_import(name, *args, **kwargs):
            if name == package or name.startswith(submodule_prefix):
                raise ImportError("simulated absent dep")
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        from src import pdf_extract

        # ``_require_pdfplumber`` performs its own ``import pdfplumber``
        # at call time, so hiding the package just before the call is
        # enough to exercise the failure branch.
        self._hide_package(monkeypatch, "pdfplumber")

        with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info:
            pdf_extract._require_pdfplumber()
        assert "pdfplumber" in str(exc_info.value)
        assert exc_info.value.hint  # actionable hint must be populated

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        from src import pdf_extract

        self._hide_package(monkeypatch, "pypdfium2")

        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Requirements-pin consistency
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestPinnedVersionsMatchInstalled:
    """If someone bumps the pin in ``requirements.txt`` without
    actually reinstalling, this test points it out before CI does.

    Uses ``importlib.metadata`` rather than each library's
    ``__version__`` attribute because not every PDF dep exposes
    one (``pypdfium2`` keeps version info on a submodule)."""

    @staticmethod
    def _canonical(dist_name: str) -> str:
        """Return ``dist_name`` lower-cased with runs of ``-``/``_``/``.``
        collapsed to a single ``-``, so ``Foo_Bar`` and ``foo-bar`` compare
        equal."""
        import re

        return re.sub(r"[-_.]+", "-", dist_name).lower()

    @classmethod
    def _pins_from_text(cls, text: str) -> dict[str, str]:
        """Extract exact ``name==version`` pins from requirements-file text.

        Handles the decorations that commonly surround a pin: trailing
        ``# comments``, ``; environment markers``, ``[extras]``, per-line
        options such as ``--hash=...`` and ``\\`` line continuations.
        Lines that are options (``-r``, ``--index-url`` ...), non-exact
        specifiers (``>=``, ``==1.*``, ``==1.0,<2``) or otherwise not a
        plain pin are ignored.

        Args:
            text: Full contents of a requirements file.

        Returns:
            Mapping of canonical distribution name to pinned version.
        """
        import re

        pins: dict[str, str] = {}
        for raw_line in text.splitlines():
            # A "#" starts a comment only at line start or after whitespace,
            # which leaves URL fragments like "#egg=" alone.
            line = re.sub(r"(^|\s)#.*$", "", raw_line).strip()
            if not line or line.startswith("-"):
                continue
            # Drop the environment marker before looking for "==", since a
            # marker may itself contain "==".
            requirement = line.split(";", 1)[0]
            if "==" not in requirement:
                continue
            name, _, remainder = requirement.partition("==")
            name = name.split("[", 1)[0].strip()
            # lstrip handles "===" (arbitrary equality); taking the first
            # token drops trailing options and a continuation backslash.
            tokens = remainder.lstrip("=").split()
            if not tokens:
                continue
            if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", name):
                # e.g. "pkg>=1.0,==1.5": not a simple exact pin.
                continue
            version = tokens[0]
            if any(marker in version for marker in "*,"):
                # Wildcards and compound specifiers are not exact pins.
                continue
            pins[cls._canonical(name)] = version
        return pins

    def _parse_pins(self) -> dict[str, str]:
        """Read ``requirements.txt`` from the directory above this test
        file's directory and return its exact pins, keyed by canonical
        distribution name.

        Raises:
            FileNotFoundError: if that ``requirements.txt`` does not exist.
        """
        from pathlib import Path

        requirements = (
            Path(__file__).resolve().parent.parent / "requirements.txt"
        )
        return self._pins_from_text(requirements.read_text(encoding="utf-8"))

    def _installed(self, dist_name: str) -> str:
        """Return the installed version of ``dist_name`` as reported by
        ``importlib.metadata`` (raises ``PackageNotFoundError`` if absent)."""
        import importlib.metadata as md

        return md.version(dist_name)

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber",
        "pypdfium2",
        "pytesseract",
        "streamlit-drawable-canvas",
    ])
    def test_pin_matches_installed(self, dist_name):
        import importlib.metadata as md

        try:
            pins = self._parse_pins()
        except FileNotFoundError:
            # No pin file in this layout means there is nothing to
            # compare against; treat it like an unpinned dist.
            pytest.skip("requirements.txt not found; nothing to compare")

        pinned = pins.get(self._canonical(dist_name))
        if pinned is None:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")

        try:
            installed = self._installed(dist_name)
        except md.PackageNotFoundError:
            pytest.fail(
                f"requirements.txt pins {dist_name}=={pinned} but "
                f"{dist_name} is not installed — reinstall."
            )

        assert installed == pinned, (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pinned} — bump the pin, or reinstall."
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# OCR availability runtime probe
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestOcrAvailability:
    """Shape checks for ``ocr_available``, which drives the UI's OCR
    banner, plus the ``allow_ocr=False`` path of ``extract_pages_auto``.

    ``ocr_available`` must hand back a ``(bool, str)`` pair whichever
    way the probe goes.
    """

    def test_returns_a_tuple(self):
        from src.pdf_extract import ocr_available

        probe = ocr_available()
        assert isinstance(probe, tuple)
        assert len(probe) == 2

        available, why = probe
        assert isinstance(available, bool)
        assert isinstance(why, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        from src.pdf_extract import extract_pages_auto

        # OCR is switched off explicitly; the fixture is a text PDF, so
        # every page already carries words.
        pages, warnings = extract_pages_auto(
            _build_tiny_statement_pdf(), allow_ocr=False,
        )
        assert len(pages) == 1

        # Pages with text must not trigger the "OCR is disabled" notice.
        ocr_notices = [note for note in warnings if "OCR is disabled" in note]
        assert not ocr_notices
|
|
|
|
|
|
class TestTesseractDiscovery:
    """Windows install paths + env-var override are how a real user
    (no PATH munging) gets OCR working. Cover the discovery logic
    even on Linux/macOS test runners by mocking out the OS check
    and ``Path.exists``.

    Every patch goes through ``monkeypatch`` so nothing, including the
    process-wide ``pytesseract.pytesseract.tesseract_cmd``, leaks into
    tests that run afterwards."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")

        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        # Extra args are accepted because ``Path.exists`` gained a
        # ``follow_symlinks`` keyword in Python 3.12; a self-only stand-in
        # would raise TypeError for any caller that passes it.
        def fake_exists(self, *args, **kwargs):
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(
        self, monkeypatch,
    ):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr(
            "pathlib.Path.exists",
            lambda self, *args, **kwargs: False,
        )
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
        portable install at a non-default path works without
        relying on PATH."""
        import pytesseract

        from src import pdf_extract

        # ``ocr_available`` is expected to overwrite the module-global
        # ``tesseract_cmd``. Re-setting it to its current value through
        # monkeypatch registers that value for restoration at teardown,
        # so the fake path does not survive into later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )

        # The override deliberately points at a file that does not exist.
        # Whatever ``ocr_available`` reports about that, the thing under
        # test is only that the override reached ``tesseract_cmd``.
        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)

        pdf_extract.ocr_available()

        assert pytesseract.pytesseract.tesseract_cmd == fake_bin
|