"""End-to-end smoke tests for the PDF extraction stack. These tests run real ``pdfplumber`` + ``pypdfium2`` calls against a small PDF generated in-memory with ``fpdf2``. They exist to catch the failure mode the user hit on first install — a missing or mismatched native dependency that doesn't show up until the extractor actually tries to open a PDF. Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py`` covers the parsing logic on synthetic ``WordBox`` data with no PDF dep involved. This file is the layer above: it confirms the deps themselves work, that hooks bundled them correctly (the versions pinned in ``requirements.txt`` matter here), and that the extractor's pipeline survives a round-trip through real ``pdfplumber.extract_words`` and real ``pypdfium2.render``. Generation note: ``fpdf2`` is a test-only dep listed in ``requirements-dev.txt``. We don't ship it. """ from __future__ import annotations import io import pytest def _build_tiny_statement_pdf() -> bytes: """Render a one-page PDF that looks roughly like the simplest possible bank statement: a header line + three transaction rows + a closing-balance footer. Word positions are stable enough that the parser can identify columns by x-position.""" from fpdf import FPDF pdf = FPDF(orientation="P", unit="pt", format="letter") pdf.add_page() pdf.set_font("Helvetica", size=12) # Header pdf.set_xy(40, 50) pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT") # Transaction-table header row pdf.set_xy(40, 100) pdf.cell(120, 14, "Date") pdf.set_xy(160, 100) pdf.cell(200, 14, "Description") pdf.set_xy(360, 100) pdf.cell(80, 14, "Amount") # Three rows rows = [ ("01/15/2026", "Coffee Shop", "(4.50)"), ("01/16/2026", "Refund Vendor", "$12.00"), ("01/17/2026", "ATM Withdrawal","(40.00)"), ] y = 130 for date, desc, amt in rows: pdf.set_xy(40, y) pdf.cell(120, 14, date) pdf.set_xy(160, y) pdf.cell(200, 14, desc) pdf.set_xy(360, y) pdf.cell(80, 14, amt) y += 20 # Closing-balance footer pdf.set_xy(40, y + 20) pdf.cell(0, 14, "Closing balance: $1,000.00") return bytes(pdf.output()) # --------------------------------------------------------------------------- # Dependency import smoke # --------------------------------------------------------------------------- class TestDependencyImports: """Each runtime PDF dep must be importable. These tests will fail fast on a stripped/broken install — most valuable as a CI gate when the requirements.txt pins are bumped, so we know the new pin still installs cleanly across the matrix.""" def test_pdfplumber(self): import pdfplumber # noqa: F401 def test_pypdfium2(self): import pypdfium2 # noqa: F401 def test_streamlit_drawable_canvas(self): # Don't instantiate the canvas — that needs a Streamlit # script-run context. Just confirm the module loads. import streamlit_drawable_canvas # noqa: F401 def test_pytesseract(self): # The Python binding must import even when the Tesseract # binary isn't installed — the OCR availability check # handles binary absence separately. import pytesseract # noqa: F401 def test_PIL(self): # Transitively required by pdfplumber + pypdfium2 + canvas. # Pinning explicit confirms hooks pull it through. from PIL import Image # noqa: F401 # --------------------------------------------------------------------------- # Real-PDF round-trip # --------------------------------------------------------------------------- class TestRealPdfRoundTrip: """``extract_pages`` + ``apply_template`` against a real PDF.""" @pytest.fixture def pdf_bytes(self) -> bytes: return _build_tiny_statement_pdf() def test_extract_pages_returns_words(self, pdf_bytes): from src.pdf_extract import extract_pages pages = extract_pages(pdf_bytes) assert len(pages) == 1 assert pages[0].width > 0 and pages[0].height > 0 # At minimum we should have the words from the header and # one transaction row — proves pdfplumber wired up. all_text = " ".join(w.text for w in pages[0].words) assert "ACME" in all_text assert "Coffee" in all_text assert "01/15/2026" in all_text def test_apply_template_extracts_three_rows(self, pdf_bytes): from src.pdf_extract import apply_template, extract_pages # The template's column boundaries are tuned to fpdf2's # x-coordinates above (40 / 160 / 360 pt). tpl = { "pages": {"range": "all"}, "table": { "header_text": "Date Description Amount", "end_markers": ["Closing balance"], "column_boundaries": [150, 350], "y_tolerance": 3.0, }, "columns": [ {"source": 0, "target": "date"}, {"source": 1, "target": "description"}, {"source": 2, "target": "amount"}, ], "parse": { "date_format": "%m/%d/%Y", "amount_negative_in_parens": True, "merge_multiline_description": True, }, } pages = extract_pages(pdf_bytes) df = apply_template(pages, tpl) assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}" assert list(df["date"]) == [ "2026-01-15", "2026-01-16", "2026-01-17", ] # Parens-negative + currency-positive both round-trip assert df.iloc[0]["amount"] == -4.50 assert df.iloc[1]["amount"] == 12.00 assert df.iloc[2]["amount"] == -40.00 # --------------------------------------------------------------------------- # pypdfium2 rendering (powers the visual picker) # --------------------------------------------------------------------------- class TestRenderPageImage: """``render_page_image`` is what feeds the drawable canvas. Catches the most common installer-bug: native PDFium .dll/.so missing from the bundle. If this test crashes with a ``FileNotFoundError`` it almost always means the ``hook-pypdfium2.py`` didn't pick up the shared lib.""" def test_renders_a_real_pil_image(self): from src.pdf_extract import render_page_image pdf_bytes = _build_tiny_statement_pdf() image, scale = render_page_image(pdf_bytes, page_no=1) # Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide. assert image.width > 800 assert image.height > 800 assert scale > 0 # PIL Image is duck-typed; check the attrs we depend on. assert hasattr(image, "save") assert hasattr(image, "tobytes") def test_invalid_page_number_clamps(self): from src.pdf_extract import render_page_image pdf_bytes = _build_tiny_statement_pdf() # PDF has 1 page; page_no=99 should clamp, not raise. image, scale = render_page_image(pdf_bytes, page_no=99) assert image.width > 0 # --------------------------------------------------------------------------- # Graceful-fallback behavior # --------------------------------------------------------------------------- class TestPdfDependencyMissing: """The page should see a clean exception when a dep is absent, not a raw ``ImportError`` that leaks into the Streamlit traceback.""" def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch): from src import pdf_extract # Simulate "pdfplumber not installed" without uninstalling. # ``_require_pdfplumber`` does its own ``import pdfplumber`` # at call time; patch ``__import__`` to throw for that one # name only. import builtins real_import = builtins.__import__ def fake_import(name, *a, **kw): if name == "pdfplumber": raise ImportError("simulated absent dep") return real_import(name, *a, **kw) monkeypatch.setattr(builtins, "__import__", fake_import) with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info: pdf_extract._require_pdfplumber() assert "pdfplumber" in str(exc_info.value) assert exc_info.value.hint # actionable hint must be populated def test_require_pdfium_raises_typed_on_absence(self, monkeypatch): from src import pdf_extract import builtins real_import = builtins.__import__ def fake_import(name, *a, **kw): if name == "pypdfium2": raise ImportError("simulated absent dep") return real_import(name, *a, **kw) monkeypatch.setattr(builtins, "__import__", fake_import) with pytest.raises(pdf_extract.PdfDependencyMissing): pdf_extract._require_pdfium() # --------------------------------------------------------------------------- # Requirements-pin consistency # --------------------------------------------------------------------------- class TestPinnedVersionsMatchInstalled: """If someone bumps the pin in ``requirements.txt`` without actually reinstalling, this test points it out before CI does. Uses ``importlib.metadata`` rather than each library's ``__version__`` attribute because not every PDF dep exposes one (``pypdfium2`` keeps version info on a submodule).""" def _parse_pins(self) -> dict[str, str]: from pathlib import Path text = ( Path(__file__).resolve().parent.parent / "requirements.txt" ).read_text(encoding="utf-8") pins: dict[str, str] = {} for line in text.splitlines(): line = line.strip() if not line or line.startswith("#"): continue if "==" in line: name, _, version = line.partition("==") pins[name.strip()] = version.strip() return pins def _installed(self, dist_name: str) -> str: import importlib.metadata as md return md.version(dist_name) @pytest.mark.parametrize("dist_name", [ "pdfplumber", "pypdfium2", "pytesseract", "streamlit-drawable-canvas", ]) def test_pin_matches_installed(self, dist_name): pins = self._parse_pins() if dist_name not in pins: pytest.skip(f"{dist_name} not exact-pinned in requirements.txt") installed = self._installed(dist_name) assert installed == pins[dist_name], ( f"installed {dist_name}=={installed} but requirements.txt " f"pins {pins[dist_name]} — bump the pin, or reinstall." ) # --------------------------------------------------------------------------- # OCR availability runtime probe # --------------------------------------------------------------------------- class TestOcrAvailability: """``ocr_available`` is the linchpin of the UI's OCR banner. Returns ``(bool, str)`` — both branches must round-trip.""" def test_returns_a_tuple(self): from src.pdf_extract import ocr_available result = ocr_available() assert isinstance(result, tuple) assert len(result) == 2 ok, reason = result assert isinstance(ok, bool) assert isinstance(reason, str) def test_extract_pages_auto_skips_ocr_when_disabled(self): from src.pdf_extract import extract_pages_auto # With allow_ocr=False, no OCR even if pages are blank. pdf_bytes = _build_tiny_statement_pdf() pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False) assert len(pages) == 1 # No OCR-disabled warning on a text PDF, since pages have text. assert not any("OCR is disabled" in w for w in warnings)