diff --git a/build/datatools.spec b/build/datatools.spec index 5b3212c..b5d3268 100644 --- a/build/datatools.spec +++ b/build/datatools.spec @@ -58,6 +58,18 @@ hidden_imports += collect_submodules("charset_normalizer") hidden_imports += collect_submodules("openpyxl") hidden_imports += collect_submodules("loguru") +# PDF Extractor stack. ``streamlit_drawable_canvas`` and +# ``pypdfium2`` both have their own PyInstaller hooks under +# ``build/hooks/`` that pull in the native binary + frontend +# assets — keep the ``collect_submodules`` calls here for +# belt-and-braces. +hidden_imports += collect_submodules("pdfplumber") +hidden_imports += collect_submodules("pdfminer") +hidden_imports += collect_submodules("pypdfium2") +hidden_imports += collect_submodules("streamlit_drawable_canvas") +hidden_imports += collect_submodules("PIL") +hidden_imports += collect_submodules("pytesseract") + # Our own engine + GUI modules. Even though we import them directly # at the top of ``launcher.py`` / ``app.py``, the Streamlit # session-state and per-page page discovery layers re-import via @@ -77,6 +89,17 @@ datas += collect_data_files("streamlit", include_py_files=False) # phonenumbers ships its country/area-code metadata as resources. datas += collect_data_files("phonenumbers", include_py_files=False) +# PDF Extractor data files. ``pypdfium2`` ships a native PDFium +# shared library (``.dll`` / ``.so`` / ``.dylib``) under its package +# dir; ``streamlit-drawable-canvas`` ships a built JS bundle that +# Streamlit serves from the package dir at runtime; pdfminer ships +# the Adobe CMap tables it uses for character mapping. Hooks +# under ``build/hooks/`` mirror these calls for explicit +# documentation and survive ``collect_data_files`` regressions. +datas += collect_data_files("pypdfium2", include_py_files=False) +datas += collect_data_files("streamlit_drawable_canvas") +datas += collect_data_files("pdfminer", include_py_files=False) + # Our application files. 
PyInstaller's bundler treats source as code # (.pyc) by default; we add it again as data so the launcher's # ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works. diff --git a/build/hooks/hook-pypdfium2.py b/build/hooks/hook-pypdfium2.py new file mode 100644 index 0000000..c62b427 --- /dev/null +++ b/build/hooks/hook-pypdfium2.py @@ -0,0 +1,31 @@ +"""PyInstaller hook for pypdfium2. + +``pypdfium2`` ships the native PDFium shared library as a data file +inside its package directory (``pdfium``-prefixed ``.dll`` on +Windows, ``.so`` on Linux, ``.dylib`` on macOS). PyInstaller's +default discovery picks up Python ``.py``/``.pyc`` but can miss +the binary if the package is wheel-installed and the shared lib +isn't on the ``__init__``'s module-level path it scans. + +This hook is belt-and-braces — the main spec already calls +``collect_data_files("pypdfium2")`` and ``collect_submodules``, +but PyInstaller's hook-discovery-by-name is the documented +escape hatch for native-bundled libraries. Without this, the +visual picker (which renders PDF pages via +``pypdfium2.PdfDocument(...).render(...)``) fails at runtime on +installed builds with a ``FileNotFoundError`` for the PDFium +shared library. +""" + +from PyInstaller.utils.hooks import ( + collect_all, + collect_data_files, + collect_dynamic_libs, +) + +datas, binaries, hiddenimports = collect_all("pypdfium2") +# Make absolutely sure the bundled PDFium .dll/.so/.dylib is +# carried over — PyInstaller treats it as a dynamic lib, not data. +binaries += collect_dynamic_libs("pypdfium2") +# And its raw data files (the type stubs + metadata file). +datas += collect_data_files("pypdfium2", include_py_files=False) diff --git a/build/hooks/hook-streamlit_drawable_canvas.py b/build/hooks/hook-streamlit_drawable_canvas.py new file mode 100644 index 0000000..17483ab --- /dev/null +++ b/build/hooks/hook-streamlit_drawable_canvas.py @@ -0,0 +1,19 @@ +"""PyInstaller hook for streamlit-drawable-canvas. 
+ +Streamlit components are Python packages that also ship a built +JavaScript/CSS bundle Streamlit serves from disk at component- +render time. Without those assets in the bundle the canvas +iframe loads blank — the user sees the page render fine but the +visual picker shows no image and no drawing controls. + +``collect_data_files`` covers the frontend bundle directory +(named ``frontend`` or ``frontend/build`` depending on the +component version). Hidden imports are picked up by the main +spec's ``collect_submodules`` call, repeated here for the same +belt-and-braces reason as ``hook-pypdfium2.py``. +""" + +from PyInstaller.utils.hooks import collect_data_files, collect_submodules + +datas = collect_data_files("streamlit_drawable_canvas") +hiddenimports = collect_submodules("streamlit_drawable_canvas") diff --git a/requirements-dev.txt b/requirements-dev.txt index 173e3b3..84ec15d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,6 @@ pytest>=8.0,<9 pytest-cov>=5.0,<6 +# Test-only: generate small fixture PDFs in +# tests/test_pdf_extract_smoke.py so we can exercise pdfplumber + +# pypdfium2 end-to-end without committing binary fixtures. +fpdf2==2.8.7 diff --git a/requirements.txt b/requirements.txt index 59557d4..226b13d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,8 +8,12 @@ tqdm>=4.66,<5 typer>=0.12,<1 phonenumbers>=8.13,<9 streamlit>=1.35,<2 -streamlit-drawable-canvas>=0.9,<1 cryptography>=41,<49 -pdfplumber>=0.10,<1 -pypdfium2>=4,<6 -pytesseract>=0.3,<1 +# PDF Extractor stack — pinned to exact tested versions so a future +# upstream release can't change the visual picker's coordinate model +# or pdfplumber's word-position behavior mid-build. Bump these +# explicitly when re-testing against a new release. 
+pdfplumber==0.11.9 +pypdfium2==5.8.0 +pytesseract==0.3.13 +streamlit-drawable-canvas==0.9.3 diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py new file mode 100644 index 0000000..5b3fe15 --- /dev/null +++ b/tests/test_pdf_extract_smoke.py @@ -0,0 +1,315 @@ +"""End-to-end smoke tests for the PDF extraction stack. + +These tests run real ``pdfplumber`` + ``pypdfium2`` calls against +a small PDF generated in-memory with ``fpdf2``. They exist to +catch the failure mode the user hit on first install — a missing +or mismatched native dependency that doesn't show up until the +extractor actually tries to open a PDF. + +Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py`` +covers the parsing logic on synthetic ``WordBox`` data with no +PDF dep involved. This file is the layer above: it confirms the +deps themselves work, that hooks bundled them correctly (the +versions pinned in ``requirements.txt`` matter here), and that +the extractor's pipeline survives a round-trip through real +``pdfplumber.extract_words`` and real ``pypdfium2.render``. + +Generation note: ``fpdf2`` is a test-only dep listed in +``requirements-dev.txt``. We don't ship it. +""" + +from __future__ import annotations + +import io + +import pytest + + +def _build_tiny_statement_pdf() -> bytes: + """Render a one-page PDF that looks roughly like the simplest + possible bank statement: a header line + three transaction + rows + a closing-balance footer. 
Word positions are stable + enough that the parser can identify columns by x-position.""" + from fpdf import FPDF + + pdf = FPDF(orientation="P", unit="pt", format="letter") + pdf.add_page() + pdf.set_font("Helvetica", size=12) + # Header + pdf.set_xy(40, 50) + pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT") + # Transaction-table header row + pdf.set_xy(40, 100) + pdf.cell(120, 14, "Date") + pdf.set_xy(160, 100) + pdf.cell(200, 14, "Description") + pdf.set_xy(360, 100) + pdf.cell(80, 14, "Amount") + # Three rows + rows = [ + ("01/15/2026", "Coffee Shop", "(4.50)"), + ("01/16/2026", "Refund Vendor", "$12.00"), + ("01/17/2026", "ATM Withdrawal","(40.00)"), + ] + y = 130 + for date, desc, amt in rows: + pdf.set_xy(40, y) + pdf.cell(120, 14, date) + pdf.set_xy(160, y) + pdf.cell(200, 14, desc) + pdf.set_xy(360, y) + pdf.cell(80, 14, amt) + y += 20 + # Closing-balance footer + pdf.set_xy(40, y + 20) + pdf.cell(0, 14, "Closing balance: $1,000.00") + return bytes(pdf.output()) + + +# --------------------------------------------------------------------------- +# Dependency import smoke +# --------------------------------------------------------------------------- + + +class TestDependencyImports: + """Each runtime PDF dep must be importable. + + These tests will fail fast on a stripped/broken install — most + valuable as a CI gate when the requirements.txt pins are + bumped, so we know the new pin still installs cleanly across + the matrix.""" + + def test_pdfplumber(self): + import pdfplumber # noqa: F401 + + def test_pypdfium2(self): + import pypdfium2 # noqa: F401 + + def test_streamlit_drawable_canvas(self): + # Don't instantiate the canvas — that needs a Streamlit + # script-run context. Just confirm the module loads. 
+ import streamlit_drawable_canvas # noqa: F401 + + def test_pytesseract(self): + # The Python binding must import even when the Tesseract + # binary isn't installed — the OCR availability check + # handles binary absence separately. + import pytesseract # noqa: F401 + + def test_PIL(self): + # Transitively required by pdfplumber + pypdfium2 + canvas. + # Importing it explicitly confirms the hooks pull it through. + from PIL import Image # noqa: F401 + + +# --------------------------------------------------------------------------- +# Real-PDF round-trip +# --------------------------------------------------------------------------- + + +class TestRealPdfRoundTrip: + """``extract_pages`` + ``apply_template`` against a real PDF.""" + + @pytest.fixture + def pdf_bytes(self) -> bytes: + return _build_tiny_statement_pdf() + + def test_extract_pages_returns_words(self, pdf_bytes): + from src.pdf_extract import extract_pages + pages = extract_pages(pdf_bytes) + assert len(pages) == 1 + assert pages[0].width > 0 and pages[0].height > 0 + # At minimum we should have the words from the header and + # one transaction row — proves pdfplumber is wired up. + all_text = " ".join(w.text for w in pages[0].words) + assert "ACME" in all_text + assert "Coffee" in all_text + assert "01/15/2026" in all_text + + def test_apply_template_extracts_three_rows(self, pdf_bytes): + from src.pdf_extract import apply_template, extract_pages + # The template's column boundaries are tuned to fpdf2's + # x-coordinates above (40 / 160 / 360 pt). 
+ tpl = { + "pages": {"range": "all"}, + "table": { + "header_text": "Date Description Amount", + "end_markers": ["Closing balance"], + "column_boundaries": [150, 350], + "y_tolerance": 3.0, + }, + "columns": [ + {"source": 0, "target": "date"}, + {"source": 1, "target": "description"}, + {"source": 2, "target": "amount"}, + ], + "parse": { + "date_format": "%m/%d/%Y", + "amount_negative_in_parens": True, + "merge_multiline_description": True, + }, + } + pages = extract_pages(pdf_bytes) + df = apply_template(pages, tpl) + assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}" + assert list(df["date"]) == [ + "2026-01-15", "2026-01-16", "2026-01-17", + ] + # Parens-negative + currency-positive both round-trip + assert df.iloc[0]["amount"] == -4.50 + assert df.iloc[1]["amount"] == 12.00 + assert df.iloc[2]["amount"] == -40.00 + + +# --------------------------------------------------------------------------- +# pypdfium2 rendering (powers the visual picker) +# --------------------------------------------------------------------------- + + +class TestRenderPageImage: + """``render_page_image`` is what feeds the drawable canvas. + + Catches the most common installer-bug: native PDFium .dll/.so + missing from the bundle. If this test crashes with a + ``FileNotFoundError`` it almost always means the + ``hook-pypdfium2.py`` didn't pick up the shared lib.""" + + def test_renders_a_real_pil_image(self): + from src.pdf_extract import render_page_image + pdf_bytes = _build_tiny_statement_pdf() + image, scale = render_page_image(pdf_bytes, page_no=1) + # Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide. + assert image.width > 800 + assert image.height > 800 + assert scale > 0 + # PIL Image is duck-typed; check the attrs we depend on. 
+ assert hasattr(image, "save") + assert hasattr(image, "tobytes") + + def test_invalid_page_number_clamps(self): + from src.pdf_extract import render_page_image + pdf_bytes = _build_tiny_statement_pdf() + # PDF has 1 page; page_no=99 should clamp, not raise. + image, scale = render_page_image(pdf_bytes, page_no=99) + assert image.width > 0 + + +# --------------------------------------------------------------------------- +# Graceful-fallback behavior +# --------------------------------------------------------------------------- + + +class TestPdfDependencyMissing: + """The page should see a clean exception when a dep is absent, + not a raw ``ImportError`` that leaks into the Streamlit traceback.""" + + def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch): + from src import pdf_extract + # Simulate "pdfplumber not installed" without uninstalling. + # ``_require_pdfplumber`` does its own ``import pdfplumber`` + # at call time; patch ``__import__`` to throw for that one + # name only. 
+ import builtins + real_import = builtins.__import__ + + def fake_import(name, *a, **kw): + if name == "pdfplumber": + raise ImportError("simulated absent dep") + return real_import(name, *a, **kw) + + monkeypatch.setattr(builtins, "__import__", fake_import) + with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info: + pdf_extract._require_pdfplumber() + assert "pdfplumber" in str(exc_info.value) + assert exc_info.value.hint # actionable hint must be populated + + def test_require_pdfium_raises_typed_on_absence(self, monkeypatch): + from src import pdf_extract + import builtins + real_import = builtins.__import__ + + def fake_import(name, *a, **kw): + if name == "pypdfium2": + raise ImportError("simulated absent dep") + return real_import(name, *a, **kw) + + monkeypatch.setattr(builtins, "__import__", fake_import) + with pytest.raises(pdf_extract.PdfDependencyMissing): + pdf_extract._require_pdfium() + + +# --------------------------------------------------------------------------- +# Requirements-pin consistency +# --------------------------------------------------------------------------- + + +class TestPinnedVersionsMatchInstalled: + """If someone bumps the pin in ``requirements.txt`` without + actually reinstalling, this test points it out before CI does. 
+ + Uses ``importlib.metadata`` rather than each library's + ``__version__`` attribute because not every PDF dep exposes + one (``pypdfium2`` keeps version info on a submodule).""" + + def _parse_pins(self) -> dict[str, str]: + from pathlib import Path + text = ( + Path(__file__).resolve().parent.parent / "requirements.txt" + ).read_text(encoding="utf-8") + pins: dict[str, str] = {} + for line in text.splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + if "==" in line: + name, _, version = line.partition("==") + pins[name.strip()] = version.strip() + return pins + + def _installed(self, dist_name: str) -> str: + import importlib.metadata as md + return md.version(dist_name) + + @pytest.mark.parametrize("dist_name", [ + "pdfplumber", + "pypdfium2", + "pytesseract", + "streamlit-drawable-canvas", + ]) + def test_pin_matches_installed(self, dist_name): + pins = self._parse_pins() + if dist_name not in pins: + pytest.skip(f"{dist_name} not exact-pinned in requirements.txt") + installed = self._installed(dist_name) + assert installed == pins[dist_name], ( + f"installed {dist_name}=={installed} but requirements.txt " + f"pins {pins[dist_name]} — bump the pin, or reinstall." + ) + + +# --------------------------------------------------------------------------- +# OCR availability runtime probe +# --------------------------------------------------------------------------- + + +class TestOcrAvailability: + """``ocr_available`` is the linchpin of the UI's OCR banner. 
+ Returns ``(bool, str)`` — both branches must round-trip.""" + + def test_returns_a_tuple(self): + from src.pdf_extract import ocr_available + result = ocr_available() + assert isinstance(result, tuple) + assert len(result) == 2 + ok, reason = result + assert isinstance(ok, bool) + assert isinstance(reason, str) + + def test_extract_pages_auto_skips_ocr_when_disabled(self): + from src.pdf_extract import extract_pages_auto + # With allow_ocr=False, no OCR even if pages are blank. + pdf_bytes = _build_tiny_statement_pdf() + pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False) + assert len(pages) == 1 + # No OCR-disabled warning on a text PDF, since pages have text. + assert not any("OCR is disabled" in w for w in warnings)