# datatools-dev/tests/test_pdf_extract_smoke.py

"""End-to-end smoke tests for the PDF extraction stack.

These tests run real ``pdfplumber`` + ``pypdfium2`` calls against
a small PDF generated in-memory with ``fpdf2``. They exist to
catch the failure mode the user hit on first install — a missing
or mismatched native dependency that doesn't show up until the
extractor actually tries to open a PDF.

Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py``
covers the parsing logic on synthetic ``WordBox`` data with no
PDF dep involved. This file is the layer above: it confirms the
deps themselves work, that hooks bundled them correctly (the
versions pinned in ``requirements.txt`` matter here), and that
the extractor's pipeline survives a round-trip through real
``pdfplumber.extract_words`` and real ``pypdfium2.render``.

Generation note: ``fpdf2`` is a test-only dep listed in
``requirements-dev.txt``. We don't ship it.
"""

from __future__ import annotations

import io

import pytest


def _build_tiny_statement_pdf() -> bytes:
    """Generate a minimal one-page, bank-statement-like PDF in memory.

    Layout, top to bottom: a title line, a three-column table header
    (Date / Description / Amount), three transaction rows, and a
    closing-balance line. Every cell is placed at an explicit (x, y)
    position in points, so each column starts at a fixed x offset that
    a column-boundary template can rely on.

    Returns:
        The serialized PDF document as ``bytes``.
    """
    from fpdf import FPDF

    line_height = 14
    left_edge = 40
    # (x origin, cell width) for the Date, Description and Amount columns.
    column_layout = ((left_edge, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    def put_row(top, texts):
        # One cell per column, each positioned explicitly so the x
        # coordinates never drift with the text width.
        for (left, width), text in zip(column_layout, texts):
            doc.set_xy(left, top)
            doc.cell(width, line_height, text)

    # Title line
    doc.set_xy(left_edge, 50)
    doc.cell(
        0, line_height, "ACME BANK STATEMENT",
        new_x="LMARGIN", new_y="NEXT",
    )

    # Table header
    put_row(100, ("Date", "Description", "Amount"))

    # Transactions: first row at y=130, then one row every 20pt.
    transactions = (
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    )
    first_row_top = 130
    row_pitch = 20
    for index, transaction in enumerate(transactions):
        put_row(first_row_top + row_pitch * index, transaction)

    # Closing balance sits 20pt below where a further row would start.
    footer_top = first_row_top + row_pitch * len(transactions) + 20
    doc.set_xy(left_edge, footer_top)
    doc.cell(0, line_height, "Closing balance: $1,000.00")
    return bytes(doc.output())


# ---------------------------------------------------------------------------
# Dependency import smoke
# ---------------------------------------------------------------------------


class TestDependencyImports:
    """Each runtime PDF dep must be importable.

    Every test here is a bare ``import``: it passes when the module
    loads and fails with the import error otherwise. That makes the
    class fail fast on a stripped/broken install — most valuable as a
    CI gate when the requirements.txt pins are bumped, so we know the
    new pin still installs cleanly across the matrix."""

    def test_pdfplumber(self):
        # Import only; nothing is opened or parsed here.
        import pdfplumber  # noqa: F401

    def test_pypdfium2(self):
        # Import only; no page is rendered here.
        import pypdfium2  # noqa: F401

    def test_streamlit_drawable_canvas(self):
        # Don't instantiate the canvas — that needs a Streamlit
        # script-run context. Just confirm the module loads.
        import streamlit_drawable_canvas  # noqa: F401

    def test_pytesseract(self):
        # The Python binding must import even when the Tesseract
        # binary isn't installed — the OCR availability check
        # handles binary absence separately.
        import pytesseract  # noqa: F401

    def test_PIL(self):
        # Transitively required by pdfplumber + pypdfium2 + canvas.
        # Importing it explicitly confirms the hooks pull it through.
        from PIL import Image  # noqa: F401


# ---------------------------------------------------------------------------
# Real-PDF round-trip
# ---------------------------------------------------------------------------


class TestRealPdfRoundTrip:
    """Drive ``extract_pages`` and ``apply_template`` with a real PDF."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of a freshly generated one-page statement PDF."""
        return _build_tiny_statement_pdf()

    def test_extract_pages_returns_words(self, pdf_bytes):
        """The generated PDF yields one sized page containing its words."""
        from src.pdf_extract import extract_pages

        extracted = extract_pages(pdf_bytes)
        assert len(extracted) == 1

        first_page = extracted[0]
        assert first_page.width > 0
        assert first_page.height > 0

        # Seeing a header word plus pieces of one transaction row is
        # enough to prove pdfplumber is wired up.
        joined = " ".join(word.text for word in first_page.words)
        for expected in ("ACME", "Coffee", "01/15/2026"):
            assert expected in joined

    def test_apply_template_extracts_three_rows(self, pdf_bytes):
        """A column template recovers all three transaction rows."""
        from src.pdf_extract import apply_template, extract_pages

        # Boundaries fall between the generator's column x-origins
        # (40 / 160 / 360 pt).
        table_spec = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [150, 350],
            "y_tolerance": 3.0,
        }
        column_spec = [
            {"source": position, "target": field}
            for position, field in enumerate(
                ("date", "description", "amount")
            )
        ]
        parse_spec = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        template = {
            "pages": {"range": "all"},
            "table": table_spec,
            "columns": column_spec,
            "parse": parse_spec,
        }

        df = apply_template(extract_pages(pdf_bytes), template)

        assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}"
        expected_dates = ["2026-01-15", "2026-01-16", "2026-01-17"]
        assert list(df["date"]) == expected_dates

        # Parenthesized amounts come back negative; "$"-prefixed
        # amounts come back positive.
        for row_index, expected_amount in enumerate((-4.50, 12.00, -40.00)):
            assert df.iloc[row_index]["amount"] == expected_amount


# ---------------------------------------------------------------------------
# pypdfium2 rendering (powers the visual picker)
# ---------------------------------------------------------------------------


class TestRenderPageImage:
    """Exercise ``render_page_image``, which feeds the drawable canvas.

    This is the check that surfaces the most common installer bug: the
    native PDFium .dll/.so missing from the bundle. A crash here with
    ``FileNotFoundError`` almost always means ``hook-pypdfium2.py``
    didn't pick up the shared lib."""

    def test_renders_a_real_pil_image(self):
        """Page 1 renders to a large image plus a positive scale."""
        from src.pdf_extract import render_page_image

        rendered, zoom = render_page_image(
            _build_tiny_statement_pdf(), page_no=1,
        )
        # Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide, so both
        # dimensions must clear 800px.
        assert min(rendered.width, rendered.height) > 800
        assert zoom > 0
        # PIL images are duck-typed; only check the attributes we use.
        for attribute in ("save", "tobytes"):
            assert hasattr(rendered, attribute)

    def test_invalid_page_number_clamps(self):
        """An out-of-range page number still yields an image."""
        from src.pdf_extract import render_page_image

        # The document has a single page, so page_no=99 is out of
        # range; the call must clamp rather than raise.
        rendered, _zoom = render_page_image(
            _build_tiny_statement_pdf(), page_no=99,
        )
        assert rendered.width > 0


# ---------------------------------------------------------------------------
# Graceful-fallback behavior
# ---------------------------------------------------------------------------


class TestPdfDependencyMissing:
    """A missing dep must surface as ``PdfDependencyMissing``, not as
    a raw ``ImportError`` that leaks into the Streamlit traceback."""

    @staticmethod
    def _hide_module(monkeypatch, blocked):
        """Make ``import <blocked>`` raise ``ImportError`` for this test.

        Simulates "not installed" without uninstalling anything:
        ``builtins.__import__`` is swapped for a wrapper that raises
        for exactly one module name and defers to the real importer
        for every other name. ``monkeypatch`` undoes the swap at
        teardown.
        """
        import builtins

        genuine_import = builtins.__import__

        def guarded_import(name, *args, **kwargs):
            if name != blocked:
                return genuine_import(name, *args, **kwargs)
            raise ImportError("simulated absent dep")

        monkeypatch.setattr(builtins, "__import__", guarded_import)

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        """``_require_pdfplumber`` raises the typed error, with a hint."""
        from src import pdf_extract

        # ``_require_pdfplumber`` does its own ``import pdfplumber`` at
        # call time, so blocking that one name is enough.
        self._hide_module(monkeypatch, "pdfplumber")

        with pytest.raises(pdf_extract.PdfDependencyMissing) as caught:
            pdf_extract._require_pdfplumber()

        error = caught.value
        assert "pdfplumber" in str(error)
        assert error.hint  # actionable hint must be populated

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        """``_require_pdfium`` raises the typed error when pypdfium2 is gone."""
        from src import pdf_extract

        self._hide_module(monkeypatch, "pypdfium2")

        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()


# ---------------------------------------------------------------------------
# Requirements-pin consistency
# ---------------------------------------------------------------------------


class TestPinnedVersionsMatchInstalled:
    """If someone bumps the pin in ``requirements.txt`` without
    actually reinstalling, this test points it out before CI does.

    Uses ``importlib.metadata`` rather than each library's
    ``__version__`` attribute because not every PDF dep exposes
    one (``pypdfium2`` keeps version info on a submodule)."""

    @staticmethod
    def _canonical(dist_name: str) -> str:
        """Return the PEP 503 normalized form of a distribution name.

        Runs of ``-``, ``_`` and ``.`` collapse to a single ``-`` and
        the result is lowercased, so ``Streamlit_Drawable.Canvas`` and
        ``streamlit-drawable-canvas`` compare equal.
        """
        import re
        return re.sub(r"[-_.]+", "-", dist_name).lower()

    def _parse_pins(self) -> dict[str, str]:
        """Collect the exact (``==``) pins from ``requirements.txt``.

        The file is read from the parent of this test's directory.

        Returns:
            Mapping of normalized distribution name (see
            ``_canonical``) to the pinned version string.

        Ignored on purpose: blank and comment lines, option lines
        (``-r``, ``--hash`` …), requirements without ``==``, and
        wildcard pins such as ``==1.*`` (not an exact version).
        Inline comments, environment markers, extras and trailing
        per-requirement options are stripped before the pin is
        recorded.
        """
        import re
        from pathlib import Path
        text = (
            Path(__file__).resolve().parent.parent / "requirements.txt"
        ).read_text(encoding="utf-8")
        pins: dict[str, str] = {}
        for raw_line in text.splitlines():
            # "#" starts a comment only at line start or after
            # whitespace, so URL fragments ("...#egg=x") survive.
            line = re.sub(r"(^|\s)#.*$", "", raw_line)
            # Everything after ";" is an environment marker; a "=="
            # inside it must not be mistaken for a version pin.
            line = line.partition(";")[0].strip()
            if not line or line.startswith("-"):
                continue
            name, sep, version = line.partition("==")
            if not sep:
                continue
            # "pkg[extra]==1.0" pins the distribution "pkg".
            name = name.partition("[")[0].strip()
            # The first token is the version; later tokens are
            # per-requirement options or a line-continuation "\".
            version_tokens = version.split()
            if not name or not version_tokens:
                continue
            pinned = version_tokens[0]
            if "*" in pinned:
                continue
            pins[self._canonical(name)] = pinned
        return pins

    def _installed(self, dist_name: str) -> str:
        """Return the installed version of ``dist_name`` per its metadata."""
        import importlib.metadata as md
        return md.version(dist_name)

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber",
        "pypdfium2",
        "pytesseract",
        "streamlit-drawable-canvas",
    ])
    def test_pin_matches_installed(self, dist_name):
        """The installed version equals the pin; skip when not pinned."""
        pins = self._parse_pins()
        # Look up by normalized name so the spelling used in
        # requirements.txt (case, "_" vs "-") doesn't cause a
        # silent skip.
        key = self._canonical(dist_name)
        if key not in pins:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
        installed = self._installed(dist_name)
        assert installed == pins[key], (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pins[key]} — bump the pin, or reinstall."
        )


# ---------------------------------------------------------------------------
# OCR availability runtime probe
# ---------------------------------------------------------------------------


class TestOcrAvailability:
    """``ocr_available`` drives the UI's OCR banner and returns a
    ``(bool, str)`` pair; the shape must hold whichever way it goes."""

    def test_returns_a_tuple(self):
        """The probe returns a 2-tuple of ``(bool, str)``."""
        from src.pdf_extract import ocr_available

        outcome = ocr_available()
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2

        available, detail = outcome
        assert isinstance(available, bool)
        assert isinstance(detail, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        """A text PDF extracts without an OCR-disabled warning."""
        from src.pdf_extract import extract_pages_auto

        # allow_ocr=False means no OCR, even for blank pages.
        pages, warnings = extract_pages_auto(
            _build_tiny_statement_pdf(), allow_ocr=False,
        )
        assert len(pages) == 1
        # The page already has text, so nothing should complain that
        # OCR was switched off.
        assert all("OCR is disabled" not in message for message in warnings)


class TestTesseractDiscovery:
    """Windows install paths + env-var override are how a real user
    (no PATH munging) gets OCR working. Cover the discovery logic
    even on Linux/macOS test runners by mocking out the OS check
    and ``Path.exists``."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """Auto-detection reports nothing when the OS is not Windows."""
        from src import pdf_extract
        monkeypatch.setattr(
            "platform.system",
            lambda: "Linux",
        )
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """The default Program Files install location is discovered."""
        from src import pdf_extract
        monkeypatch.setattr("platform.system", lambda: "Windows")

        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            # Pretend exactly one file exists on disk: the target.
            return str(self) == target

        monkeypatch.setattr(
            "pathlib.Path.exists",
            fake_exists,
        )
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(
        self, monkeypatch,
    ):
        """On Windows with no candidate path present, nothing is found."""
        from src import pdf_extract
        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
        portable install at a non-default path works without
        relying on PATH."""
        from src import pdf_extract
        import pytesseract

        # ``ocr_available`` rewrites the process-wide
        # ``tesseract_cmd``. Registering the current value with
        # monkeypatch makes teardown restore it, so the fake path
        # below cannot leak into tests that run afterwards.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )

        # Point the override at a path that doesn't exist —
        # ocr_available will try it and report the failure, but
        # importantly the cmd attribute is set BEFORE the call,
        # which is what we're verifying.
        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin