# datatools-dev/tests/test_pdf_extract_smoke.py

"""End-to-end smoke tests for the PDF transaction scanner.

These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
calls against a small statement-shaped PDF generated in memory
with ``fpdf2``. They catch the failure modes most likely to bite
an end-user installer build: missing native lib, broken hook
bundling, pin/installed mismatch.

Generation note: ``fpdf2`` is a test-only dep in
``requirements-dev.txt``. We don't ship it.
"""

from __future__ import annotations

import pytest


def _build_statement_pdf_with_header() -> bytes:
    """Render a one-page statement that carries a header block.

    The page holds a bank title, an ``Account Number`` line, a
    ``Statement Period`` line, a column-header row and two transaction
    rows whose dates are ``MM/DD`` only (no year). Returns the rendered
    PDF as ``bytes``.
    """
    from fpdf import FPDF

    left_edge = 40
    line_height = 14
    # (x position, cell width) of the date / description / amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    # Full-width header lines: (y position, text).
    banner = (
        (50, "ACME BANK STATEMENT"),
        (70, "Account Number: ****5678"),
        (85, "Statement Period: 01/01/2025 - 01/31/2025"),
    )
    for top, text in banner:
        doc.set_xy(left_edge, top)
        doc.cell(0, line_height, text, new_x="LMARGIN", new_y="NEXT")

    def put_row(top, texts):
        # One cell per column, all on the same baseline.
        for (x, width), text in zip(columns, texts):
            doc.set_xy(x, top)
            doc.cell(width, line_height, text)

    # Column-header row.
    put_row(130, ("Date", "Description", "Amount"))

    # Transactions with SHORT dates — the year is implied by the period.
    transactions = (
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    )
    for index, txn in enumerate(transactions):
        put_row(160 + 20 * index, txn)

    return bytes(doc.output())


def _build_tiny_statement_pdf() -> bytes:
    """Render a minimal one-page statement.

    Layout: a title line, a column-header row, three transaction rows
    with full ``MM/DD/YYYY`` dates, and a closing-balance footer line.
    Returns the rendered PDF as ``bytes``.
    """
    from fpdf import FPDF

    line_height = 14
    first_row_top = 130
    row_step = 20
    # (x position, cell width) of the date / description / amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    doc.set_xy(40, 50)
    doc.cell(
        0, line_height, "ACME BANK STATEMENT",
        new_x="LMARGIN", new_y="NEXT",
    )

    def put_row(top, texts):
        # One cell per column, all on the same baseline.
        for (x, width), text in zip(columns, texts):
            doc.set_xy(x, top)
            doc.cell(width, line_height, text)

    # Column-header row (labels only, no amount value).
    put_row(100, ("Date", "Description", "Amount"))

    # Three transactions.
    transactions = (
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    )
    for index, txn in enumerate(transactions):
        put_row(first_row_top + row_step * index, txn)

    # Footer: an amount with no date on the line, one blank row below
    # the last transaction.
    footer_top = first_row_top + row_step * len(transactions) + 20
    doc.set_xy(40, footer_top)
    doc.cell(0, line_height, "Closing balance: $1,000.00")

    return bytes(doc.output())


# ---------------------------------------------------------------------------
# Dependency import smoke
# ---------------------------------------------------------------------------


class TestDependencyImports:
    """Import smoke checks for the runtime PDF dependencies.

    Each test imports a single package; an ``ImportError`` fails the
    test, flagging a stripped install or a missing CI pin early.
    """

    def test_pdfplumber(self):
        """``pdfplumber`` can be imported."""
        import importlib

        importlib.import_module("pdfplumber")

    def test_pypdfium2(self):
        """``pypdfium2`` can be imported."""
        import importlib

        importlib.import_module("pypdfium2")

    def test_pytesseract(self):
        """``pytesseract`` can be imported."""
        import importlib

        importlib.import_module("pytesseract")

    def test_PIL(self):
        """``PIL.Image`` can be imported."""
        import importlib

        importlib.import_module("PIL.Image")


# ---------------------------------------------------------------------------
# End-to-end against a real PDF
# ---------------------------------------------------------------------------


class TestScanPdfForTransactions:
    """End-to-end checks of ``scan_pdf_for_transactions`` against the
    three-row PDF produced by ``_build_tiny_statement_pdf``."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """PDF bytes of the three-transaction fixture statement."""
        return _build_tiny_statement_pdf()

    def test_finds_three_transactions(self, pdf_bytes):
        """Exactly the three transaction lines are returned."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # The PDF has 3 transactions plus a header and a closing-
        # balance footer. Header has no amount; closing-balance has
        # no date in the same line — neither qualifies as a txn.
        assert len(rows) == 3, (
            f"expected 3 rows, got {len(rows)}:\n"
            f"{[r.get('raw') for r in rows]}"
        )

    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
        """Without an explicit format, dates come back as ``%Y%m%d``."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert [r["date"] for r in rows] == [
            "20260115", "20260116", "20260117",
        ]

    def test_output_date_format_override(self, pdf_bytes):
        """``output_date_format`` controls how dates are rendered."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        assert [r["date"] for r in rows] == [
            "2026-01-15", "2026-01-16", "2026-01-17",
        ]

    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
        """Metadata keys exist on every row even without a header."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Guard against a vacuous pass: with zero rows the loop below
        # would assert nothing.
        assert rows, "expected at least one transaction"
        # The fixture PDF has no statement-period or account header.
        # Only the "key is ALWAYS present" contract is checked here;
        # the values themselves are not asserted.
        for r in rows:
            assert "account_number" in r
            assert "statement_period_start" in r
            assert "statement_period_end" in r

    def test_parses_amounts_with_signs(self, pdf_bytes):
        """Parenthesised amounts are negative, ``$`` amounts positive."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows[0]["amount_1"] == -4.50
        assert rows[1]["amount_1"] == 12.00
        assert rows[2]["amount_1"] == -40.00

    def test_preserves_raw_line(self, pdf_bytes):
        """Each row keeps the source line it was matched from."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Raw line lets the user verify what was matched.
        assert all("raw" in r and r["raw"] for r in rows)
        assert "Coffee" in rows[0]["raw"]

    def test_page_tagged(self, pdf_bytes):
        """Every row from the single-page fixture is tagged page 1."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # ``all()`` over an empty list is True, so require rows first.
        assert rows, "expected at least one transaction"
        assert all(r["page"] == 1 for r in rows)

    def test_negative_in_parens_off(self, pdf_bytes):
        """With parens-negative off, the parser can't decode
        ``(4.50)`` and falls back to the raw text — the row still
        surfaces, just with the unparsed string in the amount slot
        so the user can see and fix it in the editor."""
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, negative_in_parens=False,
        )
        # Row 0 had "(4.50)" — the expected fallback is the raw token.
        assert rows[0]["amount_1"] == "(4.50)"
        # Row 1 had "$12.00" — still parses to positive.
        assert rows[1]["amount_1"] == 12.00


# ---------------------------------------------------------------------------
# Statement headers, multi-date rows, zero-amount rows, multi-line descriptions
# ---------------------------------------------------------------------------


class TestStatementHeaderEndToEnd:
    """Full-pipeline checks on a PDF that carries a statement header.

    Covers metadata extraction, year inference for short ``MM/DD``
    dates, and output-format application — the path most likely to
    break on the user's actual Chase statements.
    """

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """PDF bytes of the statement fixture that includes a header."""
        return _build_statement_pdf_with_header()

    def test_metadata_extracted_and_stamped(self, pdf_bytes):
        """Account number and period are copied onto every row."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _ = scan_pdf_for_transactions(pdf_bytes)
        assert scanned, "expected at least one transaction"
        stamped = {
            "account_number": "****5678",
            "statement_period_start": "20250101",
            "statement_period_end": "20250131",
        }
        for row in scanned:
            for field, wanted in stamped.items():
                assert row[field] == wanted

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        """Short ``01/13`` + period ending in 2025 → ``20250113``."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _ = scan_pdf_for_transactions(pdf_bytes)
        for position, wanted in enumerate(("20250113", "20250116")):
            assert scanned[position]["date"] == wanted

    def test_iso_format_round_trip(self, pdf_bytes):
        """The output format applies to row dates and period bounds."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _ = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        first = scanned[0]
        assert first["date"] == "2025-01-13"
        assert first["statement_period_start"] == "2025-01-01"
        assert first["statement_period_end"] == "2025-01-31"


class TestMultiDateRow:
    """Some statements (Chase, BofA) show both a transaction date
    and a posting date per row. The scanner uses the first date
    in position order and excludes every date from the description."""

    def test_first_date_wins_second_excluded_from_description(
        self, monkeypatch,
    ):
        """Feed one synthetic row with two dates and check which date
        is reported and that neither leaks into the description."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
                WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
                WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
            ]
            return [Page(
                page_no=1, width=300, height=20, text="", words=words,
            )], []

        # monkeypatch restores the real extractor at teardown, even if
        # the scan or an assertion below raises.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        # First date used as the canonical
        assert rows[0]["date"] == "01/13"
        # Second date NOT in description
        assert "01/14" not in rows[0]["description"]
        # Description is the actual content between dates and amount
        assert rows[0]["description"] == "Coffee Shop"


class TestZeroAmountRowsAreDropped:
    """Rows where the transaction amount is exactly 0 are noise
    (statements love to print "INTEREST EARNED 0.00" or
    "PAGE TOTAL 0.00") and get filtered out."""

    def test_zero_amount_row_dropped(self, monkeypatch):
        """A ``0.00`` row next to a real transaction is filtered out."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                # Real transaction
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Zero-amount noise row (should be dropped)
                WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
                WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
                WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
            ]
            return [Page(
                page_no=1, width=300, height=40, text="", words=words,
            )], []

        # monkeypatch restores the real extractor at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self, monkeypatch):
        """A non-zero negative amount is not mistaken for a zero row."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
            ]
            return [Page(
                page_no=1, width=300, height=20, text="", words=words,
            )], []

        # monkeypatch restores the real extractor at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00


class TestMultilineDescription:
    """Continuation lines are folded into the preceding transaction."""

    def test_continuation_line_merges(self, monkeypatch):
        """A line with no date and no amount, sitting between two
        transaction rows, attaches to the previous transaction's
        description."""
        from src import pdf_extract as mod
        from src.pdf_extract import (
            Page,
            WordBox,
            scan_pdf_for_transactions,
        )

        # Replace the page extractor with a stub so the public entry
        # point runs over a hand-built page; no real PDF is needed to
        # cover the merge behavior.
        def fake(_pdf_bytes, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Continuation: no date, no amount
                WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
                WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
                # Next transaction
                WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
                WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
                WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
            ]
            return [Page(
                page_no=1, width=300, height=100, text="", words=words,
            )], []

        # monkeypatch restores the real extractor at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 2
        assert "Vendor memo" in rows[0]["description"]
        assert rows[1]["description"] == "Other"


# ---------------------------------------------------------------------------
# Graceful fallback when deps absent
# ---------------------------------------------------------------------------


class TestPdfDependencyMissing:
    """The ``_require_*`` helpers raise ``PdfDependencyMissing`` when
    the import of the underlying package fails."""

    @staticmethod
    def _hide_module(monkeypatch, blocked):
        """Make ``import <blocked>`` raise ``ImportError`` for the
        duration of the test; every other import passes through."""
        import builtins

        passthrough = builtins.__import__

        def guarded_import(name, *args, **kwargs):
            if name != blocked:
                return passthrough(name, *args, **kwargs)
            raise ImportError("simulated absent dep")

        monkeypatch.setattr(builtins, "__import__", guarded_import)

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        """The error names the missing package and carries a hint."""
        from src import pdf_extract

        self._hide_module(monkeypatch, "pdfplumber")
        with pytest.raises(pdf_extract.PdfDependencyMissing) as caught:
            pdf_extract._require_pdfplumber()
        error = caught.value
        assert "pdfplumber" in str(error)
        assert error.hint

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        """A missing ``pypdfium2`` surfaces as the typed error."""
        from src import pdf_extract

        self._hide_module(monkeypatch, "pypdfium2")
        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()


# ---------------------------------------------------------------------------
# Requirements pin consistency
# ---------------------------------------------------------------------------


class TestPinnedVersionsMatchInstalled:
    """If someone bumps the pin in ``requirements.txt`` without
    actually reinstalling, this test points it out before CI does."""

    @staticmethod
    def _canonical(name: str) -> str:
        """Normalize a distribution name: lowercase, with runs of
        ``-``, ``_`` and ``.`` collapsed to a single ``-``."""
        import re

        return re.sub(r"[-_.]+", "-", name).lower()

    def _parse_pins(self) -> dict[str, str]:
        """Return ``{normalized name: version}`` for every exact
        (``==``) pin in the ``requirements.txt`` one directory above
        this file's directory.

        Only the version token itself is captured, so inline comments,
        environment markers (``; python_version < ...``), ``--hash``
        options and line continuations do not leak into the version.
        Extras (``pkg[extra]==1.0``) are ignored for the name. Comment
        and option lines, ``===`` pins and wildcard pins (``==1.*``)
        are skipped because they are not exact version pins.
        """
        import re
        from pathlib import Path

        pin_re = re.compile(
            r"^\s*([A-Za-z0-9][A-Za-z0-9._-]*)"  # distribution name
            r"\s*(?:\[[^\]]*\])?"                # optional [extras]
            r"\s*==\s*"                          # exact-pin operator
            r"([^\s;,#\\=]+)"                    # version token only
        )
        text = (
            Path(__file__).resolve().parent.parent / "requirements.txt"
        ).read_text(encoding="utf-8")
        pins: dict[str, str] = {}
        for line in text.splitlines():
            found = pin_re.match(line)
            if found is None:
                continue
            name, version = found.groups()
            if "*" in version:
                # ``==1.*`` matches a range of versions, not one.
                continue
            pins[self._canonical(name)] = version
        return pins

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber",
        "pypdfium2",
        "pytesseract",
    ])
    def test_pin_matches_installed(self, dist_name):
        """The installed version equals the pinned one; skipped when
        the distribution has no exact pin."""
        import importlib.metadata as md
        pins = self._parse_pins()
        key = self._canonical(dist_name)
        if key not in pins:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
        installed = md.version(dist_name)
        assert installed == pins[key], (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pins[key]} — bump the pin, or reinstall."
        )


# ---------------------------------------------------------------------------
# OCR availability
# ---------------------------------------------------------------------------


class TestOcrAvailability:
    """Shape checks for ``ocr_available`` and the ``allow_ocr=False``
    path of ``extract_pages_auto``."""

    def test_returns_a_tuple(self):
        """``ocr_available()`` yields a ``(bool, str)`` pair."""
        from src.pdf_extract import ocr_available

        outcome = ocr_available()
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        available, why = outcome
        assert isinstance(available, bool)
        assert isinstance(why, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        """With OCR off, the fixture PDF still yields its one page and
        no warning mentions OCR being disabled."""
        from src.pdf_extract import extract_pages_auto

        pages, warnings = extract_pages_auto(
            _build_tiny_statement_pdf(), allow_ocr=False,
        )
        assert len(pages) == 1
        assert all("OCR is disabled" not in note for note in warnings)


class TestTesseractDiscovery:
    """Locating the Tesseract binary: platform autodetection and the
    ``DATATOOLS_TESSERACT_PATH`` override."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """Autodetection yields nothing when the platform is Linux."""
        from src import pdf_extract
        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """On Windows, the Program Files install path is returned when
        it is the only path reported as existing."""
        from src import pdf_extract
        monkeypatch.setattr("platform.system", lambda: "Windows")
        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            # Pretend only the expected install location exists.
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
        """On Windows with no candidate path present, nothing is found."""
        from src import pdf_extract
        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` ends up as pytesseract's
        ``tesseract_cmd`` after ``ocr_available()`` runs."""
        from src import pdf_extract
        import pytesseract

        # This test expects ocr_available() to repoint the module-global
        # ``tesseract_cmd``. Register its current value with monkeypatch
        # first so it is restored at teardown and the fake path does not
        # leak into later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )
        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin