"""End-to-end smoke tests for the PDF transaction scanner.

These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
calls against a small statement-shaped PDF generated in memory with
``fpdf2``. They catch the failure modes most likely to bite an
end-user installer build: missing native lib, broken hook bundling,
pin/installed mismatch.

Generation note: ``fpdf2`` is a test-only dep in
``requirements-dev.txt``. We don't ship it.
"""

from __future__ import annotations

import pytest


# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
# NOTE(review): project and third-party imports are kept inside the
# helpers/tests (as in the original file), presumably so a missing
# dependency fails an individual test instead of breaking collection.

def _new_statement_pdf():
    """Return a one-page letter-size ``FPDF`` with the bank title drawn.

    Units are points and the font is Helvetica 12; both fixture builders
    below start from this page.
    """
    from fpdf import FPDF

    pdf = FPDF(orientation="P", unit="pt", format="letter")
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)
    pdf.set_xy(40, 50)
    pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
    return pdf


def _write_row(pdf, y, date, desc, amt) -> None:
    """Draw one date / description / amount line at vertical offset *y*."""
    pdf.set_xy(40, y)
    pdf.cell(120, 14, date)
    pdf.set_xy(160, y)
    pdf.cell(200, 14, desc)
    pdf.set_xy(360, y)
    pdf.cell(80, 14, amt)


def _page(words, *, height, page_no=1):
    """Build a synthetic 300-pt-wide ``Page`` holding *words*."""
    from src.pdf_extract import Page

    return Page(
        page_no=page_no, width=300, height=height, text="", words=words,
    )


def _scan_fake_pages(monkeypatch, pages):
    """Scan with ``extract_pages_auto`` stubbed to yield *pages*.

    The stub is installed through ``monkeypatch`` so the real extractor
    is restored automatically when the test ends, even on failure.
    Returns only the rows; the stub reports no warnings.
    """
    from src import pdf_extract as mod

    def fake_extract(_pdf_bytes, *, allow_ocr=True):
        return list(pages), []

    monkeypatch.setattr(mod, "extract_pages_auto", fake_extract)
    rows, _warnings = mod.scan_pdf_for_transactions(b"")
    return rows


def _hide_module(monkeypatch, blocked: str) -> None:
    """Make ``import <blocked>`` raise ``ImportError`` for this test."""
    import builtins

    real_import = builtins.__import__

    def fake_import(name, *args, **kwargs):
        if name == blocked:
            raise ImportError("simulated absent dep")
        return real_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", fake_import)


# ---------------------------------------------------------------------------
# In-memory fixture PDFs
# ---------------------------------------------------------------------------

def _build_statement_pdf_with_header() -> bytes:
    """Statement with realistic header (account + period) plus
    transactions. Exercises the metadata-extraction path end-to-end."""
    pdf = _new_statement_pdf()
    pdf.set_xy(40, 70)
    pdf.cell(0, 14, "Account Number: ****5678",
             new_x="LMARGIN", new_y="NEXT")
    pdf.set_xy(40, 85)
    pdf.cell(0, 14, "Statement Period: 01/01/2025 - 01/31/2025",
             new_x="LMARGIN", new_y="NEXT")

    # Header row
    _write_row(pdf, 130, "Date", "Description", "Amount")

    # Transactions with SHORT dates — year is implied by period.
    rows = [
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    ]
    y = 160
    for date, desc, amt in rows:
        _write_row(pdf, y, date, desc, amt)
        y += 20
    return bytes(pdf.output())


def _build_tiny_statement_pdf() -> bytes:
    """One-page PDF: header line + three transaction rows + a
    closing-balance footer. The scanner should pick up exactly the
    three transactions."""
    pdf = _new_statement_pdf()

    # Header row (not a transaction — no amount)
    _write_row(pdf, 100, "Date", "Description", "Amount")

    # Three transactions
    rows = [
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    ]
    y = 130
    for date, desc, amt in rows:
        _write_row(pdf, y, date, desc, amt)
        y += 20

    # Footer — has a date-like number maybe but no real txn shape
    pdf.set_xy(40, y + 20)
    pdf.cell(0, 14, "Closing balance: $1,000.00")
    return bytes(pdf.output())


# ---------------------------------------------------------------------------
# Dependency import smoke
# ---------------------------------------------------------------------------

class TestDependencyImports:
    """Each runtime PDF dep must be importable. Fails fast on a
    stripped install or a missing CI pin."""

    def test_pdfplumber(self):
        import pdfplumber  # noqa: F401

    def test_pypdfium2(self):
        import pypdfium2  # noqa: F401

    def test_pytesseract(self):
        import pytesseract  # noqa: F401

    def test_PIL(self):
        from PIL import Image  # noqa: F401


# ---------------------------------------------------------------------------
# End-to-end against a real PDF
# ---------------------------------------------------------------------------

class TestScanPdfForTransactions:
    """Scan the header-less three-transaction fixture end-to-end."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        return _build_tiny_statement_pdf()

    def test_finds_three_transactions(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # The PDF has 3 transactions plus a header and a closing-
        # balance footer. Header has no amount; closing-balance has
        # no date in the same line — neither qualifies as a txn.
        assert len(rows) == 3, (
            f"expected 3 rows, got {len(rows)}:\n"
            f"{[r.get('raw') for r in rows]}"
        )

    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Default output format is %Y%m%d
        assert [r["date"] for r in rows] == [
            "20260115", "20260116", "20260117",
        ]

    def test_output_date_format_override(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        assert [r["date"] for r in rows] == [
            "2026-01-15", "2026-01-16", "2026-01-17",
        ]

    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # The fixture PDF has no statement-period or account
        # header, so the metadata fields exist but are empty
        # strings — the contract is: ALWAYS present on every row.
        for r in rows:
            assert "account_number" in r
            assert "statement_period_start" in r
            assert "statement_period_end" in r

    def test_parses_amounts_with_signs(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows[0]["amount_1"] == -4.50
        assert rows[1]["amount_1"] == 12.00
        assert rows[2]["amount_1"] == -40.00

    def test_preserves_raw_line(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Raw line lets the user verify what was matched.
        assert all("raw" in r and r["raw"] for r in rows)
        assert "Coffee" in rows[0]["raw"]

    def test_page_tagged(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert all(r["page"] == 1 for r in rows)

    def test_negative_in_parens_off(self, pdf_bytes):
        """With parens-negative off, the parser can't decode
        ``(4.50)`` and falls back to the raw text — the row still
        surfaces, just with the unparsed string in the amount slot so
        the user can see and fix it in the editor."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, negative_in_parens=False,
        )
        # Row 0 had "(4.50)" — without parens-negative, parse_amount
        # returns None and the scanner keeps the raw token.
        assert rows[0]["amount_1"] == "(4.50)"
        # Row 1 had "$12.00" — still parses to positive.
        assert rows[1]["amount_1"] == 12.00


# ---------------------------------------------------------------------------
# Statement header: metadata extraction + short-date year inference
# ---------------------------------------------------------------------------

class TestStatementHeaderEndToEnd:
    """A real PDF with a real header — exercise the full pipeline:
    metadata extraction + year inference for short dates + format
    application. This is the failure mode most likely to break on the
    user's actual Chase statements."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        return _build_statement_pdf_with_header()

    def test_metadata_extracted_and_stamped(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows, "expected at least one transaction"
        for r in rows:
            assert r["account_number"] == "****5678"
            assert r["statement_period_start"] == "20250101"
            assert r["statement_period_end"] == "20250131"

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Short ``01/13`` + period ending in 2025 → 20250113
        assert rows[0]["date"] == "20250113"
        assert rows[1]["date"] == "20250116"

    def test_iso_format_round_trip(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        assert rows[0]["date"] == "2025-01-13"
        assert rows[0]["statement_period_start"] == "2025-01-01"
        assert rows[0]["statement_period_end"] == "2025-01-31"


# ---------------------------------------------------------------------------
# Row-level scanner rules (synthetic pages, extractor stubbed out)
# ---------------------------------------------------------------------------

class TestMultiDateRow:
    """Some statements (Chase, BofA) show both a transaction date and
    a posting date per row. The scanner uses the first date in position
    order and excludes every date from the description."""

    def test_first_date_wins_second_excluded_from_description(
        self, monkeypatch,
    ):
        from src.pdf_extract import WordBox

        words = [
            WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
            WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
            WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=20)])

        assert len(rows) == 1
        # First date used as the canonical
        assert rows[0]["date"] == "01/13"
        # Second date NOT in description
        assert "01/14" not in rows[0]["description"]
        # Description is the actual content between dates and amount
        assert rows[0]["description"] == "Coffee Shop"


class TestZeroAmountRowsAreDropped:
    """Rows where the transaction amount is exactly 0 are noise
    (statements love to print "INTEREST EARNED 0.00" or "PAGE TOTAL
    0.00") and get filtered out."""

    def test_zero_amount_row_dropped(self, monkeypatch):
        from src.pdf_extract import WordBox

        words = [
            # Real transaction
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # Zero-amount noise row (should be dropped)
            WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
            WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
            WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=40)])

        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self, monkeypatch):
        from src.pdf_extract import WordBox

        words = [
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=20)])

        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00


class TestRequiresDescription:
    """Every kept row must have non-empty description. Filters out
    "Daily Ledger Balances" entries (date + amount with no description)
    and similar statement-furniture rows."""

    def test_empty_description_row_dropped(self, monkeypatch):
        from src.pdf_extract import WordBox

        words = [
            # Real transaction
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # Daily-balance row: date + amount, NO description
            WordBox(x0=0, top=200, x1=80, bottom=210, text="01/14/2026"),
            WordBox(x0=200, top=200, x1=280, bottom=210, text="$1,000.00"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=300)])

        assert len(rows) == 1
        assert "Coffee" in rows[0]["description"]


# ---------------------------------------------------------------------------
# Multi-line description merging
# ---------------------------------------------------------------------------

class TestPrevTransactionResetsPerPage:
    """A no-date no-amount line at the top of page 2 must NOT attach
    to the last transaction of page 1. Different pages have independent
    y-coordinate origins so the gap check would be meaningless across
    pages."""

    def test_no_cross_page_merge(self, monkeypatch):
        from src.pdf_extract import WordBox

        p1 = _page(
            [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            ],
            height=300,
            page_no=1,
        )
        # First content on page 2 is a section header with no date
        # and no amount — should NOT attach to the Coffee row from
        # page 1.
        p2 = _page(
            [
                WordBox(x0=0, top=0, x1=200, bottom=10, text="Daily"),
                WordBox(x0=50, top=0, x1=160, bottom=10, text="Ledger"),
                WordBox(x0=120, top=0, x1=240, bottom=10, text="Balances"),
            ],
            height=300,
            page_no=2,
        )
        rows = _scan_fake_pages(monkeypatch, [p1, p2])

        assert len(rows) == 1
        assert rows[0]["description"] == "Coffee"
        assert "Ledger" not in rows[0]["description"]


class TestMultilineMergeYGap:
    """Wrapped-description continuations are close to the previous
    transaction (~12 pts gap). Section headers further down the page
    must NOT be silently merged in."""

    def test_close_continuation_merges(self, monkeypatch):
        from src.pdf_extract import WordBox

        words = [
            # Transaction at y=0..10
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # Continuation at y=20..30 (gap = 10 pts) — should merge
            WordBox(x0=100, top=20, x1=200, bottom=30, text="Vendor"),
            WordBox(x0=210, top=20, x1=260, bottom=30, text="memo"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=50)])

        assert len(rows) == 1
        assert "Vendor memo" in rows[0]["description"]

    def test_far_section_header_does_not_merge(self, monkeypatch):
        """Same setup but the second line is far below — would be a
        different paragraph in the source PDF."""
        from src.pdf_extract import WordBox

        words = [
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # 100 pts away — well past the 25-pt merge ceiling
            WordBox(x0=0, top=110, x1=200, bottom=120, text="Daily"),
            WordBox(x0=80, top=110, x1=200, bottom=120, text="Ledger"),
            WordBox(x0=180, top=110, x1=300, bottom=120, text="Balances"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=200)])

        assert len(rows) == 1
        assert rows[0]["description"] == "Coffee"
        assert "Ledger" not in rows[0]["description"]


class TestMultilineDescription:
    """Continuation lines sandwiched between two transactions."""

    def test_continuation_line_merges(self, monkeypatch):
        """A line with no date and no amount, sitting between two
        transaction rows, attaches to the previous transaction's
        description."""
        from src.pdf_extract import WordBox

        # The merge logic is reached through the public entry point;
        # only the page extractor is replaced with synthetic words.
        words = [
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # Continuation: no date, no amount
            WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
            WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
            # Next transaction
            WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
            WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
            WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
        ]
        rows = _scan_fake_pages(monkeypatch, [_page(words, height=100)])

        assert len(rows) == 2
        assert "Vendor memo" in rows[0]["description"]
        assert rows[1]["description"] == "Other"


# ---------------------------------------------------------------------------
# Graceful fallback when deps absent
# ---------------------------------------------------------------------------

class TestPdfDependencyMissing:
    """The ``_require_*`` guards raise the typed error, not ImportError."""

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        from src import pdf_extract

        _hide_module(monkeypatch, "pdfplumber")
        with pytest.raises(pdf_extract.PdfDependencyMissing) as exc:
            pdf_extract._require_pdfplumber()
        assert "pdfplumber" in str(exc.value)
        assert exc.value.hint

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        from src import pdf_extract

        _hide_module(monkeypatch, "pypdfium2")
        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()


# ---------------------------------------------------------------------------
# Requirements pin consistency
# ---------------------------------------------------------------------------

class TestPinnedVersionsMatchInstalled:
    """If someone bumps the pin in ``requirements.txt`` without
    actually reinstalling, this test points it out before CI does."""

    @staticmethod
    def _pins_from_text(text: str) -> dict[str, str]:
        """Extract ``name==version`` pins from requirements-file text.

        Inline comments, environment markers (``; python_version...``),
        extras (``name[extra]``) and trailing options such as
        ``--hash=...`` are stripped so they cannot leak into the
        version. Keys are lower-cased with runs of ``-``/``_``/``.``
        collapsed to ``-`` so a differently-cased pin is still found.
        """
        import re

        pins: dict[str, str] = {}
        for raw in text.splitlines():
            # "#" starts a comment only at line start or after
            # whitespace (a "#fragment" inside a URL is left alone).
            line = re.sub(r"(^|\s)#.*$", "", raw)
            line = line.partition(";")[0].strip()
            if not line or line.startswith("-") or "==" not in line:
                continue
            name, _, rest = line.partition("==")
            tokens = rest.split()
            if not tokens:
                continue
            name = name.partition("[")[0].strip()
            pins[re.sub(r"[-_.]+", "-", name).lower()] = tokens[0]
        return pins

    def _parse_pins(self) -> dict[str, str]:
        """Return the exact pins from the repo-root ``requirements.txt``."""
        from pathlib import Path

        requirements = (
            Path(__file__).resolve().parent.parent / "requirements.txt"
        )
        return self._pins_from_text(
            requirements.read_text(encoding="utf-8")
        )

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber", "pypdfium2", "pytesseract",
    ])
    def test_pin_matches_installed(self, dist_name):
        import importlib.metadata as md

        pins = self._parse_pins()
        if dist_name not in pins:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
        installed = md.version(dist_name)
        assert installed == pins[dist_name], (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pins[dist_name]} — bump the pin, or reinstall."
        )


# ---------------------------------------------------------------------------
# OCR availability
# ---------------------------------------------------------------------------

class TestOcrAvailability:
    """Shape of ``ocr_available()`` and the no-OCR extraction path."""

    def test_returns_a_tuple(self):
        from src.pdf_extract import ocr_available

        result = ocr_available()
        assert isinstance(result, tuple) and len(result) == 2
        ok, reason = result
        assert isinstance(ok, bool)
        assert isinstance(reason, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        from src.pdf_extract import extract_pages_auto

        pdf_bytes = _build_tiny_statement_pdf()
        pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False)
        assert len(pages) == 1
        assert not any("OCR is disabled" in w for w in warnings)


class TestTesseractDiscovery:
    """Locating the tesseract binary: autodetect and env override."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(
        self, monkeypatch,
    ):
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        import pytesseract

        from src import pdf_extract

        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        # ocr_available() overwrites the process-wide tesseract command
        # (that is what the assertion below checks). Register the
        # current value with monkeypatch first so it is restored on
        # teardown and the fake path cannot leak into later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin