User asked to flip the default from YYYYMMDD to YYYY-MM-DD. ISO is the better default for an accountant CSV workflow: - Lexicographic sort = chronological sort (no parsing needed). - Every spreadsheet tool the user might import into recognises it as a real date with no ambiguity (US vs EU readers can't disagree on the order). - Hyphens make the year/month/day boundaries scan-able by eye. Concrete changes: - New module constant ``DEFAULT_DATE_FORMAT = "%Y-%m-%d"``, used as the default for ``format_date()`` and the ``output_date_format`` keyword on ``scan_pdf_for_transactions``. - Page's ``_DATE_FORMAT_CHOICES`` reordered so the ISO entry is first (index 0 = default Streamlit selection); YYYYMMDD drops to second. - Custom-strftime input default also flips to ``%Y-%m-%d``. Tests updated to reflect the new default (``test_dates_formatted_iso_by_default``, ``test_short_dates_get_year_from_period``, ``test_compact_format_round_trip``, plus a new ``test_default_is_iso`` for the format_date helper). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
676 lines
26 KiB
Python
676 lines
26 KiB
Python
"""End-to-end smoke tests for the PDF transaction scanner.

These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
calls against a small statement-shaped PDF generated in memory
with ``fpdf2``. They catch the failure modes most likely to bite
an end-user installer build: missing native lib, broken hook
bundling, pin/installed mismatch.

Generation note: ``fpdf2`` is a test-only dep in
``requirements-dev.txt``. We don't ship it.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
|
|
def _build_statement_pdf_with_header() -> bytes:
    """Render a one-page statement that carries header metadata.

    The page has a bank banner, an account-number line, a statement-period
    line, a column-header row and two transactions whose dates are written
    without a year (``MM/DD``), so the year has to come from the period line.

    Returns:
        The rendered PDF as ``bytes``.
    """
    from fpdf import FPDF

    line_height = 14
    # (x position, cell width) for the Date / Description / Amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    # Full-width banner lines: (y position, text).
    banner_lines = (
        (50, "ACME BANK STATEMENT"),
        (70, "Account Number: ****5678"),
        (85, "Statement Period: 01/01/2025 - 01/31/2025"),
    )
    for top, text in banner_lines:
        doc.set_xy(40, top)
        doc.cell(0, line_height, text, new_x="LMARGIN", new_y="NEXT")

    def write_row(top, values):
        # One cell per column, left to right, all on the same baseline.
        for (left, width), value in zip(columns, values):
            doc.set_xy(left, top)
            doc.cell(width, line_height, value)

    # Column-header row.
    write_row(130, ("Date", "Description", "Amount"))

    # Transactions with SHORT dates; the year is implied by the period.
    transactions = [
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    ]
    for index, transaction in enumerate(transactions):
        write_row(160 + 20 * index, transaction)

    return bytes(doc.output())
|
|
|
|
|
|
def _build_tiny_statement_pdf() -> bytes:
    """Render a minimal one-page statement.

    Layout: a banner line, a column-header row (no amount, so not a
    transaction), three transaction rows with full ``MM/DD/YYYY`` dates,
    and a closing-balance footer. The scanner is expected to pick up
    exactly the three transactions.

    Returns:
        The rendered PDF as ``bytes``.
    """
    from fpdf import FPDF

    line_height = 14
    first_row_top = 130
    row_step = 20
    # (x position, cell width) for the Date / Description / Amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    doc.set_xy(40, 50)
    doc.cell(0, line_height, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")

    def write_row(top, values):
        # One cell per column, left to right, all on the same baseline.
        for (left, width), value in zip(columns, values):
            doc.set_xy(left, top)
            doc.cell(width, line_height, value)

    # Column-header row (not a transaction: it has no amount).
    write_row(100, ("Date", "Description", "Amount"))

    # Three transactions.
    transactions = [
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    ]
    for index, transaction in enumerate(transactions):
        write_row(first_row_top + row_step * index, transaction)

    # Footer sits one extra row-step below where the next row would start.
    # It carries an amount but no date, so it does not have a transaction shape.
    footer_top = first_row_top + row_step * len(transactions) + 20
    doc.set_xy(40, footer_top)
    doc.cell(0, line_height, "Closing balance: $1,000.00")

    return bytes(doc.output())
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dependency import smoke
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDependencyImports:
    """Import smoke checks, one per runtime PDF dependency.

    A stripped install or a missing CI pin shows up here as a plain
    ``ImportError`` before any heavier test runs. The imports are kept
    as literal ``import`` statements on purpose: they are the test.
    """

    def test_pdfplumber(self):
        """``pdfplumber`` can be imported."""
        import pdfplumber  # noqa: F401

    def test_pypdfium2(self):
        """``pypdfium2`` can be imported."""
        import pypdfium2  # noqa: F401

    def test_pytesseract(self):
        """``pytesseract`` can be imported."""
        import pytesseract  # noqa: F401

    def test_PIL(self):
        """Pillow's ``Image`` module can be imported."""
        from PIL import Image  # noqa: F401
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# End-to-end against a real PDF
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestScanPdfForTransactions:
    """Run ``scan_pdf_for_transactions`` against the tiny three-row PDF."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of the minimal statement (header, three rows, footer)."""
        return _build_tiny_statement_pdf()

    def test_finds_three_transactions(self, pdf_bytes):
        """Only the three real transactions come back."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Besides the 3 transactions the page has a column-header row
        # (no amount) and a closing-balance footer (no date on its
        # line); neither has the shape of a transaction.
        assert len(rows) == 3, (
            f"expected 3 rows, got {len(rows)}:\n"
            f"{[r.get('raw') for r in rows]}"
        )

    def test_dates_formatted_iso_by_default(self, pdf_bytes):
        """With no format argument, dates come out as ``%Y-%m-%d``."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # ISO ordering is the default: it sorts lexicographically and
        # imports unambiguously into spreadsheet tools.
        expected_dates = ["2026-01-15", "2026-01-16", "2026-01-17"]
        actual_dates = [row["date"] for row in rows]
        assert actual_dates == expected_dates

    def test_output_date_format_override(self, pdf_bytes):
        """``output_date_format`` replaces the default date layout."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes,
            output_date_format="%Y%m%d",
        )
        expected_dates = ["20260115", "20260116", "20260117"]
        actual_dates = [row["date"] for row in rows]
        assert actual_dates == expected_dates

    def test_account_number_field_present_on_every_row(self, pdf_bytes):
        """Rows expose ``account_number`` but not the period fields."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # ``account_number`` is the only per-row metadata field that
        # surfaces in the CSV; the period fields are extracted but
        # used only for internal year inference.
        for row in rows:
            assert "account_number" in row
            for hidden in ("statement_period_start", "statement_period_end"):
                assert hidden not in row

    def test_parses_amounts_with_signs(self, pdf_bytes):
        """Parenthesised amounts parse negative, ``$`` amounts positive."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        for index, expected in enumerate((-4.50, 12.00, -40.00)):
            assert rows[index]["amount_1"] == expected

    def test_preserves_raw_line(self, pdf_bytes):
        """Every row carries a non-empty ``raw`` source line."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # The raw line lets the user verify what was matched.
        for row in rows:
            assert "raw" in row and row["raw"]
        assert "Coffee" in rows[0]["raw"]

    def test_page_tagged(self, pdf_bytes):
        """Every row of the single-page PDF is tagged with page 1."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        for row in rows:
            assert row["page"] == 1

    def test_negative_in_parens_off(self, pdf_bytes):
        """Disabling parens-negative leaves ``(4.50)`` unparsed.

        The row is still reported; its amount slot holds the raw token
        so the user can see and correct it in the editor.
        """
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes,
            negative_in_parens=False,
        )
        first, second = rows[0], rows[1]
        # "(4.50)" cannot be decoded without parens-negative, so the
        # scanner keeps the raw token.
        assert first["amount_1"] == "(4.50)"
        # "$12.00" still parses to a positive number.
        assert second["amount_1"] == 12.00
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Statement header, row filtering, and multi-line description merging
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestStatementHeaderEndToEnd:
    """Full pipeline against a PDF that has a real statement header.

    Covers metadata extraction, year inference for short dates and
    output-format application together: the combination most likely
    to break on the user's actual Chase statements.
    """

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of the statement with account and period header lines."""
        return _build_statement_pdf_with_header()

    def test_account_number_stamped(self, pdf_bytes):
        """The header's account number is copied onto every row."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows, "expected at least one transaction"
        for row in rows:
            assert row["account_number"] == "****5678"

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        """``MM/DD`` dates pick up the year of the statement period."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Short ``01/13`` + period ending in 2025 -> 2025-01-13.
        # The period is no longer surfaced as a column, but the year
        # inference that depends on it still has to work. Output uses
        # the default ISO format.
        first, second = rows[0], rows[1]
        assert first["date"] == "2025-01-13"
        assert second["date"] == "2025-01-16"

    def test_period_fields_not_in_output(self, pdf_bytes):
        """Statement-period fields never appear in the output rows."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        for row in rows:
            for hidden in ("statement_period_start", "statement_period_end"):
                assert hidden not in row

    def test_compact_format_round_trip(self, pdf_bytes):
        """Year inference also feeds the compact ``%Y%m%d`` format."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes,
            output_date_format="%Y%m%d",
        )
        assert rows[0]["date"] == "20250113"
|
|
|
|
|
|
class TestMultiDateRow:
    """Rows that carry two dates (transaction date + posting date).

    Some statements (Chase, BofA) print both per row. The scanner is
    expected to use the first date in position order and to keep every
    date out of the description.
    """

    def test_first_date_wins_second_excluded_from_description(self, monkeypatch):
        """First date becomes ``date``; the second is not in ``description``."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            # One synthetic line: two short dates, two words, one amount.
            words = [
                WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
                WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
                WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
            ]
            page = Page(page_no=1, width=300, height=20, text="", words=words)
            return [page], []

        # Use the fixture (as the other tests in this module do) instead
        # of a manual swap; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        # First date used as the canonical one.
        assert rows[0]["date"] == "01/13"
        # Second date NOT in the description.
        assert "01/14" not in rows[0]["description"]
        # Description is the actual content between dates and amount.
        assert rows[0]["description"] == "Coffee Shop"
|
|
|
|
|
|
class TestZeroAmountRowsAreDropped:
    """Rows whose amount is exactly 0 are treated as noise.

    Statements like to print lines such as "INTEREST EARNED 0.00" or
    "PAGE TOTAL 0.00"; those are expected to be filtered out, while
    genuinely negative amounts are kept.
    """

    def test_zero_amount_row_dropped(self, monkeypatch):
        """A ``0.00`` row disappears; the real row next to it survives."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                # Real transaction
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Zero-amount noise row (should be dropped)
                WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
                WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
                WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
            ]
            page = Page(page_no=1, width=300, height=40, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self, monkeypatch):
        """A parenthesised (negative) amount is not mistaken for zero."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
            ]
            page = Page(page_no=1, width=300, height=20, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        # -40 is not zero, so the row is kept.
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00
|
|
|
|
|
|
class TestRequiresDescription:
    """Every kept row must have a non-empty description.

    This filters out "Daily Ledger Balances" entries (a date and an
    amount with no description) and similar statement-furniture rows.
    """

    def test_empty_description_row_dropped(self, monkeypatch):
        """A date + amount line with no description is not reported."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                # Real transaction
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Daily-balance row: date + amount, NO description
                WordBox(x0=0, top=200, x1=80, bottom=210, text="01/14/2026"),
                WordBox(x0=200, top=200, x1=280, bottom=210, text="$1,000.00"),
            ]
            page = Page(page_no=1, width=300, height=300, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert "Coffee" in rows[0]["description"]
|
|
|
|
|
|
class TestPrevTransactionResetsPerPage:
    """Continuation merging must not reach across a page break.

    A no-date, no-amount line at the top of page 2 must NOT attach to
    the last transaction of page 1. Pages have independent y-coordinate
    origins, so a vertical-gap check across pages would be meaningless.
    """

    def test_no_cross_page_merge(self, monkeypatch):
        """A page-2 section header is not appended to a page-1 row."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            p1 = Page(
                page_no=1, width=300, height=300, text="",
                words=[
                    WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                    WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                    WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                ],
            )
            p2 = Page(
                page_no=2, width=300, height=300, text="",
                # First content on page 2 is a section header with no
                # date and no amount; it should NOT attach to the
                # Coffee row from page 1.
                words=[
                    WordBox(x0=0, top=0, x1=200, bottom=10, text="Daily"),
                    WordBox(x0=50, top=0, x1=160, bottom=10, text="Ledger"),
                    WordBox(x0=120, top=0, x1=240, bottom=10, text="Balances"),
                ],
            )
            return [p1, p2], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert rows[0]["description"] == "Coffee"
        assert "Ledger" not in rows[0]["description"]
|
|
|
|
|
|
class TestMultilineMergeYGap:
    """Vertical distance decides whether a bare line is a continuation.

    Wrapped-description continuations sit close to the previous
    transaction (~12 pts gap). Section headers further down the page
    must NOT be silently merged in.
    """

    def test_close_continuation_merges(self, monkeypatch):
        """A bare line 10 pts below a transaction joins its description."""
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                # Transaction at y=0..10
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Continuation at y=20..30 (gap = 10 pts): should merge
                WordBox(x0=100, top=20, x1=200, bottom=30, text="Vendor"),
                WordBox(x0=210, top=20, x1=260, bottom=30, text="memo"),
            ]
            page = Page(page_no=1, width=300, height=50, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert "Vendor memo" in rows[0]["description"]

    def test_far_section_header_does_not_merge(self, monkeypatch):
        """Same setup, but the bare line is far below the transaction.

        In the source PDF that would be a different paragraph, so it
        must stay out of the description.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # 100 pts away: well past the 25-pt merge ceiling
                WordBox(x0=0, top=110, x1=200, bottom=120, text="Daily"),
                WordBox(x0=80, top=110, x1=200, bottom=120, text="Ledger"),
                WordBox(x0=180, top=110, x1=300, bottom=120, text="Balances"),
            ]
            page = Page(page_no=1, width=300, height=200, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        assert rows[0]["description"] == "Coffee"
        assert "Ledger" not in rows[0]["description"]
|
|
|
|
|
|
class TestMultilineDescription:
    """Continuation lines between two transactions."""

    def test_continuation_line_merges(self, monkeypatch):
        """A bare line between two rows attaches to the previous row.

        The line has no date and no amount and sits between two
        transaction rows; its text is expected to be appended to the
        first transaction's description, leaving the second untouched.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        # Rendering a real PDF with this exact geometry is awkward, so
        # the page extractor is replaced with one that returns a
        # hand-built page; the scan itself still runs through the
        # public entry point.
        def fake(_pdf_bytes, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Continuation: no date, no amount
                WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
                WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
                # Next transaction
                WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
                WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
                WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
            ]
            page = Page(page_no=1, width=300, height=100, text="", words=words)
            return [page], []

        # Fixture-based patch; pytest restores the real function at teardown.
        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 2
        assert "Vendor memo" in rows[0]["description"]
        assert rows[1]["description"] == "Other"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Graceful fallback when deps absent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestPdfDependencyMissing:
    """The ``_require_*`` helpers raise a typed error for absent deps."""

    @staticmethod
    def _simulate_missing(monkeypatch, blocked):
        """Make ``import <blocked>`` raise ``ImportError`` for this test.

        Every other import is forwarded to the genuine ``__import__``.
        """
        import builtins

        genuine_import = builtins.__import__

        def guarded_import(name, *args, **kwargs):
            if name != blocked:
                return genuine_import(name, *args, **kwargs)
            raise ImportError("simulated absent dep")

        monkeypatch.setattr(builtins, "__import__", guarded_import)

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        """Missing ``pdfplumber`` yields ``PdfDependencyMissing`` with a hint."""
        from src import pdf_extract

        self._simulate_missing(monkeypatch, "pdfplumber")
        with pytest.raises(pdf_extract.PdfDependencyMissing) as caught:
            pdf_extract._require_pdfplumber()

        error = caught.value
        assert "pdfplumber" in str(error)
        assert error.hint

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        """Missing ``pypdfium2`` yields ``PdfDependencyMissing``."""
        from src import pdf_extract

        self._simulate_missing(monkeypatch, "pypdfium2")
        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Requirements pin consistency
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestPinnedVersionsMatchInstalled:
    """Installed PDF dependencies must match the ``requirements.txt`` pins.

    If someone bumps a pin without actually reinstalling, this test
    points it out before CI does.
    """

    @staticmethod
    def _canonical(name: str) -> str:
        """Return *name* in PEP 503 normalised form.

        Runs of ``-``, ``_`` and ``.`` collapse to a single ``-`` and the
        result is lower-cased, so ``PyPDFium2`` and ``pypdfium2`` compare
        equal.
        """
        import re

        return re.sub(r"[-_.]+", "-", name).lower()

    def _parse_pins(self) -> dict[str, str]:
        """Read the exact (``==``) pins from the project's requirements.

        The file is ``requirements.txt`` in the parent of this file's
        directory. Only lines that start with a distribution name are
        considered. Extras (``pkg[extra]==1.0``), environment markers
        (``; python_version ...``), inline comments and trailing options
        such as ``--hash=...`` are ignored. ``===`` and wildcard pins
        (``==1.*``) are skipped because they are not exact versions.

        Returns:
            Mapping of normalised distribution name to pinned version.
        """
        import re
        from pathlib import Path

        requirements = Path(__file__).resolve().parent.parent / "requirements.txt"
        pin_pattern = re.compile(
            r"(?P<name>[A-Za-z0-9][A-Za-z0-9._-]*)"  # distribution name
            r"\s*(?:\[[^\]]*\])?"                    # optional extras
            r"\s*==(?!=)\s*"                         # exactly '==', not '==='
            r"(?P<version>[^\s;#,]+)"                # stop at marker/comment
        )

        pins: dict[str, str] = {}
        for raw_line in requirements.read_text(encoding="utf-8").splitlines():
            # match() anchors at the start, so blank lines, comments and
            # option lines (``-r``, ``-e``, ``--hash``) never match.
            match = pin_pattern.match(raw_line.strip())
            if match is None:
                continue
            version = match["version"]
            if "*" in version:
                # Prefix match, not an exact pin.
                continue
            pins[self._canonical(match["name"])] = version
        return pins

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber",
        "pypdfium2",
        "pytesseract",
    ])
    def test_pin_matches_installed(self, dist_name):
        """The installed version of *dist_name* equals its pin.

        Skipped when the distribution has no exact pin in the file.
        """
        import importlib.metadata as md

        pins = self._parse_pins()
        key = self._canonical(dist_name)
        if key not in pins:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
        installed = md.version(dist_name)
        assert installed == pins[key], (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pins[key]} — bump the pin, or reinstall."
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# OCR availability
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestOcrAvailability:
    """Shape of the OCR-availability probe and the ``allow_ocr`` switch."""

    def test_returns_a_tuple(self):
        """``ocr_available()`` returns a ``(bool, str)`` pair."""
        from src.pdf_extract import ocr_available

        outcome = ocr_available()
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2

        available, explanation = outcome
        assert isinstance(available, bool)
        assert isinstance(explanation, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        """With ``allow_ocr=False`` the tiny PDF yields one page.

        None of the returned warnings may mention "OCR is disabled".
        """
        from src.pdf_extract import extract_pages_auto

        document = _build_tiny_statement_pdf()
        pages, warnings = extract_pages_auto(document, allow_ocr=False)
        assert len(pages) == 1
        assert all("OCR is disabled" not in message for message in warnings)
|
|
|
|
|
|
class TestTesseractDiscovery:
    """Locating the Tesseract binary: auto-detection and env override."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """With ``platform.system()`` reporting Linux, nothing is detected."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """On Windows the Program Files install location is returned."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            # Only the expected install path "exists" on this fake disk.
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
        """On Windows with no candidate path present, nothing is detected."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` ends up in pytesseract's command."""
        import pytesseract
        from src import pdf_extract

        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        # The call below is expected to overwrite pytesseract's
        # module-level ``tesseract_cmd`` with a path that does not exist.
        # Re-setting the current value through monkeypatch records it, so
        # pytest puts it back at teardown and later OCR tests in the same
        # session do not inherit the bogus binary.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin
|