Files
datatools-dev/tests/test_pdf_extract_smoke.py
Michael 155dd30746 feat(pdf): extract statement header (account + period) + date format
Three related additions for the accountant workflow:

**1. Statement header extraction.** New
``extract_statement_metadata(pages)`` pulls the account number
and statement period out of the first page (falls back to
page 1+2 if either is missing on page 1 — Wells Fargo business
accounts put header info on page 2). Detected fields are
stamped onto EVERY transaction row so a multi-statement CSV is
self-attributing per row::

    {
      "date": "20250113",
      "description": "Coffee Shop",
      "amount_1": -4.50,
      "account_number": "****5678",
      "statement_period_start": "20250101",
      "statement_period_end": "20250131",
      ...
    }

Account-number regex is tolerant of masks (``****1234``),
hyphens (``1234-5678-9012``), and spaces. Period regex looks
for "Statement Period" / "From" / "Period Covered" labels plus
the first 1-2 full-year dates that follow. If only one date is
present near the label, it's used for both start and end (some
statements show only the closing date).

**2. Year inference for short dates.** When the row date is a
short ``01/13`` or ``Jan 13`` without a year, the scanner now
binds the year from the statement period's end date BEFORE
formatting. Doesn't handle the December-in-January-statement
cross-year case (rare; user can edit in the table).

**3. Configurable output date format.** New
``output_date_format`` parameter on ``scan_pdf_for_transactions``
defaults to ``%Y%m%d``. Applied to: the transaction date column
AND the statement period start/end fields. The page surfaces a
dropdown in Scan options with common presets (YYYYMMDD,
YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a
Custom option that accepts a raw strftime string.

New helper: ``format_date(iso_str, fmt)`` converts ISO
``YYYY-MM-DD`` to any strftime; passes invalid input through
unchanged so the user can see what was actually there rather
than getting silent empties.

20 new tests cover: format_date, account-number extraction
(masked / hyphenated / spaced / no-label / short), period
extraction (standard / from-to / single-date / no-label),
metadata orchestrator (full header / no pages / page-2
fallback), year inference (US / dash / month-name / no-period /
unparseable), plus an end-to-end class that builds a header'd
PDF with short-date transactions and confirms metadata
attribution + year inference + format round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:20:46 +00:00

520 lines
20 KiB
Python

"""End-to-end smoke tests for the PDF transaction scanner.
These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
calls against a small statement-shaped PDF generated in memory
with ``fpdf2``. They catch the failure modes most likely to bite
an end-user installer build: missing native lib, broken hook
bundling, pin/installed mismatch.
Generation note: ``fpdf2`` is a test-only dep in
``requirements-dev.txt``. We don't ship it.
"""
from __future__ import annotations
import pytest
def _build_statement_pdf_with_header() -> bytes:
    """Render a one-page statement that carries a full header.

    The page holds a bank title, an ``Account Number`` line, a
    ``Statement Period`` line, a column-header row and two
    transactions whose dates are short (``MM/DD``, no year). It is
    the fixture for the metadata-extraction path end-to-end.

    Returns:
        The rendered PDF as ``bytes``.
    """
    from fpdf import FPDF

    line_h = 14
    # (x offset, cell width) of the Date / Description / Amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    # Full-width lines: title, account number, statement period.
    banner = (
        (50, "ACME BANK STATEMENT"),
        (70, "Account Number: ****5678"),
        (85, "Statement Period: 01/01/2025 - 01/31/2025"),
    )
    for top, text in banner:
        doc.set_xy(40, top)
        doc.cell(0, line_h, text, new_x="LMARGIN", new_y="NEXT")

    def write_row(top, values):
        # One cell per column, left to right, all on the same baseline.
        for (left, width), value in zip(columns, values):
            doc.set_xy(left, top)
            doc.cell(width, line_h, value)

    # Column-header row.
    write_row(130, ("Date", "Description", "Amount"))

    # Transactions with SHORT dates — the year is implied by the period.
    transactions = (
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    )
    for index, transaction in enumerate(transactions):
        write_row(160 + 20 * index, transaction)

    return bytes(doc.output())
def _build_tiny_statement_pdf() -> bytes:
    """Render the minimal one-page statement used by most tests.

    Layout: a title line, a column-header row, three transaction
    rows with full ``MM/DD/YYYY`` dates, and a closing-balance
    footer. The scanner is expected to pick up exactly the three
    transactions.

    Returns:
        The rendered PDF as ``bytes``.
    """
    from fpdf import FPDF

    line_h = 14
    row_step = 20
    first_row_top = 130
    # (x offset, cell width) of the Date / Description / Amount columns.
    columns = ((40, 120), (160, 200), (360, 80))

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    doc.set_xy(40, 50)
    doc.cell(0, line_h, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")

    def write_row(top, values):
        # One cell per column, left to right, all on the same baseline.
        for (left, width), value in zip(columns, values):
            doc.set_xy(left, top)
            doc.cell(width, line_h, value)

    # Column-header row (not a transaction — it has no amount).
    write_row(100, ("Date", "Description", "Amount"))

    transactions = (
        ("01/15/2026", "Coffee Shop", "(4.50)"),
        ("01/16/2026", "Refund Vendor", "$12.00"),
        ("01/17/2026", "ATM Withdrawal", "(40.00)"),
    )
    for index, transaction in enumerate(transactions):
        write_row(first_row_top + row_step * index, transaction)

    # Footer — carries an amount but no date, so it is not
    # transaction-shaped. It sits one extra row-step below the
    # position that follows the last transaction.
    footer_top = first_row_top + row_step * len(transactions) + 20
    doc.set_xy(40, footer_top)
    doc.cell(0, line_h, "Closing balance: $1,000.00")

    return bytes(doc.output())
# ---------------------------------------------------------------------------
# Dependency import smoke
# ---------------------------------------------------------------------------
class TestDependencyImports:
    """Every runtime PDF dependency has to be importable.

    These fail fast on a stripped install or a missing CI pin.
    """

    @staticmethod
    def _load(module_name):
        """Import *module_name*, letting any ImportError propagate."""
        from importlib import import_module

        return import_module(module_name)

    def test_pdfplumber(self):
        """``pdfplumber`` imports."""
        self._load("pdfplumber")

    def test_pypdfium2(self):
        """``pypdfium2`` imports."""
        self._load("pypdfium2")

    def test_pytesseract(self):
        """``pytesseract`` imports."""
        self._load("pytesseract")

    def test_PIL(self):
        """Pillow's ``PIL.Image`` submodule imports."""
        self._load("PIL.Image")
# ---------------------------------------------------------------------------
# End-to-end against a real PDF
# ---------------------------------------------------------------------------
class TestScanPdfForTransactions:
    """Run ``scan_pdf_for_transactions`` against the tiny fixture PDF.

    The scanner is imported inside each test so an import failure is
    reported per test rather than as a collection error.
    """

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of the three-transaction statement fixture."""
        return _build_tiny_statement_pdf()

    def test_finds_three_transactions(self, pdf_bytes):
        """Exactly the three transaction rows are returned."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, warnings = scan_pdf_for_transactions(pdf_bytes)
        # The PDF has 3 transactions plus a header and a closing-
        # balance footer. Header has no amount; closing-balance has
        # no date in the same line — neither qualifies as a txn.
        assert len(rows) == 3, (
            f"expected 3 rows, got {len(rows)}:\n"
            f"{[r.get('raw') for r in rows]}"
        )

    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
        """Without an override, dates come back as ``%Y%m%d``."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert [r["date"] for r in rows] == [
            "20260115", "20260116", "20260117",
        ]

    def test_output_date_format_override(self, pdf_bytes):
        """``output_date_format`` is applied to the date column."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        assert [r["date"] for r in rows] == [
            "2026-01-15", "2026-01-16", "2026-01-17",
        ]

    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
        """Metadata keys exist on every row even with no header."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Without this guard the loop below passes vacuously when the
        # scanner returns nothing.
        assert rows, "expected at least one transaction"
        # The fixture PDF has no statement-period or account header.
        # The contract checked here is key PRESENCE on every row; the
        # values themselves are not asserted.
        for r in rows:
            assert "account_number" in r
            assert "statement_period_start" in r
            assert "statement_period_end" in r

    def test_parses_amounts_with_signs(self, pdf_bytes):
        """Parenthesised amounts are negative, ``$`` amounts positive."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows[0]["amount_1"] == -4.50
        assert rows[1]["amount_1"] == 12.00
        assert rows[2]["amount_1"] == -40.00

    def test_preserves_raw_line(self, pdf_bytes):
        """Each row keeps a non-empty ``raw`` copy of the matched line."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows, "expected at least one transaction"
        # Raw line lets the user verify what was matched.
        assert all("raw" in r and r["raw"] for r in rows)
        assert "Coffee" in rows[0]["raw"]

    def test_page_tagged(self, pdf_bytes):
        """Every row records the (1-based) page it came from."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # ``all()`` over an empty list is True — rule that out first.
        assert rows, "expected at least one transaction"
        assert all(r["page"] == 1 for r in rows)

    def test_negative_in_parens_off(self, pdf_bytes):
        """With parens-negative off, the parser can't decode
        ``(4.50)`` and falls back to the raw text — the row still
        surfaces, just with the unparsed string in the amount slot
        so the user can see and fix it in the editor."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes, negative_in_parens=False,
        )
        # Row 0 had "(4.50)" — without parens-negative, parse_amount
        # returns None and the scanner keeps the raw token.
        assert rows[0]["amount_1"] == "(4.50)"
        # Row 1 had "$12.00" — still parses to positive.
        assert rows[1]["amount_1"] == 12.00
# ---------------------------------------------------------------------------
# Statement header, multi-date rows, zero-amount rows, multi-line descriptions
# ---------------------------------------------------------------------------
class TestStatementHeaderEndToEnd:
    """Full pipeline against a PDF that has a real header.

    Covers metadata extraction, year inference for short dates and
    output-format application together. This is the failure mode
    most likely to break on the user's actual Chase statements.
    """

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of the header-bearing statement fixture."""
        return _build_statement_pdf_with_header()

    def test_metadata_extracted_and_stamped(self, pdf_bytes):
        """Account number and period are stamped onto every row."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _warnings = scan_pdf_for_transactions(pdf_bytes)
        assert scanned, "expected at least one transaction"
        stamped = {
            "account_number": "****5678",
            "statement_period_start": "20250101",
            "statement_period_end": "20250131",
        }
        for row in scanned:
            for field, wanted in stamped.items():
                assert row[field] == wanted

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        """Short ``01/13`` + period ending in 2025 → ``20250113``."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _warnings = scan_pdf_for_transactions(pdf_bytes)
        first, second = scanned[0], scanned[1]
        assert first["date"] == "20250113"
        assert second["date"] == "20250116"

    def test_iso_format_round_trip(self, pdf_bytes):
        """The override format reaches the date AND both period fields."""
        from src.pdf_extract import scan_pdf_for_transactions

        scanned, _warnings = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        head = scanned[0]
        assert head["date"] == "2025-01-13"
        assert head["statement_period_start"] == "2025-01-01"
        assert head["statement_period_end"] == "2025-01-31"
class TestMultiDateRow:
    """Some statements (Chase, BofA) show both a transaction date
    and a posting date per row. The scanner uses the first date
    in position order and excludes every date from the description."""

    def test_first_date_wins_second_excluded_from_description(
        self, monkeypatch,
    ):
        """A row with two leading dates yields one transaction.

        ``extract_pages_auto`` is replaced with a stub that returns a
        single synthetic page, so no real PDF parsing happens. The
        patch goes through ``monkeypatch`` so it is undone
        automatically at teardown.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            # One visual line: two short dates, two words, one amount.
            words = [
                WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
                WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
                WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
            ]
            return [Page(
                page_no=1, width=300, height=20, text="", words=words,
            )], []

        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 1
        # First date used as the canonical. It stays in its short form
        # because the synthetic page has no statement period to infer
        # a year from.
        assert rows[0]["date"] == "01/13"
        # Second date NOT in description
        assert "01/14" not in rows[0]["description"]
        # Description is the actual content between dates and amount
        assert rows[0]["description"] == "Coffee Shop"
class TestZeroAmountRowsAreDropped:
    """Rows where the transaction amount is exactly 0 are noise
    (statements love to print "INTEREST EARNED 0.00" or
    "PAGE TOTAL 0.00") and get filtered out."""

    @staticmethod
    def _scan_words(monkeypatch, words, *, height):
        """Scan a synthetic single page made of *words*.

        Replaces ``extract_pages_auto`` (via ``monkeypatch``, so it is
        restored at teardown) with a stub returning one 300-wide page
        of the given *height*, then runs the public scanner.

        Returns:
            The list of rows produced by ``scan_pdf_for_transactions``.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            # Hand out a fresh list on every call, as an inline
            # literal would.
            return [Page(
                page_no=1, width=300, height=height, text="",
                words=list(words),
            )], []

        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")
        return rows

    def test_zero_amount_row_dropped(self, monkeypatch):
        """A ``0.00`` row is filtered; its neighbour survives."""
        from src.pdf_extract import WordBox

        rows = self._scan_words(monkeypatch, [
            # Real transaction
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
            # Zero-amount noise row (should be dropped)
            WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
            WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
            WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
        ], height=40)

        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self, monkeypatch):
        """A negative amount is not zero, so the row is kept."""
        from src.pdf_extract import WordBox

        rows = self._scan_words(monkeypatch, [
            WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
            WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
            WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
        ], height=20)

        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00
class TestMultilineDescription:
    """Continuation lines fold into the preceding transaction."""

    def test_continuation_line_merges(self, monkeypatch):
        """A line with no date and no amount, sitting between two
        transaction rows, attaches to the previous transaction's
        description."""
        from src import pdf_extract as mod
        from src.pdf_extract import (
            Page,
            WordBox,
            scan_pdf_for_transactions,
        )

        # Rather than rendering a PDF, stub ``extract_pages_auto`` so
        # the public entry point receives a hand-built page. The patch
        # goes through ``monkeypatch`` and is undone at teardown.
        def fake(_pdf_bytes, *, allow_ocr=True):
            words = [
                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
                # Continuation: no date, no amount
                WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
                WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
                # Next transaction
                WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
                WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
                WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
            ]
            return [Page(
                page_no=1, width=300, height=100, text="", words=words,
            )], []

        monkeypatch.setattr(mod, "extract_pages_auto", fake)
        rows, _ = scan_pdf_for_transactions(b"")

        assert len(rows) == 2
        assert "Vendor memo" in rows[0]["description"]
        assert rows[1]["description"] == "Other"
# ---------------------------------------------------------------------------
# Graceful fallback when deps absent
# ---------------------------------------------------------------------------
class TestPdfDependencyMissing:
    """The ``_require_*`` helpers raise a typed error when a dep is gone."""

    @staticmethod
    def _hide_module(monkeypatch, hidden):
        """Make ``import <hidden>`` raise ImportError for this test.

        Only an import whose requested name equals *hidden* exactly is
        blocked; everything else is forwarded to the real importer.
        """
        import builtins

        genuine_import = builtins.__import__

        def guarded_import(name, *args, **kwargs):
            if name != hidden:
                return genuine_import(name, *args, **kwargs)
            raise ImportError("simulated absent dep")

        monkeypatch.setattr(builtins, "__import__", guarded_import)

    def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
        """Missing ``pdfplumber`` → ``PdfDependencyMissing`` with a hint."""
        # Import the module under test BEFORE imports get blocked.
        from src import pdf_extract

        self._hide_module(monkeypatch, "pdfplumber")
        with pytest.raises(pdf_extract.PdfDependencyMissing) as caught:
            pdf_extract._require_pdfplumber()

        error = caught.value
        assert "pdfplumber" in str(error)
        assert error.hint

    def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
        """Missing ``pypdfium2`` → ``PdfDependencyMissing``."""
        from src import pdf_extract

        self._hide_module(monkeypatch, "pypdfium2")
        with pytest.raises(pdf_extract.PdfDependencyMissing):
            pdf_extract._require_pdfium()
# ---------------------------------------------------------------------------
# Requirements pin consistency
# ---------------------------------------------------------------------------
class TestPinnedVersionsMatchInstalled:
    """If someone bumps the pin in ``requirements.txt`` without
    actually reinstalling, this test points it out before CI does."""

    @staticmethod
    def _canonical(name: str) -> str:
        """Normalise a distribution name (PEP 503 style).

        Lower-cases the name and collapses runs of ``-``, ``_`` and
        ``.`` into a single ``-`` so that ``Foo_Bar`` and ``foo-bar``
        compare equal.
        """
        import re

        return re.sub(r"[-_.]+", "-", name.strip()).lower()

    @classmethod
    def _pins_from_text(cls, text: str) -> dict[str, str]:
        """Extract ``name == version`` pins from requirements text.

        Handles what a naive ``partition("==")`` gets wrong:

        * whole-line and inline (``<space>#``) comments,
        * option lines such as ``-r other.txt`` or ``--hash=...``,
        * environment markers after ``;``,
        * extras (``pkg[extra]==1.0``),
        * trailing tokens after the version (``\\`` continuations,
          same-line ``--hash``).

        Returns:
            Mapping of canonical distribution name → pinned version.
            Requirements that are not exact (``==``) pins are omitted.
        """
        import re

        pins: dict[str, str] = {}
        for raw_line in text.splitlines():
            line = raw_line.strip()
            # Skip blanks, comment lines and pip option lines.
            if not line or line.startswith(("#", "-")):
                continue
            # pip treats whitespace followed by '#' as a comment start.
            line = re.split(r"\s+#", line, maxsplit=1)[0]
            # Drop any environment marker.
            requirement = line.partition(";")[0]
            name, separator, version = requirement.partition("==")
            if not separator:
                continue
            # Strip extras from the name: "pkg[extra]" → "pkg".
            name = name.partition("[")[0]
            # Keep only the version token itself.
            version_tokens = version.split()
            if not name.strip() or not version_tokens:
                continue
            pins[cls._canonical(name)] = version_tokens[0]
        return pins

    def _parse_pins(self) -> dict[str, str]:
        """Read the ``requirements.txt`` one directory above this file.

        Returns:
            Mapping of canonical distribution name → pinned version.
        """
        from pathlib import Path

        text = (
            Path(__file__).resolve().parent.parent / "requirements.txt"
        ).read_text(encoding="utf-8")
        return self._pins_from_text(text)

    @pytest.mark.parametrize("dist_name", [
        "pdfplumber",
        "pypdfium2",
        "pytesseract",
    ])
    def test_pin_matches_installed(self, dist_name):
        """The installed version equals the exact pin, when one exists."""
        import importlib.metadata as md

        pins = self._parse_pins()
        # Look up by canonical name so case / separator differences
        # in requirements.txt don't cause a spurious skip.
        key = self._canonical(dist_name)
        if key not in pins:
            pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
        installed = md.version(dist_name)
        assert installed == pins[key], (
            f"installed {dist_name}=={installed} but requirements.txt "
            f"pins {pins[key]} — bump the pin, or reinstall."
        )
# ---------------------------------------------------------------------------
# OCR availability
# ---------------------------------------------------------------------------
class TestOcrAvailability:
    """Shape of ``ocr_available`` and the OCR-off extraction path."""

    def test_returns_a_tuple(self):
        """``ocr_available()`` returns a ``(bool, str)`` pair."""
        from src.pdf_extract import ocr_available

        outcome = ocr_available()
        assert isinstance(outcome, tuple) and len(outcome) == 2
        available, explanation = outcome
        assert isinstance(available, bool)
        assert isinstance(explanation, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        """With OCR off, the text PDF yields one page and no OCR warning.

        NOTE(review): presumably no "OCR is disabled" warning appears
        because the fixture has extractable text and never needs OCR —
        confirm against ``extract_pages_auto``.
        """
        from src.pdf_extract import extract_pages_auto

        document = _build_tiny_statement_pdf()
        pages, warnings = extract_pages_auto(document, allow_ocr=False)
        assert len(pages) == 1
        assert all("OCR is disabled" not in note for note in warnings)
class TestTesseractDiscovery:
    """Locating the Tesseract binary: autodetect and env override."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """Autodetection yields ``None`` when the platform is Linux."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """On Windows, the Program Files install path is discovered."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self, *args, **kwargs):
            # Accept and ignore extra arguments: Python 3.12 added a
            # ``follow_symlinks`` keyword to ``Path.exists``.
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
        """On Windows with no candidate path present, result is ``None``."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr(
            "pathlib.Path.exists", lambda self, *args, **kwargs: False,
        )
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` ends up as pytesseract's command."""
        import pytesseract

        from src import pdf_extract

        # ``ocr_available()`` writes to this module-level setting.
        # Registering the current value with monkeypatch guarantees it
        # is restored at teardown, so the fake path cannot leak into
        # later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )

        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin