feat(pdf): extract statement header (account + period) + date format
Three related additions for the accountant workflow:
**1. Statement header extraction.** New
``extract_statement_metadata(pages)`` pulls the account number
and statement period out of the first page (falls back to
page 1+2 if either is missing on page 1 — Wells Fargo business
accounts put header info on page 2). Detected fields are
stamped onto EVERY transaction row so a multi-statement CSV is
self-attributing per row::
{
"date": "20250113",
"description": "Coffee Shop",
"amount_1": -4.50,
"account_number": "****5678",
"statement_period_start": "20250101",
"statement_period_end": "20250131",
...
}
Account-number regex is tolerant of masks (``****1234``),
hyphens (``1234-5678-9012``), and spaces. Period regex looks
for "Statement Period" / "From" / "Period Covered" labels plus
the first 1-2 full-year dates that follow. If only one date is
present near the label, it's used for both start and end (some
statements show only the closing date).
**2. Year inference for short dates.** When the row date is a
short ``01/13`` or ``Jan 13`` without a year, the scanner now
binds the year from the statement period's end date BEFORE
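formatting. Known limitation: on a statement that spans a year
boundary (December rows on a statement ending in January), every
row gets the end date's year, so the December rows come out one
year too late (rare; user can edit in the table).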
**3. Configurable output date format.** New
``output_date_format`` parameter on ``scan_pdf_for_transactions``
defaults to ``%Y%m%d``. Applied to: the transaction date column
AND the statement period start/end fields. The page surfaces a
dropdown in Scan options with common presets (YYYYMMDD,
YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a
Custom option that accepts a raw strftime string.
New helper: ``format_date(iso_str, fmt)`` converts ISO
``YYYY-MM-DD`` to any strftime; passes invalid input through
unchanged so the user can see what was actually there rather
than getting silent empties.
20 new tests cover: format_date, account-number extraction
(masked / hyphenated / spaced / no-label / short), period
extraction (standard / from-to / single-date / no-label),
metadata orchestrator (full header / no pages / page-2
fallback), year inference (US / dash / month-name / no-period /
unparseable), plus an end-to-end class that builds a header'd
PDF with short-date transactions and confirms metadata
attribution + year inference + format round-trip.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,45 @@ from __future__ import annotations
|
||||
import pytest
|
||||
|
||||
|
||||
def _build_statement_pdf_with_header() -> bytes:
    """Render a one-page statement PDF with a header and short-date rows.

    The page carries a bank banner, an ``Account Number`` line, a
    ``Statement Period`` line, a three-column table header and two
    transactions whose dates have no year. Returns the PDF as bytes.
    """
    from fpdf import FPDF

    doc = FPDF(orientation="P", unit="pt", format="letter")
    doc.add_page()
    doc.set_font("Helvetica", size=12)

    # Banner lines as (y, text); each starts at x=40 and wraps to the
    # left margin on the next line after being written.
    banner = (
        (50, "ACME BANK STATEMENT"),
        (70, "Account Number: ****5678"),
        (85, "Statement Period: 01/01/2025 - 01/31/2025"),
    )
    for top, text in banner:
        doc.set_xy(40, top)
        doc.cell(0, 14, text, new_x="LMARGIN", new_y="NEXT")

    # Column geometry as (x, width), shared by the header row and
    # every transaction row.
    columns = ((40, 120), (160, 200), (360, 80))

    def _write_row(top, values):
        # Place one cell per column at the given vertical offset.
        for (left, width), value in zip(columns, values):
            doc.set_xy(left, top)
            doc.cell(width, 14, value)

    # Header row
    _write_row(130, ("Date", "Description", "Amount"))

    # Transactions with SHORT dates — year is implied by period.
    transactions = [
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    ]
    # Rows start at y=160 and are spaced 20pt apart.
    for index, txn in enumerate(transactions):
        _write_row(160 + 20 * index, txn)

    return bytes(doc.output())
|
||||
|
||||
|
||||
def _build_tiny_statement_pdf() -> bytes:
|
||||
"""One-page PDF: header line + three transaction rows + a
|
||||
closing-balance footer. The scanner should pick up exactly the
|
||||
@@ -97,13 +136,34 @@ class TestScanPdfForTransactions:
|
||||
f"{[r.get('raw') for r in rows]}"
|
||||
)
|
||||
|
||||
def test_parses_dates_to_iso(self, pdf_bytes):
|
||||
def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
    """Without an explicit format, row dates are rendered as ``%Y%m%d``."""
    from src.pdf_extract import scan_pdf_for_transactions

    rows, _ = scan_pdf_for_transactions(pdf_bytes)
    # Default output format is %Y%m%d
    expected = ["20260115", "20260116", "20260117"]
    observed = [row["date"] for row in rows]
    assert observed == expected
|
||||
|
||||
def test_output_date_format_override(self, pdf_bytes):
    """An explicit ``output_date_format`` is applied to every row date."""
    from src.pdf_extract import scan_pdf_for_transactions

    rows, _ = scan_pdf_for_transactions(
        pdf_bytes,
        output_date_format="%Y-%m-%d",
    )
    expected = ["2026-01-15", "2026-01-16", "2026-01-17"]
    observed = [row["date"] for row in rows]
    assert observed == expected
|
||||
|
||||
def test_metadata_fields_present_on_every_row(self, pdf_bytes):
    """Every scanned row carries the three statement-metadata keys."""
    from src.pdf_extract import scan_pdf_for_transactions

    rows, _ = scan_pdf_for_transactions(pdf_bytes)
    # The fixture PDF has no statement-period or account header.
    # The contract checked here is only that the keys are ALWAYS
    # present on every row; their values are not asserted.
    required_keys = (
        "account_number",
        "statement_period_start",
        "statement_period_end",
    )
    for row in rows:
        for key in required_keys:
            assert key in row
|
||||
|
||||
def test_parses_amounts_with_signs(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
@@ -144,6 +204,42 @@ class TestScanPdfForTransactions:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestStatementHeaderEndToEnd:
    """End-to-end checks against a PDF that has a statement header.

    Covers the whole pipeline in one pass: header metadata extraction,
    year inference for short row dates, and output date formatting.
    """

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        """Bytes of the generated statement PDF with a header."""
        return _build_statement_pdf_with_header()

    def test_metadata_extracted_and_stamped(self, pdf_bytes):
        """Account number and period are copied onto every row."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        assert rows, "expected at least one transaction"
        # Checked in this order on each row.
        expected = (
            ("account_number", "****5678"),
            ("statement_period_start", "20250101"),
            ("statement_period_end", "20250131"),
        )
        for row in rows:
            for key, want in expected:
                assert row[key] == want

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        """Year-less row dates take their year from the statement period."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(pdf_bytes)
        # Short ``01/13`` + period ending in 2025 → 20250113
        for index, want in enumerate(("20250113", "20250116")):
            assert rows[index]["date"] == want

    def test_iso_format_round_trip(self, pdf_bytes):
        """A custom format applies to the row date and both period fields."""
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _ = scan_pdf_for_transactions(
            pdf_bytes,
            output_date_format="%Y-%m-%d",
        )
        head = rows[0]
        assert head["date"] == "2025-01-13"
        assert head["statement_period_start"] == "2025-01-01"
        assert head["statement_period_end"] == "2025-01-31"
|
||||
|
||||
|
||||
class TestMultiDateRow:
|
||||
"""Some statements (Chase, BofA) show both a transaction date
|
||||
and a posting date per row. The scanner uses the first date
|
||||
|
||||
Reference in New Issue
Block a user