feat(pdf): extract statement header (account + period) + date format
Three related additions for the accountant workflow:
**1. Statement header extraction.** New
``extract_statement_metadata(pages)`` pulls the account number
and statement period out of the first page (falls back to
page 1+2 if either is missing on page 1 — Wells Fargo business
accounts put header info on page 2). Detected fields are
stamped onto EVERY transaction row so a multi-statement CSV is
self-attributing per row::
{
"date": "20250113",
"description": "Coffee Shop",
"amount_1": -4.50,
"account_number": "****5678",
"statement_period_start": "20250101",
"statement_period_end": "20250131",
...
}
Account-number regex is tolerant of masks (``****1234``),
hyphens (``1234-5678-9012``), and spaces. Period regex looks
for "Statement Period" / "From" / "Period Covered" labels plus
the first 1-2 full-year dates that follow. If only one date is
present near the label, it's used for both start and end (some
statements show only the closing date).
**2. Year inference for short dates.** When the row date is a
short ``01/13`` or ``Jan 13`` without a year, the scanner now
binds the year from the statement period's end date BEFORE
formatting. Doesn't handle the December-in-January-statement
cross-year case (rare; user can edit in the table).
**3. Configurable output date format.** New
``output_date_format`` parameter on ``scan_pdf_for_transactions``
defaults to ``%Y%m%d``. Applied to: the transaction date column
AND the statement period start/end fields. The page surfaces a
dropdown in Scan options with common presets (YYYYMMDD,
YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a
Custom option that accepts a raw strftime string.
New helper: ``format_date(iso_str, fmt)`` converts ISO
``YYYY-MM-DD`` to any strftime; passes invalid input through
unchanged so the user can see what was actually there rather
than getting silent empties.
20 new tests cover: format_date, account-number extraction
(masked / hyphenated / spaced / no-label / short), period
extraction (standard / from-to / single-date / no-label),
metadata orchestrator (full header / no pages / page-2
fallback), year inference (US / dash / month-name / no-period /
unparseable), plus an end-to-end class that builds a header'd
PDF with short-date transactions and confirms metadata
attribution + year inference + format round-trip.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,9 +15,14 @@ from __future__ import annotations
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
_extract_account_number,
|
||||
_extract_statement_period,
|
||||
_find_amount_tokens,
|
||||
_find_dates_in_words,
|
||||
_infer_year_for_short_date,
|
||||
cluster_rows,
|
||||
extract_statement_metadata,
|
||||
format_date,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
)
|
||||
@@ -207,3 +212,134 @@ class TestFindAmountTokens:
|
||||
# test module — they need ``scan_pdf_for_transactions`` which in
|
||||
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
||||
# the building blocks; smoke tests pin the wiring.
|
||||
|
||||
|
||||
class TestFormatDate:
    """Expectations for ``format_date`` when given ISO dates and bad input."""

    def test_yyyymmdd(self):
        # Compact layout without separators.
        rendered = format_date("2026-01-13", "%Y%m%d")
        assert rendered == "20260113"

    def test_iso_passthrough(self):
        # Asking for the ISO layout should hand back the same text.
        rendered = format_date("2026-01-13", "%Y-%m-%d")
        assert rendered == "2026-01-13"

    def test_us(self):
        # Month/day/year layout with slashes.
        rendered = format_date("2026-01-13", "%m/%d/%Y")
        assert rendered == "01/13/2026"

    def test_invalid_input_passes_through(self):
        # Input that is not an ISO date is expected back untouched, so
        # the user sees the raw value instead of a silent empty string.
        raw = "01/13"
        assert format_date(raw, "%Y%m%d") == "01/13"

    def test_none_or_empty(self):
        # Both "missing" spellings, called without an explicit format,
        # are expected to yield an empty string.
        for missing in (None, ""):
            assert format_date(missing) == ""
|
||||
|
||||
|
||||
class TestExtractAccountNumber:
    """Expectations for ``_extract_account_number`` on header-like text."""

    def test_masked(self):
        # Masked number following an "Account Number:" label.
        header = "Customer Name\nAccount Number: ****1234\nBalance"
        found = _extract_account_number(header)
        assert found == "****1234"

    def test_with_hyphens(self):
        # Hyphen-grouped digits are expected back with hyphens intact.
        header = "Account #: 1234-5678-9012"
        found = _extract_account_number(header)
        assert found == "1234-5678-9012"

    def test_with_spaces(self):
        # Space-grouped digits are expected back with spaces intact.
        header = "Account: 1234 5678 9012"
        found = _extract_account_number(header)
        assert found == "1234 5678 9012"

    def test_no_label_no_match(self):
        # A bare digit run with no account label must not be picked up.
        body = "Just some text with 1234567890 in it"
        found = _extract_account_number(body)
        assert found is None

    def test_requires_at_least_four_digits(self):
        # A label followed by nothing but "XX" is expected to give no match.
        header = "Account: XX"
        found = _extract_account_number(header)
        assert found is None
|
||||
|
||||
|
||||
class TestExtractStatementPeriod:
    """Expectations for ``_extract_statement_period`` (start, end) pairs."""

    def test_standard_period(self):
        # "Statement Period:" label followed by a dash-separated range.
        header = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
        period_start, period_end = _extract_statement_period(header)
        assert (period_start, period_end) == ("2025-01-01", "2025-01-31")

    def test_from_to(self):
        # "From ... to ..." phrasing of the same range.
        header = "From 01/01/2025 to 01/31/2025"
        period_start, period_end = _extract_statement_period(header)
        assert (period_start, period_end) == ("2025-01-01", "2025-01-31")

    def test_single_date_both_fields(self):
        # A lone date near the label is expected in both positions.
        header = "Statement Date: 01/31/2025"
        period_start, period_end = _extract_statement_period(header)
        assert (period_start, period_end) == ("2025-01-31", "2025-01-31")

    def test_no_label_no_match(self):
        # A date with no "Period" / "From" / "Statement Date" label
        # nearby must not be treated as a statement period.
        body = "Some random text with 01/01/2025 in it"
        period_start, period_end = _extract_statement_period(body)
        assert (period_start, period_end) == (None, None)
|
||||
|
||||
|
||||
class TestExtractStatementMetadata:
    """Expectations for ``extract_statement_metadata`` over ``Page`` lists."""

    def test_full_header(self):
        # Account number and period both sit on the first (only) page.
        header_text = (
            "ACME BANK\n"
            "Customer: John Doe\n"
            "Account Number: ****5678\n"
            "Statement Period: 01/01/2025 - 01/31/2025\n"
            "Beginning balance: $1,000.00\n"
        )
        first_page = Page(
            page_no=1, width=600, height=800,
            text=header_text,
            words=[],
        )
        meta = extract_statement_metadata([first_page])
        assert meta["account_number"] == "****5678"
        assert meta["period_start"] == "2025-01-01"
        assert meta["period_end"] == "2025-01-31"

    def test_no_pages(self):
        # With no pages, every field is expected to be None.
        expected = {
            "account_number": None,
            "period_start": None,
            "period_end": None,
        }
        assert extract_statement_metadata([]) == expected

    def test_fallback_to_page_two(self):
        # Page 1 carries only the account number; the period is on page 2.
        account_page = Page(
            page_no=1, width=600, height=800,
            text="Account Number: ****1234\nBalance summary",
            words=[],
        )
        period_page = Page(
            page_no=2, width=600, height=800,
            text="Statement Period: 02/01/2025 - 02/28/2025",
            words=[],
        )
        meta = extract_statement_metadata([account_page, period_page])
        assert meta["account_number"] == "****1234"
        assert meta["period_start"] == "2025-02-01"
        assert meta["period_end"] == "2025-02-28"
|
||||
|
||||
|
||||
class TestInferYearForShortDate:
    """Expectations for ``_infer_year_for_short_date`` given a period end."""

    def test_us_short_with_period_end(self):
        # Slash-separated month/day takes its year from the period end.
        inferred = _infer_year_for_short_date("01/13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_short_dash(self):
        # Dash-separated month/day behaves the same way.
        inferred = _infer_year_for_short_date("01-13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_month_name(self):
        # Abbreviated month name followed by the day.
        inferred = _infer_year_for_short_date("Jan 13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_no_period_end(self):
        # With no period end to borrow a year from, expect None.
        inferred = _infer_year_for_short_date("01/13", None)
        assert inferred is None

    def test_unparseable(self):
        # Text that is not a recognisable short date yields None.
        inferred = _infer_year_for_short_date("xx/yy", "2025-01-31")
        assert inferred is None
|
||||
|
||||
Reference in New Issue
Block a user