feat(pdf): extract statement header (account + period) + date format

Three related additions for the accountant workflow: **1. Statement header extraction.** New ``extract_statement_metadata(pages)`` pulls the account number and statement period out of the first page (falls back to page 1+2 if either is missing on page 1 — Wells Fargo business accounts put header info on page 2). Detected fields are stamped onto EVERY transaction row so a multi-statement CSV is self-attributing per row:: { "date": "20250113", "description": "Coffee Shop", "amount_1": -4.50, "account_number": "****5678", "statement_period_start": "20250101", "statement_period_end": "20250131", ... } Account-number regex is tolerant of masks (``****1234``), hyphens (``1234-5678-9012``), and spaces. Period regex looks for "Statement Period" / "From" / "Period Covered" labels plus the first 1-2 full-year dates that follow. If only one date is present near the label, it's used for both start and end (some statements show only the closing date). **2. Year inference for short dates.** When the row date is a short ``01/13`` or ``Jan 13`` without a year, the scanner now binds the year from the statement period's end date BEFORE formatting. Doesn't handle the December-in-January-statement cross-year case (rare; user can edit in the table). **3. Configurable output date format.** New ``output_date_format`` parameter on ``scan_pdf_for_transactions`` defaults to ``%Y%m%d``. Applied to: the transaction date column AND the statement period start/end fields. The page surfaces a dropdown in Scan options with common presets (YYYYMMDD, YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a Custom option that accepts a raw strftime string. New helper: ``format_date(iso_str, fmt)`` converts ISO ``YYYY-MM-DD`` to any strftime format; passes invalid input through unchanged so the user can see what was actually there rather than getting silent empties. 
20 new tests cover: format_date, account-number extraction (masked / hyphenated / spaced / no-label / short), period extraction (standard / from-to / single-date / no-label), metadata orchestrator (full header / no pages / page-2 fallback), year inference (US / dash / month-name / no-period / unparseable), plus an end-to-end class that builds a header'd PDF with short-date transactions and confirms metadata attribution + year inference + format round-trip. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:20:46 +00:00
parent 3cf935c999
commit 155dd30746
4 changed files with 499 additions and 13 deletions
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -15,9 +15,14 @@ from __future__ import annotations
 from src.pdf_extract import (
    Page,
    WordBox,
+    _extract_account_number,
+    _extract_statement_period,
    _find_amount_tokens,
    _find_dates_in_words,
+    _infer_year_for_short_date,
    cluster_rows,
+    extract_statement_metadata,
+    format_date,
    parse_amount,
    parse_date,
 )
@@ -207,3 +212,134 @@ class TestFindAmountTokens:
 # test module — they need ``scan_pdf_for_transactions`` which in
 # turn uses ``extract_pages_auto``. The unit-test layer here pins
 # the building blocks; smoke tests pin the wiring.
+
+
+class TestFormatDate:
+    def test_yyyymmdd(self):
+        assert format_date("2026-01-13", "%Y%m%d") == "20260113"
+
+    def test_iso_passthrough(self):
+        assert format_date("2026-01-13", "%Y-%m-%d") == "2026-01-13"
+
+    def test_us(self):
+        assert format_date("2026-01-13", "%m/%d/%Y") == "01/13/2026"
+
+    def test_invalid_input_passes_through(self):
+        # Non-ISO input — return as-is so the user sees what was
+        # actually there rather than a silent empty string.
+        assert format_date("01/13", "%Y%m%d") == "01/13"
+
+    def test_none_or_empty(self):
+        assert format_date(None) == ""
+        assert format_date("") == ""
+
+
+class TestExtractAccountNumber:
+    def test_masked(self):
+        text = "Customer Name\nAccount Number: ****1234\nBalance"
+        assert _extract_account_number(text) == "****1234"
+
+    def test_with_hyphens(self):
+        text = "Account #: 1234-5678-9012"
+        assert _extract_account_number(text) == "1234-5678-9012"
+
+    def test_with_spaces(self):
+        text = "Account: 1234 5678 9012"
+        assert _extract_account_number(text) == "1234 5678 9012"
+
+    def test_no_label_no_match(self):
+        text = "Just some text with 1234567890 in it"
+        assert _extract_account_number(text) is None
+
+    def test_requires_at_least_four_digits(self):
+        # An "account" label followed by only XX shouldn't count.
+        text = "Account: XX"
+        assert _extract_account_number(text) is None
+
+
+class TestExtractStatementPeriod:
+    def test_standard_period(self):
+        text = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-01"
+        assert end == "2025-01-31"
+
+    def test_from_to(self):
+        text = "From 01/01/2025 to 01/31/2025"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-01"
+        assert end == "2025-01-31"
+
+    def test_single_date_both_fields(self):
+        # When only one date appears near the label, return it for both.
+        text = "Statement Date: 01/31/2025"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-31"
+        assert end == "2025-01-31"
+
+    def test_no_label_no_match(self):
+        text = "Some random text with 01/01/2025 in it"
+        start, end = _extract_statement_period(text)
+        # No "Period" / "From" / "Statement Date" label
+        assert (start, end) == (None, None)
+
+
+class TestExtractStatementMetadata:
+    def test_full_header(self):
+        pages = [Page(
+            page_no=1, width=600, height=800,
+            text=(
+                "ACME BANK\n"
+                "Customer: John Doe\n"
+                "Account Number: ****5678\n"
+                "Statement Period: 01/01/2025 - 01/31/2025\n"
+                "Beginning balance: $1,000.00\n"
+            ),
+            words=[],
+        )]
+        meta = extract_statement_metadata(pages)
+        assert meta["account_number"] == "****5678"
+        assert meta["period_start"] == "2025-01-01"
+        assert meta["period_end"] == "2025-01-31"
+
+    def test_no_pages(self):
+        meta = extract_statement_metadata([])
+        assert meta == {
+            "account_number": None,
+            "period_start": None,
+            "period_end": None,
+        }
+
+    def test_fallback_to_page_two(self):
+        # Page 1 has only account; period is on page 2.
+        p1 = Page(
+            page_no=1, width=600, height=800,
+            text="Account Number: ****1234\nBalance summary",
+            words=[],
+        )
+        p2 = Page(
+            page_no=2, width=600, height=800,
+            text="Statement Period: 02/01/2025 - 02/28/2025",
+            words=[],
+        )
+        meta = extract_statement_metadata([p1, p2])
+        assert meta["account_number"] == "****1234"
+        assert meta["period_start"] == "2025-02-01"
+        assert meta["period_end"] == "2025-02-28"
+
+
+class TestInferYearForShortDate:
+    def test_us_short_with_period_end(self):
+        assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13"
+
+    def test_short_dash(self):
+        assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13"
+
+    def test_month_name(self):
+        assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13"
+
+    def test_no_period_end(self):
+        assert _infer_year_for_short_date("01/13", None) is None
+
+    def test_unparseable(self):
+        assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -15,6 +15,45 @@ from __future__ import annotations
 import pytest


+def _build_statement_pdf_with_header() -> bytes:
+    """Statement with realistic header (account + period) plus
+    transactions. Exercises the metadata-extraction path end-to-end."""
+    from fpdf import FPDF
+
+    pdf = FPDF(orientation="P", unit="pt", format="letter")
+    pdf.add_page()
+    pdf.set_font("Helvetica", size=12)
+    pdf.set_xy(40, 50)
+    pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
+    pdf.set_xy(40, 70)
+    pdf.cell(0, 14, "Account Number: ****5678", new_x="LMARGIN", new_y="NEXT")
+    pdf.set_xy(40, 85)
+    pdf.cell(0, 14, "Statement Period: 01/01/2025 - 01/31/2025",
+             new_x="LMARGIN", new_y="NEXT")
+    # Header row
+    pdf.set_xy(40, 130)
+    pdf.cell(120, 14, "Date")
+    pdf.set_xy(160, 130)
+    pdf.cell(200, 14, "Description")
+    pdf.set_xy(360, 130)
+    pdf.cell(80, 14, "Amount")
+    # Transactions with SHORT dates — year is implied by period.
+    rows = [
+        ("01/13", "Coffee Shop",     "(4.50)"),
+        ("01/16", "Refund Vendor",   "$12.00"),
+    ]
+    y = 160
+    for date, desc, amt in rows:
+        pdf.set_xy(40, y)
+        pdf.cell(120, 14, date)
+        pdf.set_xy(160, y)
+        pdf.cell(200, 14, desc)
+        pdf.set_xy(360, y)
+        pdf.cell(80, 14, amt)
+        y += 20
+    return bytes(pdf.output())
+
+
 def _build_tiny_statement_pdf() -> bytes:
    """One-page PDF: header line + three transaction rows + a
    closing-balance footer. The scanner should pick up exactly the
@@ -97,13 +136,34 @@ class TestScanPdfForTransactions:
            f"{[r.get('raw') for r in rows]}"
        )

-    def test_parses_dates_to_iso(self, pdf_bytes):
+    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # Default output format is %Y%m%d
+        assert [r["date"] for r in rows] == [
+            "20260115", "20260116", "20260117",
+        ]
+
+    def test_output_date_format_override(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(
+            pdf_bytes, output_date_format="%Y-%m-%d",
+        )
        assert [r["date"] for r in rows] == [
            "2026-01-15", "2026-01-16", "2026-01-17",
        ]

+    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # The fixture PDF has no statement-period or account
+        # header, so the values are expected to be empty; this
+        # test pins only the contract: ALWAYS present on every row.
+        for r in rows:
+            assert "account_number" in r
+            assert "statement_period_start" in r
+            assert "statement_period_end" in r
+
    def test_parses_amounts_with_signs(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
@@ -144,6 +204,42 @@ class TestScanPdfForTransactions:
 # ---------------------------------------------------------------------------


+class TestStatementHeaderEndToEnd:
+    """A real PDF with a real header — exercise the full pipeline:
+    metadata extraction + year inference for short dates + format
+    application. This is the path most likely to break on the
+    user's actual Chase statements."""
+
+    @pytest.fixture
+    def pdf_bytes(self) -> bytes:
+        return _build_statement_pdf_with_header()
+
+    def test_metadata_extracted_and_stamped(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        assert rows, "expected at least one transaction"
+        for r in rows:
+            assert r["account_number"] == "****5678"
+            assert r["statement_period_start"] == "20250101"
+            assert r["statement_period_end"] == "20250131"
+
+    def test_short_dates_get_year_from_period(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # Short ``01/13`` + period ending in 2025 → 20250113
+        assert rows[0]["date"] == "20250113"
+        assert rows[1]["date"] == "20250116"
+
+    def test_iso_format_round_trip(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(
+            pdf_bytes, output_date_format="%Y-%m-%d",
+        )
+        assert rows[0]["date"] == "2025-01-13"
+        assert rows[0]["statement_period_start"] == "2025-01-01"
+        assert rows[0]["statement_period_end"] == "2025-01-31"
+
+
 class TestMultiDateRow:
    """Some statements (Chase, BofA) show both a transaction date
    and a posting date per row. The scanner uses the first date