From 155dd30746d4e374c7d3f7a955b15d53cf6062e0 Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Wed, 20 May 2026 00:20:46 +0000
Subject: [PATCH] feat(pdf): extract statement header (account + period) + date
 format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three related additions for the accountant workflow:

**1. Statement header extraction.** New
``extract_statement_metadata(pages)`` pulls the account number
and statement period out of the first page (falls back to
page 1+2 if either is missing on page 1 — Wells Fargo business
accounts put header info on page 2). Detected fields are
stamped onto EVERY transaction row so a multi-statement CSV is
self-attributing per row::

    {
      "date": "20250113",
      "description": "Coffee Shop",
      "amount_1": -4.50,
      "account_number": "****5678",
      "statement_period_start": "20250101",
      "statement_period_end": "20250131",
      ...
    }

Account-number regex is tolerant of masks (``****1234``),
hyphens (``1234-5678-9012``), and spaces. Period regex looks
for "Statement Period" / "From" / "Period Covered" labels plus
the first 1-2 full-year dates that follow. If only one date is
present near the label, it's used for both start and end (some
statements show only the closing date).

**2. Year inference for short dates.** When the row date is a
short ``01/13`` or ``Jan 13`` without a year, the scanner now
binds the year from the statement period's end date BEFORE
formatting. Doesn't handle the December-in-January-statement
cross-year case (rare; user can edit in the table).

**3. Configurable output date format.** New
``output_date_format`` parameter on ``scan_pdf_for_transactions``
defaults to ``%Y%m%d``. Applied to: the transaction date column
AND the statement period start/end fields. The page surfaces a
dropdown in Scan options with common presets (YYYYMMDD,
YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a
Custom option that accepts a raw strftime string.

New helper: ``format_date(iso_str, fmt)`` converts ISO
``YYYY-MM-DD`` to any strftime; passes invalid input through
unchanged so the user can see what was actually there rather
than getting silent empties.

27 new tests cover: format_date, account-number extraction
(masked / hyphenated / spaced / no-label / short), period
extraction (standard / from-to / single-date / no-label),
metadata orchestrator (full header / no pages / page-2
fallback), year inference (US / dash / month-name / no-period /
unparseable), plus an end-to-end class that builds a header'd
PDF with short-date transactions and confirms metadata
attribution + year inference + format round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/10_PDF_Extractor.py |  51 ++++++-
 src/pdf_extract.py                | 227 ++++++++++++++++++++++++++++--
 tests/test_pdf_extract.py         | 136 ++++++++++++++++++
 tests/test_pdf_extract_smoke.py   |  98 ++++++++++++-
 4 files changed, 499 insertions(+), 13 deletions(-)

diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index 7033a60..ffd3956 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -90,6 +90,15 @@ if not _pdf_ok:
 # Options + upload
 # ---------------------------------------------------------------------------
 
+_DATE_FORMAT_CHOICES = {
+    "YYYYMMDD (20260113)": "%Y%m%d",
+    "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
+    "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
+    "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
+    "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
+    "Custom strftime…": "__custom__",
+}
+
 with st.expander("Scan options", expanded=False):
     c1, c2 = st.columns(2)
     negative_in_parens = c1.checkbox(
@@ -112,6 +121,28 @@ with st.expander("Scan options", expanded=False):
         ),
     )
 
+    c3, c4 = st.columns(2)
+    date_label = c3.selectbox(
+        "Output date format",
+        list(_DATE_FORMAT_CHOICES),
+        help=(
+            "Applied to the transaction date AND the statement "
+            "period dates pulled from the header. Pick Custom to "
+            "enter your own ``strftime`` string."
+        ),
+    )
+    output_date_format = _DATE_FORMAT_CHOICES[date_label]
+    if output_date_format == "__custom__":
+        # A blank custom box would render every date as "" — use default.
+        output_date_format = c4.text_input(
+            "Custom strftime format",
+            value="%Y%m%d",
+            help=(
+                "Python ``strftime`` codes — e.g., ``%Y%m%d`` for "
+                "20260113, ``%Y-%m-%d`` for 2026-01-13."
+            ),
+        ).strip() or "%Y%m%d"
+
 uploads = st.file_uploader(
     "PDF file(s)",
     type=["pdf"],
@@ -148,6 +179,7 @@ if scan_clicked and uploads:
                     raw,
                     negative_in_parens=negative_in_parens,
                     allow_ocr=use_ocr,
+                    output_date_format=output_date_format,
                 )
                 for r in rows:
                     r["source_file"] = up.name
@@ -258,11 +290,24 @@ else:
 
     # Order columns so the user-facing fields are leftmost; raw +
     # internals are last and easy to scroll past or unselect at
-    # download time.
-    front = ["date", "description"]
+    # download time. Statement metadata sits with the transaction
+    # detail since it's per-row context an accountant typically
+    # wants alongside the amounts.
+    front = [
+        "date",
+        "description",
+    ]
     amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
+    metadata_cols = [
+        "account_number",
+        "statement_period_start",
+        "statement_period_end",
+    ]
     tail = ["source_file", "page", "raw"]
-    ordered = [c for c in front + amount_cols + tail if c in df.columns]
+    ordered = [
+        c for c in front + amount_cols + metadata_cols + tail
+        if c in df.columns
+    ]
     extras = [c for c in df.columns if c not in ordered]
     df = df[ordered + extras]
 
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index 5f9ef91..31cac7b 100644
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -520,6 +520,180 @@ def _find_amount_tokens(
     return out
 
 
+def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str:
+    """Convert an ISO ``YYYY-MM-DD`` date string to *fmt*.
+
+    Returns the input unchanged if it's not parseable as ISO,
+    empty string if input is None/empty. The scanner uses this
+    on every date column (transaction date + statement period
+    start/end) so the output CSV is consistent.
+    """
+    if not iso_str:
+        return ""
+    try:
+        return datetime.strptime(iso_str, "%Y-%m-%d").strftime(fmt)
+    except (ValueError, TypeError):
+        return iso_str
+
+
+# ---------------------------------------------------------------------------
+# Statement-level metadata (account number + period)
+# ---------------------------------------------------------------------------
+
+# Account number regexes for the few conventional labels banks
+# use. The capture is a permissive run of digits / X / * / dashes
+# / spaces or tabs (``****1234``, ``1234-5678-9012``) — never
+# newlines, so the next line is not swallowed. ``\b`` stops the
+# short "A/C" label from matching inside words like "ZODIAC".
+_ACCOUNT_RES = [
+    re.compile(
+        r"Account\s*(?:Number|No\.?|#)\s*[:.]?\s*"
+        r"([X\*\d][X\*\d\- \t]{3,30}[X\*\d])",
+        re.IGNORECASE,
+    ),
+    re.compile(
+        r"Account\s*[:.]\s*([X\*\d][X\*\d\- \t]{3,30}[X\*\d])",
+        re.IGNORECASE,
+    ),
+    re.compile(
+        r"\bA/?[Cc]\s*(?:#|No\.?)?\s*[:.]?\s*"
+        r"([X\*\d][X\*\d\- \t]{3,30}[X\*\d])",
+        re.IGNORECASE,
+    ),
+]
+
+
+def _extract_account_number(text: str) -> str | None:
+    """Find the first plausible account number in *text*.
+
+    Plausible = at least 4 digit characters and matched near an
+    'Account' label. Whitespace is collapsed; the literal mask
+    characters (``X``, ``*``) and dashes are preserved so the
+    user sees ``****1234`` rather than ``1234`` (which would lose
+    information).
+    """
+    for rx in _ACCOUNT_RES:
+        for m in rx.finditer(text):
+            value = re.sub(r"\s+", " ", m.group(1).strip())
+            digit_count = sum(1 for c in value if c.isdigit())
+            if digit_count >= 4:
+                return value
+    return None
+
+
+_PERIOD_LABEL_RE = re.compile(
+    r"(?:Statement\s*(?:Period|Date)|"
+    r"For\s+the\s+(?:period|statement\s+period)|"
+    r"Period\s+(?:Covered|Beginning|of\s+Statement)|"
+    r"From)",
+    re.IGNORECASE,
+)
+
+
+def _extract_statement_period(
+    text: str,
+) -> tuple[str | None, str | None]:
+    """Locate the statement period dates and return them as ISO
+    ``(start, end)`` or ``(None, None)``.
+
+    Strategy: find every "Statement Period" / "From" / etc. label,
+    then look for full-year dates in the ~150 chars following the
+    label. The first two dates become start/end. If only one date
+    appears, both fields get the same value (single-statement-date
+    case — common on monthly cycles where only the closing date
+    is shown).
+    """
+    for label_m in _PERIOD_LABEL_RE.finditer(text):
+        snippet = text[label_m.end() : label_m.end() + 150]
+        dates: list[tuple[int, str]] = []
+        for rx in _DATE_RES_FULL:
+            for m in rx.finditer(snippet):
+                iso = parse_date(m.group(1))
+                if iso:
+                    dates.append((m.start(), iso))
+        if dates:
+            dates.sort(key=lambda x: x[0])
+            if len(dates) >= 2:
+                return dates[0][1], dates[1][1]
+            return dates[0][1], dates[0][1]
+    return None, None
+
+
+def extract_statement_metadata(
+    pages: list[Page],
+) -> dict[str, str | None]:
+    """Pull account number + statement period out of the header
+    region of *pages*.
+
+    Searches page 1's text, falling back to page 1 + 2 combined
+    if page 1's account/period detection comes up empty (some
+    statements put header info on page 2 — Wells Fargo business
+    accounts do this).
+
+    Returns ``{"account_number", "period_start", "period_end"}``
+    with ``None`` for any field that couldn't be detected. ISO
+    format for the dates.
+    """
+    if not pages:
+        return {
+            "account_number": None,
+            "period_start": None,
+            "period_end": None,
+        }
+
+    text = pages[0].text
+    account = _extract_account_number(text)
+    start, end = _extract_statement_period(text)
+
+    # Fallback to pages 1+2 if anything was missed.
+    if (account is None or start is None) and len(pages) > 1:
+        extended = pages[0].text + "\n" + pages[1].text
+        if account is None:
+            account = _extract_account_number(extended)
+        if start is None:
+            start, end = _extract_statement_period(extended)
+
+    return {
+        "account_number": account,
+        "period_start": start,
+        "period_end": end,
+    }
+
+
+def _infer_year_for_short_date(
+    raw_date: str,
+    period_end_iso: str | None,
+) -> str | None:
+    """Try to bind a short date like ``01/13`` or ``Jan 13`` to
+    the year of the statement period's end. Returns ISO or None
+    if no candidate format parses.
+
+    Doesn't handle the December-in-January-statement cross-year
+    case — too rare to be worth the complexity. The user sees the
+    inferred year in the editor and can correct if needed; the
+    raw text stays in the ``raw`` column for reference.
+    """
+    if not raw_date or not period_end_iso:
+        return None
+    try:
+        end_year = int(period_end_iso[:4])
+    except (ValueError, IndexError):
+        return None
+
+    candidates = [
+        ("%m/%d/%Y", f"{raw_date}/{end_year}"),
+        ("%m-%d-%Y", f"{raw_date}-{end_year}"),
+        ("%b %d %Y", f"{raw_date} {end_year}"),
+        ("%d %b %Y", f"{raw_date} {end_year}"),
+    ]
+    for fmt, candidate in candidates:
+        try:
+            return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
+        except ValueError:
+            continue
+    return None
+
+
 def _description_from_row(
     row_words: list[WordBox],
     date_ranges: list[tuple[int, int]],
@@ -564,6 +738,7 @@ def scan_pdf_for_transactions(
     date_formats: list[str] | None = None,
     y_tolerance: float = 3.0,
     merge_multiline_descriptions: bool = True,
+    output_date_format: str = "%Y%m%d",
 ) -> tuple[list[dict[str, Any]], list[str]]:
     """Scan *pdf_bytes* for transaction-like rows.
 
@@ -571,23 +746,36 @@ def scan_pdf_for_transactions(
     amount pattern. Each returned record looks like::
 
         {
-          "date": "2026-01-15",   # ISO, or raw text if unparsable
+          "date": "20260115",     # output_date_format applied
           "description": "...",
-          "amount_1": 4.50,       # always present
+          "amount_1": 4.50,
           "amount_2": 1000.00,    # if a second amount was found
-          "amount_3": ...,        # if a third was found
           "page": 1,
           "raw": "01/15/2026 Coffee $4.50",
+          "account_number": "****1234",      # from header
+          "statement_period_start": "20260101",
+          "statement_period_end": "20260131",
         }
 
-    Multi-line descriptions (rows with no date and no amount) attach
-    to the most recent transaction row when
+    Header metadata (``account_number`` /
+    ``statement_period_start`` / ``statement_period_end``) is
+    extracted once per PDF and stamped onto every detected row.
+    That way a multi-statement CSV remains attributable per row
+    when it's reshaped or imported elsewhere.
+
+    Short dates without a year (``01/13``, ``Jan 13``) are bound
+    to the year of the statement period's end before formatting.
+    If period detection fails, the raw short text is preserved.
+
+    Multi-line descriptions (rows with no date and no amount)
+    attach to the most recent transaction row when
     ``merge_multiline_descriptions=True`` (default).
 
     Returns ``(rows, warnings)``. Warnings are human-readable
     strings the GUI surfaces in an expander.
     """
     pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
+    metadata = extract_statement_metadata(pages)
 
     out_rows: list[dict[str, Any]] = []
     prev: dict[str, Any] | None = None
@@ -628,11 +816,20 @@ def scan_pdf_for_transactions(
                 row_words, date_ranges, amount_idxs,
             )
 
+            iso = parse_date(first_date_text, date_formats)
+            if iso is None:
+                # Short date — try to bind to the statement period
+                # year before falling back to the raw text.
+                iso = _infer_year_for_short_date(
+                    first_date_text, metadata["period_end"],
+                )
+            formatted_date = (
+                format_date(iso, output_date_format)
+                if iso else first_date_text
+            )
+
             record: dict[str, Any] = {
-                "date": (
-                    parse_date(first_date_text, date_formats)
-                    or first_date_text
-                ),
+                "date": formatted_date,
                 "description": desc,
                 "page": page.page_no,
                 "raw": line,
@@ -658,6 +855,16 @@ def scan_pdf_for_transactions(
             if not _has_real_transaction_amount(record):
                 continue
 
+            # Stamp the header metadata onto every kept row so the
+            # CSV is self-attributing.
+            record["account_number"] = metadata["account_number"] or ""
+            record["statement_period_start"] = format_date(
+                metadata["period_start"], output_date_format,
+            )
+            record["statement_period_end"] = format_date(
+                metadata["period_end"], output_date_format,
+            )
+
             out_rows.append(record)
             prev = record
 
@@ -731,6 +938,8 @@ __all__ = [
     "diagnose_pdf_lines",
     "extract_pages",
     "extract_pages_auto",
+    "extract_statement_metadata",
+    "format_date",
     "ocr_available",
     "parse_amount",
     "parse_date",
diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py
index ebaba7c..517d9a2 100644
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -15,9 +15,14 @@ from __future__ import annotations
 from src.pdf_extract import (
     Page,
     WordBox,
+    _extract_account_number,
+    _extract_statement_period,
     _find_amount_tokens,
     _find_dates_in_words,
+    _infer_year_for_short_date,
     cluster_rows,
+    extract_statement_metadata,
+    format_date,
     parse_amount,
     parse_date,
 )
@@ -207,3 +212,134 @@ class TestFindAmountTokens:
 # test module — they need ``scan_pdf_for_transactions`` which in
 # turn uses ``extract_pages_auto``. The unit-test layer here pins
 # the building blocks; smoke tests pin the wiring.
+
+
+class TestFormatDate:
+    def test_yyyymmdd(self):
+        assert format_date("2026-01-13", "%Y%m%d") == "20260113"
+
+    def test_iso_passthrough(self):
+        assert format_date("2026-01-13", "%Y-%m-%d") == "2026-01-13"
+
+    def test_us(self):
+        assert format_date("2026-01-13", "%m/%d/%Y") == "01/13/2026"
+
+    def test_invalid_input_passes_through(self):
+        # Non-ISO input — return as-is so the user sees what was
+        # actually there rather than a silent empty string.
+        assert format_date("01/13", "%Y%m%d") == "01/13"
+
+    def test_none_or_empty(self):
+        assert format_date(None) == ""
+        assert format_date("") == ""
+
+
+class TestExtractAccountNumber:
+    def test_masked(self):
+        text = "Customer Name\nAccount Number: ****1234\nBalance"
+        assert _extract_account_number(text) == "****1234"
+
+    def test_with_hyphens(self):
+        text = "Account #: 1234-5678-9012"
+        assert _extract_account_number(text) == "1234-5678-9012"
+
+    def test_with_spaces(self):
+        text = "Account: 1234 5678 9012"
+        assert _extract_account_number(text) == "1234 5678 9012"
+
+    def test_no_label_no_match(self):
+        text = "Just some text with 1234567890 in it"
+        assert _extract_account_number(text) is None
+
+    def test_requires_at_least_four_digits(self):
+        # An "account" label followed by only XX shouldn't count.
+        text = "Account: XX"
+        assert _extract_account_number(text) is None
+
+
+class TestExtractStatementPeriod:
+    def test_standard_period(self):
+        text = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-01"
+        assert end == "2025-01-31"
+
+    def test_from_to(self):
+        text = "From 01/01/2025 to 01/31/2025"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-01"
+        assert end == "2025-01-31"
+
+    def test_single_date_both_fields(self):
+        # When only one date appears near the label, return it for both.
+        text = "Statement Date: 01/31/2025"
+        start, end = _extract_statement_period(text)
+        assert start == "2025-01-31"
+        assert end == "2025-01-31"
+
+    def test_no_label_no_match(self):
+        text = "Some random text with 01/01/2025 in it"
+        start, end = _extract_statement_period(text)
+        # No "Period" / "From" / "Statement Date" label
+        assert (start, end) == (None, None)
+
+
+class TestExtractStatementMetadata:
+    def test_full_header(self):
+        pages = [Page(
+            page_no=1, width=600, height=800,
+            text=(
+                "ACME BANK\n"
+                "Customer: John Doe\n"
+                "Account Number: ****5678\n"
+                "Statement Period: 01/01/2025 - 01/31/2025\n"
+                "Beginning balance: $1,000.00\n"
+            ),
+            words=[],
+        )]
+        meta = extract_statement_metadata(pages)
+        assert meta["account_number"] == "****5678"
+        assert meta["period_start"] == "2025-01-01"
+        assert meta["period_end"] == "2025-01-31"
+
+    def test_no_pages(self):
+        meta = extract_statement_metadata([])
+        assert meta == {
+            "account_number": None,
+            "period_start": None,
+            "period_end": None,
+        }
+
+    def test_fallback_to_page_two(self):
+        # Page 1 has only account; period is on page 2.
+        p1 = Page(
+            page_no=1, width=600, height=800,
+            text="Account Number: ****1234\nBalance summary",
+            words=[],
+        )
+        p2 = Page(
+            page_no=2, width=600, height=800,
+            text="Statement Period: 02/01/2025 - 02/28/2025",
+            words=[],
+        )
+        meta = extract_statement_metadata([p1, p2])
+        assert meta["account_number"] == "****1234"
+        assert meta["period_start"] == "2025-02-01"
+        assert meta["period_end"] == "2025-02-28"
+
+
+class TestInferYearForShortDate:
+    def test_us_short_with_period_end(self):
+        assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13"
+
+    def test_short_dash(self):
+        assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13"
+
+    def test_month_name(self):
+        assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13"
+
+    def test_no_period_end(self):
+        assert _infer_year_for_short_date("01/13", None) is None
+
+    def test_unparseable(self):
+        assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None
diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py
index eae937f..79a0d13 100644
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -15,6 +15,45 @@ from __future__ import annotations
 import pytest
 
 
+def _build_statement_pdf_with_header() -> bytes:
+    """Statement with realistic header (account + period) plus
+    transactions. Exercises the metadata-extraction path end-to-end."""
+    from fpdf import FPDF
+
+    pdf = FPDF(orientation="P", unit="pt", format="letter")
+    pdf.add_page()
+    pdf.set_font("Helvetica", size=12)
+    pdf.set_xy(40, 50)
+    pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
+    pdf.set_xy(40, 70)
+    pdf.cell(0, 14, "Account Number: ****5678", new_x="LMARGIN", new_y="NEXT")
+    pdf.set_xy(40, 85)
+    pdf.cell(0, 14, "Statement Period: 01/01/2025 - 01/31/2025",
+             new_x="LMARGIN", new_y="NEXT")
+    # Header row
+    pdf.set_xy(40, 130)
+    pdf.cell(120, 14, "Date")
+    pdf.set_xy(160, 130)
+    pdf.cell(200, 14, "Description")
+    pdf.set_xy(360, 130)
+    pdf.cell(80, 14, "Amount")
+    # Transactions with SHORT dates — year is implied by period.
+    rows = [
+        ("01/13", "Coffee Shop",     "(4.50)"),
+        ("01/16", "Refund Vendor",   "$12.00"),
+    ]
+    y = 160
+    for date, desc, amt in rows:
+        pdf.set_xy(40, y)
+        pdf.cell(120, 14, date)
+        pdf.set_xy(160, y)
+        pdf.cell(200, 14, desc)
+        pdf.set_xy(360, y)
+        pdf.cell(80, 14, amt)
+        y += 20
+    return bytes(pdf.output())
+
+
 def _build_tiny_statement_pdf() -> bytes:
     """One-page PDF: header line + three transaction rows + a
     closing-balance footer. The scanner should pick up exactly the
@@ -97,13 +136,34 @@ class TestScanPdfForTransactions:
             f"{[r.get('raw') for r in rows]}"
         )
 
-    def test_parses_dates_to_iso(self, pdf_bytes):
+    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # Default output format is %Y%m%d
+        assert [r["date"] for r in rows] == [
+            "20260115", "20260116", "20260117",
+        ]
+
+    def test_output_date_format_override(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(
+            pdf_bytes, output_date_format="%Y-%m-%d",
+        )
         assert [r["date"] for r in rows] == [
             "2026-01-15", "2026-01-16", "2026-01-17",
         ]
 
+    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # The fixture PDF has no statement-period or account
+        # header, so the metadata fields exist but are empty
+        # strings — the contract is: ALWAYS present on every row.
+        for r in rows:
+            assert "account_number" in r
+            assert "statement_period_start" in r
+            assert "statement_period_end" in r
+
     def test_parses_amounts_with_signs(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(pdf_bytes)
@@ -144,6 +204,42 @@ class TestScanPdfForTransactions:
 # ---------------------------------------------------------------------------
 
 
+class TestStatementHeaderEndToEnd:
+    """A real PDF with a real header — exercise the full pipeline:
+    metadata extraction + year inference for short dates + format
+    application. This is the failure mode most likely to break on
+    the user's actual Chase statements."""
+
+    @pytest.fixture
+    def pdf_bytes(self) -> bytes:
+        return _build_statement_pdf_with_header()
+
+    def test_metadata_extracted_and_stamped(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        assert rows, "expected at least one transaction"
+        for r in rows:
+            assert r["account_number"] == "****5678"
+            assert r["statement_period_start"] == "20250101"
+            assert r["statement_period_end"] == "20250131"
+
+    def test_short_dates_get_year_from_period(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        # Short ``01/13`` + period ending in 2025 → 20250113
+        assert rows[0]["date"] == "20250113"
+        assert rows[1]["date"] == "20250116"
+
+    def test_iso_format_round_trip(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(
+            pdf_bytes, output_date_format="%Y-%m-%d",
+        )
+        assert rows[0]["date"] == "2025-01-13"
+        assert rows[0]["statement_period_start"] == "2025-01-01"
+        assert rows[0]["statement_period_end"] == "2025-01-31"
+
+
 class TestMultiDateRow:
     """Some statements (Chase, BofA) show both a transaction date
     and a posting date per row. The scanner uses the first date