From 34b56b404a66844fa1bc8fe0a9e03505bd91b220 Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Wed, 20 May 2026 01:28:32 +0000
Subject: [PATCH] fix(pdf): drop statement_period_start/end columns from output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User asked to remove them — the two columns repeated the same
value on every row from a given statement, took up screen space
in the editor, and offered limited value once the date column
already carries the inferred full date.

What's kept:
- ``account_number`` — still stamped onto every row so multi-
  statement CSVs are self-attributing
- ``extract_statement_metadata`` — still runs every scan because
  ``period_end`` is the source of the year inference that binds
  Chase-style short ``01/13`` dates to ``20250113``
- ``_extract_statement_period`` and its tests — period
  detection itself isn't going anywhere, just its appearance in
  the output rows

What's removed:
- ``record["statement_period_start"]`` / ``record["statement_period_end"]``
  assignments in ``scan_pdf_for_transactions``
- The two columns from the page's column-ordering setup
- Tests pinning their presence; replaced with assertions that
  they're explicitly absent

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/10_PDF_Extractor.py |  8 ++------
 src/pdf_extract.py                | 29 ++++++++++++++---------------
 tests/test_pdf_extract_smoke.py   | 30 ++++++++++++++++++------------
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index e6a1698..b70f302 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -427,7 +427,7 @@ else:
 
     # Order columns so the user-facing fields are leftmost; raw +
     # internals are last and easy to scroll past or unselect at
-    # download time. Statement metadata sits with the transaction
+    # download time. ``account_number`` sits with the transaction
     # detail since it's per-row context an accountant typically
     # wants alongside the amounts.
     front = [
@@ -435,11 +435,7 @@ else:
         "description",
     ]
     amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
-    metadata_cols = [
-        "account_number",
-        "statement_period_start",
-        "statement_period_end",
-    ]
+    metadata_cols = ["account_number"]
     tail = ["source_file", "page", "raw"]
     ordered = [
         c for c in front + amount_cols + metadata_cols + tail
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index 8f43d3b..dd4e6f3 100644
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -782,15 +782,15 @@ def scan_pdf_for_transactions(
           "page": 1,
           "raw": "01/15/2026 Coffee $4.50",
           "account_number": "****1234",      # from header
-          "statement_period_start": "20260101",
-          "statement_period_end": "20260131",
         }
 
-    Header metadata (``account_number`` /
-    ``statement_period_start`` / ``statement_period_end``) is
-    extracted once per PDF and stamped onto every detected row.
-    That way a multi-statement CSV remains attributable per row
-    when it's reshaped or imported elsewhere.
+    Account number is extracted from the statement header once
+    per PDF and stamped onto every detected row so the CSV is
+    self-attributing when statements are combined. The statement
+    period IS detected (used internally for year inference on
+    short dates like "01/13") but isn't surfaced as a per-row
+    column — the inferred year already lives in the ``date``
+    field.
 
     Short dates without a year (``01/13``, ``Jan 13``) are bound
     to the year of the statement period's end before formatting.
@@ -915,15 +915,14 @@ def scan_pdf_for_transactions(
             if not _has_real_transaction_amount(record):
                 continue
 
-            # Stamp the header metadata onto every kept row so the
-            # CSV is self-attributing.
+            # Stamp the account number onto every kept row so the
+            # CSV is self-attributing when statements are combined.
+            # The period start/end aren't surfaced per row — they're
+            # used only for the year-inference fallback above
+            # (binding short dates like "01/13" to the statement's
+            # year) but downstream the date column already carries
+            # the inferred full date.
             record["account_number"] = metadata["account_number"] or ""
-            record["statement_period_start"] = format_date(
-                metadata["period_start"], output_date_format,
-            )
-            record["statement_period_end"] = format_date(
-                metadata["period_end"], output_date_format,
-            )
 
             out_rows.append(record)
             prev = record
diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py
index 2baca94..a7c0fd2 100644
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -153,16 +153,16 @@ class TestScanPdfForTransactions:
             "2026-01-15", "2026-01-16", "2026-01-17",
         ]
 
-    def test_metadata_fields_present_on_every_row(self, pdf_bytes):
+    def test_account_number_field_present_on_every_row(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(pdf_bytes)
-        # The fixture PDF has no statement-period or account
-        # header, so the metadata fields exist but are empty
-        # strings — the contract is: ALWAYS present on every row.
+        # ``account_number`` is the only per-row metadata field that
+        # surfaces in the CSV; the period fields are extracted but
+        # used only for internal year inference.
         for r in rows:
             assert "account_number" in r
-            assert "statement_period_start" in r
-            assert "statement_period_end" in r
+            assert "statement_period_start" not in r
+            assert "statement_period_end" not in r
 
     def test_parses_amounts_with_signs(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
@@ -214,30 +214,36 @@ class TestStatementHeaderEndToEnd:
     def pdf_bytes(self) -> bytes:
         return _build_statement_pdf_with_header()
 
-    def test_metadata_extracted_and_stamped(self, pdf_bytes):
+    def test_account_number_stamped(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(pdf_bytes)
         assert rows, "expected at least one transaction"
         for r in rows:
             assert r["account_number"] == "****5678"
-            assert r["statement_period_start"] == "20250101"
-            assert r["statement_period_end"] == "20250131"
 
     def test_short_dates_get_year_from_period(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(pdf_bytes)
-        # Short ``01/13`` + period ending in 2025 → 20250113
+        # Short ``01/13`` + period ending in 2025 → 20250113.
+        # The period itself isn't surfaced as a column anymore, but
+        # the year inference that depends on it still works because
+        # extraction happens internally before the per-row stamp.
         assert rows[0]["date"] == "20250113"
         assert rows[1]["date"] == "20250116"
 
+    def test_period_fields_not_in_output(self, pdf_bytes):
+        from src.pdf_extract import scan_pdf_for_transactions
+        rows, _ = scan_pdf_for_transactions(pdf_bytes)
+        for r in rows:
+            assert "statement_period_start" not in r
+            assert "statement_period_end" not in r
+
     def test_iso_format_round_trip(self, pdf_bytes):
         from src.pdf_extract import scan_pdf_for_transactions
         rows, _ = scan_pdf_for_transactions(
             pdf_bytes, output_date_format="%Y-%m-%d",
         )
         assert rows[0]["date"] == "2025-01-13"
-        assert rows[0]["statement_period_start"] == "2025-01-01"
-        assert rows[0]["statement_period_end"] == "2025-01-31"
 
 
 class TestMultiDateRow: