From 34b56b404a66844fa1bc8fe0a9e03505bd91b220 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 20 May 2026 01:28:32 +0000 Subject: [PATCH] fix(pdf): drop statement_period_start/end columns from output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User asked to remove them — the two columns repeated the same value on every row from a given statement, took up screen space in the editor, and offered limited value once the date column already carries the inferred full date. What's kept: - ``account_number`` — still stamped onto every row so multi- statement CSVs are self-attributing - ``extract_statement_metadata`` — still runs every scan because ``period_end`` is the source of the year inference that binds Chase-style short ``01/13`` dates to ``20250113`` - ``_extract_statement_period`` and its tests — period detection itself isn't going anywhere, just its appearance in the output rows What's removed: - ``record["statement_period_start"]`` / ``record["statement_period_end"]`` assignments in ``scan_pdf_for_transactions`` - The two columns from the page's column-ordering setup - Tests pinning their presence; replaced with assertions that they're explicitly absent Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 8 ++------ src/pdf_extract.py | 29 ++++++++++++++--------------- tests/test_pdf_extract_smoke.py | 30 ++++++++++++++++++------------ 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index e6a1698..b70f302 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -427,7 +427,7 @@ else: # Order columns so the user-facing fields are leftmost; raw + # internals are last and easy to scroll past or unselect at - # download time. Statement metadata sits with the transaction + # download time. ``account_number`` sits with the transaction # detail since it's per-row context an accountant typically # wants alongside the amounts. front = [ @@ -435,11 +435,7 @@ else: "description", ] amount_cols = sorted(c for c in df.columns if c.startswith("amount_")) - metadata_cols = [ - "account_number", - "statement_period_start", - "statement_period_end", - ] + metadata_cols = ["account_number"] tail = ["source_file", "page", "raw"] ordered = [ c for c in front + amount_cols + metadata_cols + tail diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 8f43d3b..dd4e6f3 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -782,15 +782,15 @@ def scan_pdf_for_transactions( "page": 1, "raw": "01/15/2026 Coffee $4.50", "account_number": "****1234", # from header - "statement_period_start": "20260101", - "statement_period_end": "20260131", } - Header metadata (``account_number`` / - ``statement_period_start`` / ``statement_period_end``) is - extracted once per PDF and stamped onto every detected row. - That way a multi-statement CSV remains attributable per row - when it's reshaped or imported elsewhere. + Account number is extracted from the statement header once + per PDF and stamped onto every detected row so the CSV is + self-attributing when statements are combined. The statement + period IS detected (used internally for year inference on + short dates like "01/13") but isn't surfaced as a per-row + column — the inferred year already lives in the ``date`` + field. Short dates without a year (``01/13``, ``Jan 13``) are bound to the year of the statement period's end before formatting. @@ -915,15 +915,14 @@ def scan_pdf_for_transactions( if not _has_real_transaction_amount(record): continue - # Stamp the header metadata onto every kept row so the - # CSV is self-attributing. + # Stamp the account number onto every kept row so the + # CSV is self-attributing when statements are combined. + # The period start/end aren't surfaced per row — they're + # used only for the year-inference fallback above + # (binding short dates like "01/13" to the statement's + # year) but downstream the date column already carries + # the inferred full date. record["account_number"] = metadata["account_number"] or "" - record["statement_period_start"] = format_date( - metadata["period_start"], output_date_format, - ) - record["statement_period_end"] = format_date( - metadata["period_end"], output_date_format, - ) out_rows.append(record) prev = record diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index 2baca94..a7c0fd2 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -153,16 +153,16 @@ class TestScanPdfForTransactions: "2026-01-15", "2026-01-16", "2026-01-17", ] - def test_metadata_fields_present_on_every_row(self, pdf_bytes): + def test_account_number_field_present_on_every_row(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) - # The fixture PDF has no statement-period or account - # header, so the metadata fields exist but are empty - # strings — the contract is: ALWAYS present on every row. + # ``account_number`` is the only per-row metadata field that + # surfaces in the CSV; the period fields are extracted but + # used only for internal year inference. for r in rows: assert "account_number" in r - assert "statement_period_start" in r - assert "statement_period_end" in r + assert "statement_period_start" not in r + assert "statement_period_end" not in r def test_parses_amounts_with_signs(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions @@ -214,30 +214,36 @@ class TestStatementHeaderEndToEnd: def pdf_bytes(self) -> bytes: return _build_statement_pdf_with_header() - def test_metadata_extracted_and_stamped(self, pdf_bytes): + def test_account_number_stamped(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) assert rows, "expected at least one transaction" for r in rows: assert r["account_number"] == "****5678" - assert r["statement_period_start"] == "20250101" - assert r["statement_period_end"] == "20250131" def test_short_dates_get_year_from_period(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) - # Short ``01/13`` + period ending in 2025 → 20250113 + # Short ``01/13`` + period ending in 2025 → 20250113. + # The period itself isn't surfaced as a column anymore, but + # the year inference that depends on it still works because + # extraction happens internally before the per-row stamp. assert rows[0]["date"] == "20250113" assert rows[1]["date"] == "20250116" + def test_period_fields_not_in_output(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + for r in rows: + assert "statement_period_start" not in r + assert "statement_period_end" not in r + def test_iso_format_round_trip(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions( pdf_bytes, output_date_format="%Y-%m-%d", ) assert rows[0]["date"] == "2025-01-13" - assert rows[0]["statement_period_start"] == "2025-01-01" - assert rows[0]["statement_period_end"] == "2025-01-31" class TestMultiDateRow: