fix(pdf): drop statement_period_start/end columns from output

User asked to remove them — the two columns repeated the same
value on every row from a given statement, took up screen space
in the editor, and offered limited value once the date column
already carries the inferred full date.

What's kept:
- ``account_number`` — still stamped onto every row so multi-
  statement CSVs are self-attributing
- ``extract_statement_metadata`` — still runs every scan because
  ``period_end`` is the source of the year inference that binds
  Chase-style short ``01/13`` dates to ``20250113``
- ``_extract_statement_period`` and its tests — period
  detection itself isn't going anywhere, just its appearance in
  the output rows

What's removed:
- ``record["statement_period_start"]`` / ``record["statement_period_end"]``
  assignments in ``scan_pdf_for_transactions``
- The two columns from the page's column-ordering setup
- Tests pinning their presence; replaced with assertions that
  they're explicitly absent

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 01:28:32 +00:00
parent ad7c22d7fb
commit 34b56b404a
3 changed files with 34 additions and 33 deletions

View File

@@ -427,7 +427,7 @@ else:
# Order columns so the user-facing fields are leftmost; raw +
# internals are last and easy to scroll past or unselect at
# download time. Statement metadata sits with the transaction
# download time. ``account_number`` sits with the transaction
# detail since it's per-row context an accountant typically
# wants alongside the amounts.
front = [
@@ -435,11 +435,7 @@ else:
"description",
]
amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
metadata_cols = [
"account_number",
"statement_period_start",
"statement_period_end",
]
metadata_cols = ["account_number"]
tail = ["source_file", "page", "raw"]
ordered = [
c for c in front + amount_cols + metadata_cols + tail

View File

@@ -782,15 +782,15 @@ def scan_pdf_for_transactions(
"page": 1,
"raw": "01/15/2026 Coffee $4.50",
"account_number": "****1234", # from header
"statement_period_start": "20260101",
"statement_period_end": "20260131",
}
Header metadata (``account_number`` /
``statement_period_start`` / ``statement_period_end``) is
extracted once per PDF and stamped onto every detected row.
That way a multi-statement CSV remains attributable per row
when it's reshaped or imported elsewhere.
Account number is extracted from the statement header once
per PDF and stamped onto every detected row so the CSV is
self-attributing when statements are combined. The statement
period IS detected (used internally for year inference on
short dates like "01/13") but isn't surfaced as a per-row
column — the inferred year already lives in the ``date``
field.
Short dates without a year (``01/13``, ``Jan 13``) are bound
to the year of the statement period's end before formatting.
@@ -915,15 +915,14 @@ def scan_pdf_for_transactions(
if not _has_real_transaction_amount(record):
continue
# Stamp the header metadata onto every kept row so the
# CSV is self-attributing.
# Stamp the account number onto every kept row so the
# CSV is self-attributing when statements are combined.
# The period start/end aren't surfaced per row — they're
# used only for the year-inference fallback above
# (binding short dates like "01/13" to the statement's
# year) but downstream the date column already carries
# the inferred full date.
record["account_number"] = metadata["account_number"] or ""
record["statement_period_start"] = format_date(
metadata["period_start"], output_date_format,
)
record["statement_period_end"] = format_date(
metadata["period_end"], output_date_format,
)
out_rows.append(record)
prev = record

View File

@@ -153,16 +153,16 @@ class TestScanPdfForTransactions:
"2026-01-15", "2026-01-16", "2026-01-17",
]
def test_metadata_fields_present_on_every_row(self, pdf_bytes):
def test_account_number_field_present_on_every_row(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
rows, _ = scan_pdf_for_transactions(pdf_bytes)
# The fixture PDF has no statement-period or account
# header, so the metadata fields exist but are empty
# strings — the contract is: ALWAYS present on every row.
# ``account_number`` is the only per-row metadata field that
# surfaces in the CSV; the period fields are extracted but
# used only for internal year inference.
for r in rows:
assert "account_number" in r
assert "statement_period_start" in r
assert "statement_period_end" in r
assert "statement_period_start" not in r
assert "statement_period_end" not in r
def test_parses_amounts_with_signs(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
@@ -214,30 +214,36 @@ class TestStatementHeaderEndToEnd:
def pdf_bytes(self) -> bytes:
return _build_statement_pdf_with_header()
def test_metadata_extracted_and_stamped(self, pdf_bytes):
def test_account_number_stamped(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
rows, _ = scan_pdf_for_transactions(pdf_bytes)
assert rows, "expected at least one transaction"
for r in rows:
assert r["account_number"] == "****5678"
assert r["statement_period_start"] == "20250101"
assert r["statement_period_end"] == "20250131"
def test_short_dates_get_year_from_period(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
rows, _ = scan_pdf_for_transactions(pdf_bytes)
# Short ``01/13`` + period ending in 2025 → 20250113
# Short ``01/13`` + period ending in 2025 → 20250113.
# The period itself isn't surfaced as a column anymore, but
# the year inference that depends on it still works because
# extraction happens internally before the per-row stamp.
assert rows[0]["date"] == "20250113"
assert rows[1]["date"] == "20250116"
def test_period_fields_not_in_output(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
rows, _ = scan_pdf_for_transactions(pdf_bytes)
for r in rows:
assert "statement_period_start" not in r
assert "statement_period_end" not in r
def test_iso_format_round_trip(self, pdf_bytes):
from src.pdf_extract import scan_pdf_for_transactions
rows, _ = scan_pdf_for_transactions(
pdf_bytes, output_date_format="%Y-%m-%d",
)
assert rows[0]["date"] == "2025-01-13"
assert rows[0]["statement_period_start"] == "2025-01-01"
assert rows[0]["statement_period_end"] == "2025-01-31"
class TestMultiDateRow: