fix(pdf): drop statement_period_start/end columns from output
User asked to remove them — the two columns repeated the same value on every row from a given statement, took up screen space in the editor, and offered limited value once the date column already carries the inferred full date. What's kept: - ``account_number`` — still stamped onto every row so multi- statement CSVs are self-attributing - ``extract_statement_metadata`` — still runs every scan because ``period_end`` is the source of the year inference that binds Chase-style short ``01/13`` dates to ``20250113`` - ``_extract_statement_period`` and its tests — period detection itself isn't going anywhere, just its appearance in the output rows What's removed: - ``record["statement_period_start"]`` / ``record["statement_period_end"]`` assignments in ``scan_pdf_for_transactions`` - The two columns from the page's column-ordering setup - Tests pinning their presence; replaced with assertions that they're explicitly absent Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -427,7 +427,7 @@ else:
|
|||||||
|
|
||||||
# Order columns so the user-facing fields are leftmost; raw +
|
# Order columns so the user-facing fields are leftmost; raw +
|
||||||
# internals are last and easy to scroll past or unselect at
|
# internals are last and easy to scroll past or unselect at
|
||||||
# download time. Statement metadata sits with the transaction
|
# download time. ``account_number`` sits with the transaction
|
||||||
# detail since it's per-row context an accountant typically
|
# detail since it's per-row context an accountant typically
|
||||||
# wants alongside the amounts.
|
# wants alongside the amounts.
|
||||||
front = [
|
front = [
|
||||||
@@ -435,11 +435,7 @@ else:
|
|||||||
"description",
|
"description",
|
||||||
]
|
]
|
||||||
amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
|
amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
|
||||||
metadata_cols = [
|
metadata_cols = ["account_number"]
|
||||||
"account_number",
|
|
||||||
"statement_period_start",
|
|
||||||
"statement_period_end",
|
|
||||||
]
|
|
||||||
tail = ["source_file", "page", "raw"]
|
tail = ["source_file", "page", "raw"]
|
||||||
ordered = [
|
ordered = [
|
||||||
c for c in front + amount_cols + metadata_cols + tail
|
c for c in front + amount_cols + metadata_cols + tail
|
||||||
|
|||||||
@@ -782,15 +782,15 @@ def scan_pdf_for_transactions(
|
|||||||
"page": 1,
|
"page": 1,
|
||||||
"raw": "01/15/2026 Coffee $4.50",
|
"raw": "01/15/2026 Coffee $4.50",
|
||||||
"account_number": "****1234", # from header
|
"account_number": "****1234", # from header
|
||||||
"statement_period_start": "20260101",
|
|
||||||
"statement_period_end": "20260131",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Header metadata (``account_number`` /
|
Account number is extracted from the statement header once
|
||||||
``statement_period_start`` / ``statement_period_end``) is
|
per PDF and stamped onto every detected row so the CSV is
|
||||||
extracted once per PDF and stamped onto every detected row.
|
self-attributing when statements are combined. The statement
|
||||||
That way a multi-statement CSV remains attributable per row
|
period IS detected (used internally for year inference on
|
||||||
when it's reshaped or imported elsewhere.
|
short dates like "01/13") but isn't surfaced as a per-row
|
||||||
|
column — the inferred year already lives in the ``date``
|
||||||
|
field.
|
||||||
|
|
||||||
Short dates without a year (``01/13``, ``Jan 13``) are bound
|
Short dates without a year (``01/13``, ``Jan 13``) are bound
|
||||||
to the year of the statement period's end before formatting.
|
to the year of the statement period's end before formatting.
|
||||||
@@ -915,15 +915,14 @@ def scan_pdf_for_transactions(
|
|||||||
if not _has_real_transaction_amount(record):
|
if not _has_real_transaction_amount(record):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Stamp the header metadata onto every kept row so the
|
# Stamp the account number onto every kept row so the
|
||||||
# CSV is self-attributing.
|
# CSV is self-attributing when statements are combined.
|
||||||
|
# The period start/end aren't surfaced per row — they're
|
||||||
|
# used only for the year-inference fallback above
|
||||||
|
# (binding short dates like "01/13" to the statement's
|
||||||
|
# year) but downstream the date column already carries
|
||||||
|
# the inferred full date.
|
||||||
record["account_number"] = metadata["account_number"] or ""
|
record["account_number"] = metadata["account_number"] or ""
|
||||||
record["statement_period_start"] = format_date(
|
|
||||||
metadata["period_start"], output_date_format,
|
|
||||||
)
|
|
||||||
record["statement_period_end"] = format_date(
|
|
||||||
metadata["period_end"], output_date_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
out_rows.append(record)
|
out_rows.append(record)
|
||||||
prev = record
|
prev = record
|
||||||
|
|||||||
@@ -153,16 +153,16 @@ class TestScanPdfForTransactions:
|
|||||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||||
]
|
]
|
||||||
|
|
||||||
def test_metadata_fields_present_on_every_row(self, pdf_bytes):
|
def test_account_number_field_present_on_every_row(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
# The fixture PDF has no statement-period or account
|
# ``account_number`` is the only per-row metadata field that
|
||||||
# header, so the metadata fields exist but are empty
|
# surfaces in the CSV; the period fields are extracted but
|
||||||
# strings — the contract is: ALWAYS present on every row.
|
# used only for internal year inference.
|
||||||
for r in rows:
|
for r in rows:
|
||||||
assert "account_number" in r
|
assert "account_number" in r
|
||||||
assert "statement_period_start" in r
|
assert "statement_period_start" not in r
|
||||||
assert "statement_period_end" in r
|
assert "statement_period_end" not in r
|
||||||
|
|
||||||
def test_parses_amounts_with_signs(self, pdf_bytes):
|
def test_parses_amounts_with_signs(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
@@ -214,30 +214,36 @@ class TestStatementHeaderEndToEnd:
|
|||||||
def pdf_bytes(self) -> bytes:
|
def pdf_bytes(self) -> bytes:
|
||||||
return _build_statement_pdf_with_header()
|
return _build_statement_pdf_with_header()
|
||||||
|
|
||||||
def test_metadata_extracted_and_stamped(self, pdf_bytes):
|
def test_account_number_stamped(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
assert rows, "expected at least one transaction"
|
assert rows, "expected at least one transaction"
|
||||||
for r in rows:
|
for r in rows:
|
||||||
assert r["account_number"] == "****5678"
|
assert r["account_number"] == "****5678"
|
||||||
assert r["statement_period_start"] == "20250101"
|
|
||||||
assert r["statement_period_end"] == "20250131"
|
|
||||||
|
|
||||||
def test_short_dates_get_year_from_period(self, pdf_bytes):
|
def test_short_dates_get_year_from_period(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
# Short ``01/13`` + period ending in 2025 → 20250113
|
# Short ``01/13`` + period ending in 2025 → 20250113.
|
||||||
|
# The period itself isn't surfaced as a column anymore, but
|
||||||
|
# the year inference that depends on it still works because
|
||||||
|
# extraction happens internally before the per-row stamp.
|
||||||
assert rows[0]["date"] == "20250113"
|
assert rows[0]["date"] == "20250113"
|
||||||
assert rows[1]["date"] == "20250116"
|
assert rows[1]["date"] == "20250116"
|
||||||
|
|
||||||
|
def test_period_fields_not_in_output(self, pdf_bytes):
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
for r in rows:
|
||||||
|
assert "statement_period_start" not in r
|
||||||
|
assert "statement_period_end" not in r
|
||||||
|
|
||||||
def test_iso_format_round_trip(self, pdf_bytes):
|
def test_iso_format_round_trip(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(
|
rows, _ = scan_pdf_for_transactions(
|
||||||
pdf_bytes, output_date_format="%Y-%m-%d",
|
pdf_bytes, output_date_format="%Y-%m-%d",
|
||||||
)
|
)
|
||||||
assert rows[0]["date"] == "2025-01-13"
|
assert rows[0]["date"] == "2025-01-13"
|
||||||
assert rows[0]["statement_period_start"] == "2025-01-01"
|
|
||||||
assert rows[0]["statement_period_end"] == "2025-01-31"
|
|
||||||
|
|
||||||
|
|
||||||
class TestMultiDateRow:
|
class TestMultiDateRow:
|
||||||
|
|||||||
Reference in New Issue
Block a user