fix(pdf): drop zero-amount rows; multi-date rows clean description
Two corrections from real-statement feedback:
**1. Drop rows where the transaction amount is exactly 0.**
Bank statements include date+amount-shaped noise like
"INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD
0.00 1,234.56" — all match the date+amount heuristic but
aren't transactions. New filter in
``scan_pdf_for_transactions``: drop rows whose ``amount_1``
parses to exactly 0. Non-zero balances in ``amount_2`` don't
rescue a zero amount_1 — leftmost amount is the canonical
transaction amount. Unparsed-but-non-empty amount strings are
kept (user verifies in the editor).
**2. Multi-date rows: first date wins for the column, every
date excluded from the description.** Chase / BofA / Wells
commonly show both a transaction date and a posting date per
row:
01/13 01/14 COFFEE SHOP $4.50
Before this fix, ``_find_dates_in_words`` returned the first
date only and the second date leaked into description as
"01/14 COFFEE SHOP". Now it returns ALL dates with their word
ranges; the scanner uses ``dates[0]`` as the canonical date
and passes every range to the description builder for
exclusion.
The detector's two-pass strategy now also guards against
mixing full-year and short-date matches on the same row.
Previously, a header line like ``Page 1/2 of 3 ... Statement
Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``,
and ``1/2`` (being leftmost) would have won the date column.
Now: if any full-year date is found on the row, short patterns
are NOT also collected — full year anchors interpretation. A
row with no full-year date (Chase short-date case) still falls
back to short patterns and collects all of them.
New tests:
- ``test_multiple_dates_returned_in_position_order`` —
``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description``
— end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped``
— "INTEREST EARNED 0.00" row dropped while real txn kept
- ``test_negative_amount_kept`` — pin that -40.00 is not
treated as zero by the filter
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -441,45 +441,61 @@ def extract_pages_auto(
|
||||
def _find_dates_in_words(
|
||||
row_words: list[WordBox],
|
||||
) -> list[tuple[int, int, str]]:
|
||||
"""Return ``[(start_idx, end_idx, date_text)]`` for the first
|
||||
date-like substring on this row, or ``[]`` if none.
|
||||
"""Return every date-like substring on this row, sorted by
|
||||
position. Each entry is ``(start_idx, end_idx_exclusive, text)``.
|
||||
|
||||
Two-pass search:
|
||||
|
||||
- **Pass 1** — full-year patterns (``01/15/2026``,
|
||||
``Jan 13, 2026``). Tries the longest window first within
|
||||
this pass so a multi-word ``Jan 15, 2026`` isn't truncated
|
||||
to ``Jan 15``.
|
||||
``Jan 13, 2026``). Longest window first so multi-word dates
|
||||
aren't truncated to a partial short match.
|
||||
- **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
|
||||
runs if pass 1 found nothing — otherwise a stray
|
||||
``Page 1/2`` on the same line could shadow the real dated
|
||||
transaction.
|
||||
claims word ranges that pass 1 didn't already take, so a
|
||||
real ``01/13/2026`` always wins over an adjacent
|
||||
``Page 1/2``.
|
||||
|
||||
``end_idx`` is exclusive — caller uses ``range(start, end)``
|
||||
to exclude all words the date consumed from the description
|
||||
(the previous single-index return mis-attributed the day
|
||||
token of multi-word dates like ``Jan 13`` to the description).
|
||||
Some statements show both a transaction date and a posting
|
||||
date per row (Chase, BofA, …). The scanner uses the first
|
||||
match as the canonical date for the CSV column, and excludes
|
||||
EVERY date from the description so the second / third dates
|
||||
don't leak into the description text.
|
||||
"""
|
||||
for patterns, window_order in (
|
||||
(_DATE_RES_FULL, (3, 2, 1)),
|
||||
(_DATE_RES_SHORT, (2, 1)),
|
||||
):
|
||||
def _scan(patterns, window_order):
|
||||
local_found: list[tuple[int, int, str]] = []
|
||||
local_claimed: set[int] = set()
|
||||
for i in range(len(row_words)):
|
||||
if i in local_claimed:
|
||||
continue
|
||||
matched = False
|
||||
for window in window_order:
|
||||
end = i + window
|
||||
if end > len(row_words):
|
||||
continue
|
||||
if any(j in local_claimed for j in range(i, end)):
|
||||
continue
|
||||
chunk = " ".join(x.text for x in row_words[i:end])
|
||||
for rx in patterns:
|
||||
m = rx.search(chunk)
|
||||
if m:
|
||||
# Count whitespace-separated tokens in the
|
||||
# MATCH, not in the window — the window may
|
||||
# have included extra trailing words the
|
||||
# regex didn't actually consume.
|
||||
consumed = max(1, len(m.group(1).split()))
|
||||
return [(i, i + consumed, m.group(1))]
|
||||
return []
|
||||
actual_end = i + consumed
|
||||
local_found.append((i, actual_end, m.group(1)))
|
||||
local_claimed.update(range(i, actual_end))
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
return local_found
|
||||
|
||||
full = _scan(_DATE_RES_FULL, (3, 2, 1))
|
||||
if full:
|
||||
# A real full-year date on the row anchors interpretation.
|
||||
# Don't ALSO collect short patterns — they're almost always
|
||||
# page numbers ("Page 1/2") or fractions in memos when a
|
||||
# real date is present.
|
||||
return sorted(full, key=lambda t: t[0])
|
||||
short = _scan(_DATE_RES_SHORT, (2, 1))
|
||||
return sorted(short, key=lambda t: t[0])
|
||||
|
||||
|
||||
def _find_amount_tokens(
|
||||
@@ -506,23 +522,30 @@ def _find_amount_tokens(
|
||||
|
||||
def _description_from_row(
|
||||
row_words: list[WordBox],
|
||||
date_range: tuple[int, int],
|
||||
date_ranges: list[tuple[int, int]],
|
||||
amount_idxs: set[int],
|
||||
) -> str:
|
||||
"""Stitch the description from the row's non-date, non-amount
|
||||
tokens. ``date_range`` is ``(start, end)`` exclusive — every
|
||||
word in that range is excluded so multi-word dates like
|
||||
``Jan 13`` don't leak the day token into the description.
|
||||
tokens. ``date_ranges`` is a list of ``(start, end)`` (end
|
||||
exclusive) — every word in any range is excluded.
|
||||
|
||||
Why a list: some bank statements show two dates per row
|
||||
(transaction + posting). Without excluding all of them, the
|
||||
extra date(s) leak into the description and look like trash.
|
||||
|
||||
Keeps tokens before the first amount and after the last
|
||||
amount (trailing check numbers, memos); drops words between
|
||||
amount tokens (usually whitespace artifacts in column gaps)."""
|
||||
date_start, date_end = date_range
|
||||
amount tokens (usually whitespace artifacts in column gaps).
|
||||
"""
|
||||
excluded: set[int] = set()
|
||||
for start, end in date_ranges:
|
||||
excluded.update(range(start, end))
|
||||
|
||||
keep: list[str] = []
|
||||
seen_first_amount = False
|
||||
last_amount_idx = max(amount_idxs) if amount_idxs else -1
|
||||
for i, w in enumerate(row_words):
|
||||
if date_start <= i < date_end:
|
||||
if i in excluded:
|
||||
continue
|
||||
if i in amount_idxs:
|
||||
seen_first_amount = True
|
||||
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
|
||||
)
|
||||
continue
|
||||
|
||||
date_start, date_end, date_text = dates[0]
|
||||
# First date wins for the "date" column; ALL dates are
|
||||
# excluded from the description so a row carrying both
|
||||
# a transaction date and a posting date doesn't leak
|
||||
# the second one into description text.
|
||||
_, _, first_date_text = dates[0]
|
||||
date_ranges = [(s, e) for s, e, _ in dates]
|
||||
amount_idxs = {idx for idx, _, _ in amount_tokens}
|
||||
desc = _description_from_row(
|
||||
row_words, (date_start, date_end), amount_idxs,
|
||||
row_words, date_ranges, amount_idxs,
|
||||
)
|
||||
|
||||
record: dict[str, Any] = {
|
||||
"date": parse_date(date_text, date_formats) or date_text,
|
||||
"date": (
|
||||
parse_date(first_date_text, date_formats)
|
||||
or first_date_text
|
||||
),
|
||||
"description": desc,
|
||||
"page": page.page_no,
|
||||
"raw": line,
|
||||
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
|
||||
record[f"amount_{k}"] = (
|
||||
parsed if parsed is not None else txt
|
||||
)
|
||||
|
||||
# Drop rows where the transaction amount is exactly 0.
|
||||
# Bank statements include noise like "INTEREST EARNED
|
||||
# 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
|
||||
# heuristic but aren't real transactions. We key off
|
||||
# ``amount_1`` (leftmost amount = usually the txn
|
||||
# amount); a non-zero balance in ``amount_2`` doesn't
|
||||
# rescue a zero ``amount_1``.
|
||||
if not _has_real_transaction_amount(record):
|
||||
continue
|
||||
|
||||
out_rows.append(record)
|
||||
prev = record
|
||||
|
||||
return out_rows, warnings
|
||||
|
||||
|
||||
def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
|
||||
"""``amount_1`` is the row's primary amount. Drop rows whose
|
||||
amount_1 parsed to exactly 0; keep everything else (positive,
|
||||
negative, or unparsed-but-non-empty)."""
|
||||
amount_1 = record.get("amount_1")
|
||||
if amount_1 is None:
|
||||
return False
|
||||
if isinstance(amount_1, (int, float)):
|
||||
return amount_1 != 0
|
||||
# Unparsed string — keep so the user can verify in the editor.
|
||||
return bool(str(amount_1).strip())
|
||||
|
||||
|
||||
def diagnose_pdf_lines(
|
||||
pdf_bytes: bytes,
|
||||
*,
|
||||
|
||||
Reference in New Issue
Block a user