fix(pdf): drop zero-amount rows; strip all dates from description

Two corrections from real-statement feedback:

**1. Drop rows where the transaction amount is exactly 0.**
Bank statements include date+amount-shaped noise like
"INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD
0.00 1,234.56" — all match the date+amount heuristic but
aren't transactions. New filter in
``scan_pdf_for_transactions``: drop rows whose ``amount_1``
parses to exactly 0. Non-zero balances in ``amount_2`` don't
rescue a zero amount_1 — leftmost amount is the canonical
transaction amount. Unparsed-but-non-empty amount strings are
kept (user verifies in the editor).

**2. Multi-date rows: first date wins for the column, every
date excluded from the description.** Chase / BofA / Wells
commonly show both a transaction date and a posting date per
row:

    01/13  01/14  COFFEE SHOP  $4.50

Before this fix, ``_find_dates_in_words`` returned the first
date only and the second date leaked into description as
"01/14 COFFEE SHOP". Now it returns ALL dates with their word
ranges; the scanner uses ``dates[0]`` as the canonical date
and passes every range to the description builder for
exclusion.

The detector's two-pass strategy also guards against mixing
full-year and short-date matches on the same row. Without that
guard, returning every match would make a header line like
``Page 1/2 of 3 ... Statement Date 01/13/2026`` yield both
``1/2`` and ``01/13/2026``, and ``1/2`` (being leftmost) would
win the date column. So: if any full-year date is found on the
row, short patterns are NOT also collected — full year anchors
interpretation. A row with no full-year date (Chase short-date
case) still falls back to short patterns and collects all of
them.

New tests:
- ``test_multiple_dates_returned_in_position_order`` —
  ``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description``
  — end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped``
  — "INTEREST EARNED 0.00" row dropped while real txn kept
- ``test_negative_amount_kept`` — pin that -40.00 is not
  treated as zero by the filter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 00:12:21 +00:00
parent 263af3c7c2
commit 3cf935c999
3 changed files with 205 additions and 34 deletions

View File

@@ -441,45 +441,61 @@ def extract_pages_auto(
def _find_dates_in_words(
    row_words: list[WordBox],
) -> list[tuple[int, int, str]]:
    """Return every date-like substring on this row, sorted by position.

    Each entry is ``(start_idx, end_idx_exclusive, text)``; the indices
    address ``row_words``. Returns ``[]`` when nothing matches.

    Two-pass search:

    - **Pass 1** — full-year patterns (``01/15/2026``,
      ``Jan 13, 2026``). Longest window first so multi-word dates
      aren't truncated to a partial short match.
    - **Pass 2** — short patterns (``01/13``, ``Jan 13``). Runs only
      when pass 1 found nothing on the row: once a full-year date is
      present, short matches are not collected at all, so a stray
      ``Page 1/2`` can never shadow a real ``01/13/2026``.

    ``end_idx`` is exclusive — the caller uses ``range(start, end)``
    to exclude every word the date consumed from the description.

    Some statements show both a transaction date and a posting date
    per row (Chase, BofA, …). The scanner uses the first match as the
    canonical date for the CSV column, and excludes EVERY date from
    the description so the second / third dates don't leak into the
    description text.
    """

    def _match_at(i: int, patterns, window_order: tuple[int, ...],
                  claimed: set[int]) -> tuple[int, int, str] | None:
        # Try each window size starting at word ``i``; first hit wins.
        for window in window_order:
            end = i + window
            if end > len(row_words):
                continue
            # Never build a window across words an earlier match owns.
            if not claimed.isdisjoint(range(i, end)):
                continue
            chunk = " ".join(w.text for w in row_words[i:end])
            for rx in patterns:
                m = rx.search(chunk)
                if m is None:
                    continue
                text = m.group(1)
                # Count whitespace-separated tokens in the MATCH, not
                # in the window — the window may include extra words
                # the regex didn't actually consume.
                consumed = max(1, len(text.split()))
                # ``search`` may match at a later word of the window.
                # Words are joined by single spaces, so the number of
                # spaces before the match is the number of leading
                # words to skip. NOTE(review): this is 0 (old
                # behaviour) if the patterns are start-anchored —
                # the pattern definitions are not visible here.
                skipped = chunk.count(" ", 0, m.start(1))
                start = i + skipped
                return start, start + consumed, text
        return None

    def _scan(patterns, window_order: tuple[int, ...]) -> list[tuple[int, int, str]]:
        found: list[tuple[int, int, str]] = []
        claimed: set[int] = set()
        for i in range(len(row_words)):
            if i in claimed:
                continue
            hit = _match_at(i, patterns, window_order, claimed)
            if hit is not None:
                found.append(hit)
                claimed.update(range(hit[0], hit[1]))
        return found

    full = _scan(_DATE_RES_FULL, (3, 2, 1))
    if full:
        # A real full-year date on the row anchors interpretation.
        # Don't ALSO collect short patterns — they're almost always
        # page numbers ("Page 1/2") or fractions in memos when a
        # real date is present.
        return sorted(full, key=lambda t: t[0])
    short = _scan(_DATE_RES_SHORT, (2, 1))
    return sorted(short, key=lambda t: t[0])
def _find_amount_tokens(
@@ -506,23 +522,30 @@ def _find_amount_tokens(
def _description_from_row(
row_words: list[WordBox],
date_range: tuple[int, int],
date_ranges: list[tuple[int, int]],
amount_idxs: set[int],
) -> str:
"""Stitch the description from the row's non-date, non-amount
tokens. ``date_range`` is ``(start, end)`` exclusive — every
word in that range is excluded so multi-word dates like
``Jan 13`` don't leak the day token into the description.
tokens. ``date_ranges`` is a list of ``(start, end)`` (end
exclusive) — every word in any range is excluded.
Why a list: some bank statements show two dates per row
(transaction + posting). Without excluding all of them, the
extra date(s) leak into the description and look like trash.
Keeps tokens before the first amount and after the last
amount (trailing check numbers, memos); drops words between
amount tokens (usually whitespace artifacts in column gaps)."""
date_start, date_end = date_range
amount tokens (usually whitespace artifacts in column gaps).
"""
excluded: set[int] = set()
for start, end in date_ranges:
excluded.update(range(start, end))
keep: list[str] = []
seen_first_amount = False
last_amount_idx = max(amount_idxs) if amount_idxs else -1
for i, w in enumerate(row_words):
if date_start <= i < date_end:
if i in excluded:
continue
if i in amount_idxs:
seen_first_amount = True
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
)
continue
date_start, date_end, date_text = dates[0]
# First date wins for the "date" column; ALL dates are
# excluded from the description so a row carrying both
# a transaction date and a posting date doesn't leak
# the second one into description text.
_, _, first_date_text = dates[0]
date_ranges = [(s, e) for s, e, _ in dates]
amount_idxs = {idx for idx, _, _ in amount_tokens}
desc = _description_from_row(
row_words, (date_start, date_end), amount_idxs,
row_words, date_ranges, amount_idxs,
)
record: dict[str, Any] = {
"date": parse_date(date_text, date_formats) or date_text,
"date": (
parse_date(first_date_text, date_formats)
or first_date_text
),
"description": desc,
"page": page.page_no,
"raw": line,
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
record[f"amount_{k}"] = (
parsed if parsed is not None else txt
)
# Drop rows where the transaction amount is exactly 0.
# Bank statements include noise like "INTEREST EARNED
# 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
# heuristic but aren't real transactions. We key off
# ``amount_1`` (leftmost amount = usually the txn
# amount); a non-zero balance in ``amount_2`` doesn't
# rescue a zero ``amount_1``.
if not _has_real_transaction_amount(record):
continue
out_rows.append(record)
prev = record
return out_rows, warnings
def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
    """Tell whether ``record`` carries a usable primary amount.

    ``amount_1`` is the row's primary amount. The row is rejected
    (``False``) when ``amount_1`` is missing/``None`` or is a number
    equal to exactly 0; it is accepted (``True``) for any non-zero
    number (positive or negative) and for an unparsed, non-blank
    string, which is kept so the user can verify it in the editor.

    Any ``numbers.Number`` is treated as parsed — that covers ``int``
    and ``float`` as well as ``Decimal`` / ``Fraction``, so a parser
    returning ``Decimal("0.00")`` is filtered like ``0.0`` instead of
    slipping through the string branch.
    """
    # Function-scope import keeps this helper self-contained.
    from numbers import Number

    amount_1 = record.get("amount_1")
    if amount_1 is None:
        return False
    if isinstance(amount_1, Number):
        return amount_1 != 0
    # Unparsed string — keep so the user can verify in the editor.
    return bool(str(amount_1).strip())
def diagnose_pdf_lines(
pdf_bytes: bytes,
*,

View File

@@ -151,14 +151,29 @@ class TestFindDatesInWords:
def test_short_pattern_does_not_shadow_full_year(self):
    """If a full-year date is present, short patterns shouldn't
    steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
    return the real ``01/13/2026`` first, and the ``1/2`` page
    marker must not be reported as a date at all."""
    row = [
        _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
        _w("3", 100, 0),
        _w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
    ]
    result = _find_dates_in_words(row)
    texts = [text for _, _, text in result]
    # Guard first so a regression fails as an assertion, not IndexError.
    assert texts, "expected the full-year date to be detected"
    # Full-year match wins position 0 in the returned list.
    assert texts[0] == "01/13/2026"
    # Short patterns are not collected once a full-year date exists.
    assert "1/2" not in texts
def test_multiple_dates_returned_in_position_order(self):
    """Two short dates on one row (Chase-style posting + txn
    dates) are both returned, left to right."""
    words = [
        _w("01/13", 0, 0),
        _w("01/14", 50, 0),
        _w("Coffee", 100, 0),
        _w("$4.50", 200, 0),
    ]
    found = _find_dates_in_words(words)
    assert len(found) == 2
    first, second = found
    assert first[2] == "01/13"
    assert second[2] == "01/14"
    # Each date occupies exactly one word: index 0, then index 1.
    assert first[:2] == (0, 1)
    assert second[:2] == (1, 2)
def test_no_date(self):
row = [_w("Just", 0, 0), _w("text", 50, 0)]

View File

@@ -144,6 +144,107 @@ class TestScanPdfForTransactions:
# ---------------------------------------------------------------------------
class TestMultiDateRow:
    """Rows carrying two dates (transaction + posting, as on Chase
    or BofA statements): the leftmost date fills the date column and
    no date token may appear in the description."""

    def test_first_date_wins_second_excluded_from_description(self):
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        # (x0, x1, text) for a single row at top=0 / bottom=10.
        layout = [
            (0, 40, "01/13"),
            (50, 90, "01/14"),
            (100, 160, "Coffee"),
            (170, 210, "Shop"),
            (220, 270, "$4.50"),
        ]

        def stub_pages(_b, *, allow_ocr=True):
            boxes = [
                WordBox(x0=left, top=0, x1=right, bottom=10, text=label)
                for left, right, label in layout
            ]
            page = Page(
                page_no=1, width=300, height=20, text="", words=boxes,
            )
            return [page], []

        # Swap the page extractor for the stub, restoring it afterwards.
        saved = mod.extract_pages_auto
        mod.extract_pages_auto = stub_pages
        try:
            rows, _ = scan_pdf_for_transactions(b"")
        finally:
            mod.extract_pages_auto = saved

        assert len(rows) == 1
        row = rows[0]
        # The leftmost date is the canonical one.
        assert row["date"] == "01/13"
        # The posting date must not leak into the description.
        assert "01/14" not in row["description"]
        # Only the words between the dates and the amount remain.
        assert row["description"] == "Coffee Shop"
class TestZeroAmountRowsAreDropped:
    """A row whose transaction amount is exactly 0 is statement
    noise ("INTEREST EARNED 0.00", "PAGE TOTAL 0.00") and must be
    filtered out; non-zero amounts, including negatives, stay."""

    @staticmethod
    def _scan_rows(word_specs, page_height):
        """Run ``scan_pdf_for_transactions`` over one stubbed page.

        ``word_specs`` holds ``(x0, top, x1, bottom, text)`` tuples;
        the page extractor is replaced for the duration of the call
        and restored afterwards. Returns the scanned rows.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def stub_pages(_b, *, allow_ocr=True):
            boxes = [
                WordBox(x0=x0, top=top, x1=x1, bottom=bottom, text=text)
                for x0, top, x1, bottom, text in word_specs
            ]
            page = Page(
                page_no=1, width=300, height=page_height, text="",
                words=boxes,
            )
            return [page], []

        saved = mod.extract_pages_auto
        mod.extract_pages_auto = stub_pages
        try:
            rows, _ = scan_pdf_for_transactions(b"")
        finally:
            mod.extract_pages_auto = saved
        return rows

    def test_zero_amount_row_dropped(self):
        rows = self._scan_rows(
            [
                # Real transaction
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Coffee"),
                (200, 0, 240, 10, "$4.50"),
                # Zero-amount noise row (should be dropped)
                (0, 20, 80, 30, "01/14/2026"),
                (100, 20, 180, 30, "INTEREST"),
                (200, 20, 240, 30, "0.00"),
            ],
            40,
        )
        assert len(rows) == 1
        kept = rows[0]
        assert kept["amount_1"] == 4.50
        assert "INTEREST" not in kept["description"]

    def test_negative_amount_kept(self):
        rows = self._scan_rows(
            [
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Withdraw"),
                (200, 0, 240, 10, "(40.00)"),
            ],
            20,
        )
        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00
class TestMultilineDescription:
def test_continuation_line_merges(self):
"""A line with no date and no amount, sitting between two