fix(pdf): drop zero-amount rows; strip every date from multi-date row descriptions
Two corrections from real-statement feedback:
**1. Drop rows where the transaction amount is exactly 0.**
Bank statements include date+amount-shaped noise like
"INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD
0.00 1,234.56" — all match the date+amount heuristic but
aren't transactions. New filter in
``scan_pdf_for_transactions``: drop rows whose ``amount_1``
parses to exactly 0. Non-zero balances in ``amount_2`` don't
rescue a zero ``amount_1`` — the leftmost amount is the canonical
transaction amount. Unparsed-but-non-empty amount strings are
kept (user verifies in the editor).
**2. Multi-date rows: first date wins for the column, every
date excluded from the description.** Chase / BofA / Wells
commonly show both a transaction date and a posting date per
row:
01/13 01/14 COFFEE SHOP $4.50
Before this fix, ``_find_dates_in_words`` returned the first
date only, and the second date leaked into the description as
"01/14 COFFEE SHOP". Now it returns ALL dates, in position
order, with their word ranges; the scanner uses ``dates[0]`` as
the canonical date and passes every range to the description
builder for exclusion.
The detector's two-pass strategy now also guards against
mixing full-year and short-date matches on the same row.
Previously, a header line like ``Page 1/2 of 3 ... Statement
Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``,
and ``1/2`` (being leftmost) would have won the date column.
Now: if any full-year date is found on the row, short patterns
are NOT also collected — full year anchors interpretation. A
row with no full-year date (Chase short-date case) still falls
back to short patterns and collects all of them.
New tests:
- ``TestFindDatesInWords.test_multiple_dates_returned_in_position_order``
— ``01/13`` + ``01/14`` both returned, in order, each with its own
word range
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description``
— end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped``
— "INTEREST EARNED 0.00" row dropped while real txn kept
- ``TestZeroAmountRowsAreDropped.test_negative_amount_kept`` — pin
that a parenthesized ``(40.00)`` parses to -40.00 and is not
treated as zero by the filter
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -151,14 +151,29 @@ class TestFindDatesInWords:
|
||||
def test_short_pattern_does_not_shadow_full_year(self):
|
||||
"""If a full-year date is present, short patterns shouldn't
|
||||
steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
|
||||
return the real ``01/13/2026``, not the ``1/2`` page marker."""
|
||||
return the real ``01/13/2026`` first."""
|
||||
row = [
|
||||
_w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
|
||||
_w("3", 100, 0),
|
||||
_w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
|
||||
]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][2] == "01/13/2026"
|
||||
# Full-year match wins position 0 in the returned list.
|
||||
assert result[0][2] == "01/13/2026"
|
||||
|
||||
def test_multiple_dates_returned_in_position_order(self):
|
||||
"""Chase-style transaction with both posting and txn dates."""
|
||||
row = [
|
||||
_w("01/13", 0, 0), _w("01/14", 50, 0),
|
||||
_w("Coffee", 100, 0), _w("$4.50", 200, 0),
|
||||
]
|
||||
result = _find_dates_in_words(row)
|
||||
assert len(result) == 2
|
||||
assert result[0][2] == "01/13"
|
||||
assert result[1][2] == "01/14"
|
||||
# First date claims word 0, second claims word 1
|
||||
assert result[0][:2] == (0, 1)
|
||||
assert result[1][:2] == (1, 2)
|
||||
|
||||
def test_no_date(self):
|
||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||
|
||||
@@ -144,6 +144,107 @@ class TestScanPdfForTransactions:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMultiDateRow:
|
||||
"""Some statements (Chase, BofA) show both a transaction date
|
||||
and a posting date per row. The scanner uses the first date
|
||||
in position order and excludes every date from the description."""
|
||||
|
||||
def test_first_date_wins_second_excluded_from_description(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
|
||||
WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
|
||||
WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=20, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
# First date used as the canonical
|
||||
assert rows[0]["date"] == "01/13"
|
||||
# Second date NOT in description
|
||||
assert "01/14" not in rows[0]["description"]
|
||||
# Description is the actual content between dates and amount
|
||||
assert rows[0]["description"] == "Coffee Shop"
|
||||
|
||||
|
||||
class TestZeroAmountRowsAreDropped:
|
||||
"""Rows where the transaction amount is exactly 0 are noise
|
||||
(statements love to print "INTEREST EARNED 0.00" or
|
||||
"PAGE TOTAL 0.00") and get filtered out."""
|
||||
|
||||
def test_zero_amount_row_dropped(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
# Real transaction
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
# Zero-amount noise row (should be dropped)
|
||||
WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
|
||||
WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
|
||||
WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=40, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["amount_1"] == 4.50
|
||||
assert "INTEREST" not in rows[0]["description"]
|
||||
|
||||
def test_negative_amount_kept(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=20, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
# -40 is not zero — keep it
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["amount_1"] == -40.00
|
||||
|
||||
|
||||
class TestMultilineDescription:
|
||||
def test_continuation_line_merges(self):
|
||||
"""A line with no date and no amount, sitting between two
|
||||
|
||||
Reference in New Issue
Block a user