fix(pdf): drop zero-amount rows; exclude every date from multi-date row descriptions
Two corrections from real-statement feedback:
**1. Drop rows where the transaction amount is exactly 0.**
Bank statements include date+amount-shaped noise like
"INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD
0.00 1,234.56" — all match the date+amount heuristic but
aren't transactions. New filter in
``scan_pdf_for_transactions``: drop rows whose ``amount_1``
parses to exactly 0. Non-zero balances in ``amount_2`` don't
rescue a zero amount_1 — leftmost amount is the canonical
transaction amount. Unparsed-but-non-empty amount strings are
kept (user verifies in the editor).
**2. Multi-date rows: first date wins for the column, every
date excluded from the description.** Chase / BofA / Wells
commonly show both a transaction date and a posting date per
row:
01/13 01/14 COFFEE SHOP $4.50
Before this fix, ``_find_dates_in_words`` returned the first
date only and the second date leaked into description as
"01/14 COFFEE SHOP". Now it returns ALL dates with their word
ranges; the scanner uses ``dates[0]`` as the canonical date
and passes every range to the description builder for
exclusion.
The detector keeps its two-pass strategy now that it returns
every match, so full-year and short-date matches are never mixed
on the same row. If both kinds were collected, a header line like
``Page 1/2 of 3 ... Statement Date 01/13/2026`` would return both
``1/2`` and ``01/13/2026``, and ``1/2`` (being leftmost) would win
the date column. So: if any full-year date is found on the row,
short patterns are NOT also collected — the full-year date anchors
interpretation. A row with no full-year date (Chase short-date
case) still falls back to short patterns and collects all of them.
New tests:
- ``TestFindDatesInWords.test_multiple_dates_returned_in_position_order`` —
``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description``
— end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped``
— "INTEREST EARNED 0.00" row dropped while real txn kept
- ``TestZeroAmountRowsAreDropped.test_negative_amount_kept`` —
pin that a parenthesized ``(40.00)``, parsed as -40.00, is not
treated as zero by the filter
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -441,45 +441,61 @@ def extract_pages_auto(
|
|||||||
def _find_dates_in_words(
|
def _find_dates_in_words(
|
||||||
row_words: list[WordBox],
|
row_words: list[WordBox],
|
||||||
) -> list[tuple[int, int, str]]:
|
) -> list[tuple[int, int, str]]:
|
||||||
"""Return ``[(start_idx, end_idx, date_text)]`` for the first
|
"""Return every date-like substring on this row, sorted by
|
||||||
date-like substring on this row, or ``[]`` if none.
|
position. Each entry is ``(start_idx, end_idx_exclusive, text)``.
|
||||||
|
|
||||||
Two-pass search:
|
Two-pass search:
|
||||||
|
|
||||||
- **Pass 1** — full-year patterns (``01/15/2026``,
|
- **Pass 1** — full-year patterns (``01/15/2026``,
|
||||||
``Jan 13, 2026``). Tries the longest window first within
|
``Jan 13, 2026``). Longest window first so multi-word dates
|
||||||
this pass so a multi-word ``Jan 15, 2026`` isn't truncated
|
aren't truncated to a partial short match.
|
||||||
to ``Jan 15``.
|
|
||||||
- **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
|
- **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
|
||||||
runs if pass 1 found nothing — otherwise a stray
|
runs when pass 1 found nothing, so a real ``01/13/2026``
|
||||||
``Page 1/2`` on the same line could shadow the real dated
|
always wins over a stray ``Page 1/2`` on the same row —
|
||||||
transaction.
|
full-year and short matches are never mixed.
|
||||||
|
|
||||||
``end_idx`` is exclusive — caller uses ``range(start, end)``
|
Some statements show both a transaction date and a posting
|
||||||
to exclude all words the date consumed from the description
|
date per row (Chase, BofA, …). The scanner uses the first
|
||||||
(the previous single-index return mis-attributed the day
|
match as the canonical date for the CSV column, and excludes
|
||||||
token of multi-word dates like ``Jan 13`` to the description).
|
EVERY date from the description so the second / third dates
|
||||||
|
don't leak into the description text.
|
||||||
"""
|
"""
|
||||||
for patterns, window_order in (
|
def _scan(patterns, window_order):
|
||||||
(_DATE_RES_FULL, (3, 2, 1)),
|
local_found: list[tuple[int, int, str]] = []
|
||||||
(_DATE_RES_SHORT, (2, 1)),
|
local_claimed: set[int] = set()
|
||||||
):
|
|
||||||
for i in range(len(row_words)):
|
for i in range(len(row_words)):
|
||||||
|
if i in local_claimed:
|
||||||
|
continue
|
||||||
|
matched = False
|
||||||
for window in window_order:
|
for window in window_order:
|
||||||
end = i + window
|
end = i + window
|
||||||
if end > len(row_words):
|
if end > len(row_words):
|
||||||
continue
|
continue
|
||||||
|
if any(j in local_claimed for j in range(i, end)):
|
||||||
|
continue
|
||||||
chunk = " ".join(x.text for x in row_words[i:end])
|
chunk = " ".join(x.text for x in row_words[i:end])
|
||||||
for rx in patterns:
|
for rx in patterns:
|
||||||
m = rx.search(chunk)
|
m = rx.search(chunk)
|
||||||
if m:
|
if m:
|
||||||
# Count whitespace-separated tokens in the
|
|
||||||
# MATCH, not in the window — the window may
|
|
||||||
# have included extra trailing words the
|
|
||||||
# regex didn't actually consume.
|
|
||||||
consumed = max(1, len(m.group(1).split()))
|
consumed = max(1, len(m.group(1).split()))
|
||||||
return [(i, i + consumed, m.group(1))]
|
actual_end = i + consumed
|
||||||
return []
|
local_found.append((i, actual_end, m.group(1)))
|
||||||
|
local_claimed.update(range(i, actual_end))
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
if matched:
|
||||||
|
break
|
||||||
|
return local_found
|
||||||
|
|
||||||
|
full = _scan(_DATE_RES_FULL, (3, 2, 1))
|
||||||
|
if full:
|
||||||
|
# A real full-year date on the row anchors interpretation.
|
||||||
|
# Don't ALSO collect short patterns — they're almost always
|
||||||
|
# page numbers ("Page 1/2") or fractions in memos when a
|
||||||
|
# real date is present.
|
||||||
|
return sorted(full, key=lambda t: t[0])
|
||||||
|
short = _scan(_DATE_RES_SHORT, (2, 1))
|
||||||
|
return sorted(short, key=lambda t: t[0])
|
||||||
|
|
||||||
|
|
||||||
def _find_amount_tokens(
|
def _find_amount_tokens(
|
||||||
@@ -506,23 +522,30 @@ def _find_amount_tokens(
|
|||||||
|
|
||||||
def _description_from_row(
|
def _description_from_row(
|
||||||
row_words: list[WordBox],
|
row_words: list[WordBox],
|
||||||
date_range: tuple[int, int],
|
date_ranges: list[tuple[int, int]],
|
||||||
amount_idxs: set[int],
|
amount_idxs: set[int],
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Stitch the description from the row's non-date, non-amount
|
"""Stitch the description from the row's non-date, non-amount
|
||||||
tokens. ``date_range`` is ``(start, end)`` exclusive — every
|
tokens. ``date_ranges`` is a list of ``(start, end)`` (end
|
||||||
word in that range is excluded so multi-word dates like
|
exclusive) — every word in any range is excluded.
|
||||||
``Jan 13`` don't leak the day token into the description.
|
|
||||||
|
Why a list: some bank statements show two dates per row
|
||||||
|
(transaction + posting). Without excluding all of them, the
|
||||||
|
extra date(s) leak into the description and look like trash.
|
||||||
|
|
||||||
Keeps tokens before the first amount and after the last
|
Keeps tokens before the first amount and after the last
|
||||||
amount (trailing check numbers, memos); drops words between
|
amount (trailing check numbers, memos); drops words between
|
||||||
amount tokens (usually whitespace artifacts in column gaps)."""
|
amount tokens (usually whitespace artifacts in column gaps).
|
||||||
date_start, date_end = date_range
|
"""
|
||||||
|
excluded: set[int] = set()
|
||||||
|
for start, end in date_ranges:
|
||||||
|
excluded.update(range(start, end))
|
||||||
|
|
||||||
keep: list[str] = []
|
keep: list[str] = []
|
||||||
seen_first_amount = False
|
seen_first_amount = False
|
||||||
last_amount_idx = max(amount_idxs) if amount_idxs else -1
|
last_amount_idx = max(amount_idxs) if amount_idxs else -1
|
||||||
for i, w in enumerate(row_words):
|
for i, w in enumerate(row_words):
|
||||||
if date_start <= i < date_end:
|
if i in excluded:
|
||||||
continue
|
continue
|
||||||
if i in amount_idxs:
|
if i in amount_idxs:
|
||||||
seen_first_amount = True
|
seen_first_amount = True
|
||||||
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
date_start, date_end, date_text = dates[0]
|
# First date wins for the "date" column; ALL dates are
|
||||||
|
# excluded from the description so a row carrying both
|
||||||
|
# a transaction date and a posting date doesn't leak
|
||||||
|
# the second one into description text.
|
||||||
|
_, _, first_date_text = dates[0]
|
||||||
|
date_ranges = [(s, e) for s, e, _ in dates]
|
||||||
amount_idxs = {idx for idx, _, _ in amount_tokens}
|
amount_idxs = {idx for idx, _, _ in amount_tokens}
|
||||||
desc = _description_from_row(
|
desc = _description_from_row(
|
||||||
row_words, (date_start, date_end), amount_idxs,
|
row_words, date_ranges, amount_idxs,
|
||||||
)
|
)
|
||||||
|
|
||||||
record: dict[str, Any] = {
|
record: dict[str, Any] = {
|
||||||
"date": parse_date(date_text, date_formats) or date_text,
|
"date": (
|
||||||
|
parse_date(first_date_text, date_formats)
|
||||||
|
or first_date_text
|
||||||
|
),
|
||||||
"description": desc,
|
"description": desc,
|
||||||
"page": page.page_no,
|
"page": page.page_no,
|
||||||
"raw": line,
|
"raw": line,
|
||||||
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
|
|||||||
record[f"amount_{k}"] = (
|
record[f"amount_{k}"] = (
|
||||||
parsed if parsed is not None else txt
|
parsed if parsed is not None else txt
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Drop rows where the transaction amount is exactly 0.
|
||||||
|
# Bank statements include noise like "INTEREST EARNED
|
||||||
|
# 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
|
||||||
|
# heuristic but aren't real transactions. We key off
|
||||||
|
# ``amount_1`` (leftmost amount = usually the txn
|
||||||
|
# amount); a non-zero balance in ``amount_2`` doesn't
|
||||||
|
# rescue a zero ``amount_1``.
|
||||||
|
if not _has_real_transaction_amount(record):
|
||||||
|
continue
|
||||||
|
|
||||||
out_rows.append(record)
|
out_rows.append(record)
|
||||||
prev = record
|
prev = record
|
||||||
|
|
||||||
return out_rows, warnings
|
return out_rows, warnings
|
||||||
|
|
||||||
|
|
||||||
|
def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
|
||||||
|
"""``amount_1`` is the row's primary amount. Drop rows whose
|
||||||
|
amount_1 parsed to exactly 0; keep everything else (positive,
|
||||||
|
negative, or unparsed-but-non-empty)."""
|
||||||
|
amount_1 = record.get("amount_1")
|
||||||
|
if amount_1 is None:
|
||||||
|
return False
|
||||||
|
if isinstance(amount_1, (int, float)):
|
||||||
|
return amount_1 != 0
|
||||||
|
# Unparsed string — keep so the user can verify in the editor.
|
||||||
|
return bool(str(amount_1).strip())
|
||||||
|
|
||||||
|
|
||||||
def diagnose_pdf_lines(
|
def diagnose_pdf_lines(
|
||||||
pdf_bytes: bytes,
|
pdf_bytes: bytes,
|
||||||
*,
|
*,
|
||||||
|
|||||||
@@ -151,14 +151,29 @@ class TestFindDatesInWords:
|
|||||||
def test_short_pattern_does_not_shadow_full_year(self):
|
def test_short_pattern_does_not_shadow_full_year(self):
|
||||||
"""If a full-year date is present, short patterns shouldn't
|
"""If a full-year date is present, short patterns shouldn't
|
||||||
steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
|
steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
|
||||||
return the real ``01/13/2026``, not the ``1/2`` page marker."""
|
return the real ``01/13/2026`` first."""
|
||||||
row = [
|
row = [
|
||||||
_w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
|
_w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
|
||||||
_w("3", 100, 0),
|
_w("3", 100, 0),
|
||||||
_w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
|
_w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
|
||||||
]
|
]
|
||||||
result = _find_dates_in_words(row)
|
result = _find_dates_in_words(row)
|
||||||
assert result and result[0][2] == "01/13/2026"
|
# Full-year match wins position 0 in the returned list.
|
||||||
|
assert result[0][2] == "01/13/2026"
|
||||||
|
|
||||||
|
def test_multiple_dates_returned_in_position_order(self):
|
||||||
|
"""Chase-style transaction with both posting and txn dates."""
|
||||||
|
row = [
|
||||||
|
_w("01/13", 0, 0), _w("01/14", 50, 0),
|
||||||
|
_w("Coffee", 100, 0), _w("$4.50", 200, 0),
|
||||||
|
]
|
||||||
|
result = _find_dates_in_words(row)
|
||||||
|
assert len(result) == 2
|
||||||
|
assert result[0][2] == "01/13"
|
||||||
|
assert result[1][2] == "01/14"
|
||||||
|
# First date claims word 0, second claims word 1
|
||||||
|
assert result[0][:2] == (0, 1)
|
||||||
|
assert result[1][:2] == (1, 2)
|
||||||
|
|
||||||
def test_no_date(self):
|
def test_no_date(self):
|
||||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||||
|
|||||||
@@ -144,6 +144,107 @@ class TestScanPdfForTransactions:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestMultiDateRow:
|
||||||
|
"""Some statements (Chase, BofA) show both a transaction date
|
||||||
|
and a posting date per row. The scanner uses the first date
|
||||||
|
in position order and excludes every date from the description."""
|
||||||
|
|
||||||
|
def test_first_date_wins_second_excluded_from_description(self):
|
||||||
|
from src import pdf_extract as mod
|
||||||
|
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||||
|
|
||||||
|
original = mod.extract_pages_auto
|
||||||
|
|
||||||
|
def fake(_b, *, allow_ocr=True):
|
||||||
|
words = [
|
||||||
|
WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
|
||||||
|
WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
|
||||||
|
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||||
|
WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
|
||||||
|
WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
|
||||||
|
]
|
||||||
|
return [Page(
|
||||||
|
page_no=1, width=300, height=20, text="", words=words,
|
||||||
|
)], []
|
||||||
|
|
||||||
|
mod.extract_pages_auto = fake
|
||||||
|
try:
|
||||||
|
rows, _ = scan_pdf_for_transactions(b"")
|
||||||
|
finally:
|
||||||
|
mod.extract_pages_auto = original
|
||||||
|
|
||||||
|
assert len(rows) == 1
|
||||||
|
# First date used as the canonical
|
||||||
|
assert rows[0]["date"] == "01/13"
|
||||||
|
# Second date NOT in description
|
||||||
|
assert "01/14" not in rows[0]["description"]
|
||||||
|
# Description is the actual content between dates and amount
|
||||||
|
assert rows[0]["description"] == "Coffee Shop"
|
||||||
|
|
||||||
|
|
||||||
|
class TestZeroAmountRowsAreDropped:
|
||||||
|
"""Rows where the transaction amount is exactly 0 are noise
|
||||||
|
(statements love to print "INTEREST EARNED 0.00" or
|
||||||
|
"PAGE TOTAL 0.00") and get filtered out."""
|
||||||
|
|
||||||
|
def test_zero_amount_row_dropped(self):
|
||||||
|
from src import pdf_extract as mod
|
||||||
|
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||||
|
|
||||||
|
original = mod.extract_pages_auto
|
||||||
|
|
||||||
|
def fake(_b, *, allow_ocr=True):
|
||||||
|
words = [
|
||||||
|
# Real transaction
|
||||||
|
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||||
|
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||||
|
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||||
|
# Zero-amount noise row (should be dropped)
|
||||||
|
WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
|
||||||
|
WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
|
||||||
|
WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
|
||||||
|
]
|
||||||
|
return [Page(
|
||||||
|
page_no=1, width=300, height=40, text="", words=words,
|
||||||
|
)], []
|
||||||
|
|
||||||
|
mod.extract_pages_auto = fake
|
||||||
|
try:
|
||||||
|
rows, _ = scan_pdf_for_transactions(b"")
|
||||||
|
finally:
|
||||||
|
mod.extract_pages_auto = original
|
||||||
|
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["amount_1"] == 4.50
|
||||||
|
assert "INTEREST" not in rows[0]["description"]
|
||||||
|
|
||||||
|
def test_negative_amount_kept(self):
|
||||||
|
from src import pdf_extract as mod
|
||||||
|
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||||
|
|
||||||
|
original = mod.extract_pages_auto
|
||||||
|
|
||||||
|
def fake(_b, *, allow_ocr=True):
|
||||||
|
words = [
|
||||||
|
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||||
|
WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
|
||||||
|
WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
|
||||||
|
]
|
||||||
|
return [Page(
|
||||||
|
page_no=1, width=300, height=20, text="", words=words,
|
||||||
|
)], []
|
||||||
|
|
||||||
|
mod.extract_pages_auto = fake
|
||||||
|
try:
|
||||||
|
rows, _ = scan_pdf_for_transactions(b"")
|
||||||
|
finally:
|
||||||
|
mod.extract_pages_auto = original
|
||||||
|
|
||||||
|
# -40 is not zero — keep it
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0]["amount_1"] == -40.00
|
||||||
|
|
||||||
|
|
||||||
class TestMultilineDescription:
|
class TestMultilineDescription:
|
||||||
def test_continuation_line_merges(self):
|
def test_continuation_line_merges(self):
|
||||||
"""A line with no date and no amount, sitting between two
|
"""A line with no date and no amount, sitting between two
|
||||||
|
|||||||
Reference in New Issue
Block a user