Two corrections from real-statement feedback:
**1. Drop rows where the transaction amount is exactly 0.**
Bank statements include date+amount-shaped noise like
"INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD
0.00 1,234.56" — all match the date+amount heuristic but
aren't transactions. New filter in
``scan_pdf_for_transactions``: drop rows whose ``amount_1``
parses to exactly 0. Non-zero balances in ``amount_2`` don't
rescue a zero amount_1 — leftmost amount is the canonical
transaction amount. Unparsed-but-non-empty amount strings are
kept (user verifies in the editor).
**2. Multi-date rows: first date wins for the column, every
date excluded from the description.** Chase / BofA / Wells
commonly show both a transaction date and a posting date per
row:
01/13 01/14 COFFEE SHOP $4.50
Before this fix, ``_find_dates_in_words`` returned the first
date only and the second date leaked into description as
"01/14 COFFEE SHOP". Now it returns ALL dates with their word
ranges; the scanner uses ``dates[0]`` as the canonical date
and passes every range to the description builder for
exclusion.
The detector's two-pass strategy now also guards against
mixing full-year and short-date matches on the same row.
Previously, a header line like ``Page 1/2 of 3 ... Statement
Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``,
and ``1/2`` (being leftmost) would have won the date column.
Now: if any full-year date is found on the row, short patterns
are NOT also collected — full year anchors interpretation. A
row with no full-year date (Chase short-date case) still falls
back to short patterns and collects all of them.
New tests:
- ``test_multiple_dates_returned_in_position_order`` —
``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description``
— end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped``
— "INTEREST EARNED 0.00" row dropped while real txn kept
- ``test_negative_amount_kept`` — pin that -40.00 is not
treated as zero by the filter
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
210 lines
7.0 KiB
Python
210 lines
7.0 KiB
Python
"""Tests for the minimal PDF transaction scanner.
|
|
|
|
The public API is one function: ``scan_pdf_for_transactions``.
|
|
These tests cover the value-parsing helpers, the row clusterer,
and the date/amount token finders, using synthetic ``WordBox``
rows with no real PDF involved. The end-to-end scanner itself is
not exercised in this module.
|
|
|
|
End-to-end-on-a-real-PDF coverage lives in
|
|
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
|
|
a fixture statement at test time.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from src.pdf_extract import (
|
|
Page,
|
|
WordBox,
|
|
_find_amount_tokens,
|
|
_find_dates_in_words,
|
|
cluster_rows,
|
|
parse_amount,
|
|
parse_date,
|
|
)
|
|
|
|
|
|
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for a single word.

    When ``x1`` is omitted the right edge is derived as
    ``x0 + 8 * len(text)``; ``bottom`` is always ``top + 10``.
    """
    if x1 is None:
        # Fake a fixed-width glyph run so boxes don't need hand-tuned edges.
        x1 = x0 + 8 * len(text)
    return WordBox(x0=x0, top=top, x1=x1, bottom=top + 10, text=text)
|
|
|
|
|
|
class TestParseAmount:
    """Pin ``parse_amount`` for sign conventions, separators and bad input."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty, whitespace-only and ``None`` inputs are all expected
        # to come back as ``None``.
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        # Comma decimal, dot thousands, euro sign stripped.
        euro_options = {
            "decimal": ",",
            "thousands": ".",
            "currency_strip": "€",
        }
        assert parse_amount("€1.234,56", **euro_options) == 1234.56

    def test_parens_off_disables_paren_negative(self):
        # With paren handling switched off, "(4.50)" must not be read
        # as a negative number. It is not a plain number either, so the
        # expected outcome is ``None`` rather than a sign-flipped value.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None
|
|
|
|
|
|
class TestParseDate:
    """Pin ``parse_date``: ISO ``YYYY-MM-DD`` strings out, ``None`` on failure."""

    def test_us_slash(self):
        formats = ["%m/%d/%Y"]
        assert parse_date("01/15/2026", formats) == "2026-01-15"

    def test_iso(self):
        formats = ["%Y-%m-%d"]
        assert parse_date("2026-01-15", formats) == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied here; the value is still expected
        # to parse through the fallback path.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        iso = parse_date("not-a-date")
        assert iso is None
|
|
|
|
|
|
class TestClusterRows:
    """Pin ``cluster_rows``: grouping by vertical position, ordering by x."""

    def test_groups_close_y(self):
        # Tops of 100, 101 and 102 are expected to land in the same row.
        nearby = [_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102)]
        clustered = cluster_rows(nearby)
        assert len(clustered) == 1
        assert [box.text for box in clustered[0]] == ["A", "B", "C"]

    def test_separates_far_y(self):
        # Tops of 100 and 120 are expected to form two separate rows.
        stacked = [_w("A", 0, 100), _w("B", 0, 120)]
        texts_by_row = [
            [box.text for box in line] for line in cluster_rows(stacked)
        ]
        assert texts_by_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        shuffled = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(shuffled)[0]
        assert [box.text for box in first_row] == ["A", "B", "C"]

    def test_empty(self):
        assert cluster_rows([]) == []
|
|
|
|
|
|
class TestFindDatesInWords:
    """Pin ``_find_dates_in_words``.

    The helper returns ``[(start, end, text)]`` — ``end`` is the
    exclusive index of the words the date consumed.
    """

    def test_us_slash(self):
        row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        assert _find_dates_in_words(row) == [(0, 1, "01/15/2026")]

    def test_two_digit_year(self):
        row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        result = _find_dates_in_words(row)
        assert result and result[0][2] == "01/15/26"

    def test_iso(self):
        row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        assert _find_dates_in_words(row) == [(0, 1, "2026-01-15")]

    def test_month_name_with_year_consumes_three_words(self):
        row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        result = _find_dates_in_words(row)
        assert result and "Jan 15" in result[0][2]
        # Date consumes all 3 words so they don't leak to description.
        assert result[0][1] == 3

    def test_short_us_date_no_year(self):
        """Chase-style ``01/13`` without a year still detects."""
        row = [_w("01/13", 0, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0)]
        result = _find_dates_in_words(row)
        assert result and result[0][2] == "01/13"
        assert result[0][1] == 1  # one word consumed

    def test_short_month_name_no_year_consumes_two_words(self):
        row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
        result = _find_dates_in_words(row)
        assert result
        assert "Jan 13" in result[0][2]
        assert result[0][1] == 2  # "Jan" + "13" both consumed

    def test_short_pattern_does_not_shadow_full_year(self):
        """If a full-year date is present, short patterns shouldn't
        steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
        return the real ``01/13/2026`` and not collect ``1/2``."""
        row = [
            _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
        ]
        result = _find_dates_in_words(row)
        assert result
        # Full-year match wins position 0 in the returned list.
        assert result[0][2] == "01/13/2026"
        # A full-year date anchors the row: the short-looking ``1/2``
        # must not be collected at all, not merely sorted after it.
        assert "1/2" not in [match[2] for match in result]

    def test_multiple_dates_returned_in_position_order(self):
        """Chase-style transaction with both posting and txn dates."""
        row = [
            _w("01/13", 0, 0), _w("01/14", 50, 0),
            _w("Coffee", 100, 0), _w("$4.50", 200, 0),
        ]
        result = _find_dates_in_words(row)
        assert len(result) == 2
        assert result[0][2] == "01/13"
        assert result[1][2] == "01/14"
        # First date claims word 0, second claims word 1
        assert result[0][:2] == (0, 1)
        assert result[1][:2] == (1, 2)

    def test_no_date(self):
        row = [_w("Just", 0, 0), _w("text", 50, 0)]
        assert _find_dates_in_words(row) == []
|
|
|
|
|
|
class TestFindAmountTokens:
    """Pin ``_find_amount_tokens``; index 2 of each hit is the token text."""

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(words)
        assert len(tokens) == 1
        assert tokens[0][2] == "$4.50"

    def test_parens_negative(self):
        tokens = _find_amount_tokens([_w("(123.45)", 0, 0)])
        assert tokens and tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        assert _find_amount_tokens(words) == []

    def test_rejects_bare_year(self):
        # A lone 4-digit year looks numeric but carries no money
        # marker, so it is expected to be filtered out.
        words = [_w("2026", 0, 0)]
        assert _find_amount_tokens(words) == []
|
|
|
|
|
|
# End-to-end tests against synthetic Page objects are in the smoke
|
|
# test module — they need ``scan_pdf_for_transactions`` which in
|
|
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
|
# the building blocks; smoke tests pin the wiring.
|