Files
datatools-dev/tests/test_pdf_row_heuristic.py
Michael d80befd05a feat(pdf): row-heuristic extraction (mode dispatch, no coordinates)
A user reported that the column-visual approach is too brittle for
real bank statements: column x-positions saved against a sample page
don't survive layout drift between months (statement A has
columns at x=300, statement B has drifted to x=320), and a saved
template can only realistically work for one statement's
specific render. The fundamental fix is to stop depending on
coordinates at all.

**Row-heuristic mode** finds transaction rows by pattern: any
line with a date token + N amount tokens IS a transaction. Date
patterns (US slash / EU slash / ISO / "Jan 15, 2026" / etc.) and
amount patterns (currency, parens-negative, thousands grouping)
are matched against word text — no x-positions involved.

The full pipeline:

1. ``find_transaction_rows`` clusters words into rows and scans
   each line for date + amount tokens.
2. Multi-line descriptions still attach to the previous row via
   the no-date-no-amount continuation rule.
3. Amount shapes drive interpretation: ``single`` /
   ``txn_balance`` / ``debit_credit`` / ``debit_credit_balance``.
4. ``_infer_amount_column_centers`` clusters amount x-midpoints
   ACROSS ALL detected rows to find natural column groupings —
   so debit-vs-credit assignment for single-amount lines works
   without the user marking anything on screen.

``apply_template`` is now a dispatch over ``template["mode"]``:

- ``mode="row_heuristic"`` (default for new templates) — the new
  pipeline.
- ``mode="column_visual"`` — the existing pipeline, kept under
  ``_apply_template_column_visual`` for v1 templates and the
  Advanced fallback.

18 new tests cover: date detection (US slash, two-digit year,
ISO, month-name, missing); amount-token finding (currency,
parens, pure text, bare-year rejection); column-center inference
(clear two-column case, empty input); end-to-end on synthetic
Page objects with the ``single``, ``txn_balance``, and
``debit_credit_balance`` amount shapes; and the critical
layout-drift test that proves the same template works on pages
of different sizes / different absolute x-positions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 23:45:55 +00:00

281 lines
10 KiB
Python

"""Tests for the row-heuristic extraction pipeline.

This is now the primary extraction mode: it uses date + amount
pattern matching to find transaction lines, with no dependency
on x-position column boundaries, so it is robust to layout drift
across statements from the same bank.

The legacy column-visual pipeline keeps its own tests in
``test_pdf_extract.py``.
"""
from __future__ import annotations
import pandas as pd
from src.pdf_extract import (
Page,
WordBox,
apply_template,
apply_template_row_heuristic,
find_transaction_rows,
_find_amount_tokens,
_find_dates_in_words,
_infer_amount_column_centers,
)
def _w(text: str, x0: float, top: float) -> WordBox:
    """Build a synthetic ``WordBox`` for ``text`` anchored at ``(x0, top)``.

    The box is sized from the text alone: 8 units of width per
    character and a fixed height of 10 units, so tests only have to
    choose the left edge and the top of each word.
    """
    right_edge = x0 + 8 * len(text)
    bottom_edge = top + 10
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=bottom_edge, text=text)
class TestFindDatesInRow:
    """Checks on ``_find_dates_in_words`` for a single row of words."""

    def test_us_slash(self):
        """A ``MM/DD/YYYY`` token yields exactly one ``(0, text)`` match."""
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "01/15/2026")]

    def test_two_digit_year(self):
        """A ``MM/DD/YY`` token is matched and its text returned as-is."""
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        found = _find_dates_in_words(words)
        assert found
        assert found[0][1] == "01/15/26"

    def test_iso(self):
        """An ISO ``YYYY-MM-DD`` token yields exactly one ``(0, text)`` match."""
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "2026-01-15")]

    def test_month_name(self):
        """A month-name date split across three words is still found."""
        # "Jan 15, 2026" arrives as three separate word tokens that the
        # finder is expected to stitch back together.
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        found = _find_dates_in_words(words)
        assert found, "Multi-word month-day-year should match"
        matched_text = found[0][1]
        assert "Jan 15" in matched_text

    def test_no_date(self):
        """A row without any date-like token produces an empty list."""
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        found = _find_dates_in_words(words)
        assert found == []
class TestFindAmountTokens:
    """Checks on ``_find_amount_tokens`` for a single row of words."""

    def test_currency_format(self):
        """A ``$``-prefixed decimal is the only amount found in the row."""
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        matches = _find_amount_tokens(words)
        assert len(matches) == 1
        # The third field of a match is expected to hold the token text.
        assert matches[0][2] == "$4.50"

    def test_parens_negative(self):
        """A parenthesised decimal is recognised as an amount token."""
        words = [_w("(123.45)", 0, 0)]
        matches = _find_amount_tokens(words)
        assert matches
        assert matches[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        """Plain words with no digits produce no amount tokens."""
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        matches = _find_amount_tokens(words)
        assert matches == []

    def test_rejects_bare_year(self):
        """A bare four-digit year must not be reported as an amount."""
        # "2026" is all digits but carries no currency sign, decimal
        # point or other amount marker. Pin the conservative behavior:
        # no match at all for such a token.
        words = [_w("2026", 0, 0)]
        matches = _find_amount_tokens(words)
        assert matches == [], "Bare 4-digit year should not register as amount"
class TestInferAmountColumnCenters:
    """Checks on ``_infer_amount_column_centers`` over several rows."""

    def test_two_clear_columns(self):
        """Two well-separated amount columns give two ordered centers."""
        # Five rows (top = 0, 20, 40, 60, 80), each carrying one amount
        # starting at x=300 and another starting at x=450.
        rows = [
            [
                _w("01/15/2026", 20, top),
                _w("Item", 100, top),
                _w("$10.00", 300, top),
                _w("$1,000.00", 450, top),
            ]
            for top in range(0, 100, 20)
        ]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=2,
            max_amounts=2,
        )
        assert len(centers) == 2
        left_center = centers[0]
        right_center = centers[1]
        # "$10.00" spans 300..348, so its midpoint is 324.
        assert 310 < left_center < 340
        # "$1,000.00" spans 450..522, so its midpoint is 486.
        assert 460 < right_center < 490

    def test_no_transactions_returns_empty(self):
        """Rows with no amounts at all yield an empty list of centers."""
        rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=1,
            max_amounts=3,
        )
        assert centers == []
class TestRowHeuristicEndToEnd:
    """Run the full row-heuristic pipeline against hand-built ``Page``
    objects, so no real PDF is required."""

    @staticmethod
    def _template(min_amounts, max_amounts, shape, *, row_extra=None, amounts_extra=None):
        """Assemble a ``row_heuristic`` template dict for a test.

        ``row_extra`` and ``amounts_extra`` are optional dicts whose
        entries are appended to the ``row_detection`` and ``amounts``
        sections respectively. The date format is always ``%m/%d/%Y``.
        """
        row_detection = {
            "min_amounts_per_row": min_amounts,
            "max_amounts_per_row": max_amounts,
        }
        row_detection.update(row_extra or {})
        amounts = {"shape": shape}
        amounts.update(amounts_extra or {})
        return {
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": amounts,
            "date": {"format": "%m/%d/%Y"},
        }

    def _page_single_amount(self) -> Page:
        """Return a one-amount-per-row page with two dated lines, a
        continuation line, a header and a footer."""
        header = [_w("ACME BANK STATEMENT", 20, 0)]
        first_txn = [
            _w("01/15/2026", 20, 30),
            _w("Coffee", 100, 30),
            _w("Shop", 150, 30),
            _w("$4.50", 400, 30),
        ]
        second_txn_head = [_w("01/16/2026", 20, 50), _w("Refund", 100, 50)]
        # Continuation line (top=70). It sits in the word list before the
        # remaining words of the top=50 line; that order is kept as-is.
        continuation = [_w("from", 100, 70), _w("vendor", 140, 70)]
        second_txn_tail = [_w("Vendor", 140, 50), _w("$12.00", 400, 50)]
        # "Page 1" footer: not a transaction.
        footer = [_w("Page", 20, 90), _w("1", 60, 90)]
        words = [
            *header,
            *first_txn,
            *second_txn_head,
            *continuation,
            *second_txn_tail,
            *footer,
        ]
        return Page(page_no=1, width=600, height=120, text="", words=words)

    def test_extracts_two_rows_single_amount(self):
        """Both dated lines are extracted and the continuation is merged."""
        tpl = self._template(
            1,
            1,
            "single",
            row_extra={"merge_multiline_description": True},
            amounts_extra={"negative_in_parens": True},
        )
        pages = [self._page_single_amount()]
        df = apply_template_row_heuristic(pages, tpl)
        assert len(df) == 2
        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
        # The multi-line description must be folded into the second row.
        second_description = df.iloc[1]["description"]
        assert "from vendor" in second_description

    def test_dispatches_through_apply_template(self):
        """``apply_template`` handles a ``row_heuristic`` template and
        returns a two-row DataFrame."""
        tpl = self._template(1, 1, "single")
        pages = [self._page_single_amount()]
        df = apply_template(pages, tpl)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 2

    def test_txn_balance_shape(self):
        """Two amounts per row are read as transaction amount + balance."""
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 300, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        tpl = self._template(
            2,
            2,
            "txn_balance",
            amounts_extra={"negative_in_parens": True},
        )
        df = apply_template([page], tpl)
        assert len(df) == 2
        # (amount, balance) expected for each extracted row, in order.
        expected = [(-4.50, 1000.00), (12.00, 1012.00)]
        for idx, (amount, balance) in enumerate(expected):
            assert df.iloc[idx]["amount"] == amount
            assert df.iloc[idx]["balance"] == balance

    def test_debit_credit_balance_shape(self):
        """The amount's x-position decides debit versus credit."""
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("4.50", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 380, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        tpl = self._template(2, 3, "debit_credit_balance")
        df = apply_template([page], tpl)
        assert len(df) == 2
        # Row 0: amount at x=300 (debit column) -> debit, balance at 450.
        # Row 1: amount at x=380 (credit column) -> credit, balance at 450.
        expected = [(-4.50, "debit"), (12.00, "credit")]
        for idx, (amount, kind) in enumerate(expected):
            assert df.iloc[idx]["amount"] == amount
            assert df.iloc[idx]["type"] == kind

    def test_skip_rows_matching(self):
        """Rows matching a ``skip_rows_matching`` entry are dropped."""
        tpl = self._template(
            1,
            1,
            "single",
            row_extra={"skip_rows_matching": ["Refund"]},
        )
        page = self._page_single_amount()
        df = apply_template_row_heuristic([page], tpl)
        assert len(df) == 1
        assert df.iloc[0]["date"] == "2026-01-15"

    def test_layout_drift_doesnt_matter(self):
        """The whole point of row-heuristic: one template works on pages
        of different sizes and different column x-positions."""
        # Page A: amount starts at x=400 on a 600-wide page.
        words_a = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page_a = Page(page_no=1, width=600, height=80, text="", words=words_a)
        # Page B: everything shifted, amount at x=520 on a 720-wide page.
        words_b = [
            _w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
            _w("$4.50", 520, 0),
        ]
        page_b = Page(page_no=1, width=720, height=80, text="", words=words_b)
        tpl = self._template(1, 1, "single")
        df_a = apply_template([page_a], tpl)
        df_b = apply_template([page_b], tpl)
        # Both layouts must extract, showing no coordinate dependency.
        assert len(df_a) == 1
        assert len(df_b) == 1
        amount_a = df_a.iloc[0]["amount"]
        amount_b = df_b.iloc[0]["amount"]
        assert amount_a == amount_b
        assert amount_b == 4.50
class TestFindTransactionRows:
    """Covers the pre-DataFrame stage, which returns dict records that
    the build UI renders as a preview before the user commits."""

    def test_returns_records(self):
        """A single dated line produces one fully populated record."""
        words = [
            _w("01/15/2026", 20, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page = Page(page_no=1, width=600, height=80, text="", words=words)
        row_detection = {"min_amounts_per_row": 1, "max_amounts_per_row": 1}
        tpl = {
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        records = find_transaction_rows([page], tpl)
        assert len(records) == 1
        record = records[0]
        expected_fields = {
            "date": "2026-01-15",
            "description": "Coffee",
            "amount": 4.50,
            "_page": 1,
        }
        for key, value in expected_fields.items():
            assert record[key] == value
        # The raw line is kept so the GUI can show "what we saw".
        assert "_raw_line" in record