"""Tests for the row-heuristic extraction pipeline.

This is now the primary extraction mode — uses date + amount pattern
matching to find transaction lines, with no dependency on x-position
column boundaries. Robust to layout drift across statements from the
same bank.

The legacy column-visual pipeline keeps its own tests in
``test_pdf_extract.py``.
"""
from __future__ import annotations

import pandas as pd

from src.pdf_extract import (
    Page,
    WordBox,
    apply_template,
    apply_template_row_heuristic,
    find_transaction_rows,
    _find_amount_tokens,
    _find_dates_in_words,
    _infer_amount_column_centers,
)


def _w(text: str, x0: float, top: float) -> WordBox:
    """Build a synthetic ``WordBox`` sized from its text.

    The box is 8 units wide per character and 10 units tall, anchored at
    (``x0``, ``top``).
    """
    width = 8 * len(text)
    height = 10
    return WordBox(
        x0=x0,
        top=top,
        x1=x0 + width,
        bottom=top + height,
        text=text,
    )


def _page(width: float, height: float, word_specs) -> Page:
    """Build a single synthetic ``Page`` (page 1, empty text layer).

    ``word_specs`` is an ordered iterable of ``(text, x0, top)`` tuples;
    each becomes one ``WordBox`` via ``_w``, in the order given.
    """
    boxes = [_w(*spec) for spec in word_specs]
    return Page(page_no=1, width=width, height=height, text="", words=boxes)


def _row_heuristic_template(
    shape: str,
    min_amounts: int,
    max_amounts: int,
    *,
    negative_in_parens: bool | None = None,
    **row_detection_extras,
) -> dict:
    """Build a fresh ``row_heuristic`` template dict for one test.

    ``negative_in_parens`` is only written into the ``amounts`` section
    when explicitly provided. Any extra keyword arguments are appended to
    the ``row_detection`` section after the min/max amount bounds.
    """
    row_detection = {
        "min_amounts_per_row": min_amounts,
        "max_amounts_per_row": max_amounts,
        **row_detection_extras,
    }
    amounts = {"shape": shape}
    if negative_in_parens is not None:
        amounts["negative_in_parens"] = negative_in_parens
    return {
        "mode": "row_heuristic",
        "row_detection": row_detection,
        "amounts": amounts,
        "date": {"format": "%m/%d/%Y"},
    }


class TestFindDatesInRow:
    """Checks for ``_find_dates_in_words`` on a single row of word boxes."""

    def test_us_slash(self):
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "01/15/2026")]

    def test_two_digit_year(self):
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        found = _find_dates_in_words(words)
        assert found
        assert found[0][1] == "01/15/26"

    def test_iso(self):
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "2026-01-15")]

    def test_month_name(self):
        # "Jan 15, 2026" arrives as three separate word tokens that
        # should be stitched back into one date.
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        found = _find_dates_in_words(words)
        assert found, "Multi-word month-day-year should match"
        assert "Jan 15" in found[0][1]

    def test_no_date(self):
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        found = _find_dates_in_words(words)
        assert found == []


class TestFindAmountTokens:
    """Checks for ``_find_amount_tokens`` on a single row of word boxes."""

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(words)
        assert len(tokens) == 1
        assert tokens[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens
        assert tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == []

    def test_rejects_bare_year(self):
        # "2026" matches the digit pattern but lacks $/decimal/etc., so
        # the looks-like-amount filter should drop it. Accepting a bare
        # integer would also be defensible (it would be missed-amount
        # territory rather than a false positive), but this test pins the
        # conservative behavior: NO match.
        words = [_w("2026", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == [], "Bare 4-digit year should not register as amount"


class TestInferAmountColumnCenters:
    """Checks for ``_infer_amount_column_centers`` over several rows."""

    def test_two_clear_columns(self):
        # 5 rows, each with two amounts at roughly x=300 and x=450.
        rows = [
            [
                _w("01/15/2026", 20, top),
                _w("Item", 100, top),
                _w("$10.00", 300, top),
                _w("$1,000.00", 450, top),
            ]
            for top in range(0, 100, 20)
        ]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=2,
            max_amounts=2,
        )
        assert len(centers) == 2
        # Left center ≈ 300 + 8*len("$10.00")/2 = 300+24 = 324
        left_center = centers[0]
        right_center = centers[1]
        assert 310 < left_center < 340
        assert 460 < right_center < 490

    def test_no_transactions_returns_empty(self):
        rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=1,
            max_amounts=3,
        )
        assert centers == []


class TestRowHeuristicEndToEnd:
    """Drive the full row-heuristic pipeline with synthetic ``Page``
    objects, so no real PDF is needed."""

    def _page_single_amount(self) -> Page:
        """Return a page with a header, two transactions (the second with
        a continuation line) and a footer, one amount per transaction."""
        word_specs = [
            ("ACME BANK STATEMENT", 20, 0),
            ("01/15/2026", 20, 30),
            ("Coffee", 100, 30),
            ("Shop", 150, 30),
            ("$4.50", 400, 30),
            ("01/16/2026", 20, 50),
            ("Refund", 100, 50),
            ("from", 100, 70),  # continuation
            ("vendor", 140, 70),  # continuation
            ("Vendor", 140, 50),
            ("$12.00", 400, 50),
            ("Page", 20, 90),  # not a txn
            ("1", 60, 90),  # not a txn
        ]
        return _page(600, 120, word_specs)

    def test_extracts_two_rows_single_amount(self):
        template = _row_heuristic_template(
            "single",
            1,
            1,
            negative_in_parens=True,
            merge_multiline_description=True,
        )
        pages = [self._page_single_amount()]
        frame = apply_template_row_heuristic(pages, template)
        assert len(frame) == 2
        assert list(frame["date"]) == ["2026-01-15", "2026-01-16"]
        # Multi-line description merged
        second_description = frame.iloc[1]["description"]
        assert "from vendor" in second_description

    def test_dispatches_through_apply_template(self):
        template = _row_heuristic_template("single", 1, 1)
        pages = [self._page_single_amount()]
        frame = apply_template(pages, template)
        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 2

    def test_txn_balance_shape(self):
        page = _page(
            600,
            100,
            [
                ("01/15/2026", 20, 0),
                ("Coffee", 100, 0),
                ("(4.50)", 300, 0),
                ("1,000.00", 450, 0),
                ("01/16/2026", 20, 20),
                ("Refund", 100, 20),
                ("12.00", 300, 20),
                ("1,012.00", 450, 20),
            ],
        )
        template = _row_heuristic_template(
            "txn_balance",
            2,
            2,
            negative_in_parens=True,
        )
        frame = apply_template([page], template)
        assert len(frame) == 2
        first, second = frame.iloc[0], frame.iloc[1]
        assert first["amount"] == -4.50
        assert first["balance"] == 1000.00
        assert second["amount"] == 12.00
        assert second["balance"] == 1012.00

    def test_debit_credit_balance_shape(self):
        page = _page(
            600,
            100,
            [
                ("01/15/2026", 20, 0),
                ("Coffee", 100, 0),
                ("4.50", 300, 0),
                ("1,000.00", 450, 0),
                ("01/16/2026", 20, 20),
                ("Refund", 100, 20),
                ("12.00", 380, 20),
                ("1,012.00", 450, 20),
            ],
        )
        template = _row_heuristic_template("debit_credit_balance", 2, 3)
        frame = apply_template([page], template)
        assert len(frame) == 2
        # Row 0: amount at x=300 (debit column) → debit, balance at 450
        debit_row = frame.iloc[0]
        assert debit_row["amount"] == -4.50
        assert debit_row["type"] == "debit"
        # Row 1: amount at x=380 (credit column) → credit, balance at 450
        credit_row = frame.iloc[1]
        assert credit_row["amount"] == 12.00
        assert credit_row["type"] == "credit"

    def test_skip_rows_matching(self):
        page = self._page_single_amount()
        template = _row_heuristic_template(
            "single",
            1,
            1,
            skip_rows_matching=["Refund"],
        )
        frame = apply_template_row_heuristic([page], template)
        assert len(frame) == 1
        assert frame.iloc[0]["date"] == "2026-01-15"

    def test_layout_drift_doesnt_matter(self):
        """The whole point of row-heuristic: same template works on pages
        of different sizes / different column x-positions."""
        # Page A: amounts at x=400
        page_a = _page(
            600,
            80,
            [
                ("01/15/2026", 20, 0),
                ("Coffee", 100, 0),
                ("$4.50", 400, 0),
            ],
        )
        # Page B: amounts shifted to x=520 (different layout)
        page_b = _page(
            720,
            80,
            [
                ("01/15/2026", 50, 0),
                ("Coffee", 150, 0),
                ("$4.50", 520, 0),
            ],
        )
        template = _row_heuristic_template("single", 1, 1)
        df_a = apply_template([page_a], template)
        df_b = apply_template([page_b], template)
        # Both should extract — proves no coordinate dependency.
        assert len(df_a) == 1
        assert len(df_b) == 1
        amount_a = df_a.iloc[0]["amount"]
        amount_b = df_b.iloc[0]["amount"]
        assert amount_a == amount_b == 4.50


class TestFindTransactionRows:
    """The pre-DataFrame stage — returns dict records the build UI uses
    to render a preview before the user commits."""

    def test_returns_records(self):
        page = _page(
            600,
            80,
            [
                ("01/15/2026", 20, 0),
                ("Coffee", 100, 0),
                ("$4.50", 400, 0),
            ],
        )
        template = _row_heuristic_template("single", 1, 1)
        records = find_transaction_rows([page], template)
        assert len(records) == 1
        record = records[0]
        assert record["date"] == "2026-01-15"
        assert record["description"] == "Coffee"
        assert record["amount"] == 4.50
        assert record["_page"] == 1
        # Raw line is preserved so the GUI can show "what we saw"
        assert "_raw_line" in record