"""Tests for the pure PDF-extraction pipeline.

Real PDF parsing (``extract_pages``) is a thin wrapper around ``pdfplumber``
and is exercised by hand on real bank statements. These tests pin the meaty
bits — value parsing, row clustering, column assignment, template-driven
extraction — against synthetic ``WordBox`` data so they run fast and have no
PDF dependency.
"""
from __future__ import annotations

import pandas as pd

from src.pdf_extract import (
    Page,
    WordBox,
    apply_template,
    assign_columns,
    cluster_rows,
    parse_amount,
    parse_date,
    _pages_in_range,
    _within_table_window,
)


def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for ``text`` with its corner at ``(x0, top)``.

    When ``x1`` is omitted the right edge is ``x0 + 10 * len(text)``; the
    bottom edge is always ``top + 10``. Heights and exact ``x1`` values do not
    matter for the tests in this module.
    """
    right_edge = x0 + 10 * len(text) if x1 is None else x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)


class TestParseAmount:
    """Expected results of ``parse_amount`` for several amount spellings."""

    def test_plain_positive(self):
        """A bare decimal string is expected to parse to the same float."""
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        """A ``$`` prefix and a ``,`` thousands separator are expected to be dropped."""
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        """A value wrapped in parentheses is expected to come back negative."""
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        """A leading ``-`` is expected to negate the value."""
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        """A trailing ``-`` is expected to negate the value as well."""
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        """Empty, whitespace-only and ``None`` inputs are expected to give ``None``."""
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        """Non-numeric text is expected to give ``None``."""
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        """Separator and currency options are passed as the second argument."""
        european = {
            "decimal_separator": ",",
            "thousands_separator": ".",
            "currency_strip": "€",
            "negative_in_parens": True,
        }
        parsed = parse_amount("€1.234,56", european)
        assert parsed == 1234.56


class TestParseDate:
    """Expected results of ``parse_date``; successful parses are ISO date strings."""

    def test_us_slash(self):
        """``MM/DD/YYYY`` with a matching explicit format."""
        us_formats = ["%m/%d/%Y"]
        assert parse_date("01/15/2026", us_formats) == "2026-01-15"

    def test_iso(self):
        """An ISO date with a matching explicit format is returned unchanged."""
        iso_formats = ["%Y-%m-%d"]
        assert parse_date("2026-01-15", iso_formats) == "2026-01-15"

    def test_fallback_format(self):
        """A two-digit-year date with no format list passed."""
        # No format list is supplied here — this should still parse via fallback.
        parsed = parse_date("01/15/26")
        assert parsed == "2026-01-15"

    def test_invalid(self):
        """Unparseable text is expected to give ``None``."""
        parsed = parse_date("not-a-date")
        assert parsed is None


class TestClusterRows:
    """Expected grouping of word boxes into rows by ``cluster_rows``."""

    def test_groups_close_y(self):
        """Tops of 100, 101 and 102 with a tolerance of 3.0 form one row."""
        boxes = [
            _w("A", x0=0, top=100),
            _w("B", x0=20, top=101),
            _w("C", x0=40, top=102),
        ]
        clustered = cluster_rows(boxes, y_tolerance=3.0)
        assert len(clustered) == 1
        texts = [box.text for box in clustered[0]]
        assert texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        """Tops 20 apart with a tolerance of 3.0 end up in separate rows."""
        boxes = [
            _w("A", x0=0, top=100),
            _w("B", x0=0, top=120),
        ]
        clustered = cluster_rows(boxes, y_tolerance=3.0)
        texts_per_row = [[box.text for box in row] for row in clustered]
        assert texts_per_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        """Words given out of x order come back ordered by ``x0`` within the row."""
        shuffled = [
            _w("C", x0=40, top=100),
            _w("A", x0=0, top=100),
            _w("B", x0=20, top=100),
        ]
        clustered = cluster_rows(shuffled)
        first_row_texts = [box.text for box in clustered[0]]
        assert first_row_texts == ["A", "B", "C"]

    def test_empty(self):
        """No words in, no rows out."""
        clustered = cluster_rows([])
        assert clustered == []


class TestAssignColumns:
    """Expected cell text produced by ``assign_columns`` for one row."""

    def test_three_columns(self):
        """Two boundaries give three cells; words sharing a cell are space-joined."""
        # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
        month = _w("Jan", x0=10, top=0, x1=40)  # col 0
        day = _w("1", x0=45, top=0, x1=55)  # col 0
        label = _w("Deposit", x0=110, top=0, x1=180)  # col 1
        amount = _w("250.00", x0=210, top=0, x1=260)  # col 2
        row = [month, day, label, amount]

        cells = assign_columns(row, [100, 200])

        assert cells[0] == "Jan 1"
        assert cells[1] == "Deposit"
        assert cells[2] == "250.00"

    def test_no_boundaries_one_column(self):
        """With no boundaries every word lands in a single cell."""
        row = [_w("A", 0, 0), _w("B", 20, 0)]
        assert assign_columns(row, []) == ["A B"]


class TestPagesInRange:
    """Expected page selection by ``_pages_in_range`` for several range specs."""

    def _mk(self, n: int) -> list[Page]:
        """Return ``n`` empty 600x800 pages numbered from 1."""
        pages = []
        for number in range(1, n + 1):
            pages.append(Page(page_no=number, width=600, height=800, text="", words=[]))
        return pages

    def test_all(self):
        """Both ``"all"`` and the empty string select every page."""
        pages = self._mk(5)
        for spec in ("all", ""):
            assert len(_pages_in_range(pages, spec)) == 5

    def test_explicit_list(self):
        """A comma-separated list selects exactly the listed pages."""
        pages = self._mk(5)
        selected = _pages_in_range(pages, "1,3,5")
        assert [page.page_no for page in selected] == [1, 3, 5]

    def test_range(self):
        """``"2-4"`` selects pages 2, 3 and 4."""
        pages = self._mk(5)
        selected = _pages_in_range(pages, "2-4")
        assert [page.page_no for page in selected] == [2, 3, 4]

    def test_open_ended(self):
        """``"3-"`` selects page 3 through the last page."""
        pages = self._mk(5)
        selected = _pages_in_range(pages, "3-")
        assert [page.page_no for page in selected] == [3, 4, 5]


class TestWithinTableWindow:
    """Expected row trimming by ``_within_table_window``."""

    def test_header_skipped_end_excluded(self):
        """Only the rows between the header row and the end marker are kept."""
        banner = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)]
        coffee = [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)]
        refund = [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)]
        closing = [_w("Closing", 0, 80), _w("balance", 50, 80)]
        footer = [_w("Page", 0, 100), _w("1", 50, 100)]
        rows = [banner, header, coffee, refund, closing, footer]

        kept = _within_table_window(rows, "Date Description Amount", ["Closing balance"])

        # Should keep just the two transaction rows.
        assert len(kept) == 2
        assert [row[0].text for row in kept] == ["01/15", "01/16"]

    def test_no_header_returns_empty_when_required(self):
        """A header text that never appears in the rows yields no rows."""
        rows = [[_w("foo", 0, 0)]]
        kept = _within_table_window(rows, "Date Description Amount", [])
        assert kept == []

    def test_blank_header_passes_through(self):
        """An empty header text with no end markers returns the rows unchanged."""
        rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
        kept = _within_table_window(rows, "", [])
        assert kept == rows


class TestApplyTemplate:
    """End-to-end on synthetic ``Page`` objects."""

    def _statement_page(self) -> Page:
        """Return a one-page statement: title, header, two rows, a continuation, a footer."""
        # Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
        title = [_w("STATEMENT", 0, 0)]
        header = [
            _w("Date", 5, 20),
            _w("Description", 105, 20),
            _w("Amount", 205, 20),
        ]
        first_row = [
            _w("01/15/2026", 5, 40),
            _w("Coffee", 105, 40),
            _w("Shop", 140, 40),
            _w("(4.50)", 205, 40),
        ]
        second_row = [
            _w("01/16/2026", 5, 60),
            _w("Refund", 105, 60),
            _w("$12.00", 205, 60),
        ]
        # Continuation row (no date) — should merge into row 2
        continuation = [
            _w("from", 105, 80),
            _w("vendor", 140, 80),
        ]
        # End marker
        closing = [
            _w("Closing", 5, 100),
            _w("balance", 105, 100),
            _w("$1,000.00", 205, 100),
        ]
        words = title + header + first_row + second_row + continuation + closing
        return Page(page_no=1, width=300, height=120, text="", words=words)

    def _template(self) -> dict:
        """Return a fresh template dict matching ``_statement_page``'s layout."""
        table = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [100, 200],
            "y_tolerance": 3.0,
            "skip_rows_matching": [],
        }
        columns = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "description", "amount"))
        ]
        parse = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        return {
            "pages": {"range": "all"},
            "table": table,
            "columns": columns,
            "parse": parse,
        }

    def test_basic_extraction(self):
        """The sample page yields two transactions with parsed dates and amounts."""
        frame = apply_template([self._statement_page()], self._template())

        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 2
        assert list(frame["date"]) == ["2026-01-15", "2026-01-16"]

        first = frame.iloc[0]
        second = frame.iloc[1]
        # Parens-negative
        assert first["amount"] == -4.50
        # Plain positive with currency strip
        assert second["amount"] == 12.00
        # Multi-line description merged
        assert "from vendor" in second["description"]

    def test_debit_credit_split_columns(self):
        """Separate debit/credit columns become a signed ``amount`` plus a ``type``."""
        # Layout: date | description | debit | credit columns
        header = [
            _w("Date", 5, 0),
            _w("Desc", 105, 0),
            _w("Debit", 205, 0),
            _w("Credit", 305, 0),
        ]
        debit_row = [
            _w("01/15/2026", 5, 20),
            _w("Coffee", 105, 20),
            _w("4.50", 205, 20),
        ]
        credit_row = [
            _w("01/16/2026", 5, 40),
            _w("Refund", 105, 40),
            _w("", 205, 40),  # no debit
            _w("12.00", 305, 40),
        ]
        page = Page(
            page_no=1,
            width=400,
            height=80,
            text="",
            words=header + debit_row + credit_row,
        )
        targets = ("date", "description", "amount_debit", "amount_credit")
        template = {
            "table": {
                "header_text": "Date Desc Debit Credit",
                "column_boundaries": [100, 200, 300],
            },
            "columns": [
                {"source": index, "target": target}
                for index, target in enumerate(targets)
            ],
            "parse": {"date_format": "%m/%d/%Y"},
        }

        frame = apply_template([page], template)

        assert list(frame["amount"]) == [-4.50, 12.00]
        assert list(frame["type"]) == ["debit", "credit"]

    def test_skip_rows_matching(self):
        """A ``skip_rows_matching`` entry of ``"Refund"`` leaves one transaction."""
        template = self._template()
        page = self._statement_page()
        template["table"]["skip_rows_matching"] = ["Refund"]

        frame = apply_template([page], template)

        # Refund row is dropped — only one transaction left
        assert len(frame) == 1
        assert frame.iloc[0]["amount"] == -4.50

    def test_empty_pages_returns_empty_df(self):
        """No pages in gives an empty frame."""
        frame = apply_template([], self._template())
        assert frame.empty