"""Tests for the minimal PDF transaction scanner. The public API is one function: ``scan_pdf_for_transactions``. These tests cover the value-parsing helpers, the row clusterer, the date/amount token finders, and the end-to-end scanner against synthetic ``Page`` objects with no real PDF involved. End-to-end-on-a-real-PDF coverage lives in ``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate a fixture statement at test time. """ from __future__ import annotations from src.pdf_extract import ( Page, WordBox, _find_amount_tokens, _find_dates_in_words, cluster_rows, parse_amount, parse_date, ) def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox: return WordBox( x0=x0, top=top, x1=x1 if x1 is not None else x0 + 8 * len(text), bottom=top + 10, text=text, ) class TestParseAmount: def test_plain_positive(self): assert parse_amount("1234.56") == 1234.56 def test_currency_and_thousands(self): assert parse_amount("$1,234.56") == 1234.56 def test_parens_negative(self): assert parse_amount("(1,234.56)") == -1234.56 def test_leading_minus(self): assert parse_amount("-100.00") == -100.0 def test_trailing_minus(self): assert parse_amount("100.00-") == -100.0 def test_blank_returns_none(self): assert parse_amount("") is None assert parse_amount(" ") is None assert parse_amount(None) is None def test_garbage_returns_none(self): assert parse_amount("not a number") is None def test_european_decimal(self): assert parse_amount( "€1.234,56", decimal=",", thousands=".", currency_strip="€", ) == 1234.56 def test_parens_off_disables_paren_negative(self): # With parens off, (4.50) won't be treated as negative — # but it also won't parse cleanly since "(4.50)" isn't a # plain number. Verify the off-path is non-flipping. assert parse_amount("(4.50)", negative_in_parens=False) is None class TestParseDate: def test_us_slash(self): assert parse_date("01/15/2026", ["%m/%d/%Y"]) == "2026-01-15" def test_iso(self): assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15" def test_fallback_format(self): # Not in supplied list — should still parse via fallback. assert parse_date("01/15/26") == "2026-01-15" def test_invalid(self): assert parse_date("not-a-date") is None class TestClusterRows: def test_groups_close_y(self): words = [ _w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102), ] rows = cluster_rows(words) assert len(rows) == 1 assert [w.text for w in rows[0]] == ["A", "B", "C"] def test_separates_far_y(self): words = [_w("A", 0, 100), _w("B", 0, 120)] assert [ [w.text for w in r] for r in cluster_rows(words) ] == [["A"], ["B"]] def test_sorts_left_to_right_within_row(self): words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)] assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"] def test_empty(self): assert cluster_rows([]) == [] class TestFindDatesInWords: def test_us_slash(self): row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)] assert _find_dates_in_words(row) == [(0, "01/15/2026")] def test_two_digit_year(self): row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)] result = _find_dates_in_words(row) assert result and result[0][1] == "01/15/26" def test_iso(self): row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)] assert _find_dates_in_words(row) == [(0, "2026-01-15")] def test_month_name(self): row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)] result = _find_dates_in_words(row) assert result and "Jan 15" in result[0][1] def test_no_date(self): row = [_w("Just", 0, 0), _w("text", 50, 0)] assert _find_dates_in_words(row) == [] class TestFindAmountTokens: def test_currency_format(self): row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)] out = _find_amount_tokens(row) assert len(out) == 1 assert out[0][2] == "$4.50" def test_parens_negative(self): row = [_w("(123.45)", 0, 0)] out = _find_amount_tokens(row) assert out and out[0][2] == "(123.45)" def test_no_amount_on_pure_text(self): row = [_w("Hello", 0, 0), _w("World", 50, 0)] assert _find_amount_tokens(row) == [] def test_rejects_bare_year(self): # A bare 4-digit year matches the digit pattern but lacks # any money marker — should be filtered out. row = [_w("2026", 0, 0)] assert _find_amount_tokens(row) == [] # End-to-end tests against synthetic Page objects are in the smoke # test module — they need ``scan_pdf_for_transactions`` which in # turn uses ``extract_pages_auto``. The unit-test layer here pins # the building blocks; smoke tests pin the wiring.