"""Unit tests for the building blocks of the minimal PDF transaction scanner.

The scanner's public API is a single function,
``scan_pdf_for_transactions``. This module pins the pieces underneath it:
the value-parsing helpers (``parse_amount`` / ``parse_date``), the row
clusterer, and the date/amount token finders. Everything here runs on
synthetic ``WordBox`` objects, so no real PDF is involved.

End-to-end coverage lives in ``test_pdf_extract_smoke.py``, which uses
``fpdf2`` to generate a fixture statement at test time.
"""
from __future__ import annotations

# NOTE(review): ``Page`` is not referenced by any test visible in this
# module — confirm nothing else needs it before dropping the import.
from src.pdf_extract import (
    Page,
    WordBox,
    _find_amount_tokens,
    _find_dates_in_words,
    cluster_rows,
    parse_amount,
    parse_date,
)

# Positions inside the ``(start, end, text)`` tuples that
# ``_find_dates_in_words`` returns.
_DATE_END = 1
_DATE_TEXT = 2


def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for ``text`` at ``(x0, top)``.

    The box is always 10 units tall. When ``x1`` is omitted the right edge
    is derived from the text length at 8 units per character.
    """
    right_edge = x0 + 8 * len(text) if x1 is None else x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)


class TestParseAmount:
    """``parse_amount``: printed money string in, number (or ``None``) out."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty string, whitespace-only string and ``None`` are all "blank".
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        parsed = parse_amount(
            "€1.234,56",
            decimal=",",
            thousands=".",
            currency_strip="€",
        )
        assert parsed == 1234.56

    def test_parens_off_disables_paren_negative(self):
        # With paren-negatives switched off, "(4.50)" must not be read as
        # -4.50. It is not a plain number either, so the expected outcome
        # is ``None`` rather than a sign-flipped value.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None


class TestParseDate:
    """``parse_date``: date text in, ISO ``YYYY-MM-DD`` string (or ``None``) out."""

    def test_us_slash(self):
        iso = parse_date("01/15/2026", ["%m/%d/%Y"])
        assert iso == "2026-01-15"

    def test_iso(self):
        iso = parse_date("2026-01-15", ["%Y-%m-%d"])
        assert iso == "2026-01-15"

    def test_fallback_format(self):
        # No format list is passed here; parsing is expected to succeed
        # through ``parse_date``'s fallback handling.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        iso = parse_date("not-a-date")
        assert iso is None


class TestClusterRows:
    """``cluster_rows``: group words into rows by vertical position."""

    def test_groups_close_y(self):
        specs = (("A", 0, 100), ("B", 20, 101), ("C", 40, 102))
        boxes = [_w(text, x0, top) for text, x0, top in specs]
        clustered = cluster_rows(boxes)
        assert len(clustered) == 1
        assert [box.text for box in clustered[0]] == ["A", "B", "C"]

    def test_separates_far_y(self):
        clustered = cluster_rows([_w("A", 0, 100), _w("B", 0, 120)])
        texts_by_row = [[box.text for box in line] for line in clustered]
        assert texts_by_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Words arrive out of x-order; the single row must come back sorted.
        shuffled = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(shuffled)[0]
        assert [box.text for box in first_row] == ["A", "B", "C"]

    def test_empty(self):
        clustered = cluster_rows([])
        assert clustered == []


class TestFindDatesInWords:
    """``_find_dates_in_words`` yields ``[(start, end, text)]``.

    ``end`` is the exclusive index of the words the date consumed.
    """

    def test_us_slash(self):
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        matches = _find_dates_in_words(words)
        assert matches == [(0, 1, "01/15/2026")]

    def test_two_digit_year(self):
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        matches = _find_dates_in_words(words)
        assert matches
        assert matches[0][_DATE_TEXT] == "01/15/26"

    def test_iso(self):
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        matches = _find_dates_in_words(words)
        assert matches == [(0, 1, "2026-01-15")]

    def test_month_name_with_year_consumes_three_words(self):
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        matches = _find_dates_in_words(words)
        assert matches
        assert "Jan 15" in matches[0][_DATE_TEXT]
        # All three words belong to the date, so none of them may leak
        # into the description.
        assert matches[0][_DATE_END] == 3

    def test_short_us_date_no_year(self):
        """Chase-style ``01/13`` without a year still detects."""
        words = [
            _w("01/13", 0, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 200, 0),
        ]
        matches = _find_dates_in_words(words)
        assert matches
        assert matches[0][_DATE_TEXT] == "01/13"
        assert matches[0][_DATE_END] == 1  # one word consumed

    def test_short_month_name_no_year_consumes_two_words(self):
        words = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
        matches = _find_dates_in_words(words)
        assert matches
        assert "Jan 13" in matches[0][_DATE_TEXT]
        assert matches[0][_DATE_END] == 2  # "Jan" + "13" both consumed

    def test_short_pattern_does_not_shadow_full_year(self):
        """A full-year date wins over a short look-alike on the same row.

        For ``Page 1/2 of 3 ... 01/13/2026 Coffee`` the first match must
        be the real ``01/13/2026``, not the ``1/2`` page marker.
        """
        words = [
            _w("Page", 0, 0),
            _w("1/2", 40, 0),
            _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0),
            _w("Coffee", 300, 0),
        ]
        matches = _find_dates_in_words(words)
        assert matches
        assert matches[0][_DATE_TEXT] == "01/13/2026"

    def test_no_date(self):
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        matches = _find_dates_in_words(words)
        assert matches == []


class TestFindAmountTokens:
    """``_find_amount_tokens``: locate money-looking tokens in a row.

    The assertions below read the matched token text from index 2 of each
    returned entry.
    """

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        hits = _find_amount_tokens(words)
        assert len(hits) == 1
        assert hits[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits
        assert hits[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []

    def test_rejects_bare_year(self):
        # A bare 4-digit year matches the digit pattern but carries no
        # money marker, so it should be filtered out.
        words = [_w("2026", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []


# End-to-end tests against synthetic ``Page`` objects are in the smoke
# test module: they need ``scan_pdf_for_transactions``, which in turn
# uses ``extract_pages_auto``. The unit-test layer here pins the
# building blocks; the smoke tests pin the wiring.