"""Tests for the minimal PDF transaction scanner. The public API is one function: ``scan_pdf_for_transactions``. These tests cover the value-parsing helpers, the row clusterer, the date/amount token finders, and the end-to-end scanner against synthetic ``Page`` objects with no real PDF involved. End-to-end-on-a-real-PDF coverage lives in ``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate a fixture statement at test time. """ from __future__ import annotations from src.pdf_extract import ( Page, WordBox, _extract_account_number, _extract_statement_period, _find_amount_tokens, _find_dates_in_words, _infer_year_for_short_date, cluster_rows, extract_statement_metadata, format_amount, format_date, parse_amount, parse_date, year_from_filename, ) def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox: return WordBox( x0=x0, top=top, x1=x1 if x1 is not None else x0 + 8 * len(text), bottom=top + 10, text=text, ) class TestParseAmount: def test_plain_positive(self): assert parse_amount("1234.56") == 1234.56 def test_currency_and_thousands(self): assert parse_amount("$1,234.56") == 1234.56 def test_parens_negative(self): assert parse_amount("(1,234.56)") == -1234.56 def test_leading_minus(self): assert parse_amount("-100.00") == -100.0 def test_trailing_minus(self): assert parse_amount("100.00-") == -100.0 def test_blank_returns_none(self): assert parse_amount("") is None assert parse_amount(" ") is None assert parse_amount(None) is None def test_garbage_returns_none(self): assert parse_amount("not a number") is None def test_european_decimal(self): assert parse_amount( "€1.234,56", decimal=",", thousands=".", currency_strip="€", ) == 1234.56 def test_parens_off_disables_paren_negative(self): # With parens off, (4.50) won't be treated as negative — # but it also won't parse cleanly since "(4.50)" isn't a # plain number. Verify the off-path is non-flipping. assert parse_amount("(4.50)", negative_in_parens=False) is None class TestParseDate: def test_us_slash(self): assert parse_date("01/15/2026", ["%m/%d/%Y"]) == "2026-01-15" def test_iso(self): assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15" def test_fallback_format(self): # Not in supplied list — should still parse via fallback. assert parse_date("01/15/26") == "2026-01-15" def test_invalid(self): assert parse_date("not-a-date") is None class TestClusterRows: def test_groups_close_y(self): words = [ _w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102), ] rows = cluster_rows(words) assert len(rows) == 1 assert [w.text for w in rows[0]] == ["A", "B", "C"] def test_separates_far_y(self): words = [_w("A", 0, 100), _w("B", 0, 120)] assert [ [w.text for w in r] for r in cluster_rows(words) ] == [["A"], ["B"]] def test_sorts_left_to_right_within_row(self): words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)] assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"] def test_empty(self): assert cluster_rows([]) == [] class TestFindDatesInWords: """Returns ``[(start, end, text)]`` — end is exclusive index of words the date consumed.""" def test_us_slash(self): row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)] assert _find_dates_in_words(row) == [(0, 1, "01/15/2026")] def test_two_digit_year(self): row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)] result = _find_dates_in_words(row) assert result and result[0][2] == "01/15/26" def test_iso(self): row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)] assert _find_dates_in_words(row) == [(0, 1, "2026-01-15")] def test_month_name_with_year_consumes_three_words(self): row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)] result = _find_dates_in_words(row) assert result and "Jan 15" in result[0][2] # Date consumes all 3 words so they don't leak to description. assert result[0][1] == 3 def test_short_us_date_no_year(self): """Chase-style ``01/13`` without a year still detects.""" row = [_w("01/13", 0, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0)] result = _find_dates_in_words(row) assert result and result[0][2] == "01/13" assert result[0][1] == 1 # one word consumed def test_short_month_name_no_year_consumes_two_words(self): row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)] result = _find_dates_in_words(row) assert result assert "Jan 13" in result[0][2] assert result[0][1] == 2 # "Jan" + "13" both consumed def test_short_pattern_does_not_shadow_full_year(self): """If a full-year date is present, short patterns shouldn't steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should return the real ``01/13/2026`` first.""" row = [ _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0), _w("3", 100, 0), _w("01/13/2026", 200, 0), _w("Coffee", 300, 0), ] result = _find_dates_in_words(row) # Full-year match wins position 0 in the returned list. assert result[0][2] == "01/13/2026" def test_multiple_dates_returned_in_position_order(self): """Chase-style transaction with both posting and txn dates.""" row = [ _w("01/13", 0, 0), _w("01/14", 50, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0), ] result = _find_dates_in_words(row) assert len(result) == 2 assert result[0][2] == "01/13" assert result[1][2] == "01/14" # First date claims word 0, second claims word 1 assert result[0][:2] == (0, 1) assert result[1][:2] == (1, 2) def test_no_date(self): row = [_w("Just", 0, 0), _w("text", 50, 0)] assert _find_dates_in_words(row) == [] class TestFindAmountTokens: def test_currency_format(self): row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)] out = _find_amount_tokens(row) assert len(out) == 1 assert out[0][2] == "$4.50" def test_parens_negative(self): row = [_w("(123.45)", 0, 0)] out = _find_amount_tokens(row) assert out and out[0][2] == "(123.45)" def test_no_amount_on_pure_text(self): row = [_w("Hello", 0, 0), _w("World", 50, 0)] assert _find_amount_tokens(row) == [] def test_rejects_bare_year(self): # A bare 4-digit year matches the digit pattern but lacks # any money marker — should be filtered out. row = [_w("2026", 0, 0)] assert _find_amount_tokens(row) == [] # End-to-end tests against synthetic Page objects are in the smoke # test module — they need ``scan_pdf_for_transactions`` which in # turn uses ``extract_pages_auto``. The unit-test layer here pins # the building blocks; smoke tests pin the wiring. class TestFormatAmount: """Two-decimal-place consistency at the display + export layer.""" def test_drops_no_trailing_zeros(self): # The bug: 4.5 should NOT render as "4.5" — accountants # need consistent precision across rows. assert format_amount(4.5) == "4.50" assert format_amount(12.0) == "12.00" assert format_amount(1000) == "1000.00" def test_negatives(self): assert format_amount(-40.0) == "-40.00" assert format_amount(-4.5) == "-4.50" def test_none_and_empty(self): assert format_amount(None) == "" assert format_amount("") == "" def test_string_passthrough(self): # ``(4.50)`` was preserved by the scanner because parsing # failed; the user sees the raw text and can fix in editor. assert format_amount("(4.50)") == "(4.50)" def test_bool_doesnt_render_as_number(self): # bool is an int subclass — guard prevents True → "1.00". assert format_amount(True) == "True" assert format_amount(False) == "False" def test_nan_inf_become_empty(self): assert format_amount(float("nan")) == "" assert format_amount(float("inf")) == "" def test_custom_places(self): assert format_amount(4.5, places=4) == "4.5000" assert format_amount(4.567, places=0) == "5" class TestFormatDate: def test_yyyymmdd(self): assert format_date("2026-01-13", "%Y%m%d") == "20260113" def test_iso_passthrough(self): assert format_date("2026-01-13", "%Y-%m-%d") == "2026-01-13" def test_us(self): assert format_date("2026-01-13", "%m/%d/%Y") == "01/13/2026" def test_invalid_input_passes_through(self): # Non-ISO input — return as-is so the user sees what was # actually there rather than a silent empty string. assert format_date("01/13", "%Y%m%d") == "01/13" def test_none_or_empty(self): assert format_date(None) == "" assert format_date("") == "" def test_default_is_iso(self): # Default format changed to ISO ``YYYY-MM-DD`` — sorts # naturally and parses across every spreadsheet tool. assert format_date("2026-01-13") == "2026-01-13" class TestExtractAccountNumber: def test_masked(self): text = "Customer Name\nAccount Number: ****1234\nBalance" assert _extract_account_number(text) == "****1234" def test_with_hyphens(self): text = "Account #: 1234-5678-9012" assert _extract_account_number(text) == "1234-5678-9012" def test_with_spaces(self): text = "Account: 1234 5678 9012" assert _extract_account_number(text) == "1234 5678 9012" def test_no_label_no_match(self): text = "Just some text with 1234567890 in it" assert _extract_account_number(text) is None def test_requires_at_least_four_digits(self): # An "account" label followed by only XX shouldn't count. text = "Account: XX" assert _extract_account_number(text) is None class TestExtractStatementPeriod: def test_standard_period(self): text = "Statement Period: 01/01/2025 - 01/31/2025\nBalance" start, end = _extract_statement_period(text) assert start == "2025-01-01" assert end == "2025-01-31" def test_from_to(self): text = "From 01/01/2025 to 01/31/2025" start, end = _extract_statement_period(text) assert start == "2025-01-01" assert end == "2025-01-31" def test_single_date_both_fields(self): # When only one date appears near the label, return it for both. text = "Statement Date: 01/31/2025" start, end = _extract_statement_period(text) assert start == "2025-01-31" assert end == "2025-01-31" def test_no_label_no_match(self): text = "Some random text with 01/01/2025 in it" start, end = _extract_statement_period(text) # No "Period" / "From" / "Statement Date" label assert (start, end) == (None, None) class TestExtractStatementMetadata: def test_full_header(self): pages = [Page( page_no=1, width=600, height=800, text=( "ACME BANK\n" "Customer: John Doe\n" "Account Number: ****5678\n" "Statement Period: 01/01/2025 - 01/31/2025\n" "Beginning balance: $1,000.00\n" ), words=[], )] meta = extract_statement_metadata(pages) assert meta["account_number"] == "****5678" assert meta["period_start"] == "2025-01-01" assert meta["period_end"] == "2025-01-31" def test_no_pages(self): meta = extract_statement_metadata([]) assert meta == { "account_number": None, "period_start": None, "period_end": None, } def test_fallback_to_page_two(self): # Page 1 has only account; period is on page 2. p1 = Page( page_no=1, width=600, height=800, text="Account Number: ****1234\nBalance summary", words=[], ) p2 = Page( page_no=2, width=600, height=800, text="Statement Period: 02/01/2025 - 02/28/2025", words=[], ) meta = extract_statement_metadata([p1, p2]) assert meta["account_number"] == "****1234" assert meta["period_start"] == "2025-02-01" assert meta["period_end"] == "2025-02-28" class TestInferYearForShortDate: """The Dec/Jan-boundary-aware year inference. Picks the year whose candidate date lands inside (or closest to) the period.""" def test_within_period_uses_period_year(self): assert _infer_year_for_short_date( "01/13", "2025-01-01", "2025-01-31", ) == "2025-01-13" def test_dec_jan_boundary_dec_resolves_to_start_year(self): # Statement period: 2024-12-16 → 2025-01-15 # Row "12/30" → should be 2024-12-30 (in period), not 2025. assert _infer_year_for_short_date( "12/30", "2024-12-16", "2025-01-15", ) == "2024-12-30" def test_dec_jan_boundary_jan_resolves_to_end_year(self): # Same period; "01/05" → 2025-01-05 (in period), not 2024. assert _infer_year_for_short_date( "01/05", "2024-12-16", "2025-01-15", ) == "2025-01-05" def test_just_before_period_picks_closer_year(self): # "12/15" is one day before period start (2024-12-16). # 2024-12-15 is 1 day off; 2025-12-15 is 11 months off. # The closer-by-distance candidate wins. assert _infer_year_for_short_date( "12/15", "2024-12-16", "2025-01-15", ) == "2024-12-15" def test_override_beats_period(self): assert _infer_year_for_short_date( "01/13", "2025-01-01", "2025-01-31", override_year=2030, ) == "2030-01-13" def test_filename_hint_when_no_period(self): assert _infer_year_for_short_date( "01/13", None, None, filename_year_hint=2025, ) == "2025-01-13" def test_no_signal_returns_none(self): assert _infer_year_for_short_date("01/13", None, None) is None def test_short_dash(self): assert _infer_year_for_short_date( "01-13", "2025-01-01", "2025-01-31", ) == "2025-01-13" def test_month_name(self): assert _infer_year_for_short_date( "Jan 13", "2025-01-01", "2025-01-31", ) == "2025-01-13" def test_unparseable(self): assert _infer_year_for_short_date( "xx/yy", "2025-01-01", "2025-01-31", ) is None class TestYearFromFilename: def test_estmt_pattern(self): assert year_from_filename("eStmt_2025-01-13.pdf") == 2025 def test_year_embedded(self): assert year_from_filename("chase-2024-statement.pdf") == 2024 def test_no_year(self): assert year_from_filename("statement.pdf") is None def test_rejects_non_20XX(self): # Filename contains a long number but no 20XX-shaped year. assert year_from_filename("doc-1234567890.pdf") is None def test_first_match_wins(self): # Filenames sometimes carry both period start and end years. assert ( year_from_filename("statement-2024-12-16-to-2025-01-15.pdf") == 2024 ) def test_empty_filename(self): assert year_from_filename("") is None assert year_from_filename(None) is None