Files
datatools-dev/tests/test_pdf_extract.py
Michael 263af3c7c2 fix(pdf): short dates without year + diagnostic for "0 rows" runs
A user uploaded a real Chase statement and got "0 rows detected."
This commit fixes two bugs the rewrite shipped with and adds a diagnostic:

**1. Short dates without year weren't recognized.** Most bank
statements (Chase, Wells, BofA, …) display transaction dates as
``01/13`` or ``Jan 13`` because the year is implied by the
statement period. The original regex required ``\d{2,4}`` after
the second slash, so ``01/13`` failed to match and rows with no
detected date got dropped.

Split ``_DATE_RES`` into ``_FULL`` (with year) and ``_SHORT``
(no year), with a two-pass detector: pass 1 tries full-year
patterns across the whole row; pass 2 only tries short patterns
if pass 1 found nothing. This prevents a stray ``Page 1/2`` from
shadowing the real dated transaction on the same line.

Short patterns:
- ``\d{1,2}/\d{1,2}`` — Chase, etc.
- ``\d{1,2}-\d{1,2}``
- ``[A-Z][a-z]{2}\s+\d{1,2}`` — "Jan 13"

When parsing, ``parse_date`` returns None for short dates (there
is no year to bind to), so the scanner falls back to the raw
text — the user sees ``01/13`` in the date column and can
correct it in the editor.

**2. Multi-word dates leaked the day token into the description.**
A pre-existing bug: ``_find_dates_in_words`` returned only the
START word index, and ``_description_from_row`` only excluded
that single word. For "Jan 13 Coffee $4.50", the description
became "13 Coffee" instead of "Coffee". Fixed by returning
``(start, end, text)`` with ``end`` exclusive (computed from
``len(m.group(1).split())`` so window-overrun doesn't
over-consume), and the description builder now skips the full
range.

**3. New diagnostic: ``diagnose_pdf_lines(pdf_bytes)``.** Returns
every clustered text line the scanner saw with ``has_date`` /
``has_amount`` flags. When the page's scan returns 0 rows, an
auto-expanded "what the scanner saw" expander now renders a
table of all extracted lines so the user can:

- Spot scanned-PDF cases (empty result → enable OCR)
- See which lines have a date but no amount (or vice versa)
- Eyeball the date / amount format the scanner missed

All of this can be done without leaving the app or asking the developer for help.

Eight new tests cover: short US date (``01/13``), short month-
name date with two-word consumption (``Jan 13``), the
``Page 1/2 ... 01/13/2026`` shadowing case, and the multi-word-
date description fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:06:07 +00:00

195 lines
6.5 KiB
Python

"""Tests for the minimal PDF transaction scanner.
The public API is one function: ``scan_pdf_for_transactions``.
These tests cover the value-parsing helpers, the row clusterer,
and the date/amount token finders, using synthetic ``WordBox``
rows with no real PDF involved. The scanner itself is not run here.
End-to-end-on-a-real-PDF coverage lives in
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
a fixture statement at test time.
"""
from __future__ import annotations
from src.pdf_extract import (
Page,
WordBox,
_find_amount_tokens,
_find_dates_in_words,
cluster_rows,
parse_amount,
parse_date,
)
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` holding ``text`` at ``(x0, top)``.

    If ``x1`` is not given, the right edge defaults to
    ``x0 + 8 * len(text)``. The bottom edge is always ``top + 10``.
    """
    if x1 is None:
        x1 = x0 + 8 * len(text)
    return WordBox(x0=x0, top=top, x1=x1, bottom=top + 10, text=text)
class TestParseAmount:
    """Pin how ``parse_amount`` turns amount strings into signed floats."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty string, whitespace-only string, and None all yield None.
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        options = {
            "decimal": ",",
            "thousands": ".",
            "currency_strip": "",
        }
        parsed = parse_amount("€1.234,56", **options)
        assert parsed == 1234.56

    def test_parens_off_disables_paren_negative(self):
        # With paren-negatives disabled, "(4.50)" must not come back
        # as a negative value. It is not a plain number either, so
        # the expected result is None rather than a sign flip.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None
class TestParseDate:
    """Pin ``parse_date``: ISO ``YYYY-MM-DD`` text on success, else None."""

    def test_us_slash(self):
        iso = parse_date("01/15/2026", ["%m/%d/%Y"])
        assert iso == "2026-01-15"

    def test_iso(self):
        iso = parse_date("2026-01-15", ["%Y-%m-%d"])
        assert iso == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied, so this must parse via the
        # fallback formats.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        iso = parse_date("not-a-date")
        assert iso is None
class TestClusterRows:
    """Pin how ``cluster_rows`` groups word boxes into rows by ``top``."""

    def test_groups_close_y(self):
        # Tops of 100, 101 and 102 must land in the same row.
        boxes = [_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102)]
        clustered = cluster_rows(boxes)
        assert len(clustered) == 1
        texts = [box.text for box in clustered[0]]
        assert texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        # Tops of 100 and 120 must land in different rows.
        boxes = [_w("A", 0, 100), _w("B", 0, 120)]
        texts_per_row = []
        for row in cluster_rows(boxes):
            texts_per_row.append([box.text for box in row])
        assert texts_per_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Input order is C, A, B; output follows x0 order.
        boxes = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(boxes)[0]
        assert [box.text for box in first_row] == ["A", "B", "C"]

    def test_empty(self):
        clustered = cluster_rows([])
        assert clustered == []
class TestFindDatesInWords:
    """Pin ``_find_dates_in_words``.

    It returns a list of ``(start, end, text)`` tuples, where ``end``
    is the exclusive index just past the last word the date consumed.
    """

    def test_us_slash(self):
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, 1, "01/15/2026")]

    def test_two_digit_year(self):
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        found = _find_dates_in_words(words)
        assert found
        assert found[0][2] == "01/15/26"

    def test_iso(self):
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, 1, "2026-01-15")]

    def test_month_name_with_year_consumes_three_words(self):
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        found = _find_dates_in_words(words)
        assert found
        assert "Jan 15" in found[0][2]
        # All three words belong to the date, so none of them can
        # leak into the description.
        assert found[0][1] == 3

    def test_short_us_date_no_year(self):
        """A Chase-style ``01/13`` with no year is still detected."""
        words = [
            _w("01/13", 0, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 200, 0),
        ]
        found = _find_dates_in_words(words)
        assert found
        assert found[0][2] == "01/13"
        # Exactly one word was consumed.
        assert found[0][1] == 1

    def test_short_month_name_no_year_consumes_two_words(self):
        words = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(words)
        assert found
        assert "Jan 13" in found[0][2]
        # Both "Jan" and "13" were consumed.
        assert found[0][1] == 2

    def test_short_pattern_does_not_shadow_full_year(self):
        """A full-year date wins over a short-pattern lookalike.

        For ``Page 1/2 of 3 ... 01/13/2026 Coffee`` the first hit must
        be the real ``01/13/2026``, not the ``1/2`` page marker.
        """
        words = [
            _w("Page", 0, 0),
            _w("1/2", 40, 0),
            _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0),
            _w("Coffee", 300, 0),
        ]
        found = _find_dates_in_words(words)
        assert found
        assert found[0][2] == "01/13/2026"

    def test_no_date(self):
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        found = _find_dates_in_words(words)
        assert found == []
class TestFindAmountTokens:
    """Pin ``_find_amount_tokens``; each hit's index 2 is the raw text."""

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        hits = _find_amount_tokens(words)
        assert len(hits) == 1
        assert hits[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits
        assert hits[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []

    def test_rejects_bare_year(self):
        # A lone 4-digit year fits the digit pattern but carries no
        # money marker, so it must be filtered out.
        words = [_w("2026", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []
# End-to-end tests against synthetic Page objects are in the smoke
# test module — they need ``scan_pdf_for_transactions`` which in
# turn uses ``extract_pages_auto``. The unit-test layer here pins
# the building blocks; smoke tests pin the wiring.