Files
datatools-dev/tests/test_pdf_extract.py
Michael bece2b4030 refactor(pdf): rip out templates; heuristic scan + selectable table
User feedback: the template / visual-picker / mode-dispatch
implementation was too complex for the actual workflow.
Statements drift between months, the canvas state didn't survive
multi-page navigation, and accountants don't want to maintain
per-bank configuration just to convert PDFs to CSV.

Start-over design — one public function, one page, no
persistence:

  ``scan_pdf_for_transactions(pdf_bytes) → (rows, warnings)``

A row is "any text line with a date pattern AND at least one
amount pattern." Each detected row is a dict shaped::

    {
      "date": "2026-01-15",
      "description": "Coffee Shop",
      "amount_1": -4.50,
      "amount_2": 1000.00,   # if a second amount was found
      "page": 1,
      "raw": "01/15/2026 Coffee Shop (4.50) 1,000.00",
      "source_file": "chase-jan-2026.pdf",
    }

Multi-line descriptions still merge (no-date no-amount lines
attach to the previous transaction). Multi-PDF batches share a
single combined table with a ``source_file`` column.

**Page UX:**

- Upload PDF(s) → optional Options expander (parens-negative,
  use-OCR) → click Scan → see all detected rows in an
  ``st.data_editor``.
- The editor has an ``Include`` checkbox column (default on),
  plus user-editable date / description / amount cells and a
  read-only ``raw`` column showing the original PDF text for
  verification.
- A ``Columns to include in CSV`` multiselect hides
  ``page`` / ``raw`` from the download by default; user can
  re-add either.
- Download CSV gets only the checked rows.

No template save/load. No visual picker. No mode dispatch. No
column boundaries. No schema migration. No per-bank
configuration files.

**Deletions:**

- ``src/pdf_templates.py`` — template storage layer
- ``src/gui/_drawable_canvas_compat.py`` — Streamlit compat shim
  for the canvas (no canvas now)
- ``tests/test_pdf_templates.py``, ``test_pdf_row_heuristic.py``,
  ``test_drawable_canvas_compat.py`` — covered the removed APIs
- ``build/hooks/hook-streamlit_drawable_canvas.py`` — hook for
  the removed dep
- ``streamlit-drawable-canvas==0.9.3`` from ``requirements.txt``
- The drawable-canvas references in ``build/datatools.spec``

**``src/pdf_extract.py``** shrinks from ~30 helper functions to
~10. Keeps: value parsers, row clusterer, date/amount token
finders, OCR pipeline, dependency guards. The one new public
function ``scan_pdf_for_transactions`` glues them together.

**Tests** (59 passing): the unit layer keeps full coverage of
the building blocks; the smoke layer pins the end-to-end PDF
roundtrip, OCR discovery, dependency-import behavior, and the
multi-line-description merge. The fpdf2-generated fixture PDF
still drives the real-PDF test.

Rollback: ``git revert HEAD`` brings back the template system if
needed — but the simpler model should make that unlikely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 23:57:30 +00:00

164 lines
5.1 KiB
Python

"""Tests for the minimal PDF transaction scanner.

The public API is one function: ``scan_pdf_for_transactions``.
These tests cover its building blocks: the value-parsing helpers,
the row clusterer, and the date/amount token finders, all
exercised on synthetic ``WordBox`` rows with no real PDF involved.
End-to-end coverage of the scanner itself, including the
real-PDF roundtrip, lives in ``test_pdf_extract_smoke.py``, which
uses ``fpdf2`` to generate a fixture statement at test time.
"""
from __future__ import annotations
from src.pdf_extract import (
Page,
WordBox,
_find_amount_tokens,
_find_dates_in_words,
cluster_rows,
parse_amount,
parse_date,
)
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for the tests in this module.

    Args:
        text: The word's text content.
        x0: Left edge of the box.
        top: Top edge of the box; the bottom edge is always ``top + 10``.
        x1: Right edge of the box. When omitted, the box is made 8 units
            wide per character of ``text``.

    Returns:
        A ``WordBox`` populated with the given and derived coordinates.
    """
    if x1 is None:
        right_edge = x0 + 8 * len(text)
    else:
        right_edge = x1
    bottom_edge = top + 10
    return WordBox(
        x0=x0,
        top=top,
        x1=right_edge,
        bottom=bottom_edge,
        text=text,
    )
class TestParseAmount:
    """Pin ``parse_amount`` on signed, formatted, and unparseable inputs."""

    def test_plain_positive(self):
        """A bare decimal string parses to the matching float."""
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        """A dollar sign and a thousands comma are tolerated."""
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        """Accounting-style parentheses yield a negative value."""
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        """A minus sign in front of the digits yields a negative value."""
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        """A minus sign after the digits yields a negative value."""
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        """Empty, whitespace-only, and ``None`` inputs all give ``None``."""
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        """Non-numeric text gives ``None``."""
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        """Comma-decimal / dot-thousands input parses when so configured."""
        european_style = {
            "decimal": ",",
            "thousands": ".",
            "currency_strip": "",
        }
        parsed = parse_amount("€1.234,56", **european_style)
        assert parsed == 1234.56

    def test_parens_off_disables_paren_negative(self):
        """With paren handling off, a parenthesised value is not flipped."""
        # Turning the option off must not produce a negative number; the
        # expected result is None because "(4.50)" is then not a plain
        # number either.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None
class TestParseDate:
    """Pin ``parse_date`` output as ISO ``YYYY-MM-DD`` strings."""

    def test_us_slash(self):
        """A US month/day/year date parses with an explicit format."""
        formats = ["%m/%d/%Y"]
        iso = parse_date("01/15/2026", formats)
        assert iso == "2026-01-15"

    def test_iso(self):
        """An ISO date round-trips with an explicit ISO format."""
        formats = ["%Y-%m-%d"]
        iso = parse_date("2026-01-15", formats)
        assert iso == "2026-01-15"

    def test_fallback_format(self):
        """A two-digit-year date parses with no format list supplied."""
        # No formats are passed here, so a successful parse has to come
        # from the function's fallback handling.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        """Text that is not a date gives ``None``."""
        iso = parse_date("not-a-date")
        assert iso is None
class TestClusterRows:
    """Pin how ``cluster_rows`` groups and orders word boxes."""

    def test_groups_close_y(self):
        """Words whose tops differ by a unit or two share one row."""
        boxes = [
            _w("A", 0, 100),
            _w("B", 20, 101),
            _w("C", 40, 102),
        ]
        clustered = cluster_rows(boxes)
        assert len(clustered) == 1
        texts = [box.text for box in clustered[0]]
        assert texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        """Words 20 units apart vertically land in separate rows."""
        boxes = [_w("A", 0, 100), _w("B", 0, 120)]
        texts_per_row = [
            [box.text for box in row] for row in cluster_rows(boxes)
        ]
        assert texts_per_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        """Words in a row come back ordered by horizontal position."""
        boxes = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(boxes)[0]
        texts = [box.text for box in first_row]
        assert texts == ["A", "B", "C"]

    def test_empty(self):
        """No words in means no rows out."""
        clustered = cluster_rows([])
        assert clustered == []
class TestFindDatesInWords:
    """Pin the date hits ``_find_dates_in_words`` reports for a row."""

    def test_us_slash(self):
        """A leading month/day/year token is the only hit."""
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits == [(0, "01/15/2026")]

    def test_two_digit_year(self):
        """A date with a two-digit year is reported verbatim."""
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits
        assert hits[0][1] == "01/15/26"

    def test_iso(self):
        """A leading ISO date token is the only hit."""
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits == [(0, "2026-01-15")]

    def test_month_name(self):
        """A month-name date split across several words is still found."""
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits
        assert "Jan 15" in hits[0][1]

    def test_no_date(self):
        """A row of plain words yields no hits."""
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits == []
class TestFindAmountTokens:
    """Pin which words ``_find_amount_tokens`` accepts as amounts.

    The third element of each hit is checked as the matched token text.
    """

    def test_currency_format(self):
        """A dollar-prefixed value is the single hit in a mixed row."""
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        hits = _find_amount_tokens(words)
        assert len(hits) == 1
        token_text = hits[0][2]
        assert token_text == "$4.50"

    def test_parens_negative(self):
        """A parenthesised value is reported with its parentheses."""
        words = [_w("(123.45)", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits
        assert hits[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        """A row of plain words yields no hits."""
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []

    def test_rejects_bare_year(self):
        """A lone four-digit year is not reported as an amount."""
        # "2026" is all digits but carries no money marker, so it is
        # expected to be filtered out.
        words = [_w("2026", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []
# End-to-end tests against synthetic Page objects are in the smoke
# test module — they need ``scan_pdf_for_transactions`` which in
# turn uses ``extract_pages_auto``. The unit-test layer here pins
# the building blocks; smoke tests pin the wiring.