refactor(pdf): rip out templates; heuristic scan + selectable table
User feedback: the template / visual-picker / mode-dispatch
implementation was too complex for the actual workflow.
Statements drift between months, the canvas state didn't survive
multi-page navigation, and accountants don't want to maintain
per-bank configuration just to convert PDFs to CSV.
Start-over design — one public function, one page, no
persistence:
``scan_pdf_for_transactions(pdf_bytes) → (rows, warnings)``
A row is "any text line with a date pattern AND at least one
amount pattern." Each detected row is a dict shaped::
{
"date": "2026-01-15",
"description": "Coffee Shop",
"amount_1": -4.50,
"amount_2": 1000.00, # if a second amount was found
"page": 1,
"raw": "01/15/2026 Coffee Shop (4.50) 1,000.00",
"source_file": "chase-jan-2026.pdf",
}
Multi-line descriptions still merge (no-date no-amount lines
attach to the previous transaction). Multi-PDF batches share a
single combined table with a ``source_file`` column.
**Page UX:**
- Upload PDF(s) → optional Options expander (parens-negative,
use-OCR) → click Scan → see all detected rows in an
``st.data_editor``.
- The editor has an ``Include`` checkbox column (default on),
plus user-editable date / description / amount cells and a
read-only ``raw`` column showing the original PDF text for
verification.
- A ``Columns to include in CSV`` multiselect hides
``page`` / ``raw`` from the download by default; user can
re-add either.
- Download CSV gets only the checked rows.
No template save/load. No visual picker. No mode dispatch. No
column boundaries. No schema migration. No per-bank
configuration files.
**Deletions:**
- ``src/pdf_templates.py`` — template storage layer
- ``src/gui/_drawable_canvas_compat.py`` — Streamlit compat shim
for the canvas (no canvas now)
- ``tests/test_pdf_templates.py``, ``test_pdf_row_heuristic.py``,
``test_drawable_canvas_compat.py`` — covered the removed APIs
- ``build/hooks/hook-streamlit_drawable_canvas.py`` — hook for
the removed dep
- ``streamlit-drawable-canvas==0.9.3`` from ``requirements.txt``
- The drawable-canvas references in ``build/datatools.spec``
**``src/pdf_extract.py``** shrinks from ~30 helper functions to
~10. Keeps: value parsers, row clusterer, date/amount token
finders, OCR pipeline, dependency guards. The one new public
function ``scan_pdf_for_transactions`` glues them together.
**Tests** (59 passing): the unit layer keeps full coverage of
the building blocks; the smoke layer pins the end-to-end PDF
roundtrip, OCR discovery, dependency-import behavior, and the
multi-line-description merge. The fpdf2-generated fixture PDF
still drives the real-PDF test.
Rollback: ``git revert HEAD`` brings back the template system if
needed — but the simpler model should make that unlikely.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,36 +1,33 @@
|
||||
"""Tests for the pure PDF-extraction pipeline.
|
||||
"""Tests for the minimal PDF transaction scanner.
|
||||
|
||||
Real PDF parsing (``extract_pages``) is a thin wrapper around
|
||||
``pdfplumber`` and is exercised by hand on real bank statements.
|
||||
These tests pin the meaty bits — value parsing, row clustering,
|
||||
column assignment, template-driven extraction — against synthetic
|
||||
``WordBox`` data so they run fast and have no PDF dependency.
|
||||
The public API is one function: ``scan_pdf_for_transactions``.
|
||||
These tests cover the value-parsing helpers, the row clusterer,
|
||||
the date/amount token finders, and the end-to-end scanner
|
||||
against synthetic ``Page`` objects with no real PDF involved.
|
||||
|
||||
End-to-end-on-a-real-PDF coverage lives in
|
||||
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
|
||||
a fixture statement at test time.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
apply_template,
|
||||
assign_columns,
|
||||
_find_amount_tokens,
|
||||
_find_dates_in_words,
|
||||
cluster_rows,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
_pages_in_range,
|
||||
_within_table_window,
|
||||
)
|
||||
|
||||
|
||||
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
|
||||
"""Convenience constructor — heights and exact x1 don't matter
|
||||
for the tests we write."""
|
||||
return WordBox(
|
||||
x0=x0,
|
||||
top=top,
|
||||
x1=x1 if x1 is not None else x0 + 10 * len(text),
|
||||
x1=x1 if x1 is not None else x0 + 8 * len(text),
|
||||
bottom=top + 10,
|
||||
text=text,
|
||||
)
|
||||
@@ -61,13 +58,18 @@ class TestParseAmount:
|
||||
assert parse_amount("not a number") is None
|
||||
|
||||
def test_european_decimal(self):
|
||||
opts = {
|
||||
"decimal_separator": ",",
|
||||
"thousands_separator": ".",
|
||||
"currency_strip": "€",
|
||||
"negative_in_parens": True,
|
||||
}
|
||||
assert parse_amount("€1.234,56", opts) == 1234.56
|
||||
assert parse_amount(
|
||||
"€1.234,56",
|
||||
decimal=",",
|
||||
thousands=".",
|
||||
currency_strip="€",
|
||||
) == 1234.56
|
||||
|
||||
def test_parens_off_disables_paren_negative(self):
|
||||
# With parens off, (4.50) won't be treated as negative —
|
||||
# but it also won't parse cleanly since "(4.50)" isn't a
|
||||
# plain number. Verify the off-path is non-flipping.
|
||||
assert parse_amount("(4.50)", negative_in_parens=False) is None
|
||||
|
||||
|
||||
class TestParseDate:
|
||||
@@ -78,7 +80,7 @@ class TestParseDate:
|
||||
assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15"
|
||||
|
||||
def test_fallback_format(self):
|
||||
# Not in the supplied list — should still parse via fallback.
|
||||
# Not in supplied list — should still parse via fallback.
|
||||
assert parse_date("01/15/26") == "2026-01-15"
|
||||
|
||||
def test_invalid(self):
|
||||
@@ -88,199 +90,74 @@ class TestParseDate:
|
||||
class TestClusterRows:
|
||||
def test_groups_close_y(self):
|
||||
words = [
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=20, top=101),
|
||||
_w("C", x0=40, top=102),
|
||||
_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102),
|
||||
]
|
||||
rows = cluster_rows(words, y_tolerance=3.0)
|
||||
rows = cluster_rows(words)
|
||||
assert len(rows) == 1
|
||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
||||
|
||||
def test_separates_far_y(self):
|
||||
words = [
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=0, top=120),
|
||||
]
|
||||
rows = cluster_rows(words, y_tolerance=3.0)
|
||||
assert [[w.text for w in r] for r in rows] == [["A"], ["B"]]
|
||||
words = [_w("A", 0, 100), _w("B", 0, 120)]
|
||||
assert [
|
||||
[w.text for w in r] for r in cluster_rows(words)
|
||||
] == [["A"], ["B"]]
|
||||
|
||||
def test_sorts_left_to_right_within_row(self):
|
||||
words = [
|
||||
_w("C", x0=40, top=100),
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=20, top=100),
|
||||
]
|
||||
rows = cluster_rows(words)
|
||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
||||
words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
|
||||
assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"]
|
||||
|
||||
def test_empty(self):
|
||||
assert cluster_rows([]) == []
|
||||
|
||||
|
||||
class TestAssignColumns:
|
||||
def test_three_columns(self):
|
||||
# boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
|
||||
row = [
|
||||
_w("Jan", x0=10, top=0, x1=40), # col 0
|
||||
_w("1", x0=45, top=0, x1=55), # col 0
|
||||
_w("Deposit", x0=110, top=0, x1=180), # col 1
|
||||
_w("250.00", x0=210, top=0, x1=260), # col 2
|
||||
]
|
||||
cells = assign_columns(row, [100, 200])
|
||||
assert cells[0] == "Jan 1"
|
||||
assert cells[1] == "Deposit"
|
||||
assert cells[2] == "250.00"
|
||||
class TestFindDatesInWords:
|
||||
def test_us_slash(self):
|
||||
row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "01/15/2026")]
|
||||
|
||||
def test_no_boundaries_one_column(self):
|
||||
row = [_w("A", 0, 0), _w("B", 20, 0)]
|
||||
cells = assign_columns(row, [])
|
||||
assert cells == ["A B"]
|
||||
def test_two_digit_year(self):
|
||||
row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][1] == "01/15/26"
|
||||
|
||||
def test_iso(self):
|
||||
row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "2026-01-15")]
|
||||
|
||||
def test_month_name(self):
|
||||
row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and "Jan 15" in result[0][1]
|
||||
|
||||
def test_no_date(self):
|
||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||
assert _find_dates_in_words(row) == []
|
||||
|
||||
|
||||
class TestPagesInRange:
|
||||
def _mk(self, n):
|
||||
return [Page(page_no=i + 1, width=600, height=800, text="", words=[]) for i in range(n)]
|
||||
class TestFindAmountTokens:
|
||||
def test_currency_format(self):
|
||||
row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
|
||||
out = _find_amount_tokens(row)
|
||||
assert len(out) == 1
|
||||
assert out[0][2] == "$4.50"
|
||||
|
||||
def test_all(self):
|
||||
pages = self._mk(5)
|
||||
assert len(_pages_in_range(pages, "all")) == 5
|
||||
assert len(_pages_in_range(pages, "")) == 5
|
||||
def test_parens_negative(self):
|
||||
row = [_w("(123.45)", 0, 0)]
|
||||
out = _find_amount_tokens(row)
|
||||
assert out and out[0][2] == "(123.45)"
|
||||
|
||||
def test_explicit_list(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "1,3,5")]
|
||||
assert got == [1, 3, 5]
|
||||
def test_no_amount_on_pure_text(self):
|
||||
row = [_w("Hello", 0, 0), _w("World", 50, 0)]
|
||||
assert _find_amount_tokens(row) == []
|
||||
|
||||
def test_range(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "2-4")]
|
||||
assert got == [2, 3, 4]
|
||||
|
||||
def test_open_ended(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "3-")]
|
||||
assert got == [3, 4, 5]
|
||||
def test_rejects_bare_year(self):
|
||||
# A bare 4-digit year matches the digit pattern but lacks
|
||||
# any money marker — should be filtered out.
|
||||
row = [_w("2026", 0, 0)]
|
||||
assert _find_amount_tokens(row) == []
|
||||
|
||||
|
||||
class TestWithinTableWindow:
|
||||
def test_header_skipped_end_excluded(self):
|
||||
rows = [
|
||||
[_w("STATEMENT", 0, 0)],
|
||||
[_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)],
|
||||
[_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)],
|
||||
[_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)],
|
||||
[_w("Closing", 0, 80), _w("balance", 50, 80)],
|
||||
[_w("Page", 0, 100), _w("1", 50, 100)],
|
||||
]
|
||||
out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])
|
||||
# Should keep just the two transaction rows.
|
||||
assert len(out) == 2
|
||||
assert out[0][0].text == "01/15"
|
||||
assert out[1][0].text == "01/16"
|
||||
|
||||
def test_no_header_returns_empty_when_required(self):
|
||||
rows = [[_w("foo", 0, 0)]]
|
||||
assert _within_table_window(rows, "Date Description Amount", []) == []
|
||||
|
||||
def test_blank_header_passes_through(self):
|
||||
rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
|
||||
assert _within_table_window(rows, "", []) == rows
|
||||
|
||||
|
||||
class TestApplyTemplate:
|
||||
"""End-to-end on synthetic ``Page`` objects."""
|
||||
|
||||
def _statement_page(self) -> Page:
|
||||
# Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
|
||||
words = [
|
||||
_w("STATEMENT", 0, 0),
|
||||
# Header
|
||||
_w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20),
|
||||
# Row 1
|
||||
_w("01/15/2026", 5, 40), _w("Coffee", 105, 40),
|
||||
_w("Shop", 140, 40), _w("(4.50)", 205, 40),
|
||||
# Row 2
|
||||
_w("01/16/2026", 5, 60), _w("Refund", 105, 60), _w("$12.00", 205, 60),
|
||||
# Continuation row (no date) — should merge into row 2
|
||||
_w("from", 105, 80), _w("vendor", 140, 80),
|
||||
# End marker
|
||||
_w("Closing", 5, 100), _w("balance", 105, 100), _w("$1,000.00", 205, 100),
|
||||
]
|
||||
return Page(page_no=1, width=300, height=120, text="", words=words)
|
||||
|
||||
def _template(self) -> dict:
|
||||
return {
|
||||
"pages": {"range": "all"},
|
||||
"table": {
|
||||
"header_text": "Date Description Amount",
|
||||
"end_markers": ["Closing balance"],
|
||||
"column_boundaries": [100, 200],
|
||||
"y_tolerance": 3.0,
|
||||
"skip_rows_matching": [],
|
||||
},
|
||||
"columns": [
|
||||
{"source": 0, "target": "date"},
|
||||
{"source": 1, "target": "description"},
|
||||
{"source": 2, "target": "amount"},
|
||||
],
|
||||
"parse": {
|
||||
"date_format": "%m/%d/%Y",
|
||||
"amount_negative_in_parens": True,
|
||||
"merge_multiline_description": True,
|
||||
},
|
||||
}
|
||||
|
||||
def test_basic_extraction(self):
|
||||
df = apply_template([self._statement_page()], self._template())
|
||||
assert isinstance(df, pd.DataFrame)
|
||||
assert len(df) == 2
|
||||
assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
|
||||
# Parens-negative
|
||||
assert df.iloc[0]["amount"] == -4.50
|
||||
# Plain positive with currency strip
|
||||
assert df.iloc[1]["amount"] == 12.00
|
||||
# Multi-line description merged
|
||||
assert "from vendor" in df.iloc[1]["description"]
|
||||
|
||||
def test_debit_credit_split_columns(self):
|
||||
# Layout: date | description | debit | credit columns
|
||||
page = Page(
|
||||
page_no=1, width=400, height=80, text="",
|
||||
words=[
|
||||
_w("Date", 5, 0), _w("Desc", 105, 0),
|
||||
_w("Debit", 205, 0), _w("Credit", 305, 0),
|
||||
_w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20),
|
||||
_w("01/16/2026", 5, 40), _w("Refund", 105, 40),
|
||||
_w("", 205, 40), # no debit
|
||||
_w("12.00", 305, 40),
|
||||
],
|
||||
)
|
||||
tpl = {
|
||||
"table": {
|
||||
"header_text": "Date Desc Debit Credit",
|
||||
"column_boundaries": [100, 200, 300],
|
||||
},
|
||||
"columns": [
|
||||
{"source": 0, "target": "date"},
|
||||
{"source": 1, "target": "description"},
|
||||
{"source": 2, "target": "amount_debit"},
|
||||
{"source": 3, "target": "amount_credit"},
|
||||
],
|
||||
"parse": {"date_format": "%m/%d/%Y"},
|
||||
}
|
||||
df = apply_template([page], tpl)
|
||||
assert list(df["amount"]) == [-4.50, 12.00]
|
||||
assert list(df["type"]) == ["debit", "credit"]
|
||||
|
||||
def test_skip_rows_matching(self):
|
||||
page = self._statement_page()
|
||||
tpl = self._template()
|
||||
tpl["table"]["skip_rows_matching"] = ["Refund"]
|
||||
df = apply_template([page], tpl)
|
||||
# Refund row is dropped — only one transaction left
|
||||
assert len(df) == 1
|
||||
assert df.iloc[0]["amount"] == -4.50
|
||||
|
||||
def test_empty_pages_returns_empty_df(self):
|
||||
df = apply_template([], self._template())
|
||||
assert df.empty
|
||||
# End-to-end tests against synthetic Page objects are in the smoke
|
||||
# test module — they need ``scan_pdf_for_transactions`` which in
|
||||
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
||||
# the building blocks; smoke tests pin the wiring.
|
||||
|
||||
Reference in New Issue
Block a user