refactor(pdf): rip out templates; heuristic scan + selectable table
User feedback: the template / visual-picker / mode-dispatch
implementation was too complex for the actual workflow.
Statements drift between months, the canvas state didn't survive
multi-page navigation, and accountants don't want to maintain
per-bank configuration just to convert PDFs to CSV.
Start-over design — one public function, one page, no
persistence:
``scan_pdf_for_transactions(pdf_bytes) → (rows, warnings)``
A row is "any text line with a date pattern AND at least one
amount pattern." Each detected row is a dict shaped::
{
"date": "2026-01-15",
"description": "Coffee Shop",
"amount_1": -4.50,
"amount_2": 1000.00, # if a second amount was found
"page": 1,
"raw": "01/15/2026 Coffee Shop (4.50) 1,000.00",
"source_file": "chase-jan-2026.pdf",
}
Multi-line descriptions still merge (no-date no-amount lines
attach to the previous transaction). Multi-PDF batches share a
single combined table with a ``source_file`` column.
**Page UX:**
- Upload PDF(s) → optional Options expander (parens-negative,
use-OCR) → click Scan → see all detected rows in an
``st.data_editor``.
- The editor has an ``Include`` checkbox column (default on),
plus user-editable date / description / amount cells and a
read-only ``raw`` column showing the original PDF text for
verification.
- A ``Columns to include in CSV`` multiselect hides
``page`` / ``raw`` from the download by default; user can
re-add either.
- Download CSV gets only the checked rows.
No template save/load. No visual picker. No mode dispatch. No
column boundaries. No schema migration. No per-bank
configuration files.
**Deletions:**
- ``src/pdf_templates.py`` — template storage layer
- ``src/gui/_drawable_canvas_compat.py`` — Streamlit compat shim
for the canvas (no canvas now)
- ``tests/test_pdf_templates.py``, ``test_pdf_row_heuristic.py``,
``test_drawable_canvas_compat.py`` — covered the removed APIs
- ``build/hooks/hook-streamlit_drawable_canvas.py`` — hook for
the removed dep
- ``streamlit-drawable-canvas==0.9.3`` from ``requirements.txt``
- The drawable-canvas references in ``build/datatools.spec``
**``src/pdf_extract.py``** shrinks from ~30 helper functions to
~10. Keeps: value parsers, row clusterer, date/amount token
finders, OCR pipeline, dependency guards. The one new public
function ``scan_pdf_for_transactions`` glues them together.
**Tests** (59 passing): the unit layer keeps full coverage of
the building blocks; the smoke layer pins the end-to-end PDF
roundtrip, OCR discovery, dependency-import behavior, and the
multi-line-description merge. The fpdf2-generated fixture PDF
still drives the real-PDF test.
Rollback: ``git revert HEAD`` brings back the template system if
needed — but the simpler model should make that unlikely.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,116 +0,0 @@
|
||||
"""Tests for the streamlit-drawable-canvas compatibility shim.
|
||||
|
||||
The shim re-attaches ``image_to_url`` to ``streamlit.elements.image``
|
||||
on modern Streamlit where the helper was relocated to
|
||||
``streamlit.elements.lib.image_utils`` and given a new signature
|
||||
(takes a ``LayoutConfig`` dataclass instead of a plain ``int``
|
||||
width).
|
||||
|
||||
If this test ever fails on a Streamlit upgrade, it almost
|
||||
certainly means the ``image_to_url`` function moved AGAIN — the
|
||||
shim's fallback message points to where to look. Update
|
||||
``_drawable_canvas_compat.py`` to find the new location.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import types
|
||||
|
||||
|
||||
def test_shim_attaches_image_to_url():
|
||||
"""After ``install()`` the old import path resolves to a
|
||||
callable, even on modern Streamlit where the original was
|
||||
relocated."""
|
||||
# Force a fresh import so the module-level _PATCHED guard
|
||||
# doesn't short-circuit between tests.
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
from src.gui._drawable_canvas_compat import install
|
||||
install()
|
||||
import streamlit.elements.image as old_loc
|
||||
assert hasattr(old_loc, "image_to_url")
|
||||
assert callable(old_loc.image_to_url)
|
||||
|
||||
|
||||
def test_shim_is_idempotent():
|
||||
"""Calling ``install()`` twice doesn't double-wrap or break
|
||||
anything — important because the page module imports + calls
|
||||
it once, and a Streamlit script-rerun re-executes the page
|
||||
module top-to-bottom."""
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
from src.gui._drawable_canvas_compat import install
|
||||
install()
|
||||
import streamlit.elements.image as old_loc
|
||||
first = old_loc.image_to_url
|
||||
install()
|
||||
second = old_loc.image_to_url
|
||||
assert first is second
|
||||
|
||||
|
||||
def test_shim_no_op_when_image_to_url_already_present():
|
||||
"""If a future Streamlit restores ``image_to_url`` at the old
|
||||
location, the shim must not overwrite it — leave the upstream
|
||||
function in place so the canvas package gets the official
|
||||
version, not our compatibility wrapper."""
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
import streamlit.elements.image as old_loc
|
||||
|
||||
sentinel = lambda *a, **kw: "sentinel-url" # noqa: E731
|
||||
old_loc.image_to_url = sentinel
|
||||
try:
|
||||
from src.gui._drawable_canvas_compat import install
|
||||
install()
|
||||
assert old_loc.image_to_url is sentinel, (
|
||||
"Shim must not clobber an existing image_to_url."
|
||||
)
|
||||
finally:
|
||||
# Tidy up so subsequent tests see a clean module.
|
||||
delattr(old_loc, "image_to_url")
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
|
||||
|
||||
def test_shim_calls_new_function_with_layout_config():
|
||||
"""The shim's wrapper must translate the old ``(image, width,
|
||||
clamp, channels, output_format, image_id)`` call into the new
|
||||
``(image, layout_config, …)`` signature without breaking."""
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
import streamlit.elements.image as old_loc
|
||||
if hasattr(old_loc, "image_to_url"):
|
||||
delattr(old_loc, "image_to_url")
|
||||
|
||||
# Replace the new function with a recorder so we can inspect
|
||||
# what arguments the shim passed through.
|
||||
from streamlit.elements.lib import image_utils
|
||||
captured: dict = {}
|
||||
original = image_utils.image_to_url
|
||||
|
||||
def recorder(image, layout_config, clamp, channels, output_format, image_id):
|
||||
captured["image"] = image
|
||||
captured["layout_config"] = layout_config
|
||||
captured["clamp"] = clamp
|
||||
captured["channels"] = channels
|
||||
captured["output_format"] = output_format
|
||||
captured["image_id"] = image_id
|
||||
return "fake-url"
|
||||
|
||||
image_utils.image_to_url = recorder
|
||||
try:
|
||||
from src.gui._drawable_canvas_compat import install
|
||||
install()
|
||||
result = old_loc.image_to_url(
|
||||
"fake-image", -1, False, "RGB", "PNG", "test-id",
|
||||
)
|
||||
assert result == "fake-url"
|
||||
assert captured["image"] == "fake-image"
|
||||
assert captured["clamp"] is False
|
||||
assert captured["channels"] == "RGB"
|
||||
assert captured["output_format"] == "PNG"
|
||||
assert captured["image_id"] == "test-id"
|
||||
# The shim wraps the int width into a LayoutConfig.
|
||||
from streamlit.elements.lib.layout_utils import LayoutConfig
|
||||
assert isinstance(captured["layout_config"], LayoutConfig)
|
||||
finally:
|
||||
image_utils.image_to_url = original
|
||||
if hasattr(old_loc, "image_to_url"):
|
||||
delattr(old_loc, "image_to_url")
|
||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
||||
@@ -1,36 +1,33 @@
|
||||
"""Tests for the pure PDF-extraction pipeline.
|
||||
"""Tests for the minimal PDF transaction scanner.
|
||||
|
||||
Real PDF parsing (``extract_pages``) is a thin wrapper around
|
||||
``pdfplumber`` and is exercised by hand on real bank statements.
|
||||
These tests pin the meaty bits — value parsing, row clustering,
|
||||
column assignment, template-driven extraction — against synthetic
|
||||
``WordBox`` data so they run fast and have no PDF dependency.
|
||||
The public API is one function: ``scan_pdf_for_transactions``.
|
||||
These tests cover the value-parsing helpers, the row clusterer,
|
||||
and the date/amount token finders against synthetic ``WordBox``
|
||||
data, with no real PDF involved.
|
||||
|
||||
End-to-end-on-a-real-PDF coverage lives in
|
||||
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
|
||||
a fixture statement at test time.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
apply_template,
|
||||
assign_columns,
|
||||
_find_amount_tokens,
|
||||
_find_dates_in_words,
|
||||
cluster_rows,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
_pages_in_range,
|
||||
_within_table_window,
|
||||
)
|
||||
|
||||
|
||||
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
|
||||
"""Convenience constructor — heights and exact x1 don't matter
|
||||
for the tests we write."""
|
||||
return WordBox(
|
||||
x0=x0,
|
||||
top=top,
|
||||
x1=x1 if x1 is not None else x0 + 10 * len(text),
|
||||
x1=x1 if x1 is not None else x0 + 8 * len(text),
|
||||
bottom=top + 10,
|
||||
text=text,
|
||||
)
|
||||
@@ -61,13 +58,18 @@ class TestParseAmount:
|
||||
assert parse_amount("not a number") is None
|
||||
|
||||
def test_european_decimal(self):
|
||||
opts = {
|
||||
"decimal_separator": ",",
|
||||
"thousands_separator": ".",
|
||||
"currency_strip": "€",
|
||||
"negative_in_parens": True,
|
||||
}
|
||||
assert parse_amount("€1.234,56", opts) == 1234.56
|
||||
assert parse_amount(
|
||||
"€1.234,56",
|
||||
decimal=",",
|
||||
thousands=".",
|
||||
currency_strip="€",
|
||||
) == 1234.56
|
||||
|
||||
def test_parens_off_disables_paren_negative(self):
|
||||
# With parens off, (4.50) won't be treated as negative —
|
||||
# but it also won't parse cleanly since "(4.50)" isn't a
|
||||
# plain number. Verify the off-path is non-flipping.
|
||||
assert parse_amount("(4.50)", negative_in_parens=False) is None
|
||||
|
||||
|
||||
class TestParseDate:
|
||||
@@ -78,7 +80,7 @@ class TestParseDate:
|
||||
assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15"
|
||||
|
||||
def test_fallback_format(self):
|
||||
# Not in the supplied list — should still parse via fallback.
|
||||
# Not in supplied list — should still parse via fallback.
|
||||
assert parse_date("01/15/26") == "2026-01-15"
|
||||
|
||||
def test_invalid(self):
|
||||
@@ -88,199 +90,74 @@ class TestParseDate:
|
||||
class TestClusterRows:
|
||||
def test_groups_close_y(self):
|
||||
words = [
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=20, top=101),
|
||||
_w("C", x0=40, top=102),
|
||||
_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102),
|
||||
]
|
||||
rows = cluster_rows(words, y_tolerance=3.0)
|
||||
rows = cluster_rows(words)
|
||||
assert len(rows) == 1
|
||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
||||
|
||||
def test_separates_far_y(self):
|
||||
words = [
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=0, top=120),
|
||||
]
|
||||
rows = cluster_rows(words, y_tolerance=3.0)
|
||||
assert [[w.text for w in r] for r in rows] == [["A"], ["B"]]
|
||||
words = [_w("A", 0, 100), _w("B", 0, 120)]
|
||||
assert [
|
||||
[w.text for w in r] for r in cluster_rows(words)
|
||||
] == [["A"], ["B"]]
|
||||
|
||||
def test_sorts_left_to_right_within_row(self):
|
||||
words = [
|
||||
_w("C", x0=40, top=100),
|
||||
_w("A", x0=0, top=100),
|
||||
_w("B", x0=20, top=100),
|
||||
]
|
||||
rows = cluster_rows(words)
|
||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
||||
words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
|
||||
assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"]
|
||||
|
||||
def test_empty(self):
|
||||
assert cluster_rows([]) == []
|
||||
|
||||
|
||||
class TestAssignColumns:
|
||||
def test_three_columns(self):
|
||||
# boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
|
||||
row = [
|
||||
_w("Jan", x0=10, top=0, x1=40), # col 0
|
||||
_w("1", x0=45, top=0, x1=55), # col 0
|
||||
_w("Deposit", x0=110, top=0, x1=180), # col 1
|
||||
_w("250.00", x0=210, top=0, x1=260), # col 2
|
||||
]
|
||||
cells = assign_columns(row, [100, 200])
|
||||
assert cells[0] == "Jan 1"
|
||||
assert cells[1] == "Deposit"
|
||||
assert cells[2] == "250.00"
|
||||
class TestFindDatesInWords:
|
||||
def test_us_slash(self):
|
||||
row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "01/15/2026")]
|
||||
|
||||
def test_no_boundaries_one_column(self):
|
||||
row = [_w("A", 0, 0), _w("B", 20, 0)]
|
||||
cells = assign_columns(row, [])
|
||||
assert cells == ["A B"]
|
||||
def test_two_digit_year(self):
|
||||
row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][1] == "01/15/26"
|
||||
|
||||
def test_iso(self):
|
||||
row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "2026-01-15")]
|
||||
|
||||
def test_month_name(self):
|
||||
row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and "Jan 15" in result[0][1]
|
||||
|
||||
def test_no_date(self):
|
||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||
assert _find_dates_in_words(row) == []
|
||||
|
||||
|
||||
class TestPagesInRange:
|
||||
def _mk(self, n):
|
||||
return [Page(page_no=i + 1, width=600, height=800, text="", words=[]) for i in range(n)]
|
||||
class TestFindAmountTokens:
|
||||
def test_currency_format(self):
|
||||
row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
|
||||
out = _find_amount_tokens(row)
|
||||
assert len(out) == 1
|
||||
assert out[0][2] == "$4.50"
|
||||
|
||||
def test_all(self):
|
||||
pages = self._mk(5)
|
||||
assert len(_pages_in_range(pages, "all")) == 5
|
||||
assert len(_pages_in_range(pages, "")) == 5
|
||||
def test_parens_negative(self):
|
||||
row = [_w("(123.45)", 0, 0)]
|
||||
out = _find_amount_tokens(row)
|
||||
assert out and out[0][2] == "(123.45)"
|
||||
|
||||
def test_explicit_list(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "1,3,5")]
|
||||
assert got == [1, 3, 5]
|
||||
def test_no_amount_on_pure_text(self):
|
||||
row = [_w("Hello", 0, 0), _w("World", 50, 0)]
|
||||
assert _find_amount_tokens(row) == []
|
||||
|
||||
def test_range(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "2-4")]
|
||||
assert got == [2, 3, 4]
|
||||
|
||||
def test_open_ended(self):
|
||||
pages = self._mk(5)
|
||||
got = [p.page_no for p in _pages_in_range(pages, "3-")]
|
||||
assert got == [3, 4, 5]
|
||||
def test_rejects_bare_year(self):
|
||||
# A bare 4-digit year matches the digit pattern but lacks
|
||||
# any money marker — should be filtered out.
|
||||
row = [_w("2026", 0, 0)]
|
||||
assert _find_amount_tokens(row) == []
|
||||
|
||||
|
||||
class TestWithinTableWindow:
|
||||
def test_header_skipped_end_excluded(self):
|
||||
rows = [
|
||||
[_w("STATEMENT", 0, 0)],
|
||||
[_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)],
|
||||
[_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)],
|
||||
[_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)],
|
||||
[_w("Closing", 0, 80), _w("balance", 50, 80)],
|
||||
[_w("Page", 0, 100), _w("1", 50, 100)],
|
||||
]
|
||||
out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])
|
||||
# Should keep just the two transaction rows.
|
||||
assert len(out) == 2
|
||||
assert out[0][0].text == "01/15"
|
||||
assert out[1][0].text == "01/16"
|
||||
|
||||
def test_no_header_returns_empty_when_required(self):
|
||||
rows = [[_w("foo", 0, 0)]]
|
||||
assert _within_table_window(rows, "Date Description Amount", []) == []
|
||||
|
||||
def test_blank_header_passes_through(self):
|
||||
rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
|
||||
assert _within_table_window(rows, "", []) == rows
|
||||
|
||||
|
||||
class TestApplyTemplate:
|
||||
"""End-to-end on synthetic ``Page`` objects."""
|
||||
|
||||
def _statement_page(self) -> Page:
|
||||
# Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
|
||||
words = [
|
||||
_w("STATEMENT", 0, 0),
|
||||
# Header
|
||||
_w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20),
|
||||
# Row 1
|
||||
_w("01/15/2026", 5, 40), _w("Coffee", 105, 40),
|
||||
_w("Shop", 140, 40), _w("(4.50)", 205, 40),
|
||||
# Row 2
|
||||
_w("01/16/2026", 5, 60), _w("Refund", 105, 60), _w("$12.00", 205, 60),
|
||||
# Continuation row (no date) — should merge into row 2
|
||||
_w("from", 105, 80), _w("vendor", 140, 80),
|
||||
# End marker
|
||||
_w("Closing", 5, 100), _w("balance", 105, 100), _w("$1,000.00", 205, 100),
|
||||
]
|
||||
return Page(page_no=1, width=300, height=120, text="", words=words)
|
||||
|
||||
def _template(self) -> dict:
|
||||
return {
|
||||
"pages": {"range": "all"},
|
||||
"table": {
|
||||
"header_text": "Date Description Amount",
|
||||
"end_markers": ["Closing balance"],
|
||||
"column_boundaries": [100, 200],
|
||||
"y_tolerance": 3.0,
|
||||
"skip_rows_matching": [],
|
||||
},
|
||||
"columns": [
|
||||
{"source": 0, "target": "date"},
|
||||
{"source": 1, "target": "description"},
|
||||
{"source": 2, "target": "amount"},
|
||||
],
|
||||
"parse": {
|
||||
"date_format": "%m/%d/%Y",
|
||||
"amount_negative_in_parens": True,
|
||||
"merge_multiline_description": True,
|
||||
},
|
||||
}
|
||||
|
||||
def test_basic_extraction(self):
|
||||
df = apply_template([self._statement_page()], self._template())
|
||||
assert isinstance(df, pd.DataFrame)
|
||||
assert len(df) == 2
|
||||
assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
|
||||
# Parens-negative
|
||||
assert df.iloc[0]["amount"] == -4.50
|
||||
# Plain positive with currency strip
|
||||
assert df.iloc[1]["amount"] == 12.00
|
||||
# Multi-line description merged
|
||||
assert "from vendor" in df.iloc[1]["description"]
|
||||
|
||||
def test_debit_credit_split_columns(self):
|
||||
# Layout: date | description | debit | credit columns
|
||||
page = Page(
|
||||
page_no=1, width=400, height=80, text="",
|
||||
words=[
|
||||
_w("Date", 5, 0), _w("Desc", 105, 0),
|
||||
_w("Debit", 205, 0), _w("Credit", 305, 0),
|
||||
_w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20),
|
||||
_w("01/16/2026", 5, 40), _w("Refund", 105, 40),
|
||||
_w("", 205, 40), # no debit
|
||||
_w("12.00", 305, 40),
|
||||
],
|
||||
)
|
||||
tpl = {
|
||||
"table": {
|
||||
"header_text": "Date Desc Debit Credit",
|
||||
"column_boundaries": [100, 200, 300],
|
||||
},
|
||||
"columns": [
|
||||
{"source": 0, "target": "date"},
|
||||
{"source": 1, "target": "description"},
|
||||
{"source": 2, "target": "amount_debit"},
|
||||
{"source": 3, "target": "amount_credit"},
|
||||
],
|
||||
"parse": {"date_format": "%m/%d/%Y"},
|
||||
}
|
||||
df = apply_template([page], tpl)
|
||||
assert list(df["amount"]) == [-4.50, 12.00]
|
||||
assert list(df["type"]) == ["debit", "credit"]
|
||||
|
||||
def test_skip_rows_matching(self):
|
||||
page = self._statement_page()
|
||||
tpl = self._template()
|
||||
tpl["table"]["skip_rows_matching"] = ["Refund"]
|
||||
df = apply_template([page], tpl)
|
||||
# Refund row is dropped — only one transaction left
|
||||
assert len(df) == 1
|
||||
assert df.iloc[0]["amount"] == -4.50
|
||||
|
||||
def test_empty_pages_returns_empty_df(self):
|
||||
df = apply_template([], self._template())
|
||||
assert df.empty
|
||||
# End-to-end tests against synthetic Page objects are in the smoke
|
||||
# test module — they need ``scan_pdf_for_transactions`` which in
|
||||
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
||||
# the building blocks; smoke tests pin the wiring.
|
||||
|
||||
@@ -1,55 +1,43 @@
|
||||
"""End-to-end smoke tests for the PDF extraction stack.
|
||||
"""End-to-end smoke tests for the PDF transaction scanner.
|
||||
|
||||
These tests run real ``pdfplumber`` + ``pypdfium2`` calls against
|
||||
a small PDF generated in-memory with ``fpdf2``. They exist to
|
||||
catch the failure mode the user hit on first install — a missing
|
||||
or mismatched native dependency that doesn't show up until the
|
||||
extractor actually tries to open a PDF.
|
||||
These run real ``pdfplumber`` calls (plus ``pypdfium2`` when OCR is
|
||||
in play) against a small statement-shaped PDF generated in memory
|
||||
with ``fpdf2``. They catch the failure modes most likely to bite
|
||||
an end-user installer build: missing native lib, broken hook
|
||||
bundling, pin/installed mismatch.
|
||||
|
||||
Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py``
|
||||
covers the parsing logic on synthetic ``WordBox`` data with no
|
||||
PDF dep involved. This file is the layer above: it confirms the
|
||||
deps themselves work, that hooks bundled them correctly (the
|
||||
versions pinned in ``requirements.txt`` matter here), and that
|
||||
the extractor's pipeline survives a round-trip through real
|
||||
``pdfplumber.extract_words`` and real ``pypdfium2.render``.
|
||||
|
||||
Generation note: ``fpdf2`` is a test-only dep listed in
|
||||
Generation note: ``fpdf2`` is a test-only dep in
|
||||
``requirements-dev.txt``. We don't ship it.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _build_tiny_statement_pdf() -> bytes:
|
||||
"""Render a one-page PDF that looks roughly like the simplest
|
||||
possible bank statement: a header line + three transaction
|
||||
rows + a closing-balance footer. Word positions are stable
|
||||
enough that the parser can identify columns by x-position."""
|
||||
"""One-page PDF: header line + three transaction rows + a
|
||||
closing-balance footer. The scanner should pick up exactly the
|
||||
three transactions."""
|
||||
from fpdf import FPDF
|
||||
|
||||
pdf = FPDF(orientation="P", unit="pt", format="letter")
|
||||
pdf.add_page()
|
||||
pdf.set_font("Helvetica", size=12)
|
||||
# Header
|
||||
pdf.set_xy(40, 50)
|
||||
pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
|
||||
# Transaction-table header row
|
||||
# Header row (not a transaction — no amount)
|
||||
pdf.set_xy(40, 100)
|
||||
pdf.cell(120, 14, "Date")
|
||||
pdf.set_xy(160, 100)
|
||||
pdf.cell(200, 14, "Description")
|
||||
pdf.set_xy(360, 100)
|
||||
pdf.cell(80, 14, "Amount")
|
||||
# Three rows
|
||||
# Three transactions
|
||||
rows = [
|
||||
("01/15/2026", "Coffee Shop", "(4.50)"),
|
||||
("01/16/2026", "Refund Vendor", "$12.00"),
|
||||
("01/17/2026", "ATM Withdrawal","(40.00)"),
|
||||
("01/15/2026", "Coffee Shop", "(4.50)"),
|
||||
("01/16/2026", "Refund Vendor", "$12.00"),
|
||||
("01/17/2026", "ATM Withdrawal", "(40.00)"),
|
||||
]
|
||||
y = 130
|
||||
for date, desc, amt in rows:
|
||||
@@ -60,7 +48,7 @@ def _build_tiny_statement_pdf() -> bytes:
|
||||
pdf.set_xy(360, y)
|
||||
pdf.cell(80, 14, amt)
|
||||
y += 20
|
||||
# Closing-balance footer
|
||||
# Footer — carries an amount but no date, so it must not be picked up as a transaction
|
||||
pdf.set_xy(40, y + 20)
|
||||
pdf.cell(0, 14, "Closing balance: $1,000.00")
|
||||
return bytes(pdf.output())
|
||||
@@ -72,12 +60,8 @@ def _build_tiny_statement_pdf() -> bytes:
|
||||
|
||||
|
||||
class TestDependencyImports:
|
||||
"""Each runtime PDF dep must be importable.
|
||||
|
||||
These tests will fail fast on a stripped/broken install — most
|
||||
valuable as a CI gate when the requirements.txt pins are
|
||||
bumped, so we know the new pin still installs cleanly across
|
||||
the matrix."""
|
||||
"""Each runtime PDF dep must be importable. Fails fast on a
|
||||
stripped install or a missing CI pin."""
|
||||
|
||||
def test_pdfplumber(self):
|
||||
import pdfplumber # noqa: F401
|
||||
@@ -85,130 +69,135 @@ class TestDependencyImports:
|
||||
def test_pypdfium2(self):
|
||||
import pypdfium2 # noqa: F401
|
||||
|
||||
def test_streamlit_drawable_canvas(self):
|
||||
# Don't instantiate the canvas — that needs a Streamlit
|
||||
# script-run context. Just confirm the module loads.
|
||||
import streamlit_drawable_canvas # noqa: F401
|
||||
|
||||
def test_pytesseract(self):
|
||||
# The Python binding must import even when the Tesseract
|
||||
# binary isn't installed — the OCR availability check
|
||||
# handles binary absence separately.
|
||||
import pytesseract # noqa: F401
|
||||
|
||||
def test_PIL(self):
|
||||
# Transitively required by pdfplumber + pypdfium2.
|
||||
# Importing it explicitly confirms the hooks pull it through.
|
||||
from PIL import Image # noqa: F401
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Real-PDF round-trip
|
||||
# End-to-end against a real PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRealPdfRoundTrip:
|
||||
"""``extract_pages`` + ``apply_template`` against a real PDF."""
|
||||
|
||||
class TestScanPdfForTransactions:
|
||||
@pytest.fixture
|
||||
def pdf_bytes(self) -> bytes:
|
||||
return _build_tiny_statement_pdf()
|
||||
|
||||
def test_extract_pages_returns_words(self, pdf_bytes):
|
||||
from src.pdf_extract import extract_pages
|
||||
pages = extract_pages(pdf_bytes)
|
||||
assert len(pages) == 1
|
||||
assert pages[0].width > 0 and pages[0].height > 0
|
||||
# At minimum we should have the words from the header and
|
||||
# one transaction row — proves pdfplumber wired up.
|
||||
all_text = " ".join(w.text for w in pages[0].words)
|
||||
assert "ACME" in all_text
|
||||
assert "Coffee" in all_text
|
||||
assert "01/15/2026" in all_text
|
||||
def test_finds_three_transactions(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, warnings = scan_pdf_for_transactions(pdf_bytes)
|
||||
# The PDF has 3 transactions plus a header and a closing-
|
||||
# balance footer. Header has no amount; closing-balance has
|
||||
# no date in the same line — neither qualifies as a txn.
|
||||
assert len(rows) == 3, (
|
||||
f"expected 3 rows, got {len(rows)}:\n"
|
||||
f"{[r.get('raw') for r in rows]}"
|
||||
)
|
||||
|
||||
def test_apply_template_extracts_three_rows(self, pdf_bytes):
|
||||
from src.pdf_extract import apply_template, extract_pages
|
||||
# The template's column boundaries are tuned to fpdf2's
|
||||
# x-coordinates above (40 / 160 / 360 pt).
|
||||
tpl = {
|
||||
"pages": {"range": "all"},
|
||||
"table": {
|
||||
"header_text": "Date Description Amount",
|
||||
"end_markers": ["Closing balance"],
|
||||
"column_boundaries": [150, 350],
|
||||
"y_tolerance": 3.0,
|
||||
},
|
||||
"columns": [
|
||||
{"source": 0, "target": "date"},
|
||||
{"source": 1, "target": "description"},
|
||||
{"source": 2, "target": "amount"},
|
||||
],
|
||||
"parse": {
|
||||
"date_format": "%m/%d/%Y",
|
||||
"amount_negative_in_parens": True,
|
||||
"merge_multiline_description": True,
|
||||
},
|
||||
}
|
||||
pages = extract_pages(pdf_bytes)
|
||||
df = apply_template(pages, tpl)
|
||||
assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}"
|
||||
assert list(df["date"]) == [
|
||||
def test_parses_dates_to_iso(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
assert [r["date"] for r in rows] == [
|
||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||
]
|
||||
# Parens-negative + currency-positive both round-trip
|
||||
assert df.iloc[0]["amount"] == -4.50
|
||||
assert df.iloc[1]["amount"] == 12.00
|
||||
assert df.iloc[2]["amount"] == -40.00
|
||||
|
||||
def test_parses_amounts_with_signs(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
assert rows[0]["amount_1"] == -4.50
|
||||
assert rows[1]["amount_1"] == 12.00
|
||||
assert rows[2]["amount_1"] == -40.00
|
||||
|
||||
def test_preserves_raw_line(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
# Raw line lets the user verify what was matched.
|
||||
assert all("raw" in r and r["raw"] for r in rows)
|
||||
assert "Coffee" in rows[0]["raw"]
|
||||
|
||||
def test_page_tagged(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
assert all(r["page"] == 1 for r in rows)
|
||||
|
||||
def test_negative_in_parens_off(self, pdf_bytes):
|
||||
"""With parens-negative off, the parser can't decode
|
||||
``(4.50)`` and falls back to the raw text — the row still
|
||||
surfaces, just with the unparsed string in the amount slot
|
||||
so the user can see and fix it in the editor."""
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(
|
||||
pdf_bytes, negative_in_parens=False,
|
||||
)
|
||||
# Row 0 had "(4.50)" — without parens-negative, parse_amount
|
||||
# returns None and the scanner keeps the raw token.
|
||||
assert rows[0]["amount_1"] == "(4.50)"
|
||||
# Row 1 had "$12.00" — still parses to positive.
|
||||
assert rows[1]["amount_1"] == 12.00
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pypdfium2 rendering (powers the visual picker)
|
||||
# Multi-line description merging
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRenderPageImage:
|
||||
"""``render_page_image`` is what feeds the drawable canvas.
|
||||
class TestMultilineDescription:
|
||||
def test_continuation_line_merges(self):
|
||||
"""A line with no date and no amount, sitting between two
|
||||
transaction rows, attaches to the previous transaction's
|
||||
description."""
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
scan_pdf_for_transactions,
|
||||
)
|
||||
# No real PDF is needed to exercise the merge: temporarily swap
|
||||
# ``extract_pages_auto`` for a fake that returns one synthetic
|
||||
# page, then run the public ``scan_pdf_for_transactions`` entry
|
||||
# point against it and restore the original afterwards.
|
||||
from src import pdf_extract as mod
|
||||
|
||||
Catches the most common installer-bug: native PDFium .dll/.so
|
||||
missing from the bundle. If this test crashes with a
|
||||
``FileNotFoundError`` it almost always means the
|
||||
``hook-pypdfium2.py`` didn't pick up the shared lib."""
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def test_renders_a_real_pil_image(self):
|
||||
from src.pdf_extract import render_page_image
|
||||
pdf_bytes = _build_tiny_statement_pdf()
|
||||
image, scale = render_page_image(pdf_bytes, page_no=1)
|
||||
# Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide.
|
||||
assert image.width > 800
|
||||
assert image.height > 800
|
||||
assert scale > 0
|
||||
# PIL Image is duck-typed; check the attrs we depend on.
|
||||
assert hasattr(image, "save")
|
||||
assert hasattr(image, "tobytes")
|
||||
def fake(_pdf_bytes, *, allow_ocr=True):
|
||||
words = [
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
# Continuation: no date, no amount
|
||||
WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
|
||||
WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
|
||||
# Next transaction
|
||||
WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
|
||||
WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
|
||||
WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=100, text="", words=words,
|
||||
)], []
|
||||
|
||||
def test_invalid_page_number_clamps(self):
|
||||
from src.pdf_extract import render_page_image
|
||||
pdf_bytes = _build_tiny_statement_pdf()
|
||||
# PDF has 1 page; page_no=99 should clamp, not raise.
|
||||
image, scale = render_page_image(pdf_bytes, page_no=99)
|
||||
assert image.width > 0
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 2
|
||||
assert "Vendor memo" in rows[0]["description"]
|
||||
assert rows[1]["description"] == "Other"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Graceful-fallback behavior
|
||||
# Graceful fallback when deps absent
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPdfDependencyMissing:
|
||||
"""The page should see a clean exception when a dep is absent,
|
||||
not a raw ``ImportError`` that leaks into the Streamlit traceback."""
|
||||
|
||||
def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
|
||||
from src import pdf_extract
|
||||
# Simulate "pdfplumber not installed" without uninstalling.
|
||||
# ``_require_pdfplumber`` does its own ``import pdfplumber``
|
||||
# at call time; patch ``__import__`` to throw for that one
|
||||
# name only.
|
||||
import builtins
|
||||
real_import = builtins.__import__
|
||||
|
||||
@@ -218,10 +207,10 @@ class TestPdfDependencyMissing:
|
||||
return real_import(name, *a, **kw)
|
||||
|
||||
monkeypatch.setattr(builtins, "__import__", fake_import)
|
||||
with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info:
|
||||
with pytest.raises(pdf_extract.PdfDependencyMissing) as exc:
|
||||
pdf_extract._require_pdfplumber()
|
||||
assert "pdfplumber" in str(exc_info.value)
|
||||
assert exc_info.value.hint # actionable hint must be populated
|
||||
assert "pdfplumber" in str(exc.value)
|
||||
assert exc.value.hint
|
||||
|
||||
def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
|
||||
from src import pdf_extract
|
||||
@@ -239,17 +228,13 @@ class TestPdfDependencyMissing:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Requirements-pin consistency
|
||||
# Requirements pin consistency
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPinnedVersionsMatchInstalled:
|
||||
"""If someone bumps the pin in ``requirements.txt`` without
|
||||
actually reinstalling, this test points it out before CI does.
|
||||
|
||||
Uses ``importlib.metadata`` rather than each library's
|
||||
``__version__`` attribute because not every PDF dep exposes
|
||||
one (``pypdfium2`` keeps version info on a submodule)."""
|
||||
actually reinstalling, this test points it out before CI does."""
|
||||
|
||||
def _parse_pins(self) -> dict[str, str]:
|
||||
from pathlib import Path
|
||||
@@ -266,21 +251,17 @@ class TestPinnedVersionsMatchInstalled:
|
||||
pins[name.strip()] = version.strip()
|
||||
return pins
|
||||
|
||||
def _installed(self, dist_name: str) -> str:
|
||||
import importlib.metadata as md
|
||||
return md.version(dist_name)
|
||||
|
||||
@pytest.mark.parametrize("dist_name", [
|
||||
"pdfplumber",
|
||||
"pypdfium2",
|
||||
"pytesseract",
|
||||
"streamlit-drawable-canvas",
|
||||
])
|
||||
def test_pin_matches_installed(self, dist_name):
|
||||
import importlib.metadata as md
|
||||
pins = self._parse_pins()
|
||||
if dist_name not in pins:
|
||||
pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
|
||||
installed = self._installed(dist_name)
|
||||
installed = md.version(dist_name)
|
||||
assert installed == pins[dist_name], (
|
||||
f"installed {dist_name}=={installed} but requirements.txt "
|
||||
f"pins {pins[dist_name]} — bump the pin, or reinstall."
|
||||
@@ -288,79 +269,52 @@ class TestPinnedVersionsMatchInstalled:
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# OCR availability runtime probe
|
||||
# OCR availability
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestOcrAvailability:
    """``ocr_available`` is the linchpin of the UI's OCR banner.

    Returns ``(bool, str)`` — both branches must round-trip.
    """

    def test_returns_a_tuple(self):
        """The probe result is a 2-tuple of ``(bool, str)``."""
        from src.pdf_extract import ocr_available

        result = ocr_available()
        # One shape check is enough; the separate tuple/len assertions
        # that preceded it asserted exactly the same thing.
        assert isinstance(result, tuple) and len(result) == 2
        ok, reason = result
        assert isinstance(ok, bool)
        assert isinstance(reason, str)

    def test_extract_pages_auto_skips_ocr_when_disabled(self):
        """``allow_ocr=False`` on a text PDF yields one page and no OCR warning."""
        from src.pdf_extract import extract_pages_auto

        # With allow_ocr=False, no OCR even if pages are blank.
        pdf_bytes = _build_tiny_statement_pdf()
        pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False)
        assert len(pages) == 1
        # No OCR-disabled warning on a text PDF, since pages have text.
        assert not any("OCR is disabled" in w for w in warnings)
|
||||
|
||||
|
||||
class TestTesseractDiscovery:
|
||||
"""Windows install paths + env-var override are how a real user
|
||||
(no PATH munging) gets OCR working. Cover the discovery logic
|
||||
even on Linux/macOS test runners by mocking out the OS check
|
||||
and ``Path.exists``."""
|
||||
|
||||
def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
    """With ``platform.system`` reporting Linux, auto-detection yields ``None``."""
    from src import pdf_extract

    # Patch once; a second identical setattr adds nothing.
    monkeypatch.setattr("platform.system", lambda: "Linux")
    assert pdf_extract._autodetect_tesseract_path() is None
|
||||
|
||||
def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
    """On a faked Windows host where only ``target`` exists, that path is returned."""
    from src import pdf_extract

    monkeypatch.setattr("platform.system", lambda: "Windows")

    target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

    def fake_exists(self):
        # Stand-in for ``Path.exists``: only the target path "exists".
        return str(self) == target

    # Patch once; a second identical setattr adds nothing.
    monkeypatch.setattr("pathlib.Path.exists", fake_exists)
    assert pdf_extract._autodetect_tesseract_path() == target
|
||||
|
||||
def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
    """On a faked Windows host where no path exists, auto-detection yields ``None``."""
    from src import pdf_extract

    monkeypatch.setattr("platform.system", lambda: "Windows")
    # Every ``Path.exists`` call reports False, so no candidate can match.
    monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
    assert pdf_extract._autodetect_tesseract_path() is None
|
||||
|
||||
def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
|
||||
"""``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
|
||||
portable install at a non-default path works without
|
||||
relying on PATH."""
|
||||
from src import pdf_extract
|
||||
# Point the override at a path that doesn't exist —
|
||||
# ocr_available will try it and report the failure, but
|
||||
# importantly the cmd attribute is set BEFORE the call,
|
||||
# which is what we're verifying.
|
||||
fake_bin = str(tmp_path / "fake-tesseract.exe")
|
||||
monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
|
||||
pdf_extract.ocr_available()
|
||||
|
||||
@@ -1,280 +0,0 @@
|
||||
"""Tests for the row-heuristic extraction pipeline.
|
||||
|
||||
This is now the primary extraction mode — uses date + amount
|
||||
pattern matching to find transaction lines, with no dependency
|
||||
on x-position column boundaries. Robust to layout drift across
|
||||
statements from the same bank.
|
||||
|
||||
The legacy column-visual pipeline keeps its own tests in
|
||||
``test_pdf_extract.py``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
apply_template,
|
||||
apply_template_row_heuristic,
|
||||
find_transaction_rows,
|
||||
_find_amount_tokens,
|
||||
_find_dates_in_words,
|
||||
_infer_amount_column_centers,
|
||||
)
|
||||
|
||||
|
||||
def _w(text: str, x0: float, top: float) -> WordBox:
    """Build a synthetic ``WordBox`` for ``text`` anchored at ``(x0, top)``.

    The box is 8 units wide per character and 10 units tall.
    """
    right_edge = x0 + 8 * len(text)
    lower_edge = top + 10
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=lower_edge, text=text)
|
||||
|
||||
|
||||
class TestFindDatesInRow:
    """Date detection over one row of word boxes.

    The equality assertions below show each hit as a 2-tuple whose second
    item is the matched date text (the first is presumably the word index —
    confirm against ``_find_dates_in_words``).
    """

    def test_us_slash(self):
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits == [(0, "01/15/2026")]

    def test_two_digit_year(self):
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits
        assert hits[0][1] == "01/15/26"

    def test_iso(self):
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits == [(0, "2026-01-15")]

    def test_month_name(self):
        # "Jan 15, 2026" arrives as three separate word tokens; the
        # detector is expected to stitch them into one date.
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits, "Multi-word month-day-year should match"
        matched_text = hits[0][1]
        assert "Jan 15" in matched_text

    def test_no_date(self):
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits == []
|
||||
|
||||
|
||||
class TestFindAmountTokens:
    """Amount-token detection over one row of word boxes.

    The assertions read the matched text from position 2 of each hit.
    """

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        hits = _find_amount_tokens(words)
        assert len(hits) == 1
        assert hits[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits
        assert hits[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        hits = _find_amount_tokens(words)
        assert hits == []

    def test_rejects_bare_year(self):
        # A bare "2026" carries no currency sign or decimal point.
        # Accepting it would be arguable, but the contract pinned here
        # is the conservative one: it must not be reported as an amount.
        words = [_w("2026", 0, 0)]
        hits = _find_amount_tokens(words)
        assert hits == [], "Bare 4-digit year should not register as amount"
|
||||
|
||||
|
||||
class TestInferAmountColumnCenters:
    """Inference of amount-column centers from rows of word boxes."""

    def test_two_clear_columns(self):
        # Five rows, 20 units apart, each with one amount starting at
        # x=300 and another starting at x=450.
        grid = [
            [
                _w("01/15/2026", 20, y),
                _w("Item", 100, y),
                _w("$10.00", 300, y),
                _w("$1,000.00", 450, y),
            ]
            for y in range(0, 100, 20)
        ]
        centers = _infer_amount_column_centers(
            grid, expected=2, min_amounts=2, max_amounts=2,
        )
        assert len(centers) == 2
        # "$10.00" spans 300..348, so its midpoint is 324;
        # "$1,000.00" spans 450..522, so its midpoint is 486.
        assert 310 < centers[0] < 340
        assert 460 < centers[1] < 490

    def test_no_transactions_returns_empty(self):
        text_only = [[_w("just", 0, 0), _w("text", 50, 0)]]
        centers = _infer_amount_column_centers(
            text_only, expected=2, min_amounts=1, max_amounts=3,
        )
        assert centers == []
|
||||
|
||||
|
||||
class TestRowHeuristicEndToEnd:
    """Drive the row-heuristic pipeline with hand-built ``Page`` objects,
    so no real PDF is needed."""

    def _page_single_amount(self) -> Page:
        """Return a one-page fixture: a title line, two single-amount
        transactions (the second with a continuation line) and a footer."""
        title = [_w("ACME BANK STATEMENT", 20, 0)]
        first_txn = [
            _w("01/15/2026", 20, 30), _w("Coffee", 100, 30),
            _w("Shop", 150, 30), _w("$4.50", 400, 30),
        ]
        second_txn_head = [_w("01/16/2026", 20, 50), _w("Refund", 100, 50)]
        # The continuation words are deliberately listed before the rest
        # of the line they belong under, exactly as in the flat list form.
        continuation = [_w("from", 100, 70), _w("vendor", 140, 70)]
        second_txn_tail = [_w("Vendor", 140, 50), _w("$12.00", 400, 50)]
        footer = [_w("Page", 20, 90), _w("1", 60, 90)]  # not a txn
        words = (
            title + first_txn + second_txn_head
            + continuation + second_txn_tail + footer
        )
        return Page(page_no=1, width=600, height=120, text="", words=words)

    def test_extracts_two_rows_single_amount(self):
        template = {
            "mode": "row_heuristic",
            "row_detection": {
                "min_amounts_per_row": 1,
                "max_amounts_per_row": 1,
                "merge_multiline_description": True,
            },
            "amounts": {"shape": "single", "negative_in_parens": True},
            "date": {"format": "%m/%d/%Y"},
        }
        pages = [self._page_single_amount()]
        frame = apply_template_row_heuristic(pages, template)
        assert len(frame) == 2
        assert list(frame["date"]) == ["2026-01-15", "2026-01-16"]
        # The continuation line must have been folded into row 2.
        second_description = frame.iloc[1]["description"]
        assert "from vendor" in second_description

    def test_dispatches_through_apply_template(self):
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        pages = [self._page_single_amount()]
        frame = apply_template(pages, template)
        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 2

    def test_txn_balance_shape(self):
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 300, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 2},
            "amounts": {"shape": "txn_balance", "negative_in_parens": True},
            "date": {"format": "%m/%d/%Y"},
        }
        frame = apply_template([page], template)
        assert len(frame) == 2
        # (amount, balance) expected per row, in row order.
        expected = [(-4.50, 1000.00), (12.00, 1012.00)]
        for idx, (amount, balance) in enumerate(expected):
            assert frame.iloc[idx]["amount"] == amount
            assert frame.iloc[idx]["balance"] == balance

    def test_debit_credit_balance_shape(self):
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("4.50", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 380, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 3},
            "amounts": {"shape": "debit_credit_balance"},
            "date": {"format": "%m/%d/%Y"},
        }
        frame = apply_template([page], template)
        assert len(frame) == 2
        # Row 0: the amount at x=300 sits in the debit column.
        first = frame.iloc[0]
        assert first["amount"] == -4.50
        assert first["type"] == "debit"
        # Row 1: the amount at x=380 sits in the credit column.
        second = frame.iloc[1]
        assert second["amount"] == 12.00
        assert second["type"] == "credit"

    def test_skip_rows_matching(self):
        page = self._page_single_amount()
        template = {
            "mode": "row_heuristic",
            "row_detection": {
                "min_amounts_per_row": 1,
                "max_amounts_per_row": 1,
                "skip_rows_matching": ["Refund"],
            },
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        frame = apply_template_row_heuristic([page], template)
        assert len(frame) == 1
        assert frame.iloc[0]["date"] == "2026-01-15"

    def test_layout_drift_doesnt_matter(self):
        """One template must work on pages whose size and column
        x-positions differ — the reason the row heuristic exists."""
        # Layout A: amount starts at x=400 on a 600-wide page.
        words_a = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page_a = Page(page_no=1, width=600, height=80, text="", words=words_a)
        # Layout B: everything shifted right, amount at x=520, 720-wide page.
        words_b = [
            _w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
            _w("$4.50", 520, 0),
        ]
        page_b = Page(page_no=1, width=720, height=80, text="", words=words_b)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        frame_a = apply_template([page_a], template)
        frame_b = apply_template([page_b], template)
        # Both layouts extracting the same row shows there is no
        # dependency on absolute coordinates.
        assert len(frame_a) == 1
        assert len(frame_b) == 1
        assert frame_a.iloc[0]["amount"] == frame_b.iloc[0]["amount"] == 4.50
|
||||
|
||||
|
||||
class TestFindTransactionRows:
    """Covers the stage before the DataFrame is built: dict records that
    the build UI renders as a preview before the user commits."""

    def test_returns_records(self):
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page = Page(page_no=1, width=600, height=80, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        records = find_transaction_rows([page], template)
        assert len(records) == 1
        record = records[0]
        assert record["date"] == "2026-01-15"
        assert record["description"] == "Coffee"
        assert record["amount"] == 4.50
        assert record["_page"] == 1
        # The raw line travels with the record so the GUI can show
        # what was actually read from the page.
        assert "_raw_line" in record
|
||||
@@ -1,316 +0,0 @@
|
||||
"""Tests for the PDF template storage layer."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from src.pdf_templates import (
|
||||
SCHEMA_VERSION,
|
||||
delete_template,
|
||||
list_templates,
|
||||
load_template,
|
||||
new_template,
|
||||
save_template,
|
||||
slugify,
|
||||
template_from_json,
|
||||
template_path,
|
||||
templates_dir,
|
||||
template_to_json,
|
||||
validate_template,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def isolated_templates(monkeypatch, tmp_path):
    """Point ``DATATOOLS_PDF_TEMPLATES_DIR`` at ``tmp_path`` and yield that path."""
    override_dir = str(tmp_path)
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", override_dir)
    yield tmp_path
|
||||
|
||||
|
||||
class TestSlugify:
    """``slugify`` turns a display name into a lowercase, dash-joined slug."""

    def test_basic(self):
        slug = slugify("Chase Personal Checking")
        assert slug == "chase-personal-checking"

    def test_strips_punctuation(self):
        slug = slugify("BofA: Business (USD)")
        assert slug == "bofa-business-usd"

    def test_empty_falls_back(self):
        # Empty and whitespace-only names both fall back to "untitled".
        for blank in ("", " "):
            assert slugify(blank) == "untitled"
|
||||
|
||||
|
||||
class TestNewTemplate:
    """Fields that ``new_template`` populates on a fresh template."""

    def test_has_schema_version(self):
        fresh = new_template("Sample")
        assert fresh["schema_version"] == SCHEMA_VERSION

    def test_slug_derived_from_name(self):
        fresh = new_template("Sample Bank")
        assert fresh["slug"] == "sample-bank"
        assert fresh["name"] == "Sample Bank"

    def test_timestamps_present(self):
        fresh = new_template("X")
        # Both timestamps must be truthy (i.e. actually filled in).
        for stamp_key in ("created_at", "updated_at"):
            assert fresh[stamp_key]
|
||||
|
||||
|
||||
class TestValidateTemplateRowHeuristic:
    """Validation rules for ``row_heuristic`` templates (the v2 default mode)."""

    def _valid(self) -> dict:
        """Return a fresh row-heuristic template that is expected to validate."""
        detection = {
            "min_amounts_per_row": 1,
            "max_amounts_per_row": 3,
        }
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "row_heuristic",
            "row_detection": detection,
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }

    def test_valid_passes(self):
        ok, errs = validate_template(self._valid())
        assert ok, errs

    def test_missing_name_fails(self):
        tpl = self._valid()
        tpl["name"] = ""
        ok, _ = validate_template(tpl)
        assert not ok

    def test_bad_mode_fails(self):
        tpl = self._valid()
        tpl["mode"] = "magic"
        ok, errs = validate_template(tpl)
        assert not ok
        assert any("mode" in message for message in errs)

    def test_bad_shape_fails(self):
        tpl = self._valid()
        tpl["amounts"]["shape"] = "telepathic"
        ok, errs = validate_template(tpl)
        assert not ok
        assert any("shape" in message for message in errs)

    def test_inverted_amount_range_fails(self):
        tpl = self._valid()
        # min greater than max is an impossible range.
        tpl["row_detection"]["min_amounts_per_row"] = 5
        tpl["row_detection"]["max_amounts_per_row"] = 2
        ok, _ = validate_template(tpl)
        assert not ok

    def test_does_not_require_columns_in_row_mode(self):
        """Row mode validates without any ``columns`` entry, which is
        what keeps the GUI's primary path simpler than v1."""
        tpl = self._valid()
        # The fixture has no "columns" key at all.
        ok, errs = validate_template(tpl)
        assert ok, errs
|
||||
|
||||
|
||||
class TestValidateTemplateColumnVisual:
    """Validation rules for the legacy ``column_visual`` mode."""

    def _valid(self) -> dict:
        """Return a fresh three-column column-visual template expected to validate."""
        columns = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "column_visual",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": columns,
            "parse": {},
        }

    def test_valid_passes(self):
        ok, errs = validate_template(self._valid())
        assert ok, errs

    def test_requires_date_column(self):
        tpl = self._valid()
        # Drop the date column; description and amount remain.
        tpl["columns"] = [
            {"source": 0, "target": "description"},
            {"source": 1, "target": "amount"},
        ]
        ok, errs = validate_template(tpl)
        assert not ok
        assert any("date" in message for message in errs)

    def test_requires_amount_or_debit_credit(self):
        tpl = self._valid()
        # Drop every amount-like column; date and description remain.
        tpl["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
        ]
        ok, errs = validate_template(tpl)
        assert not ok
        assert any("amount" in message for message in errs)

    def test_debit_credit_pair_is_valid(self):
        tpl = self._valid()
        # A debit/credit pair stands in for a single amount column.
        tpl["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount_debit"},
            {"source": 3, "target": "amount_credit"},
        ]
        tpl["table"]["column_boundaries"] = [100, 200, 300]
        ok, errs = validate_template(tpl)
        assert ok, errs
|
||||
|
||||
|
||||
class TestV1Migration:
    """A v1 template loads with ``mode='column_visual'`` injected in memory;
    the file on disk stays v1 until the user re-saves."""

    def test_loads_v1_template(self, isolated_templates, tmp_path):
        import json

        legacy_columns = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        v1_payload = {
            "schema_version": 1,
            "slug": "legacy",
            "name": "Legacy Bank",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": legacy_columns,
            "parse": {},
        }
        # Write the v1 file straight to disk, bypassing save_template.
        legacy_file = tmp_path / "legacy.json"
        legacy_file.write_text(json.dumps(v1_payload), encoding="utf-8")

        loaded = load_template("legacy")
        # Loading adds the mode and bumps the schema version.
        assert loaded["mode"] == "column_visual"
        assert loaded["schema_version"] == SCHEMA_VERSION
        # The original keys come through untouched.
        assert loaded["columns"][0]["target"] == "date"
|
||||
|
||||
|
||||
class TestPersistence:
    """Save, load, delete and list behaviour against an isolated templates directory."""

    @staticmethod
    def _date_amount_template(name):
        """Return ``new_template(name)`` with a date column, an amount
        column and a single column boundary filled in."""
        tpl = new_template(name)
        tpl["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "amount"},
        ]
        tpl["table"]["column_boundaries"] = [100]
        return tpl

    def test_round_trip(self, isolated_templates):
        tpl = new_template("Round Trip Bank")
        tpl["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        tpl["table"]["column_boundaries"] = [100, 200]
        slug = save_template(tpl)
        assert slug == "round-trip-bank"

        saved_file = template_path(slug)
        assert saved_file.exists()
        reloaded = load_template(slug)
        assert reloaded["name"] == "Round Trip Bank"
        assert reloaded["columns"][0]["target"] == "date"

    def test_save_rejects_invalid(self, isolated_templates):
        invalid = {"schema_version": 1, "name": ""}
        with pytest.raises(ValueError):
            save_template(invalid)

    def test_load_missing_raises(self, isolated_templates):
        with pytest.raises(FileNotFoundError):
            load_template("does-not-exist")

    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
        corrupt_file = tmp_path / "bad.json"
        corrupt_file.write_text("not json", encoding="utf-8")
        with pytest.raises(ValueError):
            load_template("bad")

    def test_delete(self, isolated_templates):
        save_template(self._date_amount_template("To Delete"))
        # The first delete removes the file; the second finds nothing.
        assert delete_template("to-delete") is True
        assert delete_template("to-delete") is False

    def test_list_returns_summaries(self, isolated_templates):
        for name in ["Alpha", "Bravo"]:
            save_template(self._date_amount_template(name))
        summaries = list_templates()
        listed_slugs = {entry["slug"] for entry in summaries}
        assert listed_slugs == {"alpha", "bravo"}

    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
        broken_file = tmp_path / "broken.json"
        broken_file.write_text("nope", encoding="utf-8")
        # The only file present is unreadable, so the listing is empty
        # rather than an error.
        assert list_templates() == []

    def test_atomic_save_no_partial_file_on_failure(
        self, isolated_templates, monkeypatch
    ):
        """A write that fails part-way must leave no half-written JSON at
        the target path (the temp-file-then-rename safety pattern)."""
        tpl = self._date_amount_template("Atomic")

        import src.pdf_templates as mod

        def boom(*a, **kw):
            raise IOError("disk full")

        # Break json.dumps only for the duration of the save. Validation
        # has already run by then, so the failure lands in the write step.
        # Leaving the context restores the real function.
        with monkeypatch.context() as scoped:
            scoped.setattr(mod.json, "dumps", boom)
            with pytest.raises(IOError):
                save_template(tpl)

        assert not template_path("atomic").exists()
|
||||
|
||||
|
||||
class TestImportExport:
    """JSON export/import helpers: round trip plus rejection of bad payloads."""

    def test_round_trip_via_json(self):
        tpl = new_template("Exported")
        tpl["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "amount"},
        ]
        exported = template_to_json(tpl)
        reimported = template_from_json(exported)
        assert reimported["name"] == "Exported"

    def test_import_rejects_bad_schema(self):
        # A schema_version of 999 is expected to be refused.
        payload = json.dumps({"schema_version": 999, "name": "X"})
        with pytest.raises(ValueError):
            template_from_json(payload)

    def test_import_rejects_non_object(self):
        # Valid JSON, but a list rather than an object.
        payload = '["not", "an", "object"]'
        with pytest.raises(ValueError):
            template_from_json(payload)
|
||||
|
||||
|
||||
def test_templates_dir_env_override(monkeypatch, tmp_path):
    """``templates_dir()`` returns the directory named by ``DATATOOLS_PDF_TEMPLATES_DIR``."""
    override_dir = str(tmp_path)
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", override_dir)
    resolved = templates_dir()
    assert resolved == tmp_path
|
||||
Reference in New Issue
Block a user