Phase 1/6 of the PDF Extractor tool. Pure module — no Streamlit,
no user-config I/O — that turns a PDF blob plus a template dict
into a ``pandas.DataFrame`` of transaction rows. Primary use case
is accountant-style extraction of bank-statement transactions,
where each bank's format is encoded as a reusable template.
Pipeline:
1. ``extract_pages(pdf_bytes)`` reads with pdfplumber and surfaces
words with bounding boxes.
2. ``cluster_rows(words)`` groups words into rows by ``top``
tolerance — no reliance on PDF table-line detection (most bank
statements have no visible cell borders).
3. ``assign_columns(row_words, boundaries)`` buckets each word by
its horizontal midpoint into N+1 columns defined by N interior
x-boundaries.
4. ``_within_table_window`` slices to the band between the header
line and the end-marker (e.g. "Closing balance").
5. ``apply_template`` orchestrates the above, handling:
- parens-style negative amounts, currency stripping, custom
decimal/thousands separators
- separate debit + credit columns combined into a single signed
``amount`` (credit positive, debit negative — accounting
register convention; matches QuickBooks/Xero imports)
- multi-line description wrapping (rows with empty date column
attach to the previous row's description)
- row-level regex skip filters (e.g., "Total", "Subtotal")
- page-range filters ("all", "2-", "1,3-5")
Optional OCR fallback for scanned statements:
- ``page_has_extractable_text`` heuristic flags pages with <5
words as likely-scanned.
- ``ocr_available()`` checks both the ``pytesseract`` Python
binding and the Tesseract binary; surfaces a clear reason
string when either is missing.
- ``extract_pages_auto`` does text-first, OCR-the-blanks, and
returns warnings the UI can surface.
29 unit tests cover the parsing pipeline against synthetic
WordBox/Page data — no fixture PDFs are required, and the suite runs
in 0.1s. Real PDF extraction is exercised by hand on the user's statements.
Dependencies added:
- ``pdfplumber>=0.10,<1`` — text + position extraction
- ``pypdfium2>=4,<6`` — page rasterization for OCR + visual picker
- ``streamlit-drawable-canvas>=0.9,<1`` — visual region picker
(used in commit 5)
- ``pytesseract>=0.3,<1`` — OCR (used in commit 6; system
Tesseract binary required separately)
- ``cryptography>=41,<49`` — bumped upper bound; pdfminer.six
transitively requires a recent release. Internal ed25519
license-signing usage is API-stable across the bump.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
287 lines
9.6 KiB
Python
287 lines
9.6 KiB
Python
"""Tests for the pure PDF-extraction pipeline.

Real PDF parsing (``extract_pages``) is a thin wrapper around
``pdfplumber`` and is exercised by hand on real bank statements.
These tests pin the meaty bits — value parsing, row clustering,
column assignment, template-driven extraction — against synthetic
``WordBox`` data so they run fast and have no PDF dependency.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pandas as pd
|
|
|
|
from src.pdf_extract import (
|
|
Page,
|
|
WordBox,
|
|
apply_template,
|
|
assign_columns,
|
|
cluster_rows,
|
|
parse_amount,
|
|
parse_date,
|
|
_pages_in_range,
|
|
_within_table_window,
|
|
)
|
|
|
|
|
|
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a ``WordBox`` for a test with as little ceremony as possible.

    Args:
        text: The word's text.
        x0: Left edge of the word.
        top: Top edge of the word.
        x1: Right edge. When omitted, it is derived as
            ``x0 + 10 * len(text)``.

    Returns:
        A ``WordBox`` whose ``bottom`` is always ``top + 10``.
    """
    # Exact widths/heights are irrelevant to these tests; any plausible
    # box will do.
    right_edge = x0 + 10 * len(text) if x1 is None else x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)
|
|
|
|
|
|
class TestParseAmount:
    """Pins ``parse_amount`` on signs, separators, blanks and junk input."""

    def test_plain_positive(self):
        # Bare decimal string, no decoration.
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        # Dollar sign and thousands comma are both dropped.
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        # Accounting-style parentheses mean a negative value.
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        # Minus sign placed after the digits.
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty string, whitespace and ``None`` are all "no value".
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        # Comma as decimal mark, dot as thousands separator, euro sign.
        european_opts = {
            "decimal_separator": ",",
            "thousands_separator": ".",
            "currency_strip": "€",
            "negative_in_parens": True,
        }
        parsed = parse_amount("€1.234,56", european_opts)
        assert parsed == 1234.56
|
|
|
|
|
|
class TestParseDate:
    """Pins ``parse_date`` output as ISO ``YYYY-MM-DD`` strings (or ``None``)."""

    def test_us_slash(self):
        formats = ["%m/%d/%Y"]
        iso = parse_date("01/15/2026", formats)
        assert iso == "2026-01-15"

    def test_iso(self):
        formats = ["%Y-%m-%d"]
        iso = parse_date("2026-01-15", formats)
        assert iso == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied here — the value should still
        # parse via the fallback.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        iso = parse_date("not-a-date")
        assert iso is None
|
|
|
|
|
|
class TestClusterRows:
    """Pins how ``cluster_rows`` groups and orders words by vertical position."""

    def test_groups_close_y(self):
        # Tops 100/101/102 all sit within the 3.0 tolerance.
        layout = (("A", 0, 100), ("B", 20, 101), ("C", 40, 102))
        words = [_w(text, x0=left, top=top) for text, left, top in layout]
        rows = cluster_rows(words, y_tolerance=3.0)
        assert len(rows) == 1
        first_row_texts = [word.text for word in rows[0]]
        assert first_row_texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        # A 20-unit vertical gap is well beyond the tolerance.
        upper = _w("A", x0=0, top=100)
        lower = _w("B", x0=0, top=120)
        rows = cluster_rows([upper, lower], y_tolerance=3.0)
        texts_by_row = [[word.text for word in row] for row in rows]
        assert texts_by_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Input is deliberately out of x-order.
        layout = (("C", 40), ("A", 0), ("B", 20))
        words = [_w(text, x0=left, top=100) for text, left in layout]
        rows = cluster_rows(words)
        first_row_texts = [word.text for word in rows[0]]
        assert first_row_texts == ["A", "B", "C"]

    def test_empty(self):
        no_words = []
        assert cluster_rows(no_words) == []
|
|
|
|
|
|
class TestAssignColumns:
    """Pins how ``assign_columns`` buckets a row's words by x-boundaries."""

    def test_three_columns(self):
        # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
        boundaries = [100, 200]
        row = [
            _w("Jan", x0=10, top=0, x1=40),  # col 0
            _w("1", x0=45, top=0, x1=55),  # col 0
            _w("Deposit", x0=110, top=0, x1=180),  # col 1
            _w("250.00", x0=210, top=0, x1=260),  # col 2
        ]
        cells = assign_columns(row, boundaries)
        # Words that share a column are expected joined with a space.
        assert (cells[0], cells[1], cells[2]) == ("Jan 1", "Deposit", "250.00")

    def test_no_boundaries_one_column(self):
        # With no boundaries everything lands in a single cell.
        first = _w("A", 0, 0)
        second = _w("B", 20, 0)
        cells = assign_columns([first, second], [])
        assert cells == ["A B"]
|
|
|
|
|
|
class TestPagesInRange:
    """Pins the page-range specs accepted by ``_pages_in_range``."""

    def _mk(self, n):
        """Return ``n`` empty 600x800 pages numbered from 1."""
        return [
            Page(page_no=number, width=600, height=800, text="", words=[])
            for number in range(1, n + 1)
        ]

    def test_all(self):
        pages = self._mk(5)
        # Both the literal "all" and an empty spec select every page.
        selected_by_all = _pages_in_range(pages, "all")
        assert len(selected_by_all) == 5
        selected_by_blank = _pages_in_range(pages, "")
        assert len(selected_by_blank) == 5

    def test_explicit_list(self):
        pages = self._mk(5)
        selected = _pages_in_range(pages, "1,3,5")
        assert [page.page_no for page in selected] == [1, 3, 5]

    def test_range(self):
        pages = self._mk(5)
        selected = _pages_in_range(pages, "2-4")
        assert [page.page_no for page in selected] == [2, 3, 4]

    def test_open_ended(self):
        pages = self._mk(5)
        # "3-" means page 3 through the last page.
        selected = _pages_in_range(pages, "3-")
        assert [page.page_no for page in selected] == [3, 4, 5]
|
|
|
|
|
|
class TestWithinTableWindow:
    """Pins how ``_within_table_window`` trims rows to the table body."""

    def test_header_skipped_end_excluded(self):
        title = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)]
        coffee = [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)]
        refund = [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)]
        closing = [_w("Closing", 0, 80), _w("balance", 50, 80)]
        footer = [_w("Page", 0, 100), _w("1", 50, 100)]
        rows = [title, header, coffee, refund, closing, footer]

        out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])

        # Should keep just the two transaction rows.
        assert len(out) == 2
        leading_texts = (out[0][0].text, out[1][0].text)
        assert leading_texts == ("01/15", "01/16")

    def test_no_header_returns_empty_when_required(self):
        # A header is requested but no row contains it.
        rows = [[_w("foo", 0, 0)]]
        out = _within_table_window(rows, "Date Description Amount", [])
        assert out == []

    def test_blank_header_passes_through(self):
        # Blank header text and no end markers: rows come back as given.
        rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
        out = _within_table_window(rows, "", [])
        assert out == rows
|
|
|
|
|
|
class TestApplyTemplate:
    """End-to-end checks of ``apply_template`` on synthetic ``Page`` objects."""

    def _statement_page(self) -> Page:
        """Return a one-page mock statement with two transactions.

        Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
        """
        words = [
            _w("STATEMENT", 0, 0),
            # Header
            _w("Date", 5, 20),
            _w("Description", 105, 20),
            _w("Amount", 205, 20),
            # Row 1
            _w("01/15/2026", 5, 40),
            _w("Coffee", 105, 40),
            _w("Shop", 140, 40),
            _w("(4.50)", 205, 40),
            # Row 2
            _w("01/16/2026", 5, 60),
            _w("Refund", 105, 60),
            _w("$12.00", 205, 60),
            # Continuation row (no date) — should merge into row 2
            _w("from", 105, 80),
            _w("vendor", 140, 80),
            # End marker
            _w("Closing", 5, 100),
            _w("balance", 105, 100),
            _w("$1,000.00", 205, 100),
        ]
        return Page(page_no=1, width=300, height=120, text="", words=words)

    def _template(self) -> dict:
        """Return a fresh template dict matching ``_statement_page``'s layout."""
        table_section = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [100, 200],
            "y_tolerance": 3.0,
            "skip_rows_matching": [],
        }
        column_mapping = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        parse_section = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        return {
            "pages": {"range": "all"},
            "table": table_section,
            "columns": column_mapping,
            "parse": parse_section,
        }

    def test_basic_extraction(self):
        pages = [self._statement_page()]
        df = apply_template(pages, self._template())
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 2
        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
        first_txn = df.iloc[0]
        second_txn = df.iloc[1]
        # Parens-negative
        assert first_txn["amount"] == -4.50
        # Plain positive with currency strip
        assert second_txn["amount"] == 12.00
        # Multi-line description merged
        assert "from vendor" in second_txn["description"]

    def test_debit_credit_split_columns(self):
        # Layout: date | description | debit | credit columns
        words = [
            _w("Date", 5, 0),
            _w("Desc", 105, 0),
            _w("Debit", 205, 0),
            _w("Credit", 305, 0),
            _w("01/15/2026", 5, 20),
            _w("Coffee", 105, 20),
            _w("4.50", 205, 20),
            _w("01/16/2026", 5, 40),
            _w("Refund", 105, 40),
            _w("", 205, 40),  # no debit
            _w("12.00", 305, 40),
        ]
        page = Page(page_no=1, width=400, height=80, text="", words=words)
        tpl = {
            "table": {
                "header_text": "Date Desc Debit Credit",
                "column_boundaries": [100, 200, 300],
            },
            "columns": [
                {"source": 0, "target": "date"},
                {"source": 1, "target": "description"},
                {"source": 2, "target": "amount_debit"},
                {"source": 3, "target": "amount_credit"},
            ],
            "parse": {"date_format": "%m/%d/%Y"},
        }
        df = apply_template([page], tpl)
        # The debit is expected negative and the credit positive.
        amounts = list(df["amount"])
        assert amounts == [-4.50, 12.00]
        kinds = list(df["type"])
        assert kinds == ["debit", "credit"]

    def test_skip_rows_matching(self):
        page = self._statement_page()
        tpl = self._template()
        tpl["table"]["skip_rows_matching"] = ["Refund"]
        df = apply_template([page], tpl)
        # Refund row is dropped — only one transaction left
        assert len(df) == 1
        remaining = df.iloc[0]
        assert remaining["amount"] == -4.50

    def test_empty_pages_returns_empty_df(self):
        no_pages = []
        df = apply_template(no_pages, self._template())
        assert df.empty
|