Files
datatools-dev/tests/test_pdf_extract.py
Michael b8aff862ed feat(pdf): add pure PDF→DataFrame extraction module
Phase 1/6 of the PDF Extractor tool. Pure module — no Streamlit,
no user-config I/O — that turns a PDF blob plus a template dict
into a ``pandas.DataFrame`` of transaction rows. Primary use case
is accountant-style extraction of bank-statement transactions,
where each bank's format is encoded as a reusable template.

Pipeline:

1. ``extract_pages(pdf_bytes)`` reads with pdfplumber and surfaces
   words with bounding boxes.
2. ``cluster_rows(words)`` groups words into rows by ``top``
   tolerance — no reliance on PDF table-line detection (most bank
   statements have no visible cell borders).
3. ``assign_columns(row_words, boundaries)`` buckets each word by
   its horizontal midpoint into N+1 columns defined by N interior
   x-boundaries.
4. ``_within_table_window`` slices to the band between the header
   line and the end-marker (e.g. "Closing balance").
5. ``apply_template`` orchestrates the above, handling:
   - parens-style negative amounts, currency stripping, custom
     decimal/thousands separators
   - separate debit + credit columns combined into a single signed
     ``amount`` (credit positive, debit negative — accounting
     register convention; matches QuickBooks/Xero imports)
   - multi-line description wrapping (rows with empty date column
     attach to the previous row's description)
   - row-level regex skip filters (e.g., "Total", "Subtotal")
   - page-range filters ("all", "2-", "1,3-5")

Optional OCR fallback for scanned statements:

- ``page_has_extractable_text`` heuristic flags pages with <5
  words as likely-scanned.
- ``ocr_available()`` checks both the ``pytesseract`` Python
  binding and the Tesseract binary; surfaces a clear reason
  string when either is missing.
- ``extract_pages_auto`` does text-first, OCR-the-blanks, and
  returns warnings the UI can surface.

29 unit tests cover the parsing pipeline against synthetic
WordBox/Page data — no fixture PDFs are required, and the suite
runs in 0.1s. Real PDF extraction is exercised by hand on the
user's statements.

Dependencies added:
- ``pdfplumber>=0.10,<1`` — text + position extraction
- ``pypdfium2>=4,<6`` — page rasterization for OCR + visual picker
- ``streamlit-drawable-canvas>=0.9,<1`` — visual region picker
  (used in commit 5)
- ``pytesseract>=0.3,<1`` — OCR (used in commit 6; system
  Tesseract binary required separately)
- ``cryptography>=41,<49`` — bumped upper bound; pdfminer.six
  transitively requires a recent release. Internal ed25519
  license-signing usage is API-stable across the bump.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 22:44:51 +00:00

287 lines
9.6 KiB
Python

"""Tests for the pure PDF-extraction pipeline.

Real PDF parsing (``extract_pages``) is a thin wrapper around
``pdfplumber`` and is exercised by hand on real bank statements.
These tests pin the meaty bits — value parsing, row clustering,
column assignment, template-driven extraction — against synthetic
``WordBox`` data so they run fast and have no PDF dependency.
"""
from __future__ import annotations
import pandas as pd
from src.pdf_extract import (
Page,
WordBox,
apply_template,
assign_columns,
cluster_rows,
parse_amount,
parse_date,
_pages_in_range,
_within_table_window,
)
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for the tests in this module.

    When ``x1`` is omitted the right edge is placed 10 units per
    character of ``text`` to the right of ``x0``. The box is always
    10 units tall (``bottom = top + 10``); neither value needs to be
    realistic for these tests.
    """
    if x1 is None:
        right_edge = x0 + 10 * len(text)
    else:
        right_edge = x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)
class TestParseAmount:
    """Pin ``parse_amount`` for signs, separators, currency symbols and bad input."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        # Parenthesised amounts are expected to come back negative.
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty string, whitespace-only and ``None`` must all yield ``None``.
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        # Comma as decimal mark, dot as thousands separator.
        european = {
            "decimal_separator": ",",
            "thousands_separator": ".",
            "currency_strip": "",
            "negative_in_parens": True,
        }
        parsed = parse_amount("€1.234,56", european)
        assert parsed == 1234.56
class TestParseDate:
    """Pin ``parse_date``: accepted inputs come back as ISO ``YYYY-MM-DD`` strings."""

    def test_us_slash(self):
        iso = parse_date("01/15/2026", ["%m/%d/%Y"])
        assert iso == "2026-01-15"

    def test_iso(self):
        iso = parse_date("2026-01-15", ["%Y-%m-%d"])
        assert iso == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied here — the value should still
        # parse via the fallback formats.
        iso = parse_date("01/15/26")
        assert iso == "2026-01-15"

    def test_invalid(self):
        iso = parse_date("not-a-date")
        assert iso is None
class TestClusterRows:
    """Pin ``cluster_rows``: grouping by ``top`` and left-to-right ordering."""

    def test_groups_close_y(self):
        # Tops differ by at most 2, inside the 3.0 tolerance → one row.
        placements = (("A", 0, 100), ("B", 20, 101), ("C", 40, 102))
        words = [_w(label, x0=left, top=y) for label, left, y in placements]
        rows = cluster_rows(words, y_tolerance=3.0)
        assert len(rows) == 1
        texts = [w.text for w in rows[0]]
        assert texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        # Tops 20 apart, well outside the tolerance → two rows.
        upper = _w("A", x0=0, top=100)
        lower = _w("B", x0=0, top=120)
        rows = cluster_rows([upper, lower], y_tolerance=3.0)
        texts_by_row = [[w.text for w in r] for r in rows]
        assert texts_by_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Words are supplied out of x-order on purpose.
        placements = (("C", 40), ("A", 0), ("B", 20))
        words = [_w(label, x0=left, top=100) for label, left in placements]
        rows = cluster_rows(words)
        texts = [w.text for w in rows[0]]
        assert texts == ["A", "B", "C"]

    def test_empty(self):
        rows = cluster_rows([])
        assert rows == []
class TestAssignColumns:
    """Pin ``assign_columns``: words are bucketed by x-boundaries and joined per cell."""

    def test_three_columns(self):
        # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
        month = _w("Jan", x0=10, top=0, x1=40)  # col 0
        day = _w("1", x0=45, top=0, x1=55)  # col 0
        memo = _w("Deposit", x0=110, top=0, x1=180)  # col 1
        value = _w("250.00", x0=210, top=0, x1=260)  # col 2
        cells = assign_columns([month, day, memo, value], [100, 200])
        assert (cells[0], cells[1], cells[2]) == ("Jan 1", "Deposit", "250.00")

    def test_no_boundaries_one_column(self):
        # With no boundaries everything lands in a single cell.
        first = _w("A", 0, 0)
        second = _w("B", 20, 0)
        cells = assign_columns([first, second], [])
        assert cells == ["A B"]
class TestPagesInRange:
    """Pin ``_pages_in_range`` for "all", blank, list, closed-range and open-ended specs."""

    def _mk(self, n):
        """Return ``n`` empty pages numbered 1..n."""
        pages = []
        for number in range(1, n + 1):
            pages.append(
                Page(page_no=number, width=600, height=800, text="", words=[])
            )
        return pages

    def test_all(self):
        pages = self._mk(5)
        # "all" and the empty string both select every page.
        for spec in ("all", ""):
            selected = _pages_in_range(pages, spec)
            assert len(selected) == 5

    def test_explicit_list(self):
        pages = self._mk(5)
        selected = _pages_in_range(pages, "1,3,5")
        numbers = [p.page_no for p in selected]
        assert numbers == [1, 3, 5]

    def test_range(self):
        pages = self._mk(5)
        selected = _pages_in_range(pages, "2-4")
        numbers = [p.page_no for p in selected]
        assert numbers == [2, 3, 4]

    def test_open_ended(self):
        pages = self._mk(5)
        # "3-" means page 3 through the last page.
        selected = _pages_in_range(pages, "3-")
        numbers = [p.page_no for p in selected]
        assert numbers == [3, 4, 5]
class TestWithinTableWindow:
    """Pin ``_within_table_window``: rows kept lie after the header and before the end marker."""

    def test_header_skipped_end_excluded(self):
        banner = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)]
        coffee = [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)]
        refund = [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)]
        closing = [_w("Closing", 0, 80), _w("balance", 50, 80)]
        footer = [_w("Page", 0, 100), _w("1", 50, 100)]
        rows = [banner, header, coffee, refund, closing, footer]

        out = _within_table_window(
            rows,
            "Date Description Amount",
            ["Closing balance"],
        )

        # Only the two transaction rows should survive.
        assert len(out) == 2
        assert out[0][0].text == "01/15"
        assert out[1][0].text == "01/16"

    def test_no_header_returns_empty_when_required(self):
        # A header is configured but never appears in the rows.
        rows = [[_w("foo", 0, 0)]]
        out = _within_table_window(rows, "Date Description Amount", [])
        assert out == []

    def test_blank_header_passes_through(self):
        # With no header text and no end markers, rows come back unchanged.
        rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
        out = _within_table_window(rows, "", [])
        assert out == rows
class TestApplyTemplate:
    """End-to-end checks of ``apply_template`` on synthetic ``Page`` objects."""

    def _statement_page(self) -> Page:
        """Return a one-page mock statement with two transactions.

        Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
        """
        title = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20)]
        first_row = [
            _w("01/15/2026", 5, 40),
            _w("Coffee", 105, 40),
            _w("Shop", 140, 40),
            _w("(4.50)", 205, 40),
        ]
        second_row = [
            _w("01/16/2026", 5, 60),
            _w("Refund", 105, 60),
            _w("$12.00", 205, 60),
        ]
        # Continuation row (no date) — should merge into the second row.
        continuation = [_w("from", 105, 80), _w("vendor", 140, 80)]
        end_marker = [
            _w("Closing", 5, 100),
            _w("balance", 105, 100),
            _w("$1,000.00", 205, 100),
        ]
        words = [
            *title,
            *header,
            *first_row,
            *second_row,
            *continuation,
            *end_marker,
        ]
        return Page(page_no=1, width=300, height=120, text="", words=words)

    def _template(self) -> dict:
        """Return a fresh template dict matching ``_statement_page``'s layout."""
        table = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [100, 200],
            "y_tolerance": 3.0,
            "skip_rows_matching": [],
        }
        columns = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        parse = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        return {
            "pages": {"range": "all"},
            "table": table,
            "columns": columns,
            "parse": parse,
        }

    def test_basic_extraction(self):
        pages = [self._statement_page()]
        df = apply_template(pages, self._template())
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 2
        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]

        coffee = df.iloc[0]
        refund = df.iloc[1]
        # Parens-negative
        assert coffee["amount"] == -4.50
        # Plain positive with currency strip
        assert refund["amount"] == 12.00
        # Multi-line description merged
        assert "from vendor" in refund["description"]

    def test_debit_credit_split_columns(self):
        # Layout: date | description | debit | credit columns
        header = [
            _w("Date", 5, 0),
            _w("Desc", 105, 0),
            _w("Debit", 205, 0),
            _w("Credit", 305, 0),
        ]
        debit_row = [_w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20)]
        credit_row = [
            _w("01/16/2026", 5, 40),
            _w("Refund", 105, 40),
            _w("", 205, 40),  # no debit
            _w("12.00", 305, 40),
        ]
        page = Page(
            page_no=1,
            width=400,
            height=80,
            text="",
            words=[*header, *debit_row, *credit_row],
        )
        tpl = {
            "table": {
                "header_text": "Date Desc Debit Credit",
                "column_boundaries": [100, 200, 300],
            },
            "columns": [
                {"source": 0, "target": "date"},
                {"source": 1, "target": "description"},
                {"source": 2, "target": "amount_debit"},
                {"source": 3, "target": "amount_credit"},
            ],
            "parse": {"date_format": "%m/%d/%Y"},
        }

        df = apply_template([page], tpl)

        # Debits come back negative, credits positive, with a matching type label.
        amounts = list(df["amount"])
        kinds = list(df["type"])
        assert amounts == [-4.50, 12.00]
        assert kinds == ["debit", "credit"]

    def test_skip_rows_matching(self):
        tpl = self._template()
        tpl["table"]["skip_rows_matching"] = ["Refund"]
        df = apply_template([self._statement_page()], tpl)
        # Refund row is dropped — only one transaction left
        assert len(df) == 1
        remaining = df.iloc[0]
        assert remaining["amount"] == -4.50

    def test_empty_pages_returns_empty_df(self):
        no_pages = []
        df = apply_template(no_pages, self._template())
        assert df.empty