refactor(pdf): rip out templates; heuristic scan + selectable table
User feedback: the template / visual-picker / mode-dispatch
implementation was too complex for the actual workflow.
Statements drift between months, the canvas state didn't survive
multi-page navigation, and accountants don't want to maintain
per-bank configuration just to convert PDFs to CSV.
Start-over design — one public function, one page, no
persistence:
``scan_pdf_for_transactions(pdf_bytes) → (rows, warnings)``
A row is "any text line with a date pattern AND at least one
amount pattern." Each detected row is a dict shaped::
{
"date": "2026-01-15",
"description": "Coffee Shop",
"amount_1": -4.50,
"amount_2": 1000.00, # if a second amount was found
"page": 1,
"raw": "01/15/2026 Coffee Shop (4.50) 1,000.00",
"source_file": "chase-jan-2026.pdf",
}
Multi-line descriptions still merge (no-date no-amount lines
attach to the previous transaction). Multi-PDF batches share a
single combined table with a ``source_file`` column.
**Page UX:**
- Upload PDF(s) → optional Options expander (parens-negative,
use-OCR) → click Scan → see all detected rows in an
``st.data_editor``.
- The editor has an ``Include`` checkbox column (default on),
plus user-editable date / description / amount cells and a
read-only ``raw`` column showing the original PDF text for
verification.
- A ``Columns to include in CSV`` multiselect hides
``page`` / ``raw`` from the download by default; user can
re-add either.
- Download CSV gets only the checked rows.
No template save/load. No visual picker. No mode dispatch. No
column boundaries. No schema migration. No per-bank
configuration files.
**Deletions:**
- ``src/pdf_templates.py`` — template storage layer
- ``src/gui/_drawable_canvas_compat.py`` — Streamlit compat shim
for the canvas (no canvas now)
- ``tests/test_pdf_templates.py``, ``test_pdf_row_heuristic.py``,
``test_drawable_canvas_compat.py`` — covered the removed APIs
- ``build/hooks/hook-streamlit_drawable_canvas.py`` — hook for
the removed dep
- ``streamlit-drawable-canvas==0.9.3`` from ``requirements.txt``
- The drawable-canvas references in ``build/datatools.spec``
**``src/pdf_extract.py``** shrinks from ~30 helper functions to
~10. Keeps: value parsers, row clusterer, date/amount token
finders, OCR pipeline, dependency guards. The one new public
function ``scan_pdf_for_transactions`` glues them together.
**Tests** (59 passing): the unit layer keeps full coverage of
the building blocks; the smoke layer pins the end-to-end PDF
roundtrip, OCR discovery, dependency-import behavior, and the
multi-line-description merge. The fpdf2-generated fixture PDF
still drives the real-PDF test.
Rollback: ``git revert <this commit's SHA>`` (``git revert HEAD``
only while this is still the tip) brings back the template system
if needed — but the simpler model should make that unlikely.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -58,15 +58,12 @@ hidden_imports += collect_submodules("charset_normalizer")
|
|||||||
hidden_imports += collect_submodules("openpyxl")
|
hidden_imports += collect_submodules("openpyxl")
|
||||||
hidden_imports += collect_submodules("loguru")
|
hidden_imports += collect_submodules("loguru")
|
||||||
|
|
||||||
# PDF Extractor stack. ``streamlit_drawable_canvas`` and
|
# PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook
|
||||||
# ``pypdfium2`` both have their own PyInstaller hooks under
|
# under ``build/hooks/`` that pulls in the native PDFium binary —
|
||||||
# ``build/hooks/`` that pull in the native binary + frontend
|
# keep the ``collect_submodules`` calls here for belt-and-braces.
|
||||||
# assets — keep the ``collect_submodules`` calls here for
|
|
||||||
# belt-and-braces.
|
|
||||||
hidden_imports += collect_submodules("pdfplumber")
|
hidden_imports += collect_submodules("pdfplumber")
|
||||||
hidden_imports += collect_submodules("pdfminer")
|
hidden_imports += collect_submodules("pdfminer")
|
||||||
hidden_imports += collect_submodules("pypdfium2")
|
hidden_imports += collect_submodules("pypdfium2")
|
||||||
hidden_imports += collect_submodules("streamlit_drawable_canvas")
|
|
||||||
hidden_imports += collect_submodules("PIL")
|
hidden_imports += collect_submodules("PIL")
|
||||||
hidden_imports += collect_submodules("pytesseract")
|
hidden_imports += collect_submodules("pytesseract")
|
||||||
|
|
||||||
@@ -91,13 +88,10 @@ datas += collect_data_files("phonenumbers", include_py_files=False)
|
|||||||
|
|
||||||
# PDF Extractor data files. ``pypdfium2`` ships a native PDFium
|
# PDF Extractor data files. ``pypdfium2`` ships a native PDFium
|
||||||
# shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
|
# shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
|
||||||
# dir; ``streamlit-drawable-canvas`` ships a built JS bundle that
|
# dir; ``pdfminer`` ships the Adobe CMap tables it uses for
|
||||||
# Streamlit serves from the package dir at runtime; pdfminer ships
|
# character mapping. The drawable-canvas frontend bundle is gone
|
||||||
# the Adobe CMap tables it uses for character mapping. Hooks
|
# now that the visual picker was removed.
|
||||||
# under ``build/hooks/`` mirror these calls for explicit
|
|
||||||
# documentation and survive ``collect_data_files`` regressions.
|
|
||||||
datas += collect_data_files("pypdfium2", include_py_files=False)
|
datas += collect_data_files("pypdfium2", include_py_files=False)
|
||||||
datas += collect_data_files("streamlit_drawable_canvas")
|
|
||||||
datas += collect_data_files("pdfminer", include_py_files=False)
|
datas += collect_data_files("pdfminer", include_py_files=False)
|
||||||
|
|
||||||
# Our application files. PyInstaller's bundler treats source as code
|
# Our application files. PyInstaller's bundler treats source as code
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
"""PyInstaller hook for streamlit-drawable-canvas.
|
|
||||||
|
|
||||||
Streamlit components are Python packages that also ship a built
|
|
||||||
JavaScript/CSS bundle Streamlit serves from disk at component-
|
|
||||||
render time. Without those assets in the bundle the canvas
|
|
||||||
iframe loads blank — the user sees the page render fine but the
|
|
||||||
visual picker shows no image and no drawing controls.
|
|
||||||
|
|
||||||
``collect_data_files`` covers the frontend bundle directory
|
|
||||||
(named ``frontend`` or ``frontend/build`` depending on the
|
|
||||||
component version). Hidden imports are picked up by the main
|
|
||||||
spec's ``collect_submodules`` call, repeated here for the same
|
|
||||||
belt-and-braces reason as ``hook-pypdfium2.py``.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from PyInstaller.utils.hooks import collect_data_files, collect_submodules
|
|
||||||
|
|
||||||
datas = collect_data_files("streamlit_drawable_canvas")
|
|
||||||
hiddenimports = collect_submodules("streamlit_drawable_canvas")
|
|
||||||
@@ -10,10 +10,14 @@ phonenumbers>=8.13,<9
|
|||||||
streamlit>=1.35,<2
|
streamlit>=1.35,<2
|
||||||
cryptography>=41,<49
|
cryptography>=41,<49
|
||||||
# PDF Extractor stack — pinned to exact tested versions so a future
|
# PDF Extractor stack — pinned to exact tested versions so a future
|
||||||
# upstream release can't change the visual picker's coordinate model
|
# upstream release can't quietly change pdfplumber's word-position
|
||||||
# or pdfplumber's word-position behavior mid-build. Bump these
|
# behavior or pypdfium2's OCR rendering mid-build. Bump these
|
||||||
# explicitly when re-testing against a new release.
|
# explicitly when re-testing against a new release.
|
||||||
|
#
|
||||||
|
# ``pypdfium2`` is here for the OCR fallback path only (rasterizing
|
||||||
|
# pages to images for Tesseract). The drawable-canvas dep was
|
||||||
|
# removed when the visual picker was ripped out — the scanner is
|
||||||
|
# pure heuristic now, no coordinate UI.
|
||||||
pdfplumber==0.11.9
|
pdfplumber==0.11.9
|
||||||
pypdfium2==5.8.0
|
pypdfium2==5.8.0
|
||||||
pytesseract==0.3.13
|
pytesseract==0.3.13
|
||||||
streamlit-drawable-canvas==0.9.3
|
|
||||||
|
|||||||
@@ -1,86 +0,0 @@
|
|||||||
"""Compatibility shim for streamlit-drawable-canvas on modern Streamlit.
|
|
||||||
|
|
||||||
``streamlit-drawable-canvas`` 0.9.3 (last release 2023) calls
|
|
||||||
``streamlit.elements.image.image_to_url(image, width, clamp,
|
|
||||||
channels, output_format, image_id)``. Streamlit ~1.30+ moved this
|
|
||||||
helper out of ``streamlit.elements.image`` and changed its
|
|
||||||
signature so the second positional argument is now a
|
|
||||||
``LayoutConfig`` dataclass instead of a plain ``int`` width.
|
|
||||||
|
|
||||||
The canvas package hasn't been updated, so on modern Streamlit
|
|
||||||
its very first call fails with::
|
|
||||||
|
|
||||||
AttributeError: module 'streamlit.elements.image'
|
|
||||||
has no attribute 'image_to_url'
|
|
||||||
|
|
||||||
This module re-attaches a wrapper at the old import path that
|
|
||||||
adapts the old call shape to the new function. Import it once
|
|
||||||
before any ``st_canvas`` call; idempotent.
|
|
||||||
|
|
||||||
The shim is opt-in (not auto-installed at module import) so the
|
|
||||||
audit log of "I patched a third-party internal" is visible in
|
|
||||||
``grep`` rather than silently happening on every page load.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
# Set once the legacy import path is known to expose ``image_to_url``
# (either natively or through the wrapper below).
_PATCHED = False


def install() -> None:
    """Make ``streamlit.elements.image.image_to_url`` resolvable again.

    Safe to call repeatedly: after the first successful run the
    module-level ``_PATCHED`` flag short-circuits every later call.

    The function returns without patching when:

    * ``streamlit.elements.image`` cannot be imported;
    * the attribute already exists at the legacy location (the flag
      is still set so later calls exit early);
    * the relocated helper or ``LayoutConfig`` cannot be imported
      from ``streamlit.elements.lib``. In that case nothing is
      attached, so the canvas package's own call is what fails.
    """
    global _PATCHED
    if _PATCHED:
        return

    try:
        import streamlit.elements.image as legacy_module
    except ImportError:
        return

    if hasattr(legacy_module, "image_to_url"):
        # Nothing to do — never overwrite an existing implementation.
        _PATCHED = True
        return

    try:
        from streamlit.elements.lib.image_utils import (
            image_to_url as relocated_image_to_url,
        )
        from streamlit.elements.lib.layout_utils import LayoutConfig
    except ImportError:
        # The helper lives somewhere this shim does not know about.
        return

    def _shim(image, width, clamp, channels, output_format, image_id) -> str:
        """Adapt the legacy call shape to the relocated helper.

        ``width`` is accepted for signature compatibility but not
        forwarded; a default-constructed ``LayoutConfig`` is passed
        in its place.
        NOTE(review): presumably the default layout matches the old
        ``width=-1`` "natural width" behavior — confirm against the
        installed Streamlit.
        """
        return relocated_image_to_url(
            image, LayoutConfig(), clamp, channels, output_format, image_id
        )

    legacy_module.image_to_url = _shim
    _PATCHED = True
|
|
||||||
File diff suppressed because it is too large
Load Diff
1218
src/pdf_extract.py
1218
src/pdf_extract.py
File diff suppressed because it is too large
Load Diff
@@ -1,508 +0,0 @@
|
|||||||
"""PDF extract template storage.
|
|
||||||
|
|
||||||
Templates encode "how to read this bank's statements" — page
|
|
||||||
range, table window markers, column x-positions, target field
|
|
||||||
mapping, amount/date parse options. They live as JSON files in
|
|
||||||
``~/.datatools/pdf_templates/`` so an accountant can build one
|
|
||||||
per source and reuse it for every statement that follows the
|
|
||||||
same layout. Templates are portable: the ``export`` / ``import``
|
|
||||||
flow is just a file copy of the JSON.
|
|
||||||
|
|
||||||
The schema is intentionally a plain dict (not a frozen dataclass)
|
|
||||||
because the GUI mutates it incrementally during the build flow.
|
|
||||||
``validate_template`` enforces the contract at save time.
|
|
||||||
|
|
||||||
Schema as of ``schema_version: 1`` (v2 adds ``mode``, ``row_detection``, ``amounts`` and ``date``; see ``new_template``)::
|
|
||||||
|
|
||||||
{
|
|
||||||
"schema_version": 1,
|
|
||||||
"slug": "chase-personal-checking",
|
|
||||||
"name": "Chase Personal Checking",
|
|
||||||
"notes": "",
|
|
||||||
"created_at": "<iso8601>",
|
|
||||||
"updated_at": "<iso8601>",
|
|
||||||
"pages": {
|
|
||||||
"range": "all" | "1-3" | "2,4,6-",
|
|
||||||
"skip_matching": "<regex>"
|
|
||||||
},
|
|
||||||
"table": {
|
|
||||||
"header_text": "<text containing all header words>",
|
|
||||||
"end_markers": ["<regex>", ...],
|
|
||||||
"column_boundaries": [x0, x1, ...],
|
|
||||||
"y_tolerance": 3.0,
|
|
||||||
"skip_rows_matching": ["<regex>", ...]
|
|
||||||
},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
...
|
|
||||||
# ``target`` is one of: date | description | amount |
|
|
||||||
# amount_debit | amount_credit | balance | <free text>
|
|
||||||
],
|
|
||||||
"parse": {
|
|
||||||
"date_format": "%m/%d/%Y",
|
|
||||||
"date_formats": [],
|
|
||||||
"decimal_separator": ".",
|
|
||||||
"thousands_separator": ",",
|
|
||||||
"currency_strip": "$",
|
|
||||||
"amount_negative_in_parens": true,
|
|
||||||
"merge_multiline_description": true
|
|
||||||
},
|
|
||||||
"visual": {
|
|
||||||
"page_width": 612.0,
|
|
||||||
"page_height": 792.0,
|
|
||||||
"sample_page": 1,
|
|
||||||
"table_bbox": [x0, top, x1, bottom] | null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
The ``visual`` block is preserved across save/load so the build
|
|
||||||
UI can round-trip the user's last visual-picker state.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import tempfile
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
|
||||||
SCHEMA_VERSION = 2
|
|
||||||
|
|
||||||
# Backward-compatible versions ``load_template`` will accept.
|
|
||||||
# v1 templates predate the row-heuristic shift and are loaded as
|
|
||||||
# ``mode="column_visual"``; they're not auto-migrated on disk, so
|
|
||||||
# the user keeps their canonical original until they re-save.
|
|
||||||
_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2})
|
|
||||||
|
|
||||||
# Extraction modes. ``row_heuristic`` is the default for new
|
|
||||||
# templates — finds transactions by date+amount pattern matching
|
|
||||||
# with no coordinate dependency. ``column_visual`` is the legacy
|
|
||||||
# x-position-boundary approach, kept for old templates and for
|
|
||||||
# the "Advanced" build-mode fallback when the heuristic misfires.
|
|
||||||
VALID_MODES = frozenset({"row_heuristic", "column_visual"})
|
|
||||||
|
|
||||||
# Amount shapes for row_heuristic mode. The GUI offers these as a
|
|
||||||
# dropdown; the parser uses them to assign amount tokens to fields.
|
|
||||||
VALID_AMOUNT_SHAPES = frozenset({
|
|
||||||
"single",
|
|
||||||
"txn_balance",
|
|
||||||
"debit_credit",
|
|
||||||
"debit_credit_balance",
|
|
||||||
})
|
|
||||||
|
|
||||||
VALID_TARGETS = frozenset({
|
|
||||||
"date",
|
|
||||||
"description",
|
|
||||||
"amount",
|
|
||||||
"amount_debit",
|
|
||||||
"amount_credit",
|
|
||||||
"balance",
|
|
||||||
"type",
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Filesystem layout
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def templates_dir() -> Path:
    """Locate the directory that holds template JSON files.

    Resolution order:

    1. the ``DATATOOLS_PDF_TEMPLATES_DIR`` environment variable, when
       set to a non-empty value;
    2. ``~/.datatools/pdf_templates``;
    3. ``<system temp dir>/datatools-pdf-templates`` if the home
       directory cannot be determined.

    The directory is not created here.
    """
    env_dir = os.environ.get("DATATOOLS_PDF_TEMPLATES_DIR")
    if env_dir:
        return Path(env_dir)

    try:
        home = Path.home()
    except Exception:
        # Best-effort fallback when no home directory is resolvable.
        return Path(tempfile.gettempdir()) / "datatools-pdf-templates"
    return home / ".datatools" / "pdf_templates"
|
|
||||||
|
|
||||||
|
|
||||||
def template_path(slug: str) -> Path:
    """Return the JSON file path for *slug* inside ``templates_dir()``.

    The slug is used verbatim as the file stem; no validation or
    existence check happens here.
    """
    filename = f"{slug}.json"
    return templates_dir() / filename
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Slugify
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
_SLUG_STRIP = re.compile(r"[^a-z0-9]+")

# Upper bound on slug length. Kept in step with the slug pattern that
# template validation enforces (one leading char + up to 63 more), so
# a slug produced here is never rejected for being too long.
_SLUG_MAX_LEN = 64


def slugify(name: str) -> str:
    """Make a filesystem-safe slug from a human-friendly name.

    The name is stripped and lower-cased, every run of characters
    outside ``[a-z0-9]`` collapses to a single hyphen, and leading /
    trailing hyphens are removed. The result is capped at
    ``_SLUG_MAX_LEN`` characters (re-trimming any hyphen exposed by
    the cut). ``None``, empty, or punctuation-only input yields
    ``"untitled"``.
    """
    s = (name or "").strip().lower()
    s = _SLUG_STRIP.sub("-", s).strip("-")
    # Truncate last so the cap applies to the final form; the cut can
    # land right after a hyphen, hence the second trim.
    s = s[:_SLUG_MAX_LEN].rstrip("-")
    return s or "untitled"
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Construction + defaults
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def new_template(name: str) -> dict[str, Any]:
    """Return a fresh template dict populated with default values.

    The slug is derived from *name* via ``slugify``; ``name`` falls
    back to the slug when empty. ``created_at`` and ``updated_at``
    share one UTC ISO-8601 timestamp (second precision). The template
    starts in ``mode="row_heuristic"``; the ``table`` / ``columns`` /
    ``parse`` / ``visual`` sections used by the column-visual mode
    are included as placeholders so every key exists up front.
    """
    slug = slugify(name)
    stamp = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")

    page_section = {
        "range": "all",
        "skip_matching": "",
    }

    # Row-heuristic configuration (the default mode).
    row_detection_section = {
        "min_amounts_per_row": 1,
        "max_amounts_per_row": 3,
        "y_tolerance": 3.0,
        "skip_rows_matching": [],
        "merge_multiline_description": True,
    }
    amounts_section = {
        "shape": "single",
        "negative_in_parens": True,
        "decimal_separator": ".",
        "thousands_separator": ",",
        "currency_strip": "$",
    }
    date_section = {
        "format": "%m/%d/%Y",
        "formats_fallback": [],
    }

    # Column-visual configuration: empty placeholders.
    table_section = {
        "header_text": "",
        "end_markers": [],
        "column_boundaries": [],
        "y_tolerance": 3.0,
        "skip_rows_matching": [],
    }
    parse_section = {
        "date_format": "%m/%d/%Y",
        "date_formats": [],
        "decimal_separator": ".",
        "thousands_separator": ",",
        "currency_strip": "$",
        "amount_negative_in_parens": True,
        "merge_multiline_description": True,
    }
    visual_section = {
        "page_width": 612.0,
        "page_height": 792.0,
        "sample_page": 1,
        "table_bbox": None,
    }

    # Key order is kept stable because it is the order JSON
    # serialization will emit.
    return {
        "schema_version": SCHEMA_VERSION,
        "slug": slug,
        "name": name or slug,
        "notes": "",
        "mode": "row_heuristic",
        "created_at": stamp,
        "updated_at": stamp,
        "pages": page_section,
        "row_detection": row_detection_section,
        "amounts": amounts_section,
        "date": date_section,
        "table": table_section,
        "columns": [],
        "parse": parse_section,
        "visual": visual_section,
    }
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Validation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
    """Check *template* and return ``(ok, errors)``.

    Never raises on malformed input: every problem, including values
    of the wrong JSON type, is reported as a message in ``errors``.
    ``ok`` is ``True`` exactly when ``errors`` is empty.

    Checks performed:

    * ``schema_version`` equals ``SCHEMA_VERSION``;
    * ``name`` is a non-blank string;
    * ``slug`` (or, when absent/empty, the slug derived from the
      name) is 1–64 chars of ``[a-z0-9-]`` starting with ``[a-z0-9]``;
    * ``mode`` is one of ``VALID_MODES``, followed by the checks
      specific to that mode;
    * ``table.column_boundaries`` is a list (all modes).
    """
    if not isinstance(template, dict):
        return False, ["Template must be a JSON object."]

    errors: list[str] = []

    sv = template.get("schema_version")
    if sv != SCHEMA_VERSION:
        errors.append(
            f"Unsupported schema_version {sv!r} (expected {SCHEMA_VERSION})."
        )

    name = template.get("name", "")
    if not isinstance(name, str) or not name.strip():
        errors.append("name is required.")

    # A missing/empty slug falls back to one derived from the name.
    # Non-string names are already reported above, so feed slugify an
    # empty string rather than letting it fail on ``.strip()``.
    slug = template.get("slug") or slugify(name if isinstance(name, str) else "")
    # ``fullmatch`` rather than ``match(...$)``: ``$`` also matches just
    # before a trailing newline, which would let "abc\n" through.
    if not isinstance(slug, str) or not re.fullmatch(
        r"[a-z0-9][a-z0-9-]{0,63}", slug
    ):
        errors.append(
            "slug must be lowercase alphanumeric + hyphens, "
            "1–64 chars, starting with a letter or digit."
        )

    mode = template.get("mode", "row_heuristic")
    # The isinstance guard keeps unhashable values (JSON arrays or
    # objects) from raising inside the frozenset membership test.
    if not isinstance(mode, str) or mode not in VALID_MODES:
        errors.append(
            f"mode {mode!r} must be one of: {sorted(VALID_MODES)}."
        )

    if mode == "row_heuristic":
        _validate_row_heuristic(template, errors)
    elif mode == "column_visual":
        _validate_column_visual(template, errors)

    table = template.get("table") or {}
    if not isinstance(table, dict):
        errors.append("table must be an object.")
    elif not isinstance(table.get("column_boundaries", []), list):
        errors.append("table.column_boundaries must be a list.")

    return (not errors), errors


def _validate_row_heuristic(template: dict[str, Any], errors: list[str]) -> None:
    """Append row-heuristic problems (amount shape, amount counts) to *errors*."""
    amounts = template.get("amounts") or {}
    if not isinstance(amounts, dict):
        errors.append("amounts must be an object.")
    else:
        shape = amounts.get("shape", "single")
        if not isinstance(shape, str) or shape not in VALID_AMOUNT_SHAPES:
            errors.append(
                f"amounts.shape {shape!r} must be one of: "
                f"{sorted(VALID_AMOUNT_SHAPES)}."
            )

    rd = template.get("row_detection") or {}
    if not isinstance(rd, dict):
        errors.append("row_detection must be an object.")
        return
    min_a = rd.get("min_amounts_per_row", 1)
    max_a = rd.get("max_amounts_per_row", 3)
    if not (isinstance(min_a, int) and isinstance(max_a, int)):
        errors.append(
            "row_detection.min_amounts_per_row and "
            "max_amounts_per_row must be integers."
        )
    elif min_a < 1 or max_a < min_a:
        errors.append(
            "row_detection.min_amounts_per_row must be ≥1 and ≤ "
            "max_amounts_per_row."
        )


def _validate_column_visual(template: dict[str, Any], errors: list[str]) -> None:
    """Append column-visual problems (column list and required targets) to *errors*."""
    columns = template.get("columns", [])
    if not isinstance(columns, list) or len(columns) < 2:
        errors.append(
            "column_visual mode: at least two output columns "
            "are required."
        )
        return

    seen_targets: list[str] = []
    for i, col in enumerate(columns):
        if not isinstance(col, dict):
            errors.append(f"columns[{i}] must be an object.")
            continue
        src = col.get("source")
        tgt = col.get("target")
        if not isinstance(src, int) or src < 0:
            errors.append(
                f"columns[{i}].source must be a non-negative integer."
            )
        if not isinstance(tgt, str) or not tgt:
            errors.append(
                f"columns[{i}].target must be a non-empty string."
            )
        else:
            seen_targets.append(tgt)

    if "date" not in seen_targets:
        errors.append(
            "column_visual mode: at least one column must map "
            "to 'date'."
        )
    # Either a single signed amount column or a debit/credit pair.
    has_split_amounts = (
        "amount_debit" in seen_targets and "amount_credit" in seen_targets
    )
    if "amount" not in seen_targets and not has_split_amounts:
        errors.append(
            "column_visual mode: either an 'amount' column or "
            "both 'amount_debit' + 'amount_credit' columns "
            "are required."
        )
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Persistence
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _atomic_write(path: Path, payload: str) -> None:
|
|
||||||
"""Write *payload* to *path* via a temp file + rename.
|
|
||||||
|
|
||||||
Avoids leaving a half-written JSON if the process dies mid-save —
|
|
||||||
the GUI saves on every visual-picker change, and a corrupt
|
|
||||||
template file would be hostile to recover from.
|
|
||||||
"""
|
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
fd, tmp_path = tempfile.mkstemp(
|
|
||||||
prefix=f".{path.name}.",
|
|
||||||
suffix=".tmp",
|
|
||||||
dir=str(path.parent),
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
|
||||||
f.write(payload)
|
|
||||||
os.replace(tmp_path, path)
|
|
||||||
except Exception:
|
|
||||||
try:
|
|
||||||
os.unlink(tmp_path)
|
|
||||||
except FileNotFoundError:
|
|
||||||
pass
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def save_template(template: dict[str, Any]) -> str:
    """Persist *template* to disk; return the slug it was saved as.

    The input dict is not mutated — a shallow copy is stamped with a
    fresh UTC ``updated_at`` and written atomically (temp file +
    rename) to ``template_path(slug)``.

    When the template has no ``slug`` (or an empty one), the slug is
    derived from ``name`` — the same fallback validation applies — and
    stored in the saved copy, so a template that validates never fails
    here with ``KeyError``.

    Raises ``ValueError`` with a newline-joined error list if
    validation fails.
    """
    ok, errors = validate_template(template)
    if not ok:
        raise ValueError("\n".join(errors))

    template = dict(template)
    # Validation has already established that ``name`` is a non-blank
    # string, so the fallback is safe.
    slug = template.get("slug") or slugify(template["name"])
    template["slug"] = slug
    template["updated_at"] = datetime.now(tz=timezone.utc).isoformat(
        timespec="seconds"
    )
    payload = json.dumps(template, indent=2, ensure_ascii=False)
    _atomic_write(template_path(slug), payload)
    return slug
|
|
||||||
|
|
||||||
|
|
||||||
def load_template(slug: str) -> dict[str, Any]:
    """Read and return the template stored under *slug*.

    Templates whose ``schema_version`` is in
    ``_LOAD_SUPPORTED_VERSIONS`` are accepted and passed through
    ``_migrate_to_current``; the file on disk is never rewritten here.

    Raises:
        FileNotFoundError: no file exists for *slug*.
        ValueError: the file is not valid JSON, its top-level value is
            not an object, or its ``schema_version`` is unsupported.
    """
    # A missing file surfaces as FileNotFoundError straight from here.
    raw = template_path(slug).read_text(encoding="utf-8")
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"Corrupt template {slug!r}: {e}") from e
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a bare list): report it as
        # corruption instead of failing on ``.get`` below.
        raise ValueError(
            f"Corrupt template {slug!r}: top-level JSON must be an object."
        )

    sv = data.get("schema_version")
    # JSON arrays/objects are unhashable and would raise TypeError in
    # the frozenset membership test, so reject them explicitly.
    if isinstance(sv, (list, dict)) or sv not in _LOAD_SUPPORTED_VERSIONS:
        raise ValueError(
            f"Template {slug!r} has unsupported schema_version {sv!r}; "
            f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}."
        )
    return _migrate_to_current(data)
|
|
||||||
|
|
||||||
|
|
||||||
def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]:
    """Upgrade an older template dict to the current schema, in memory.

    A ``schema_version`` of 1 yields a shallow copy stamped with
    ``SCHEMA_VERSION`` and given ``mode="column_visual"`` unless a
    ``mode`` key is already present. Any other input is returned
    unchanged (the same object, not a copy).
    """
    if data.get("schema_version") != 1:
        return data

    upgraded = {**data, "schema_version": SCHEMA_VERSION}
    upgraded.setdefault("mode", "column_visual")
    return upgraded
|
|
||||||
|
|
||||||
|
|
||||||
def delete_template(slug: str) -> bool:
    """Delete the template file for *slug*.

    Returns ``True`` when a file was removed and ``False`` when there
    was nothing to remove. Other filesystem errors propagate.
    """
    target = template_path(slug)
    try:
        target.unlink()
    except FileNotFoundError:
        return False
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def list_templates() -> list[dict[str, Any]]:
    """Return ``{slug, name, updated_at, notes}`` summaries, newest first.

    Every ``*.json`` file in ``templates_dir()`` is read. Files that
    cannot be read or parsed, or whose top-level JSON value is not an
    object, are skipped silently — this function does not report them.
    Missing ``slug`` / ``name`` fall back to the file stem.

    Entries are ordered by ``updated_at`` descending, with ``name``
    standing in when ``updated_at`` is empty. Returns ``[]`` when the
    directory does not exist.
    """
    d = templates_dir()
    if not d.exists():
        return []

    out: list[dict[str, Any]] = []
    for p in sorted(d.glob("*.json")):
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            # Deliberately broad: one unreadable or corrupt file must
            # not take the whole listing down.
            continue
        if not isinstance(data, dict):
            continue
        out.append({
            "slug": data.get("slug") or p.stem,
            "name": data.get("name") or p.stem,
            "updated_at": data.get("updated_at", ""),
            "notes": data.get("notes", ""),
        })

    # Hand-edited files may carry non-string values (e.g. a numeric
    # ``updated_at``); comparing those against strings raises TypeError.
    # Coerce inside the key only, so returned values are untouched.
    out.sort(key=lambda r: str(r["updated_at"] or r["name"]), reverse=True)
    return out
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Import / export
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def template_to_json(template: dict[str, Any]) -> str:
    """Render *template* as a JSON string.

    Output uses two-space indentation and keeps non-ASCII characters
    as-is rather than escaping them.
    """
    rendered = json.dumps(template, ensure_ascii=False, indent=2)
    return rendered
|
|
||||||
|
|
||||||
|
|
||||||
def template_from_json(payload: str) -> dict[str, Any]:
    """Deserialize uploaded template JSON without saving it.

    Accepts the same schema versions as ``load_template``
    (``_LOAD_SUPPORTED_VERSIONS``) and applies the same in-memory
    migration, so a template file that loads from disk can also be
    imported. Only the schema version is checked here; full validation
    is left to ``validate_template`` / ``save_template``.

    Raises ``ValueError`` when the payload is not valid JSON, is not a
    JSON object, or carries an unsupported ``schema_version``.
    """
    try:
        data = json.loads(payload)
    except json.JSONDecodeError as e:
        raise ValueError(f"Not valid JSON: {e}") from e
    if not isinstance(data, dict):
        raise ValueError("Top-level JSON must be an object.")

    sv = data.get("schema_version")
    # JSON arrays/objects are unhashable and would raise TypeError in
    # the frozenset membership test, so reject them explicitly.
    if isinstance(sv, (list, dict)) or sv not in _LOAD_SUPPORTED_VERSIONS:
        raise ValueError(
            f"Imported template has schema_version {sv!r}; "
            f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}."
        )
    return _migrate_to_current(data)
|
|
||||||
|
|
||||||
|
|
||||||
# Public API of this module. Every non-underscore constant is listed,
# including the mode / amount-shape sets alongside ``VALID_TARGETS``.
__all__ = [
    "SCHEMA_VERSION",
    "VALID_AMOUNT_SHAPES",
    "VALID_MODES",
    "VALID_TARGETS",
    "delete_template",
    "list_templates",
    "load_template",
    "new_template",
    "save_template",
    "slugify",
    "template_from_json",
    "template_path",
    "template_to_json",
    "templates_dir",
    "validate_template",
]
|
|
||||||
@@ -1,116 +0,0 @@
|
|||||||
"""Tests for the streamlit-drawable-canvas compatibility shim.
|
|
||||||
|
|
||||||
The shim re-attaches ``image_to_url`` to ``streamlit.elements.image``
|
|
||||||
on modern Streamlit where the helper was relocated to
|
|
||||||
``streamlit.elements.lib.image_utils`` and given a new signature
|
|
||||||
(takes a ``LayoutConfig`` dataclass instead of a plain ``int``
|
|
||||||
width).
|
|
||||||
|
|
||||||
If this test ever fails on a Streamlit upgrade, it almost
|
|
||||||
certainly means the ``image_to_url`` function moved AGAIN — the
|
|
||||||
shim then returns without patching (there is no fallback message). Update
|
|
||||||
``_drawable_canvas_compat.py`` to find the new location.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import types
|
|
||||||
|
|
||||||
|
|
||||||
def test_shim_attaches_image_to_url():
|
|
||||||
"""After ``install()`` the old import path resolves to a
|
|
||||||
callable, even on modern Streamlit where the original was
|
|
||||||
relocated."""
|
|
||||||
# Force a fresh import so the module-level _PATCHED guard
|
|
||||||
# doesn't short-circuit between tests.
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
from src.gui._drawable_canvas_compat import install
|
|
||||||
install()
|
|
||||||
import streamlit.elements.image as old_loc
|
|
||||||
assert hasattr(old_loc, "image_to_url")
|
|
||||||
assert callable(old_loc.image_to_url)
|
|
||||||
|
|
||||||
|
|
||||||
def test_shim_is_idempotent():
|
|
||||||
"""Calling ``install()`` twice doesn't double-wrap or break
|
|
||||||
anything — important because the page module imports + calls
|
|
||||||
it once, and a Streamlit script-rerun re-executes the page
|
|
||||||
module top-to-bottom."""
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
from src.gui._drawable_canvas_compat import install
|
|
||||||
install()
|
|
||||||
import streamlit.elements.image as old_loc
|
|
||||||
first = old_loc.image_to_url
|
|
||||||
install()
|
|
||||||
second = old_loc.image_to_url
|
|
||||||
assert first is second
|
|
||||||
|
|
||||||
|
|
||||||
def test_shim_no_op_when_image_to_url_already_present():
|
|
||||||
"""If a future Streamlit restores ``image_to_url`` at the old
|
|
||||||
location, the shim must not overwrite it — leave the upstream
|
|
||||||
function in place so the canvas package gets the official
|
|
||||||
version, not our compatibility wrapper."""
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
import streamlit.elements.image as old_loc
|
|
||||||
|
|
||||||
sentinel = lambda *a, **kw: "sentinel-url" # noqa: E731
|
|
||||||
old_loc.image_to_url = sentinel
|
|
||||||
try:
|
|
||||||
from src.gui._drawable_canvas_compat import install
|
|
||||||
install()
|
|
||||||
assert old_loc.image_to_url is sentinel, (
|
|
||||||
"Shim must not clobber an existing image_to_url."
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
# Tidy up so subsequent tests see a clean module.
|
|
||||||
delattr(old_loc, "image_to_url")
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
|
|
||||||
|
|
||||||
def test_shim_calls_new_function_with_layout_config():
|
|
||||||
"""The shim's wrapper must translate the old ``(image, width,
|
|
||||||
clamp, channels, output_format, image_id)`` call into the new
|
|
||||||
``(image, layout_config, …)`` signature without breaking."""
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
import streamlit.elements.image as old_loc
|
|
||||||
if hasattr(old_loc, "image_to_url"):
|
|
||||||
delattr(old_loc, "image_to_url")
|
|
||||||
|
|
||||||
# Replace the new function with a recorder so we can inspect
|
|
||||||
# what arguments the shim passed through.
|
|
||||||
from streamlit.elements.lib import image_utils
|
|
||||||
captured: dict = {}
|
|
||||||
original = image_utils.image_to_url
|
|
||||||
|
|
||||||
def recorder(image, layout_config, clamp, channels, output_format, image_id):
|
|
||||||
captured["image"] = image
|
|
||||||
captured["layout_config"] = layout_config
|
|
||||||
captured["clamp"] = clamp
|
|
||||||
captured["channels"] = channels
|
|
||||||
captured["output_format"] = output_format
|
|
||||||
captured["image_id"] = image_id
|
|
||||||
return "fake-url"
|
|
||||||
|
|
||||||
image_utils.image_to_url = recorder
|
|
||||||
try:
|
|
||||||
from src.gui._drawable_canvas_compat import install
|
|
||||||
install()
|
|
||||||
result = old_loc.image_to_url(
|
|
||||||
"fake-image", -1, False, "RGB", "PNG", "test-id",
|
|
||||||
)
|
|
||||||
assert result == "fake-url"
|
|
||||||
assert captured["image"] == "fake-image"
|
|
||||||
assert captured["clamp"] is False
|
|
||||||
assert captured["channels"] == "RGB"
|
|
||||||
assert captured["output_format"] == "PNG"
|
|
||||||
assert captured["image_id"] == "test-id"
|
|
||||||
# The shim wraps the int width into a LayoutConfig.
|
|
||||||
from streamlit.elements.lib.layout_utils import LayoutConfig
|
|
||||||
assert isinstance(captured["layout_config"], LayoutConfig)
|
|
||||||
finally:
|
|
||||||
image_utils.image_to_url = original
|
|
||||||
if hasattr(old_loc, "image_to_url"):
|
|
||||||
delattr(old_loc, "image_to_url")
|
|
||||||
sys.modules.pop("src.gui._drawable_canvas_compat", None)
|
|
||||||
@@ -1,36 +1,33 @@
|
|||||||
"""Tests for the pure PDF-extraction pipeline.
|
"""Tests for the minimal PDF transaction scanner.
|
||||||
|
|
||||||
Real PDF parsing (``extract_pages``) is a thin wrapper around
|
The public API is one function: ``scan_pdf_for_transactions``.
|
||||||
``pdfplumber`` and is exercised by hand on real bank statements.
|
These tests cover the value-parsing helpers, the row clusterer,
|
||||||
These tests pin the meaty bits — value parsing, row clustering,
|
and the date/amount token finders, using synthetic
|
||||||
column assignment, template-driven extraction — against synthetic
|
``WordBox`` data with no real PDF involved.
|
||||||
``WordBox`` data so they run fast and have no PDF dependency.
|
|
||||||
|
End-to-end-on-a-real-PDF coverage lives in
|
||||||
|
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
|
||||||
|
a fixture statement at test time.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from src.pdf_extract import (
|
from src.pdf_extract import (
|
||||||
Page,
|
Page,
|
||||||
WordBox,
|
WordBox,
|
||||||
apply_template,
|
_find_amount_tokens,
|
||||||
assign_columns,
|
_find_dates_in_words,
|
||||||
cluster_rows,
|
cluster_rows,
|
||||||
parse_amount,
|
parse_amount,
|
||||||
parse_date,
|
parse_date,
|
||||||
_pages_in_range,
|
|
||||||
_within_table_window,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
|
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
|
||||||
"""Convenience constructor — heights and exact x1 don't matter
|
|
||||||
for the tests we write."""
|
|
||||||
return WordBox(
|
return WordBox(
|
||||||
x0=x0,
|
x0=x0,
|
||||||
top=top,
|
top=top,
|
||||||
x1=x1 if x1 is not None else x0 + 10 * len(text),
|
x1=x1 if x1 is not None else x0 + 8 * len(text),
|
||||||
bottom=top + 10,
|
bottom=top + 10,
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
@@ -61,13 +58,18 @@ class TestParseAmount:
|
|||||||
assert parse_amount("not a number") is None
|
assert parse_amount("not a number") is None
|
||||||
|
|
||||||
def test_european_decimal(self):
|
def test_european_decimal(self):
|
||||||
opts = {
|
assert parse_amount(
|
||||||
"decimal_separator": ",",
|
"€1.234,56",
|
||||||
"thousands_separator": ".",
|
decimal=",",
|
||||||
"currency_strip": "€",
|
thousands=".",
|
||||||
"negative_in_parens": True,
|
currency_strip="€",
|
||||||
}
|
) == 1234.56
|
||||||
assert parse_amount("€1.234,56", opts) == 1234.56
|
|
||||||
|
def test_parens_off_disables_paren_negative(self):
|
||||||
|
# With parens off, (4.50) won't be treated as negative —
|
||||||
|
# but it also won't parse cleanly since "(4.50)" isn't a
|
||||||
|
# plain number. Verify the off-path is non-flipping.
|
||||||
|
assert parse_amount("(4.50)", negative_in_parens=False) is None
|
||||||
|
|
||||||
|
|
||||||
class TestParseDate:
|
class TestParseDate:
|
||||||
@@ -78,7 +80,7 @@ class TestParseDate:
|
|||||||
assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15"
|
assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15"
|
||||||
|
|
||||||
def test_fallback_format(self):
|
def test_fallback_format(self):
|
||||||
# Not in the supplied list — should still parse via fallback.
|
# Not in supplied list — should still parse via fallback.
|
||||||
assert parse_date("01/15/26") == "2026-01-15"
|
assert parse_date("01/15/26") == "2026-01-15"
|
||||||
|
|
||||||
def test_invalid(self):
|
def test_invalid(self):
|
||||||
@@ -88,199 +90,74 @@ class TestParseDate:
|
|||||||
class TestClusterRows:
|
class TestClusterRows:
|
||||||
def test_groups_close_y(self):
|
def test_groups_close_y(self):
|
||||||
words = [
|
words = [
|
||||||
_w("A", x0=0, top=100),
|
_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102),
|
||||||
_w("B", x0=20, top=101),
|
|
||||||
_w("C", x0=40, top=102),
|
|
||||||
]
|
]
|
||||||
rows = cluster_rows(words, y_tolerance=3.0)
|
rows = cluster_rows(words)
|
||||||
assert len(rows) == 1
|
assert len(rows) == 1
|
||||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
||||||
|
|
||||||
def test_separates_far_y(self):
|
def test_separates_far_y(self):
|
||||||
words = [
|
words = [_w("A", 0, 100), _w("B", 0, 120)]
|
||||||
_w("A", x0=0, top=100),
|
assert [
|
||||||
_w("B", x0=0, top=120),
|
[w.text for w in r] for r in cluster_rows(words)
|
||||||
]
|
] == [["A"], ["B"]]
|
||||||
rows = cluster_rows(words, y_tolerance=3.0)
|
|
||||||
assert [[w.text for w in r] for r in rows] == [["A"], ["B"]]
|
|
||||||
|
|
||||||
def test_sorts_left_to_right_within_row(self):
|
def test_sorts_left_to_right_within_row(self):
|
||||||
words = [
|
words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
|
||||||
_w("C", x0=40, top=100),
|
assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"]
|
||||||
_w("A", x0=0, top=100),
|
|
||||||
_w("B", x0=20, top=100),
|
|
||||||
]
|
|
||||||
rows = cluster_rows(words)
|
|
||||||
assert [w.text for w in rows[0]] == ["A", "B", "C"]
|
|
||||||
|
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
assert cluster_rows([]) == []
|
assert cluster_rows([]) == []
|
||||||
|
|
||||||
|
|
||||||
class TestAssignColumns:
|
class TestFindDatesInWords:
|
||||||
def test_three_columns(self):
|
def test_us_slash(self):
|
||||||
# boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
|
row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
|
||||||
row = [
|
assert _find_dates_in_words(row) == [(0, "01/15/2026")]
|
||||||
_w("Jan", x0=10, top=0, x1=40), # col 0
|
|
||||||
_w("1", x0=45, top=0, x1=55), # col 0
|
|
||||||
_w("Deposit", x0=110, top=0, x1=180), # col 1
|
|
||||||
_w("250.00", x0=210, top=0, x1=260), # col 2
|
|
||||||
]
|
|
||||||
cells = assign_columns(row, [100, 200])
|
|
||||||
assert cells[0] == "Jan 1"
|
|
||||||
assert cells[1] == "Deposit"
|
|
||||||
assert cells[2] == "250.00"
|
|
||||||
|
|
||||||
def test_no_boundaries_one_column(self):
|
def test_two_digit_year(self):
|
||||||
row = [_w("A", 0, 0), _w("B", 20, 0)]
|
row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
|
||||||
cells = assign_columns(row, [])
|
result = _find_dates_in_words(row)
|
||||||
assert cells == ["A B"]
|
assert result and result[0][1] == "01/15/26"
|
||||||
|
|
||||||
|
def test_iso(self):
|
||||||
|
row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
|
||||||
|
assert _find_dates_in_words(row) == [(0, "2026-01-15")]
|
||||||
|
|
||||||
|
def test_month_name(self):
|
||||||
|
row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
|
||||||
|
result = _find_dates_in_words(row)
|
||||||
|
assert result and "Jan 15" in result[0][1]
|
||||||
|
|
||||||
|
def test_no_date(self):
|
||||||
|
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||||
|
assert _find_dates_in_words(row) == []
|
||||||
|
|
||||||
|
|
||||||
class TestPagesInRange:
|
class TestFindAmountTokens:
|
||||||
def _mk(self, n):
|
def test_currency_format(self):
|
||||||
return [Page(page_no=i + 1, width=600, height=800, text="", words=[]) for i in range(n)]
|
row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
|
||||||
|
out = _find_amount_tokens(row)
|
||||||
|
assert len(out) == 1
|
||||||
|
assert out[0][2] == "$4.50"
|
||||||
|
|
||||||
def test_all(self):
|
def test_parens_negative(self):
|
||||||
pages = self._mk(5)
|
row = [_w("(123.45)", 0, 0)]
|
||||||
assert len(_pages_in_range(pages, "all")) == 5
|
out = _find_amount_tokens(row)
|
||||||
assert len(_pages_in_range(pages, "")) == 5
|
assert out and out[0][2] == "(123.45)"
|
||||||
|
|
||||||
def test_explicit_list(self):
|
def test_no_amount_on_pure_text(self):
|
||||||
pages = self._mk(5)
|
row = [_w("Hello", 0, 0), _w("World", 50, 0)]
|
||||||
got = [p.page_no for p in _pages_in_range(pages, "1,3,5")]
|
assert _find_amount_tokens(row) == []
|
||||||
assert got == [1, 3, 5]
|
|
||||||
|
|
||||||
def test_range(self):
|
def test_rejects_bare_year(self):
|
||||||
pages = self._mk(5)
|
# A bare 4-digit year matches the digit pattern but lacks
|
||||||
got = [p.page_no for p in _pages_in_range(pages, "2-4")]
|
# any money marker — should be filtered out.
|
||||||
assert got == [2, 3, 4]
|
row = [_w("2026", 0, 0)]
|
||||||
|
assert _find_amount_tokens(row) == []
|
||||||
def test_open_ended(self):
|
|
||||||
pages = self._mk(5)
|
|
||||||
got = [p.page_no for p in _pages_in_range(pages, "3-")]
|
|
||||||
assert got == [3, 4, 5]
|
|
||||||
|
|
||||||
|
|
||||||
class TestWithinTableWindow:
|
# End-to-end tests against synthetic Page objects are in the smoke
|
||||||
def test_header_skipped_end_excluded(self):
|
# test module — they need ``scan_pdf_for_transactions`` which in
|
||||||
rows = [
|
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
||||||
[_w("STATEMENT", 0, 0)],
|
# the building blocks; smoke tests pin the wiring.
|
||||||
[_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)],
|
|
||||||
[_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)],
|
|
||||||
[_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)],
|
|
||||||
[_w("Closing", 0, 80), _w("balance", 50, 80)],
|
|
||||||
[_w("Page", 0, 100), _w("1", 50, 100)],
|
|
||||||
]
|
|
||||||
out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])
|
|
||||||
# Should keep just the two transaction rows.
|
|
||||||
assert len(out) == 2
|
|
||||||
assert out[0][0].text == "01/15"
|
|
||||||
assert out[1][0].text == "01/16"
|
|
||||||
|
|
||||||
def test_no_header_returns_empty_when_required(self):
|
|
||||||
rows = [[_w("foo", 0, 0)]]
|
|
||||||
assert _within_table_window(rows, "Date Description Amount", []) == []
|
|
||||||
|
|
||||||
def test_blank_header_passes_through(self):
|
|
||||||
rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
|
|
||||||
assert _within_table_window(rows, "", []) == rows
|
|
||||||
|
|
||||||
|
|
||||||
class TestApplyTemplate:
|
|
||||||
"""End-to-end on synthetic ``Page`` objects."""
|
|
||||||
|
|
||||||
def _statement_page(self) -> Page:
|
|
||||||
# Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+.
|
|
||||||
words = [
|
|
||||||
_w("STATEMENT", 0, 0),
|
|
||||||
# Header
|
|
||||||
_w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20),
|
|
||||||
# Row 1
|
|
||||||
_w("01/15/2026", 5, 40), _w("Coffee", 105, 40),
|
|
||||||
_w("Shop", 140, 40), _w("(4.50)", 205, 40),
|
|
||||||
# Row 2
|
|
||||||
_w("01/16/2026", 5, 60), _w("Refund", 105, 60), _w("$12.00", 205, 60),
|
|
||||||
# Continuation row (no date) — should merge into row 2
|
|
||||||
_w("from", 105, 80), _w("vendor", 140, 80),
|
|
||||||
# End marker
|
|
||||||
_w("Closing", 5, 100), _w("balance", 105, 100), _w("$1,000.00", 205, 100),
|
|
||||||
]
|
|
||||||
return Page(page_no=1, width=300, height=120, text="", words=words)
|
|
||||||
|
|
||||||
def _template(self) -> dict:
|
|
||||||
return {
|
|
||||||
"pages": {"range": "all"},
|
|
||||||
"table": {
|
|
||||||
"header_text": "Date Description Amount",
|
|
||||||
"end_markers": ["Closing balance"],
|
|
||||||
"column_boundaries": [100, 200],
|
|
||||||
"y_tolerance": 3.0,
|
|
||||||
"skip_rows_matching": [],
|
|
||||||
},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount"},
|
|
||||||
],
|
|
||||||
"parse": {
|
|
||||||
"date_format": "%m/%d/%Y",
|
|
||||||
"amount_negative_in_parens": True,
|
|
||||||
"merge_multiline_description": True,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_basic_extraction(self):
|
|
||||||
df = apply_template([self._statement_page()], self._template())
|
|
||||||
assert isinstance(df, pd.DataFrame)
|
|
||||||
assert len(df) == 2
|
|
||||||
assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
|
|
||||||
# Parens-negative
|
|
||||||
assert df.iloc[0]["amount"] == -4.50
|
|
||||||
# Plain positive with currency strip
|
|
||||||
assert df.iloc[1]["amount"] == 12.00
|
|
||||||
# Multi-line description merged
|
|
||||||
assert "from vendor" in df.iloc[1]["description"]
|
|
||||||
|
|
||||||
def test_debit_credit_split_columns(self):
|
|
||||||
# Layout: date | description | debit | credit columns
|
|
||||||
page = Page(
|
|
||||||
page_no=1, width=400, height=80, text="",
|
|
||||||
words=[
|
|
||||||
_w("Date", 5, 0), _w("Desc", 105, 0),
|
|
||||||
_w("Debit", 205, 0), _w("Credit", 305, 0),
|
|
||||||
_w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20),
|
|
||||||
_w("01/16/2026", 5, 40), _w("Refund", 105, 40),
|
|
||||||
_w("", 205, 40), # no debit
|
|
||||||
_w("12.00", 305, 40),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
tpl = {
|
|
||||||
"table": {
|
|
||||||
"header_text": "Date Desc Debit Credit",
|
|
||||||
"column_boundaries": [100, 200, 300],
|
|
||||||
},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount_debit"},
|
|
||||||
{"source": 3, "target": "amount_credit"},
|
|
||||||
],
|
|
||||||
"parse": {"date_format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template([page], tpl)
|
|
||||||
assert list(df["amount"]) == [-4.50, 12.00]
|
|
||||||
assert list(df["type"]) == ["debit", "credit"]
|
|
||||||
|
|
||||||
def test_skip_rows_matching(self):
|
|
||||||
page = self._statement_page()
|
|
||||||
tpl = self._template()
|
|
||||||
tpl["table"]["skip_rows_matching"] = ["Refund"]
|
|
||||||
df = apply_template([page], tpl)
|
|
||||||
# Refund row is dropped — only one transaction left
|
|
||||||
assert len(df) == 1
|
|
||||||
assert df.iloc[0]["amount"] == -4.50
|
|
||||||
|
|
||||||
def test_empty_pages_returns_empty_df(self):
|
|
||||||
df = apply_template([], self._template())
|
|
||||||
assert df.empty
|
|
||||||
|
|||||||
@@ -1,55 +1,43 @@
|
|||||||
"""End-to-end smoke tests for the PDF extraction stack.
|
"""End-to-end smoke tests for the PDF transaction scanner.
|
||||||
|
|
||||||
These tests run real ``pdfplumber`` + ``pypdfium2`` calls against
|
These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play)
|
||||||
a small PDF generated in-memory with ``fpdf2``. They exist to
|
calls against a small statement-shaped PDF generated in memory
|
||||||
catch the failure mode the user hit on first install — a missing
|
with ``fpdf2``. They catch the failure modes most likely to bite
|
||||||
or mismatched native dependency that doesn't show up until the
|
an end-user installer build: missing native lib, broken hook
|
||||||
extractor actually tries to open a PDF.
|
bundling, pin/installed mismatch.
|
||||||
|
|
||||||
Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py``
|
Generation note: ``fpdf2`` is a test-only dep in
|
||||||
covers the parsing logic on synthetic ``WordBox`` data with no
|
|
||||||
PDF dep involved. This file is the layer above: it confirms the
|
|
||||||
deps themselves work, that hooks bundled them correctly (the
|
|
||||||
versions pinned in ``requirements.txt`` matter here), and that
|
|
||||||
the extractor's pipeline survives a round-trip through real
|
|
||||||
``pdfplumber.extract_words`` and real ``pypdfium2.render``.
|
|
||||||
|
|
||||||
Generation note: ``fpdf2`` is a test-only dep listed in
|
|
||||||
``requirements-dev.txt``. We don't ship it.
|
``requirements-dev.txt``. We don't ship it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def _build_tiny_statement_pdf() -> bytes:
|
def _build_tiny_statement_pdf() -> bytes:
|
||||||
"""Render a one-page PDF that looks roughly like the simplest
|
"""One-page PDF: header line + three transaction rows + a
|
||||||
possible bank statement: a header line + three transaction
|
closing-balance footer. The scanner should pick up exactly the
|
||||||
rows + a closing-balance footer. Word positions are stable
|
three transactions."""
|
||||||
enough that the parser can identify columns by x-position."""
|
|
||||||
from fpdf import FPDF
|
from fpdf import FPDF
|
||||||
|
|
||||||
pdf = FPDF(orientation="P", unit="pt", format="letter")
|
pdf = FPDF(orientation="P", unit="pt", format="letter")
|
||||||
pdf.add_page()
|
pdf.add_page()
|
||||||
pdf.set_font("Helvetica", size=12)
|
pdf.set_font("Helvetica", size=12)
|
||||||
# Header
|
|
||||||
pdf.set_xy(40, 50)
|
pdf.set_xy(40, 50)
|
||||||
pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
|
pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT")
|
||||||
# Transaction-table header row
|
# Header row (not a transaction — no amount)
|
||||||
pdf.set_xy(40, 100)
|
pdf.set_xy(40, 100)
|
||||||
pdf.cell(120, 14, "Date")
|
pdf.cell(120, 14, "Date")
|
||||||
pdf.set_xy(160, 100)
|
pdf.set_xy(160, 100)
|
||||||
pdf.cell(200, 14, "Description")
|
pdf.cell(200, 14, "Description")
|
||||||
pdf.set_xy(360, 100)
|
pdf.set_xy(360, 100)
|
||||||
pdf.cell(80, 14, "Amount")
|
pdf.cell(80, 14, "Amount")
|
||||||
# Three rows
|
# Three transactions
|
||||||
rows = [
|
rows = [
|
||||||
("01/15/2026", "Coffee Shop", "(4.50)"),
|
("01/15/2026", "Coffee Shop", "(4.50)"),
|
||||||
("01/16/2026", "Refund Vendor", "$12.00"),
|
("01/16/2026", "Refund Vendor", "$12.00"),
|
||||||
("01/17/2026", "ATM Withdrawal","(40.00)"),
|
("01/17/2026", "ATM Withdrawal", "(40.00)"),
|
||||||
]
|
]
|
||||||
y = 130
|
y = 130
|
||||||
for date, desc, amt in rows:
|
for date, desc, amt in rows:
|
||||||
@@ -60,7 +48,7 @@ def _build_tiny_statement_pdf() -> bytes:
|
|||||||
pdf.set_xy(360, y)
|
pdf.set_xy(360, y)
|
||||||
pdf.cell(80, 14, amt)
|
pdf.cell(80, 14, amt)
|
||||||
y += 20
|
y += 20
|
||||||
# Closing-balance footer
|
# Footer — has an amount but no date, so it is not a transaction row
|
||||||
pdf.set_xy(40, y + 20)
|
pdf.set_xy(40, y + 20)
|
||||||
pdf.cell(0, 14, "Closing balance: $1,000.00")
|
pdf.cell(0, 14, "Closing balance: $1,000.00")
|
||||||
return bytes(pdf.output())
|
return bytes(pdf.output())
|
||||||
@@ -72,12 +60,8 @@ def _build_tiny_statement_pdf() -> bytes:
|
|||||||
|
|
||||||
|
|
||||||
class TestDependencyImports:
|
class TestDependencyImports:
|
||||||
"""Each runtime PDF dep must be importable.
|
"""Each runtime PDF dep must be importable. Fails fast on a
|
||||||
|
stripped install or a missing CI pin."""
|
||||||
These tests will fail fast on a stripped/broken install — most
|
|
||||||
valuable as a CI gate when the requirements.txt pins are
|
|
||||||
bumped, so we know the new pin still installs cleanly across
|
|
||||||
the matrix."""
|
|
||||||
|
|
||||||
def test_pdfplumber(self):
|
def test_pdfplumber(self):
|
||||||
import pdfplumber # noqa: F401
|
import pdfplumber # noqa: F401
|
||||||
@@ -85,130 +69,135 @@ class TestDependencyImports:
|
|||||||
def test_pypdfium2(self):
|
def test_pypdfium2(self):
|
||||||
import pypdfium2 # noqa: F401
|
import pypdfium2 # noqa: F401
|
||||||
|
|
||||||
def test_streamlit_drawable_canvas(self):
|
|
||||||
# Don't instantiate the canvas — that needs a Streamlit
|
|
||||||
# script-run context. Just confirm the module loads.
|
|
||||||
import streamlit_drawable_canvas # noqa: F401
|
|
||||||
|
|
||||||
def test_pytesseract(self):
|
def test_pytesseract(self):
|
||||||
# The Python binding must import even when the Tesseract
|
|
||||||
# binary isn't installed — the OCR availability check
|
|
||||||
# handles binary absence separately.
|
|
||||||
import pytesseract # noqa: F401
|
import pytesseract # noqa: F401
|
||||||
|
|
||||||
def test_PIL(self):
|
def test_PIL(self):
|
||||||
# Transitively required by pdfplumber + pypdfium2 + canvas.
|
|
||||||
# Pinning explicit confirms hooks pull it through.
|
|
||||||
from PIL import Image # noqa: F401
|
from PIL import Image # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Real-PDF round-trip
|
# End-to-end against a real PDF
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class TestRealPdfRoundTrip:
|
class TestScanPdfForTransactions:
|
||||||
"""``extract_pages`` + ``apply_template`` against a real PDF."""
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def pdf_bytes(self) -> bytes:
|
def pdf_bytes(self) -> bytes:
|
||||||
return _build_tiny_statement_pdf()
|
return _build_tiny_statement_pdf()
|
||||||
|
|
||||||
def test_extract_pages_returns_words(self, pdf_bytes):
|
def test_finds_three_transactions(self, pdf_bytes):
|
||||||
from src.pdf_extract import extract_pages
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
pages = extract_pages(pdf_bytes)
|
rows, warnings = scan_pdf_for_transactions(pdf_bytes)
|
||||||
assert len(pages) == 1
|
# The PDF has 3 transactions plus a header and a closing-
|
||||||
assert pages[0].width > 0 and pages[0].height > 0
|
# balance footer. Header has no amount; closing-balance has
|
||||||
# At minimum we should have the words from the header and
|
# no date in the same line — neither qualifies as a txn.
|
||||||
# one transaction row — proves pdfplumber wired up.
|
assert len(rows) == 3, (
|
||||||
all_text = " ".join(w.text for w in pages[0].words)
|
f"expected 3 rows, got {len(rows)}:\n"
|
||||||
assert "ACME" in all_text
|
f"{[r.get('raw') for r in rows]}"
|
||||||
assert "Coffee" in all_text
|
)
|
||||||
assert "01/15/2026" in all_text
|
|
||||||
|
|
||||||
def test_apply_template_extracts_three_rows(self, pdf_bytes):
|
def test_parses_dates_to_iso(self, pdf_bytes):
|
||||||
from src.pdf_extract import apply_template, extract_pages
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
# The template's column boundaries are tuned to fpdf2's
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
# x-coordinates above (40 / 160 / 360 pt).
|
assert [r["date"] for r in rows] == [
|
||||||
tpl = {
|
|
||||||
"pages": {"range": "all"},
|
|
||||||
"table": {
|
|
||||||
"header_text": "Date Description Amount",
|
|
||||||
"end_markers": ["Closing balance"],
|
|
||||||
"column_boundaries": [150, 350],
|
|
||||||
"y_tolerance": 3.0,
|
|
||||||
},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount"},
|
|
||||||
],
|
|
||||||
"parse": {
|
|
||||||
"date_format": "%m/%d/%Y",
|
|
||||||
"amount_negative_in_parens": True,
|
|
||||||
"merge_multiline_description": True,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
pages = extract_pages(pdf_bytes)
|
|
||||||
df = apply_template(pages, tpl)
|
|
||||||
assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}"
|
|
||||||
assert list(df["date"]) == [
|
|
||||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||||
]
|
]
|
||||||
# Parens-negative + currency-positive both round-trip
|
|
||||||
assert df.iloc[0]["amount"] == -4.50
|
def test_parses_amounts_with_signs(self, pdf_bytes):
|
||||||
assert df.iloc[1]["amount"] == 12.00
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
assert df.iloc[2]["amount"] == -40.00
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
assert rows[0]["amount_1"] == -4.50
|
||||||
|
assert rows[1]["amount_1"] == 12.00
|
||||||
|
assert rows[2]["amount_1"] == -40.00
|
||||||
|
|
||||||
|
def test_preserves_raw_line(self, pdf_bytes):
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
# Raw line lets the user verify what was matched.
|
||||||
|
assert all("raw" in r and r["raw"] for r in rows)
|
||||||
|
assert "Coffee" in rows[0]["raw"]
|
||||||
|
|
||||||
|
def test_page_tagged(self, pdf_bytes):
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
assert all(r["page"] == 1 for r in rows)
|
||||||
|
|
||||||
|
def test_negative_in_parens_off(self, pdf_bytes):
|
||||||
|
"""With parens-negative off, the parser can't decode
|
||||||
|
``(4.50)`` and falls back to the raw text — the row still
|
||||||
|
surfaces, just with the unparsed string in the amount slot
|
||||||
|
so the user can see and fix it in the editor."""
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(
|
||||||
|
pdf_bytes, negative_in_parens=False,
|
||||||
|
)
|
||||||
|
# Row 0 had "(4.50)" — without parens-negative, parse_amount
|
||||||
|
# returns None and the scanner keeps the raw token.
|
||||||
|
assert rows[0]["amount_1"] == "(4.50)"
|
||||||
|
# Row 1 had "$12.00" — still parses to positive.
|
||||||
|
assert rows[1]["amount_1"] == 12.00
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# pypdfium2 rendering (powers the visual picker)
|
# Multi-line description merging
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class TestRenderPageImage:
|
class TestMultilineDescription:
|
||||||
"""``render_page_image`` is what feeds the drawable canvas.
|
def test_continuation_line_merges(self):
|
||||||
|
"""A line with no date and no amount, sitting between two
|
||||||
|
transaction rows, attaches to the previous transaction's
|
||||||
|
description."""
|
||||||
|
from src.pdf_extract import (
|
||||||
|
Page,
|
||||||
|
WordBox,
|
||||||
|
scan_pdf_for_transactions,
|
||||||
|
)
|
||||||
|
# Exercise the merge through the public entry point without a
|
||||||
|
# real PDF: temporarily replace ``extract_pages_auto`` with a
|
||||||
|
# fake that returns one synthetic page, run the scanner, and
|
||||||
|
# restore the original afterwards.
|
||||||
|
from src import pdf_extract as mod
|
||||||
|
|
||||||
Catches the most common installer-bug: native PDFium .dll/.so
|
original = mod.extract_pages_auto
|
||||||
missing from the bundle. If this test crashes with a
|
|
||||||
``FileNotFoundError`` it almost always means the
|
|
||||||
``hook-pypdfium2.py`` didn't pick up the shared lib."""
|
|
||||||
|
|
||||||
def test_renders_a_real_pil_image(self):
|
def fake(_pdf_bytes, *, allow_ocr=True):
|
||||||
from src.pdf_extract import render_page_image
|
words = [
|
||||||
pdf_bytes = _build_tiny_statement_pdf()
|
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"),
|
||||||
image, scale = render_page_image(pdf_bytes, page_no=1)
|
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||||
# Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide.
|
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||||
assert image.width > 800
|
# Continuation: no date, no amount
|
||||||
assert image.height > 800
|
WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"),
|
||||||
assert scale > 0
|
WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"),
|
||||||
# PIL Image is duck-typed; check the attrs we depend on.
|
# Next transaction
|
||||||
assert hasattr(image, "save")
|
WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"),
|
||||||
assert hasattr(image, "tobytes")
|
WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"),
|
||||||
|
WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"),
|
||||||
|
]
|
||||||
|
return [Page(
|
||||||
|
page_no=1, width=300, height=100, text="", words=words,
|
||||||
|
)], []
|
||||||
|
|
||||||
def test_invalid_page_number_clamps(self):
|
mod.extract_pages_auto = fake
|
||||||
from src.pdf_extract import render_page_image
|
try:
|
||||||
pdf_bytes = _build_tiny_statement_pdf()
|
rows, _ = scan_pdf_for_transactions(b"")
|
||||||
# PDF has 1 page; page_no=99 should clamp, not raise.
|
finally:
|
||||||
image, scale = render_page_image(pdf_bytes, page_no=99)
|
mod.extract_pages_auto = original
|
||||||
assert image.width > 0
|
|
||||||
|
assert len(rows) == 2
|
||||||
|
assert "Vendor memo" in rows[0]["description"]
|
||||||
|
assert rows[1]["description"] == "Other"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Graceful-fallback behavior
|
# Graceful fallback when deps absent
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class TestPdfDependencyMissing:
|
class TestPdfDependencyMissing:
|
||||||
"""The page should see a clean exception when a dep is absent,
|
|
||||||
not a raw ``ImportError`` that leaks into the Streamlit traceback."""
|
|
||||||
|
|
||||||
def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
|
def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch):
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
# Simulate "pdfplumber not installed" without uninstalling.
|
|
||||||
# ``_require_pdfplumber`` does its own ``import pdfplumber``
|
|
||||||
# at call time; patch ``__import__`` to throw for that one
|
|
||||||
# name only.
|
|
||||||
import builtins
|
import builtins
|
||||||
real_import = builtins.__import__
|
real_import = builtins.__import__
|
||||||
|
|
||||||
@@ -218,10 +207,10 @@ class TestPdfDependencyMissing:
|
|||||||
return real_import(name, *a, **kw)
|
return real_import(name, *a, **kw)
|
||||||
|
|
||||||
monkeypatch.setattr(builtins, "__import__", fake_import)
|
monkeypatch.setattr(builtins, "__import__", fake_import)
|
||||||
with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info:
|
with pytest.raises(pdf_extract.PdfDependencyMissing) as exc:
|
||||||
pdf_extract._require_pdfplumber()
|
pdf_extract._require_pdfplumber()
|
||||||
assert "pdfplumber" in str(exc_info.value)
|
assert "pdfplumber" in str(exc.value)
|
||||||
assert exc_info.value.hint # actionable hint must be populated
|
assert exc.value.hint
|
||||||
|
|
||||||
def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
|
def test_require_pdfium_raises_typed_on_absence(self, monkeypatch):
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
@@ -239,17 +228,13 @@ class TestPdfDependencyMissing:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Requirements-pin consistency
|
# Requirements pin consistency
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class TestPinnedVersionsMatchInstalled:
|
class TestPinnedVersionsMatchInstalled:
|
||||||
"""If someone bumps the pin in ``requirements.txt`` without
|
"""If someone bumps the pin in ``requirements.txt`` without
|
||||||
actually reinstalling, this test points it out before CI does.
|
actually reinstalling, this test points it out before CI does."""
|
||||||
|
|
||||||
Uses ``importlib.metadata`` rather than each library's
|
|
||||||
``__version__`` attribute because not every PDF dep exposes
|
|
||||||
one (``pypdfium2`` keeps version info on a submodule)."""
|
|
||||||
|
|
||||||
def _parse_pins(self) -> dict[str, str]:
|
def _parse_pins(self) -> dict[str, str]:
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -266,21 +251,17 @@ class TestPinnedVersionsMatchInstalled:
|
|||||||
pins[name.strip()] = version.strip()
|
pins[name.strip()] = version.strip()
|
||||||
return pins
|
return pins
|
||||||
|
|
||||||
def _installed(self, dist_name: str) -> str:
|
|
||||||
import importlib.metadata as md
|
|
||||||
return md.version(dist_name)
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("dist_name", [
|
@pytest.mark.parametrize("dist_name", [
|
||||||
"pdfplumber",
|
"pdfplumber",
|
||||||
"pypdfium2",
|
"pypdfium2",
|
||||||
"pytesseract",
|
"pytesseract",
|
||||||
"streamlit-drawable-canvas",
|
|
||||||
])
|
])
|
||||||
def test_pin_matches_installed(self, dist_name):
|
def test_pin_matches_installed(self, dist_name):
|
||||||
|
import importlib.metadata as md
|
||||||
pins = self._parse_pins()
|
pins = self._parse_pins()
|
||||||
if dist_name not in pins:
|
if dist_name not in pins:
|
||||||
pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
|
pytest.skip(f"{dist_name} not exact-pinned in requirements.txt")
|
||||||
installed = self._installed(dist_name)
|
installed = md.version(dist_name)
|
||||||
assert installed == pins[dist_name], (
|
assert installed == pins[dist_name], (
|
||||||
f"installed {dist_name}=={installed} but requirements.txt "
|
f"installed {dist_name}=={installed} but requirements.txt "
|
||||||
f"pins {pins[dist_name]} — bump the pin, or reinstall."
|
f"pins {pins[dist_name]} — bump the pin, or reinstall."
|
||||||
@@ -288,79 +269,52 @@ class TestPinnedVersionsMatchInstalled:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# OCR availability runtime probe
|
# OCR availability
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
class TestOcrAvailability:
|
class TestOcrAvailability:
|
||||||
"""``ocr_available`` is the linchpin of the UI's OCR banner.
|
|
||||||
Returns ``(bool, str)`` — both branches must round-trip."""
|
|
||||||
|
|
||||||
def test_returns_a_tuple(self):
|
def test_returns_a_tuple(self):
|
||||||
from src.pdf_extract import ocr_available
|
from src.pdf_extract import ocr_available
|
||||||
result = ocr_available()
|
result = ocr_available()
|
||||||
assert isinstance(result, tuple)
|
assert isinstance(result, tuple) and len(result) == 2
|
||||||
assert len(result) == 2
|
|
||||||
ok, reason = result
|
ok, reason = result
|
||||||
assert isinstance(ok, bool)
|
assert isinstance(ok, bool)
|
||||||
assert isinstance(reason, str)
|
assert isinstance(reason, str)
|
||||||
|
|
||||||
def test_extract_pages_auto_skips_ocr_when_disabled(self):
|
def test_extract_pages_auto_skips_ocr_when_disabled(self):
|
||||||
from src.pdf_extract import extract_pages_auto
|
from src.pdf_extract import extract_pages_auto
|
||||||
# With allow_ocr=False, no OCR even if pages are blank.
|
|
||||||
pdf_bytes = _build_tiny_statement_pdf()
|
pdf_bytes = _build_tiny_statement_pdf()
|
||||||
pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False)
|
pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False)
|
||||||
assert len(pages) == 1
|
assert len(pages) == 1
|
||||||
# No OCR-disabled warning on a text PDF, since pages have text.
|
|
||||||
assert not any("OCR is disabled" in w for w in warnings)
|
assert not any("OCR is disabled" in w for w in warnings)
|
||||||
|
|
||||||
|
|
||||||
class TestTesseractDiscovery:
|
class TestTesseractDiscovery:
|
||||||
"""Windows install paths + env-var override are how a real user
|
|
||||||
(no PATH munging) gets OCR working. Cover the discovery logic
|
|
||||||
even on Linux/macOS test runners by mocking out the OS check
|
|
||||||
and ``Path.exists``."""
|
|
||||||
|
|
||||||
def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
|
def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr("platform.system", lambda: "Linux")
|
||||||
"platform.system",
|
|
||||||
lambda: "Linux",
|
|
||||||
)
|
|
||||||
assert pdf_extract._autodetect_tesseract_path() is None
|
assert pdf_extract._autodetect_tesseract_path() is None
|
||||||
|
|
||||||
def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
|
def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
monkeypatch.setattr("platform.system", lambda: "Windows")
|
monkeypatch.setattr("platform.system", lambda: "Windows")
|
||||||
|
|
||||||
target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
||||||
|
|
||||||
def fake_exists(self):
|
def fake_exists(self):
|
||||||
return str(self) == target
|
return str(self) == target
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr("pathlib.Path.exists", fake_exists)
|
||||||
"pathlib.Path.exists",
|
|
||||||
fake_exists,
|
|
||||||
)
|
|
||||||
assert pdf_extract._autodetect_tesseract_path() == target
|
assert pdf_extract._autodetect_tesseract_path() == target
|
||||||
|
|
||||||
def test_autodetect_returns_none_when_nothing_installed(
|
def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch):
|
||||||
self, monkeypatch,
|
|
||||||
):
|
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
monkeypatch.setattr("platform.system", lambda: "Windows")
|
monkeypatch.setattr("platform.system", lambda: "Windows")
|
||||||
monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
|
monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
|
||||||
assert pdf_extract._autodetect_tesseract_path() is None
|
assert pdf_extract._autodetect_tesseract_path() is None
|
||||||
|
|
||||||
def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
|
def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
|
||||||
"""``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
|
|
||||||
portable install at a non-default path works without
|
|
||||||
relying on PATH."""
|
|
||||||
from src import pdf_extract
|
from src import pdf_extract
|
||||||
# Point the override at a path that doesn't exist —
|
|
||||||
# ocr_available will try it and report the failure, but
|
|
||||||
# importantly the cmd attribute is set BEFORE the call,
|
|
||||||
# which is what we're verifying.
|
|
||||||
fake_bin = str(tmp_path / "fake-tesseract.exe")
|
fake_bin = str(tmp_path / "fake-tesseract.exe")
|
||||||
monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
|
monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
|
||||||
pdf_extract.ocr_available()
|
pdf_extract.ocr_available()
|
||||||
|
|||||||
@@ -1,280 +0,0 @@
|
|||||||
"""Tests for the row-heuristic extraction pipeline.
|
|
||||||
|
|
||||||
This is now the primary extraction mode — uses date + amount
|
|
||||||
pattern matching to find transaction lines, with no dependency
|
|
||||||
on x-position column boundaries. Robust to layout drift across
|
|
||||||
statements from the same bank.
|
|
||||||
|
|
||||||
The legacy column-visual pipeline keeps its own tests in
|
|
||||||
``test_pdf_extract.py``.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from src.pdf_extract import (
|
|
||||||
Page,
|
|
||||||
WordBox,
|
|
||||||
apply_template,
|
|
||||||
apply_template_row_heuristic,
|
|
||||||
find_transaction_rows,
|
|
||||||
_find_amount_tokens,
|
|
||||||
_find_dates_in_words,
|
|
||||||
_infer_amount_column_centers,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _w(text: str, x0: float, top: float) -> WordBox:
|
|
||||||
return WordBox(
|
|
||||||
x0=x0,
|
|
||||||
top=top,
|
|
||||||
x1=x0 + 8 * len(text),
|
|
||||||
bottom=top + 10,
|
|
||||||
text=text,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestFindDatesInRow:
|
|
||||||
def test_us_slash(self):
|
|
||||||
row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
|
|
||||||
assert _find_dates_in_words(row) == [(0, "01/15/2026")]
|
|
||||||
|
|
||||||
def test_two_digit_year(self):
|
|
||||||
row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
|
|
||||||
result = _find_dates_in_words(row)
|
|
||||||
assert result and result[0][1] == "01/15/26"
|
|
||||||
|
|
||||||
def test_iso(self):
|
|
||||||
row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
|
|
||||||
assert _find_dates_in_words(row) == [(0, "2026-01-15")]
|
|
||||||
|
|
||||||
def test_month_name(self):
|
|
||||||
# "Jan 15, 2026" — three word tokens, should stitch.
|
|
||||||
row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
|
|
||||||
result = _find_dates_in_words(row)
|
|
||||||
assert result, "Multi-word month-day-year should match"
|
|
||||||
assert "Jan 15" in result[0][1]
|
|
||||||
|
|
||||||
def test_no_date(self):
|
|
||||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
|
||||||
assert _find_dates_in_words(row) == []
|
|
||||||
|
|
||||||
|
|
||||||
class TestFindAmountTokens:
|
|
||||||
def test_currency_format(self):
|
|
||||||
row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
|
|
||||||
out = _find_amount_tokens(row)
|
|
||||||
assert len(out) == 1
|
|
||||||
assert out[0][2] == "$4.50"
|
|
||||||
|
|
||||||
def test_parens_negative(self):
|
|
||||||
row = [_w("(123.45)", 0, 0)]
|
|
||||||
out = _find_amount_tokens(row)
|
|
||||||
assert out and out[0][2] == "(123.45)"
|
|
||||||
|
|
||||||
def test_no_amount_on_pure_text(self):
|
|
||||||
row = [_w("Hello", 0, 0), _w("World", 50, 0)]
|
|
||||||
assert _find_amount_tokens(row) == []
|
|
||||||
|
|
||||||
def test_rejects_bare_year(self):
|
|
||||||
# "2026" matches the digit pattern but lacks $/decimal/etc.,
|
|
||||||
# so the looks-like-amount filter should drop it.
|
|
||||||
row = [_w("2026", 0, 0)]
|
|
||||||
# Bare integer can pass the regex but not the heuristic.
|
|
||||||
out = _find_amount_tokens(row)
|
|
||||||
# Either filtered out OR included — both are defensible.
|
|
||||||
# If included, it'd be missed-amount territory not a false-
|
|
||||||
# positive. Pin the conservative behavior: NO match.
|
|
||||||
assert out == [], "Bare 4-digit year should not register as amount"
|
|
||||||
|
|
||||||
|
|
||||||
class TestInferAmountColumnCenters:
|
|
||||||
def test_two_clear_columns(self):
|
|
||||||
# 5 rows, each with two amounts at roughly x=300 and x=450.
|
|
||||||
rows = []
|
|
||||||
for top in range(0, 100, 20):
|
|
||||||
rows.append([
|
|
||||||
_w("01/15/2026", 20, top),
|
|
||||||
_w("Item", 100, top),
|
|
||||||
_w("$10.00", 300, top),
|
|
||||||
_w("$1,000.00", 450, top),
|
|
||||||
])
|
|
||||||
centers = _infer_amount_column_centers(
|
|
||||||
rows, expected=2, min_amounts=2, max_amounts=2,
|
|
||||||
)
|
|
||||||
assert len(centers) == 2
|
|
||||||
# Left center ≈ 300 + 8*len("$10.00")/2 = 300+24 = 324
|
|
||||||
assert 310 < centers[0] < 340
|
|
||||||
assert 460 < centers[1] < 490
|
|
||||||
|
|
||||||
def test_no_transactions_returns_empty(self):
|
|
||||||
rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
|
|
||||||
assert _infer_amount_column_centers(
|
|
||||||
rows, expected=2, min_amounts=1, max_amounts=3,
|
|
||||||
) == []
|
|
||||||
|
|
||||||
|
|
||||||
class TestRowHeuristicEndToEnd:
|
|
||||||
"""Synthetic ``Page`` objects exercise the full row-heuristic
|
|
||||||
pipeline end-to-end without a real PDF."""
|
|
||||||
|
|
||||||
def _page_single_amount(self) -> Page:
|
|
||||||
words = [
|
|
||||||
_w("ACME BANK STATEMENT", 20, 0),
|
|
||||||
_w("01/15/2026", 20, 30), _w("Coffee", 100, 30),
|
|
||||||
_w("Shop", 150, 30), _w("$4.50", 400, 30),
|
|
||||||
_w("01/16/2026", 20, 50), _w("Refund", 100, 50),
|
|
||||||
_w("from", 100, 70), _w("vendor", 140, 70), # continuation
|
|
||||||
_w("Vendor", 140, 50), _w("$12.00", 400, 50),
|
|
||||||
_w("Page", 20, 90), _w("1", 60, 90), # not a txn
|
|
||||||
]
|
|
||||||
return Page(page_no=1, width=600, height=120, text="", words=words)
|
|
||||||
|
|
||||||
def test_extracts_two_rows_single_amount(self):
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {
|
|
||||||
"min_amounts_per_row": 1,
|
|
||||||
"max_amounts_per_row": 1,
|
|
||||||
"merge_multiline_description": True,
|
|
||||||
},
|
|
||||||
"amounts": {"shape": "single", "negative_in_parens": True},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template_row_heuristic([self._page_single_amount()], tpl)
|
|
||||||
assert len(df) == 2
|
|
||||||
assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
|
|
||||||
# Multi-line description merged
|
|
||||||
assert "from vendor" in df.iloc[1]["description"]
|
|
||||||
|
|
||||||
def test_dispatches_through_apply_template(self):
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
|
|
||||||
"amounts": {"shape": "single"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template([self._page_single_amount()], tpl)
|
|
||||||
assert isinstance(df, pd.DataFrame)
|
|
||||||
assert len(df) == 2
|
|
||||||
|
|
||||||
def test_txn_balance_shape(self):
|
|
||||||
page = Page(
|
|
||||||
page_no=1, width=600, height=100, text="", words=[
|
|
||||||
_w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
|
|
||||||
_w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
|
|
||||||
_w("01/16/2026", 20, 20), _w("Refund", 100, 20),
|
|
||||||
_w("12.00", 300, 20), _w("1,012.00", 450, 20),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 2},
|
|
||||||
"amounts": {"shape": "txn_balance", "negative_in_parens": True},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template([page], tpl)
|
|
||||||
assert len(df) == 2
|
|
||||||
assert df.iloc[0]["amount"] == -4.50
|
|
||||||
assert df.iloc[0]["balance"] == 1000.00
|
|
||||||
assert df.iloc[1]["amount"] == 12.00
|
|
||||||
assert df.iloc[1]["balance"] == 1012.00
|
|
||||||
|
|
||||||
def test_debit_credit_balance_shape(self):
|
|
||||||
page = Page(
|
|
||||||
page_no=1, width=600, height=100, text="", words=[
|
|
||||||
_w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
|
|
||||||
_w("4.50", 300, 0), _w("1,000.00", 450, 0),
|
|
||||||
_w("01/16/2026", 20, 20), _w("Refund", 100, 20),
|
|
||||||
_w("12.00", 380, 20), _w("1,012.00", 450, 20),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 3},
|
|
||||||
"amounts": {"shape": "debit_credit_balance"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template([page], tpl)
|
|
||||||
assert len(df) == 2
|
|
||||||
# Row 0: amount at x=300 (debit column) → debit, balance at 450
|
|
||||||
assert df.iloc[0]["amount"] == -4.50
|
|
||||||
assert df.iloc[0]["type"] == "debit"
|
|
||||||
# Row 1: amount at x=380 (credit column) → credit, balance at 450
|
|
||||||
assert df.iloc[1]["amount"] == 12.00
|
|
||||||
assert df.iloc[1]["type"] == "credit"
|
|
||||||
|
|
||||||
def test_skip_rows_matching(self):
|
|
||||||
page = self._page_single_amount()
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {
|
|
||||||
"min_amounts_per_row": 1,
|
|
||||||
"max_amounts_per_row": 1,
|
|
||||||
"skip_rows_matching": ["Refund"],
|
|
||||||
},
|
|
||||||
"amounts": {"shape": "single"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df = apply_template_row_heuristic([page], tpl)
|
|
||||||
assert len(df) == 1
|
|
||||||
assert df.iloc[0]["date"] == "2026-01-15"
|
|
||||||
|
|
||||||
def test_layout_drift_doesnt_matter(self):
|
|
||||||
"""The whole point of row-heuristic: same template works
|
|
||||||
on pages of different sizes / different column x-positions."""
|
|
||||||
# Page A: amounts at x=400
|
|
||||||
page_a = Page(
|
|
||||||
page_no=1, width=600, height=80, text="", words=[
|
|
||||||
_w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
|
|
||||||
_w("$4.50", 400, 0),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
# Page B: amounts shifted to x=520 (different layout)
|
|
||||||
page_b = Page(
|
|
||||||
page_no=1, width=720, height=80, text="", words=[
|
|
||||||
_w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
|
|
||||||
_w("$4.50", 520, 0),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
|
|
||||||
"amounts": {"shape": "single"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
df_a = apply_template([page_a], tpl)
|
|
||||||
df_b = apply_template([page_b], tpl)
|
|
||||||
# Both should extract — proves no coordinate dependency.
|
|
||||||
assert len(df_a) == 1
|
|
||||||
assert len(df_b) == 1
|
|
||||||
assert df_a.iloc[0]["amount"] == df_b.iloc[0]["amount"] == 4.50
|
|
||||||
|
|
||||||
|
|
||||||
class TestFindTransactionRows:
|
|
||||||
"""The pre-DataFrame stage — returns dict records the build UI
|
|
||||||
uses to render a preview before the user commits."""
|
|
||||||
|
|
||||||
def test_returns_records(self):
|
|
||||||
page = Page(
|
|
||||||
page_no=1, width=600, height=80, text="", words=[
|
|
||||||
_w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
|
|
||||||
_w("$4.50", 400, 0),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
tpl = {
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
|
|
||||||
"amounts": {"shape": "single"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
rows = find_transaction_rows([page], tpl)
|
|
||||||
assert len(rows) == 1
|
|
||||||
r = rows[0]
|
|
||||||
assert r["date"] == "2026-01-15"
|
|
||||||
assert r["description"] == "Coffee"
|
|
||||||
assert r["amount"] == 4.50
|
|
||||||
assert r["_page"] == 1
|
|
||||||
# Raw line is preserved so the GUI can show "what we saw"
|
|
||||||
assert "_raw_line" in r
|
|
||||||
@@ -1,316 +0,0 @@
|
|||||||
"""Tests for the PDF template storage layer."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from src.pdf_templates import (
|
|
||||||
SCHEMA_VERSION,
|
|
||||||
delete_template,
|
|
||||||
list_templates,
|
|
||||||
load_template,
|
|
||||||
new_template,
|
|
||||||
save_template,
|
|
||||||
slugify,
|
|
||||||
template_from_json,
|
|
||||||
template_path,
|
|
||||||
templates_dir,
|
|
||||||
template_to_json,
|
|
||||||
validate_template,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def isolated_templates(monkeypatch, tmp_path):
|
|
||||||
"""Redirect the templates directory into ``tmp_path``."""
|
|
||||||
monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path))
|
|
||||||
yield tmp_path
|
|
||||||
|
|
||||||
|
|
||||||
class TestSlugify:
|
|
||||||
def test_basic(self):
|
|
||||||
assert slugify("Chase Personal Checking") == "chase-personal-checking"
|
|
||||||
|
|
||||||
def test_strips_punctuation(self):
|
|
||||||
assert slugify("BofA: Business (USD)") == "bofa-business-usd"
|
|
||||||
|
|
||||||
def test_empty_falls_back(self):
|
|
||||||
assert slugify("") == "untitled"
|
|
||||||
assert slugify(" ") == "untitled"
|
|
||||||
|
|
||||||
|
|
||||||
class TestNewTemplate:
|
|
||||||
def test_has_schema_version(self):
|
|
||||||
t = new_template("Sample")
|
|
||||||
assert t["schema_version"] == SCHEMA_VERSION
|
|
||||||
|
|
||||||
def test_slug_derived_from_name(self):
|
|
||||||
t = new_template("Sample Bank")
|
|
||||||
assert t["slug"] == "sample-bank"
|
|
||||||
assert t["name"] == "Sample Bank"
|
|
||||||
|
|
||||||
def test_timestamps_present(self):
|
|
||||||
t = new_template("X")
|
|
||||||
assert t["created_at"]
|
|
||||||
assert t["updated_at"]
|
|
||||||
|
|
||||||
|
|
||||||
class TestValidateTemplateRowHeuristic:
|
|
||||||
"""Row-heuristic mode is the v2 default."""
|
|
||||||
|
|
||||||
def _valid(self) -> dict:
|
|
||||||
return {
|
|
||||||
"schema_version": SCHEMA_VERSION,
|
|
||||||
"slug": "x",
|
|
||||||
"name": "X",
|
|
||||||
"mode": "row_heuristic",
|
|
||||||
"row_detection": {
|
|
||||||
"min_amounts_per_row": 1,
|
|
||||||
"max_amounts_per_row": 3,
|
|
||||||
},
|
|
||||||
"amounts": {"shape": "single"},
|
|
||||||
"date": {"format": "%m/%d/%Y"},
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_valid_passes(self):
|
|
||||||
ok, errs = validate_template(self._valid())
|
|
||||||
assert ok, errs
|
|
||||||
|
|
||||||
def test_missing_name_fails(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["name"] = ""
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
|
|
||||||
def test_bad_mode_fails(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["mode"] = "magic"
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
assert any("mode" in e for e in errs)
|
|
||||||
|
|
||||||
def test_bad_shape_fails(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["amounts"]["shape"] = "telepathic"
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
assert any("shape" in e for e in errs)
|
|
||||||
|
|
||||||
def test_inverted_amount_range_fails(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["row_detection"]["min_amounts_per_row"] = 5
|
|
||||||
t["row_detection"]["max_amounts_per_row"] = 2
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
|
|
||||||
def test_does_not_require_columns_in_row_mode(self):
|
|
||||||
"""Key point: row mode doesn't need ``columns`` populated.
|
|
||||||
That's what makes the GUI's primary path simpler than v1."""
|
|
||||||
t = self._valid()
|
|
||||||
# No columns key at all.
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert ok, errs
|
|
||||||
|
|
||||||
|
|
||||||
class TestValidateTemplateColumnVisual:
|
|
||||||
"""Legacy column-visual mode keeps its own contract."""
|
|
||||||
|
|
||||||
def _valid(self) -> dict:
|
|
||||||
return {
|
|
||||||
"schema_version": SCHEMA_VERSION,
|
|
||||||
"slug": "x",
|
|
||||||
"name": "X",
|
|
||||||
"mode": "column_visual",
|
|
||||||
"pages": {"range": "all"},
|
|
||||||
"table": {"column_boundaries": [100, 200]},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount"},
|
|
||||||
],
|
|
||||||
"parse": {},
|
|
||||||
}
|
|
||||||
|
|
||||||
def test_valid_passes(self):
|
|
||||||
ok, errs = validate_template(self._valid())
|
|
||||||
assert ok, errs
|
|
||||||
|
|
||||||
def test_requires_date_column(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["columns"] = [
|
|
||||||
{"source": 0, "target": "description"},
|
|
||||||
{"source": 1, "target": "amount"},
|
|
||||||
]
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
assert any("date" in e for e in errs)
|
|
||||||
|
|
||||||
def test_requires_amount_or_debit_credit(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["columns"] = [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
]
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
assert any("amount" in e for e in errs)
|
|
||||||
|
|
||||||
def test_debit_credit_pair_is_valid(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["columns"] = [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount_debit"},
|
|
||||||
{"source": 3, "target": "amount_credit"},
|
|
||||||
]
|
|
||||||
t["table"]["column_boundaries"] = [100, 200, 300]
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert ok, errs
|
|
||||||
|
|
||||||
|
|
||||||
class TestV1Migration:
|
|
||||||
"""v1 templates load with mode='column_visual' auto-injected;
|
|
||||||
the file on disk stays v1 until the user re-saves."""
|
|
||||||
|
|
||||||
def test_loads_v1_template(self, isolated_templates, tmp_path):
|
|
||||||
import json
|
|
||||||
v1_payload = {
|
|
||||||
"schema_version": 1,
|
|
||||||
"slug": "legacy",
|
|
||||||
"name": "Legacy Bank",
|
|
||||||
"pages": {"range": "all"},
|
|
||||||
"table": {"column_boundaries": [100, 200]},
|
|
||||||
"columns": [
|
|
||||||
{"source": 0, "target": "date"},
|
|
||||||
{"source": 1, "target": "description"},
|
|
||||||
{"source": 2, "target": "amount"},
|
|
||||||
],
|
|
||||||
"parse": {},
|
|
||||||
}
|
|
||||||
(tmp_path / "legacy.json").write_text(
|
|
||||||
json.dumps(v1_payload), encoding="utf-8",
|
|
||||||
)
|
|
||||||
loaded = load_template("legacy")
|
|
||||||
# In-memory migration adds mode + bumps schema_version
|
|
||||||
assert loaded["mode"] == "column_visual"
|
|
||||||
assert loaded["schema_version"] == SCHEMA_VERSION
|
|
||||||
# Original keys still intact
|
|
||||||
assert loaded["columns"][0]["target"] == "date"
|
|
||||||
|
|
||||||
|
|
||||||
class TestPersistence:
    """Save / load / delete / list behaviour of the template storage layer.

    Every test takes the ``isolated_templates`` fixture (defined outside
    this class); the tests that write files by hand use ``tmp_path`` and
    presumably rely on the fixture pointing storage at that directory --
    TODO confirm against the fixture definition.
    """

    @staticmethod
    def _minimal_template(name):
        """Build a template with date + amount columns and one boundary."""
        template = new_template(name)
        template["columns"] = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "amount"))
        ]
        template["table"]["column_boundaries"] = [100]
        return template

    def test_round_trip(self, isolated_templates):
        """A saved template can be loaded back with its fields intact."""
        template = new_template("Round Trip Bank")
        template["columns"] = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "description", "amount"))
        ]
        template["table"]["column_boundaries"] = [100, 200]

        saved_slug = save_template(template)
        assert saved_slug == "round-trip-bank"

        assert template_path(saved_slug).exists()

        reloaded = load_template(saved_slug)
        assert reloaded["name"] == "Round Trip Bank"
        assert reloaded["columns"][0]["target"] == "date"

    def test_save_rejects_invalid(self, isolated_templates):
        """Saving a template with an empty name raises ``ValueError``."""
        invalid = {"schema_version": 1, "name": ""}
        with pytest.raises(ValueError):
            save_template(invalid)

    def test_load_missing_raises(self, isolated_templates):
        """Loading an unknown slug raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            load_template("does-not-exist")

    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
        """A template file that is not valid JSON surfaces as ``ValueError``."""
        (tmp_path / "bad.json").write_text("not json", encoding="utf-8")
        with pytest.raises(ValueError):
            load_template("bad")

    def test_delete(self, isolated_templates):
        """First delete returns True; deleting the same slug again returns False."""
        save_template(self._minimal_template("To Delete"))

        first_attempt = delete_template("to-delete")
        assert first_attempt is True
        second_attempt = delete_template("to-delete")
        assert second_attempt is False

    def test_list_returns_summaries(self, isolated_templates):
        """Each saved template appears in the listing under its slug."""
        for name in ("Alpha", "Bravo"):
            save_template(self._minimal_template(name))

        listed_slugs = {summary["slug"] for summary in list_templates()}
        assert listed_slugs == {"alpha", "bravo"}

    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
        """An unparseable file is left out of the listing instead of raising."""
        broken_file = tmp_path / "broken.json"
        broken_file.write_text("nope", encoding="utf-8")

        # The broken file is the only one present, so the listing is empty.
        assert list_templates() == []

    def test_atomic_save_no_partial_file_on_failure(
        self, isolated_templates, monkeypatch
    ):
        """A failure during the write step leaves nothing at the target path.

        Exercises the temp-file-then-rename safety pattern named by the
        original test.
        """
        template = self._minimal_template("Atomic")

        # Simulate the failure by making json.dumps raise. Per the original
        # test's notes, save_template validates before serialising, so the
        # crash lands "after validation, during write".
        import src.pdf_templates as mod
        real_dumps = mod.json.dumps

        def fail_write(*args, **kwargs):
            raise IOError("disk full")

        monkeypatch.setattr(mod.json, "dumps", fail_write)
        with pytest.raises(IOError):
            save_template(template)
        # Put the real serializer back before the final check.
        monkeypatch.setattr(mod.json, "dumps", real_dumps)

        target = template_path("atomic")
        assert not target.exists()
|
|
||||||
|
|
||||||
|
|
||||||
class TestImportExport:
    """JSON import/export helpers: ``template_to_json`` / ``template_from_json``."""

    def test_round_trip_via_json(self):
        """Exporting a template and importing the payload keeps its name."""
        template = new_template("Exported")
        template["columns"] = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "amount"))
        ]

        restored = template_from_json(template_to_json(template))
        assert restored["name"] == "Exported"

    def test_import_rejects_bad_schema(self):
        """A payload with ``schema_version`` 999 is rejected with ``ValueError``."""
        payload = json.dumps({"schema_version": 999, "name": "X"})
        with pytest.raises(ValueError):
            template_from_json(payload)

    def test_import_rejects_non_object(self):
        """A JSON array (not an object) is rejected with ``ValueError``."""
        array_payload = '["not", "an", "object"]'
        with pytest.raises(ValueError):
            template_from_json(array_payload)
|
|
||||||
|
|
||||||
|
|
||||||
def test_templates_dir_env_override(monkeypatch, tmp_path):
    """``DATATOOLS_PDF_TEMPLATES_DIR`` overrides the directory ``templates_dir`` returns."""
    override = str(tmp_path)
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", override)

    resolved = templates_dir()
    assert resolved == tmp_path
|
|
||||||
Reference in New Issue
Block a user