feat(pdf): template storage layer (load/save/list/import/export)

Phase 2/6. Persists "how to read this bank's statements" as JSON
files under ``~/.datatools/pdf_templates/<slug>.json`` so an
accountant can build one template per source and reuse it across
every statement that follows the same layout.

Public API:

- ``new_template(name)`` — blank with sensible defaults
- ``save_template(t)`` — validate + atomic write (temp + rename)
- ``load_template(slug)`` / ``delete_template(slug)``
- ``list_templates()`` — sorted summaries, skips corrupt files
- ``template_to_json`` / ``template_from_json`` — portability
- ``validate_template(t)`` — returns an ``(ok, errors)`` tuple; ``errors`` is the list of messages for the GUI

Schema is documented in the module docstring. Versioned via
``schema_version: 1`` so future fields don't break saved files
silently — ``load_template`` refuses unknown versions instead of
limping along with missing keys.

Validation contract enforces:
- non-empty name + slug (lowercase alphanumeric + hyphens)
- at least two output columns
- at least one column mapped to ``date``
- either one ``amount`` column OR both ``amount_debit`` +
  ``amount_credit``
- column boundary count consistent with source-column count

Storage is atomic: ``_atomic_write`` goes through a temp file +
``os.replace`` so a crashed save can't leave a half-written JSON
at the canonical path. The GUI's build flow saves on most
visual-picker changes, so this matters more here than for a
"save button" workflow.

24 tests cover slugify, defaults, validation branches, round-trip
load/save, missing/corrupt file handling, delete, list (incl.
skipping corrupt files), atomic-write rollback, and import/export.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 22:46:44 +00:00
parent b8aff862ed
commit aea520d2f7
2 changed files with 646 additions and 0 deletions

407
src/pdf_templates.py Normal file
View File

@@ -0,0 +1,407 @@
"""PDF extract template storage.
Templates encode "how to read this bank's statements" — page
range, table window markers, column x-positions, target field
mapping, amount/date parse options. They live as JSON files in
``~/.datatools/pdf_templates/`` so an accountant can build one
per source and reuse it for every statement that follows the
same layout. Templates are portable: the ``export`` / ``import``
flow is just a file copy of the JSON.
The schema is intentionally a plain dict (not a frozen dataclass)
because the GUI mutates it incrementally during the build flow.
``validate_template`` enforces the contract at save time.
Schema (``schema_version: 1``)::
{
"schema_version": 1,
"slug": "chase-personal-checking",
"name": "Chase Personal Checking",
"notes": "",
"created_at": "<iso8601>",
"updated_at": "<iso8601>",
"pages": {
"range": "all" | "1-3" | "2,4,6-",
"skip_matching": "<regex>"
},
"table": {
"header_text": "<text containing all header words>",
"end_markers": ["<regex>", ...],
"column_boundaries": [x0, x1, ...],
"y_tolerance": 3.0,
"skip_rows_matching": ["<regex>", ...]
},
"columns": [
{"source": 0, "target": "date"},
...
# ``target`` is one of: date | description | amount |
# amount_debit | amount_credit | balance | type | <free text>
],
"parse": {
"date_format": "%m/%d/%Y",
"date_formats": [],
"decimal_separator": ".",
"thousands_separator": ",",
"currency_strip": "$",
"amount_negative_in_parens": true,
"merge_multiline_description": true
},
"visual": {
"page_width": 612.0,
"page_height": 792.0,
"sample_page": 1,
"table_bbox": [x0, top, x1, bottom] | null
}
}
The ``visual`` block is preserved across save/load so the build
UI can round-trip the user's last visual-picker state.
"""
from __future__ import annotations
import json
import os
import re
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Version stamp stored in every template under ``schema_version``.
SCHEMA_VERSION = 1

# Named values for a ``columns`` entry's ``target`` field. The schema in the
# module docstring additionally allows free-text targets.
VALID_TARGETS = frozenset(
    (
        "date",
        "description",
        "amount",
        "amount_debit",
        "amount_credit",
        "balance",
        "type",
    )
)
# ---------------------------------------------------------------------------
# Filesystem layout
# ---------------------------------------------------------------------------
def templates_dir() -> Path:
    """Locate the directory that holds the template JSON files.

    A non-empty ``DATATOOLS_PDF_TEMPLATES_DIR`` environment variable takes
    precedence (used by tests). Otherwise the result is
    ``~/.datatools/pdf_templates/``; if the home directory cannot be
    resolved, a ``datatools-pdf-templates`` folder under the system temp
    directory is returned instead.
    """
    env_dir = os.getenv("DATATOOLS_PDF_TEMPLATES_DIR")
    if not env_dir:
        try:
            home = Path.home()
        except Exception:
            # Best-effort fallback: any failure to resolve the home
            # directory lands the templates in the temp directory.
            return Path(tempfile.gettempdir(), "datatools-pdf-templates")
        return home / ".datatools" / "pdf_templates"
    return Path(env_dir)
def template_path(slug: str) -> Path:
    """Return the JSON file path for *slug* inside ``templates_dir()``.

    The slug is used verbatim as the file stem; no validation happens here.
    """
    filename = f"{slug}.json"
    return templates_dir().joinpath(filename)
# ---------------------------------------------------------------------------
# Slugify
# ---------------------------------------------------------------------------
# Any run of characters outside ``a-z`` / ``0-9`` collapses to one hyphen.
_SLUG_STRIP = re.compile(r"[^a-z0-9]+")


def slugify(name: str) -> str:
    """Derive a filesystem-safe slug from a human-friendly *name*.

    The name is trimmed and lowercased, every run of characters other than
    ``a-z`` / ``0-9`` becomes a single hyphen, and leading/trailing hyphens
    are dropped. A falsy name, or one with nothing usable left, yields
    ``"untitled"``.
    """
    lowered = (name or "").strip().lower()
    hyphenated = _SLUG_STRIP.sub("-", lowered)
    trimmed = hyphenated.strip("-")
    if not trimmed:
        return "untitled"
    return trimmed
# ---------------------------------------------------------------------------
# Construction + defaults
# ---------------------------------------------------------------------------
def new_template(name: str) -> dict[str, Any]:
    """Create a blank template dict pre-filled with default settings.

    The slug is derived from *name* via ``slugify``; when *name* is falsy the
    slug doubles as the display name. ``created_at`` and ``updated_at`` share
    one UTC timestamp (ISO 8601, second precision). The ``table`` and
    ``columns`` sections start empty so the caller can fill them in.
    """
    slug = slugify(name)
    stamp = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")

    pages = {
        "range": "all",
        "skip_matching": "",
    }
    table = {
        "header_text": "",
        "end_markers": [],
        "column_boundaries": [],
        "y_tolerance": 3.0,
        "skip_rows_matching": [],
    }
    parse = {
        "date_format": "%m/%d/%Y",
        "date_formats": [],
        "decimal_separator": ".",
        "thousands_separator": ",",
        "currency_strip": "$",
        "amount_negative_in_parens": True,
        "merge_multiline_description": True,
    }
    visual = {
        "page_width": 612.0,
        "page_height": 792.0,
        "sample_page": 1,
        "table_bbox": None,
    }

    # Insertion order is kept identical to the documented schema so the
    # serialized JSON lists the keys in the same sequence.
    template: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "slug": slug,
        "name": name if name else slug,
        "notes": "",
        "created_at": stamp,
        "updated_at": stamp,
    }
    template["pages"] = pages
    template["table"] = table
    template["columns"] = []
    template["parse"] = parse
    template["visual"] = visual
    return template
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
# Matched with ``fullmatch`` so a trailing newline cannot slip past ``$``.
_SLUG_PATTERN = re.compile(r"[a-z0-9][a-z0-9-]{0,63}")


def _is_source_index(value: Any) -> bool:
    """Return True when *value* is a non-negative ``int`` (``bool`` excluded)."""
    return (
        isinstance(value, int)
        and not isinstance(value, bool)
        and value >= 0
    )


def _check_columns(columns: list[Any], errors: list[str]) -> None:
    """Validate each ``columns`` entry plus the required target mappings.

    Problems are appended to *errors* in place.
    """
    seen_targets: set[str] = set()
    for i, col in enumerate(columns):
        if not isinstance(col, dict):
            errors.append(f"columns[{i}] must be an object.")
            continue
        # ``bool`` is a subclass of ``int``; JSON ``true`` is not an index.
        if not _is_source_index(col.get("source")):
            errors.append(
                f"columns[{i}].source must be a non-negative integer."
            )
        tgt = col.get("target")
        if not isinstance(tgt, str) or not tgt:
            errors.append(f"columns[{i}].target must be a non-empty string.")
        else:
            seen_targets.add(tgt)

    if "date" not in seen_targets:
        errors.append("At least one column must map to 'date'.")
    has_split_amounts = {"amount_debit", "amount_credit"} <= seen_targets
    if "amount" not in seen_targets and not has_split_amounts:
        errors.append(
            "Either an 'amount' column or both 'amount_debit' + "
            "'amount_credit' columns are required."
        )


def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
    """Check *template* against the save-time contract.

    Malformed values are reported as error messages rather than raised, so
    the caller can display every problem at once.

    Args:
        template: The template dict to check.

    Returns:
        ``(ok, errors)`` where ``ok`` is ``True`` only when ``errors`` is
        empty and ``errors`` lists the problems found, in check order.
    """
    errors: list[str] = []
    if not isinstance(template, dict):
        return False, ["Template must be a JSON object."]

    sv = template.get("schema_version")
    if sv != SCHEMA_VERSION:
        errors.append(
            f"Unsupported schema_version {sv!r} (expected {SCHEMA_VERSION})."
        )

    name = template.get("name", "")
    if not isinstance(name, str) or not name.strip():
        errors.append("name is required.")

    # A missing/empty slug falls back to one derived from the name; a
    # non-string name is treated as empty so ``slugify`` cannot crash.
    slug = template.get("slug") or slugify(
        name if isinstance(name, str) else ""
    )
    if not isinstance(slug, str) or not _SLUG_PATTERN.fullmatch(slug):
        errors.append(
            "slug must be lowercase alphanumeric + hyphens, "
            "1-64 chars, starting with a letter or digit."
        )

    columns = template.get("columns", [])
    if not isinstance(columns, list) or len(columns) < 2:
        errors.append("At least two output columns are required.")
    else:
        _check_columns(columns, errors)

    table = template.get("table") or {}
    if not isinstance(table, dict):
        errors.append("table must be an object.")
    else:
        boundaries = table.get("column_boundaries", [])
        if not isinstance(boundaries, list):
            errors.append("table.column_boundaries must be a list.")
        elif isinstance(columns, list):
            # Only well-formed indices are counted; malformed ones were
            # already reported above and may not even be hashable.
            distinct_sources = {
                c.get("source")
                for c in columns
                if isinstance(c, dict) and _is_source_index(c.get("source"))
            }
            if len(boundaries) + 1 < len(distinct_sources):
                errors.append(
                    "table.column_boundaries doesn't match the number of "
                    "source columns implied by the column mapping."
                )

    return (not errors), errors
# ---------------------------------------------------------------------------
# Persistence
# ---------------------------------------------------------------------------
def _atomic_write(path: Path, payload: str) -> None:
    """Write *payload* to *path* via a temp file in the same directory.

    The data goes to a sibling temp file, is flushed and fsynced, and is
    then moved over *path* with ``os.replace``, so *path* never holds a
    half-written file. The parent directory is created if it is missing.

    Args:
        path: Destination file.
        payload: Text to write, encoded as UTF-8.

    Raises:
        OSError: If creating, writing, or renaming the file fails. The temp
            file is removed before the error propagates.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(
        prefix=f".{path.name}.",
        suffix=".tmp",
        dir=str(path.parent),
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(payload)
            # Push the bytes to disk before the rename so the rename cannot
            # become durable ahead of the content it points at.
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp_path, path)
    except BaseException:
        # BaseException also covers KeyboardInterrupt/SystemExit, which
        # would otherwise leave a stray temp file behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            # Cleanup is best-effort; never mask the original failure.
            pass
        raise
def save_template(template: dict[str, Any]) -> str:
    """Validate *template*, write it to disk, and return its slug.

    The caller's dict is not mutated: a shallow copy gets a fresh
    ``updated_at`` stamp and is written through ``_atomic_write``.

    Args:
        template: The template dict to persist.

    Returns:
        The slug the template was saved under.

    Raises:
        ValueError: If validation fails; the message holds one error per
            line so the caller can show it to the user.
    """
    ok, errors = validate_template(template)
    if not ok:
        raise ValueError("\n".join(errors))

    stamped = dict(template)
    stamped["updated_at"] = datetime.now(tz=timezone.utc).isoformat(
        timespec="seconds"
    )
    # ``validate_template`` accepts a missing or empty slug when one can be
    # derived from the name, so derive it here too instead of raising
    # KeyError, and store it so the file on disk is self-describing.
    slug = stamped.get("slug") or slugify(stamped["name"])
    stamped["slug"] = slug

    payload = json.dumps(stamped, indent=2, ensure_ascii=False)
    _atomic_write(template_path(slug), payload)
    return slug
def load_template(slug: str) -> dict[str, Any]:
    """Read and return the template stored under *slug*.

    Args:
        slug: Template identifier; resolved to a file via ``template_path``.

    Returns:
        The parsed template dict.

    Raises:
        FileNotFoundError: If no file exists for *slug*.
        ValueError: If the file is not valid JSON, its top level is not an
            object, or its ``schema_version`` differs from
            ``SCHEMA_VERSION``.
    """
    # A missing file propagates as FileNotFoundError unchanged.
    raw = template_path(slug).read_text(encoding="utf-8")
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"Corrupt template {slug!r}: {e}") from e
    # Valid JSON that is not an object (e.g. a list) has no ``.get``; report
    # it as corrupt instead of leaking an AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"Corrupt template {slug!r}: top-level JSON must be an object."
        )
    sv = data.get("schema_version")
    if sv != SCHEMA_VERSION:
        raise ValueError(
            f"Template {slug!r} has unsupported schema_version {sv!r}; "
            f"expected {SCHEMA_VERSION}."
        )
    return data
def delete_template(slug: str) -> bool:
    """Delete the file for *slug*.

    Returns ``True`` when a file was removed and ``False`` when there was
    no file to remove.
    """
    target = template_path(slug)
    try:
        target.unlink()
    except FileNotFoundError:
        return False
    else:
        return True
def list_templates() -> list[dict[str, Any]]:
    """Return ``{slug, name, updated_at, notes}`` summaries of saved templates.

    Files that cannot be read or parsed, or whose top level is not a JSON
    object, are skipped silently so one bad file cannot break the listing.
    Missing ``slug`` / ``name`` values fall back to the file stem.

    Returns:
        Summaries sorted in descending order by ``updated_at`` (or by
        ``name`` when ``updated_at`` is empty). An empty list if the
        templates directory does not exist.
    """
    d = templates_dir()
    if not d.is_dir():
        return []
    out: list[dict[str, Any]] = []
    for p in sorted(d.glob("*.json")):
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            # Deliberate best-effort: any unreadable file is skipped.
            continue
        if not isinstance(data, dict):
            continue
        out.append({
            "slug": data.get("slug") or p.stem,
            "name": data.get("name") or p.stem,
            "updated_at": data.get("updated_at", ""),
            "notes": data.get("notes", ""),
        })
    # Hand-edited files may hold non-string values; coercing the key to
    # ``str`` keeps the sort from raising TypeError on mixed types.
    out.sort(key=lambda r: str(r["updated_at"] or r["name"]), reverse=True)
    return out
# ---------------------------------------------------------------------------
# Import / export
# ---------------------------------------------------------------------------
def template_to_json(template: dict[str, Any]) -> str:
    """Render *template* as JSON text for export.

    Output is indented by two spaces and keeps non-ASCII characters as-is,
    which makes it easy to read and diff.
    """
    options = {"indent": 2, "ensure_ascii": False}
    return json.dumps(template, **options)
def template_from_json(payload: str) -> dict[str, Any]:
    """Parse imported template JSON without saving it.

    Only the JSON syntax, the top-level type, and ``schema_version`` are
    checked here; the caller decides whether to pass the result to
    ``save_template`` or merge it into the current build.

    Raises:
        ValueError: If *payload* is not valid JSON, its top level is not an
            object, or its ``schema_version`` differs from
            ``SCHEMA_VERSION``.
    """
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Not valid JSON: {exc}") from exc

    if isinstance(parsed, dict):
        version = parsed.get("schema_version")
        if version == SCHEMA_VERSION:
            return parsed
        raise ValueError(
            f"Imported template has schema_version {version!r}; "
            f"this build expects {SCHEMA_VERSION}."
        )
    raise ValueError("Top-level JSON must be an object.")
# Names exported by ``from pdf_templates import *``. The underscore-prefixed
# helpers ``_atomic_write`` and ``_SLUG_STRIP`` are not listed.
__all__ = [
"SCHEMA_VERSION",
"VALID_TARGETS",
"delete_template",
"list_templates",
"load_template",
"new_template",
"save_template",
"slugify",
"template_from_json",
"template_path",
"template_to_json",
"templates_dir",
"validate_template",
]