From aea520d2f72e1d7213009fc3915456aee65bf837 Mon Sep 17 00:00:00 2001
From: Michael
Date: Tue, 19 May 2026 22:46:44 +0000
Subject: [PATCH] feat(pdf): template storage layer (load/save/list/import/export)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2/6. Persists "how to read this bank's statements" as JSON
files under ``~/.datatools/pdf_templates/{slug}.json`` so an
accountant can build one template per source and reuse it across
every statement that follows the same layout.

Public API:
- ``new_template(name)`` — blank with sensible defaults
- ``save_template(t)`` — validate + atomic write (temp + rename)
- ``load_template(slug)`` / ``delete_template(slug)``
- ``list_templates()`` — sorted summaries, skips corrupt files
- ``template_to_json`` / ``template_from_json`` — portability
- ``validate_template(t)`` — returns ``(ok, errors)`` for the GUI

Schema is documented in the module docstring. Versioned via
``schema_version: 1`` so future fields don't break saved files
silently — ``load_template`` refuses unknown versions instead of
limping along with missing keys.

Validation contract enforces:
- non-empty name + slug (lowercase alphanumeric + hyphens)
- at least two output columns
- at least one column mapped to ``date``
- either one ``amount`` column OR both ``amount_debit`` +
  ``amount_credit``
- column boundary count consistent with source-column count

Storage is atomic: ``_atomic_write`` goes through a temp file +
``os.replace`` so a crashed save can't leave a half-written JSON at
the canonical path. The GUI's build flow saves on most visual-picker
changes, so this matters more here than for a "save button" workflow.

24 tests cover slugify, defaults, validation branches, round-trip
load/save, missing/corrupt file handling, delete, list (incl. skipping
corrupt files), atomic-write rollback, and import/export.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/pdf_templates.py        | 474 ++++++++++++++++++++++++++++++++++++
 tests/test_pdf_templates.py | 239 ++++++++++++++++++
 2 files changed, 713 insertions(+)
 create mode 100644 src/pdf_templates.py
 create mode 100644 tests/test_pdf_templates.py

diff --git a/src/pdf_templates.py b/src/pdf_templates.py
new file mode 100644
index 0000000..07ea8ac
--- /dev/null
+++ b/src/pdf_templates.py
@@ -0,0 +1,474 @@
+"""PDF extract template storage.
+
+Templates encode "how to read this bank's statements" — page
+range, table window markers, column x-positions, target field
+mapping, amount/date parse options. They live as JSON files in
+``~/.datatools/pdf_templates/`` so an accountant can build one
+per source and reuse it for every statement that follows the
+same layout. Templates are portable: the ``export`` / ``import``
+flow is just a file copy of the JSON.
+
+The schema is intentionally a plain dict (not a frozen dataclass)
+because the GUI mutates it incrementally during the build flow.
+``validate_template`` enforces the contract at save time.
+
+Schema (``schema_version: 1``)::
+
+    {
+      "schema_version": 1,
+      "slug": "chase-personal-checking",
+      "name": "Chase Personal Checking",
+      "notes": "",
+      "created_at": "2026-05-19T22:46:44+00:00",
+      "updated_at": "2026-05-19T22:46:44+00:00",
+      "pages": {
+        "range": "all" | "1-3" | "2,4,6-",
+        "skip_matching": ""
+      },
+      "table": {
+        "header_text": "",
+        "end_markers": ["", ...],
+        "column_boundaries": [x0, x1, ...],
+        "y_tolerance": 3.0,
+        "skip_rows_matching": ["", ...]
+      },
+      "columns": [
+        {"source": 0, "target": "date"},
+        ...
+        # ``target`` is one of: date | description | amount |
+        #   amount_debit | amount_credit | balance | type
+      ],
+      "parse": {
+        "date_format": "%m/%d/%Y",
+        "date_formats": [],
+        "decimal_separator": ".",
+        "thousands_separator": ",",
+        "currency_strip": "$",
+        "amount_negative_in_parens": true,
+        "merge_multiline_description": true
+      },
+      "visual": {
+        "page_width": 612.0,
+        "page_height": 792.0,
+        "sample_page": 1,
+        "table_bbox": [x0, top, x1, bottom] | null
+      }
+    }
+
+``created_at`` / ``updated_at`` are ISO-8601 UTC timestamps with
+second precision. ``VALID_TARGETS`` lists the known ``target``
+names; ``validate_template`` only requires a non-empty string.
+
+The ``visual`` block is preserved across save/load so the build
+UI can round-trip the user's last visual-picker state.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+SCHEMA_VERSION = 1
+
+VALID_TARGETS = frozenset({
+    "date",
+    "description",
+    "amount",
+    "amount_debit",
+    "amount_credit",
+    "balance",
+    "type",
+})
+
+
+# ---------------------------------------------------------------------------
+# Filesystem layout
+# ---------------------------------------------------------------------------
+
+
+def templates_dir() -> Path:
+    """Return the directory that holds the template JSON files.
+
+    Defaults to ``~/.datatools/pdf_templates/``. Override via the
+    ``DATATOOLS_PDF_TEMPLATES_DIR`` env var (used by tests). If the
+    home directory cannot be resolved, falls back to a
+    ``datatools-pdf-templates`` folder under the system temp dir.
+    """
+    override = os.environ.get("DATATOOLS_PDF_TEMPLATES_DIR")
+    if override:
+        return Path(override)
+    try:
+        return Path.home() / ".datatools" / "pdf_templates"
+    except RuntimeError:
+        # Path.home() raises RuntimeError when no home directory can
+        # be determined; stay usable instead of crashing.
+        return Path(tempfile.gettempdir()) / "datatools-pdf-templates"
+
+
+def template_path(slug: str) -> Path:
+    """Resolve *slug* to its on-disk JSON path.
+
+    Raises ``ValueError`` if *slug* would not name a file directly
+    inside ``templates_dir()`` (it contains a path separator or a
+    drive prefix). Slugs can come from imported files, so this keeps
+    load/delete from reaching outside the templates directory.
+    """
+    filename = f"{slug}.json"
+    if Path(filename).name != filename:
+        raise ValueError(f"Invalid template slug: {slug!r}")
+    return templates_dir() / filename
+
+
+# ---------------------------------------------------------------------------
+# Slugify
+# ---------------------------------------------------------------------------
+
+
+_SLUG_STRIP = re.compile(r"[^a-z0-9]+")
+_SLUG_VALID = re.compile(r"[a-z0-9][a-z0-9-]{0,63}")
+
+
+def slugify(name: str) -> str:
+    """Make a filesystem-safe slug from a human-friendly name.
+
+    Lowercases *name*, collapses every run of characters outside
+    ``a-z0-9`` into a single hyphen, trims hyphens from both ends
+    and caps the result at 64 characters (the limit that
+    ``validate_template`` enforces). Returns ``"untitled"`` when
+    nothing is left.
+    """
+    s = (name or "").strip().lower()
+    s = _SLUG_STRIP.sub("-", s).strip("-")
+    s = s[:64].strip("-")
+    return s or "untitled"
+
+
+# ---------------------------------------------------------------------------
+# Construction + defaults
+# ---------------------------------------------------------------------------
+
+
+def new_template(name: str) -> dict[str, Any]:
+    """Build a blank template with sensible defaults.
+
+    The slug is derived from *name* via ``slugify``; both timestamps
+    are set to the current UTC time. Caller can edit any field; the
+    GUI's build flow fills in the table and columns sections as the
+    user works through it.
+    """
+    now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
+    slug = slugify(name)
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "slug": slug,
+        "name": name or slug,
+        "notes": "",
+        "created_at": now,
+        "updated_at": now,
+        "pages": {
+            "range": "all",
+            "skip_matching": "",
+        },
+        "table": {
+            "header_text": "",
+            "end_markers": [],
+            "column_boundaries": [],
+            "y_tolerance": 3.0,
+            "skip_rows_matching": [],
+        },
+        "columns": [],
+        "parse": {
+            "date_format": "%m/%d/%Y",
+            "date_formats": [],
+            "decimal_separator": ".",
+            "thousands_separator": ",",
+            "currency_strip": "$",
+            "amount_negative_in_parens": True,
+            "merge_multiline_description": True,
+        },
+        "visual": {
+            "page_width": 612.0,
+            "page_height": 792.0,
+            "sample_page": 1,
+            "table_bbox": None,
+        },
+    }
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+
+def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
+    """Check the template before saving. Returns ``(ok, errors)``.
+
+    Never raises on malformed input: wrong-typed fields (as found in
+    hand-edited or imported files) are reported in ``errors`` like
+    any other problem. The GUI shows the errors next to the Save
+    button; nothing silent here.
+    """
+    errors: list[str] = []
+    if not isinstance(template, dict):
+        return False, ["Template must be a JSON object."]
+
+    sv = template.get("schema_version")
+    if sv != SCHEMA_VERSION:
+        errors.append(
+            f"Unsupported schema_version {sv!r} (expected {SCHEMA_VERSION})."
+        )
+
+    name = template.get("name", "")
+    if not isinstance(name, str) or not name.strip():
+        errors.append("name is required.")
+
+    # A missing slug is derived from the name, as save_template does.
+    slug = template.get("slug")
+    if not slug and isinstance(name, str):
+        slug = slugify(name)
+    # fullmatch, not match + "$": "$" also matches just before a
+    # trailing newline, which would let "abc\n" into a file name.
+    if not isinstance(slug, str) or not _SLUG_VALID.fullmatch(slug):
+        errors.append(
+            "slug must be lowercase alphanumeric + hyphens, "
+            "1–64 chars, starting with a letter or digit."
+        )
+
+    columns = template.get("columns", [])
+    sources: list[int] = []
+    if not isinstance(columns, list) or len(columns) < 2:
+        errors.append("At least two output columns are required.")
+    else:
+        seen_targets: set[str] = set()
+        for i, col in enumerate(columns):
+            if not isinstance(col, dict):
+                errors.append(f"columns[{i}] must be an object.")
+                continue
+            src = col.get("source")
+            tgt = col.get("target")
+            # bool is a subclass of int; JSON ``true`` is not a column index.
+            if isinstance(src, bool) or not isinstance(src, int) or src < 0:
+                errors.append(
+                    f"columns[{i}].source must be a non-negative integer."
+                )
+            else:
+                sources.append(src)
+            if not isinstance(tgt, str) or not tgt:
+                errors.append(f"columns[{i}].target must be a non-empty string.")
+            else:
+                seen_targets.add(tgt)
+        if "date" not in seen_targets:
+            errors.append("At least one column must map to 'date'.")
+        has_split_amount = {"amount_debit", "amount_credit"} <= seen_targets
+        if "amount" not in seen_targets and not has_split_amount:
+            errors.append(
+                "Either an 'amount' column or both 'amount_debit' + "
+                "'amount_credit' columns are required."
+            )
+
+    table = template.get("table") or {}
+    if not isinstance(table, dict):
+        errors.append("table must be an object.")
+    else:
+        boundaries = table.get("column_boundaries", [])
+        if not isinstance(boundaries, list):
+            errors.append("table.column_boundaries must be a list.")
+        elif sources and max(sources) > len(boundaries):
+            # N boundaries split a row into N + 1 source columns, so the
+            # highest valid source index is N.
+            errors.append(
+                "table.column_boundaries doesn't match the number of "
+                "source columns implied by the column mapping."
+            )
+
+    return (not errors), errors
+
+
+# ---------------------------------------------------------------------------
+# Persistence
+# ---------------------------------------------------------------------------
+
+
+def _atomic_write(path: Path, payload: str) -> None:
+    """Write *payload* to *path* via a temp file + rename.
+
+    The temp file is created in the same directory as *path* and is
+    flushed and fsynced before ``os.replace`` swaps it in. If any
+    step fails, the temp file is removed and the error re-raised;
+    whatever was at *path* before is left untouched.
+
+    Avoids leaving a half-written JSON if the process dies mid-save —
+    the GUI saves on every visual-picker change, and a corrupt
+    template file would be hostile to recover from.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp_path = tempfile.mkstemp(
+        prefix=f".{path.name}.",
+        suffix=".tmp",
+        dir=str(path.parent),
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            f.write(payload)
+            f.flush()
+            os.fsync(f.fileno())
+        os.replace(tmp_path, path)
+    except BaseException:
+        # BaseException so Ctrl-C mid-write doesn't strand a ``.tmp``
+        # file in the templates directory either.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+        raise
+
+
+def save_template(template: dict[str, Any]) -> str:
+    """Persist *template* to disk; return the slug it was saved as.
+
+    Works on a shallow copy, so the caller's dict is not modified.
+    Stamps ``updated_at`` on the saved copy and fills in ``slug``
+    from the name when it is missing. Atomic via temp-file + rename.
+    Raises ``ValueError`` with a multi-line error list if validation
+    fails — caller should surface that to the user.
+    """
+    ok, errors = validate_template(template)
+    if not ok:
+        raise ValueError("\n".join(errors))
+    template = dict(template)
+    # validate_template accepts a missing slug by deriving it from the
+    # name; do the same here rather than fail with KeyError.
+    slug = template.get("slug") or slugify(template["name"])
+    template["slug"] = slug
+    template["updated_at"] = datetime.now(tz=timezone.utc).isoformat(
+        timespec="seconds"
+    )
+    payload = json.dumps(template, indent=2, ensure_ascii=False)
+    _atomic_write(template_path(slug), payload)
+    return slug
+
+
+def load_template(slug: str) -> dict[str, Any]:
+    """Read the template stored under *slug*.
+
+    Raises ``FileNotFoundError`` if the file is missing, and
+    ``ValueError`` if the JSON is corrupt, is not an object, or has
+    an unknown schema version.
+    """
+    raw = template_path(slug).read_text(encoding="utf-8")
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Corrupt template {slug!r}: {e}") from e
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Corrupt template {slug!r}: top-level JSON must be an object."
+        )
+    sv = data.get("schema_version")
+    if sv != SCHEMA_VERSION:
+        raise ValueError(
+            f"Template {slug!r} has unsupported schema_version {sv!r}; "
+            f"expected {SCHEMA_VERSION}."
+        )
+    return data
+
+
+def delete_template(slug: str) -> bool:
+    """Remove the template file; returns ``True`` if it existed."""
+    p = template_path(slug)
+    try:
+        p.unlink()
+        return True
+    except FileNotFoundError:
+        return False
+
+
+def list_templates() -> list[dict[str, Any]]:
+    """Return ``{slug, name, updated_at, notes}`` summaries.
+
+    Sorted in descending order of ``updated_at`` (or of the name for
+    entries without a timestamp). Files that can't be read or parsed,
+    or whose top level is not an object, are skipped so one bad file
+    can't crash the list view.
+
+    ``slug`` is always the file name stem: that is what
+    ``load_template`` / ``delete_template`` resolve, so a stale or
+    hostile ``slug`` value inside a copied-in file is ignored.
+    """
+    d = templates_dir()
+    if not d.exists():
+        return []
+    out: list[dict[str, Any]] = []
+    for p in sorted(d.glob("*.json")):
+        try:
+            data = json.loads(p.read_text(encoding="utf-8"))
+        except (OSError, ValueError, RecursionError):
+            # ValueError covers JSONDecodeError and UnicodeDecodeError.
+            continue
+        if not isinstance(data, dict):
+            continue
+        out.append({
+            "slug": p.stem,
+            "name": data.get("name") or p.stem,
+            "updated_at": data.get("updated_at", ""),
+            "notes": data.get("notes", ""),
+        })
+    # str() so a hand-edited non-string field can't raise TypeError
+    # inside the sort comparison.
+    out.sort(key=lambda r: str(r["updated_at"] or r["name"]), reverse=True)
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Import / export
+# ---------------------------------------------------------------------------
+
+
+def template_to_json(template: dict[str, Any]) -> str:
+    """Serialize a template for download. Pretty-printed for human
+    inspection / diffing."""
+    return json.dumps(template, indent=2, ensure_ascii=False)
+
+
+def template_from_json(payload: str) -> dict[str, Any]:
+    """Deserialize uploaded template JSON. Validates schema version
+    but does NOT save — caller decides whether to ``save_template``
+    or merge into the current build.
+
+    Raises ``ValueError`` on malformed input."""
+    try:
+        data = json.loads(payload)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Not valid JSON: {e}") from e
+    if not isinstance(data, dict):
+        raise ValueError("Top-level JSON must be an object.")
+    sv = data.get("schema_version")
+    if sv != SCHEMA_VERSION:
+        raise ValueError(
+            f"Imported template has schema_version {sv!r}; "
+            f"this build expects {SCHEMA_VERSION}."
+        )
+    return data
+
+
+__all__ = [
+    "SCHEMA_VERSION",
+    "VALID_TARGETS",
+    "delete_template",
+    "list_templates",
+    "load_template",
+    "new_template",
+    "save_template",
+    "slugify",
+    "template_from_json",
+    "template_path",
+    "template_to_json",
+    "templates_dir",
+    "validate_template",
+]
diff --git a/tests/test_pdf_templates.py b/tests/test_pdf_templates.py
new file mode 100644
index 0000000..a06c002
--- /dev/null
+++ b/tests/test_pdf_templates.py
@@ -0,0 +1,239 @@
+"""Tests for the PDF template storage layer."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from src.pdf_templates import (
+    SCHEMA_VERSION,
+    delete_template,
+    list_templates,
+    load_template,
+    new_template,
+    save_template,
+    slugify,
+    template_from_json,
+    template_path,
+    templates_dir,
+    template_to_json,
+    validate_template,
+)
+
+
+@pytest.fixture
+def isolated_templates(monkeypatch, tmp_path):
+    """Redirect the templates directory into ``tmp_path``."""
+    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path))
+    yield tmp_path
+
+
+class TestSlugify:
+    def test_basic(self):
+        assert slugify("Chase Personal Checking") == "chase-personal-checking"
+
+    def test_strips_punctuation(self):
+        assert slugify("BofA: Business (USD)") == "bofa-business-usd"
+
+    def test_empty_falls_back(self):
+        assert slugify("") == "untitled"
+        assert slugify(" ") == "untitled"
+
+
+class TestNewTemplate:
+    def test_has_schema_version(self):
+        t = new_template("Sample")
+        assert t["schema_version"] == SCHEMA_VERSION
+
+    def test_slug_derived_from_name(self):
+        t = new_template("Sample Bank")
+        assert t["slug"] == "sample-bank"
+        assert t["name"] == "Sample Bank"
+
+    def test_timestamps_present(self):
+        t = new_template("X")
+        assert t["created_at"]
+        assert t["updated_at"]
+
+
+class TestValidateTemplate:
+    def _valid(self) -> dict:
+        return {
+            "schema_version": SCHEMA_VERSION,
+            "slug": "x",
+            "name": "X",
+            "pages": {"range": "all"},
+            "table": {"column_boundaries": [100, 200]},
+            "columns": [
+                {"source": 0, "target": "date"},
+                {"source": 1, "target": "description"},
+                {"source": 2, "target": "amount"},
+            ],
+            "parse": {},
+        }
+
+    def test_valid_passes(self):
+        ok, errs = validate_template(self._valid())
+        assert ok, errs
+
+    def test_missing_name_fails(self):
+        t = self._valid()
+        t["name"] = ""
+        ok, errs = validate_template(t)
+        assert not ok
+        assert any("name" in e for e in errs)
+
+    def test_bad_schema_version(self):
+        t = self._valid()
+        t["schema_version"] = 999
+        ok, errs = validate_template(t)
+        assert not ok
+
+    def test_requires_date_column(self):
+        t = self._valid()
+        t["columns"] = [
+            {"source": 0, "target": "description"},
+            {"source": 1, "target": "amount"},
+        ]
+        ok, errs = validate_template(t)
+        assert not ok
+        assert any("date" in e for e in errs)
+
+    def test_requires_amount_or_debit_credit(self):
+        t = self._valid()
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "description"},
+        ]
+        ok, errs = validate_template(t)
+        assert not ok
+        assert any("amount" in e for e in errs)
+
+    def test_debit_credit_pair_is_valid(self):
+        t = self._valid()
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "description"},
+            {"source": 2, "target": "amount_debit"},
+            {"source": 3, "target": "amount_credit"},
+        ]
+        t["table"]["column_boundaries"] = [100, 200, 300]
+        ok, errs = validate_template(t)
+        assert ok, errs
+
+
+class TestPersistence:
+    def test_round_trip(self, isolated_templates):
+        t = new_template("Round Trip Bank")
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "description"},
+            {"source": 2, "target": "amount"},
+        ]
+        t["table"]["column_boundaries"] = [100, 200]
+        slug = save_template(t)
+        assert slug == "round-trip-bank"
+
+        path = template_path(slug)
+        assert path.exists()
+        loaded = load_template(slug)
+        assert loaded["name"] == "Round Trip Bank"
+        assert loaded["columns"][0]["target"] == "date"
+
+    def test_save_rejects_invalid(self, isolated_templates):
+        with pytest.raises(ValueError):
+            save_template({"schema_version": 1, "name": ""})
+
+    def test_load_missing_raises(self, isolated_templates):
+        with pytest.raises(FileNotFoundError):
+            load_template("does-not-exist")
+
+    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
+        bad = tmp_path / "bad.json"
+        bad.write_text("not json", encoding="utf-8")
+        with pytest.raises(ValueError):
+            load_template("bad")
+
+    def test_delete(self, isolated_templates):
+        t = new_template("To Delete")
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "amount"},
+        ]
+        t["table"]["column_boundaries"] = [100]
+        save_template(t)
+        assert delete_template("to-delete") is True
+        assert delete_template("to-delete") is False
+
+    def test_list_returns_summaries(self, isolated_templates):
+        for name in ["Alpha", "Bravo"]:
+            t = new_template(name)
+            t["columns"] = [
+                {"source": 0, "target": "date"},
+                {"source": 1, "target": "amount"},
+            ]
+            t["table"]["column_boundaries"] = [100]
+            save_template(t)
+        rows = list_templates()
+        assert {r["slug"] for r in rows} == {"alpha", "bravo"}
+
+    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
+        (tmp_path / "broken.json").write_text("nope", encoding="utf-8")
+        # Even with a broken file present, list still returns []
+        rows = list_templates()
+        assert rows == []
+
+    def test_atomic_save_no_partial_file_on_failure(
+        self, isolated_templates, monkeypatch
+    ):
+        """If the final rename fails, no half-written JSON survives at
+        the target path and the temp file is cleaned up."""
+        t = new_template("Atomic")
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "amount"},
+        ]
+        t["table"]["column_boundaries"] = [100]
+
+        # Fail inside _atomic_write, after the temp file has been
+        # written: patching json.dumps would raise before any file is
+        # touched and leave the rollback path untested.
+        import src.pdf_templates as mod
+
+        def boom(*a, **kw):
+            raise OSError("disk full")
+
+        with monkeypatch.context() as m:
+            m.setattr(mod.os, "replace", boom)
+            with pytest.raises(OSError):
+                save_template(t)
+
+        assert not template_path("atomic").exists()
+        assert list(isolated_templates.iterdir()) == []
+
+
+class TestImportExport:
+    def test_round_trip_via_json(self):
+        t = new_template("Exported")
+        t["columns"] = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "amount"},
+        ]
+        payload = template_to_json(t)
+        loaded = template_from_json(payload)
+        assert loaded["name"] == "Exported"
+
+    def test_import_rejects_bad_schema(self):
+        bad = json.dumps({"schema_version": 999, "name": "X"})
+        with pytest.raises(ValueError):
+            template_from_json(bad)
+
+    def test_import_rejects_non_object(self):
+        with pytest.raises(ValueError):
+            template_from_json('["not", "an", "object"]')
+
+
+def test_templates_dir_env_override(monkeypatch, tmp_path):
+    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path))
+    assert templates_dir() == tmp_path