Bumps ``SCHEMA_VERSION`` from 1 to 2 to add a top-level ``mode``
field distinguishing ``row_heuristic`` (new default) from
``column_visual`` (legacy). The schema bump is necessary: old code
that fills in defaults for missing keys would silently mis-extract
from a v2 template. The migration is therefore handled carefully:
- ``new_template`` now returns mode=``row_heuristic`` with the
full row-heuristic config tree pre-populated. The legacy
column-visual fields are still seeded with empty defaults so
switching modes in the GUI doesn't require runtime key
insertion.
- ``validate_template`` is mode-aware: row_heuristic templates
must have a valid ``amounts.shape`` + sane
``row_detection.min/max_amounts_per_row``; column_visual
templates keep the existing column/target requirements.
- ``load_template`` accepts both v1 and v2 files
(``_LOAD_SUPPORTED_VERSIONS = {1, 2}``). v1 files get
``mode="column_visual"`` injected and ``schema_version`` bumped
IN MEMORY ONLY — disk file stays v1 until the user explicitly
re-saves. A buggy migration can't silently corrupt their
template library.
- ``save_template`` continues to write the current schema; saving
a v1 template through the GUI naturally upgrades it.
Mode + shape constants exported (``VALID_MODES``,
``VALID_AMOUNT_SHAPES``) so the GUI dropdowns can derive their
options from the source of truth.
Tests split into ``TestValidateTemplateRowHeuristic`` (6) +
``TestValidateTemplateColumnVisual`` (4) + ``TestV1Migration``
(1). All 29 template tests pass; the original column-mode tests
that previously implicitly relied on schema_version=1 keep
working because new_template's seeded column fields are still
present in row_heuristic templates (just not validated as
required).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
317 lines
10 KiB
Python
317 lines
10 KiB
Python
"""Tests for the PDF template storage layer."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
import pytest
|
|
|
|
from src.pdf_templates import (
|
|
SCHEMA_VERSION,
|
|
delete_template,
|
|
list_templates,
|
|
load_template,
|
|
new_template,
|
|
save_template,
|
|
slugify,
|
|
template_from_json,
|
|
template_path,
|
|
templates_dir,
|
|
template_to_json,
|
|
validate_template,
|
|
)
|
|
|
|
|
|
@pytest.fixture
def isolated_templates(monkeypatch, tmp_path):
    """Redirect the templates directory into pytest's ``tmp_path``.

    Sets the ``DATATOOLS_PDF_TEMPLATES_DIR`` environment variable to the
    per-test temporary directory and yields that directory to the test.
    """
    sandbox_dir = tmp_path
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(sandbox_dir))
    yield sandbox_dir
|
|
|
|
|
|
class TestSlugify:
    """Checks on how ``slugify`` turns display names into slugs."""

    def test_basic(self):
        slug = slugify("Chase Personal Checking")
        assert slug == "chase-personal-checking"

    def test_strips_punctuation(self):
        slug = slugify("BofA: Business (USD)")
        assert slug == "bofa-business-usd"

    def test_empty_falls_back(self):
        # An empty name and a blank name both map to the placeholder slug.
        for blank_name in ("", " "):
            assert slugify(blank_name) == "untitled"
|
|
|
|
|
|
class TestNewTemplate:
    """Checks on the dict returned by ``new_template``."""

    def test_has_schema_version(self):
        fresh = new_template("Sample")
        assert fresh["schema_version"] == SCHEMA_VERSION

    def test_slug_derived_from_name(self):
        fresh = new_template("Sample Bank")
        # The slug is derived from the name; the name itself is kept verbatim.
        assert fresh["slug"] == "sample-bank"
        assert fresh["name"] == "Sample Bank"

    def test_timestamps_present(self):
        fresh = new_template("X")
        for stamp_key in ("created_at", "updated_at"):
            assert fresh[stamp_key]
|
|
|
|
|
|
class TestValidateTemplateRowHeuristic:
    """Row-heuristic mode is the v2 default."""

    def _valid(self) -> dict:
        """Build a fresh, minimal row-heuristic template for each test."""
        row_detection = {
            "min_amounts_per_row": 1,
            "max_amounts_per_row": 3,
        }
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }

    def test_valid_passes(self):
        is_valid, problems = validate_template(self._valid())
        assert is_valid, problems

    def test_missing_name_fails(self):
        template = {**self._valid(), "name": ""}
        is_valid, _problems = validate_template(template)
        assert not is_valid

    def test_bad_mode_fails(self):
        template = {**self._valid(), "mode": "magic"}
        is_valid, problems = validate_template(template)
        assert not is_valid
        assert any("mode" in problem for problem in problems)

    def test_bad_shape_fails(self):
        template = self._valid()
        template["amounts"] = {"shape": "telepathic"}
        is_valid, problems = validate_template(template)
        assert not is_valid
        assert any("shape" in problem for problem in problems)

    def test_inverted_amount_range_fails(self):
        template = self._valid()
        # min > max is the inverted range under test.
        template["row_detection"].update(
            min_amounts_per_row=5,
            max_amounts_per_row=2,
        )
        is_valid, _problems = validate_template(template)
        assert not is_valid

    def test_does_not_require_columns_in_row_mode(self):
        """Key point: row mode doesn't need ``columns`` populated.
        That's what makes the GUI's primary path simpler than v1."""
        template = self._valid()
        # No columns key at all.
        is_valid, problems = validate_template(template)
        assert is_valid, problems
|
|
|
|
|
|
class TestValidateTemplateColumnVisual:
    """Legacy column-visual mode keeps its own contract."""

    def _columns(self, *targets: str) -> list:
        """Map each target name to a column entry whose source is its position."""
        return [
            {"source": position, "target": target}
            for position, target in enumerate(targets)
        ]

    def _valid(self) -> dict:
        """Build a fresh, minimal column-visual template for each test."""
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "column_visual",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": self._columns("date", "description", "amount"),
            "parse": {},
        }

    def test_valid_passes(self):
        is_valid, problems = validate_template(self._valid())
        assert is_valid, problems

    def test_requires_date_column(self):
        template = self._valid()
        template["columns"] = self._columns("description", "amount")
        is_valid, problems = validate_template(template)
        assert not is_valid
        assert any("date" in problem for problem in problems)

    def test_requires_amount_or_debit_credit(self):
        template = self._valid()
        template["columns"] = self._columns("date", "description")
        is_valid, problems = validate_template(template)
        assert not is_valid
        assert any("amount" in problem for problem in problems)

    def test_debit_credit_pair_is_valid(self):
        template = self._valid()
        template["columns"] = self._columns(
            "date", "description", "amount_debit", "amount_credit",
        )
        template["table"]["column_boundaries"] = [100, 200, 300]
        is_valid, problems = validate_template(template)
        assert is_valid, problems
|
|
|
|
|
|
class TestV1Migration:
    """v1 templates load with mode='column_visual' auto-injected;
    the file on disk stays v1 until the user re-saves."""

    def test_loads_v1_template(self, isolated_templates, tmp_path):
        """Load a hand-written v1 file and check both halves of the contract.

        The loaded dict must carry the injected ``mode`` and the current
        ``schema_version``, while the JSON file itself must be left exactly
        as it was written.
        """
        v1_payload = {
            "schema_version": 1,
            "slug": "legacy",
            "name": "Legacy Bank",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": [
                {"source": 0, "target": "date"},
                {"source": 1, "target": "description"},
                {"source": 2, "target": "amount"},
            ],
            "parse": {},
        }
        # ``isolated_templates`` points the templates directory at ``tmp_path``,
        # so writing here makes the file visible to ``load_template``.
        legacy_file = tmp_path / "legacy.json"
        legacy_file.write_text(json.dumps(v1_payload), encoding="utf-8")

        loaded = load_template("legacy")

        # In-memory migration adds mode + bumps schema_version
        assert loaded["mode"] == "column_visual"
        assert loaded["schema_version"] == SCHEMA_VERSION
        # Original keys still intact
        assert loaded["columns"][0]["target"] == "date"

        # Loading alone must not rewrite the file: it is still the v1 payload.
        on_disk = json.loads(legacy_file.read_text(encoding="utf-8"))
        assert on_disk == v1_payload
|
|
|
|
|
|
class TestPersistence:
    """Save / load / delete / list tests run against an isolated directory."""

    @staticmethod
    def _two_column_template(name: str) -> dict:
        """Return ``new_template(name)`` with date + amount columns filled in.

        Shared setup for the tests that only need *some* saveable template.
        """
        template = new_template(name)
        template["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "amount"},
        ]
        template["table"]["column_boundaries"] = [100]
        return template

    def test_round_trip(self, isolated_templates):
        t = new_template("Round Trip Bank")
        t["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        t["table"]["column_boundaries"] = [100, 200]
        slug = save_template(t)
        assert slug == "round-trip-bank"

        path = template_path(slug)
        assert path.exists()
        loaded = load_template(slug)
        assert loaded["name"] == "Round Trip Bank"
        assert loaded["columns"][0]["target"] == "date"

    def test_save_rejects_invalid(self, isolated_templates):
        with pytest.raises(ValueError):
            save_template({"schema_version": 1, "name": ""})

    def test_load_missing_raises(self, isolated_templates):
        with pytest.raises(FileNotFoundError):
            load_template("does-not-exist")

    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
        bad = tmp_path / "bad.json"
        bad.write_text("not json", encoding="utf-8")
        with pytest.raises(ValueError):
            load_template("bad")

    def test_delete(self, isolated_templates):
        save_template(self._two_column_template("To Delete"))
        # First delete removes the file; the second finds nothing to remove.
        assert delete_template("to-delete") is True
        assert delete_template("to-delete") is False

    def test_list_returns_summaries(self, isolated_templates):
        for name in ["Alpha", "Bravo"]:
            save_template(self._two_column_template(name))
        rows = list_templates()
        assert {r["slug"] for r in rows} == {"alpha", "bravo"}

    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
        (tmp_path / "broken.json").write_text("nope", encoding="utf-8")
        # Even with a broken file present, list still returns []
        rows = list_templates()
        assert rows == []

    def test_atomic_save_no_partial_file_on_failure(
        self, isolated_templates, monkeypatch
    ):
        """If the write step fails mid-way, no half-written JSON survives
        at the target path. Tests the temp-file-rename safety pattern."""
        t = self._two_column_template("Atomic")

        # Make json.dumps blow up to simulate a failure during write.
        # save_template already validated before this step, so the
        # crash is "after validation, during write".
        import src.pdf_templates as mod

        def boom(*a, **kw):
            raise IOError("disk full")

        # ``mod.json`` is the shared stdlib module, so keep the patch scoped:
        # the context manager restores ``json.dumps`` even if the
        # ``pytest.raises`` check below fails.
        with monkeypatch.context() as scoped:
            scoped.setattr(mod.json, "dumps", boom)
            with pytest.raises(IOError):
                save_template(t)

        assert not template_path("atomic").exists()
|
|
|
|
|
|
class TestImportExport:
    """Checks on ``template_to_json`` / ``template_from_json``."""

    def test_round_trip_via_json(self):
        exported = new_template("Exported")
        exported["columns"] = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "amount"))
        ]
        restored = template_from_json(template_to_json(exported))
        assert restored["name"] == "Exported"

    def test_import_rejects_bad_schema(self):
        unsupported = json.dumps({"schema_version": 999, "name": "X"})
        with pytest.raises(ValueError):
            template_from_json(unsupported)

    def test_import_rejects_non_object(self):
        # Valid JSON, but the top level is an array rather than an object.
        array_payload = '["not", "an", "object"]'
        with pytest.raises(ValueError):
            template_from_json(array_payload)
|
|
|
|
|
|
def test_templates_dir_env_override(monkeypatch, tmp_path):
    """``templates_dir`` returns the path set in ``DATATOOLS_PDF_TEMPLATES_DIR``."""
    override = str(tmp_path)
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", override)
    resolved = templates_dir()
    assert resolved == tmp_path
|