feat(pdf): schema v2 + mode field + v1 in-memory migration

Bumps ``SCHEMA_VERSION`` from 1 to 2 to add a top-level ``mode``
field distinguishing ``row_heuristic`` (new default) from
``column_visual`` (legacy). The schema bump is real — old code
that defaults missing keys would silently mis-extract — so we
do it the careful way:

- ``new_template`` now returns mode=``row_heuristic`` with the
  full row-heuristic config tree pre-populated. The legacy
  column-visual fields are still seeded with empty defaults so
  switching modes in the GUI doesn't require runtime key
  insertion.
- ``validate_template`` is mode-aware: row_heuristic templates
  must have a valid ``amounts.shape`` and a sane
  ``row_detection.min_amounts_per_row`` /
  ``row_detection.max_amounts_per_row`` range (e.g. min must not
  exceed max); column_visual templates keep the existing
  column/target requirements.
- ``load_template`` accepts both v1 and v2 files
  (``_LOAD_SUPPORTED_VERSIONS = {1, 2}``). v1 files get
  ``mode="column_visual"`` injected and ``schema_version`` bumped
  IN MEMORY ONLY — disk file stays v1 until the user explicitly
  re-saves. A buggy migration can't silently corrupt their
  template library.
- ``save_template`` continues to write the current schema; saving
  a v1 template through the GUI naturally upgrades it.

Mode + shape constants exported (``VALID_MODES``,
``VALID_AMOUNT_SHAPES``) so the GUI dropdowns can derive their
options from the source of truth.

Tests split into ``TestValidateTemplateRowHeuristic`` (6) +
``TestValidateTemplateColumnVisual`` (4) + ``TestV1Migration``
(1). All 29 template tests pass; the original column-mode tests
that previously implicitly relied on schema_version=1 keep
working because new_template's seeded column fields are still
present in row_heuristic templates (just not validated as
required).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 23:46:10 +00:00
parent d80befd05a
commit 48cd9e8249
2 changed files with 241 additions and 63 deletions

View File

@@ -57,12 +57,72 @@ class TestNewTemplate:
assert t["updated_at"]
class TestValidateTemplate:
class TestValidateTemplateRowHeuristic:
    """Validation contract for ``row_heuristic`` templates (the v2 default)."""

    def _valid(self) -> dict:
        """Return a fresh, minimal row-heuristic template expected to validate.

        A new dict is built on every call, so tests may mutate the result
        (including its nested dicts) without affecting each other.
        """
        row_detection = {
            "min_amounts_per_row": 1,
            "max_amounts_per_row": 3,
        }
        template = {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        return template

    def test_valid_passes(self):
        """The baseline template validates; errors are shown on failure."""
        is_valid, errors = validate_template(self._valid())
        assert is_valid, errors

    def test_missing_name_fails(self):
        """An empty ``name`` is rejected."""
        template = {**self._valid(), "name": ""}
        is_valid, _ = validate_template(template)
        assert not is_valid

    def test_bad_mode_fails(self):
        """An unknown ``mode`` is rejected with an error mentioning "mode"."""
        template = {**self._valid(), "mode": "magic"}
        is_valid, errors = validate_template(template)
        assert not is_valid
        assert any("mode" in message for message in errors)

    def test_bad_shape_fails(self):
        """An unknown ``amounts.shape`` is rejected with an error mentioning "shape"."""
        template = self._valid()
        template["amounts"].update(shape="telepathic")
        is_valid, errors = validate_template(template)
        assert not is_valid
        assert any("shape" in message for message in errors)

    def test_inverted_amount_range_fails(self):
        """A minimum amounts-per-row above the maximum is rejected."""
        template = self._valid()
        template["row_detection"].update(
            min_amounts_per_row=5,
            max_amounts_per_row=2,
        )
        is_valid, _ = validate_template(template)
        assert not is_valid

    def test_does_not_require_columns_in_row_mode(self):
        """Row mode validates without any ``columns`` key.

        This is the key point of the mode: the GUI's primary path does not
        have to populate columns the way v1 did.
        """
        # The baseline template deliberately has no "columns" key at all.
        template = self._valid()
        is_valid, errors = validate_template(template)
        assert is_valid, errors
class TestValidateTemplateColumnVisual:
"""Legacy column-visual mode keeps its own contract."""
def _valid(self) -> dict:
return {
"schema_version": SCHEMA_VERSION,
"slug": "x",
"name": "X",
"mode": "column_visual",
"pages": {"range": "all"},
"table": {"column_boundaries": [100, 200]},
"columns": [
@@ -77,19 +137,6 @@ class TestValidateTemplate:
ok, errs = validate_template(self._valid())
assert ok, errs
def test_missing_name_fails(self):
t = self._valid()
t["name"] = ""
ok, errs = validate_template(t)
assert not ok
assert any("name" in e for e in errs)
def test_bad_schema_version(self):
t = self._valid()
t["schema_version"] = 999
ok, errs = validate_template(t)
assert not ok
def test_requires_date_column(self):
t = self._valid()
t["columns"] = [
@@ -123,6 +170,36 @@ class TestValidateTemplate:
assert ok, errs
class TestV1Migration:
    """v1 templates load with ``mode='column_visual'`` injected in memory.

    The file on disk must stay v1 until the user explicitly re-saves.
    """

    def test_loads_v1_template(self, isolated_templates, tmp_path):
        """Load a hand-written v1 file and check the migration is in-memory only.

        Presumably the ``isolated_templates`` fixture makes ``load_template``
        read from ``tmp_path``; the fixture is defined outside this block, so
        confirm against its definition.
        """
        import json

        v1_payload = {
            "schema_version": 1,
            "slug": "legacy",
            "name": "Legacy Bank",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": [
                {"source": 0, "target": "date"},
                {"source": 1, "target": "description"},
                {"source": 2, "target": "amount"},
            ],
            "parse": {},
        }
        legacy_path = tmp_path / "legacy.json"
        legacy_path.write_text(json.dumps(v1_payload), encoding="utf-8")

        loaded = load_template("legacy")

        # In-memory migration adds mode + bumps schema_version
        assert loaded["mode"] == "column_visual"
        assert loaded["schema_version"] == SCHEMA_VERSION
        # Original keys still intact
        assert loaded["columns"][0]["target"] == "date"

        # Loading must not rewrite the file: it stays v1 (schema_version 1,
        # no "mode" key) until the user explicitly re-saves.
        on_disk = json.loads(legacy_path.read_text(encoding="utf-8"))
        assert on_disk == v1_payload
class TestPersistence:
def test_round_trip(self, isolated_templates):
t = new_template("Round Trip Bank")