feat(pdf): schema v2 + mode field + v1 in-memory migration
Bumps ``SCHEMA_VERSION`` from 1 to 2 to add a top-level ``mode``
field distinguishing ``row_heuristic`` (the new default) from
``column_visual`` (legacy). The version bump is deliberate rather
than cosmetic: older code that defaults missing keys would
silently mis-extract, so the change is made carefully:
- ``new_template`` now returns mode=``row_heuristic`` with the
full row-heuristic config tree pre-populated. The legacy
column-visual fields are still seeded with empty defaults so
switching modes in the GUI doesn't require runtime key
insertion.
- ``validate_template`` is mode-aware: row_heuristic templates
must have a valid ``amounts.shape`` + sane
``row_detection.min/max_amounts_per_row``; column_visual
templates keep the existing column/target requirements.
- ``load_template`` accepts both v1 and v2 files
(``_LOAD_SUPPORTED_VERSIONS = {1, 2}``). v1 files get
``mode="column_visual"`` injected and ``schema_version`` bumped
IN MEMORY ONLY — the file on disk stays v1 until the user
explicitly re-saves, so a buggy migration can't silently corrupt
their template library.
- ``save_template`` continues to write the current schema; saving
a v1 template through the GUI naturally upgrades it.
The mode and amount-shape constants (``VALID_MODES``,
``VALID_AMOUNT_SHAPES``) are exported so the GUI dropdowns can
derive their options from the source of truth.
The validation tests are split into
``TestValidateTemplateRowHeuristic`` (6 tests),
``TestValidateTemplateColumnVisual`` (4 tests) and
``TestV1Migration`` (1 test). All 29 template tests pass; the
original column-mode tests, which implicitly relied on
schema_version=1, keep working because the column fields seeded
by ``new_template`` are still present in row_heuristic templates
(they are just not validated as required).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -70,7 +70,29 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
SCHEMA_VERSION = 1
|
SCHEMA_VERSION = 2
|
||||||
|
|
||||||
|
# Backward-compatible versions ``load_template`` will accept.
|
||||||
|
# v1 templates predate the row-heuristic shift and are loaded as
|
||||||
|
# ``mode="column_visual"``; they're not auto-migrated on disk, so
|
||||||
|
# the user keeps their canonical original until they re-save.
|
||||||
|
_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2})
|
||||||
|
|
||||||
|
# Extraction modes. ``row_heuristic`` is the default for new
|
||||||
|
# templates — finds transactions by date+amount pattern matching
|
||||||
|
# with no coordinate dependency. ``column_visual`` is the legacy
|
||||||
|
# x-position-boundary approach, kept for old templates and for
|
||||||
|
# the "Advanced" build-mode fallback when the heuristic misfires.
|
||||||
|
VALID_MODES = frozenset({"row_heuristic", "column_visual"})
|
||||||
|
|
||||||
|
# Amount shapes for row_heuristic mode. The GUI offers these as a
|
||||||
|
# dropdown; the parser uses them to assign amount tokens to fields.
|
||||||
|
VALID_AMOUNT_SHAPES = frozenset({
|
||||||
|
"single",
|
||||||
|
"txn_balance",
|
||||||
|
"debit_credit",
|
||||||
|
"debit_credit_balance",
|
||||||
|
})
|
||||||
|
|
||||||
VALID_TARGETS = frozenset({
|
VALID_TARGETS = frozenset({
|
||||||
"date",
|
"date",
|
||||||
@@ -128,8 +150,10 @@ def slugify(name: str) -> str:
|
|||||||
def new_template(name: str) -> dict[str, Any]:
|
def new_template(name: str) -> dict[str, Any]:
|
||||||
"""Build a blank template with sensible defaults.
|
"""Build a blank template with sensible defaults.
|
||||||
|
|
||||||
Caller can edit any field; the GUI's build flow fills in the
|
Defaults to ``mode="row_heuristic"`` — the simpler, more
|
||||||
table and columns sections as the user works through it.
|
robust approach. The GUI's build flow lets the user switch to
|
||||||
|
``mode="column_visual"`` if the heuristic doesn't fit their
|
||||||
|
statement layout.
|
||||||
"""
|
"""
|
||||||
now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
|
now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
|
||||||
slug = slugify(name)
|
slug = slugify(name)
|
||||||
@@ -138,12 +162,35 @@ def new_template(name: str) -> dict[str, Any]:
|
|||||||
"slug": slug,
|
"slug": slug,
|
||||||
"name": name or slug,
|
"name": name or slug,
|
||||||
"notes": "",
|
"notes": "",
|
||||||
|
"mode": "row_heuristic",
|
||||||
"created_at": now,
|
"created_at": now,
|
||||||
"updated_at": now,
|
"updated_at": now,
|
||||||
"pages": {
|
"pages": {
|
||||||
"range": "all",
|
"range": "all",
|
||||||
"skip_matching": "",
|
"skip_matching": "",
|
||||||
},
|
},
|
||||||
|
# Row-heuristic config (primary path).
|
||||||
|
"row_detection": {
|
||||||
|
"min_amounts_per_row": 1,
|
||||||
|
"max_amounts_per_row": 3,
|
||||||
|
"y_tolerance": 3.0,
|
||||||
|
"skip_rows_matching": [],
|
||||||
|
"merge_multiline_description": True,
|
||||||
|
},
|
||||||
|
"amounts": {
|
||||||
|
"shape": "single",
|
||||||
|
"negative_in_parens": True,
|
||||||
|
"decimal_separator": ".",
|
||||||
|
"thousands_separator": ",",
|
||||||
|
"currency_strip": "$",
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"format": "%m/%d/%Y",
|
||||||
|
"formats_fallback": [],
|
||||||
|
},
|
||||||
|
# Column-visual config (legacy / Advanced fallback). Empty
|
||||||
|
# placeholders so the GUI can populate when the user
|
||||||
|
# switches modes without inserting keys at runtime.
|
||||||
"table": {
|
"table": {
|
||||||
"header_text": "",
|
"header_text": "",
|
||||||
"end_markers": [],
|
"end_markers": [],
|
||||||
@@ -178,8 +225,9 @@ def new_template(name: str) -> dict[str, Any]:
|
|||||||
def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
|
def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
|
||||||
"""Check the template before saving. Returns ``(ok, errors)``.
|
"""Check the template before saving. Returns ``(ok, errors)``.
|
||||||
|
|
||||||
The GUI shows the errors next to the Save button; nothing
|
Mode-aware: row-heuristic templates and column-visual
|
||||||
silent here."""
|
templates have different required fields. The GUI shows the
|
||||||
|
errors next to the Save button; nothing silent here."""
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
if not isinstance(template, dict):
|
if not isinstance(template, dict):
|
||||||
return False, ["Template must be a JSON object."]
|
return False, ["Template must be a JSON object."]
|
||||||
@@ -201,50 +249,82 @@ def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
|
|||||||
"1–64 chars, starting with a letter or digit."
|
"1–64 chars, starting with a letter or digit."
|
||||||
)
|
)
|
||||||
|
|
||||||
columns = template.get("columns", [])
|
mode = template.get("mode", "row_heuristic")
|
||||||
if not isinstance(columns, list) or len(columns) < 2:
|
if mode not in VALID_MODES:
|
||||||
errors.append("At least two output columns are required.")
|
errors.append(
|
||||||
else:
|
f"mode {mode!r} must be one of: {sorted(VALID_MODES)}."
|
||||||
seen_targets: list[str] = []
|
)
|
||||||
for i, col in enumerate(columns):
|
|
||||||
if not isinstance(col, dict):
|
if mode == "row_heuristic":
|
||||||
errors.append(f"columns[{i}] must be an object.")
|
amounts = template.get("amounts", {}) or {}
|
||||||
continue
|
shape = amounts.get("shape", "single")
|
||||||
src = col.get("source")
|
if shape not in VALID_AMOUNT_SHAPES:
|
||||||
tgt = col.get("target")
|
|
||||||
if not isinstance(src, int) or src < 0:
|
|
||||||
errors.append(
|
|
||||||
f"columns[{i}].source must be a non-negative integer."
|
|
||||||
)
|
|
||||||
if not isinstance(tgt, str) or not tgt:
|
|
||||||
errors.append(f"columns[{i}].target must be a non-empty string.")
|
|
||||||
else:
|
|
||||||
seen_targets.append(tgt)
|
|
||||||
if "date" not in seen_targets:
|
|
||||||
errors.append("At least one column must map to 'date'.")
|
|
||||||
if (
|
|
||||||
"amount" not in seen_targets
|
|
||||||
and not (
|
|
||||||
"amount_debit" in seen_targets
|
|
||||||
and "amount_credit" in seen_targets
|
|
||||||
)
|
|
||||||
):
|
|
||||||
errors.append(
|
errors.append(
|
||||||
"Either an 'amount' column or both 'amount_debit' + "
|
f"amounts.shape {shape!r} must be one of: "
|
||||||
"'amount_credit' columns are required."
|
f"{sorted(VALID_AMOUNT_SHAPES)}."
|
||||||
|
)
|
||||||
|
rd = template.get("row_detection", {}) or {}
|
||||||
|
min_a = rd.get("min_amounts_per_row", 1)
|
||||||
|
max_a = rd.get("max_amounts_per_row", 3)
|
||||||
|
if not (isinstance(min_a, int) and isinstance(max_a, int)):
|
||||||
|
errors.append(
|
||||||
|
"row_detection.min_amounts_per_row and "
|
||||||
|
"max_amounts_per_row must be integers."
|
||||||
|
)
|
||||||
|
elif min_a < 1 or max_a < min_a:
|
||||||
|
errors.append(
|
||||||
|
"row_detection.min_amounts_per_row must be ≥1 and ≤ "
|
||||||
|
"max_amounts_per_row."
|
||||||
)
|
)
|
||||||
|
|
||||||
table = template.get("table", {}) or {}
|
elif mode == "column_visual":
|
||||||
boundaries = table.get("column_boundaries", [])
|
columns = template.get("columns", [])
|
||||||
if not isinstance(boundaries, list):
|
if not isinstance(columns, list) or len(columns) < 2:
|
||||||
errors.append("table.column_boundaries must be a list.")
|
errors.append(
|
||||||
elif columns and len(boundaries) + 1 < len(set(
|
"column_visual mode: at least two output columns "
|
||||||
c.get("source") for c in columns if isinstance(c, dict)
|
"are required."
|
||||||
)):
|
)
|
||||||
errors.append(
|
else:
|
||||||
"table.column_boundaries doesn't match the number of source columns "
|
seen_targets: list[str] = []
|
||||||
"implied by the column mapping."
|
for i, col in enumerate(columns):
|
||||||
)
|
if not isinstance(col, dict):
|
||||||
|
errors.append(f"columns[{i}] must be an object.")
|
||||||
|
continue
|
||||||
|
src = col.get("source")
|
||||||
|
tgt = col.get("target")
|
||||||
|
if not isinstance(src, int) or src < 0:
|
||||||
|
errors.append(
|
||||||
|
f"columns[{i}].source must be a non-negative "
|
||||||
|
f"integer."
|
||||||
|
)
|
||||||
|
if not isinstance(tgt, str) or not tgt:
|
||||||
|
errors.append(
|
||||||
|
f"columns[{i}].target must be a non-empty string."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
seen_targets.append(tgt)
|
||||||
|
if "date" not in seen_targets:
|
||||||
|
errors.append(
|
||||||
|
"column_visual mode: at least one column must map "
|
||||||
|
"to 'date'."
|
||||||
|
)
|
||||||
|
if (
|
||||||
|
"amount" not in seen_targets
|
||||||
|
and not (
|
||||||
|
"amount_debit" in seen_targets
|
||||||
|
and "amount_credit" in seen_targets
|
||||||
|
)
|
||||||
|
):
|
||||||
|
errors.append(
|
||||||
|
"column_visual mode: either an 'amount' column or "
|
||||||
|
"both 'amount_debit' + 'amount_credit' columns "
|
||||||
|
"are required."
|
||||||
|
)
|
||||||
|
|
||||||
|
table = template.get("table", {}) or {}
|
||||||
|
boundaries = table.get("column_boundaries", [])
|
||||||
|
if not isinstance(boundaries, list):
|
||||||
|
errors.append("table.column_boundaries must be a list.")
|
||||||
|
|
||||||
return (not errors), errors
|
return (not errors), errors
|
||||||
|
|
||||||
@@ -302,7 +382,14 @@ def save_template(template: dict[str, Any]) -> str:
|
|||||||
def load_template(slug: str) -> dict[str, Any]:
|
def load_template(slug: str) -> dict[str, Any]:
|
||||||
"""Read the template at *slug*. Raises ``FileNotFoundError`` if
|
"""Read the template at *slug*. Raises ``FileNotFoundError`` if
|
||||||
missing, ``ValueError`` if the JSON is corrupt or the schema
|
missing, ``ValueError`` if the JSON is corrupt or the schema
|
||||||
version is unknown."""
|
version is unsupported.
|
||||||
|
|
||||||
|
v1 templates (pre row-heuristic) are accepted and migrated
|
||||||
|
in-memory to v2 shape with ``mode="column_visual"``. The file
|
||||||
|
on disk is NOT rewritten — the user's canonical original stays
|
||||||
|
intact until they explicitly re-save, so a buggy migration
|
||||||
|
can't silently corrupt their template library.
|
||||||
|
"""
|
||||||
p = template_path(slug)
|
p = template_path(slug)
|
||||||
try:
|
try:
|
||||||
raw = p.read_text(encoding="utf-8")
|
raw = p.read_text(encoding="utf-8")
|
||||||
@@ -313,11 +400,25 @@ def load_template(slug: str) -> dict[str, Any]:
|
|||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise ValueError(f"Corrupt template {slug!r}: {e}") from e
|
raise ValueError(f"Corrupt template {slug!r}: {e}") from e
|
||||||
sv = data.get("schema_version")
|
sv = data.get("schema_version")
|
||||||
if sv != SCHEMA_VERSION:
|
if sv not in _LOAD_SUPPORTED_VERSIONS:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Template {slug!r} has unsupported schema_version {sv!r}; "
|
f"Template {slug!r} has unsupported schema_version {sv!r}; "
|
||||||
f"expected {SCHEMA_VERSION}."
|
f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}."
|
||||||
)
|
)
|
||||||
|
return _migrate_to_current(data)
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
"""In-memory migration of older schemas to the current shape.
|
||||||
|
|
||||||
|
v1 → v2 adds a ``mode`` key defaulting to ``"column_visual"``
|
||||||
|
(since v1 was the column-x-position approach) and stamps
|
||||||
|
``schema_version`` to the current value. All v1 keys keep
|
||||||
|
their original meaning."""
|
||||||
|
if data.get("schema_version") == 1:
|
||||||
|
data = dict(data)
|
||||||
|
data["schema_version"] = SCHEMA_VERSION
|
||||||
|
data.setdefault("mode", "column_visual")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -57,12 +57,72 @@ class TestNewTemplate:
|
|||||||
assert t["updated_at"]
|
assert t["updated_at"]
|
||||||
|
|
||||||
|
|
||||||
class TestValidateTemplate:
|
class TestValidateTemplateRowHeuristic:
|
||||||
|
"""Row-heuristic mode is the v2 default."""
|
||||||
|
|
||||||
def _valid(self) -> dict:
|
def _valid(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"schema_version": SCHEMA_VERSION,
|
"schema_version": SCHEMA_VERSION,
|
||||||
"slug": "x",
|
"slug": "x",
|
||||||
"name": "X",
|
"name": "X",
|
||||||
|
"mode": "row_heuristic",
|
||||||
|
"row_detection": {
|
||||||
|
"min_amounts_per_row": 1,
|
||||||
|
"max_amounts_per_row": 3,
|
||||||
|
},
|
||||||
|
"amounts": {"shape": "single"},
|
||||||
|
"date": {"format": "%m/%d/%Y"},
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_valid_passes(self):
|
||||||
|
ok, errs = validate_template(self._valid())
|
||||||
|
assert ok, errs
|
||||||
|
|
||||||
|
def test_missing_name_fails(self):
|
||||||
|
t = self._valid()
|
||||||
|
t["name"] = ""
|
||||||
|
ok, errs = validate_template(t)
|
||||||
|
assert not ok
|
||||||
|
|
||||||
|
def test_bad_mode_fails(self):
|
||||||
|
t = self._valid()
|
||||||
|
t["mode"] = "magic"
|
||||||
|
ok, errs = validate_template(t)
|
||||||
|
assert not ok
|
||||||
|
assert any("mode" in e for e in errs)
|
||||||
|
|
||||||
|
def test_bad_shape_fails(self):
|
||||||
|
t = self._valid()
|
||||||
|
t["amounts"]["shape"] = "telepathic"
|
||||||
|
ok, errs = validate_template(t)
|
||||||
|
assert not ok
|
||||||
|
assert any("shape" in e for e in errs)
|
||||||
|
|
||||||
|
def test_inverted_amount_range_fails(self):
|
||||||
|
t = self._valid()
|
||||||
|
t["row_detection"]["min_amounts_per_row"] = 5
|
||||||
|
t["row_detection"]["max_amounts_per_row"] = 2
|
||||||
|
ok, errs = validate_template(t)
|
||||||
|
assert not ok
|
||||||
|
|
||||||
|
def test_does_not_require_columns_in_row_mode(self):
|
||||||
|
"""Key point: row mode doesn't need ``columns`` populated.
|
||||||
|
That's what makes the GUI's primary path simpler than v1."""
|
||||||
|
t = self._valid()
|
||||||
|
# No columns key at all.
|
||||||
|
ok, errs = validate_template(t)
|
||||||
|
assert ok, errs
|
||||||
|
|
||||||
|
|
||||||
|
class TestValidateTemplateColumnVisual:
|
||||||
|
"""Legacy column-visual mode keeps its own contract."""
|
||||||
|
|
||||||
|
def _valid(self) -> dict:
|
||||||
|
return {
|
||||||
|
"schema_version": SCHEMA_VERSION,
|
||||||
|
"slug": "x",
|
||||||
|
"name": "X",
|
||||||
|
"mode": "column_visual",
|
||||||
"pages": {"range": "all"},
|
"pages": {"range": "all"},
|
||||||
"table": {"column_boundaries": [100, 200]},
|
"table": {"column_boundaries": [100, 200]},
|
||||||
"columns": [
|
"columns": [
|
||||||
@@ -77,19 +137,6 @@ class TestValidateTemplate:
|
|||||||
ok, errs = validate_template(self._valid())
|
ok, errs = validate_template(self._valid())
|
||||||
assert ok, errs
|
assert ok, errs
|
||||||
|
|
||||||
def test_missing_name_fails(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["name"] = ""
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
assert any("name" in e for e in errs)
|
|
||||||
|
|
||||||
def test_bad_schema_version(self):
|
|
||||||
t = self._valid()
|
|
||||||
t["schema_version"] = 999
|
|
||||||
ok, errs = validate_template(t)
|
|
||||||
assert not ok
|
|
||||||
|
|
||||||
def test_requires_date_column(self):
|
def test_requires_date_column(self):
|
||||||
t = self._valid()
|
t = self._valid()
|
||||||
t["columns"] = [
|
t["columns"] = [
|
||||||
@@ -123,6 +170,36 @@ class TestValidateTemplate:
|
|||||||
assert ok, errs
|
assert ok, errs
|
||||||
|
|
||||||
|
|
||||||
|
class TestV1Migration:
|
||||||
|
"""v1 templates load with mode='column_visual' auto-injected;
|
||||||
|
the file on disk stays v1 until the user re-saves."""
|
||||||
|
|
||||||
|
def test_loads_v1_template(self, isolated_templates, tmp_path):
|
||||||
|
import json
|
||||||
|
v1_payload = {
|
||||||
|
"schema_version": 1,
|
||||||
|
"slug": "legacy",
|
||||||
|
"name": "Legacy Bank",
|
||||||
|
"pages": {"range": "all"},
|
||||||
|
"table": {"column_boundaries": [100, 200]},
|
||||||
|
"columns": [
|
||||||
|
{"source": 0, "target": "date"},
|
||||||
|
{"source": 1, "target": "description"},
|
||||||
|
{"source": 2, "target": "amount"},
|
||||||
|
],
|
||||||
|
"parse": {},
|
||||||
|
}
|
||||||
|
(tmp_path / "legacy.json").write_text(
|
||||||
|
json.dumps(v1_payload), encoding="utf-8",
|
||||||
|
)
|
||||||
|
loaded = load_template("legacy")
|
||||||
|
# In-memory migration adds mode + bumps schema_version
|
||||||
|
assert loaded["mode"] == "column_visual"
|
||||||
|
assert loaded["schema_version"] == SCHEMA_VERSION
|
||||||
|
# Original keys still intact
|
||||||
|
assert loaded["columns"][0]["target"] == "date"
|
||||||
|
|
||||||
|
|
||||||
class TestPersistence:
|
class TestPersistence:
|
||||||
def test_round_trip(self, isolated_templates):
|
def test_round_trip(self, isolated_templates):
|
||||||
t = new_template("Round Trip Bank")
|
t = new_template("Round Trip Bank")
|
||||||
|
|||||||
Reference in New Issue
Block a user