feat(pdf): schema v2 + mode field + v1 in-memory migration

Bumps ``SCHEMA_VERSION`` from 1 to 2 to add a top-level ``mode`` field
distinguishing ``row_heuristic`` (new default) from ``column_visual``
(legacy). The schema bump is real — old code that defaults missing keys
would silently mis-extract — so we do it the careful way:

- ``new_template`` now returns mode=``row_heuristic`` with the full
  row-heuristic config tree pre-populated. The legacy column-visual
  fields are still seeded with empty defaults so switching modes in the
  GUI doesn't require runtime key insertion.
- ``validate_template`` is mode-aware: row_heuristic templates must have
  a valid ``amounts.shape`` + sane
  ``row_detection.min/max_amounts_per_row``; column_visual templates
  keep the existing column/target requirements. Note that the old
  cross-check of ``table.column_boundaries`` against the number of
  mapped source columns is dropped; only the "must be a list" check
  remains.
- ``load_template`` accepts both v1 and v2 files
  (``_LOAD_SUPPORTED_VERSIONS = {1, 2}``). v1 files get
  ``mode="column_visual"`` injected and ``schema_version`` bumped IN
  MEMORY ONLY — the file on disk stays v1 until the user explicitly
  re-saves, so a buggy migration can't silently corrupt their template
  library.
- ``save_template`` continues to write the current schema; saving a v1
  template through the GUI naturally upgrades it.

Mode + shape constants are exported (``VALID_MODES``,
``VALID_AMOUNT_SHAPES``) so the GUI dropdowns can derive their options
from the source of truth.

Tests split into ``TestValidateTemplateRowHeuristic`` (6) +
``TestValidateTemplateColumnVisual`` (4) + ``TestV1Migration`` (1). All
29 template tests pass; the original column-mode tests that previously
implicitly relied on schema_version=1 keep working because
new_template's seeded column fields are still present in row_heuristic
templates (just not validated as required). ``test_bad_schema_version``
is removed in the split, and ``test_missing_name_fails`` no longer
asserts on the error text.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 23:46:10 +00:00
parent d80befd05a
commit 48cd9e8249
2 changed files with 241 additions and 63 deletions
--- a/src/pdf_templates.py
+++ b/src/pdf_templates.py
@@ -70,7 +70,29 @@ from pathlib import Path
 from typing import Any


-SCHEMA_VERSION = 1
+SCHEMA_VERSION = 2
+
+# Backward-compatible versions ``load_template`` will accept.
+# v1 templates predate the row-heuristic shift and are loaded as
+# ``mode="column_visual"``; they're not auto-migrated on disk, so
+# the user keeps their canonical original until they re-save.
+_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2})
+
+# Extraction modes. ``row_heuristic`` is the default for new
+# templates — finds transactions by date+amount pattern matching
+# with no coordinate dependency. ``column_visual`` is the legacy
+# x-position-boundary approach, kept for old templates and for
+# the "Advanced" build-mode fallback when the heuristic misfires.
+VALID_MODES = frozenset({"row_heuristic", "column_visual"})
+
+# Amount shapes for row_heuristic mode. The GUI offers these as a
+# dropdown; the parser uses them to assign amount tokens to fields.
+VALID_AMOUNT_SHAPES = frozenset({
+    "single",
+    "txn_balance",
+    "debit_credit",
+    "debit_credit_balance",
+})

 VALID_TARGETS = frozenset({
    "date",
@@ -128,8 +150,10 @@ def slugify(name: str) -> str:
 def new_template(name: str) -> dict[str, Any]:
    """Build a blank template with sensible defaults.

-    Caller can edit any field; the GUI's build flow fills in the
-    table and columns sections as the user works through it.
+    Defaults to ``mode="row_heuristic"`` — the simpler, more
+    robust approach. The GUI's build flow lets the user switch to
+    ``mode="column_visual"`` if the heuristic doesn't fit their
+    statement layout.
    """
    now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds")
    slug = slugify(name)
@@ -138,12 +162,35 @@ def new_template(name: str) -> dict[str, Any]:
        "slug": slug,
        "name": name or slug,
        "notes": "",
+        "mode": "row_heuristic",
        "created_at": now,
        "updated_at": now,
        "pages": {
            "range": "all",
            "skip_matching": "",
        },
+        # Row-heuristic config (primary path).
+        "row_detection": {
+            "min_amounts_per_row": 1,
+            "max_amounts_per_row": 3,
+            "y_tolerance": 3.0,
+            "skip_rows_matching": [],
+            "merge_multiline_description": True,
+        },
+        "amounts": {
+            "shape": "single",
+            "negative_in_parens": True,
+            "decimal_separator": ".",
+            "thousands_separator": ",",
+            "currency_strip": "$",
+        },
+        "date": {
+            "format": "%m/%d/%Y",
+            "formats_fallback": [],
+        },
+        # Column-visual config (legacy / Advanced fallback). Empty
+        # placeholders so the GUI can populate when the user
+        # switches modes without inserting keys at runtime.
        "table": {
            "header_text": "",
            "end_markers": [],
@@ -178,8 +225,9 @@ def new_template(name: str) -> dict[str, Any]:
 def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
    """Check the template before saving. Returns ``(ok, errors)``.

-    The GUI shows the errors next to the Save button; nothing
-    silent here."""
+    Mode-aware: row-heuristic templates and column-visual
+    templates have different required fields. The GUI shows the
+    errors next to the Save button; nothing silent here."""
    errors: list[str] = []
    if not isinstance(template, dict):
        return False, ["Template must be a JSON object."]
@@ -201,50 +249,82 @@ def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]:
            "1–64 chars, starting with a letter or digit."
        )

-    columns = template.get("columns", [])
-    if not isinstance(columns, list) or len(columns) < 2:
-        errors.append("At least two output columns are required.")
-    else:
-        seen_targets: list[str] = []
-        for i, col in enumerate(columns):
-            if not isinstance(col, dict):
-                errors.append(f"columns[{i}] must be an object.")
-                continue
-            src = col.get("source")
-            tgt = col.get("target")
-            if not isinstance(src, int) or src < 0:
-                errors.append(
-                    f"columns[{i}].source must be a non-negative integer."
-                )
-            if not isinstance(tgt, str) or not tgt:
-                errors.append(f"columns[{i}].target must be a non-empty string.")
-            else:
-                seen_targets.append(tgt)
-        if "date" not in seen_targets:
-            errors.append("At least one column must map to 'date'.")
-        if (
-            "amount" not in seen_targets
-            and not (
-                "amount_debit" in seen_targets
-                and "amount_credit" in seen_targets
-            )
-        ):
+    mode = template.get("mode", "row_heuristic")
+    if mode not in VALID_MODES:
+        errors.append(
+            f"mode {mode!r} must be one of: {sorted(VALID_MODES)}."
+        )
+
+    if mode == "row_heuristic":
+        amounts = template.get("amounts", {}) or {}
+        shape = amounts.get("shape", "single")
+        if shape not in VALID_AMOUNT_SHAPES:
            errors.append(
-                "Either an 'amount' column or both 'amount_debit' + "
-                "'amount_credit' columns are required."
+                f"amounts.shape {shape!r} must be one of: "
+                f"{sorted(VALID_AMOUNT_SHAPES)}."
+            )
+        rd = template.get("row_detection", {}) or {}
+        min_a = rd.get("min_amounts_per_row", 1)
+        max_a = rd.get("max_amounts_per_row", 3)
+        if not (isinstance(min_a, int) and isinstance(max_a, int)):
+            errors.append(
+                "row_detection.min_amounts_per_row and "
+                "max_amounts_per_row must be integers."
+            )
+        elif min_a < 1 or max_a < min_a:
+            errors.append(
+                "row_detection.min_amounts_per_row must be ≥1 and ≤ "
+                "max_amounts_per_row."
            )

-    table = template.get("table", {}) or {}
-    boundaries = table.get("column_boundaries", [])
-    if not isinstance(boundaries, list):
-        errors.append("table.column_boundaries must be a list.")
-    elif columns and len(boundaries) + 1 < len(set(
-        c.get("source") for c in columns if isinstance(c, dict)
-    )):
-        errors.append(
-            "table.column_boundaries doesn't match the number of source columns "
-            "implied by the column mapping."
-        )
+    elif mode == "column_visual":
+        columns = template.get("columns", [])
+        if not isinstance(columns, list) or len(columns) < 2:
+            errors.append(
+                "column_visual mode: at least two output columns "
+                "are required."
+            )
+        else:
+            seen_targets: list[str] = []
+            for i, col in enumerate(columns):
+                if not isinstance(col, dict):
+                    errors.append(f"columns[{i}] must be an object.")
+                    continue
+                src = col.get("source")
+                tgt = col.get("target")
+                if not isinstance(src, int) or src < 0:
+                    errors.append(
+                        f"columns[{i}].source must be a non-negative "
+                        f"integer."
+                    )
+                if not isinstance(tgt, str) or not tgt:
+                    errors.append(
+                        f"columns[{i}].target must be a non-empty string."
+                    )
+                else:
+                    seen_targets.append(tgt)
+            if "date" not in seen_targets:
+                errors.append(
+                    "column_visual mode: at least one column must map "
+                    "to 'date'."
+                )
+            if (
+                "amount" not in seen_targets
+                and not (
+                    "amount_debit" in seen_targets
+                    and "amount_credit" in seen_targets
+                )
+            ):
+                errors.append(
+                    "column_visual mode: either an 'amount' column or "
+                    "both 'amount_debit' + 'amount_credit' columns "
+                    "are required."
+                )
+
+        table = template.get("table", {}) or {}
+        boundaries = table.get("column_boundaries", [])
+        if not isinstance(boundaries, list):
+            errors.append("table.column_boundaries must be a list.")

    return (not errors), errors

@@ -302,7 +382,14 @@ def save_template(template: dict[str, Any]) -> str:
 def load_template(slug: str) -> dict[str, Any]:
    """Read the template at *slug*. Raises ``FileNotFoundError`` if
    missing, ``ValueError`` if the JSON is corrupt or the schema
-    version is unknown."""
+    version is unsupported.
+
+    v1 templates (pre row-heuristic) are accepted and migrated
+    in-memory to v2 shape with ``mode="column_visual"``. The file
+    on disk is NOT rewritten — the user's canonical original stays
+    intact until they explicitly re-save, so a buggy migration
+    can't silently corrupt their template library.
+    """
    p = template_path(slug)
    try:
        raw = p.read_text(encoding="utf-8")
@@ -313,11 +400,25 @@ def load_template(slug: str) -> dict[str, Any]:
    except json.JSONDecodeError as e:
        raise ValueError(f"Corrupt template {slug!r}: {e}") from e
    sv = data.get("schema_version")
-    if sv != SCHEMA_VERSION:
+    if sv not in _LOAD_SUPPORTED_VERSIONS:
        raise ValueError(
            f"Template {slug!r} has unsupported schema_version {sv!r}; "
-            f"expected {SCHEMA_VERSION}."
+            f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}."
        )
+    return _migrate_to_current(data)
+
+
+def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]:
+    """In-memory migration of older schemas to the current shape.
+
+    v1 → v2 adds a ``mode`` key defaulting to ``"column_visual"``
+    (since v1 was the column-x-position approach) and stamps
+    ``schema_version`` to the current value. All v1 keys keep
+    their original meaning."""
+    if data.get("schema_version") == 1:
+        data = dict(data)
+        data["schema_version"] = SCHEMA_VERSION
+        data.setdefault("mode", "column_visual")
    return data


--- a/tests/test_pdf_templates.py
+++ b/tests/test_pdf_templates.py
@@ -57,12 +57,72 @@ class TestNewTemplate:
        assert t["updated_at"]


-class TestValidateTemplate:
+class TestValidateTemplateRowHeuristic:
+    """Row-heuristic mode is the v2 default."""
+
    def _valid(self) -> dict:
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
+            "mode": "row_heuristic",
+            "row_detection": {
+                "min_amounts_per_row": 1,
+                "max_amounts_per_row": 3,
+            },
+            "amounts": {"shape": "single"},
+            "date": {"format": "%m/%d/%Y"},
+        }
+
+    def test_valid_passes(self):
+        ok, errs = validate_template(self._valid())
+        assert ok, errs
+
+    def test_missing_name_fails(self):
+        t = self._valid()
+        t["name"] = ""
+        ok, errs = validate_template(t)
+        assert not ok
+
+    def test_bad_mode_fails(self):
+        t = self._valid()
+        t["mode"] = "magic"
+        ok, errs = validate_template(t)
+        assert not ok
+        assert any("mode" in e for e in errs)
+
+    def test_bad_shape_fails(self):
+        t = self._valid()
+        t["amounts"]["shape"] = "telepathic"
+        ok, errs = validate_template(t)
+        assert not ok
+        assert any("shape" in e for e in errs)
+
+    def test_inverted_amount_range_fails(self):
+        t = self._valid()
+        t["row_detection"]["min_amounts_per_row"] = 5
+        t["row_detection"]["max_amounts_per_row"] = 2
+        ok, errs = validate_template(t)
+        assert not ok
+
+    def test_does_not_require_columns_in_row_mode(self):
+        """Key point: row mode doesn't need ``columns`` populated.
+        That's what makes the GUI's primary path simpler than v1."""
+        t = self._valid()
+        # No columns key at all.
+        ok, errs = validate_template(t)
+        assert ok, errs
+
+
+class TestValidateTemplateColumnVisual:
+    """Legacy column-visual mode keeps its own contract."""
+
+    def _valid(self) -> dict:
+        return {
+            "schema_version": SCHEMA_VERSION,
+            "slug": "x",
+            "name": "X",
+            "mode": "column_visual",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": [
@@ -77,19 +137,6 @@ class TestValidateTemplate:
        ok, errs = validate_template(self._valid())
        assert ok, errs

-    def test_missing_name_fails(self):
-        t = self._valid()
-        t["name"] = ""
-        ok, errs = validate_template(t)
-        assert not ok
-        assert any("name" in e for e in errs)
-
-    def test_bad_schema_version(self):
-        t = self._valid()
-        t["schema_version"] = 999
-        ok, errs = validate_template(t)
-        assert not ok
-
    def test_requires_date_column(self):
        t = self._valid()
        t["columns"] = [
@@ -123,6 +170,36 @@ class TestValidateTemplate:
        assert ok, errs


+class TestV1Migration:
+    """v1 templates load with mode='column_visual' auto-injected;
+    the file on disk stays v1 until the user re-saves."""
+
+    def test_loads_v1_template(self, isolated_templates, tmp_path):
+        import json
+        v1_payload = {
+            "schema_version": 1,
+            "slug": "legacy",
+            "name": "Legacy Bank",
+            "pages": {"range": "all"},
+            "table": {"column_boundaries": [100, 200]},
+            "columns": [
+                {"source": 0, "target": "date"},
+                {"source": 1, "target": "description"},
+                {"source": 2, "target": "amount"},
+            ],
+            "parse": {},
+        }
+        (tmp_path / "legacy.json").write_text(
+            json.dumps(v1_payload), encoding="utf-8",
+        )
+        loaded = load_template("legacy")
+        # In-memory migration adds mode + bumps schema_version
+        assert loaded["mode"] == "column_visual"
+        assert loaded["schema_version"] == SCHEMA_VERSION
+        # Original keys still intact
+        assert loaded["columns"][0]["target"] == "date"
+
+
 class TestPersistence:
    def test_round_trip(self, isolated_templates):
        t = new_template("Round Trip Bank")