From 48cd9e82492b4dcbd78d3d61b98c1dbb6c3f0918 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 19 May 2026 23:46:10 +0000 Subject: [PATCH] feat(pdf): schema v2 + mode field + v1 in-memory migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps ``SCHEMA_VERSION`` from 1 to 2 to add a top-level ``mode`` field distinguishing ``row_heuristic`` (new default) from ``column_visual`` (legacy). The schema bump is real — old code that defaults missing keys would silently mis-extract — so we do it the careful way: - ``new_template`` now returns mode=``row_heuristic`` with the full row-heuristic config tree pre-populated. The legacy column-visual fields are still seeded with empty defaults so switching modes in the GUI doesn't require runtime key insertion. - ``validate_template`` is mode-aware: row_heuristic templates must have a valid ``amounts.shape`` + sane ``row_detection.min/max_amounts_per_row``; column_visual templates keep the existing column/target requirements. Note that the old check comparing ``len(table.column_boundaries) + 1`` against the number of distinct column sources is dropped; only the ``column_boundaries``-is-a-list check remains. - ``load_template`` accepts both v1 and v2 files (``_LOAD_SUPPORTED_VERSIONS = {1, 2}``). v1 files get ``mode="column_visual"`` injected and ``schema_version`` bumped IN MEMORY ONLY — disk file stays v1 until the user explicitly re-saves. A buggy migration can't silently corrupt their template library. - ``save_template`` continues to write the current schema; saving a v1 template through the GUI naturally upgrades it. Mode + shape constants exported (``VALID_MODES``, ``VALID_AMOUNT_SHAPES``) so the GUI dropdowns can derive their options from the source of truth. Tests split into ``TestValidateTemplateRowHeuristic`` (6) + ``TestValidateTemplateColumnVisual`` (4) + ``TestV1Migration`` (1); the old ``test_bad_schema_version`` case is removed. All 29 template tests pass; the original column-mode tests that previously implicitly relied on schema_version=1 keep working because new_template's seeded column fields are still present in row_heuristic templates (just not validated as required). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/pdf_templates.py | 199 +++++++++++++++++++++++++++--------- tests/test_pdf_templates.py | 105 ++++++++++++++++--- 2 files changed, 241 insertions(+), 63 deletions(-) diff --git a/src/pdf_templates.py b/src/pdf_templates.py index 07ea8ac..0339c6d 100644 --- a/src/pdf_templates.py +++ b/src/pdf_templates.py @@ -70,7 +70,29 @@ from pathlib import Path from typing import Any -SCHEMA_VERSION = 1 +SCHEMA_VERSION = 2 + +# Backward-compatible versions ``load_template`` will accept. +# v1 templates predate the row-heuristic shift and are loaded as +# ``mode="column_visual"``; they're not auto-migrated on disk, so +# the user keeps their canonical original until they re-save. +_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2}) + +# Extraction modes. ``row_heuristic`` is the default for new +# templates — finds transactions by date+amount pattern matching +# with no coordinate dependency. ``column_visual`` is the legacy +# x-position-boundary approach, kept for old templates and for +# the "Advanced" build-mode fallback when the heuristic misfires. +VALID_MODES = frozenset({"row_heuristic", "column_visual"}) + +# Amount shapes for row_heuristic mode. The GUI offers these as a +# dropdown; the parser uses them to assign amount tokens to fields. +VALID_AMOUNT_SHAPES = frozenset({ + "single", + "txn_balance", + "debit_credit", + "debit_credit_balance", +}) VALID_TARGETS = frozenset({ "date", @@ -128,8 +150,10 @@ def slugify(name: str) -> str: def new_template(name: str) -> dict[str, Any]: """Build a blank template with sensible defaults. - Caller can edit any field; the GUI's build flow fills in the - table and columns sections as the user works through it. + Defaults to ``mode="row_heuristic"`` — the simpler, more + robust approach. The GUI's build flow lets the user switch to + ``mode="column_visual"`` if the heuristic doesn't fit their + statement layout. 
""" now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") slug = slugify(name) @@ -138,12 +162,35 @@ def new_template(name: str) -> dict[str, Any]: "slug": slug, "name": name or slug, "notes": "", + "mode": "row_heuristic", "created_at": now, "updated_at": now, "pages": { "range": "all", "skip_matching": "", }, + # Row-heuristic config (primary path). + "row_detection": { + "min_amounts_per_row": 1, + "max_amounts_per_row": 3, + "y_tolerance": 3.0, + "skip_rows_matching": [], + "merge_multiline_description": True, + }, + "amounts": { + "shape": "single", + "negative_in_parens": True, + "decimal_separator": ".", + "thousands_separator": ",", + "currency_strip": "$", + }, + "date": { + "format": "%m/%d/%Y", + "formats_fallback": [], + }, + # Column-visual config (legacy / Advanced fallback). Empty + # placeholders so the GUI can populate when the user + # switches modes without inserting keys at runtime. "table": { "header_text": "", "end_markers": [], @@ -178,8 +225,9 @@ def new_template(name: str) -> dict[str, Any]: def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]: """Check the template before saving. Returns ``(ok, errors)``. - The GUI shows the errors next to the Save button; nothing - silent here.""" + Mode-aware: row-heuristic templates and column-visual + templates have different required fields. The GUI shows the + errors next to the Save button; nothing silent here.""" errors: list[str] = [] if not isinstance(template, dict): return False, ["Template must be a JSON object."] @@ -201,50 +249,82 @@ def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]: "1–64 chars, starting with a letter or digit." 
) - columns = template.get("columns", []) - if not isinstance(columns, list) or len(columns) < 2: - errors.append("At least two output columns are required.") - else: - seen_targets: list[str] = [] - for i, col in enumerate(columns): - if not isinstance(col, dict): - errors.append(f"columns[{i}] must be an object.") - continue - src = col.get("source") - tgt = col.get("target") - if not isinstance(src, int) or src < 0: - errors.append( - f"columns[{i}].source must be a non-negative integer." - ) - if not isinstance(tgt, str) or not tgt: - errors.append(f"columns[{i}].target must be a non-empty string.") - else: - seen_targets.append(tgt) - if "date" not in seen_targets: - errors.append("At least one column must map to 'date'.") - if ( - "amount" not in seen_targets - and not ( - "amount_debit" in seen_targets - and "amount_credit" in seen_targets - ) - ): + mode = template.get("mode", "row_heuristic") + if mode not in VALID_MODES: + errors.append( + f"mode {mode!r} must be one of: {sorted(VALID_MODES)}." + ) + + if mode == "row_heuristic": + amounts = template.get("amounts", {}) or {} + shape = amounts.get("shape", "single") + if shape not in VALID_AMOUNT_SHAPES: errors.append( - "Either an 'amount' column or both 'amount_debit' + " - "'amount_credit' columns are required." + f"amounts.shape {shape!r} must be one of: " + f"{sorted(VALID_AMOUNT_SHAPES)}." + ) + rd = template.get("row_detection", {}) or {} + min_a = rd.get("min_amounts_per_row", 1) + max_a = rd.get("max_amounts_per_row", 3) + if not (isinstance(min_a, int) and isinstance(max_a, int)): + errors.append( + "row_detection.min_amounts_per_row and " + "max_amounts_per_row must be integers." + ) + elif min_a < 1 or max_a < min_a: + errors.append( + "row_detection.min_amounts_per_row must be ≥1 and ≤ " + "max_amounts_per_row." 
) - table = template.get("table", {}) or {} - boundaries = table.get("column_boundaries", []) - if not isinstance(boundaries, list): - errors.append("table.column_boundaries must be a list.") - elif columns and len(boundaries) + 1 < len(set( - c.get("source") for c in columns if isinstance(c, dict) - )): - errors.append( - "table.column_boundaries doesn't match the number of source columns " - "implied by the column mapping." - ) + elif mode == "column_visual": + columns = template.get("columns", []) + if not isinstance(columns, list) or len(columns) < 2: + errors.append( + "column_visual mode: at least two output columns " + "are required." + ) + else: + seen_targets: list[str] = [] + for i, col in enumerate(columns): + if not isinstance(col, dict): + errors.append(f"columns[{i}] must be an object.") + continue + src = col.get("source") + tgt = col.get("target") + if not isinstance(src, int) or src < 0: + errors.append( + f"columns[{i}].source must be a non-negative " + f"integer." + ) + if not isinstance(tgt, str) or not tgt: + errors.append( + f"columns[{i}].target must be a non-empty string." + ) + else: + seen_targets.append(tgt) + if "date" not in seen_targets: + errors.append( + "column_visual mode: at least one column must map " + "to 'date'." + ) + if ( + "amount" not in seen_targets + and not ( + "amount_debit" in seen_targets + and "amount_credit" in seen_targets + ) + ): + errors.append( + "column_visual mode: either an 'amount' column or " + "both 'amount_debit' + 'amount_credit' columns " + "are required." + ) + + table = template.get("table", {}) or {} + boundaries = table.get("column_boundaries", []) + if not isinstance(boundaries, list): + errors.append("table.column_boundaries must be a list.") return (not errors), errors @@ -302,7 +382,14 @@ def save_template(template: dict[str, Any]) -> str: def load_template(slug: str) -> dict[str, Any]: """Read the template at *slug*. 
Raises ``FileNotFoundError`` if missing, ``ValueError`` if the JSON is corrupt or the schema - version is unknown.""" + version is unsupported. + + v1 templates (pre row-heuristic) are accepted and migrated + in-memory to v2 shape with ``mode="column_visual"``. The file + on disk is NOT rewritten — the user's canonical original stays + intact until they explicitly re-save, so a buggy migration + can't silently corrupt their template library. + """ p = template_path(slug) try: raw = p.read_text(encoding="utf-8") @@ -313,11 +400,25 @@ def load_template(slug: str) -> dict[str, Any]: except json.JSONDecodeError as e: raise ValueError(f"Corrupt template {slug!r}: {e}") from e sv = data.get("schema_version") - if sv != SCHEMA_VERSION: + if sv not in _LOAD_SUPPORTED_VERSIONS: raise ValueError( f"Template {slug!r} has unsupported schema_version {sv!r}; " - f"expected {SCHEMA_VERSION}." + f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}." ) + return _migrate_to_current(data) + + +def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]: + """In-memory migration of older schemas to the current shape. + + v1 → v2 adds a ``mode`` key defaulting to ``"column_visual"`` + (since v1 was the column-x-position approach) and stamps + ``schema_version`` to the current value. 
All v1 keys keep + their original meaning.""" + if data.get("schema_version") == 1: + data = dict(data) + data["schema_version"] = SCHEMA_VERSION + data.setdefault("mode", "column_visual") return data diff --git a/tests/test_pdf_templates.py b/tests/test_pdf_templates.py index a06c002..551dab6 100644 --- a/tests/test_pdf_templates.py +++ b/tests/test_pdf_templates.py @@ -57,12 +57,72 @@ class TestNewTemplate: assert t["updated_at"] -class TestValidateTemplate: +class TestValidateTemplateRowHeuristic: + """Row-heuristic mode is the v2 default.""" + def _valid(self) -> dict: return { "schema_version": SCHEMA_VERSION, "slug": "x", "name": "X", + "mode": "row_heuristic", + "row_detection": { + "min_amounts_per_row": 1, + "max_amounts_per_row": 3, + }, + "amounts": {"shape": "single"}, + "date": {"format": "%m/%d/%Y"}, + } + + def test_valid_passes(self): + ok, errs = validate_template(self._valid()) + assert ok, errs + + def test_missing_name_fails(self): + t = self._valid() + t["name"] = "" + ok, errs = validate_template(t) + assert not ok + + def test_bad_mode_fails(self): + t = self._valid() + t["mode"] = "magic" + ok, errs = validate_template(t) + assert not ok + assert any("mode" in e for e in errs) + + def test_bad_shape_fails(self): + t = self._valid() + t["amounts"]["shape"] = "telepathic" + ok, errs = validate_template(t) + assert not ok + assert any("shape" in e for e in errs) + + def test_inverted_amount_range_fails(self): + t = self._valid() + t["row_detection"]["min_amounts_per_row"] = 5 + t["row_detection"]["max_amounts_per_row"] = 2 + ok, errs = validate_template(t) + assert not ok + + def test_does_not_require_columns_in_row_mode(self): + """Key point: row mode doesn't need ``columns`` populated. + That's what makes the GUI's primary path simpler than v1.""" + t = self._valid() + # No columns key at all. 
+ ok, errs = validate_template(t) + assert ok, errs + + +class TestValidateTemplateColumnVisual: + """Legacy column-visual mode keeps its own contract.""" + + def _valid(self) -> dict: + return { + "schema_version": SCHEMA_VERSION, + "slug": "x", + "name": "X", + "mode": "column_visual", "pages": {"range": "all"}, "table": {"column_boundaries": [100, 200]}, "columns": [ @@ -77,19 +137,6 @@ class TestValidateTemplate: ok, errs = validate_template(self._valid()) assert ok, errs - def test_missing_name_fails(self): - t = self._valid() - t["name"] = "" - ok, errs = validate_template(t) - assert not ok - assert any("name" in e for e in errs) - - def test_bad_schema_version(self): - t = self._valid() - t["schema_version"] = 999 - ok, errs = validate_template(t) - assert not ok - def test_requires_date_column(self): t = self._valid() t["columns"] = [ @@ -123,6 +170,36 @@ class TestValidateTemplate: assert ok, errs +class TestV1Migration: + """v1 templates load with mode='column_visual' auto-injected; + the file on disk stays v1 until the user re-saves.""" + + def test_loads_v1_template(self, isolated_templates, tmp_path): + import json + v1_payload = { + "schema_version": 1, + "slug": "legacy", + "name": "Legacy Bank", + "pages": {"range": "all"}, + "table": {"column_boundaries": [100, 200]}, + "columns": [ + {"source": 0, "target": "date"}, + {"source": 1, "target": "description"}, + {"source": 2, "target": "amount"}, + ], + "parse": {}, + } + (tmp_path / "legacy.json").write_text( + json.dumps(v1_payload), encoding="utf-8", + ) + loaded = load_template("legacy") + # In-memory migration adds mode + bumps schema_version + assert loaded["mode"] == "column_visual" + assert loaded["schema_version"] == SCHEMA_VERSION + # Original keys still intact + assert loaded["columns"][0]["target"] == "date" + + class TestPersistence: def test_round_trip(self, isolated_templates): t = new_template("Round Trip Bank")