# datatools-dev/tests/test_pdf_templates.py

"""Tests for the PDF template storage layer."""

from __future__ import annotations

import json

import pytest

from src.pdf_templates import (
    SCHEMA_VERSION,
    delete_template,
    list_templates,
    load_template,
    new_template,
    save_template,
    slugify,
    template_from_json,
    template_path,
    templates_dir,
    template_to_json,
    validate_template,
)


@pytest.fixture
def isolated_templates(monkeypatch, tmp_path):
    """Point ``DATATOOLS_PDF_TEMPLATES_DIR`` at ``tmp_path`` for one test.

    Yields the temporary directory so tests can drop files into it
    directly. The override goes through ``monkeypatch``, so pytest
    undoes it at teardown.
    """
    sandbox = tmp_path
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(sandbox))
    yield sandbox


class TestSlugify:
    """Checks for how ``slugify`` turns display names into slugs."""

    def test_basic(self):
        """A plain multi-word name becomes lower-case and hyphenated."""
        slug = slugify("Chase Personal Checking")
        assert slug == "chase-personal-checking"

    def test_strips_punctuation(self):
        """Colons and parentheses do not survive into the slug."""
        slug = slugify("BofA: Business (USD)")
        assert slug == "bofa-business-usd"

    def test_empty_falls_back(self):
        """Empty and whitespace-only names both map to ``"untitled"``."""
        for blank in ("", "   "):
            assert slugify(blank) == "untitled"


class TestNewTemplate:
    """Checks on the dict returned by ``new_template``."""

    def test_has_schema_version(self):
        """A fresh template is stamped with the current ``SCHEMA_VERSION``."""
        template = new_template("Sample")
        assert template["schema_version"] == SCHEMA_VERSION

    def test_slug_derived_from_name(self):
        """The slug is derived from the name; the name itself is kept as given."""
        template = new_template("Sample Bank")
        observed = (template["slug"], template["name"])
        assert observed == ("sample-bank", "Sample Bank")

    def test_timestamps_present(self):
        """Both timestamp fields are populated with a truthy value."""
        template = new_template("X")
        for field in ("created_at", "updated_at"):
            assert template[field]


class TestValidateTemplateRowHeuristic:
    """Row-heuristic mode is the v2 default."""

    def _valid(self) -> dict:
        """Return a row-heuristic template that is expected to validate.

        A new dict (with new nested dicts) is built on every call, so
        each test can mutate its copy without affecting the others.
        """
        return {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "row_heuristic",
            "row_detection": {
                "min_amounts_per_row": 1,
                "max_amounts_per_row": 3,
            },
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }

    def test_valid_passes(self):
        """The baseline template validates; errors are shown on failure."""
        ok, errs = validate_template(self._valid())
        assert ok, errs

    def test_missing_name_fails(self):
        """An empty ``name`` is rejected."""
        t = self._valid()
        t["name"] = ""
        ok, _ = validate_template(t)
        assert not ok

    def test_bad_mode_fails(self):
        """An unknown ``mode`` is rejected with an error mentioning "mode"."""
        t = self._valid()
        t["mode"] = "magic"
        ok, errs = validate_template(t)
        assert not ok
        assert any("mode" in e for e in errs)

    def test_bad_shape_fails(self):
        """An unknown amounts ``shape`` is rejected with an error mentioning "shape"."""
        t = self._valid()
        t["amounts"]["shape"] = "telepathic"
        ok, errs = validate_template(t)
        assert not ok
        assert any("shape" in e for e in errs)

    def test_inverted_amount_range_fails(self):
        """``min_amounts_per_row`` greater than ``max_amounts_per_row`` is rejected."""
        t = self._valid()
        t["row_detection"]["min_amounts_per_row"] = 5
        t["row_detection"]["max_amounts_per_row"] = 2
        ok, _ = validate_template(t)
        assert not ok

    def test_does_not_require_columns_in_row_mode(self):
        """Key point: row mode doesn't need ``columns`` populated.
        That's what makes the GUI's primary path simpler than v1."""
        t = self._valid()
        # Enforce the premise rather than rely on the baseline: if
        # ``_valid`` ever grows a ``columns`` key, this test must still
        # exercise a template without one.
        t.pop("columns", None)
        ok, errs = validate_template(t)
        assert ok, errs


class TestValidateTemplateColumnVisual:
    """Legacy column-visual mode keeps its own contract."""

    @staticmethod
    def _columns(*targets: str) -> list:
        """Build column mappings whose ``source`` is each target's position."""
        return [
            {"source": index, "target": target}
            for index, target in enumerate(targets)
        ]

    def _valid(self) -> dict:
        """Return a fresh column-visual template that is expected to validate."""
        template = {
            "schema_version": SCHEMA_VERSION,
            "slug": "x",
            "name": "X",
            "mode": "column_visual",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": self._columns("date", "description", "amount"),
            "parse": {},
        }
        return template

    def test_valid_passes(self):
        """The baseline template validates; errors are shown on failure."""
        valid, errors = validate_template(self._valid())
        assert valid, errors

    def test_requires_date_column(self):
        """Without a ``date`` column, validation fails and an error mentions "date"."""
        template = self._valid()
        template["columns"] = self._columns("description", "amount")
        valid, errors = validate_template(template)
        assert not valid
        assert any("date" in message for message in errors)

    def test_requires_amount_or_debit_credit(self):
        """Without any amount column, validation fails and an error mentions "amount"."""
        template = self._valid()
        template["columns"] = self._columns("date", "description")
        valid, errors = validate_template(template)
        assert not valid
        assert any("amount" in message for message in errors)

    def test_debit_credit_pair_is_valid(self):
        """A debit/credit column pair is accepted in place of a single amount."""
        template = self._valid()
        template["columns"] = self._columns(
            "date", "description", "amount_debit", "amount_credit",
        )
        # Four columns, so three boundaries.
        template["table"]["column_boundaries"] = [100, 200, 300]
        valid, errors = validate_template(template)
        assert valid, errors


class TestV1Migration:
    """v1 templates load with mode='column_visual' auto-injected;
    the file on disk stays v1 until the user re-saves."""

    @staticmethod
    def _v1_payload() -> dict:
        """Return a fresh v1-shaped template dict (no ``mode`` key)."""
        return {
            "schema_version": 1,
            "slug": "legacy",
            "name": "Legacy Bank",
            "pages": {"range": "all"},
            "table": {"column_boundaries": [100, 200]},
            "columns": [
                {"source": 0, "target": "date"},
                {"source": 1, "target": "description"},
                {"source": 2, "target": "amount"},
            ],
            "parse": {},
        }

    def test_loads_v1_template(self, isolated_templates, tmp_path):
        """Loading a v1 file yields a migrated in-memory template."""
        v1_payload = self._v1_payload()
        # Uses the module-level ``json`` import; no local import needed.
        (tmp_path / "legacy.json").write_text(
            json.dumps(v1_payload), encoding="utf-8",
        )
        loaded = load_template("legacy")
        # In-memory migration adds mode + bumps schema_version
        assert loaded["mode"] == "column_visual"
        assert loaded["schema_version"] == SCHEMA_VERSION
        # Original keys still intact
        assert loaded["columns"][0]["target"] == "date"

    def test_load_leaves_v1_file_on_disk(self, isolated_templates):
        """Loading alone must not rewrite the stored file.

        Pins the second half of the class contract: the file on disk
        stays v1 until the user re-saves.
        """
        legacy_file = isolated_templates / "legacy.json"
        legacy_file.write_text(
            json.dumps(self._v1_payload()), encoding="utf-8",
        )
        load_template("legacy")
        on_disk = json.loads(legacy_file.read_text(encoding="utf-8"))
        assert on_disk["schema_version"] == 1
        assert "mode" not in on_disk


class TestPersistence:
    """Save / load / delete / list tests run against an isolated directory."""

    @staticmethod
    def _two_column_template(name: str) -> dict:
        """Return ``new_template(name)`` with date/amount columns and one boundary.

        Shared setup for the tests that only need some saveable template.
        """
        t = new_template(name)
        t["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "amount"},
        ]
        t["table"]["column_boundaries"] = [100]
        return t

    def test_round_trip(self, isolated_templates):
        """A saved template can be loaded back by the slug ``save_template`` returns."""
        t = new_template("Round Trip Bank")
        t["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ]
        t["table"]["column_boundaries"] = [100, 200]
        slug = save_template(t)
        assert slug == "round-trip-bank"

        path = template_path(slug)
        assert path.exists()
        loaded = load_template(slug)
        assert loaded["name"] == "Round Trip Bank"
        assert loaded["columns"][0]["target"] == "date"

    def test_save_rejects_invalid(self, isolated_templates):
        """Saving a template with an empty name raises ``ValueError``."""
        with pytest.raises(ValueError):
            save_template({"schema_version": 1, "name": ""})

    def test_load_missing_raises(self, isolated_templates):
        """Loading an unknown slug raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError):
            load_template("does-not-exist")

    def test_load_corrupt_raises(self, isolated_templates, tmp_path):
        """A file that is not valid JSON surfaces as ``ValueError``."""
        bad = tmp_path / "bad.json"
        bad.write_text("not json", encoding="utf-8")
        with pytest.raises(ValueError):
            load_template("bad")

    def test_delete(self, isolated_templates):
        """``delete_template`` returns True once, then False when nothing is left."""
        save_template(self._two_column_template("To Delete"))
        assert delete_template("to-delete") is True
        assert delete_template("to-delete") is False

    def test_list_returns_summaries(self, isolated_templates):
        """Every saved template shows up in ``list_templates`` by slug."""
        for name in ["Alpha", "Bravo"]:
            save_template(self._two_column_template(name))
        rows = list_templates()
        assert {r["slug"] for r in rows} == {"alpha", "bravo"}

    def test_list_skips_corrupt(self, isolated_templates, tmp_path):
        """An unparseable file is skipped rather than failing the listing."""
        (tmp_path / "broken.json").write_text("nope", encoding="utf-8")
        # Even with a broken file present, list still returns []
        rows = list_templates()
        assert rows == []

    def test_atomic_save_no_partial_file_on_failure(
        self, isolated_templates, monkeypatch
    ):
        """If the write step fails mid-way, no half-written JSON survives
        at the target path. Tests the temp-file-rename safety pattern."""
        t = self._two_column_template("Atomic")

        # Make json.dumps blow up to simulate a failure during write.
        # The intent is a crash "after validation, during write"; this
        # presumes save_template validates before it serialises.
        import src.pdf_templates as mod

        def boom(*a, **kw):
            raise IOError("disk full")

        # ``mod.json`` is presumably the shared stdlib module, so this
        # patch is process-wide. Scoping it with ``monkeypatch.context()``
        # restores the real ``dumps`` on exit from the block, whether or
        # not ``pytest.raises`` is satisfied.
        with monkeypatch.context() as patcher:
            patcher.setattr(mod.json, "dumps", boom)
            with pytest.raises(IOError):
                save_template(t)

        assert not template_path("atomic").exists()


class TestImportExport:
    """Round-trips through ``template_to_json`` / ``template_from_json``."""

    def test_round_trip_via_json(self):
        """Exporting then importing a template preserves its name."""
        original = new_template("Exported")
        original["columns"] = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "amount"},
        ]
        restored = template_from_json(template_to_json(original))
        assert restored["name"] == "Exported"

    def test_import_rejects_bad_schema(self):
        """A payload with schema version 999 is refused with ``ValueError``."""
        payload = json.dumps({"schema_version": 999, "name": "X"})
        with pytest.raises(ValueError):
            template_from_json(payload)

    def test_import_rejects_non_object(self):
        """A JSON array at the top level is refused with ``ValueError``."""
        payload = '["not", "an", "object"]'
        with pytest.raises(ValueError):
            template_from_json(payload)


def test_templates_dir_env_override(monkeypatch, tmp_path):
    """``templates_dir()`` returns the path set in ``DATATOOLS_PDF_TEMPLATES_DIR``."""
    override = str(tmp_path)
    monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", override)
    resolved = templates_dir()
    assert resolved == tmp_path