# datatools-dev/tests/test_pdf_row_heuristic.py

"""Tests for the row-heuristic extraction pipeline.

This is now the primary extraction mode — uses date + amount
pattern matching to find transaction lines, with no dependency
on x-position column boundaries. Robust to layout drift across
statements from the same bank.

The legacy column-visual pipeline keeps its own tests in
``test_pdf_extract.py``.
"""

from __future__ import annotations

import pandas as pd

from src.pdf_extract import (
    Page,
    WordBox,
    apply_template,
    apply_template_row_heuristic,
    find_transaction_rows,
    _find_amount_tokens,
    _find_dates_in_words,
    _infer_amount_column_centers,
)


def _w(text: str, x0: float, top: float) -> WordBox:
    """Build a synthetic ``WordBox`` for *text* positioned at ``x0``/``top``.

    The box is sized from the text alone: 8 units wide per character and
    a fixed 10 units tall, so callers only have to choose a position.
    """
    char_width = 8
    line_height = 10
    right_edge = x0 + char_width * len(text)
    lower_edge = top + line_height
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=lower_edge, text=text)


class TestFindDatesInRow:
    """Checks on ``_find_dates_in_words`` for a single row of word boxes."""

    def test_us_slash(self):
        """A slash-separated date with a four-digit year yields one hit."""
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        expected = [(0, "01/15/2026")]
        assert _find_dates_in_words(words) == expected

    def test_two_digit_year(self):
        """A slash-separated date with a two-digit year is the first hit."""
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        hits = _find_dates_in_words(words)
        assert hits
        assert hits[0][1] == "01/15/26"

    def test_iso(self):
        """An ISO ``YYYY-MM-DD`` token yields one hit."""
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        expected = [(0, "2026-01-15")]
        assert _find_dates_in_words(words) == expected

    def test_month_name(self):
        """A month-name date split over several tokens is still found."""
        # "Jan 15, 2026" arrives as three separate word tokens that have
        # to be stitched back together.
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits, "Multi-word month-day-year should match"
        matched_text = hits[0][1]
        assert "Jan 15" in matched_text

    def test_no_date(self):
        """Plain words without any date produce an empty result."""
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        hits = _find_dates_in_words(words)
        assert hits == []


class TestFindAmountTokens:
    """Checks on ``_find_amount_tokens`` for a single row of word boxes."""

    def test_currency_format(self):
        """A ``$``-prefixed decimal is the one and only amount in the row."""
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(words)
        assert len(tokens) == 1
        only_token = tokens[0]
        assert only_token[2] == "$4.50"

    def test_parens_negative(self):
        """A parenthesised decimal is reported with its text intact."""
        words = [_w("(123.45)", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens
        assert tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        """Purely alphabetic words yield no amounts."""
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == []

    def test_rejects_bare_year(self):
        """A lone four-digit year must not be reported as an amount."""
        # "2026" is all digits but carries no currency symbol or decimal
        # point. Accepting it could be argued either way; this test pins
        # the conservative choice, where it is NOT treated as an amount.
        words = [_w("2026", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == [], "Bare 4-digit year should not register as amount"


class TestInferAmountColumnCenters:
    """Checks on ``_infer_amount_column_centers`` over synthetic rows."""

    def test_two_clear_columns(self):
        """Two well-separated amount columns give two ordered centers."""
        # Five rows (top = 0, 20, ..., 80), each carrying one amount
        # starting at x=300 and another starting at x=450.
        rows = [
            [
                _w("01/15/2026", 20, top),
                _w("Item", 100, top),
                _w("$10.00", 300, top),
                _w("$1,000.00", 450, top),
            ]
            for top in range(0, 100, 20)
        ]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=2,
            max_amounts=2,
        )
        assert len(centers) == 2
        left_center, right_center = centers[0], centers[1]
        # Expected left center ≈ 300 + 8*len("$10.00")/2 = 324; expected
        # right center ≈ 450 + 8*len("$1,000.00")/2 = 486.
        assert 310 < left_center < 340
        assert 460 < right_center < 490

    def test_no_transactions_returns_empty(self):
        """Rows without any amounts give an empty list of centers."""
        rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
        centers = _infer_amount_column_centers(
            rows,
            expected=2,
            min_amounts=1,
            max_amounts=3,
        )
        assert centers == []


class TestRowHeuristicEndToEnd:
    """Drive the whole row-heuristic pipeline with hand-built ``Page``
    objects, so no real PDF is needed."""

    def _page_single_amount(self) -> Page:
        """Return a 600x120 page with two dated rows, one amount each.

        Besides the two transactions the page carries a banner line, a
        continuation line under the second transaction and a footer.
        """
        banner = [_w("ACME BANK STATEMENT", 20, 0)]
        coffee_row = [
            _w("01/15/2026", 20, 30),
            _w("Coffee", 100, 30),
            _w("Shop", 150, 30),
            _w("$4.50", 400, 30),
        ]
        # The continuation words (top=70) are listed in the middle of the
        # top=50 row; that list order is part of the fixture and is kept.
        refund_row = [
            _w("01/16/2026", 20, 50),
            _w("Refund", 100, 50),
            _w("from", 100, 70),
            _w("vendor", 140, 70),
            _w("Vendor", 140, 50),
            _w("$12.00", 400, 50),
        ]
        footer = [_w("Page", 20, 90), _w("1", 60, 90)]  # not a txn
        all_words = banner + coffee_row + refund_row + footer
        return Page(page_no=1, width=600, height=120, text="", words=all_words)

    def test_extracts_two_rows_single_amount(self):
        """Both dated rows come out and the continuation line is merged."""
        row_detection = {
            "min_amounts_per_row": 1,
            "max_amounts_per_row": 1,
            "merge_multiline_description": True,
        }
        template = {
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": {"shape": "single", "negative_in_parens": True},
            "date": {"format": "%m/%d/%Y"},
        }
        pages = [self._page_single_amount()]
        frame = apply_template_row_heuristic(pages, template)
        assert len(frame) == 2
        assert list(frame["date"]) == ["2026-01-15", "2026-01-16"]
        # The multi-line description must have been folded into row 1.
        refund = frame.iloc[1]
        assert "from vendor" in refund["description"]

    def test_dispatches_through_apply_template(self):
        """``apply_template`` accepts a ``row_heuristic`` template and
        returns a two-row DataFrame for the single-amount page."""
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        pages = [self._page_single_amount()]
        frame = apply_template(pages, template)
        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 2

    def test_txn_balance_shape(self):
        """With two amounts per row, the first is the transaction amount
        and the second the balance; parentheses mean negative."""
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 300, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 2},
            "amounts": {"shape": "txn_balance", "negative_in_parens": True},
            "date": {"format": "%m/%d/%Y"},
        }
        frame = apply_template([page], template)
        assert len(frame) == 2
        coffee, refund = frame.iloc[0], frame.iloc[1]
        assert coffee["amount"] == -4.50
        assert coffee["balance"] == 1000.00
        assert refund["amount"] == 12.00
        assert refund["balance"] == 1012.00

    def test_debit_credit_balance_shape(self):
        """The x-position of the non-balance amount decides debit/credit."""
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("4.50", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 380, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 3},
            "amounts": {"shape": "debit_credit_balance"},
            "date": {"format": "%m/%d/%Y"},
        }
        frame = apply_template([page], template)
        assert len(frame) == 2
        # Row 0 has its amount at x=300 (debit column) and the balance
        # at x=450, so it is expected to come out as a negative debit.
        debit_row = frame.iloc[0]
        assert debit_row["amount"] == -4.50
        assert debit_row["type"] == "debit"
        # Row 1 has its amount at x=380 (credit column) and the balance
        # at x=450, so it is expected to come out as a positive credit.
        credit_row = frame.iloc[1]
        assert credit_row["amount"] == 12.00
        assert credit_row["type"] == "credit"

    def test_skip_rows_matching(self):
        """A ``skip_rows_matching`` entry removes the matching row."""
        row_detection = {
            "min_amounts_per_row": 1,
            "max_amounts_per_row": 1,
            "skip_rows_matching": ["Refund"],
        }
        template = {
            "mode": "row_heuristic",
            "row_detection": row_detection,
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        page = self._page_single_amount()
        frame = apply_template_row_heuristic([page], template)
        assert len(frame) == 1
        survivor = frame.iloc[0]
        assert survivor["date"] == "2026-01-15"

    def test_layout_drift_doesnt_matter(self):
        """One template must work on pages that differ in size and in
        the x-positions of their columns; that is the point of the
        row-heuristic mode."""
        # Page A: 600 wide, amount starting at x=400.
        words_a = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page_a = Page(page_no=1, width=600, height=80, text="", words=words_a)
        # Page B: 720 wide, every word shifted, amount starting at x=520.
        words_b = [
            _w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
            _w("$4.50", 520, 0),
        ]
        page_b = Page(page_no=1, width=720, height=80, text="", words=words_b)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        frame_a = apply_template([page_a], template)
        frame_b = apply_template([page_b], template)
        # Both layouts must yield the row: no coordinate dependency.
        assert len(frame_a) == 1
        assert len(frame_b) == 1
        amount_a = frame_a.iloc[0]["amount"]
        amount_b = frame_b.iloc[0]["amount"]
        assert amount_a == amount_b
        assert amount_b == 4.50


class TestFindTransactionRows:
    """Covers the stage that runs before a DataFrame is built: it
    returns dict records, which the build UI renders as a preview
    before the user commits."""

    def test_returns_records(self):
        """A one-row page yields one record with the parsed fields."""
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page = Page(page_no=1, width=600, height=80, text="", words=words)
        template = {
            "mode": "row_heuristic",
            "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1},
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        records = find_transaction_rows([page], template)
        assert len(records) == 1
        record = records[0]
        expected_fields = {
            "date": "2026-01-15",
            "description": "Coffee",
            "amount": 4.50,
            "_page": 1,
        }
        for field, expected in expected_fields.items():
            assert record[field] == expected
        # The raw line is kept so the GUI can show "what we saw".
        assert "_raw_line" in record