# datatools-dev/tests/test_pdf_extract.py

"""Tests for the pure PDF-extraction pipeline.

Real PDF parsing (``extract_pages``) is a thin wrapper around
``pdfplumber`` and is exercised by hand on real bank statements.
These tests pin the meaty bits — value parsing, row clustering,
column assignment, template-driven extraction — against synthetic
``WordBox`` data so they run fast and have no PDF dependency.
"""

from __future__ import annotations

import pandas as pd

from src.pdf_extract import (
    Page,
    WordBox,
    apply_template,
    assign_columns,
    cluster_rows,
    parse_amount,
    parse_date,
    _pages_in_range,
    _within_table_window,
)


def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for the tests in this module.

    When ``x1`` is omitted the right edge defaults to
    ``x0 + 10 * len(text)``; the bottom edge is always ``top + 10``.
    Exact heights and right edges are irrelevant to the tests here.
    """
    if x1 is None:
        # Derive a right edge from the text length.
        x1 = x0 + 10 * len(text)
    return WordBox(x0=x0, top=top, x1=x1, bottom=top + 10, text=text)


class TestParseAmount:
    """Checks for ``parse_amount`` on assorted amount strings."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        # Parenthesised amounts are expected to come back negative.
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty, whitespace-only and ``None`` inputs must all yield ``None``.
        for blank in ("", "   ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        # Comma as decimal mark, dot as thousands separator, euro prefix.
        european_opts = {
            "decimal_separator": ",",
            "thousands_separator": ".",
            "currency_strip": "€",
            "negative_in_parens": True,
        }
        parsed = parse_amount("€1.234,56", european_opts)
        assert parsed == 1234.56


class TestParseDate:
    """Checks for ``parse_date``; successful parses are ISO ``YYYY-MM-DD``."""

    def test_us_slash(self):
        formats = ["%m/%d/%Y"]
        parsed = parse_date("01/15/2026", formats)
        assert parsed == "2026-01-15"

    def test_iso(self):
        formats = ["%Y-%m-%d"]
        parsed = parse_date("2026-01-15", formats)
        assert parsed == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied, so parsing must go via the fallback.
        parsed = parse_date("01/15/26")
        assert parsed == "2026-01-15"

    def test_invalid(self):
        parsed = parse_date("not-a-date")
        assert parsed is None


class TestClusterRows:
    """Checks for ``cluster_rows`` grouping words into visual rows."""

    def test_groups_close_y(self):
        # Tops differ by at most 2, within the 3.0 tolerance passed below.
        specs = (("A", 0, 100), ("B", 20, 101), ("C", 40, 102))
        words = [_w(label, x0=left, top=y) for label, left, y in specs]
        rows = cluster_rows(words, y_tolerance=3.0)
        assert len(rows) == 1
        only_row = rows[0]
        assert [w.text for w in only_row] == ["A", "B", "C"]

    def test_separates_far_y(self):
        # Tops are 20 apart, far beyond the tolerance.
        upper = _w("A", x0=0, top=100)
        lower = _w("B", x0=0, top=120)
        rows = cluster_rows([upper, lower], y_tolerance=3.0)
        texts = [[w.text for w in r] for r in rows]
        assert texts == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Input is deliberately out of x-order: C, A, B.
        specs = (("C", 40), ("A", 0), ("B", 20))
        words = [_w(label, x0=left, top=100) for label, left in specs]
        rows = cluster_rows(words)
        first_row = rows[0]
        assert [w.text for w in first_row] == ["A", "B", "C"]

    def test_empty(self):
        result = cluster_rows([])
        assert result == []


class TestAssignColumns:
    """Checks for ``assign_columns`` bucketing a row's words by x-boundaries."""

    def test_three_columns(self):
        # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞)
        month = _w("Jan", x0=10, top=0, x1=40)            # col 0
        day = _w("1", x0=45, top=0, x1=55)                # col 0
        description = _w("Deposit", x0=110, top=0, x1=180)  # col 1
        amount = _w("250.00", x0=210, top=0, x1=260)      # col 2
        row = [month, day, description, amount]
        cells = assign_columns(row, [100, 200])
        for idx, expected in enumerate(("Jan 1", "Deposit", "250.00")):
            assert cells[idx] == expected

    def test_no_boundaries_one_column(self):
        # With no boundaries every word falls into a single cell.
        first = _w("A", 0, 0)
        second = _w("B", 20, 0)
        cells = assign_columns([first, second], [])
        assert cells == ["A B"]


class TestPagesInRange:
    """Checks for ``_pages_in_range`` page-selection strings."""

    def _mk(self, n):
        """Return ``n`` empty 600x800 pages numbered from 1."""
        pages = []
        for number in range(1, n + 1):
            pages.append(
                Page(page_no=number, width=600, height=800, text="", words=[])
            )
        return pages

    def test_all(self):
        pages = self._mk(5)
        # Both the literal "all" and an empty spec select every page.
        for spec in ("all", ""):
            assert len(_pages_in_range(pages, spec)) == 5

    def test_explicit_list(self):
        selected = _pages_in_range(self._mk(5), "1,3,5")
        got = [p.page_no for p in selected]
        assert got == [1, 3, 5]

    def test_range(self):
        selected = _pages_in_range(self._mk(5), "2-4")
        got = [p.page_no for p in selected]
        assert got == [2, 3, 4]

    def test_open_ended(self):
        # A trailing dash runs through to the last page.
        selected = _pages_in_range(self._mk(5), "3-")
        got = [p.page_no for p in selected]
        assert got == [3, 4, 5]


class TestWithinTableWindow:
    """Checks for ``_within_table_window`` trimming rows to the table body."""

    def test_header_skipped_end_excluded(self):
        title = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)]
        coffee = [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)]
        refund = [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)]
        closing = [_w("Closing", 0, 80), _w("balance", 50, 80)]
        footer = [_w("Page", 0, 100), _w("1", 50, 100)]
        rows = [title, header, coffee, refund, closing, footer]

        out = _within_table_window(rows, "Date Description Amount", ["Closing balance"])

        # Only the two transaction rows between header and end marker remain.
        assert len(out) == 2
        leading_texts = (out[0][0].text, out[1][0].text)
        assert leading_texts == ("01/15", "01/16")

    def test_no_header_returns_empty_when_required(self):
        # A header is requested but never appears in the rows.
        rows = [[_w("foo", 0, 0)]]
        result = _within_table_window(rows, "Date Description Amount", [])
        assert result == []

    def test_blank_header_passes_through(self):
        # With a blank header and no end markers the rows come back unchanged.
        rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]]
        result = _within_table_window(rows, "", [])
        assert result == rows


class TestApplyTemplate:
    """Template-driven extraction exercised on synthetic ``Page`` objects."""

    def _statement_page(self) -> Page:
        """Return a one-page mock statement.

        Layout: 3 columns at x=0/100/200, header at y=20, data from y=40.
        """
        title = [_w("STATEMENT", 0, 0)]
        header = [_w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20)]
        first_txn = [
            _w("01/15/2026", 5, 40),
            _w("Coffee", 105, 40),
            _w("Shop", 140, 40),
            _w("(4.50)", 205, 40),
        ]
        second_txn = [
            _w("01/16/2026", 5, 60),
            _w("Refund", 105, 60),
            _w("$12.00", 205, 60),
        ]
        # Continuation row (no date) — should merge into the second transaction.
        continuation = [_w("from", 105, 80), _w("vendor", 140, 80)]
        end_marker = [
            _w("Closing", 5, 100),
            _w("balance", 105, 100),
            _w("$1,000.00", 205, 100),
        ]
        words = title + header + first_txn + second_txn + continuation + end_marker
        return Page(page_no=1, width=300, height=120, text="", words=words)

    def _template(self) -> dict:
        """Return a fresh template dict matching ``_statement_page``'s layout."""
        table = {
            "header_text": "Date Description Amount",
            "end_markers": ["Closing balance"],
            "column_boundaries": [100, 200],
            "y_tolerance": 3.0,
            "skip_rows_matching": [],
        }
        columns = [
            {"source": index, "target": target}
            for index, target in enumerate(("date", "description", "amount"))
        ]
        parse = {
            "date_format": "%m/%d/%Y",
            "amount_negative_in_parens": True,
            "merge_multiline_description": True,
        }
        return {
            "pages": {"range": "all"},
            "table": table,
            "columns": columns,
            "parse": parse,
        }

    def test_basic_extraction(self):
        pages = [self._statement_page()]
        df = apply_template(pages, self._template())
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 2
        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
        first = df.iloc[0]
        # Parens-negative
        assert first["amount"] == -4.50
        second = df.iloc[1]
        # Plain positive with currency strip
        assert second["amount"] == 12.00
        # Multi-line description merged
        assert "from vendor" in second["description"]

    def test_debit_credit_split_columns(self):
        # Layout: date | description | debit | credit columns
        header = [
            _w("Date", 5, 0),
            _w("Desc", 105, 0),
            _w("Debit", 205, 0),
            _w("Credit", 305, 0),
        ]
        debit_row = [_w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20)]
        credit_row = [
            _w("01/16/2026", 5, 40),
            _w("Refund", 105, 40),
            _w("", 205, 40),  # no debit
            _w("12.00", 305, 40),
        ]
        page = Page(
            page_no=1,
            width=400,
            height=80,
            text="",
            words=header + debit_row + credit_row,
        )
        table = {
            "header_text": "Date Desc Debit Credit",
            "column_boundaries": [100, 200, 300],
        }
        targets = ("date", "description", "amount_debit", "amount_credit")
        tpl = {
            "table": table,
            "columns": [
                {"source": index, "target": target}
                for index, target in enumerate(targets)
            ],
            "parse": {"date_format": "%m/%d/%Y"},
        }
        df = apply_template([page], tpl)
        amounts = list(df["amount"])
        assert amounts == [-4.50, 12.00]
        kinds = list(df["type"])
        assert kinds == ["debit", "credit"]

    def test_skip_rows_matching(self):
        template = self._template()
        template["table"]["skip_rows_matching"] = ["Refund"]
        df = apply_template([self._statement_page()], template)
        # Refund row is dropped — only one transaction left
        assert len(df) == 1
        remaining = df.iloc[0]
        assert remaining["amount"] == -4.50

    def test_empty_pages_returns_empty_df(self):
        no_pages = []
        df = apply_template(no_pages, self._template())
        assert df.empty