fix(pdf): drop zero-amount rows; multi-date rows clean description

Two corrections from real-statement feedback:

**1. Drop rows where the transaction amount is exactly 0.**

Bank statements include date+amount-shaped noise like "INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD 0.00 1,234.56" — all match the date+amount heuristic but aren't transactions. New filter in ``scan_pdf_for_transactions``: drop rows whose ``amount_1`` parses to exactly 0. Non-zero balances in ``amount_2`` don't rescue a zero ``amount_1`` — the leftmost amount is the canonical transaction amount. Unparsed-but-non-empty amount strings are kept (user verifies in the editor).

**2. Multi-date rows: first date wins for the column, every date excluded from the description.**

Chase / BofA / Wells commonly show both a transaction date and a posting date per row:

    01/13 01/14 COFFEE SHOP $4.50

Before this fix, ``_find_dates_in_words`` returned the first date only and the second date leaked into description as "01/14 COFFEE SHOP". Now it returns ALL dates with their word ranges; the scanner uses ``dates[0]`` as the canonical date and passes every range to the description builder for exclusion.

The detector's two-pass strategy now also guards against mixing full-year and short-date matches on the same row. Previously, a header line like ``Page 1/2 of 3 ... Statement Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``, and ``1/2`` (being leftmost) would have won the date column. Now: if any full-year date is found on the row, short patterns are NOT also collected — full year anchors interpretation. A row with no full-year date (Chase short-date case) still falls back to short patterns and collects all of them.
New tests:

- ``test_multiple_dates_returned_in_position_order`` — ``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description`` — end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped`` — "INTEREST EARNED 0.00" row dropped while real txn kept
- ``test_negative_amount_kept`` — pin that -40.00 is not treated as zero by the filter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:12:21 +00:00
parent 263af3c7c2
commit 3cf935c999
3 changed files with 205 additions and 34 deletions
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -441,45 +441,61 @@ def extract_pages_auto(
 def _find_dates_in_words(
    row_words: list[WordBox],
 ) -> list[tuple[int, int, str]]:
-    """Return ``[(start_idx, end_idx, date_text)]`` for the first
+    """Return every date-like substring on this row, sorted by
-    date-like substring on this row, or ``[]`` if none.
+    position. Each entry is ``(start_idx, end_idx_exclusive, text)``.
    Two-pass search:
    - **Pass 1** — full-year patterns (``01/15/2026``,
-      ``Jan 13, 2026``). Tries the longest window first within
+      ``Jan 13, 2026``). Longest window first so multi-word dates
-      this pass so a multi-word ``Jan 15, 2026`` isn't truncated
+      aren't truncated to a partial short match.
      to ``Jan 15``.
    - **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
-      runs if pass 1 found nothing — otherwise a stray
+      runs when pass 1 found nothing, so a real
-      ``Page 1/2`` on the same line could shadow the real dated
+      ``01/13/2026`` is never returned alongside a stray
-      transaction.
+      ``Page 1/2``.
-    ``end_idx`` is exclusive — caller uses ``range(start, end)``
+    Some statements show both a transaction date and a posting
-    to exclude all words the date consumed from the description
+    date per row (Chase, BofA, …). The scanner uses the first
-    (the previous single-index return mis-attributed the day
+    match as the canonical date for the CSV column, and excludes
-    token of multi-word dates like ``Jan 13`` to the description).
+    EVERY date from the description so the second / third dates
    don't leak into the description text.
    """
-    for patterns, window_order in (
+    def _scan(patterns, window_order):
-        (_DATE_RES_FULL, (3, 2, 1)),
+        local_found: list[tuple[int, int, str]] = []
-        (_DATE_RES_SHORT, (2, 1)),
+        local_claimed: set[int] = set()
    ):
        for i in range(len(row_words)):
            if i in local_claimed:
                continue
            matched = False
            for window in window_order:
                end = i + window
                if end > len(row_words):
                    continue
                if any(j in local_claimed for j in range(i, end)):
                    continue
                chunk = " ".join(x.text for x in row_words[i:end])
                for rx in patterns:
                    m = rx.search(chunk)
                    if m:
                        # Count whitespace-separated tokens in the
                        # MATCH, not in the window — the window may
                        # have included extra trailing words the
                        # regex didn't actually consume.
                        consumed = max(1, len(m.group(1).split()))
-                        return [(i, i + consumed, m.group(1))]
+                        actual_end = i + consumed
-    return []
+                        local_found.append((i, actual_end, m.group(1)))
                        local_claimed.update(range(i, actual_end))
                        matched = True
                        break
                if matched:
                    break
        return local_found
    full = _scan(_DATE_RES_FULL, (3, 2, 1))
    if full:
        # A real full-year date on the row anchors interpretation.
        # Don't ALSO collect short patterns — they're almost always
        # page numbers ("Page 1/2") or fractions in memos when a
        # real date is present.
        return sorted(full, key=lambda t: t[0])
    short = _scan(_DATE_RES_SHORT, (2, 1))
    return sorted(short, key=lambda t: t[0])
 def _find_amount_tokens(
@@ -506,23 +522,30 @@ def _find_amount_tokens(
 def _description_from_row(
    row_words: list[WordBox],
-    date_range: tuple[int, int],
+    date_ranges: list[tuple[int, int]],
    amount_idxs: set[int],
 ) -> str:
    """Stitch the description from the row's non-date, non-amount
-    tokens. ``date_range`` is ``(start, end)`` exclusive — every
+    tokens. ``date_ranges`` is a list of ``(start, end)`` (end
-    word in that range is excluded so multi-word dates like
+    exclusive) — every word in any range is excluded.
-    ``Jan 13`` don't leak the day token into the description.
+
    Why a list: some bank statements show two dates per row
    (transaction + posting). Without excluding all of them, the
    extra date(s) leak into the description and look like trash.
    Keeps tokens before the first amount and after the last
    amount (trailing check numbers, memos); drops words between
-    amount tokens (usually whitespace artifacts in column gaps)."""
+    amount tokens (usually whitespace artifacts in column gaps).
-    date_start, date_end = date_range
+    """
    excluded: set[int] = set()
    for start, end in date_ranges:
        excluded.update(range(start, end))
    keep: list[str] = []
    seen_first_amount = False
    last_amount_idx = max(amount_idxs) if amount_idxs else -1
    for i, w in enumerate(row_words):
-        if date_start <= i < date_end:
+        if i in excluded:
            continue
        if i in amount_idxs:
            seen_first_amount = True
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
                    )
                continue
-            date_start, date_end, date_text = dates[0]
+            # First date wins for the "date" column; ALL dates are
            # excluded from the description so a row carrying both
            # a transaction date and a posting date doesn't leak
            # the second one into description text.
            _, _, first_date_text = dates[0]
            date_ranges = [(s, e) for s, e, _ in dates]
            amount_idxs = {idx for idx, _, _ in amount_tokens}
            desc = _description_from_row(
-                row_words, (date_start, date_end), amount_idxs,
+                row_words, date_ranges, amount_idxs,
            )
            record: dict[str, Any] = {
-                "date": parse_date(date_text, date_formats) or date_text,
+                "date": (
                    parse_date(first_date_text, date_formats)
                    or first_date_text
                ),
                "description": desc,
                "page": page.page_no,
                "raw": line,
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
                record[f"amount_{k}"] = (
                    parsed if parsed is not None else txt
                )
            # Drop rows where the transaction amount is exactly 0.
            # Bank statements include noise like "INTEREST EARNED
            # 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
            # heuristic but aren't real transactions. We key off
            # ``amount_1`` (leftmost amount = usually the txn
            # amount); a non-zero balance in ``amount_2`` doesn't
            # rescue a zero ``amount_1``.
            if not _has_real_transaction_amount(record):
                continue
            out_rows.append(record)
            prev = record
    return out_rows, warnings
 def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
    """Return True when ``record`` carries a usable, non-zero ``amount_1``.

    ``amount_1`` is the row's primary amount. The rules are:

    - missing / ``None``  -> ``False`` (nothing to report);
    - any numeric value   -> ``True`` unless it equals exactly 0
      (positive and negative amounts are both kept);
    - anything else       -> treated as an unparsed amount string and
      kept when it is non-blank, so the user can verify it in the
      editor.

    Numeric values are recognised through ``numbers.Number`` rather
    than ``(int, float)`` so that a ``Decimal`` or ``Fraction`` zero
    is dropped too instead of slipping through the string branch
    (``str(Decimal("0.00"))`` is non-empty).
    """
    import numbers

    amount_1 = record.get("amount_1")
    if amount_1 is None:
        return False
    if isinstance(amount_1, numbers.Number):
        return amount_1 != 0
    # Unparsed string — keep so the user can verify in the editor.
    return bool(str(amount_1).strip())
 def diagnose_pdf_lines(
    pdf_bytes: bytes,
    *,
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -151,14 +151,29 @@ class TestFindDatesInWords:
    def test_short_pattern_does_not_shadow_full_year(self):
        """If a full-year date is present, short patterns shouldn't
        steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
-        return the real ``01/13/2026``, not the ``1/2`` page marker."""
+        return the real ``01/13/2026`` first."""
        row = [
            _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
        ]
        result = _find_dates_in_words(row)
-        assert result and result[0][2] == "01/13/2026"
+        # Full-year match wins position 0 in the returned list.
        assert result[0][2] == "01/13/2026"
    def test_multiple_dates_returned_in_position_order(self):
        """Two short dates on one row (Chase-style transaction and
        posting dates) are both returned, ordered left to right."""
        tokens = (
            ("01/13", 0), ("01/14", 50), ("Coffee", 100), ("$4.50", 200),
        )
        words = [_w(text, x, 0) for text, x in tokens]
        found = _find_dates_in_words(words)
        # Only the two date words are reported.
        assert len(found) == 2
        first, second = found
        assert first[2] == "01/13"
        assert second[2] == "01/14"
        # Each date spans a single word: word 0, then word 1
        # (end index is exclusive).
        assert first[:2] == (0, 1)
        assert second[:2] == (1, 2)
    def test_no_date(self):
        row = [_w("Just", 0, 0), _w("text", 50, 0)]
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -144,6 +144,107 @@ class TestScanPdfForTransactions:
 # ---------------------------------------------------------------------------
 class TestMultiDateRow:
    """Rows that carry two dates (transaction + posting, as seen on
    Chase / BofA statements): the leftmost date fills the date
    column and no date token may show up in the description."""

    def test_first_date_wins_second_excluded_from_description(self):
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        # One visual line: two short dates, a two-word payee, an amount.
        # Each entry is (x0, x1, text); all words share top=0, bottom=10.
        layout = (
            (0, 40, "01/13"),
            (50, 90, "01/14"),
            (100, 160, "Coffee"),
            (170, 210, "Shop"),
            (220, 270, "$4.50"),
        )

        def fake(_b, *, allow_ocr=True):
            boxes = [
                WordBox(x0=left, top=0, x1=right, bottom=10, text=token)
                for left, right, token in layout
            ]
            page = Page(
                page_no=1, width=300, height=20, text="", words=boxes,
            )
            return [page], []

        # Swap in the synthetic extractor for this call only and always
        # put the real one back, even if the scan raises.
        saved = mod.extract_pages_auto
        mod.extract_pages_auto = fake
        try:
            rows, _ = scan_pdf_for_transactions(b"")
        finally:
            mod.extract_pages_auto = saved

        assert len(rows) == 1
        record = rows[0]
        # The first date is the canonical one.
        assert record["date"] == "01/13"
        # The second date must not leak into the description ...
        assert "01/14" not in record["description"]
        # ... which is exactly the text between the dates and the amount.
        assert record["description"] == "Coffee Shop"
 class TestZeroAmountRowsAreDropped:
    """Rows where the transaction amount is exactly 0 are noise
    (statements love to print "INTEREST EARNED 0.00" or
    "PAGE TOTAL 0.00") and get filtered out; non-zero amounts,
    negatives included, are kept."""

    @staticmethod
    def _scan_words(word_specs, page_height):
        """Run ``scan_pdf_for_transactions`` over one synthetic page.

        ``word_specs`` is a sequence of ``(x0, top, x1, bottom, text)``
        tuples, one per word on the page. ``extract_pages_auto`` is
        replaced only for the duration of the scan and restored
        afterwards (also on failure), so nothing leaks between tests.
        Returns the scanned rows; the warnings list is discarded.
        """
        from unittest import mock

        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def fake(_b, *, allow_ocr=True):
            words = [
                WordBox(x0=x0, top=top, x1=x1, bottom=bottom, text=text)
                for x0, top, x1, bottom, text in word_specs
            ]
            return [Page(
                page_no=1, width=300, height=page_height, text="",
                words=words,
            )], []

        with mock.patch.object(mod, "extract_pages_auto", fake):
            rows, _ = scan_pdf_for_transactions(b"")
        return rows

    def test_zero_amount_row_dropped(self):
        rows = self._scan_words(
            [
                # Real transaction
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Coffee"),
                (200, 0, 240, 10, "$4.50"),
                # Zero-amount noise row (should be dropped)
                (0, 20, 80, 30, "01/14/2026"),
                (100, 20, 180, 30, "INTEREST"),
                (200, 20, 240, 30, "0.00"),
            ],
            page_height=40,
        )
        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self):
        rows = self._scan_words(
            [
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Withdraw"),
                (200, 0, 240, 10, "(40.00)"),
            ],
            page_height=20,
        )
        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00
 class TestMultilineDescription:
    def test_continuation_line_merges(self):
        """A line with no date and no amount, sitting between two