fix(pdf): drop zero-amount rows; strip all dates from descriptions

Two corrections from real-statement feedback:

**1. Drop rows where the transaction amount is exactly 0.**

Bank statements include date+amount-shaped noise like "INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD 0.00 1,234.56" — all match the date+amount heuristic but aren't transactions. New filter in ``scan_pdf_for_transactions``: drop rows whose ``amount_1`` parses to exactly 0. A non-zero balance in ``amount_2`` doesn't rescue a zero ``amount_1`` — the leftmost amount is the canonical transaction amount. Unparsed-but-non-empty amount strings are kept (the user verifies them in the editor).

**2. Multi-date rows: first date wins for the column, every date excluded from the description.**

Chase / BofA / Wells Fargo commonly show both a transaction date and a posting date per row:

    01/13 01/14 COFFEE SHOP $4.50

Before this fix, ``_find_dates_in_words`` returned the first date only and the second date leaked into the description as "01/14 COFFEE SHOP". Now it returns every matching date with its word range (subject to the full-year rule below); the scanner uses ``dates[0]`` as the canonical date and passes every range to the description builder for exclusion.

The detector's two-pass strategy also guards against mixing full-year and short-date matches on the same row. If every match were collected, a header line like ``Page 1/2 of 3 ... Statement Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``, and ``1/2`` (being leftmost) would win the date column. Instead: if any full-year date is found on the row, short patterns are NOT also collected — the full-year date anchors interpretation. A row with no full-year date (the Chase short-date case) still falls back to short patterns and collects all of them.
New tests:

- ``TestFindDatesInWords.test_multiple_dates_returned_in_position_order`` — ``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description`` — end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped`` — "INTEREST EARNED 0.00" row dropped while the real txn is kept
- ``TestZeroAmountRowsAreDropped.test_negative_amount_kept`` — pins that -40.00 (written ``(40.00)`` on the statement) is not treated as zero by the filter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:12:21 +00:00
parent 263af3c7c2
commit 3cf935c999
3 changed files with 205 additions and 34 deletions
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -441,45 +441,61 @@ def extract_pages_auto(
 def _find_dates_in_words(
    row_words: list[WordBox],
 ) -> list[tuple[int, int, str]]:
-    """Return ``[(start_idx, end_idx, date_text)]`` for the first
-    date-like substring on this row, or ``[]`` if none.
+    """Return every date-like substring on this row, sorted by
+    position. Each entry is ``(start_idx, end_idx_exclusive, text)``.

    Two-pass search:

    - **Pass 1** — full-year patterns (``01/15/2026``,
-      ``Jan 13, 2026``). Tries the longest window first within
-      this pass so a multi-word ``Jan 15, 2026`` isn't truncated
-      to ``Jan 15``.
+      ``Jan 13, 2026``). Longest window first so multi-word dates
+      aren't truncated to a partial short match.
    - **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
-      runs if pass 1 found nothing — otherwise a stray
-      ``Page 1/2`` on the same line could shadow the real dated
-      transaction.
+      runs if pass 1 found nothing, so a stray ``Page 1/2`` is
+      never collected alongside (or ahead of) a real
+      ``01/13/2026`` on the same row.

-    ``end_idx`` is exclusive — caller uses ``range(start, end)``
-    to exclude all words the date consumed from the description
-    (the previous single-index return mis-attributed the day
-    token of multi-word dates like ``Jan 13`` to the description).
+    Some statements show both a transaction date and a posting
+    date per row (Chase, BofA, …). The scanner uses the first
+    match as the canonical date for the CSV column, and excludes
+    EVERY date from the description so the second / third dates
+    don't leak into the description text.
    """
-    for patterns, window_order in (
-        (_DATE_RES_FULL, (3, 2, 1)),
-        (_DATE_RES_SHORT, (2, 1)),
-    ):
+    def _scan(patterns, window_order):
+        local_found: list[tuple[int, int, str]] = []
+        local_claimed: set[int] = set()
        for i in range(len(row_words)):
+            if i in local_claimed:
+                continue
+            matched = False
            for window in window_order:
                end = i + window
                if end > len(row_words):
                    continue
+                if any(j in local_claimed for j in range(i, end)):
+                    continue
                chunk = " ".join(x.text for x in row_words[i:end])
                for rx in patterns:
                    m = rx.search(chunk)
                    if m:
-                        # Count whitespace-separated tokens in the
-                        # MATCH, not in the window — the window may
-                        # have included extra trailing words the
-                        # regex didn't actually consume.
                        consumed = max(1, len(m.group(1).split()))
-                        return [(i, i + consumed, m.group(1))]
-    return []
+                        actual_end = i + consumed
+                        local_found.append((i, actual_end, m.group(1)))
+                        local_claimed.update(range(i, actual_end))
+                        matched = True
+                        break
+                if matched:
+                    break
+        return local_found
+
+    full = _scan(_DATE_RES_FULL, (3, 2, 1))
+    if full:
+        # A real full-year date on the row anchors interpretation.
+        # Don't ALSO collect short patterns — they're almost always
+        # page numbers ("Page 1/2") or fractions in memos when a
+        # real date is present.
+        return sorted(full, key=lambda t: t[0])
+    short = _scan(_DATE_RES_SHORT, (2, 1))
+    return sorted(short, key=lambda t: t[0])


 def _find_amount_tokens(
@@ -506,23 +522,30 @@ def _find_amount_tokens(

 def _description_from_row(
    row_words: list[WordBox],
-    date_range: tuple[int, int],
+    date_ranges: list[tuple[int, int]],
    amount_idxs: set[int],
 ) -> str:
    """Stitch the description from the row's non-date, non-amount
-    tokens. ``date_range`` is ``(start, end)`` exclusive — every
-    word in that range is excluded so multi-word dates like
-    ``Jan 13`` don't leak the day token into the description.
+    tokens. ``date_ranges`` is a list of ``(start, end)`` (end
+    exclusive) — every word in any range is excluded.
+
+    Why a list: some bank statements show two dates per row
+    (transaction + posting). Without excluding all of them, the
+    extra date(s) leak into the description and look like trash.

    Keeps tokens before the first amount and after the last
    amount (trailing check numbers, memos); drops words between
-    amount tokens (usually whitespace artifacts in column gaps)."""
-    date_start, date_end = date_range
+    amount tokens (usually whitespace artifacts in column gaps).
+    """
+    excluded: set[int] = set()
+    for start, end in date_ranges:
+        excluded.update(range(start, end))
+
    keep: list[str] = []
    seen_first_amount = False
    last_amount_idx = max(amount_idxs) if amount_idxs else -1
    for i, w in enumerate(row_words):
-        if date_start <= i < date_end:
+        if i in excluded:
            continue
        if i in amount_idxs:
            seen_first_amount = True
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
                    )
                continue

-            date_start, date_end, date_text = dates[0]
+            # First date wins for the "date" column; ALL dates are
+            # excluded from the description so a row carrying both
+            # a transaction date and a posting date doesn't leak
+            # the second one into description text.
+            _, _, first_date_text = dates[0]
+            date_ranges = [(s, e) for s, e, _ in dates]
            amount_idxs = {idx for idx, _, _ in amount_tokens}
            desc = _description_from_row(
-                row_words, (date_start, date_end), amount_idxs,
+                row_words, date_ranges, amount_idxs,
            )

            record: dict[str, Any] = {
-                "date": parse_date(date_text, date_formats) or date_text,
+                "date": (
+                    parse_date(first_date_text, date_formats)
+                    or first_date_text
+                ),
                "description": desc,
                "page": page.page_no,
                "raw": line,
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
                record[f"amount_{k}"] = (
                    parsed if parsed is not None else txt
                )
+
+            # Drop rows where the transaction amount is exactly 0.
+            # Bank statements include noise like "INTEREST EARNED
+            # 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
+            # heuristic but aren't real transactions. We key off
+            # ``amount_1`` (leftmost amount = usually the txn
+            # amount); a non-zero balance in ``amount_2`` doesn't
+            # rescue a zero ``amount_1``.
+            if not _has_real_transaction_amount(record):
+                continue
+
            out_rows.append(record)
            prev = record

    return out_rows, warnings


+def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
+    """``amount_1`` is the row's primary amount. Drop rows whose
+    amount_1 is missing, blank, or parsed to exactly 0; keep the
+    rest (positive, negative, or unparsed-but-non-empty)."""
+    amount_1 = record.get("amount_1")
+    if amount_1 is None:
+        return False
+    if isinstance(amount_1, (int, float)):
+        return amount_1 != 0
+    # Unparsed string — keep so the user can verify in the editor.
+    return bool(str(amount_1).strip())
+
+
 def diagnose_pdf_lines(
    pdf_bytes: bytes,
    *,
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -151,14 +151,29 @@ class TestFindDatesInWords:
    def test_short_pattern_does_not_shadow_full_year(self):
        """If a full-year date is present, short patterns shouldn't
        steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
-        return the real ``01/13/2026``, not the ``1/2`` page marker."""
+        return the real ``01/13/2026`` first."""
        row = [
            _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
        ]
        result = _find_dates_in_words(row)
-        assert result and result[0][2] == "01/13/2026"
+        # Full-year match wins position 0 in the returned list.
+        assert result[0][2] == "01/13/2026"
+
+    def test_multiple_dates_returned_in_position_order(self):
+        """Chase-style transaction with both posting and txn dates."""
+        row = [
+            _w("01/13", 0, 0), _w("01/14", 50, 0),
+            _w("Coffee", 100, 0), _w("$4.50", 200, 0),
+        ]
+        result = _find_dates_in_words(row)
+        assert len(result) == 2
+        assert result[0][2] == "01/13"
+        assert result[1][2] == "01/14"
+        # First date claims word 0, second claims word 1
+        assert result[0][:2] == (0, 1)
+        assert result[1][:2] == (1, 2)

    def test_no_date(self):
        row = [_w("Just", 0, 0), _w("text", 50, 0)]
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -144,6 +144,107 @@ class TestScanPdfForTransactions:
 # ---------------------------------------------------------------------------


+class TestMultiDateRow:
+    """Some statements (Chase, BofA) show both a transaction date
+    and a posting date per row. The scanner uses the first date
+    in position order and excludes every date from the description."""
+
+    def test_first_date_wins_second_excluded_from_description(self):
+        from src import pdf_extract as mod
+        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
+
+        original = mod.extract_pages_auto
+
+        def fake(_b, *, allow_ocr=True):
+            words = [
+                WordBox(x0=0, top=0, x1=40, bottom=10, text="01/13"),
+                WordBox(x0=50, top=0, x1=90, bottom=10, text="01/14"),
+                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
+                WordBox(x0=170, top=0, x1=210, bottom=10, text="Shop"),
+                WordBox(x0=220, top=0, x1=270, bottom=10, text="$4.50"),
+            ]
+            return [Page(
+                page_no=1, width=300, height=20, text="", words=words,
+            )], []
+
+        mod.extract_pages_auto = fake
+        try:
+            rows, _ = scan_pdf_for_transactions(b"")
+        finally:
+            mod.extract_pages_auto = original
+
+        assert len(rows) == 1
+        # First date used as the canonical
+        assert rows[0]["date"] == "01/13"
+        # Second date NOT in description
+        assert "01/14" not in rows[0]["description"]
+        # Description is the actual content between dates and amount
+        assert rows[0]["description"] == "Coffee Shop"
+
+
+class TestZeroAmountRowsAreDropped:
+    """Rows where the transaction amount is exactly 0 are noise
+    (statements love to print "INTEREST EARNED 0.00" or
+    "PAGE TOTAL 0.00") and get filtered out."""
+
+    def test_zero_amount_row_dropped(self):
+        from src import pdf_extract as mod
+        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
+
+        original = mod.extract_pages_auto
+
+        def fake(_b, *, allow_ocr=True):
+            words = [
+                # Real transaction
+                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
+                WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
+                WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
+                # Zero-amount noise row (should be dropped)
+                WordBox(x0=0, top=20, x1=80, bottom=30, text="01/14/2026"),
+                WordBox(x0=100, top=20, x1=180, bottom=30, text="INTEREST"),
+                WordBox(x0=200, top=20, x1=240, bottom=30, text="0.00"),
+            ]
+            return [Page(
+                page_no=1, width=300, height=40, text="", words=words,
+            )], []
+
+        mod.extract_pages_auto = fake
+        try:
+            rows, _ = scan_pdf_for_transactions(b"")
+        finally:
+            mod.extract_pages_auto = original
+
+        assert len(rows) == 1
+        assert rows[0]["amount_1"] == 4.50
+        assert "INTEREST" not in rows[0]["description"]
+
+    def test_negative_amount_kept(self):
+        from src import pdf_extract as mod
+        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
+
+        original = mod.extract_pages_auto
+
+        def fake(_b, *, allow_ocr=True):
+            words = [
+                WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
+                WordBox(x0=100, top=0, x1=160, bottom=10, text="Withdraw"),
+                WordBox(x0=200, top=0, x1=240, bottom=10, text="(40.00)"),
+            ]
+            return [Page(
+                page_no=1, width=300, height=20, text="", words=words,
+            )], []
+
+        mod.extract_pages_auto = fake
+        try:
+            rows, _ = scan_pdf_for_transactions(b"")
+        finally:
+            mod.extract_pages_auto = original
+
+        # -40 is not zero — keep it
+        assert len(rows) == 1
+        assert rows[0]["amount_1"] == -40.00
+
+
 class TestMultilineDescription:
    def test_continuation_line_merges(self):
        """A line with no date and no amount, sitting between two