fix(pdf): drop zero-amount rows; multi-date rows clean description

Two corrections from real-statement feedback:

**1. Drop rows where the transaction amount is exactly 0.**

Bank statements include date+amount-shaped noise like "INTEREST EARNED 0.00", "PAGE TOTAL 0.00", "BALANCE FORWARD 0.00 1,234.56" — all match the date+amount heuristic but aren't transactions. New filter in ``scan_pdf_for_transactions``: drop rows whose ``amount_1`` parses to exactly 0. A non-zero balance in ``amount_2`` doesn't rescue a zero ``amount_1`` — the leftmost amount is the canonical transaction amount. Unparsed-but-non-empty amount strings are kept (the user verifies them in the editor).

**2. Multi-date rows: first date wins for the column, every date excluded from the description.**

Chase / BofA / Wells commonly show both a transaction date and a posting date per row:

    01/13 01/14 COFFEE SHOP $4.50

Before this fix, ``_find_dates_in_words`` returned the first date only and the second date leaked into the description as "01/14 COFFEE SHOP". Now it returns ALL dates with their word ranges; the scanner uses ``dates[0]`` as the canonical date and passes every range to the description builder for exclusion.

The detector's two-pass strategy now also guards against mixing full-year and short-date matches on the same row. Previously, a header line like ``Page 1/2 of 3 ... Statement Date 01/13/2026`` would return both ``1/2`` and ``01/13/2026``, and ``1/2`` (being leftmost) would have won the date column. Now: if any full-year date is found on the row, short patterns are NOT also collected — the full year anchors interpretation. A row with no full-year date (the Chase short-date case) still falls back to short patterns and collects all of them.

New tests:

- ``test_multiple_dates_returned_in_position_order`` — ``01/13`` + ``01/14`` both returned, in order
- ``TestMultiDateRow.test_first_date_wins_second_excluded_from_description`` — end-to-end through ``scan_pdf_for_transactions``
- ``TestZeroAmountRowsAreDropped.test_zero_amount_row_dropped`` — "INTEREST EARNED 0.00" row dropped while the real txn is kept
- ``test_negative_amount_kept`` — pins that -40.00 is not treated as zero by the filter

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 00:12:21 +00:00
parent 263af3c7c2
commit 3cf935c999
3 changed files with 205 additions and 34 deletions
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -441,45 +441,61 @@ def extract_pages_auto(
 def _find_dates_in_words(
    row_words: list[WordBox],
 ) -> list[tuple[int, int, str]]:
-    """Return ``[(start_idx, end_idx, date_text)]`` for the first
-    date-like substring on this row, or ``[]`` if none.
+    """Return every date-like substring on this row, sorted by
+    position. Each entry is ``(start_idx, end_idx_exclusive, text)``.

    Two-pass search:

    - **Pass 1** — full-year patterns (``01/15/2026``,
-      ``Jan 13, 2026``). Tries the longest window first within
-      this pass so a multi-word ``Jan 15, 2026`` isn't truncated
-      to ``Jan 15``.
+      ``Jan 13, 2026``). Longest window first so multi-word dates
+      aren't truncated to a partial short match.
    - **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
-      runs if pass 1 found nothing — otherwise a stray
-      ``Page 1/2`` on the same line could shadow the real dated
-      transaction.
+      runs if pass 1 found nothing — when a full-year date is
+      present, short matches on the same row (e.g. a stray
+      ``Page 1/2``) are ignored entirely.

-    ``end_idx`` is exclusive — caller uses ``range(start, end)``
-    to exclude all words the date consumed from the description
-    (the previous single-index return mis-attributed the day
-    token of multi-word dates like ``Jan 13`` to the description).
+    Some statements show both a transaction date and a posting
+    date per row (Chase, BofA, …). The scanner uses the first
+    match as the canonical date for the CSV column, and excludes
+    EVERY date from the description so the second / third dates
+    don't leak into the description text.
    """
-    for patterns, window_order in (
-        (_DATE_RES_FULL, (3, 2, 1)),
-        (_DATE_RES_SHORT, (2, 1)),
-    ):
+    def _scan(patterns, window_order):
+        local_found: list[tuple[int, int, str]] = []
+        local_claimed: set[int] = set()
        for i in range(len(row_words)):
+            if i in local_claimed:
+                continue
+            matched = False
            for window in window_order:
                end = i + window
                if end > len(row_words):
                    continue
+                if any(j in local_claimed for j in range(i, end)):
+                    continue
                chunk = " ".join(x.text for x in row_words[i:end])
                for rx in patterns:
                    m = rx.search(chunk)
                    if m:
-                        # Count whitespace-separated tokens in the
-                        # MATCH, not in the window — the window may
-                        # have included extra trailing words the
-                        # regex didn't actually consume.
                        consumed = max(1, len(m.group(1).split()))
-                        return [(i, i + consumed, m.group(1))]
-    return []
+                        actual_end = i + consumed
+                        local_found.append((i, actual_end, m.group(1)))
+                        local_claimed.update(range(i, actual_end))
+                        matched = True
+                        break
+                if matched:
+                    break
+        return local_found
+
+    full = _scan(_DATE_RES_FULL, (3, 2, 1))
+    if full:
+        # A real full-year date on the row anchors interpretation.
+        # Don't ALSO collect short patterns — they're almost always
+        # page numbers ("Page 1/2") or fractions in memos when a
+        # real date is present.
+        return sorted(full, key=lambda t: t[0])
+    short = _scan(_DATE_RES_SHORT, (2, 1))
+    return sorted(short, key=lambda t: t[0])


 def _find_amount_tokens(
@@ -506,23 +522,30 @@ def _find_amount_tokens(

 def _description_from_row(
    row_words: list[WordBox],
-    date_range: tuple[int, int],
+    date_ranges: list[tuple[int, int]],
    amount_idxs: set[int],
 ) -> str:
    """Stitch the description from the row's non-date, non-amount
-    tokens. ``date_range`` is ``(start, end)`` exclusive — every
-    word in that range is excluded so multi-word dates like
-    ``Jan 13`` don't leak the day token into the description.
+    tokens. ``date_ranges`` is a list of ``(start, end)`` (end
+    exclusive) — every word in any range is excluded.
+
+    Why a list: some bank statements show two dates per row
+    (transaction + posting). Without excluding all of them, the
+    extra date(s) leak into the description and look like trash.

    Keeps tokens before the first amount and after the last
    amount (trailing check numbers, memos); drops words between
-    amount tokens (usually whitespace artifacts in column gaps)."""
-    date_start, date_end = date_range
+    amount tokens (usually whitespace artifacts in column gaps).
+    """
+    excluded: set[int] = set()
+    for start, end in date_ranges:
+        excluded.update(range(start, end))
+
    keep: list[str] = []
    seen_first_amount = False
    last_amount_idx = max(amount_idxs) if amount_idxs else -1
    for i, w in enumerate(row_words):
-        if date_start <= i < date_end:
+        if i in excluded:
            continue
        if i in amount_idxs:
            seen_first_amount = True
@@ -594,14 +617,22 @@ def scan_pdf_for_transactions(
                    )
                continue

-            date_start, date_end, date_text = dates[0]
+            # First date wins for the "date" column; ALL dates are
+            # excluded from the description so a row carrying both
+            # a transaction date and a posting date doesn't leak
+            # the second one into description text.
+            _, _, first_date_text = dates[0]
+            date_ranges = [(s, e) for s, e, _ in dates]
            amount_idxs = {idx for idx, _, _ in amount_tokens}
            desc = _description_from_row(
-                row_words, (date_start, date_end), amount_idxs,
+                row_words, date_ranges, amount_idxs,
            )

            record: dict[str, Any] = {
-                "date": parse_date(date_text, date_formats) or date_text,
+                "date": (
+                    parse_date(first_date_text, date_formats)
+                    or first_date_text
+                ),
                "description": desc,
                "page": page.page_no,
                "raw": line,
@@ -616,12 +647,36 @@ def scan_pdf_for_transactions(
                record[f"amount_{k}"] = (
                    parsed if parsed is not None else txt
                )
+
+            # Drop rows where the transaction amount is exactly 0.
+            # Bank statements include noise like "INTEREST EARNED
+            # 0.00" or "PAGE TOTAL 0.00" that pass the date+amount
+            # heuristic but aren't real transactions. We key off
+            # ``amount_1`` (leftmost amount = usually the txn
+            # amount); a non-zero balance in ``amount_2`` doesn't
+            # rescue a zero ``amount_1``.
+            if not _has_real_transaction_amount(record):
+                continue
+
            out_rows.append(record)
            prev = record

    return out_rows, warnings


+def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
+    """``amount_1`` is the row's primary amount. Drop rows whose
+    amount_1 is exactly 0, missing, or blank; keep non-zero numbers
+    (positive or negative) and non-blank unparsed strings."""
+    amount_1 = record.get("amount_1")
+    if amount_1 is None:
+        return False
+    if isinstance(amount_1, (int, float)):
+        return amount_1 != 0
+    # Unparsed string — keep so the user can verify in the editor.
+    return bool(str(amount_1).strip())
+
+
 def diagnose_pdf_lines(
    pdf_bytes: bytes,
    *,