def _find_dates_in_words(
    row_words: list[WordBox],
) -> list[tuple[int, int, str]]:
    """Return every date-like substring on this row, in position order.

    Each entry is ``(start_idx, end_idx_exclusive, text)``; the indices
    refer to ``row_words`` and ``text`` is capture group 1 of the
    pattern that matched.

    Two-pass search:

    - **Pass 1** — full-year patterns (``01/15/2026``,
      ``Jan 13, 2026``). Longest window first so multi-word dates
      aren't truncated to a partial match.
    - **Pass 2** — short patterns (``01/13``, ``Jan 13``). Runs only
      when pass 1 found nothing: once a full-year date anchors the
      row, short matches are almost always page numbers
      (``Page 1/2``) or fractions in memos.

    A match is accepted only when it begins inside the first word of
    the window being tested. ``search`` may also hit a date that starts
    at a later word of the window; attributing that hit to the window's
    first word would exclude the wrong word from the description and
    report the same date again once the loop reaches the word it really
    starts at. Such hits are skipped here and picked up by that later
    word's own iteration.

    Some statements show both a transaction date and a posting date
    per row (Chase, BofA, …). The scanner uses the first entry as the
    canonical date and excludes every returned range from the
    description.

    Returns ``[]`` when the row has no date-like text.
    """

    def _match_from(start, patterns, window_order, claimed):
        """Return ``(end_exclusive, text)`` for a date beginning at
        word ``start``, or ``None`` if no pattern matches there."""
        for window in window_order:
            end = start + window
            if end > len(row_words):
                continue
            # Never build a window across words an earlier date owns.
            if not claimed.isdisjoint(range(start, end)):
                continue
            chunk = " ".join(w.text for w in row_words[start:end])
            for rx in patterns:
                m = rx.search(chunk)
                if m is None:
                    continue
                # Words are joined by single spaces, so a space before
                # the match means the date starts at a later word.
                if " " in chunk[:m.start(1)]:
                    continue
                # Count the words in the MATCH, not in the window: the
                # window may hold trailing words the regex didn't use.
                consumed = max(1, len(m.group(1).split()))
                return start + consumed, m.group(1)
        return None

    def _scan(patterns, window_order):
        """Collect non-overlapping matches, left to right."""
        found: list[tuple[int, int, str]] = []
        claimed: set[int] = set()
        for i in range(len(row_words)):
            if i in claimed:
                continue
            hit = _match_from(i, patterns, window_order, claimed)
            if hit is None:
                continue
            end, text = hit
            found.append((i, end, text))
            claimed.update(range(i, end))
        return found

    # Every entry starts at the index the loop was visiting, so both
    # result lists are already in position order.
    return _scan(_DATE_RES_FULL, (3, 2, 1)) or _scan(_DATE_RES_SHORT, (2, 1))
+ + Why a list: some bank statements show two dates per row + (transaction + posting). Without excluding all of them, the + extra date(s) leak into the description and look like trash. Keeps tokens before the first amount and after the last amount (trailing check numbers, memos); drops words between - amount tokens (usually whitespace artifacts in column gaps).""" - date_start, date_end = date_range + amount tokens (usually whitespace artifacts in column gaps). + """ + excluded: set[int] = set() + for start, end in date_ranges: + excluded.update(range(start, end)) + keep: list[str] = [] seen_first_amount = False last_amount_idx = max(amount_idxs) if amount_idxs else -1 for i, w in enumerate(row_words): - if date_start <= i < date_end: + if i in excluded: continue if i in amount_idxs: seen_first_amount = True @@ -594,14 +617,22 @@ def scan_pdf_for_transactions( ) continue - date_start, date_end, date_text = dates[0] + # First date wins for the "date" column; ALL dates are + # excluded from the description so a row carrying both + # a transaction date and a posting date doesn't leak + # the second one into description text. + _, _, first_date_text = dates[0] + date_ranges = [(s, e) for s, e, _ in dates] amount_idxs = {idx for idx, _, _ in amount_tokens} desc = _description_from_row( - row_words, (date_start, date_end), amount_idxs, + row_words, date_ranges, amount_idxs, ) record: dict[str, Any] = { - "date": parse_date(date_text, date_formats) or date_text, + "date": ( + parse_date(first_date_text, date_formats) + or first_date_text + ), "description": desc, "page": page.page_no, "raw": line, @@ -616,12 +647,36 @@ def scan_pdf_for_transactions( record[f"amount_{k}"] = ( parsed if parsed is not None else txt ) + + # Drop rows where the transaction amount is exactly 0. + # Bank statements include noise like "INTEREST EARNED + # 0.00" or "PAGE TOTAL 0.00" that pass the date+amount + # heuristic but aren't real transactions. 
def _has_real_transaction_amount(record: dict[str, Any]) -> bool:
    """Tell whether ``record`` carries a usable primary amount.

    ``amount_1`` is treated as the row's primary amount. The row is
    rejected when ``amount_1`` is missing/``None``, when it is a number
    equal to zero, or when it is blank text. Everything else is kept:
    positive or negative numbers, and non-empty unparsed text.

    Any ``numbers.Number`` counts as a number, so a ``Decimal("0.00")``
    or ``Fraction(0)`` is recognised as zero instead of being mistaken
    for unparsed text and kept.
    """
    from numbers import Number

    amount_1 = record.get("amount_1")
    if amount_1 is None:
        return False
    if isinstance(amount_1, Number):
        return amount_1 != 0
    # Unparsed string — keep so the user can verify in the editor.
    return bool(str(amount_1).strip())
+ assert result[0][2] == "01/13/2026" + + def test_multiple_dates_returned_in_position_order(self): + """Chase-style transaction with both posting and txn dates.""" + row = [ + _w("01/13", 0, 0), _w("01/14", 50, 0), + _w("Coffee", 100, 0), _w("$4.50", 200, 0), + ] + result = _find_dates_in_words(row) + assert len(result) == 2 + assert result[0][2] == "01/13" + assert result[1][2] == "01/14" + # First date claims word 0, second claims word 1 + assert result[0][:2] == (0, 1) + assert result[1][:2] == (1, 2) def test_no_date(self): row = [_w("Just", 0, 0), _w("text", 50, 0)] diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index f648871..eae937f 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -144,6 +144,107 @@ class TestScanPdfForTransactions: # --------------------------------------------------------------------------- +class TestMultiDateRow: + """Some statements (Chase, BofA) show both a transaction date + and a posting date per row. 
class TestZeroAmountRowsAreDropped:
    """Zero-amount rows are statement noise, not transactions.

    Lines such as "INTEREST EARNED 0.00" or "PAGE TOTAL 0.00" carry a
    date and an amount, yet the scanner must filter them out.
    """

    @staticmethod
    def _scan_fake_page(word_specs, page_height):
        """Run the scanner against one synthetic page and return rows.

        ``word_specs`` holds ``(x0, top, x1, bottom, text)`` tuples.
        ``extract_pages_auto`` is replaced by a stub for the duration
        of the call and is always restored afterwards.
        """
        from src import pdf_extract as mod
        from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions

        def stub(_b, *, allow_ocr=True):
            boxes = [
                WordBox(x0=x0, top=top, x1=x1, bottom=bottom, text=text)
                for x0, top, x1, bottom, text in word_specs
            ]
            page = Page(
                page_no=1, width=300, height=page_height, text="",
                words=boxes,
            )
            return [page], []

        saved = mod.extract_pages_auto
        mod.extract_pages_auto = stub
        try:
            found, _warnings = scan_pdf_for_transactions(b"")
        finally:
            mod.extract_pages_auto = saved
        return found

    def test_zero_amount_row_dropped(self):
        rows = self._scan_fake_page(
            [
                # Real transaction
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Coffee"),
                (200, 0, 240, 10, "$4.50"),
                # Zero-amount noise row (should be dropped)
                (0, 20, 80, 30, "01/14/2026"),
                (100, 20, 180, 30, "INTEREST"),
                (200, 20, 240, 30, "0.00"),
            ],
            page_height=40,
        )

        assert len(rows) == 1
        assert rows[0]["amount_1"] == 4.50
        assert "INTEREST" not in rows[0]["description"]

    def test_negative_amount_kept(self):
        rows = self._scan_fake_page(
            [
                (0, 0, 80, 10, "01/13/2026"),
                (100, 0, 160, 10, "Withdraw"),
                (200, 0, 240, 10, "(40.00)"),
            ],
            page_height=20,
        )

        # -40 is not zero — keep it
        assert len(rows) == 1
        assert rows[0]["amount_1"] == -40.00