diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 31cac7b..1989043 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -778,9 +778,21 @@ def scan_pdf_for_transactions( metadata = extract_statement_metadata(pages) out_rows: list[dict[str, Any]] = [] - prev: dict[str, Any] | None = None + # Maximum y-gap (in PDF points) between a transaction and a + # following no-date-no-amount line for that line to count as a + # continuation of the description. Typical line baselines sit + # ~10–14 pts apart; 25 pts allows for one blank line but + # rejects section headers that are several rows away. + _MULTILINE_MERGE_MAX_GAP = 25.0 for page in pages: + # ``prev`` and ``prev_y_bottom`` reset per page so a section + # header at the top of page 2 can't attach to the last + # transaction on page 1 — PDF y-coordinates restart at the + # top of each page so the y-distance check is meaningless + # across page boundaries. + prev: dict[str, Any] | None = None + prev_y_bottom: float | None = None rows = cluster_rows(page.words, y_tolerance=y_tolerance) for row_words in rows: line = " ".join(w.text for w in row_words).strip() @@ -791,18 +803,29 @@ def scan_pdf_for_transactions( amount_tokens = _find_amount_tokens(row_words) if not dates or not amount_tokens: - # Continuation candidate — a line on a transaction - # page that has neither a date nor an amount of its - # own. Attach to the previous row's description. + # Continuation candidate — a line with no date AND + # no amount of its own. Only attach to the previous + # transaction if (a) we have one, (b) it's on this + # same page, and (c) the y-gap to it is small enough + # that a human would read this as a wrapped line + # rather than a separate paragraph or section header. if ( merge_multiline_descriptions and prev is not None and not dates and not amount_tokens + and row_words ): - prev["description"] = ( - (prev["description"] + " " + line).strip() - ) + current_top = min(w.top for w in row_words) + if ( + prev_y_bottom is not None + and (current_top - prev_y_bottom) + <= _MULTILINE_MERGE_MAX_GAP + ): + prev["description"] = ( + (prev["description"] + " " + line).strip() + ) + prev_y_bottom = max(w.bottom for w in row_words) continue # First date wins for the "date" column; ALL dates are @@ -816,6 +839,14 @@ def scan_pdf_for_transactions( row_words, date_ranges, amount_idxs, ) + # Every real transaction must have a description. Rows + # like "01/13/2025 $1,000.00" (Daily Ledger Balances + # section, page totals, period summaries) carry a date + # and an amount but no text in between — they're + # statement furniture, not transactions. Drop them. + if not desc.strip(): + continue + iso = parse_date(first_date_text, date_formats) if iso is None: # Short date — try to bind to the statement period @@ -867,6 +898,9 @@ def scan_pdf_for_transactions( out_rows.append(record) prev = record + prev_y_bottom = ( + max(w.bottom for w in row_words) if row_words else None + ) return out_rows, warnings diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index 79a0d13..2baca94 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -341,6 +341,153 @@ class TestZeroAmountRowsAreDropped: assert rows[0]["amount_1"] == -40.00 +class TestRequiresDescription: + """Every kept row must have non-empty description. Filters out + "Daily Ledger Balances" entries (date + amount with no + description) and similar statement-furniture rows.""" + + def test_empty_description_row_dropped(self): + from src import pdf_extract as mod + from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions + + original = mod.extract_pages_auto + + def fake(_b, *, allow_ocr=True): + words = [ + # Real transaction + WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"), + WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"), + WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"), + # Daily-balance row: date + amount, NO description + WordBox(x0=0, top=200, x1=80, bottom=210, text="01/14/2026"), + WordBox(x0=200, top=200, x1=280, bottom=210, text="$1,000.00"), + ] + return [Page( + page_no=1, width=300, height=300, text="", words=words, + )], [] + + mod.extract_pages_auto = fake + try: + rows, _ = scan_pdf_for_transactions(b"") + finally: + mod.extract_pages_auto = original + + assert len(rows) == 1 + assert "Coffee" in rows[0]["description"] + + +class TestPrevTransactionResetsPerPage: + """A no-date no-amount line at the top of page 2 must NOT + attach to the last transaction of page 1. Different pages have + independent y-coordinate origins so the gap check would be + meaningless across pages.""" + + def test_no_cross_page_merge(self): + from src import pdf_extract as mod + from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions + + original = mod.extract_pages_auto + + def fake(_b, *, allow_ocr=True): + p1 = Page( + page_no=1, width=300, height=300, text="", + words=[ + WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"), + WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"), + WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"), + ], + ) + p2 = Page( + page_no=2, width=300, height=300, text="", + # First content on page 2 is a section header with + # no date and no amount — should NOT attach to the + # Coffee row from page 1. + words=[ + WordBox(x0=0, top=0, x1=200, bottom=10, text="Daily"), + WordBox(x0=50, top=0, x1=160, bottom=10, text="Ledger"), + WordBox(x0=120, top=0, x1=240, bottom=10, text="Balances"), + ], + ) + return [p1, p2], [] + + mod.extract_pages_auto = fake + try: + rows, _ = scan_pdf_for_transactions(b"") + finally: + mod.extract_pages_auto = original + + assert len(rows) == 1 + assert rows[0]["description"] == "Coffee" + assert "Ledger" not in rows[0]["description"] + + +class TestMultilineMergeYGap: + """Wrapped-description continuations are close to the previous + transaction (~12 pts gap). Section headers further down the + page must NOT be silently merged in.""" + + def test_close_continuation_merges(self): + from src import pdf_extract as mod + from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions + + original = mod.extract_pages_auto + + def fake(_b, *, allow_ocr=True): + words = [ + # Transaction at y=0..10 + WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"), + WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"), + WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"), + # Continuation at y=20..30 (gap = 10 pts) — should merge + WordBox(x0=100, top=20, x1=200, bottom=30, text="Vendor"), + WordBox(x0=210, top=20, x1=260, bottom=30, text="memo"), + ] + return [Page( + page_no=1, width=300, height=50, text="", words=words, + )], [] + + mod.extract_pages_auto = fake + try: + rows, _ = scan_pdf_for_transactions(b"") + finally: + mod.extract_pages_auto = original + + assert len(rows) == 1 + assert "Vendor memo" in rows[0]["description"] + + def test_far_section_header_does_not_merge(self): + """Same setup but the second line is far below — would be + a different paragraph in the source PDF.""" + from src import pdf_extract as mod + from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions + + original = mod.extract_pages_auto + + def fake(_b, *, allow_ocr=True): + words = [ + WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"), + WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"), + WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"), + # 100 pts away — well past the 25-pt merge ceiling + WordBox(x0=0, top=110, x1=200, bottom=120, text="Daily"), + WordBox(x0=80, top=110, x1=200, bottom=120, text="Ledger"), + WordBox(x0=180, top=110, x1=300, bottom=120, text="Balances"), + ] + return [Page( + page_no=1, width=300, height=200, text="", words=words, + )], [] + + mod.extract_pages_auto = fake + try: + rows, _ = scan_pdf_for_transactions(b"") + finally: + mod.extract_pages_auto = original + + assert len(rows) == 1 + assert rows[0]["description"] == "Coffee" + assert "Ledger" not in rows[0]["description"] + + class TestMultilineDescription: def test_continuation_line_merges(self): """A line with no date and no amount, sitting between two