fix(pdf): require non-empty description; tighten multi-line merge

User reported "Daily Ledger Balances" entries leaking into
output. Three correlated bugs in the row qualifier:

**1. Empty description is now disqualifying.** A row like
``01/13/2025  $1,000.00`` has a date and an amount but no text
between them — that's a daily-balance entry, a period-summary,
or page furniture. Drop these. New filter sits after
``_description_from_row`` returns: if the description string is
empty (or whitespace-only), continue past the row.

**2. ``prev`` resets per page.** The state that drives multi-
line description merging (the "previous transaction this
continuation might attach to") used to persist across page
boundaries. A no-date no-amount line at the top of page 2
could silently attach to the last transaction on page 1. Fixed
by moving the ``prev`` / ``prev_y_bottom`` declarations into
the outer page loop so each page starts clean.

**3. Multi-line merges now check y-distance.** Before this fix,
ANY no-date no-amount line attached to the previous
transaction's description. A "Daily Ledger Balances" section
header several rows below the last transaction would silently
fold into it. Now the merge only happens when the gap
``current_top - prev_y_bottom <= 25.0`` PDF points — generous
enough for one blank-line gap between wrapped descriptions,
tight enough to reject section headers across paragraph
breaks. The threshold is a module constant
(``_MULTILINE_MERGE_MAX_GAP``) for future tuning if real
statements call for it.

Three new test classes:

- ``TestRequiresDescription.test_empty_description_row_dropped``
  — date+amount-no-text row filtered, real transaction kept.
- ``TestPrevTransactionResetsPerPage.test_no_cross_page_merge``
  — page-1 transaction + page-2 section header = no merge.
- ``TestMultilineMergeYGap`` — close continuation merges
  (10-pt gap), far section header doesn't (100-pt gap).

The original ``TestMultilineDescription.test_continuation_line_merges``
still passes — its setup has a 10-pt gap which is within the
new threshold.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 00:58:50 +00:00
parent a1824b8dc4
commit 6f2ad57490
2 changed files with 188 additions and 7 deletions

View File

@@ -341,6 +341,153 @@ class TestZeroAmountRowsAreDropped:
assert rows[0]["amount_1"] == -40.00
class TestRequiresDescription:
"""Every kept row must have non-empty description. Filters out
"Daily Ledger Balances" entries (date + amount with no
description) and similar statement-furniture rows."""
def test_empty_description_row_dropped(self):
from src import pdf_extract as mod
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
original = mod.extract_pages_auto
def fake(_b, *, allow_ocr=True):
words = [
# Real transaction
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
# Daily-balance row: date + amount, NO description
WordBox(x0=0, top=200, x1=80, bottom=210, text="01/14/2026"),
WordBox(x0=200, top=200, x1=280, bottom=210, text="$1,000.00"),
]
return [Page(
page_no=1, width=300, height=300, text="", words=words,
)], []
mod.extract_pages_auto = fake
try:
rows, _ = scan_pdf_for_transactions(b"")
finally:
mod.extract_pages_auto = original
assert len(rows) == 1
assert "Coffee" in rows[0]["description"]
class TestPrevTransactionResetsPerPage:
"""A no-date no-amount line at the top of page 2 must NOT
attach to the last transaction of page 1. Different pages have
independent y-coordinate origins so the gap check would be
meaningless across pages."""
def test_no_cross_page_merge(self):
from src import pdf_extract as mod
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
original = mod.extract_pages_auto
def fake(_b, *, allow_ocr=True):
p1 = Page(
page_no=1, width=300, height=300, text="",
words=[
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
],
)
p2 = Page(
page_no=2, width=300, height=300, text="",
# First content on page 2 is a section header with
# no date and no amount — should NOT attach to the
# Coffee row from page 1.
words=[
WordBox(x0=0, top=0, x1=200, bottom=10, text="Daily"),
WordBox(x0=50, top=0, x1=160, bottom=10, text="Ledger"),
WordBox(x0=120, top=0, x1=240, bottom=10, text="Balances"),
],
)
return [p1, p2], []
mod.extract_pages_auto = fake
try:
rows, _ = scan_pdf_for_transactions(b"")
finally:
mod.extract_pages_auto = original
assert len(rows) == 1
assert rows[0]["description"] == "Coffee"
assert "Ledger" not in rows[0]["description"]
class TestMultilineMergeYGap:
"""Wrapped-description continuations are close to the previous
transaction (~12 pts gap). Section headers further down the
page must NOT be silently merged in."""
def test_close_continuation_merges(self):
from src import pdf_extract as mod
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
original = mod.extract_pages_auto
def fake(_b, *, allow_ocr=True):
words = [
# Transaction at y=0..10
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
# Continuation at y=20..30 (gap = 10 pts) — should merge
WordBox(x0=100, top=20, x1=200, bottom=30, text="Vendor"),
WordBox(x0=210, top=20, x1=260, bottom=30, text="memo"),
]
return [Page(
page_no=1, width=300, height=50, text="", words=words,
)], []
mod.extract_pages_auto = fake
try:
rows, _ = scan_pdf_for_transactions(b"")
finally:
mod.extract_pages_auto = original
assert len(rows) == 1
assert "Vendor memo" in rows[0]["description"]
def test_far_section_header_does_not_merge(self):
"""Same setup but the second line is far below — would be
a different paragraph in the source PDF."""
from src import pdf_extract as mod
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
original = mod.extract_pages_auto
def fake(_b, *, allow_ocr=True):
words = [
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
# 100 pts away — well past the 25-pt merge ceiling
WordBox(x0=0, top=110, x1=200, bottom=120, text="Daily"),
WordBox(x0=80, top=110, x1=200, bottom=120, text="Ledger"),
WordBox(x0=180, top=110, x1=300, bottom=120, text="Balances"),
]
return [Page(
page_no=1, width=300, height=200, text="", words=words,
)], []
mod.extract_pages_auto = fake
try:
rows, _ = scan_pdf_for_transactions(b"")
finally:
mod.extract_pages_auto = original
assert len(rows) == 1
assert rows[0]["description"] == "Coffee"
assert "Ledger" not in rows[0]["description"]
class TestMultilineDescription:
def test_continuation_line_merges(self):
"""A line with no date and no amount, sitting between two