fix(pdf): require non-empty description; tighten multi-line merge
User reported "Daily Ledger Balances" entries leaking into output. Three correlated bugs in the row qualifier: **1. Empty description is now disqualifying.** A row like ``01/13/2025 $1,000.00`` has a date and an amount but no text between them — that's a daily-balance entry, a period-summary, or page furniture. Drop these. New filter sits after ``_description_from_row`` returns: if the description string is empty (or whitespace-only), continue past the row. **2. ``prev`` resets per page.** The state that drives multi- line description merging (the "previous transaction this continuation might attach to") used to persist across page boundaries. A no-date no-amount line at the top of page 2 could silently attach to the last transaction on page 1. Fixed by moving the ``prev`` / ``prev_y_bottom`` declarations into the outer page loop so each page starts clean. **3. Multi-line merges now check y-distance.** Before this fix, ANY no-date no-amount line attached to the previous transaction's description. A "Daily Ledger Balances" section header several rows below the last transaction would silently fold into it. Now the merge only happens when the gap ``current_top - prev_y_bottom <= 25.0`` PDF points — generous enough for one blank-line gap between wrapped descriptions, tight enough to reject section headers across paragraph breaks. The threshold is a module constant (``_MULTILINE_MERGE_MAX_GAP``) for future tuning if real statements call for it. Three new test classes: - ``TestRequiresDescription.test_empty_description_row_dropped`` — date+amount-no-text row filtered, real transaction kept. - ``TestPrevTransactionResetsPerPage.test_no_cross_page_merge`` — page-1 transaction + page-2 section header = no merge. - ``TestMultilineMergeYGap`` — close continuation merges (10-pt gap), far section header doesn't (100-pt gap). The original ``TestMultilineDescription.test_continuation_line_merges`` still passes — its setup has a 10-pt gap which is within the new threshold. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -778,9 +778,21 @@ def scan_pdf_for_transactions(
|
||||
metadata = extract_statement_metadata(pages)
|
||||
|
||||
out_rows: list[dict[str, Any]] = []
|
||||
prev: dict[str, Any] | None = None
|
||||
# Maximum y-gap (in PDF points) between a transaction and a
|
||||
# following no-date-no-amount line for that line to count as a
|
||||
# continuation of the description. Typical line baselines sit
|
||||
# ~10–14 pts apart; 25 pts allows for one blank line but
|
||||
# rejects section headers that are several rows away.
|
||||
_MULTILINE_MERGE_MAX_GAP = 25.0
|
||||
|
||||
for page in pages:
|
||||
# ``prev`` and ``prev_y_bottom`` reset per page so a section
|
||||
# header at the top of page 2 can't attach to the last
|
||||
# transaction on page 1 — PDF y-coordinates restart at the
|
||||
# top of each page so the y-distance check is meaningless
|
||||
# across page boundaries.
|
||||
prev: dict[str, Any] | None = None
|
||||
prev_y_bottom: float | None = None
|
||||
rows = cluster_rows(page.words, y_tolerance=y_tolerance)
|
||||
for row_words in rows:
|
||||
line = " ".join(w.text for w in row_words).strip()
|
||||
@@ -791,18 +803,29 @@ def scan_pdf_for_transactions(
|
||||
amount_tokens = _find_amount_tokens(row_words)
|
||||
|
||||
if not dates or not amount_tokens:
|
||||
# Continuation candidate — a line on a transaction
|
||||
# page that has neither a date nor an amount of its
|
||||
# own. Attach to the previous row's description.
|
||||
# Continuation candidate — a line with no date AND
|
||||
# no amount of its own. Only attach to the previous
|
||||
# transaction if (a) we have one, (b) it's on this
|
||||
# same page, and (c) the y-gap to it is small enough
|
||||
# that a human would read this as a wrapped line
|
||||
# rather than a separate paragraph or section header.
|
||||
if (
|
||||
merge_multiline_descriptions
|
||||
and prev is not None
|
||||
and not dates
|
||||
and not amount_tokens
|
||||
and row_words
|
||||
):
|
||||
prev["description"] = (
|
||||
(prev["description"] + " " + line).strip()
|
||||
)
|
||||
current_top = min(w.top for w in row_words)
|
||||
if (
|
||||
prev_y_bottom is not None
|
||||
and (current_top - prev_y_bottom)
|
||||
<= _MULTILINE_MERGE_MAX_GAP
|
||||
):
|
||||
prev["description"] = (
|
||||
(prev["description"] + " " + line).strip()
|
||||
)
|
||||
prev_y_bottom = max(w.bottom for w in row_words)
|
||||
continue
|
||||
|
||||
# First date wins for the "date" column; ALL dates are
|
||||
@@ -816,6 +839,14 @@ def scan_pdf_for_transactions(
|
||||
row_words, date_ranges, amount_idxs,
|
||||
)
|
||||
|
||||
# Every real transaction must have a description. Rows
|
||||
# like "01/13/2025 $1,000.00" (Daily Ledger Balances
|
||||
# section, page totals, period summaries) carry a date
|
||||
# and an amount but no text in between — they're
|
||||
# statement furniture, not transactions. Drop them.
|
||||
if not desc.strip():
|
||||
continue
|
||||
|
||||
iso = parse_date(first_date_text, date_formats)
|
||||
if iso is None:
|
||||
# Short date — try to bind to the statement period
|
||||
@@ -867,6 +898,9 @@ def scan_pdf_for_transactions(
|
||||
|
||||
out_rows.append(record)
|
||||
prev = record
|
||||
prev_y_bottom = (
|
||||
max(w.bottom for w in row_words) if row_words else None
|
||||
)
|
||||
|
||||
return out_rows, warnings
|
||||
|
||||
|
||||
@@ -341,6 +341,153 @@ class TestZeroAmountRowsAreDropped:
|
||||
assert rows[0]["amount_1"] == -40.00
|
||||
|
||||
|
||||
class TestRequiresDescription:
|
||||
"""Every kept row must have non-empty description. Filters out
|
||||
"Daily Ledger Balances" entries (date + amount with no
|
||||
description) and similar statement-furniture rows."""
|
||||
|
||||
def test_empty_description_row_dropped(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
# Real transaction
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
# Daily-balance row: date + amount, NO description
|
||||
WordBox(x0=0, top=200, x1=80, bottom=210, text="01/14/2026"),
|
||||
WordBox(x0=200, top=200, x1=280, bottom=210, text="$1,000.00"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=300, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
assert "Coffee" in rows[0]["description"]
|
||||
|
||||
|
||||
class TestPrevTransactionResetsPerPage:
|
||||
"""A no-date no-amount line at the top of page 2 must NOT
|
||||
attach to the last transaction of page 1. Different pages have
|
||||
independent y-coordinate origins so the gap check would be
|
||||
meaningless across pages."""
|
||||
|
||||
def test_no_cross_page_merge(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
p1 = Page(
|
||||
page_no=1, width=300, height=300, text="",
|
||||
words=[
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
],
|
||||
)
|
||||
p2 = Page(
|
||||
page_no=2, width=300, height=300, text="",
|
||||
# First content on page 2 is a section header with
|
||||
# no date and no amount — should NOT attach to the
|
||||
# Coffee row from page 1.
|
||||
words=[
|
||||
WordBox(x0=0, top=0, x1=200, bottom=10, text="Daily"),
|
||||
WordBox(x0=50, top=0, x1=160, bottom=10, text="Ledger"),
|
||||
WordBox(x0=120, top=0, x1=240, bottom=10, text="Balances"),
|
||||
],
|
||||
)
|
||||
return [p1, p2], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["description"] == "Coffee"
|
||||
assert "Ledger" not in rows[0]["description"]
|
||||
|
||||
|
||||
class TestMultilineMergeYGap:
|
||||
"""Wrapped-description continuations are close to the previous
|
||||
transaction (~12 pts gap). Section headers further down the
|
||||
page must NOT be silently merged in."""
|
||||
|
||||
def test_close_continuation_merges(self):
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
# Transaction at y=0..10
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
# Continuation at y=20..30 (gap = 10 pts) — should merge
|
||||
WordBox(x0=100, top=20, x1=200, bottom=30, text="Vendor"),
|
||||
WordBox(x0=210, top=20, x1=260, bottom=30, text="memo"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=50, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
assert "Vendor memo" in rows[0]["description"]
|
||||
|
||||
def test_far_section_header_does_not_merge(self):
|
||||
"""Same setup but the second line is far below — would be
|
||||
a different paragraph in the source PDF."""
|
||||
from src import pdf_extract as mod
|
||||
from src.pdf_extract import Page, WordBox, scan_pdf_for_transactions
|
||||
|
||||
original = mod.extract_pages_auto
|
||||
|
||||
def fake(_b, *, allow_ocr=True):
|
||||
words = [
|
||||
WordBox(x0=0, top=0, x1=80, bottom=10, text="01/13/2026"),
|
||||
WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"),
|
||||
WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"),
|
||||
# 100 pts away — well past the 25-pt merge ceiling
|
||||
WordBox(x0=0, top=110, x1=200, bottom=120, text="Daily"),
|
||||
WordBox(x0=80, top=110, x1=200, bottom=120, text="Ledger"),
|
||||
WordBox(x0=180, top=110, x1=300, bottom=120, text="Balances"),
|
||||
]
|
||||
return [Page(
|
||||
page_no=1, width=300, height=200, text="", words=words,
|
||||
)], []
|
||||
|
||||
mod.extract_pages_auto = fake
|
||||
try:
|
||||
rows, _ = scan_pdf_for_transactions(b"")
|
||||
finally:
|
||||
mod.extract_pages_auto = original
|
||||
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["description"] == "Coffee"
|
||||
assert "Ledger" not in rows[0]["description"]
|
||||
|
||||
|
||||
class TestMultilineDescription:
|
||||
def test_continuation_line_merges(self):
|
||||
"""A line with no date and no amount, sitting between two
|
||||
|
||||
Reference in New Issue
Block a user