feat(pdf): row-heuristic extraction (mode dispatch, no coordinates)
A user reported that the column-visual approach is too brittle for real bank statements: column x-positions saved against a sample page don't survive layout drift between months (statement A has columns at x=300, statement B drifted to x=320), so a saved template realistically works only for one statement's specific render. The fundamental fix is to stop depending on saved coordinates at all.

**Row-heuristic mode** finds transaction rows by pattern: any line with a date token plus N amount tokens IS a transaction. Date patterns (US slash / EU slash / ISO / "Jan 15, 2026" / etc.) and amount patterns (currency, parens-negative, thousands grouping) are matched against word text — no templated x-positions are involved.

The full pipeline:

1. ``find_transaction_rows`` clusters words into rows and scans each line for date + amount tokens.
2. Multi-line descriptions still attach to the previous row via the no-date-no-amount continuation rule.
3. Amount shapes drive interpretation: ``single`` / ``txn_balance`` / ``debit_credit`` / ``debit_credit_balance``.
4. ``_infer_amount_column_centers`` clusters amount x-midpoints ACROSS ALL detected rows to find natural column groupings — so debit-vs-credit assignment for single-amount lines works without the user marking anything on screen.

``apply_template`` is now a dispatch over ``template["mode"]``:

- ``mode="row_heuristic"`` (default for new templates) — the new pipeline.
- ``mode="column_visual"`` — the existing pipeline, kept under ``_apply_template_column_visual`` for v1 templates and the Advanced fallback. Templates without a ``mode`` key also take this path.

18 new tests cover: date detection (US slash, two-digit year, ISO, month-name, missing); amount-token finding (currency, parens, pure text, bare-year rejection); column-center inference (clear two-column case, empty input); end-to-end runs on synthetic Page objects for the ``single``, ``txn_balance`` and ``debit_credit_balance`` shapes (``debit_credit`` has no end-to-end test yet); and the critical layout-drift test, which proves the same template works on pages of different sizes / different absolute x-positions.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -417,16 +417,432 @@ def _coerce_amount_columns(
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Row-heuristic extraction (mode = "row_heuristic", default for new templates)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
_DATE_RES = [
|
||||||
|
re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"),
|
||||||
|
re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"),
|
||||||
|
re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
|
||||||
|
re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2},?\s+\d{2,4})\b"),
|
||||||
|
re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"),
|
||||||
|
# Short month-day (e.g. "Jan 15") — sometimes used when year is
|
||||||
|
# implied by the statement period. Lower-priority match.
|
||||||
|
re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2})\b"),
|
||||||
|
]
|
||||||
|
|
||||||
|
# Amount tokens: optional $/€/£, optional leading -, optional parens,
|
||||||
|
# 1-3 digits before grouping with comma-thousand groups, optional
|
||||||
|
# decimal portion. Trailing minus also captured.
|
||||||
|
_AMOUNT_RE = re.compile(
|
||||||
|
r"(?<![\w.])" # no preceding letter/dot
|
||||||
|
r"(\(?-?[\$€£]?-?\d{1,3}(?:,\d{3})*(?:\.\d{1,4})?\)?-?)"
|
||||||
|
r"(?![\w.])"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_amount(token: str) -> bool:
|
||||||
|
"""Reject tokens that match the amount regex but are obviously
|
||||||
|
not money — e.g. a bare year or a page number. Real amounts
|
||||||
|
have at least one of: currency symbol, decimal point, parens,
|
||||||
|
minus sign, or a thousand separator."""
|
||||||
|
if not token:
|
||||||
|
return False
|
||||||
|
return bool(re.search(r"[\$€£.,()\-]", token))
|
||||||
|
|
||||||
|
|
||||||
|
def _find_dates_in_words(
    row_words: list[WordBox],
) -> list[tuple[int, str]]:
    """Find the FIRST date-like substring on this row.

    Returns ``[(word_index, date_text)]`` or an empty list.
    ``word_index`` is the index of the word in which the date STARTS,
    so the caller can exclude that word from the description.

    Each candidate start word is stitched together with up to two
    following words, because formats like "Jan 15, 2026" span three
    word tokens. A regex hit is only accepted when it begins inside the
    candidate start word itself; a hit that begins in a later word of
    the window is left for the iteration that starts at that word.
    Without this check a row such as ``Check 01/15/2026 $4.50`` would
    report index 0 ("Check") as the date word.

    NOTE: only the starting word index is reported; for multi-word
    dates the remaining date words are not identified here.
    """
    for i, first_word in enumerate(row_words):
        head_len = len(first_word.text)
        for window in (3, 2, 1):
            chunk = " ".join(x.text for x in row_words[i : i + window])
            for rx in _DATE_RES:
                m = rx.search(chunk)
                # The chunk starts with the first word's text, so a
                # match offset below its length lies inside that word.
                if m and m.start() < head_len:
                    return [(i, m.group(1))]
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def _find_amount_tokens(
    row_words: list[WordBox],
) -> list[tuple[int, WordBox, str]]:
    """Find amount-shaped tokens on this row, keeping their position.

    Returns ``[(word_index, wordbox, normalized_text)]`` with at most
    one entry per word. The word_index lets the caller exclude these
    words from the description text; the wordbox preserves the
    x-position so amount columns can be clustered later without
    templated coordinates.

    A word can contain several regex hits when the PDF extractor merged
    neighbouring text. Every hit is checked in order and the first one
    that passes ``_looks_like_amount`` is used, so a rejected leading
    integer does not hide a real amount later in the same word.
    """
    out: list[tuple[int, WordBox, str]] = []
    for i, w in enumerate(row_words):
        for m in _AMOUNT_RE.finditer(w.text):
            token = m.group(1)
            if _looks_like_amount(token):
                out.append((i, w, token))
                break  # one amount per word
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _row_is_transaction(
    row_words: list[WordBox],
    *,
    min_amounts: int,
    max_amounts: int,
) -> bool:
    """Decide whether *row_words* forms a transaction line.

    A row qualifies when it contains a date and the number of amount
    tokens on it lies within ``[min_amounts, max_amounts]``. Rows
    without a date are rejected before the amounts are scanned.
    """
    has_date = bool(_find_dates_in_words(row_words))
    if not has_date:
        return False
    amount_count = len(_find_amount_tokens(row_words))
    return not (amount_count < min_amounts or amount_count > max_amounts)
|
||||||
|
|
||||||
|
|
||||||
|
def _description_from_row(
|
||||||
|
row_words: list[WordBox],
|
||||||
|
date_idx: int,
|
||||||
|
amount_idxs: set[int],
|
||||||
|
) -> str:
|
||||||
|
"""Stitch the row's description: everything between the date
|
||||||
|
word and the first amount token, plus anything after the last
|
||||||
|
amount that isn't itself an amount."""
|
||||||
|
keep: list[str] = []
|
||||||
|
seen_first_amount = False
|
||||||
|
last_amount_idx = max(amount_idxs) if amount_idxs else -1
|
||||||
|
for i, w in enumerate(row_words):
|
||||||
|
if i == date_idx:
|
||||||
|
continue
|
||||||
|
if i in amount_idxs:
|
||||||
|
seen_first_amount = True
|
||||||
|
continue
|
||||||
|
# After the last amount, trailing tokens are usually a
|
||||||
|
# check number or memo — keep them too.
|
||||||
|
if seen_first_amount and i < last_amount_idx:
|
||||||
|
continue
|
||||||
|
keep.append(w.text)
|
||||||
|
return " ".join(keep).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _signed_amount_fields(
    debit_val: float | None,
    credit_val: float | None,
) -> dict[str, Any]:
    """Fold separate debit/credit values into ``amount`` and ``type``.

    Credits add, debits subtract. Missing (``None``) or zero values are
    ignored. ``type`` is ``"credit"`` / ``"debit"`` by the sign of the
    result, or ``""`` when the result is zero.
    """
    net = 0.0
    if credit_val:
        net += credit_val
    if debit_val:
        net -= debit_val
    if net > 0:
        kind = "credit"
    elif net < 0:
        kind = "debit"
    else:
        kind = ""
    return {"amount": net, "type": kind}


def _assign_amounts_by_shape(
    amount_tokens: list[tuple[int, WordBox, str]],
    shape: str,
    parse_opts: dict[str, Any],
    column_centers: list[float] | None = None,
) -> dict[str, Any]:
    """Map raw amount tokens to typed CSV fields per the shape.

    *amount_tokens* is the output of ``_find_amount_tokens`` in row
    order; "leftmost"/"rightmost" below mean first/last in that list.
    An empty token list yields an empty dict.

    Shapes:
      ``single`` → first amount is ``amount`` (sign in value)
      ``txn_balance`` → leftmost is ``amount``, rightmost is
        ``balance`` (only when at least two tokens are present)
      ``debit_credit`` → if one token and at least two
        ``column_centers`` are known, assign to debit or credit by
        x-position; otherwise leftmost is debit, next is credit.
        Combined into signed ``amount`` plus ``type``.
      ``debit_credit_balance`` → with three tokens: debit, credit,
        balance in order. With fewer than three tokens and at least
        three ``column_centers``, each token goes to the column whose
        center is nearest its x-midpoint, so a lone balance is not
        mistaken for a debit. Without usable centers, tokens fill
        debit, credit, balance in order.
      anything else → treated like ``single``.
    """
    out: dict[str, Any] = {}
    if not amount_tokens:
        return out
    txt = [t[2] for t in amount_tokens]
    boxes = [t[1] for t in amount_tokens]

    if shape == "single":
        out["amount"] = parse_amount(txt[0], parse_opts)

    elif shape == "txn_balance":
        out["amount"] = parse_amount(txt[0], parse_opts)
        if len(txt) >= 2:
            out["balance"] = parse_amount(txt[-1], parse_opts)

    elif shape == "debit_credit":
        debit_val: float | None = None
        credit_val: float | None = None
        if len(txt) == 1 and column_centers and len(column_centers) >= 2:
            # Decide debit vs credit by which column center the token's
            # midpoint is closest to; ties go to the debit column.
            mid = (boxes[0].x0 + boxes[0].x1) / 2
            if abs(mid - column_centers[0]) <= abs(mid - column_centers[1]):
                debit_val = parse_amount(txt[0], parse_opts)
            else:
                credit_val = parse_amount(txt[0], parse_opts)
        else:
            # Positional fallback: leftmost = debit, next = credit.
            debit_val = parse_amount(txt[0], parse_opts)
            if len(txt) >= 2:
                credit_val = parse_amount(txt[1], parse_opts)
        out.update(_signed_amount_fields(debit_val, credit_val))

    elif shape == "debit_credit_balance":
        debit_val = None
        credit_val = None
        if len(txt) < 3 and column_centers and len(column_centers) >= 3:
            # Fewer tokens than columns: place each token by x-position.
            # slots[c] holds the token index assigned to column c
            # (0 = debit, 1 = credit, 2 = balance).
            # NOTE: if two tokens land on the same column the later one
            # replaces the earlier one.
            slots: list[int | None] = [None, None, None]
            for k, box in enumerate(boxes):
                mid = (box.x0 + box.x1) / 2
                dists = [abs(mid - c) for c in column_centers[:3]]
                slots[dists.index(min(dists))] = k
            if slots[0] is not None:
                debit_val = parse_amount(txt[slots[0]], parse_opts)
            if slots[1] is not None:
                credit_val = parse_amount(txt[slots[1]], parse_opts)
            if slots[2] is not None:
                out["balance"] = parse_amount(txt[slots[2]], parse_opts)
        else:
            debit_val = parse_amount(txt[0], parse_opts)
            if len(txt) >= 2:
                credit_val = parse_amount(txt[1], parse_opts)
            if len(txt) >= 3:
                out["balance"] = parse_amount(txt[2], parse_opts)
        out.update(_signed_amount_fields(debit_val, credit_val))

    else:
        # Unknown shape — fall back to the simplest interpretation.
        out["amount"] = parse_amount(txt[0], parse_opts)
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_amount_column_centers(
|
||||||
|
rows: list[list[WordBox]],
|
||||||
|
*,
|
||||||
|
expected: int,
|
||||||
|
min_amounts: int,
|
||||||
|
max_amounts: int,
|
||||||
|
) -> list[float]:
|
||||||
|
"""Cluster amount-token x-midpoints across all transaction rows
|
||||||
|
to find natural column centers. Returns up to *expected* centers
|
||||||
|
sorted left-to-right.
|
||||||
|
|
||||||
|
Avoids re-introducing user-drawn coordinates: the columns are
|
||||||
|
inferred from the data itself. We can't run k-means without
|
||||||
|
scikit-learn, so use a simple sorted-midpoints + greedy bucket
|
||||||
|
by proximity tolerance approach.
|
||||||
|
"""
|
||||||
|
midpoints: list[float] = []
|
||||||
|
for row_words in rows:
|
||||||
|
if not _row_is_transaction(
|
||||||
|
row_words, min_amounts=min_amounts, max_amounts=max_amounts,
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
for _, w, _ in _find_amount_tokens(row_words):
|
||||||
|
midpoints.append((w.x0 + w.x1) / 2)
|
||||||
|
if not midpoints:
|
||||||
|
return []
|
||||||
|
midpoints.sort()
|
||||||
|
# Bucket by adjacency: any gap > 30pt starts a new bucket.
|
||||||
|
# 30pt ≈ 4x the typical inter-column spacing on bank statements.
|
||||||
|
buckets: list[list[float]] = [[midpoints[0]]]
|
||||||
|
for m in midpoints[1:]:
|
||||||
|
if m - buckets[-1][-1] <= 30:
|
||||||
|
buckets[-1].append(m)
|
||||||
|
else:
|
||||||
|
buckets.append([m])
|
||||||
|
centers = [sum(b) / len(b) for b in buckets]
|
||||||
|
if len(centers) <= expected:
|
||||||
|
return centers
|
||||||
|
# More buckets than expected — keep the *expected* most-populated.
|
||||||
|
by_pop = sorted(
|
||||||
|
zip(centers, (len(b) for b in buckets)),
|
||||||
|
key=lambda x: x[1],
|
||||||
|
reverse=True,
|
||||||
|
)[:expected]
|
||||||
|
return sorted(c for c, _ in by_pop)
|
||||||
|
|
||||||
|
|
||||||
|
def find_transaction_rows(
    pages: list[Page],
    template: dict[str, Any],
) -> list[dict[str, Any]]:
    """Heuristic row detector. Returns a list of preview records
    suitable for rendering in the build-mode preview table.

    Each record carries ``date``, ``description``, ``_page`` and
    ``_raw_line`` plus whatever amount fields the configured shape
    produces; the GUI surfaces these so the user can confirm or tune
    the template before extraction commits to disk.

    Template sections read here (all optional): ``pages`` (``range``,
    ``skip_matching``), ``row_detection`` (``min_amounts_per_row``,
    ``max_amounts_per_row``, ``skip_rows_matching``, ``y_tolerance``,
    ``merge_multiline_description``), ``amounts`` (``shape`` and the
    amount parse options) and ``date`` (``format``,
    ``formats_fallback``).

    Continuation rule: a line with no date and no amount is appended to
    the description of the transaction it directly follows. The chain
    is broken by a new page, by a row matching ``skip_rows_matching``
    and by any other non-transaction row that carries a date or an
    amount, so text after such rows is not glued onto an earlier
    transaction.
    """
    rd = template.get("row_detection", {}) or {}
    amt_cfg = template.get("amounts", {}) or {}
    date_cfg = template.get("date", {}) or {}
    pages_cfg = template.get("pages", {}) or {}

    pages_used = _pages_in_range(pages, pages_cfg.get("range", "all"))
    skip_pages_re = pages_cfg.get("skip_matching") or ""
    if skip_pages_re:
        skip_re = re.compile(skip_pages_re, re.IGNORECASE)
        pages_used = [p for p in pages_used if not skip_re.search(p.text)]

    min_amounts = int(rd.get("min_amounts_per_row", 1))
    max_amounts = int(rd.get("max_amounts_per_row", 3))
    skip_row_res = [
        re.compile(p, re.IGNORECASE)
        for p in (rd.get("skip_rows_matching") or [])
    ]
    shape = amt_cfg.get("shape", "single")
    # Number of amount columns each shape implies; unknown shapes are
    # treated like ``single``.
    expected_amount_cols = {
        "single": 1,
        "txn_balance": 2,
        "debit_credit": 2,
        "debit_credit_balance": 3,
    }.get(shape, 1)

    parse_opts = {
        "decimal_separator": amt_cfg.get("decimal_separator", "."),
        "thousands_separator": amt_cfg.get("thousands_separator", ","),
        "currency_strip": amt_cfg.get("currency_strip", "$"),
        "negative_in_parens": amt_cfg.get("negative_in_parens", True),
    }
    # The primary format (if any) is tried before the fallbacks.
    date_formats: list[str] = list(date_cfg.get("formats_fallback") or [])
    if date_cfg.get("format"):
        date_formats = [date_cfg["format"]] + date_formats

    # First pass per page: gather rows so we can also infer amount
    # column centers across the whole document.
    y_tolerance = float(rd.get("y_tolerance", 3.0))
    all_rows: list[tuple[Page, list[list[WordBox]]]] = [
        (page, cluster_rows(page.words, y_tolerance=y_tolerance))
        for page in pages_used
    ]

    flat_rows = [r for _, rows in all_rows for r in rows]
    column_centers = _infer_amount_column_centers(
        flat_rows,
        expected=expected_amount_cols,
        min_amounts=min_amounts,
        max_amounts=max_amounts,
    )

    out: list[dict[str, Any]] = []
    merge_multi = bool(rd.get("merge_multiline_description", True))

    for page, rows in all_rows:
        # The record a continuation line may attach to. Reset per page:
        # the first lines of a page do not directly follow the last
        # transaction of the page before.
        prev: dict[str, Any] | None = None
        for row_words in rows:
            line = " ".join(w.text for w in row_words)
            if not line.strip():
                continue
            if any(rx.search(line) for rx in skip_row_res):
                # A skipped row breaks the continuation chain; lines
                # after it belong to the skipped row, not to ``prev``.
                prev = None
                continue

            dates = _find_dates_in_words(row_words)
            amount_tokens = _find_amount_tokens(row_words)

            is_txn = bool(dates) and (
                min_amounts <= len(amount_tokens) <= max_amounts
            )

            if not is_txn:
                if not amount_tokens and not dates:
                    # Possible multi-line description continuation —
                    # a no-date, no-amount line directly following a
                    # transaction.
                    if merge_multi and prev is not None:
                        prev["description"] = (
                            (prev.get("description") or "") + " " + line
                        ).strip()
                else:
                    # Has a date or amounts but is not a transaction
                    # (e.g. a totals line): ends the continuation chain.
                    prev = None
                continue

            date_idx, date_text = dates[0]
            amount_idxs = {idx for idx, _, _ in amount_tokens}
            desc = _description_from_row(row_words, date_idx, amount_idxs)

            record: dict[str, Any] = {
                # Fall back to the raw matched text when no configured
                # format parses it.
                "date": parse_date(date_text, date_formats) or date_text,
                "description": desc,
                "_page": page.page_no,
                "_raw_line": line,
            }
            record.update(_assign_amounts_by_shape(
                amount_tokens, shape, parse_opts, column_centers,
            ))
            out.append(record)
            prev = record

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def apply_template_row_heuristic(
    pages: list[Page],
    template: dict[str, Any],
) -> pd.DataFrame:
    """Run the row-heuristic pipeline and return its rows as a DataFrame.

    Mirrors ``apply_template``'s return type so callers don't care
    which mode produced the frame. An empty DataFrame is returned when
    no transaction rows are found. The internal ``_raw_line`` column is
    removed; remaining columns are ordered as: the well-known fields
    (date, description, amount, type, balance) that are present, then
    any other columns, then ``_page`` last.
    """
    records = find_transaction_rows(pages, template)
    if not records:
        return pd.DataFrame()

    frame = pd.DataFrame(records)
    # ``_raw_line`` exists for the preview UI only.
    if "_raw_line" in frame.columns:
        frame = frame.drop(columns=["_raw_line"])

    leading = [
        name
        for name in ("date", "description", "amount", "type", "balance")
        if name in frame.columns
    ]
    trailing = ["_page"] if "_page" in frame.columns else []
    middle = [
        name
        for name in frame.columns
        if name not in leading and name != "_page"
    ]
    return frame[leading + middle + trailing]
|
||||||
|
|
||||||
|
|
||||||
def apply_template(
    pages: list[Page],
    template: dict[str, Any],
) -> pd.DataFrame:
    """Dispatch by template mode and return the extracted DataFrame.

    ``mode="row_heuristic"`` (default for new templates): no
    coordinates needed — finds transaction lines by date+amount
    pattern matching. Robust to layout drift between statements.

    ``mode="column_visual"`` (legacy): uses x-position boundaries
    from the visual picker. Kept for templates saved before the
    row-heuristic shift.

    Templates without a mode key default to ``column_visual`` for
    backward compatibility with schema_version=1 templates. Any mode
    value other than ``"row_heuristic"`` also takes the legacy path.
    """
    wants_row_heuristic = template.get("mode", "column_visual") == "row_heuristic"
    runner = (
        apply_template_row_heuristic
        if wants_row_heuristic
        else _apply_template_column_visual
    )
    return runner(pages, template)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_template_column_visual(
|
||||||
|
pages: list[Page],
|
||||||
|
template: dict[str, Any],
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Original column-x-position pipeline. Now the legacy code
|
||||||
|
path; kept for any v1 templates and as the manual-override
|
||||||
|
advanced mode in the build UI."""
|
||||||
pages_cfg = template.get("pages", {}) or {}
|
pages_cfg = template.get("pages", {}) or {}
|
||||||
table_cfg = template.get("table", {}) or {}
|
table_cfg = template.get("table", {}) or {}
|
||||||
columns_cfg = template.get("columns", []) or []
|
columns_cfg = template.get("columns", []) or []
|
||||||
|
|||||||
280
tests/test_pdf_row_heuristic.py
Normal file
280
tests/test_pdf_row_heuristic.py
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
"""Tests for the row-heuristic extraction pipeline.
|
||||||
|
|
||||||
|
This is now the primary extraction mode — uses date + amount
|
||||||
|
pattern matching to find transaction lines, with no dependency
|
||||||
|
on x-position column boundaries. Robust to layout drift across
|
||||||
|
statements from the same bank.
|
||||||
|
|
||||||
|
The legacy column-visual pipeline keeps its own tests in
|
||||||
|
``test_pdf_extract.py``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from src.pdf_extract import (
|
||||||
|
Page,
|
||||||
|
WordBox,
|
||||||
|
apply_template,
|
||||||
|
apply_template_row_heuristic,
|
||||||
|
find_transaction_rows,
|
||||||
|
_find_amount_tokens,
|
||||||
|
_find_dates_in_words,
|
||||||
|
_infer_amount_column_centers,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _w(text: str, x0: float, top: float) -> WordBox:
    """Build a synthetic ``WordBox`` for *text* anchored at (*x0*, *top*).

    The box is 8 units wide per character and 10 units tall, which is
    all the tests need to get deterministic x-midpoints.
    """
    char_width = 8
    line_height = 10
    right_edge = x0 + char_width * len(text)
    return WordBox(
        x0=x0,
        top=top,
        x1=right_edge,
        bottom=top + line_height,
        text=text,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindDatesInRow:
    """Date detection on a single clustered row of words."""

    def test_us_slash(self):
        words = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "01/15/2026")]

    def test_two_digit_year(self):
        words = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        found = _find_dates_in_words(words)
        assert found and found[0][1] == "01/15/26"

    def test_iso(self):
        words = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        found = _find_dates_in_words(words)
        assert found == [(0, "2026-01-15")]

    def test_month_name(self):
        # "Jan 15, 2026" arrives as three separate word tokens and has
        # to be stitched back together before the regex can see it.
        words = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        found = _find_dates_in_words(words)
        assert found, "Multi-word month-day-year should match"
        assert "Jan 15" in found[0][1]

    def test_no_date(self):
        words = [_w("Just", 0, 0), _w("text", 50, 0)]
        found = _find_dates_in_words(words)
        assert found == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindAmountTokens:
    """Amount-token detection on a single clustered row of words."""

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(words)
        assert len(tokens) == 1
        assert tokens[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens and tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == []

    def test_rejects_bare_year(self):
        # "2026" has no currency symbol, decimal point, parens, minus
        # or thousands separator. Whether the regex itself refuses the
        # token or the looks-like-amount filter drops it afterwards, a
        # bare year must not surface as an amount. This pins the
        # conservative behavior: no match.
        words = [_w("2026", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == [], "Bare 4-digit year should not register as amount"
|
||||||
|
|
||||||
|
|
||||||
|
class TestInferAmountColumnCenters:
    """Column-center inference from amount x-midpoints."""

    def test_two_clear_columns(self):
        # Five rows (top = 0, 20, 40, 60, 80), each with one amount
        # starting at x=300 and another starting at x=450.
        rows = [
            [
                _w("01/15/2026", 20, top),
                _w("Item", 100, top),
                _w("$10.00", 300, top),
                _w("$1,000.00", 450, top),
            ]
            for top in range(0, 100, 20)
        ]
        centers = _infer_amount_column_centers(
            rows, expected=2, min_amounts=2, max_amounts=2,
        )
        assert len(centers) == 2
        # Boxes are 8 units per character wide, so the left midpoint is
        # 300 + 8*len("$10.00")/2 = 324 and the right midpoint is
        # 450 + 8*len("$1,000.00")/2 = 486.
        left, right = centers
        assert 310 < left < 340
        assert 460 < right < 490

    def test_no_transactions_returns_empty(self):
        rows = [[_w("just", 0, 0), _w("text", 50, 0)]]
        centers = _infer_amount_column_centers(
            rows, expected=2, min_amounts=1, max_amounts=3,
        )
        assert centers == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestRowHeuristicEndToEnd:
    """Drive the whole row-heuristic pipeline with hand-built ``Page``
    objects, so no real PDF is needed."""

    @staticmethod
    def _template(shape, min_amounts, max_amounts, *, row_extra=None,
                  amounts_extra=None):
        """Assemble a row-heuristic template with US-style dates."""
        return {
            "mode": "row_heuristic",
            "row_detection": {
                "min_amounts_per_row": min_amounts,
                "max_amounts_per_row": max_amounts,
                **(row_extra or {}),
            },
            "amounts": {"shape": shape, **(amounts_extra or {})},
            "date": {"format": "%m/%d/%Y"},
        }

    def _page_single_amount(self) -> Page:
        # Word order is deliberately not strictly top-to-bottom: the
        # continuation words (top=70) are listed before the rest of
        # the second transaction row (top=50).
        words = [
            _w("ACME BANK STATEMENT", 20, 0),
            _w("01/15/2026", 20, 30),
            _w("Coffee", 100, 30),
            _w("Shop", 150, 30),
            _w("$4.50", 400, 30),
            _w("01/16/2026", 20, 50),
            _w("Refund", 100, 50),
            _w("from", 100, 70),  # continuation
            _w("vendor", 140, 70),  # continuation
            _w("Vendor", 140, 50),
            _w("$12.00", 400, 50),
            _w("Page", 20, 90),  # not a txn
            _w("1", 60, 90),  # not a txn
        ]
        return Page(page_no=1, width=600, height=120, text="", words=words)

    def test_extracts_two_rows_single_amount(self):
        tpl = self._template(
            "single", 1, 1,
            row_extra={"merge_multiline_description": True},
            amounts_extra={"negative_in_parens": True},
        )
        df = apply_template_row_heuristic([self._page_single_amount()], tpl)
        assert len(df) == 2
        assert list(df["date"]) == ["2026-01-15", "2026-01-16"]
        # The continuation line was merged into the second description.
        assert "from vendor" in df.iloc[1]["description"]

    def test_dispatches_through_apply_template(self):
        tpl = self._template("single", 1, 1)
        df = apply_template([self._page_single_amount()], tpl)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 2

    def test_txn_balance_shape(self):
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("(4.50)", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 300, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        tpl = self._template(
            "txn_balance", 2, 2, amounts_extra={"negative_in_parens": True},
        )
        df = apply_template([page], tpl)
        assert len(df) == 2
        first, second = df.iloc[0], df.iloc[1]
        assert first["amount"] == -4.50
        assert first["balance"] == 1000.00
        assert second["amount"] == 12.00
        assert second["balance"] == 1012.00

    def test_debit_credit_balance_shape(self):
        words = [
            _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
            _w("4.50", 300, 0), _w("1,000.00", 450, 0),
            _w("01/16/2026", 20, 20), _w("Refund", 100, 20),
            _w("12.00", 380, 20), _w("1,012.00", 450, 20),
        ]
        page = Page(page_no=1, width=600, height=100, text="", words=words)
        tpl = self._template("debit_credit_balance", 2, 3)
        df = apply_template([page], tpl)
        assert len(df) == 2
        first, second = df.iloc[0], df.iloc[1]
        # Row 0: amount at x=300 (debit column) → debit, balance at 450
        assert first["amount"] == -4.50
        assert first["type"] == "debit"
        # Row 1: amount at x=380 (credit column) → credit, balance at 450
        assert second["amount"] == 12.00
        assert second["type"] == "credit"

    def test_skip_rows_matching(self):
        tpl = self._template(
            "single", 1, 1, row_extra={"skip_rows_matching": ["Refund"]},
        )
        df = apply_template_row_heuristic([self._page_single_amount()], tpl)
        assert len(df) == 1
        assert df.iloc[0]["date"] == "2026-01-15"

    def test_layout_drift_doesnt_matter(self):
        """One template must extract from pages whose size and column
        x-positions differ — the reason row-heuristic mode exists."""
        # Page A: amount at x=400.
        page_a = Page(
            page_no=1, width=600, height=80, text="", words=[
                _w("01/15/2026", 20, 0), _w("Coffee", 100, 0),
                _w("$4.50", 400, 0),
            ],
        )
        # Page B: wider page, everything shifted, amount at x=520.
        page_b = Page(
            page_no=1, width=720, height=80, text="", words=[
                _w("01/15/2026", 50, 0), _w("Coffee", 150, 0),
                _w("$4.50", 520, 0),
            ],
        )
        tpl = self._template("single", 1, 1)
        df_a = apply_template([page_a], tpl)
        df_b = apply_template([page_b], tpl)
        # Both extract one row with the same amount, so the template
        # carries no coordinate dependency.
        assert len(df_a) == 1
        assert len(df_b) == 1
        assert df_a.iloc[0]["amount"] == df_b.iloc[0]["amount"] == 4.50
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindTransactionRows:
    """Checks the stage before the DataFrame: the dict records the
    build UI renders as a preview before the user commits."""

    def test_returns_records(self):
        words = [
            _w("01/15/2026", 20, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 400, 0),
        ]
        page = Page(page_no=1, width=600, height=80, text="", words=words)
        tpl = {
            "mode": "row_heuristic",
            "row_detection": {
                "min_amounts_per_row": 1,
                "max_amounts_per_row": 1,
            },
            "amounts": {"shape": "single"},
            "date": {"format": "%m/%d/%Y"},
        }
        records = find_transaction_rows([page], tpl)
        assert len(records) == 1
        (record,) = records
        assert record["date"] == "2026-01-15"
        assert record["description"] == "Coffee"
        assert record["amount"] == 4.50
        assert record["_page"] == 1
        # The raw line is kept so the GUI can show "what we saw".
        assert "_raw_line" in record
|
||||||
Reference in New Issue
Block a user