fix(pdf): short dates without year + diagnostic for "0 rows" runs
A user uploaded a real Chase statement and got "0 rows detected."
This commit fixes two bugs the rewrite shipped with and adds a diagnostic:
**1. Short dates without year weren't recognized.** Most bank
statements (Chase, Wells, BofA, …) display transaction dates as
``01/13`` or ``Jan 13`` because the year is implied by the
statement period. The original regex required a second slash
followed by a ``\d{2,4}`` year, so ``01/13`` failed to match and
rows with no detected date got dropped.
Split ``_DATE_RES`` into ``_DATE_RES_FULL`` (with year) and
``_DATE_RES_SHORT`` (no year), with a two-pass detector: pass 1
tries full-year patterns across the whole row; pass 2 only tries
short patterns if pass 1 found nothing. This prevents a stray
``Page 1/2`` from shadowing the real dated transaction on the same
line.
Short patterns:
- ``\d{1,2}/\d{1,2}`` — Chase, etc.
- ``\d{1,2}-\d{1,2}``
- ``[A-Z][a-z]{2}\s+\d{1,2}`` — "Jan 13"
When parsing, ``parse_date`` returns None for short dates (there
is no year to bind to), so the scanner falls back to the raw
text — the user sees ``01/13`` in the date column and can correct
it in the editor.
**2. Multi-word dates leaked the day token into the description.**
A pre-existing bug: ``_find_dates_in_words`` returned only the
START word index, and ``_description_from_row`` only excluded
that single word. For "Jan 13 Coffee $4.50", the description
became "13 Coffee" instead of "Coffee". Fixed by returning
``(start, end, text)`` with ``end`` exclusive (computed from
``len(m.group(1).split())`` so window-overrun doesn't
over-consume), and the description builder now skips the full
range.
**3. New diagnostic: ``diagnose_pdf_lines(pdf_bytes)``.** Returns
every clustered text line the scanner saw with ``has_date`` /
``has_amount`` flags. When the page's scan returns 0 rows, an
auto-expanded "what the scanner saw" expander now renders a
table of all extracted lines so the user can:
- Spot scanned-PDF cases (empty result → enable OCR)
- See which lines have a date but no amount (or vice versa)
- Eyeball the date / amount format the scanner missed
The user can do all of this without leaving the app or asking the developer for help.
Eight new tests cover: short US date (``01/13``), short month-
name date with two-word consumption (``Jan 13``), the
``Page 1/2 ... 01/13/2026`` shadowing case, and the multi-word-
date description fix.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,6 +23,7 @@ from src.audit import log_event, log_page_open
|
||||
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||
from src.pdf_extract import (
|
||||
PdfDependencyMissing,
|
||||
diagnose_pdf_lines,
|
||||
ocr_available,
|
||||
scan_pdf_for_transactions,
|
||||
)
|
||||
@@ -58,6 +59,7 @@ render_sticky_footer()
|
||||
K_ROWS = "pdf_scan_rows"
|
||||
K_WARNINGS = "pdf_scan_warnings"
|
||||
K_SOURCE_COUNT = "pdf_scan_source_count"
|
||||
K_DIAGNOSTIC = "pdf_scan_diagnostic"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -130,6 +132,9 @@ scan_clicked = st.button(
|
||||
if scan_clicked and uploads:
|
||||
all_rows: list[dict] = []
|
||||
all_warnings: list[str] = []
|
||||
# Cache the raw bytes per file so the diagnostic expander can
|
||||
# re-extract lines without asking the user to re-upload.
|
||||
cached_bytes: list[tuple[str, bytes]] = []
|
||||
with st.status(
|
||||
f"Scanning {len(uploads)} file(s)…",
|
||||
expanded=True,
|
||||
@@ -137,8 +142,10 @@ if scan_clicked and uploads:
|
||||
for i, up in enumerate(uploads, start=1):
|
||||
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
||||
try:
|
||||
raw = up.read()
|
||||
cached_bytes.append((up.name, raw))
|
||||
rows, warns = scan_pdf_for_transactions(
|
||||
up.read(),
|
||||
raw,
|
||||
negative_in_parens=negative_in_parens,
|
||||
allow_ocr=use_ocr,
|
||||
)
|
||||
@@ -164,6 +171,7 @@ if scan_clicked and uploads:
|
||||
st.session_state[K_ROWS] = all_rows
|
||||
st.session_state[K_WARNINGS] = all_warnings
|
||||
st.session_state[K_SOURCE_COUNT] = len(uploads)
|
||||
st.session_state[K_DIAGNOSTIC] = cached_bytes
|
||||
|
||||
log_event(
|
||||
"tool_run",
|
||||
@@ -197,10 +205,53 @@ if rows is None:
|
||||
elif not rows:
|
||||
st.info(
|
||||
"No transaction rows detected. The scanner looks for lines "
|
||||
"containing a date and at least one amount. Check the "
|
||||
"warnings expander above for clues — most often the PDF is "
|
||||
"scanned (image-only) and OCR isn't available."
|
||||
"containing a date and at least one amount. The diagnostic "
|
||||
"below shows every line the PDF reader could see — use the "
|
||||
"``has_date`` and ``has_amount`` columns to spot which "
|
||||
"pieces are missing (usually one or the other)."
|
||||
)
|
||||
cached_bytes = st.session_state.get(K_DIAGNOSTIC) or []
|
||||
if cached_bytes:
|
||||
with st.expander(
|
||||
"Diagnostic: what the scanner saw",
|
||||
expanded=True,
|
||||
):
|
||||
for fname, raw in cached_bytes:
|
||||
st.markdown(f"**{fname}**")
|
||||
try:
|
||||
lines, dwarns = diagnose_pdf_lines(
|
||||
raw, allow_ocr=use_ocr, max_lines=200,
|
||||
)
|
||||
except Exception as e:
|
||||
st.error(f"Diagnostic failed: {type(e).__name__}: {e}")
|
||||
continue
|
||||
for w in dwarns:
|
||||
st.caption(w)
|
||||
if not lines:
|
||||
st.warning(
|
||||
"Zero text lines extracted. This is almost "
|
||||
"certainly a scanned (image-based) PDF — "
|
||||
"enable OCR in Scan options if available."
|
||||
)
|
||||
continue
|
||||
st.dataframe(
|
||||
pd.DataFrame(lines),
|
||||
hide_index=True,
|
||||
use_container_width=True,
|
||||
height=400,
|
||||
)
|
||||
date_hits = sum(1 for ln in lines if ln["has_date"])
|
||||
amt_hits = sum(1 for ln in lines if ln["has_amount"])
|
||||
both = sum(
|
||||
1 for ln in lines
|
||||
if ln["has_date"] and ln["has_amount"]
|
||||
)
|
||||
st.caption(
|
||||
f"{len(lines):,} lines · {date_hits:,} look like "
|
||||
f"they contain a date · {amt_hits:,} look like "
|
||||
f"they contain an amount · {both:,} have both "
|
||||
"(those are the rows the scanner would have kept)."
|
||||
)
|
||||
|
||||
else:
|
||||
df = pd.DataFrame(rows)
|
||||
|
||||
@@ -98,7 +98,7 @@ class Page:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
_DATE_RES = [
|
||||
_DATE_RES_FULL = [
|
||||
re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"),
|
||||
re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"),
|
||||
re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"),
|
||||
@@ -106,6 +106,19 @@ _DATE_RES = [
|
||||
re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"),
|
||||
]
|
||||
|
||||
# Short-date patterns (no year). Many bank statements show dates as
|
||||
# ``MM/DD`` or ``Jan 13`` because the year is implied by the
|
||||
# statement period. Tried only after the full-year patterns fail
|
||||
# so a string like "1/2 cup" in a memo can't claim to be a date
|
||||
# when a real dated transaction was already matched on the same row.
|
||||
_DATE_RES_SHORT = [
|
||||
re.compile(r"\b(\d{1,2}/\d{1,2})(?!\d)"),
|
||||
re.compile(r"\b(\d{1,2}-\d{1,2})(?!\d)"),
|
||||
re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2})(?!\d)"),
|
||||
]
|
||||
|
||||
_DATE_RES = _DATE_RES_FULL + _DATE_RES_SHORT
|
||||
|
||||
_DATE_FORMATS_FALLBACK = [
|
||||
"%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d/%m/%Y", "%d/%m/%y",
|
||||
"%b %d %Y", "%b %d, %Y", "%d %b %Y", "%d-%b-%Y",
|
||||
@@ -427,21 +440,45 @@ def extract_pages_auto(
|
||||
|
||||
def _find_dates_in_words(
|
||||
row_words: list[WordBox],
|
||||
) -> list[tuple[int, str]]:
|
||||
"""Return ``[(word_index, date_text)]`` for the first date-like
|
||||
substring on this row, or ``[]`` if none. The index lets the
|
||||
caller exclude the date words from the description text.
|
||||
) -> list[tuple[int, int, str]]:
|
||||
"""Return ``[(start_idx, end_idx, date_text)]`` for the first
|
||||
date-like substring on this row, or ``[]`` if none.
|
||||
|
||||
Multi-word formats like ``Jan 15, 2026`` are handled by stitching
|
||||
up to three adjacent words before matching.
|
||||
Two-pass search:
|
||||
|
||||
- **Pass 1** — full-year patterns (``01/15/2026``,
|
||||
``Jan 13, 2026``). Tries the longest window first within
|
||||
this pass so a multi-word ``Jan 15, 2026`` isn't truncated
|
||||
to ``Jan 15``.
|
||||
- **Pass 2** — short patterns (``01/13``, ``Jan 13``). Only
|
||||
runs if pass 1 found nothing — otherwise a stray
|
||||
``Page 1/2`` on the same line could shadow the real dated
|
||||
transaction.
|
||||
|
||||
``end_idx`` is exclusive — caller uses ``range(start, end)``
|
||||
to exclude all words the date consumed from the description
|
||||
(the previous single-index return mis-attributed the day
|
||||
token of multi-word dates like ``Jan 13`` to the description).
|
||||
"""
|
||||
for i, w in enumerate(row_words):
|
||||
for window in (3, 2, 1):
|
||||
chunk = " ".join(x.text for x in row_words[i : i + window])
|
||||
for rx in _DATE_RES:
|
||||
m = rx.search(chunk)
|
||||
if m:
|
||||
return [(i, m.group(1))]
|
||||
for patterns, window_order in (
|
||||
(_DATE_RES_FULL, (3, 2, 1)),
|
||||
(_DATE_RES_SHORT, (2, 1)),
|
||||
):
|
||||
for i in range(len(row_words)):
|
||||
for window in window_order:
|
||||
end = i + window
|
||||
if end > len(row_words):
|
||||
continue
|
||||
chunk = " ".join(x.text for x in row_words[i:end])
|
||||
for rx in patterns:
|
||||
m = rx.search(chunk)
|
||||
if m:
|
||||
# Count whitespace-separated tokens in the
|
||||
# MATCH, not in the window — the window may
|
||||
# have included extra trailing words the
|
||||
# regex didn't actually consume.
|
||||
consumed = max(1, len(m.group(1).split()))
|
||||
return [(i, i + consumed, m.group(1))]
|
||||
return []
|
||||
|
||||
|
||||
@@ -469,18 +506,23 @@ def _find_amount_tokens(
|
||||
|
||||
def _description_from_row(
|
||||
row_words: list[WordBox],
|
||||
date_idx: int,
|
||||
date_range: tuple[int, int],
|
||||
amount_idxs: set[int],
|
||||
) -> str:
|
||||
"""Stitch the description from the row's non-date, non-amount
|
||||
tokens. Keeps tokens before the first amount and after the last
|
||||
amount (trailing check numbers and memos); drops words between
|
||||
tokens. ``date_range`` is ``(start, end)`` exclusive — every
|
||||
word in that range is excluded so multi-word dates like
|
||||
``Jan 13`` don't leak the day token into the description.
|
||||
|
||||
Keeps tokens before the first amount and after the last
|
||||
amount (trailing check numbers, memos); drops words between
|
||||
amount tokens (usually whitespace artifacts in column gaps)."""
|
||||
date_start, date_end = date_range
|
||||
keep: list[str] = []
|
||||
seen_first_amount = False
|
||||
last_amount_idx = max(amount_idxs) if amount_idxs else -1
|
||||
for i, w in enumerate(row_words):
|
||||
if i == date_idx:
|
||||
if date_start <= i < date_end:
|
||||
continue
|
||||
if i in amount_idxs:
|
||||
seen_first_amount = True
|
||||
@@ -552,9 +594,11 @@ def scan_pdf_for_transactions(
|
||||
)
|
||||
continue
|
||||
|
||||
date_idx, date_text = dates[0]
|
||||
date_start, date_end, date_text = dates[0]
|
||||
amount_idxs = {idx for idx, _, _ in amount_tokens}
|
||||
desc = _description_from_row(row_words, date_idx, amount_idxs)
|
||||
desc = _description_from_row(
|
||||
row_words, (date_start, date_end), amount_idxs,
|
||||
)
|
||||
|
||||
record: dict[str, Any] = {
|
||||
"date": parse_date(date_text, date_formats) or date_text,
|
||||
@@ -578,11 +622,58 @@ def scan_pdf_for_transactions(
|
||||
return out_rows, warnings
|
||||
|
||||
|
||||
def diagnose_pdf_lines(
|
||||
pdf_bytes: bytes,
|
||||
*,
|
||||
allow_ocr: bool = True,
|
||||
max_lines: int = 200,
|
||||
) -> tuple[list[dict[str, Any]], list[str]]:
|
||||
"""Dump every clustered text line from a PDF for diagnosis.
|
||||
|
||||
Surfaces what the scanner actually saw — including lines the
|
||||
detector dropped because they lacked a date or amount. Use
|
||||
when ``scan_pdf_for_transactions`` returns 0 rows so the user
|
||||
can spot what's wrong (no extractable text → scanned PDF /
|
||||
weird date format / amounts in a column the regex misses).
|
||||
|
||||
Returns ``(lines, warnings)`` where each line is::
|
||||
|
||||
{"page": int, "text": str,
|
||||
"has_date": bool, "has_amount": bool}
|
||||
|
||||
Capped at *max_lines* across all pages so a 100-page statement
|
||||
doesn't dump 10,000 rows into the UI.
|
||||
"""
|
||||
pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
|
||||
out: list[dict[str, Any]] = []
|
||||
for page in pages:
|
||||
rows = cluster_rows(page.words)
|
||||
for row_words in rows:
|
||||
text = " ".join(w.text for w in row_words).strip()
|
||||
if not text:
|
||||
continue
|
||||
out.append({
|
||||
"page": page.page_no,
|
||||
"text": text,
|
||||
"has_date": bool(_find_dates_in_words(row_words)),
|
||||
"has_amount": bool(_find_amount_tokens(row_words)),
|
||||
})
|
||||
if len(out) >= max_lines:
|
||||
warnings.append(
|
||||
f"Diagnostic capped at {max_lines} lines. "
|
||||
"Larger PDFs aren't fully shown here — the full "
|
||||
"scan still runs in Scan mode."
|
||||
)
|
||||
return out, warnings
|
||||
return out, warnings
|
||||
|
||||
|
||||
__all__ = [
|
||||
"PdfDependencyMissing",
|
||||
"Page",
|
||||
"WordBox",
|
||||
"cluster_rows",
|
||||
"diagnose_pdf_lines",
|
||||
"extract_pages",
|
||||
"extract_pages_auto",
|
||||
"ocr_available",
|
||||
|
||||
@@ -111,23 +111,54 @@ class TestClusterRows:
|
||||
|
||||
|
||||
class TestFindDatesInWords:
|
||||
"""Returns ``[(start, end, text)]`` — end is exclusive index of
|
||||
words the date consumed."""
|
||||
|
||||
def test_us_slash(self):
|
||||
row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "01/15/2026")]
|
||||
assert _find_dates_in_words(row) == [(0, 1, "01/15/2026")]
|
||||
|
||||
def test_two_digit_year(self):
|
||||
row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][1] == "01/15/26"
|
||||
assert result and result[0][2] == "01/15/26"
|
||||
|
||||
def test_iso(self):
|
||||
row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
|
||||
assert _find_dates_in_words(row) == [(0, "2026-01-15")]
|
||||
assert _find_dates_in_words(row) == [(0, 1, "2026-01-15")]
|
||||
|
||||
def test_month_name(self):
|
||||
def test_month_name_with_year_consumes_three_words(self):
|
||||
row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and "Jan 15" in result[0][1]
|
||||
assert result and "Jan 15" in result[0][2]
|
||||
# Date consumes all 3 words so they don't leak to description.
|
||||
assert result[0][1] == 3
|
||||
|
||||
def test_short_us_date_no_year(self):
|
||||
"""Chase-style ``01/13`` without a year still detects."""
|
||||
row = [_w("01/13", 0, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][2] == "01/13"
|
||||
assert result[0][1] == 1 # one word consumed
|
||||
|
||||
def test_short_month_name_no_year_consumes_two_words(self):
|
||||
row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result
|
||||
assert "Jan 13" in result[0][2]
|
||||
assert result[0][1] == 2 # "Jan" + "13" both consumed
|
||||
|
||||
def test_short_pattern_does_not_shadow_full_year(self):
|
||||
"""If a full-year date is present, short patterns shouldn't
|
||||
steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
|
||||
return the real ``01/13/2026``, not the ``1/2`` page marker."""
|
||||
row = [
|
||||
_w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
|
||||
_w("3", 100, 0),
|
||||
_w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
|
||||
]
|
||||
result = _find_dates_in_words(row)
|
||||
assert result and result[0][2] == "01/13/2026"
|
||||
|
||||
def test_no_date(self):
|
||||
row = [_w("Just", 0, 0), _w("text", 50, 0)]
|
||||
|
||||
Reference in New Issue
Block a user