diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index e268ddb..7033a60 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -23,6 +23,7 @@ from src.audit import log_event, log_page_open from src.gui.components import hide_streamlit_chrome, render_sticky_footer from src.pdf_extract import ( PdfDependencyMissing, + diagnose_pdf_lines, ocr_available, scan_pdf_for_transactions, ) @@ -58,6 +59,7 @@ render_sticky_footer() K_ROWS = "pdf_scan_rows" K_WARNINGS = "pdf_scan_warnings" K_SOURCE_COUNT = "pdf_scan_source_count" +K_DIAGNOSTIC = "pdf_scan_diagnostic" # --------------------------------------------------------------------------- @@ -130,6 +132,9 @@ scan_clicked = st.button( if scan_clicked and uploads: all_rows: list[dict] = [] all_warnings: list[str] = [] + # Cache the raw bytes per file so the diagnostic expander can + # re-extract lines without asking the user to re-upload. + cached_bytes: list[tuple[str, bytes]] = [] with st.status( f"Scanning {len(uploads)} file(s)…", expanded=True, @@ -137,8 +142,10 @@ if scan_clicked and uploads: for i, up in enumerate(uploads, start=1): st.write(f"**{i}/{len(uploads)}** · {up.name}") try: + raw = up.read() + cached_bytes.append((up.name, raw)) rows, warns = scan_pdf_for_transactions( - up.read(), + raw, negative_in_parens=negative_in_parens, allow_ocr=use_ocr, ) @@ -164,6 +171,7 @@ if scan_clicked and uploads: st.session_state[K_ROWS] = all_rows st.session_state[K_WARNINGS] = all_warnings st.session_state[K_SOURCE_COUNT] = len(uploads) + st.session_state[K_DIAGNOSTIC] = cached_bytes log_event( "tool_run", @@ -197,10 +205,53 @@ if rows is None: elif not rows: st.info( "No transaction rows detected. The scanner looks for lines " - "containing a date and at least one amount. Check the " - "warnings expander above for clues — most often the PDF is " - "scanned (image-only) and OCR isn't available." + "containing a date and at least one amount. 
The diagnostic " + "below shows every line the PDF reader could see — use the " + "``has_date`` and ``has_amount`` columns to spot which " + "pieces are missing (usually one or the other)." ) + cached_bytes = st.session_state.get(K_DIAGNOSTIC) or [] + if cached_bytes: + with st.expander( + "Diagnostic: what the scanner saw", + expanded=True, + ): + for fname, raw in cached_bytes: + st.markdown(f"**{fname}**") + try: + lines, dwarns = diagnose_pdf_lines( + raw, allow_ocr=use_ocr, max_lines=200, + ) + except Exception as e: + st.error(f"Diagnostic failed: {type(e).__name__}: {e}") + continue + for w in dwarns: + st.caption(w) + if not lines: + st.warning( + "Zero text lines extracted. This is almost " + "certainly a scanned (image-based) PDF — " + "enable OCR in Scan options if available." + ) + continue + st.dataframe( + pd.DataFrame(lines), + hide_index=True, + use_container_width=True, + height=400, + ) + date_hits = sum(1 for ln in lines if ln["has_date"]) + amt_hits = sum(1 for ln in lines if ln["has_amount"]) + both = sum( + 1 for ln in lines + if ln["has_date"] and ln["has_amount"] + ) + st.caption( + f"{len(lines):,} lines · {date_hits:,} look like " + f"they contain a date · {amt_hits:,} look like " + f"they contain an amount · {both:,} have both " + "(those are the rows the scanner would have kept)." + ) else: df = pd.DataFrame(rows) diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 4c375e4..b709d07 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -98,7 +98,7 @@ class Page: # --------------------------------------------------------------------------- -_DATE_RES = [ +_DATE_RES_FULL = [ re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"), re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"), re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"), @@ -106,6 +106,19 @@ _DATE_RES = [ re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"), ] +# Short-date patterns (no year). 
def _match_word_span(
    window_words: list[WordBox],
    match_start: int,
    match_end: int,
) -> tuple[int, int]:
    """Map a character span in the space-joined window text back to
    word offsets.

    ``match_start`` / ``match_end`` are character offsets into
    ``" ".join(w.text for w in window_words)``. Returns
    ``(first, end)`` — offsets relative to the window, ``end``
    exclusive — covering every word the span overlaps.
    """
    first: int | None = None
    last = 0
    cursor = 0
    for offset, word in enumerate(window_words):
        word_start = cursor
        word_end = cursor + len(word.text)
        if word_start < match_end and word_end > match_start:
            if first is None:
                first = offset
            last = offset
        # +1 skips the single joining space between adjacent words.
        cursor = word_end + 1
    if first is None:
        # Defensive: a non-empty match always overlaps some word,
        # but never hand the caller an empty range.
        return 0, 1
    return first, last + 1


def _find_dates_in_words(
    row_words: list[WordBox],
) -> list[tuple[int, int, str]]:
    """Return ``[(start_idx, end_idx, date_text)]`` for the first
    date-like substring on this row, or ``[]`` if none.

    Two-pass search:

    - **Pass 1** — full-year patterns (``_DATE_RES_FULL``). Tries the
      longest window first within this pass so a multi-word
      ``Jan 15, 2026`` isn't truncated to ``Jan 15``.
    - **Pass 2** — short patterns (``_DATE_RES_SHORT``). Only runs
      if pass 1 found nothing — otherwise a stray ``Page 1/2`` on
      the same line could shadow the real dated transaction.

    ``start_idx`` / ``end_idx`` are indices into *row_words* of the
    words the matched text actually occupies; ``end_idx`` is
    exclusive, so the caller can use ``range(start, end)`` to
    exclude the date words from the description.

    The indices are derived from the match position inside the
    window, not from the window start: a 3-word window beginning at
    ``Coffee`` in ``Coffee Shop 01/15/2026`` matches the date in its
    *third* word, and reporting the window start would drop
    ``Coffee`` from the description while leaving the date in it.
    """
    total = len(row_words)
    for patterns, window_order in (
        (_DATE_RES_FULL, (3, 2, 1)),
        (_DATE_RES_SHORT, (2, 1)),
    ):
        for i in range(total):
            for window in window_order:
                end = i + window
                if end > total:
                    # Window would run off the row; a shorter one
                    # is tried next.
                    continue
                window_words = row_words[i:end]
                chunk = " ".join(x.text for x in window_words)
                for rx in patterns:
                    m = rx.search(chunk)
                    if not m:
                        continue
                    # Translate the capture group's character span
                    # into word offsets so leading/trailing window
                    # words the regex did not consume stay in the
                    # description.
                    first, stop = _match_word_span(
                        window_words, m.start(1), m.end(1),
                    )
                    return [(i + first, i + stop, m.group(1))]
    return []
def diagnose_pdf_lines(
    pdf_bytes: bytes,
    *,
    allow_ocr: bool = True,
    max_lines: int = 200,
) -> tuple[list[dict[str, Any]], list[str]]:
    """Dump every clustered text line from a PDF for diagnosis.

    Surfaces what the scanner actually saw — including lines the
    detector dropped because they lacked a date or amount. Use
    when ``scan_pdf_for_transactions`` returns 0 rows so the user
    can spot what's wrong (no extractable text → scanned PDF /
    weird date format / amounts in a column the regex misses).

    Args:
        pdf_bytes: Raw PDF content, passed to ``extract_pages_auto``.
        allow_ocr: Forwarded to ``extract_pages_auto``.
        max_lines: Maximum number of lines returned across all
            pages, so a 100-page statement doesn't dump 10,000 rows
            into the UI. A value of 0 or less returns no lines.

    Returns:
        ``(lines, warnings)`` where each line is::

            {"page": int, "text": str,
             "has_date": bool, "has_amount": bool}

        ``warnings`` is the list returned by ``extract_pages_auto``,
        with a "capped" notice appended only when at least one
        further non-empty line was omitted because of *max_lines*.
    """
    pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
    out: list[dict[str, Any]] = []
    for page in pages:
        for row_words in cluster_rows(page.words):
            text = " ".join(w.text for w in row_words).strip()
            if not text:
                continue
            # Check the cap *before* appending: the notice should
            # only appear when a real line is being left out, not
            # when the document happens to have exactly max_lines
            # lines.
            if len(out) >= max_lines:
                warnings.append(
                    f"Diagnostic capped at {max_lines} lines. "
                    "Larger PDFs aren't fully shown here — the full "
                    "scan still runs in Scan mode."
                )
                return out, warnings
            out.append({
                "page": page.page_no,
                "text": text,
                "has_date": bool(_find_dates_in_words(row_words)),
                "has_amount": bool(_find_amount_tokens(row_words)),
            })
    return out, warnings
15" in result[0][2] + # Date consumes all 3 words so they don't leak to description. + assert result[0][1] == 3 + + def test_short_us_date_no_year(self): + """Chase-style ``01/13`` without a year still detects.""" + row = [_w("01/13", 0, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0)] + result = _find_dates_in_words(row) + assert result and result[0][2] == "01/13" + assert result[0][1] == 1 # one word consumed + + def test_short_month_name_no_year_consumes_two_words(self): + row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)] + result = _find_dates_in_words(row) + assert result + assert "Jan 13" in result[0][2] + assert result[0][1] == 2 # "Jan" + "13" both consumed + + def test_short_pattern_does_not_shadow_full_year(self): + """If a full-year date is present, short patterns shouldn't + steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should + return the real ``01/13/2026``, not the ``1/2`` page marker.""" + row = [ + _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0), + _w("3", 100, 0), + _w("01/13/2026", 200, 0), _w("Coffee", 300, 0), + ] + result = _find_dates_in_words(row) + assert result and result[0][2] == "01/13/2026" def test_no_date(self): row = [_w("Just", 0, 0), _w("text", 50, 0)]