"""PDF to CSV — heuristic transaction scanner. Upload one or more bank-statement PDFs, scan for transaction-like rows ([date] [description] [amount]), uncheck the rows you don't want, download as CSV. No templates, no per-bank configuration, no coordinate picking. """ from __future__ import annotations import hashlib import sys from datetime import datetime from pathlib import Path import pandas as pd import streamlit as st _project_root = Path(__file__).resolve().parent.parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.audit import log_event, log_page_open from src.gui.components import ( hide_streamlit_chrome, html_download_button, render_sticky_footer, ) from src.pdf_extract import ( PdfDependencyMissing, diagnose_pdf_lines, format_amount, ocr_available, scan_pdf_for_transactions, year_from_filename, ) def _pdf_deps_status() -> tuple[bool, list[str]]: """Probe each runtime PDF dep without forcing the user to hit the Scan button. Returns ``(ok, missing_names)``.""" missing: list[str] = [] for name in ("pdfplumber", "pypdfium2"): try: __import__(name) except ImportError: missing.append(name) return (not missing), missing log_page_open("10_PDF_Extractor") _ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png") st.set_page_config( page_title="PDF to CSV · DataTools", page_icon=_ICON_PATH, layout="wide", ) hide_streamlit_chrome() render_sticky_footer() # --------------------------------------------------------------------------- # Session-state keys # --------------------------------------------------------------------------- K_ROWS = "pdf_scan_rows" K_WARNINGS = "pdf_scan_warnings" K_SOURCE_COUNT = "pdf_scan_source_count" # Stamped once at scan time. 
# The download button's file_name embeds the scan timestamp so the user
# gets a unique-per-scan filename — but crucially, the value is stable
# across reruns triggered by unrelated widget interactions (otherwise the
# html_download_button helper's session-state key drifts every second and
# the "Saved to " banner never gets to render).
K_TIMESTAMP = "pdf_scan_timestamp"
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
# keyed by filename → {"bytes": ..., "size": ...}). It survives
# Streamlit reruns and navigation away from the page. The
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
# the custom file list / Clear-all button operate on it.
K_UPLOADS = "pdf_uploads"
# Bumped to force the file_uploader to re-instantiate (clear its
# internal state) when the user removes a file via the custom X or
# clicks Clear-all. Streamlit's widget state is keyed on the widget
# key, so changing the key resets the widget without us having to
# touch its session-state directly (which Streamlit disallows).
K_UPLOAD_COUNTER = "pdf_upload_counter"


def _format_size(n_bytes: int) -> str:
    """Return a human-friendly file size — B / KB / MB / GB / TB.

    Byte counts below 1024 are shown as whole bytes; larger values are
    shown with one decimal place in the smallest unit whose *displayed*
    value stays below 1024.
    """
    size = float(n_bytes)
    if size < 1024:
        return f"{int(size)} B"
    for unit in ("KB", "MB", "GB"):
        size /= 1024
        # Compare the value as it will be printed: 1023.96 KB would
        # otherwise render as "1024.0 KB" instead of rolling to "1.0 MB".
        if round(size, 1) < 1024:
            return f"{size:.1f} {unit}"
    return f"{size / 1024:.1f} TB"


# ---------------------------------------------------------------------------
# Header + dep guard
# ---------------------------------------------------------------------------
st.markdown("# PDF to CSV")
st.caption(
    "Scan bank-statement PDFs for transaction rows "
    "(``[date] [description] [amount]``). Review the table, uncheck "
    "rows you don't want, edit any cell that needs fixing, then "
    "download as CSV. No per-bank setup."
)

_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    st.error(
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n"
        "Then **fully restart the launcher** to pick up the new modules."
    )
    st.stop()

# ---------------------------------------------------------------------------
# Options + upload
# ---------------------------------------------------------------------------
_DATE_FORMAT_CHOICES = {
    "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
    "YYYYMMDD (20260113)": "%Y%m%d",
    "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
    "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
    "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
    "Custom strftime…": "__custom__",
}

with st.expander("Scan options", expanded=False):
    c1, c2 = st.columns(2)
    negative_in_parens = c1.checkbox(
        "Treat (4.50) as negative",
        value=True,
        help=(
            "Bank statements commonly show withdrawals as ``(4.50)``. "
            "Off if your statements use a different convention."
        ),
    )
    _ocr_ok, _ocr_reason = ocr_available()
    use_ocr = c2.checkbox(
        "Use OCR for scanned pages",
        value=_ocr_ok,
        disabled=not _ocr_ok,
        help=(
            f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. "
            "Most modern bank PDFs are text-based and don't need OCR — "
            "only enable for image-based scans."
        ),
    )

    c3, c4 = st.columns(2)
    date_label = c3.selectbox(
        "Output date format",
        list(_DATE_FORMAT_CHOICES.keys()),
        index=0,
        help=(
            "Applied to the transaction date AND the statement "
            "period dates pulled from the header. Pick Custom to "
            "enter your own ``strftime`` string."
        ),
    )
    output_date_format = _DATE_FORMAT_CHOICES[date_label]
    if output_date_format == "__custom__":
        output_date_format = c4.text_input(
            "Custom strftime format",
            value="%Y-%m-%d",
            help=(
                "Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
                "2026-01-13, ``%Y%m%d`` for 20260113."
            ),
        )
        if not output_date_format.strip():
            # An empty format string would presumably blank every date
            # in the output, so fall back to the default instead.
            c4.warning("Custom format is empty — using %Y-%m-%d instead.")
            output_date_format = "%Y-%m-%d"

    # Year override for short dates. Empty by default — the
    # scanner uses statement-period detection + filename year hint
    # automatically. Set this when the statement period regex
    # misses on a particular bank's layout, or when you want to
    # force a specific year (e.g., historical reconciliation).
    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source — confirm the override input and its warnings belong inside
    # the "Scan options" expander.
    year_override_str = st.text_input(
        "Override year for short dates (optional)",
        value="",
        help=(
            "Short dates like ``01/13`` get bound to a year by the "
            "scanner — statement period first, then filename year, "
            "then this override. Leave blank for automatic. Enter "
            "a 4-digit year (e.g., 2025) to force every short date "
            "to that year. Won't affect dates that already have a "
            "year (``01/13/2025``)."
        ),
    )
    year_override: int | None = None
    if year_override_str.strip():
        # Only the conversion can raise; keep the ``try`` body to that.
        try:
            year_override = int(year_override_str)
        except ValueError:
            st.warning(
                f"Year override {year_override_str!r} isn't a number — "
                "using automatic detection instead."
            )
        else:
            if not (1900 <= year_override <= 2100):
                st.warning(
                    f"Year override {year_override} looks wrong — using "
                    "automatic detection instead."
                )
                year_override = None

# Persistent stash + rotating widget key. See the K_UPLOADS /
# K_UPLOAD_COUNTER comments for why the counter exists.
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
uploader_key = f"pdf_upload_v{upload_counter}"

# Mirror the Home-page upload pattern: the Streamlit file_uploader
# is positioned off-screen via CSS (keeps its underlying input element
# reachable to JS), and the page renders a Home-style
# bordered file list with an "Add more files" button at the
# bottom. A small iframe-injected script wires that button to
# programmatically click the hidden uploader so the OS file picker
# opens. Same approach as ``_sync_uploader_to_home_uploads`` in
# ``src/gui/_home.py``.
# NOTE(review): the CSS payload is empty in this copy of the file — it
# looks like the markup was stripped; restore it from version control.
st.markdown(
    '',
    unsafe_allow_html=True,
)


def _sync_pdf_uploads() -> None:
    """``on_change`` callback: add newly-uploaded files to the stash.

    Reads the uploader widget's current files from session state and
    stores each one whose filename is not already a key of
    ``pdf_uploads`` (so a second upload with an existing name is
    skipped), logging an ``upload`` event per stored file.

    **Add-only** — removal happens through the custom X buttons +
    counter bump, NOT through this callback.
    """
    widget_files = st.session_state.get(uploader_key) or []
    for f in widget_files:
        if f.name in pdf_uploads:
            continue
        pdf_uploads[f.name] = {
            "bytes": f.getvalue(),
            "size": f.size,
        }
        log_event(
            "upload",
            f"PDF: {f.name}",
            filename=f.name,
            bytes=f.size,
            page="10_PDF_Extractor",
        )


st.file_uploader(
    "PDF file(s)",
    type=["pdf"],
    accept_multiple_files=True,
    key=uploader_key,
    on_change=_sync_pdf_uploads,
    label_visibility="collapsed",
    help="Drop one or more bank-statement PDFs. Multi-file batches "
    "are merged into a single table with a ``source_file`` column.",
)

# ---------------------------------------------------------------------------
# Files section (Home-style layout)
# ---------------------------------------------------------------------------
import html as _html

# NOTE(review): both SVG payloads are empty in this copy of the file —
# the inline markup appears to have been stripped.
_DOC_SVG = (
    ''
)
_PLUS_SVG = (
    ''
)

n_files = len(pdf_uploads)
if n_files:
    total_bytes = sum(m["size"] for m in pdf_uploads.values())
    files_word = "file" if n_files == 1 else "files"
    meta_html = (
        f'{n_files} {files_word} · '
        f'{_html.escape(_format_size(total_bytes))} total'
    )
else:
    meta_html = "No files imported yet"

# NOTE(review): the wrapper tags of this header (and of the per-row
# snippets below) are missing from this copy; only the text content and
# the line breaks that replaced the tags survive.
st.markdown(
    '\n\n'
    '\n\nFiles\n\n'
    f'{meta_html}'
    '\n\n',
    unsafe_allow_html=True,
)

# Single bordered card hosting the file rows + the in-card
# "Add more files" button at the bottom, matching the Home page.
# Two-phase remove pattern: walk all rows once, accumulate
# ``to_remove`` if any X was clicked, then mutate state + rerun
# ONCE after the loop so Streamlit doesn't see a half-mutated
# dict mid-render.
to_remove: str | None = None
with st.container(border=True):
    for name, meta in pdf_uploads.items():
        # Short, stable widget-key suffix derived from the filename.
        digest = hashlib.sha1(
            name.encode("utf-8"),
            usedforsecurity=False,
        ).hexdigest()[:10]
        col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
        if col_x.button(
            "✕",
            key=f"pdf_rm_{digest}",
            help=f"Remove {name}",
            type="tertiary",
        ):
            to_remove = name
        col_name.markdown(
            '\n\n'
            f'{_DOC_SVG}'
            f'{_html.escape(name)}'
            '\n\n',
            unsafe_allow_html=True,
        )
        col_size.markdown(
            f'\n\n'
            f''
            f'{_html.escape(_format_size(meta["size"]))}'
            '\n\n',
            unsafe_allow_html=True,
        )

    # In-card "Add more files" button. The HTML is rendered as-is
    # — Streamlit's sanitiser strips inline ``onclick``, so the
    # click wiring is done by the iframe script below.
    # NOTE(review): the button markup is empty in this copy of the file.
    st.markdown(
        '',
        unsafe_allow_html=True,
    )

# Wire the in-card "Add more files" button to the off-screen
# ``stFileUploaderDropzoneInput``. Identical pattern to the
# Home page (see ``src/gui/_home.py``); a ``MutationObserver``
# re-wires after every Streamlit rerun in case the button got
# re-mounted.
# NOTE(review): the script payload is blank in this copy, and the
# collapsed source does not show whether this call sits inside or
# outside the bordered container — confirm both.
st.iframe(
    """ """,
    height=1,
)

if to_remove is not None:
    log_event(
        "upload",
        f"PDF removed: {to_remove}",
        filename=to_remove,
        page="10_PDF_Extractor",
    )
    del pdf_uploads[to_remove]
    # Bump the uploader counter so the widget re-instantiates and
    # forgets the removed file.
    st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
    st.rerun()

# ---------------------------------------------------------------------------
# Action buttons (Scan + Clear all) live below the Files card
# ---------------------------------------------------------------------------
c_scan, c_clear, _spacer = st.columns([1, 1, 4])
with c_scan:
    scan_clicked = st.button(
        "Scan",
        type="primary",
        disabled=not pdf_uploads,
        use_container_width=True,
    )
with c_clear:
    if st.button(
        "Clear all files",
        type="secondary",
        disabled=not pdf_uploads,
        help="Removes all uploaded files and the last scan result.",
        use_container_width=True,
    ):
        st.session_state[K_UPLOADS] = {}
        st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
        for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT, K_TIMESTAMP):
            st.session_state.pop(k, None)
        log_event(
            "upload",
            "PDF list cleared",
            page="10_PDF_Extractor",
            count=n_files,
        )
        st.rerun()

# ---------------------------------------------------------------------------
# Scan
# ---------------------------------------------------------------------------
if scan_clicked and pdf_uploads:
    all_rows: list[dict] = []
    all_warnings: list[str] = []
    n_files = len(pdf_uploads)
    with st.status(
        f"Scanning {n_files} file(s)…",
        expanded=True,
    ) as status:
        for i, (name, meta) in enumerate(pdf_uploads.items(), start=1):
            st.write(f"**{i}/{n_files}** · {name}")
            # One bad file must not abort the batch: any failure is
            # recorded as a warning tagged with the filename.
            try:
                raw = meta["bytes"]
                rows, warns = scan_pdf_for_transactions(
                    raw,
                    negative_in_parens=negative_in_parens,
                    allow_ocr=use_ocr,
                    output_date_format=output_date_format,
                    filename_year_hint=year_from_filename(name),
                    year_override=year_override,
                )
                for r in rows:
                    r["source_file"] = name
                all_rows.extend(rows)
                all_warnings.extend(f"[{name}] {w}" for w in warns)
            except PdfDependencyMissing as e:
                all_warnings.append(f"[{name}] {e}")
            except Exception as e:
                all_warnings.append(
                    f"[{name}] scan failed: {type(e).__name__}: {e}"
                )
        status.update(
            label=(
                f"Found {len(all_rows):,} candidate transactions "
                f"across {n_files} file(s)"
            ),
            state="complete",
            expanded=False,
        )
    st.session_state[K_ROWS] = all_rows
    st.session_state[K_WARNINGS] = all_warnings
    st.session_state[K_SOURCE_COUNT] = n_files
    st.session_state[K_TIMESTAMP] = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_event(
        "tool_run",
        "PDF scan",
        page="10_PDF_Extractor",
        files=n_files,
        rows=len(all_rows),
        warnings=len(all_warnings),
    )

# ---------------------------------------------------------------------------
# Results — editable table + download
# ---------------------------------------------------------------------------
rows = st.session_state.get(K_ROWS)
warnings = st.session_state.get(K_WARNINGS) or []
source_count = st.session_state.get(K_SOURCE_COUNT, 0)

if warnings:
    with st.expander(f"Warnings ({len(warnings)})", expanded=False):
        for w in warnings:
            st.warning(w)

if rows is None:
    # No scan has been stored yet.
    if pdf_uploads:
        st.info("Click **Scan** to detect transactions.")
    else:
        st.info("Upload one or more PDF files to begin.")
elif not rows:
    # A scan ran but produced nothing: show what the reader extracted.
    st.info(
        "No transaction rows detected. The scanner looks for lines "
        "containing a date and at least one amount. The diagnostic "
        "below shows every line the PDF reader could see — use the "
        "``has_date`` and ``has_amount`` columns to spot which "
        "pieces are missing (usually one or the other)."
    )
    if pdf_uploads:
        with st.expander(
            "Diagnostic: what the scanner saw",
            expanded=True,
        ):
            for fname, meta in pdf_uploads.items():
                raw = meta["bytes"]
                st.markdown(f"**{fname}**")
                try:
                    lines, dwarns = diagnose_pdf_lines(
                        raw,
                        allow_ocr=use_ocr,
                        max_lines=200,
                    )
                except Exception as e:
                    st.error(f"Diagnostic failed: {type(e).__name__}: {e}")
                    continue
                for w in dwarns:
                    st.caption(w)
                if not lines:
                    st.warning(
                        "Zero text lines extracted. This is almost "
                        "certainly a scanned (image-based) PDF — "
                        "enable OCR in Scan options if available."
                    )
                    continue
                st.dataframe(
                    pd.DataFrame(lines),
                    hide_index=True,
                    use_container_width=True,
                    height=400,
                )
                date_hits = sum(1 for ln in lines if ln["has_date"])
                amt_hits = sum(1 for ln in lines if ln["has_amount"])
                both = sum(
                    1 for ln in lines
                    if ln["has_date"] and ln["has_amount"]
                )
                st.caption(
                    f"{len(lines):,} lines · {date_hits:,} look like "
                    f"they contain a date · {amt_hits:,} look like "
                    f"they contain an amount · {both:,} have both "
                    "(those are the rows the scanner would have kept)."
                )
else:
    df = pd.DataFrame(rows)
    # Order columns so the user-facing fields are leftmost; raw +
    # internals are last and easy to scroll past or unselect at
    # download time. ``account_number`` sits with the transaction
    # detail since it's per-row context an accountant typically
    # wants alongside the amounts.
    front = [
        "date",
        "description",
    ]
    amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
    metadata_cols = ["account_number"]
    tail = ["source_file", "page", "raw"]
    ordered = [
        c for c in front + amount_cols + metadata_cols + tail
        if c in df.columns
    ]
    extras = [c for c in df.columns if c not in ordered]
    df = df[ordered + extras]
    # Prepend the include checkbox.
    df.insert(0, "Include", True)

    st.markdown(
        f"#### {len(df):,} candidate transaction(s) "
        f"from {source_count} file(s)"
    )
    st.caption(
        "Uncheck rows to exclude. Edit any cell to fix a value the "
        "scanner got wrong. The ``raw`` column shows the original "
        "PDF text for that row."
    )

    column_config = {
        "Include": st.column_config.CheckboxColumn(
            "Include",
            default=True,
            help="Uncheck to drop this row from the CSV.",
        ),
        "raw": st.column_config.TextColumn(
            "raw",
            help="Original text line from the PDF (read-only reference).",
            disabled=True,
            width="large",
        ),
        "page": st.column_config.NumberColumn(
            "page",
            disabled=True,
            width="small",
        ),
    }
    if "source_file" in df.columns:
        column_config["source_file"] = st.column_config.TextColumn(
            "source_file",
            disabled=True,
        )
    # Force 2-decimal display on every amount column. Without this,
    # Streamlit / Pandas show floats with their raw repr ("4.5",
    # "12.0", "1000") and the precision looks inconsistent across
    # rows that all came from the same statement. Internal dtype
    # stays float for arithmetic accuracy; only the rendering and
    # CSV-export formatting force two-place precision.
    for amt_col in (c for c in df.columns if c.startswith("amount_")):
        column_config[amt_col] = st.column_config.NumberColumn(
            amt_col,
            format="%.2f",
            help="Two-decimal currency amount.",
        )

    edited = st.data_editor(
        df,
        hide_index=True,
        use_container_width=True,
        column_config=column_config,
        num_rows="fixed",
        key="pdf_results_editor",
    )

    # Keep only the ticked rows; the checkbox column itself is not data.
    selected = edited[edited["Include"]].drop(columns=["Include"])

    c_dl, c_meta = st.columns([2, 3])
    with c_dl:
        if selected.empty:
            st.button("Download CSV", disabled=True)
        else:
            # Reuse the timestamp stamped when this scan finished —
            # stable across reruns so the download helper's button
            # key doesn't drift every second.
            ts = st.session_state.get(K_TIMESTAMP) or "results"
            # Default: drop the internal columns from the download.
            keep_default = [
                c for c in selected.columns if c not in ("page", "raw")
            ]
            with c_meta:
                keep = st.multiselect(
                    "Columns to include in CSV",
                    options=list(selected.columns),
                    default=keep_default,
                    help="``page`` and ``raw`` are kept off by default; "
                    "tick them if you want them in the file.",
                )
            # An empty selection falls back to exporting every column.
            export = (selected[keep] if keep else selected).copy()
            # Coerce every amount column to a fixed 2-decimal string
            # before serialising. Pandas' default float-to-CSV
            # writer drops trailing zeros (4.50 → 4.5) which an
            # accountant immediately notices in Excel; preserving
            # the precision is the whole point of this commit.
            for amt_col in (
                c for c in export.columns if c.startswith("amount_")
            ):
                export[amt_col] = export[amt_col].map(format_amount)
            csv_bytes = export.to_csv(index=False).encode("utf-8")
            # Save server-side (consistent with the other tools) —
            # writes to the user's Downloads folder and shows the
            # exact path. Avoids the st.download_button quirk where
            # the second-or-later button in a script pass silently
            # fails to fire.
            html_download_button(
                f"Download {len(export):,} rows as CSV",
                csv_bytes,
                file_name=f"transactions-{ts}.csv",
                mime="text/csv",
            )

    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source — confirm this caption sits below the columns rather than
    # inside ``c_dl``.
    if not selected.empty:
        st.caption(
            f"{len(selected):,} of {len(df):,} rows selected."
        )