diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index ffd3956..70ac27e 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -8,6 +8,7 @@ no coordinate picking. from __future__ import annotations +import hashlib import sys from datetime import datetime from pathlib import Path @@ -59,7 +60,30 @@ render_sticky_footer() K_ROWS = "pdf_scan_rows" K_WARNINGS = "pdf_scan_warnings" K_SOURCE_COUNT = "pdf_scan_source_count" -K_DIAGNOSTIC = "pdf_scan_diagnostic" +# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict +# keyed by filename → {"bytes": ..., "size": ...}). It survives +# Streamlit reruns and navigation away from the page. The +# uploader widget feeds this stash via ``_sync_pdf_uploads`` and +# the custom file list / Clear-all button operate on it. +K_UPLOADS = "pdf_uploads" +# Bumped to force the file_uploader to re-instantiate (clear its +# internal state) when the user removes a file via the custom X or +# clicks Clear-all. Streamlit's widget state is keyed on the widget +# key, so changing the key resets the widget without us having to +# touch its session-state directly (which Streamlit disallows). +K_UPLOAD_COUNTER = "pdf_upload_counter" + + +def _format_size(n_bytes: int) -> str: + """Human-friendly file size — KB / MB / GB.""" + size = float(n_bytes) + for unit in ("B", "KB", "MB", "GB"): + if size < 1024: + if unit == "B": + return f"{int(size)} {unit}" + return f"{size:.1f} {unit}" + size /= 1024 + return f"{size:.1f} TB" # --------------------------------------------------------------------------- @@ -143,38 +167,151 @@ with st.expander("Scan options", expanded=False): ), ) -uploads = st.file_uploader( +# Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER +# docstrings for why the counter exists. +pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {}) +upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0) +uploader_key = f"pdf_upload_v{upload_counter}" + +# Hide the file_uploader's built-in file list (Streamlit shows +# tiny chips with X buttons under its dropzone). We render our own +# Home-style list below, so suppressing the native one leaves a +# single source of truth on screen. +st.markdown( + """""", + unsafe_allow_html=True, +) + + +def _sync_pdf_uploads() -> None: + """``on_change`` callback. Adds newly-uploaded files to the + persistent stash. **Add-only** — removal happens through the + custom X buttons + counter bump, NOT through this callback. + That way the widget's hidden native X buttons can't silently + drop files behind the user's back, and we can ignore them. + """ + widget_files = st.session_state.get(uploader_key) or [] + for f in widget_files: + if f.name not in pdf_uploads: + pdf_uploads[f.name] = { + "bytes": f.getvalue(), + "size": f.size, + } + log_event( + "upload", + f"PDF: {f.name}", + filename=f.name, + bytes=f.size, + page="10_PDF_Extractor", + ) + + +st.file_uploader( "PDF file(s)", type=["pdf"], accept_multiple_files=True, + key=uploader_key, + on_change=_sync_pdf_uploads, help="Drop one or more bank-statement PDFs. Multi-file batches " "are merged into a single table with a ``source_file`` column.", ) -scan_clicked = st.button( - "Scan", type="primary", disabled=not uploads, -) + +# --------------------------------------------------------------------------- +# Custom file list (Home-style: one row per file, X to remove) +# --------------------------------------------------------------------------- + +if pdf_uploads: + n = len(pdf_uploads) + total = sum(m["size"] for m in pdf_uploads.values()) + word = "file" if n == 1 else "files" + st.markdown( + f"**{n} {word}** · {_format_size(total)} total", + ) + to_remove: str | None = None + with st.container(border=True): + for name, meta in pdf_uploads.items(): + digest = hashlib.sha1( + name.encode("utf-8"), usedforsecurity=False, + ).hexdigest()[:10] + col_x, col_name, col_size = st.columns([0.55, 8, 1.6]) + if col_x.button( + "✕", + key=f"pdf_rm_{digest}", + help=f"Remove {name}", + type="tertiary", + ): + to_remove = name + col_name.markdown(f"📄 **{name}**") + col_size.markdown( + f"
" + f"{_format_size(meta['size'])}
", + unsafe_allow_html=True, + ) + + c_scan, c_clear = st.columns([1, 4]) + with c_scan: + scan_clicked = st.button("Scan", type="primary") + with c_clear: + if st.button( + "Clear all files", + type="secondary", + help="Removes all uploaded files and the last scan result.", + ): + st.session_state[K_UPLOADS] = {} + st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1 + for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT): + st.session_state.pop(k, None) + log_event( + "upload", + "PDF list cleared", + page="10_PDF_Extractor", + count=n, + ) + st.rerun() + + if to_remove is not None: + log_event( + "upload", + f"PDF removed: {to_remove}", + filename=to_remove, + page="10_PDF_Extractor", + ) + del pdf_uploads[to_remove] + # Bump the uploader counter so the widget re-instantiates + # and forgets the removed file. Without this, the user + # would have to click the widget's own X (which is hidden) + # OR re-upload to refresh the state. + st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1 + st.rerun() +else: + st.caption("No files uploaded yet.") + scan_clicked = False # --------------------------------------------------------------------------- # Scan # --------------------------------------------------------------------------- -if scan_clicked and uploads: +if scan_clicked and pdf_uploads: all_rows: list[dict] = [] all_warnings: list[str] = [] - # Cache the raw bytes per file so the diagnostic expander can - # re-extract lines without asking the user to re-upload. - cached_bytes: list[tuple[str, bytes]] = [] + n_files = len(pdf_uploads) with st.status( - f"Scanning {len(uploads)} file(s)…", + f"Scanning {n_files} file(s)…", expanded=True, ) as status: - for i, up in enumerate(uploads, start=1): - st.write(f"**{i}/{len(uploads)}** · {up.name}") + for i, (name, meta) in enumerate(pdf_uploads.items(), start=1): + st.write(f"**{i}/{n_files}** · {name}") try: - raw = up.read() - cached_bytes.append((up.name, raw)) + raw = meta["bytes"] rows, warns = scan_pdf_for_transactions( raw, negative_in_parens=negative_in_parens, @@ -182,19 +319,19 @@ if scan_clicked and uploads: output_date_format=output_date_format, ) for r in rows: - r["source_file"] = up.name + r["source_file"] = name all_rows.extend(rows) - all_warnings.extend(f"[{up.name}] {w}" for w in warns) + all_warnings.extend(f"[{name}] {w}" for w in warns) except PdfDependencyMissing as e: - all_warnings.append(f"[{up.name}] {e}") + all_warnings.append(f"[{name}] {e}") except Exception as e: all_warnings.append( - f"[{up.name}] scan failed: {type(e).__name__}: {e}" + f"[{name}] scan failed: {type(e).__name__}: {e}" ) status.update( label=( f"Found {len(all_rows):,} candidate transactions " - f"across {len(uploads)} file(s)" + f"across {n_files} file(s)" ), state="complete", expanded=False, @@ -202,14 +339,13 @@ if scan_clicked and uploads: st.session_state[K_ROWS] = all_rows st.session_state[K_WARNINGS] = all_warnings - st.session_state[K_SOURCE_COUNT] = len(uploads) - st.session_state[K_DIAGNOSTIC] = cached_bytes + st.session_state[K_SOURCE_COUNT] = n_files log_event( "tool_run", "PDF scan", page="10_PDF_Extractor", - files=len(uploads), + files=n_files, rows=len(all_rows), warnings=len(all_warnings), ) @@ -229,7 +365,7 @@ if warnings: st.warning(w) if rows is None: - if uploads: + if pdf_uploads: st.info("Click **Scan** to detect transactions.") else: st.info("Upload one or more PDF files to begin.") @@ -242,13 +378,13 @@ elif not rows: "``has_date`` and ``has_amount`` columns to spot which " "pieces are missing (usually one or the other)." ) - cached_bytes = st.session_state.get(K_DIAGNOSTIC) or [] - if cached_bytes: + if pdf_uploads: with st.expander( "Diagnostic: what the scanner saw", expanded=True, ): - for fname, raw in cached_bytes: + for fname, meta in pdf_uploads.items(): + raw = meta["bytes"] st.markdown(f"**{fname}**") try: lines, dwarns = diagnose_pdf_lines(