diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index ffd3956..70ac27e 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -8,6 +8,7 @@ no coordinate picking. from __future__ import annotations +import hashlib import sys from datetime import datetime from pathlib import Path @@ -59,7 +60,30 @@ render_sticky_footer() K_ROWS = "pdf_scan_rows" K_WARNINGS = "pdf_scan_warnings" K_SOURCE_COUNT = "pdf_scan_source_count" -K_DIAGNOSTIC = "pdf_scan_diagnostic" +# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict +# keyed by filename → {"bytes": ..., "size": ...}). It survives +# Streamlit reruns and navigation away from the page. The +# uploader widget feeds this stash via ``_sync_pdf_uploads`` and +# the custom file list / Clear-all button operate on it. +K_UPLOADS = "pdf_uploads" +# Bumped to force the file_uploader to re-instantiate (clear its +# internal state) when the user removes a file via the custom X or +# clicks Clear-all. Streamlit's widget state is keyed on the widget +# key, so changing the key resets the widget without us having to +# touch its session-state directly (which Streamlit disallows). +K_UPLOAD_COUNTER = "pdf_upload_counter" + + +def _format_size(n_bytes: int) -> str: + """Human-friendly file size — KB / MB / GB.""" + size = float(n_bytes) + for unit in ("B", "KB", "MB", "GB"): + if size < 1024: + if unit == "B": + return f"{int(size)} {unit}" + return f"{size:.1f} {unit}" + size /= 1024 + return f"{size:.1f} TB" # --------------------------------------------------------------------------- @@ -143,38 +167,151 @@ with st.expander("Scan options", expanded=False): ), ) -uploads = st.file_uploader( +# Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER +# docstrings for why the counter exists. +pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {}) +upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0) +uploader_key = f"pdf_upload_v{upload_counter}" + +# Hide the file_uploader's built-in file list (Streamlit shows +# tiny chips with X buttons under its dropzone). We render our own +# Home-style list below, so suppressing the native one leaves a +# single source of truth on screen. +st.markdown( + """""", + unsafe_allow_html=True, +) + + +def _sync_pdf_uploads() -> None: + """``on_change`` callback. Adds newly-uploaded files to the + persistent stash. **Add-only** — removal happens through the + custom X buttons + counter bump, NOT through this callback. + That way the widget's hidden native X buttons can't silently + drop files behind the user's back, and we can ignore them. + """ + widget_files = st.session_state.get(uploader_key) or [] + for f in widget_files: + if f.name not in pdf_uploads: + pdf_uploads[f.name] = { + "bytes": f.getvalue(), + "size": f.size, + } + log_event( + "upload", + f"PDF: {f.name}", + filename=f.name, + bytes=f.size, + page="10_PDF_Extractor", + ) + + +st.file_uploader( "PDF file(s)", type=["pdf"], accept_multiple_files=True, + key=uploader_key, + on_change=_sync_pdf_uploads, help="Drop one or more bank-statement PDFs. Multi-file batches " "are merged into a single table with a ``source_file`` column.", ) -scan_clicked = st.button( - "Scan", type="primary", disabled=not uploads, -) + +# --------------------------------------------------------------------------- +# Custom file list (Home-style: one row per file, X to remove) +# --------------------------------------------------------------------------- + +if pdf_uploads: + n = len(pdf_uploads) + total = sum(m["size"] for m in pdf_uploads.values()) + word = "file" if n == 1 else "files" + st.markdown( + f"**{n} {word}** · {_format_size(total)} total", + ) + to_remove: str | None = None + with st.container(border=True): + for name, meta in pdf_uploads.items(): + digest = hashlib.sha1( + name.encode("utf-8"), usedforsecurity=False, + ).hexdigest()[:10] + col_x, col_name, col_size = st.columns([0.55, 8, 1.6]) + if col_x.button( + "✕", + key=f"pdf_rm_{digest}", + help=f"Remove {name}", + type="tertiary", + ): + to_remove = name + col_name.markdown(f"📄 **{name}**") + col_size.markdown( + f"