feat(pdf): Home-style file list + Clear-all button
User feedback: the standard file_uploader didn't visually match
the Home page, and there was no obvious way to clear out
uploaded files between scans (users had to refresh the browser tab).
**Persistent stash + add-only sync.** Files are captured into
``st.session_state["pdf_uploads"]`` (dict name → {bytes, size})
via an ``on_change`` callback on the file_uploader widget. The
callback is **add-only** — never removes files from the stash
based on widget state. Removal is owned by the custom X buttons
+ widget-counter bump (see below). This guarantees a hidden
native X click can't silently drop files behind the user's
back.
**Hidden native file list.** A small CSS block suppresses the
file_uploader's built-in file rows + their delete buttons
(``stFileUploaderFile`` + ``stFileUploaderDeleteBtn``), so the
custom list below is the single source of truth on screen.
**Custom file list (Home pattern).** Below the dropzone, every
uploaded file gets a row: ``✕ | 📄 filename | size``. Top of
section shows ``N files · 12.3 MB total``. Counts and sizes
update in real time as the user adds or removes files. The X
button per row calls ``log_event("upload", "PDF removed: …")``,
removes the entry from the stash, and bumps the widget counter
to clear the widget too.
**Clear-all button.** Sits next to the Scan button. Wipes the
stash, bumps the widget counter, drops any cached scan results
(``K_ROWS``, ``K_WARNINGS``, ``K_SOURCE_COUNT``). Audited via
``log_event("upload", "PDF list cleared", count=N)``.
**Widget reset via counter bump.** Streamlit disallows
programmatic mutation of widget session-state entries; the
standard workaround is to rotate the widget's ``key``. The page
maintains ``K_UPLOAD_COUNTER`` which gets incremented on
remove / clear-all, producing a fresh ``pdf_upload_v{N}`` key
and a freshly-instantiated empty widget. The stash retains any
unaffected files; on next upload, the add-only sync picks up
the new ones without re-adding the removed ones.
**Scan rewired to read the stash.** Instead of iterating the
widget's UploadedFile objects (which the previous code did and
which broke when the widget unmounted on remove), the scan
loop iterates ``pdf_uploads.items()`` and uses the cached
``bytes``. Diagnostic expander does the same — re-reads from
the stash, removing the need for a separate ``K_DIAGNOSTIC``
cache (deleted).
**``_format_size`` helper** ports the byte-formatting logic
from ``_home.py``'s pattern (KB / MB / GB rollover).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ no coordinate picking.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -59,7 +60,30 @@ render_sticky_footer()
|
||||
K_ROWS = "pdf_scan_rows"
|
||||
K_WARNINGS = "pdf_scan_warnings"
|
||||
K_SOURCE_COUNT = "pdf_scan_source_count"
|
||||
K_DIAGNOSTIC = "pdf_scan_diagnostic"
|
||||
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
|
||||
# keyed by filename → {"bytes": ..., "size": ...}). It survives
|
||||
# Streamlit reruns and navigation away from the page. The
|
||||
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
|
||||
# the custom file list / Clear-all button operate on it.
|
||||
K_UPLOADS = "pdf_uploads"
|
||||
# Bumped to force the file_uploader to re-instantiate (clear its
|
||||
# internal state) when the user removes a file via the custom X or
|
||||
# clicks Clear-all. Streamlit's widget state is keyed on the widget
|
||||
# key, so changing the key resets the widget without us having to
|
||||
# touch its session-state directly (which Streamlit disallows).
|
||||
K_UPLOAD_COUNTER = "pdf_upload_counter"
|
||||
|
||||
|
||||
def _format_size(n_bytes: int) -> str:
|
||||
"""Human-friendly file size — KB / MB / GB."""
|
||||
size = float(n_bytes)
|
||||
for unit in ("B", "KB", "MB", "GB"):
|
||||
if size < 1024:
|
||||
if unit == "B":
|
||||
return f"{int(size)} {unit}"
|
||||
return f"{size:.1f} {unit}"
|
||||
size /= 1024
|
||||
return f"{size:.1f} TB"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -143,38 +167,151 @@ with st.expander("Scan options", expanded=False):
|
||||
),
|
||||
)
|
||||
|
||||
uploads = st.file_uploader(
|
||||
# Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER
|
||||
# comments for why the counter exists.
|
||||
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
|
||||
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
|
||||
uploader_key = f"pdf_upload_v{upload_counter}"
|
||||
|
||||
# Hide the file_uploader's built-in file list (Streamlit shows
|
||||
# tiny chips with X buttons under its dropzone). We render our own
|
||||
# Home-style list below, so suppressing the native one leaves a
|
||||
# single source of truth on screen.
|
||||
st.markdown(
|
||||
"""<style>
|
||||
[data-testid="stFileUploader"] [data-testid="stFileUploaderFile"] {
|
||||
display: none !important;
|
||||
}
|
||||
[data-testid="stFileUploader"] [data-testid="stFileUploaderDeleteBtn"] {
|
||||
display: none !important;
|
||||
}
|
||||
</style>""",
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
|
||||
|
||||
def _sync_pdf_uploads() -> None:
    """Copy newly uploaded PDFs from the uploader widget into the stash.

    Registered as the file_uploader's ``on_change`` callback. The sync is
    **add-only**: a file that is absent from the widget is never dropped
    from the stash here. Removal goes through the custom X buttons and
    the Clear-all button instead, so the widget's hidden native X buttons
    cannot silently discard files.
    """
    current_files = st.session_state.get(uploader_key) or []
    for uploaded in current_files:
        if uploaded.name in pdf_uploads:
            # Already stashed under this filename; keep the existing entry.
            continue
        pdf_uploads[uploaded.name] = {
            "bytes": uploaded.getvalue(),
            "size": uploaded.size,
        }
        log_event(
            "upload",
            f"PDF: {uploaded.name}",
            filename=uploaded.name,
            bytes=uploaded.size,
            page="10_PDF_Extractor",
        )
|
||||
|
||||
|
||||
st.file_uploader(
|
||||
"PDF file(s)",
|
||||
type=["pdf"],
|
||||
accept_multiple_files=True,
|
||||
key=uploader_key,
|
||||
on_change=_sync_pdf_uploads,
|
||||
help="Drop one or more bank-statement PDFs. Multi-file batches "
|
||||
"are merged into a single table with a ``source_file`` column.",
|
||||
)
|
||||
|
||||
scan_clicked = st.button(
|
||||
"Scan", type="primary", disabled=not uploads,
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Custom file list (Home-style: one row per file, X to remove)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if pdf_uploads:
|
||||
n = len(pdf_uploads)
|
||||
total = sum(m["size"] for m in pdf_uploads.values())
|
||||
word = "file" if n == 1 else "files"
|
||||
st.markdown(
|
||||
f"**{n} {word}** · {_format_size(total)} total",
|
||||
)
|
||||
to_remove: str | None = None
|
||||
with st.container(border=True):
|
||||
for name, meta in pdf_uploads.items():
|
||||
digest = hashlib.sha1(
|
||||
name.encode("utf-8"), usedforsecurity=False,
|
||||
).hexdigest()[:10]
|
||||
col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
|
||||
if col_x.button(
|
||||
"✕",
|
||||
key=f"pdf_rm_{digest}",
|
||||
help=f"Remove {name}",
|
||||
type="tertiary",
|
||||
):
|
||||
to_remove = name
|
||||
col_name.markdown(f"📄 **{name}**")
|
||||
col_size.markdown(
|
||||
f"<div style='text-align:right;color:#5a5f6e;'>"
|
||||
f"{_format_size(meta['size'])}</div>",
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
|
||||
c_scan, c_clear = st.columns([1, 4])
|
||||
with c_scan:
|
||||
scan_clicked = st.button("Scan", type="primary")
|
||||
with c_clear:
|
||||
if st.button(
|
||||
"Clear all files",
|
||||
type="secondary",
|
||||
help="Removes all uploaded files and the last scan result.",
|
||||
):
|
||||
st.session_state[K_UPLOADS] = {}
|
||||
st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
|
||||
for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT):
|
||||
st.session_state.pop(k, None)
|
||||
log_event(
|
||||
"upload",
|
||||
"PDF list cleared",
|
||||
page="10_PDF_Extractor",
|
||||
count=n,
|
||||
)
|
||||
st.rerun()
|
||||
|
||||
if to_remove is not None:
|
||||
log_event(
|
||||
"upload",
|
||||
f"PDF removed: {to_remove}",
|
||||
filename=to_remove,
|
||||
page="10_PDF_Extractor",
|
||||
)
|
||||
del pdf_uploads[to_remove]
|
||||
# Bump the uploader counter so the widget re-instantiates
|
||||
# and forgets the removed file. Without this, the user
|
||||
# would have to click the widget's own X (which is hidden)
|
||||
# OR re-upload to refresh the state.
|
||||
st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
|
||||
st.rerun()
|
||||
else:
|
||||
st.caption("No files uploaded yet.")
|
||||
scan_clicked = False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scan
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if scan_clicked and uploads:
|
||||
if scan_clicked and pdf_uploads:
|
||||
all_rows: list[dict] = []
|
||||
all_warnings: list[str] = []
|
||||
# Cache the raw bytes per file so the diagnostic expander can
|
||||
# re-extract lines without asking the user to re-upload.
|
||||
cached_bytes: list[tuple[str, bytes]] = []
|
||||
n_files = len(pdf_uploads)
|
||||
with st.status(
|
||||
f"Scanning {len(uploads)} file(s)…",
|
||||
f"Scanning {n_files} file(s)…",
|
||||
expanded=True,
|
||||
) as status:
|
||||
for i, up in enumerate(uploads, start=1):
|
||||
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
||||
for i, (name, meta) in enumerate(pdf_uploads.items(), start=1):
|
||||
st.write(f"**{i}/{n_files}** · {name}")
|
||||
try:
|
||||
raw = up.read()
|
||||
cached_bytes.append((up.name, raw))
|
||||
raw = meta["bytes"]
|
||||
rows, warns = scan_pdf_for_transactions(
|
||||
raw,
|
||||
negative_in_parens=negative_in_parens,
|
||||
@@ -182,19 +319,19 @@ if scan_clicked and uploads:
|
||||
output_date_format=output_date_format,
|
||||
)
|
||||
for r in rows:
|
||||
r["source_file"] = up.name
|
||||
r["source_file"] = name
|
||||
all_rows.extend(rows)
|
||||
all_warnings.extend(f"[{up.name}] {w}" for w in warns)
|
||||
all_warnings.extend(f"[{name}] {w}" for w in warns)
|
||||
except PdfDependencyMissing as e:
|
||||
all_warnings.append(f"[{up.name}] {e}")
|
||||
all_warnings.append(f"[{name}] {e}")
|
||||
except Exception as e:
|
||||
all_warnings.append(
|
||||
f"[{up.name}] scan failed: {type(e).__name__}: {e}"
|
||||
f"[{name}] scan failed: {type(e).__name__}: {e}"
|
||||
)
|
||||
status.update(
|
||||
label=(
|
||||
f"Found {len(all_rows):,} candidate transactions "
|
||||
f"across {len(uploads)} file(s)"
|
||||
f"across {n_files} file(s)"
|
||||
),
|
||||
state="complete",
|
||||
expanded=False,
|
||||
@@ -202,14 +339,13 @@ if scan_clicked and uploads:
|
||||
|
||||
st.session_state[K_ROWS] = all_rows
|
||||
st.session_state[K_WARNINGS] = all_warnings
|
||||
st.session_state[K_SOURCE_COUNT] = len(uploads)
|
||||
st.session_state[K_DIAGNOSTIC] = cached_bytes
|
||||
st.session_state[K_SOURCE_COUNT] = n_files
|
||||
|
||||
log_event(
|
||||
"tool_run",
|
||||
"PDF scan",
|
||||
page="10_PDF_Extractor",
|
||||
files=len(uploads),
|
||||
files=n_files,
|
||||
rows=len(all_rows),
|
||||
warnings=len(all_warnings),
|
||||
)
|
||||
@@ -229,7 +365,7 @@ if warnings:
|
||||
st.warning(w)
|
||||
|
||||
if rows is None:
|
||||
if uploads:
|
||||
if pdf_uploads:
|
||||
st.info("Click **Scan** to detect transactions.")
|
||||
else:
|
||||
st.info("Upload one or more PDF files to begin.")
|
||||
@@ -242,13 +378,13 @@ elif not rows:
|
||||
"``has_date`` and ``has_amount`` columns to spot which "
|
||||
"pieces are missing (usually one or the other)."
|
||||
)
|
||||
cached_bytes = st.session_state.get(K_DIAGNOSTIC) or []
|
||||
if cached_bytes:
|
||||
if pdf_uploads:
|
||||
with st.expander(
|
||||
"Diagnostic: what the scanner saw",
|
||||
expanded=True,
|
||||
):
|
||||
for fname, raw in cached_bytes:
|
||||
for fname, meta in pdf_uploads.items():
|
||||
raw = meta["bytes"]
|
||||
st.markdown(f"**{fname}**")
|
||||
try:
|
||||
lines, dwarns = diagnose_pdf_lines(
|
||||
|
||||
Reference in New Issue
Block a user