feat(pdf): Home-style file list + Clear-all button
User feedback: the standard file_uploader didn't visually match
the Home page, and there was no obvious way to clear out
uploaded files between scans (users had to refresh the browser tab).
**Persistent stash + add-only sync.** Files are captured into
``st.session_state["pdf_uploads"]`` (dict: name → {bytes, size})
via an ``on_change`` callback on the file_uploader widget. The
callback is **add-only** — it never removes files from the stash
based on widget state. Removal is owned by the custom X buttons
+ widget-counter bump (see below). This guarantees that a click
on the hidden native X can't silently drop files behind the
user's back.
**Hidden native file list.** A small CSS block suppresses the
file_uploader's built-in file rows + their delete buttons
(``stFileUploaderFile`` + ``stFileUploaderDeleteBtn``), so the
custom list below is the single source of truth on screen.
**Custom file list (Home pattern).** Below the dropzone, every
uploaded file gets a row: ``✕ | 📄 filename | size``. The top of
the section shows ``N files · 12.3 MB total``. Counts and sizes
update in real time as the user adds or removes files. The X
button per row calls ``log_event("upload", "PDF removed: …")``,
removes the entry from the stash, and bumps the widget counter
to clear the widget too.
**Clear-all button.** Sits next to the Scan button. Wipes the
stash, bumps the widget counter, drops any cached scan results
(``K_ROWS``, ``K_WARNINGS``, ``K_SOURCE_COUNT``). Audited via
``log_event("upload", "PDF list cleared", count=N)``.
**Widget reset via counter bump.** Streamlit disallows
programmatic mutation of widget session-state entries; the
standard workaround is to rotate the widget's ``key``. The page
maintains ``K_UPLOAD_COUNTER``, which gets incremented on
remove / clear-all, producing a fresh ``pdf_upload_v{N}`` key
and a freshly-instantiated empty widget. The stash retains any
unaffected files; on next upload, the add-only sync picks up
the new ones without re-adding the removed ones.
**Scan rewired to read the stash.** Instead of iterating the
widget's UploadedFile objects (which the previous code did and
which broke when the widget unmounted on remove), the scan
loop iterates ``pdf_uploads.items()`` and uses the cached
``bytes``. Diagnostic expander does the same — re-reads from
the stash, removing the need for a separate ``K_DIAGNOSTIC``
cache (deleted).
**``_format_size`` helper** ports the byte-formatting logic
from ``_home.py``'s pattern (KB / MB / GB rollover).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,7 @@ no coordinate picking.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import sys
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -59,7 +60,30 @@ render_sticky_footer()
|
|||||||
K_ROWS = "pdf_scan_rows"
|
K_ROWS = "pdf_scan_rows"
|
||||||
K_WARNINGS = "pdf_scan_warnings"
|
K_WARNINGS = "pdf_scan_warnings"
|
||||||
K_SOURCE_COUNT = "pdf_scan_source_count"
|
K_SOURCE_COUNT = "pdf_scan_source_count"
|
||||||
K_DIAGNOSTIC = "pdf_scan_diagnostic"
|
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
|
||||||
|
# keyed by filename → {"bytes": ..., "size": ...}). It survives
|
||||||
|
# Streamlit reruns and navigation away from the page. The
|
||||||
|
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
|
||||||
|
# the custom file list / Clear-all button operate on it.
|
||||||
|
K_UPLOADS = "pdf_uploads"
|
||||||
|
# Bumped to force the file_uploader to re-instantiate (clear its
|
||||||
|
# internal state) when the user removes a file via the custom X or
|
||||||
|
# clicks Clear-all. Streamlit's widget state is keyed on the widget
|
||||||
|
# key, so changing the key resets the widget without us having to
|
||||||
|
# touch its session-state directly (which Streamlit disallows).
|
||||||
|
K_UPLOAD_COUNTER = "pdf_upload_counter"
|
||||||
|
|
||||||
|
|
||||||
|
def _format_size(n_bytes: int) -> str:
|
||||||
|
"""Human-friendly file size — KB / MB / GB."""
|
||||||
|
size = float(n_bytes)
|
||||||
|
for unit in ("B", "KB", "MB", "GB"):
|
||||||
|
if size < 1024:
|
||||||
|
if unit == "B":
|
||||||
|
return f"{int(size)} {unit}"
|
||||||
|
return f"{size:.1f} {unit}"
|
||||||
|
size /= 1024
|
||||||
|
return f"{size:.1f} TB"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -143,38 +167,151 @@ with st.expander("Scan options", expanded=False):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
uploads = st.file_uploader(
|
# Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER
|
||||||
|
# comments for why the counter exists.
|
||||||
|
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
|
||||||
|
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
|
||||||
|
uploader_key = f"pdf_upload_v{upload_counter}"
|
||||||
|
|
||||||
|
# Hide the file_uploader's built-in file list (Streamlit shows
|
||||||
|
# tiny chips with X buttons under its dropzone). We render our own
|
||||||
|
# Home-style list below, so suppressing the native one leaves a
|
||||||
|
# single source of truth on screen.
|
||||||
|
st.markdown(
|
||||||
|
"""<style>
|
||||||
|
[data-testid="stFileUploader"] [data-testid="stFileUploaderFile"] {
|
||||||
|
display: none !important;
|
||||||
|
}
|
||||||
|
[data-testid="stFileUploader"] [data-testid="stFileUploaderDeleteBtn"] {
|
||||||
|
display: none !important;
|
||||||
|
}
|
||||||
|
</style>""",
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_pdf_uploads() -> None:
    """``on_change`` callback for the PDF uploader widget.

    Copies every file currently held by the widget into the persistent
    stash unless an entry with the same name is already present, and
    logs an ``upload`` event for each newly stashed file.

    The sync is **add-only**: nothing is ever deleted from the stash
    here. Removal happens through the custom X buttons + counter bump,
    so the widget's hidden native X buttons can't silently drop files
    behind the user's back.
    """
    for uploaded in st.session_state.get(uploader_key) or []:
        filename = uploaded.name
        if filename in pdf_uploads:
            # Already stashed under this name — keep the existing entry.
            continue
        pdf_uploads[filename] = {
            "bytes": uploaded.getvalue(),
            "size": uploaded.size,
        }
        log_event(
            "upload",
            f"PDF: {filename}",
            filename=filename,
            bytes=uploaded.size,
            page="10_PDF_Extractor",
        )
|
||||||
|
|
||||||
|
|
||||||
|
st.file_uploader(
|
||||||
"PDF file(s)",
|
"PDF file(s)",
|
||||||
type=["pdf"],
|
type=["pdf"],
|
||||||
accept_multiple_files=True,
|
accept_multiple_files=True,
|
||||||
|
key=uploader_key,
|
||||||
|
on_change=_sync_pdf_uploads,
|
||||||
help="Drop one or more bank-statement PDFs. Multi-file batches "
|
help="Drop one or more bank-statement PDFs. Multi-file batches "
|
||||||
"are merged into a single table with a ``source_file`` column.",
|
"are merged into a single table with a ``source_file`` column.",
|
||||||
)
|
)
|
||||||
|
|
||||||
scan_clicked = st.button(
|
|
||||||
"Scan", type="primary", disabled=not uploads,
|
# ---------------------------------------------------------------------------
|
||||||
)
|
# Custom file list (Home-style: one row per file, X to remove)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
if pdf_uploads:
|
||||||
|
n = len(pdf_uploads)
|
||||||
|
total = sum(m["size"] for m in pdf_uploads.values())
|
||||||
|
word = "file" if n == 1 else "files"
|
||||||
|
st.markdown(
|
||||||
|
f"**{n} {word}** · {_format_size(total)} total",
|
||||||
|
)
|
||||||
|
to_remove: str | None = None
|
||||||
|
with st.container(border=True):
|
||||||
|
for name, meta in pdf_uploads.items():
|
||||||
|
digest = hashlib.sha1(
|
||||||
|
name.encode("utf-8"), usedforsecurity=False,
|
||||||
|
).hexdigest()[:10]
|
||||||
|
col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
|
||||||
|
if col_x.button(
|
||||||
|
"✕",
|
||||||
|
key=f"pdf_rm_{digest}",
|
||||||
|
help=f"Remove {name}",
|
||||||
|
type="tertiary",
|
||||||
|
):
|
||||||
|
to_remove = name
|
||||||
|
col_name.markdown(f"📄 **{name}**")
|
||||||
|
col_size.markdown(
|
||||||
|
f"<div style='text-align:right;color:#5a5f6e;'>"
|
||||||
|
f"{_format_size(meta['size'])}</div>",
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
c_scan, c_clear = st.columns([1, 4])
|
||||||
|
with c_scan:
|
||||||
|
scan_clicked = st.button("Scan", type="primary")
|
||||||
|
with c_clear:
|
||||||
|
if st.button(
|
||||||
|
"Clear all files",
|
||||||
|
type="secondary",
|
||||||
|
help="Removes all uploaded files and the last scan result.",
|
||||||
|
):
|
||||||
|
st.session_state[K_UPLOADS] = {}
|
||||||
|
st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
|
||||||
|
for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT):
|
||||||
|
st.session_state.pop(k, None)
|
||||||
|
log_event(
|
||||||
|
"upload",
|
||||||
|
"PDF list cleared",
|
||||||
|
page="10_PDF_Extractor",
|
||||||
|
count=n,
|
||||||
|
)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if to_remove is not None:
|
||||||
|
log_event(
|
||||||
|
"upload",
|
||||||
|
f"PDF removed: {to_remove}",
|
||||||
|
filename=to_remove,
|
||||||
|
page="10_PDF_Extractor",
|
||||||
|
)
|
||||||
|
del pdf_uploads[to_remove]
|
||||||
|
# Bump the uploader counter so the widget re-instantiates
|
||||||
|
# and forgets the removed file. Without this, the user
|
||||||
|
# would have to click the widget's own X (which is hidden)
|
||||||
|
# OR re-upload to refresh the state.
|
||||||
|
st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
|
||||||
|
st.rerun()
|
||||||
|
else:
|
||||||
|
st.caption("No files uploaded yet.")
|
||||||
|
scan_clicked = False
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Scan
|
# Scan
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
if scan_clicked and uploads:
|
if scan_clicked and pdf_uploads:
|
||||||
all_rows: list[dict] = []
|
all_rows: list[dict] = []
|
||||||
all_warnings: list[str] = []
|
all_warnings: list[str] = []
|
||||||
# Cache the raw bytes per file so the diagnostic expander can
|
n_files = len(pdf_uploads)
|
||||||
# re-extract lines without asking the user to re-upload.
|
|
||||||
cached_bytes: list[tuple[str, bytes]] = []
|
|
||||||
with st.status(
|
with st.status(
|
||||||
f"Scanning {len(uploads)} file(s)…",
|
f"Scanning {n_files} file(s)…",
|
||||||
expanded=True,
|
expanded=True,
|
||||||
) as status:
|
) as status:
|
||||||
for i, up in enumerate(uploads, start=1):
|
for i, (name, meta) in enumerate(pdf_uploads.items(), start=1):
|
||||||
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
st.write(f"**{i}/{n_files}** · {name}")
|
||||||
try:
|
try:
|
||||||
raw = up.read()
|
raw = meta["bytes"]
|
||||||
cached_bytes.append((up.name, raw))
|
|
||||||
rows, warns = scan_pdf_for_transactions(
|
rows, warns = scan_pdf_for_transactions(
|
||||||
raw,
|
raw,
|
||||||
negative_in_parens=negative_in_parens,
|
negative_in_parens=negative_in_parens,
|
||||||
@@ -182,19 +319,19 @@ if scan_clicked and uploads:
|
|||||||
output_date_format=output_date_format,
|
output_date_format=output_date_format,
|
||||||
)
|
)
|
||||||
for r in rows:
|
for r in rows:
|
||||||
r["source_file"] = up.name
|
r["source_file"] = name
|
||||||
all_rows.extend(rows)
|
all_rows.extend(rows)
|
||||||
all_warnings.extend(f"[{up.name}] {w}" for w in warns)
|
all_warnings.extend(f"[{name}] {w}" for w in warns)
|
||||||
except PdfDependencyMissing as e:
|
except PdfDependencyMissing as e:
|
||||||
all_warnings.append(f"[{up.name}] {e}")
|
all_warnings.append(f"[{name}] {e}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
all_warnings.append(
|
all_warnings.append(
|
||||||
f"[{up.name}] scan failed: {type(e).__name__}: {e}"
|
f"[{name}] scan failed: {type(e).__name__}: {e}"
|
||||||
)
|
)
|
||||||
status.update(
|
status.update(
|
||||||
label=(
|
label=(
|
||||||
f"Found {len(all_rows):,} candidate transactions "
|
f"Found {len(all_rows):,} candidate transactions "
|
||||||
f"across {len(uploads)} file(s)"
|
f"across {n_files} file(s)"
|
||||||
),
|
),
|
||||||
state="complete",
|
state="complete",
|
||||||
expanded=False,
|
expanded=False,
|
||||||
@@ -202,14 +339,13 @@ if scan_clicked and uploads:
|
|||||||
|
|
||||||
st.session_state[K_ROWS] = all_rows
|
st.session_state[K_ROWS] = all_rows
|
||||||
st.session_state[K_WARNINGS] = all_warnings
|
st.session_state[K_WARNINGS] = all_warnings
|
||||||
st.session_state[K_SOURCE_COUNT] = len(uploads)
|
st.session_state[K_SOURCE_COUNT] = n_files
|
||||||
st.session_state[K_DIAGNOSTIC] = cached_bytes
|
|
||||||
|
|
||||||
log_event(
|
log_event(
|
||||||
"tool_run",
|
"tool_run",
|
||||||
"PDF scan",
|
"PDF scan",
|
||||||
page="10_PDF_Extractor",
|
page="10_PDF_Extractor",
|
||||||
files=len(uploads),
|
files=n_files,
|
||||||
rows=len(all_rows),
|
rows=len(all_rows),
|
||||||
warnings=len(all_warnings),
|
warnings=len(all_warnings),
|
||||||
)
|
)
|
||||||
@@ -229,7 +365,7 @@ if warnings:
|
|||||||
st.warning(w)
|
st.warning(w)
|
||||||
|
|
||||||
if rows is None:
|
if rows is None:
|
||||||
if uploads:
|
if pdf_uploads:
|
||||||
st.info("Click **Scan** to detect transactions.")
|
st.info("Click **Scan** to detect transactions.")
|
||||||
else:
|
else:
|
||||||
st.info("Upload one or more PDF files to begin.")
|
st.info("Upload one or more PDF files to begin.")
|
||||||
@@ -242,13 +378,13 @@ elif not rows:
|
|||||||
"``has_date`` and ``has_amount`` columns to spot which "
|
"``has_date`` and ``has_amount`` columns to spot which "
|
||||||
"pieces are missing (usually one or the other)."
|
"pieces are missing (usually one or the other)."
|
||||||
)
|
)
|
||||||
cached_bytes = st.session_state.get(K_DIAGNOSTIC) or []
|
if pdf_uploads:
|
||||||
if cached_bytes:
|
|
||||||
with st.expander(
|
with st.expander(
|
||||||
"Diagnostic: what the scanner saw",
|
"Diagnostic: what the scanner saw",
|
||||||
expanded=True,
|
expanded=True,
|
||||||
):
|
):
|
||||||
for fname, raw in cached_bytes:
|
for fname, meta in pdf_uploads.items():
|
||||||
|
raw = meta["bytes"]
|
||||||
st.markdown(f"**{fname}**")
|
st.markdown(f"**{fname}**")
|
||||||
try:
|
try:
|
||||||
lines, dwarns = diagnose_pdf_lines(
|
lines, dwarns = diagnose_pdf_lines(
|
||||||
|
|||||||
Reference in New Issue
Block a user