feat(pdf): batch extract polish — ZIP output, sort-by-date, status block

Phase 4/6. Polishes the batch workflow shipped in commit 3:

- **st.status progress block** replaces the simple progress bar.
  Each file appears as its own line as it's processed; the block
  auto-collapses on completion with a "Done · 12/13 extracted"
  summary and turns red unless every file came back "ok" — i.e.
  if any file errored or yielded no rows.
- **Sort combined output by date** checkbox (default ON) sorts
  the merged CSV ascending by date using a stable sort, with
  source_file as the secondary key, so multiple statements
  interleave by date but same-day rows from the same file stay
  together. Rows with a missing date sort last.
- **ZIP-of-per-PDF-CSVs output option** alongside the combined
  CSV. When the accountant has 12 statements from 12 different
  account periods and wants to feed them into 12 separate ledger
  imports, the ZIP keeps each file's rows in its own CSV named
  after the original PDF stem.
- **Per-file summary table** (already present from commit 3) is
  upgraded with a ``status`` column ("ok" / "no rows" /
  "error: ExceptionName") and a ``warnings`` count, so error
  grouping is obvious at a glance.

Cancellation is intentionally not added — Streamlit's single-
threaded rerun model has no clean way to interrupt a tool-run
mid-stream without architectural changes to extraction. If a
user mis-fires Extract on 50 PDFs, they can refresh the browser
tab; the task will be killed when the next interaction comes in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 22:51:05 +00:00
parent 2f349e8191
commit 5a8e2ec9e1

View File

@@ -127,6 +127,26 @@ def _render_extract_mode() -> None:
),
)
c1, c2 = st.columns(2)
sort_by_date = c1.checkbox(
"Sort combined output by date",
value=True,
help=(
"Sorts the combined CSV ascending by the ``date`` column "
"after extraction. Off → preserve per-PDF order."
),
)
output_shape = c2.radio(
"Output",
["Combined CSV", "ZIP of per-PDF CSVs"],
horizontal=True,
help=(
"Combined: one CSV with a ``source_file`` column. "
"ZIP: one CSV per source PDF, useful when feeding files "
"back into separate ledgers."
),
)
run = st.button("Extract", type="primary", disabled=not uploads)
if run and uploads:
try:
@@ -138,39 +158,65 @@ def _render_extract_mode() -> None:
per_file_frames: list[pd.DataFrame] = []
all_warnings: list[str] = []
files_meta: list[dict] = []
progress = st.progress(0.0, text="Reading PDFs…")
for i, up in enumerate(uploads, start=1):
try:
pdf_bytes = up.read()
pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
df = apply_template(pages, tpl)
df.insert(0, "source_file", up.name)
per_file_frames.append(df)
files_meta.append({
"file": up.name,
"rows": len(df),
"pages": len(pages),
})
for w in warns:
all_warnings.append(f"[{up.name}] {w}")
except Exception as e:
all_warnings.append(
f"[{up.name}] extraction failed: "
f"{type(e).__name__}: {e}"
)
files_meta.append({
"file": up.name, "rows": 0, "pages": 0, "error": str(e),
})
progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}")
progress.empty()
with st.status(
f"Extracting {len(uploads)} file(s)…",
expanded=True,
) as status:
for i, up in enumerate(uploads, start=1):
st.write(f"**{i}/{len(uploads)}** · {up.name}")
try:
pdf_bytes = up.read()
pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
df = apply_template(pages, tpl)
df.insert(0, "source_file", up.name)
per_file_frames.append(df)
files_meta.append({
"file": up.name,
"pages": len(pages),
"rows": len(df),
"warnings": len(warns),
"status": "ok" if len(df) else "no rows",
})
for w in warns:
all_warnings.append(f"[{up.name}] {w}")
except Exception as e:
all_warnings.append(
f"[{up.name}] extraction failed: "
f"{type(e).__name__}: {e}"
)
files_meta.append({
"file": up.name,
"pages": 0,
"rows": 0,
"warnings": 1,
"status": f"error: {type(e).__name__}",
})
ok_count = sum(1 for m in files_meta if m["status"] == "ok")
status.update(
label=f"Done · {ok_count}/{len(uploads)} extracted",
state="complete" if ok_count == len(uploads) else "error",
expanded=False,
)
if per_file_frames:
combined = pd.concat(per_file_frames, ignore_index=True)
if sort_by_date and "date" in combined.columns:
combined = combined.sort_values(
by=["date", "source_file"],
kind="mergesort",
na_position="last",
).reset_index(drop=True)
else:
combined = pd.DataFrame()
st.session_state[K_EXTRACT_DF] = combined
st.session_state[K_EXTRACT_WARNINGS] = all_warnings
st.session_state[K_EXTRACT_FILES] = files_meta
st.session_state["pdf_extract_output_shape"] = output_shape
st.session_state["pdf_extract_per_file"] = [
(m["file"], per_file_frames[i])
for i, m in enumerate(files_meta)
if m["status"] == "ok"
]
log_event(
"tool_run",
@@ -179,6 +225,7 @@ def _render_extract_mode() -> None:
template=slug,
files=len(uploads),
rows=len(combined),
output_shape=output_shape,
)
df = st.session_state.get(K_EXTRACT_DF)
@@ -206,15 +253,42 @@ def _render_extract_mode() -> None:
else:
st.markdown(f"#### Extracted rows ({len(df):,})")
st.dataframe(df, hide_index=True, use_container_width=True)
csv_bytes = df.to_csv(index=False).encode("utf-8")
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
st.download_button(
"Download CSV",
data=csv_bytes,
file_name=f"transactions-{slug}-{ts}.csv",
mime="text/csv",
type="primary",
output_shape = st.session_state.get(
"pdf_extract_output_shape", "Combined CSV",
)
if output_shape == "ZIP of per-PDF CSVs":
import zipfile
per_file = st.session_state.get("pdf_extract_per_file") or []
if not per_file:
st.warning("No per-file CSVs to bundle.")
else:
buf = io.BytesIO()
with zipfile.ZipFile(
buf, "w", zipfile.ZIP_DEFLATED,
) as zf:
for name, sub_df in per_file:
stem = Path(name).stem or "transactions"
zf.writestr(
f"{stem}.csv",
sub_df.to_csv(index=False),
)
st.download_button(
f"Download ZIP ({len(per_file)} files)",
data=buf.getvalue(),
file_name=f"transactions-{slug}-{ts}.zip",
mime="application/zip",
type="primary",
)
else:
csv_bytes = df.to_csv(index=False).encode("utf-8")
st.download_button(
"Download CSV",
data=csv_bytes,
file_name=f"transactions-{slug}-{ts}.csv",
mime="text/csv",
type="primary",
)
# ===========================================================================