feat(pdf): batch extract polish — ZIP output, sort-by-date, status block
Phase 4/6. Polishes the batch workflow shipped in commit 3:
- **st.status progress block** replaces the simple progress bar.
Each file appears as its own line as it's processed; the block
auto-collapses on completion with a "12/13 extracted" summary
and turns red if any file errored or produced no rows (only
files with status "ok" count as extracted).
- **Sort combined output by date** checkbox (default ON) sorts
the merged CSV ascending by date, with source_file as the
secondary sort key. The sort is stable, so multiple statements
interleave by date while same-day rows from the same file stay
together in their original order.
- **ZIP-of-per-PDF-CSVs output option** alongside the combined
CSV. When the accountant has 12 statements from 12 different
account periods and wants to feed them into 12 separate ledger
imports, the ZIP keeps each file's rows in its own CSV named
after the original PDF stem.
- **Per-file summary table** (already present from commit 3) is
upgraded with a ``status`` column ("ok" / "no rows" /
"error: ExceptionName") so error grouping is obvious at a
glance.
Cancellation is intentionally not added — Streamlit's single-
thread rerun model has no clean way to interrupt a tool-run
mid-stream without architectural changes to extraction. If a
user mis-fires Extract on 50 PDFs they can refresh the browser
tab; the task will be killed when the next interaction comes in.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -127,6 +127,26 @@ def _render_extract_mode() -> None:
|
||||
),
|
||||
)
|
||||
|
||||
c1, c2 = st.columns(2)
|
||||
sort_by_date = c1.checkbox(
|
||||
"Sort combined output by date",
|
||||
value=True,
|
||||
help=(
|
||||
"Sorts the combined CSV ascending by the ``date`` column "
|
||||
"after extraction. Off → preserve per-PDF order."
|
||||
),
|
||||
)
|
||||
output_shape = c2.radio(
|
||||
"Output",
|
||||
["Combined CSV", "ZIP of per-PDF CSVs"],
|
||||
horizontal=True,
|
||||
help=(
|
||||
"Combined: one CSV with a ``source_file`` column. "
|
||||
"ZIP: one CSV per source PDF, useful when feeding files "
|
||||
"back into separate ledgers."
|
||||
),
|
||||
)
|
||||
|
||||
run = st.button("Extract", type="primary", disabled=not uploads)
|
||||
if run and uploads:
|
||||
try:
|
||||
@@ -138,39 +158,65 @@ def _render_extract_mode() -> None:
|
||||
per_file_frames: list[pd.DataFrame] = []
|
||||
all_warnings: list[str] = []
|
||||
files_meta: list[dict] = []
|
||||
progress = st.progress(0.0, text="Reading PDFs…")
|
||||
for i, up in enumerate(uploads, start=1):
|
||||
try:
|
||||
pdf_bytes = up.read()
|
||||
pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
|
||||
df = apply_template(pages, tpl)
|
||||
df.insert(0, "source_file", up.name)
|
||||
per_file_frames.append(df)
|
||||
files_meta.append({
|
||||
"file": up.name,
|
||||
"rows": len(df),
|
||||
"pages": len(pages),
|
||||
})
|
||||
for w in warns:
|
||||
all_warnings.append(f"[{up.name}] {w}")
|
||||
except Exception as e:
|
||||
all_warnings.append(
|
||||
f"[{up.name}] extraction failed: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
files_meta.append({
|
||||
"file": up.name, "rows": 0, "pages": 0, "error": str(e),
|
||||
})
|
||||
progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}")
|
||||
progress.empty()
|
||||
with st.status(
|
||||
f"Extracting {len(uploads)} file(s)…",
|
||||
expanded=True,
|
||||
) as status:
|
||||
for i, up in enumerate(uploads, start=1):
|
||||
st.write(f"**{i}/{len(uploads)}** · {up.name}")
|
||||
try:
|
||||
pdf_bytes = up.read()
|
||||
pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
|
||||
df = apply_template(pages, tpl)
|
||||
df.insert(0, "source_file", up.name)
|
||||
per_file_frames.append(df)
|
||||
files_meta.append({
|
||||
"file": up.name,
|
||||
"pages": len(pages),
|
||||
"rows": len(df),
|
||||
"warnings": len(warns),
|
||||
"status": "ok" if len(df) else "no rows",
|
||||
})
|
||||
for w in warns:
|
||||
all_warnings.append(f"[{up.name}] {w}")
|
||||
except Exception as e:
|
||||
all_warnings.append(
|
||||
f"[{up.name}] extraction failed: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
files_meta.append({
|
||||
"file": up.name,
|
||||
"pages": 0,
|
||||
"rows": 0,
|
||||
"warnings": 1,
|
||||
"status": f"error: {type(e).__name__}",
|
||||
})
|
||||
ok_count = sum(1 for m in files_meta if m["status"] == "ok")
|
||||
status.update(
|
||||
label=f"Done · {ok_count}/{len(uploads)} extracted",
|
||||
state="complete" if ok_count == len(uploads) else "error",
|
||||
expanded=False,
|
||||
)
|
||||
|
||||
if per_file_frames:
|
||||
combined = pd.concat(per_file_frames, ignore_index=True)
|
||||
if sort_by_date and "date" in combined.columns:
|
||||
combined = combined.sort_values(
|
||||
by=["date", "source_file"],
|
||||
kind="mergesort",
|
||||
na_position="last",
|
||||
).reset_index(drop=True)
|
||||
else:
|
||||
combined = pd.DataFrame()
|
||||
st.session_state[K_EXTRACT_DF] = combined
|
||||
st.session_state[K_EXTRACT_WARNINGS] = all_warnings
|
||||
st.session_state[K_EXTRACT_FILES] = files_meta
|
||||
st.session_state["pdf_extract_output_shape"] = output_shape
|
||||
st.session_state["pdf_extract_per_file"] = [
|
||||
(m["file"], per_file_frames[i])
|
||||
for i, m in enumerate(files_meta)
|
||||
if m["status"] == "ok"
|
||||
]
|
||||
|
||||
log_event(
|
||||
"tool_run",
|
||||
@@ -179,6 +225,7 @@ def _render_extract_mode() -> None:
|
||||
template=slug,
|
||||
files=len(uploads),
|
||||
rows=len(combined),
|
||||
output_shape=output_shape,
|
||||
)
|
||||
|
||||
df = st.session_state.get(K_EXTRACT_DF)
|
||||
@@ -206,15 +253,42 @@ def _render_extract_mode() -> None:
|
||||
else:
|
||||
st.markdown(f"#### Extracted rows ({len(df):,})")
|
||||
st.dataframe(df, hide_index=True, use_container_width=True)
|
||||
csv_bytes = df.to_csv(index=False).encode("utf-8")
|
||||
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
st.download_button(
|
||||
"Download CSV",
|
||||
data=csv_bytes,
|
||||
file_name=f"transactions-{slug}-{ts}.csv",
|
||||
mime="text/csv",
|
||||
type="primary",
|
||||
output_shape = st.session_state.get(
|
||||
"pdf_extract_output_shape", "Combined CSV",
|
||||
)
|
||||
if output_shape == "ZIP of per-PDF CSVs":
|
||||
import zipfile
|
||||
per_file = st.session_state.get("pdf_extract_per_file") or []
|
||||
if not per_file:
|
||||
st.warning("No per-file CSVs to bundle.")
|
||||
else:
|
||||
buf = io.BytesIO()
|
||||
with zipfile.ZipFile(
|
||||
buf, "w", zipfile.ZIP_DEFLATED,
|
||||
) as zf:
|
||||
for name, sub_df in per_file:
|
||||
stem = Path(name).stem or "transactions"
|
||||
zf.writestr(
|
||||
f"{stem}.csv",
|
||||
sub_df.to_csv(index=False),
|
||||
)
|
||||
st.download_button(
|
||||
f"Download ZIP ({len(per_file)} files)",
|
||||
data=buf.getvalue(),
|
||||
file_name=f"transactions-{slug}-{ts}.zip",
|
||||
mime="application/zip",
|
||||
type="primary",
|
||||
)
|
||||
else:
|
||||
csv_bytes = df.to_csv(index=False).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download CSV",
|
||||
data=csv_bytes,
|
||||
file_name=f"transactions-{slug}-{ts}.csv",
|
||||
mime="text/csv",
|
||||
type="primary",
|
||||
)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
|
||||
Reference in New Issue
Block a user