From 5a8e2ec9e120984885c060e847d6fc911920a954 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 19 May 2026 22:51:05 +0000 Subject: [PATCH] =?UTF-8?q?feat(pdf):=20batch=20extract=20polish=20?= =?UTF-8?q?=E2=80=94=20ZIP=20output,=20sort-by-date,=20status=20block?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 4/6. Polishes the batch workflow shipped in commit 3: - **st.status progress block** replaces the simple progress bar. Each file appears as its own line as it's processed; the block auto-collapses on completion with a "12/13 extracted" summary and turns red if any file errored or produced no rows. - **Sort combined output by date** checkbox (default ON) sorts the merged CSV ascending by date, with source_file as a stable secondary sort so multiple statements interleave by date but same-day rows from the same file stay together. - **ZIP-of-per-PDF-CSVs output option** alongside the combined CSV. When the accountant has 12 statements from 12 different account periods and wants to feed them into 12 separate ledger imports, the ZIP keeps each file's rows in its own CSV named after the original PDF stem. - **Per-file summary table** gets a ``status`` column ("ok" / "no rows" / "error: ExceptionName") so error grouping is obvious at a glance — the table itself shipped in commit 3; this commit adds the ``status`` field and a per-file ``warnings`` count. Cancellation is intentionally not added — Streamlit's single-thread rerun model has no clean way to interrupt a tool-run mid-stream without architectural changes to extraction. If a user mis-fires Extract on 50 PDFs they can refresh the browser tab; the task will be killed when the next interaction comes in. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 138 +++++++++++++++++++++++------- 1 file changed, 106 insertions(+), 32 deletions(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 5111390..9fa42ae 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -127,6 +127,26 @@ def _render_extract_mode() -> None: ), ) + c1, c2 = st.columns(2) + sort_by_date = c1.checkbox( + "Sort combined output by date", + value=True, + help=( + "Sorts the combined CSV ascending by the ``date`` column " + "after extraction. Off → preserve per-PDF order." + ), + ) + output_shape = c2.radio( + "Output", + ["Combined CSV", "ZIP of per-PDF CSVs"], + horizontal=True, + help=( + "Combined: one CSV with a ``source_file`` column. " + "ZIP: one CSV per source PDF, useful when feeding files " + "back into separate ledgers." + ), + ) + run = st.button("Extract", type="primary", disabled=not uploads) if run and uploads: try: @@ -138,39 +158,65 @@ def _render_extract_mode() -> None: per_file_frames: list[pd.DataFrame] = [] all_warnings: list[str] = [] files_meta: list[dict] = [] - progress = st.progress(0.0, text="Reading PDFs…") - for i, up in enumerate(uploads, start=1): - try: - pdf_bytes = up.read() - pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) - df = apply_template(pages, tpl) - df.insert(0, "source_file", up.name) - per_file_frames.append(df) - files_meta.append({ - "file": up.name, - "rows": len(df), - "pages": len(pages), - }) - for w in warns: - all_warnings.append(f"[{up.name}] {w}") - except Exception as e: - all_warnings.append( - f"[{up.name}] extraction failed: " - f"{type(e).__name__}: {e}" - ) - files_meta.append({ - "file": up.name, "rows": 0, "pages": 0, "error": str(e), - }) - progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}") - progress.empty() + with st.status( + f"Extracting {len(uploads)} file(s)…", + expanded=True, + ) as status: + 
for i, up in enumerate(uploads, start=1): + st.write(f"**{i}/{len(uploads)}** · {up.name}") + try: + pdf_bytes = up.read() + pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) + df = apply_template(pages, tpl) + df.insert(0, "source_file", up.name) + per_file_frames.append(df) + files_meta.append({ + "file": up.name, + "pages": len(pages), + "rows": len(df), + "warnings": len(warns), + "status": "ok" if len(df) else "no rows", + }) + for w in warns: + all_warnings.append(f"[{up.name}] {w}") + except Exception as e: + all_warnings.append( + f"[{up.name}] extraction failed: " + f"{type(e).__name__}: {e}" + ) + files_meta.append({ + "file": up.name, + "pages": 0, + "rows": 0, + "warnings": 1, + "status": f"error: {type(e).__name__}", + }) + ok_count = sum(1 for m in files_meta if m["status"] == "ok") + status.update( + label=f"Done · {ok_count}/{len(uploads)} extracted", + state="complete" if ok_count == len(uploads) else "error", + expanded=False, + ) if per_file_frames: combined = pd.concat(per_file_frames, ignore_index=True) + if sort_by_date and "date" in combined.columns: + combined = combined.sort_values( + by=["date", "source_file"], + kind="mergesort", + na_position="last", + ).reset_index(drop=True) else: combined = pd.DataFrame() st.session_state[K_EXTRACT_DF] = combined st.session_state[K_EXTRACT_WARNINGS] = all_warnings st.session_state[K_EXTRACT_FILES] = files_meta + st.session_state["pdf_extract_output_shape"] = output_shape + st.session_state["pdf_extract_per_file"] = [ + (m["file"], per_file_frames[i]) + for i, m in enumerate(files_meta) + if m["status"] == "ok" + ] log_event( "tool_run", @@ -179,6 +225,7 @@ def _render_extract_mode() -> None: template=slug, files=len(uploads), rows=len(combined), + output_shape=output_shape, ) df = st.session_state.get(K_EXTRACT_DF) @@ -206,15 +253,42 @@ def _render_extract_mode() -> None: else: st.markdown(f"#### Extracted rows ({len(df):,})") st.dataframe(df, hide_index=True, use_container_width=True) - 
csv_bytes = df.to_csv(index=False).encode("utf-8") ts = datetime.now().strftime("%Y%m%d-%H%M%S") - st.download_button( - "Download CSV", - data=csv_bytes, - file_name=f"transactions-{slug}-{ts}.csv", - mime="text/csv", - type="primary", + output_shape = st.session_state.get( + "pdf_extract_output_shape", "Combined CSV", ) + if output_shape == "ZIP of per-PDF CSVs": + import zipfile + per_file = st.session_state.get("pdf_extract_per_file") or [] + if not per_file: + st.warning("No per-file CSVs to bundle.") + else: + buf = io.BytesIO() + with zipfile.ZipFile( + buf, "w", zipfile.ZIP_DEFLATED, + ) as zf: + for name, sub_df in per_file: + stem = Path(name).stem or "transactions" + zf.writestr( + f"{stem}.csv", + sub_df.to_csv(index=False), + ) + st.download_button( + f"Download ZIP ({len(per_file)} files)", + data=buf.getvalue(), + file_name=f"transactions-{slug}-{ts}.zip", + mime="application/zip", + type="primary", + ) + else: + csv_bytes = df.to_csv(index=False).encode("utf-8") + st.download_button( + "Download CSV", + data=csv_bytes, + file_name=f"transactions-{slug}-{ts}.csv", + mime="text/csv", + type="primary", + ) # ===========================================================================