From 5a8e2ec9e120984885c060e847d6fc911920a954 Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Tue, 19 May 2026 22:51:05 +0000
Subject: [PATCH] =?UTF-8?q?feat(pdf):=20batch=20extract=20polish=20?=
 =?UTF-8?q?=E2=80=94=20ZIP=20output,=20sort-by-date,=20status=20block?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 4/6. Polishes the batch workflow shipped in commit 3:

- **st.status progress block** replaces the simple progress bar.
  Each file appears as its own line as it's processed; the block
  auto-collapses on completion with a "12/13 extracted" summary
  and switches to the error state if any file errored or yielded
  no rows (only files with status "ok" count as extracted).
- **Sort combined output by date** checkbox (default ON) sorts
  the merged CSV ascending by date, with source_file as the
  secondary key and a stable sort (mergesort), so multiple
  statements interleave by date but same-day rows from the same
  file stay together in their original order.
- **ZIP-of-per-PDF-CSVs output option** alongside the combined
  CSV. When the accountant has 12 statements from 12 different
  account periods and wants to feed them into 12 separate ledger
  imports, the ZIP keeps each file's rows in its own CSV named
  after the original PDF stem.
- **Per-file summary table** (already present from commit 3) is
  upgraded with a ``status`` column ("ok" / "no rows" /
  "error: ExceptionName") so error grouping is obvious at a
  glance.

Cancellation is intentionally not added — Streamlit's
single-threaded rerun model has no clean way to interrupt a tool-run
mid-stream without architectural changes to extraction. If a
user mis-fires Extract on 50 PDFs they can refresh the browser
tab; the task will be killed when the next interaction comes in.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/10_PDF_Extractor.py | 138 +++++++++++++++++++++++-------
 1 file changed, 106 insertions(+), 32 deletions(-)

diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index 5111390..9fa42ae 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -127,6 +127,26 @@ def _render_extract_mode() -> None:
         ),
     )
 
+    c1, c2 = st.columns(2)
+    sort_by_date = c1.checkbox(
+        "Sort combined output by date",
+        value=True,
+        help=(
+            "Sorts the combined CSV ascending by the ``date`` column "
+            "after extraction. Off → preserve per-PDF order."
+        ),
+    )
+    output_shape = c2.radio(
+        "Output",
+        ["Combined CSV", "ZIP of per-PDF CSVs"],
+        horizontal=True,
+        help=(
+            "Combined: one CSV with a ``source_file`` column. "
+            "ZIP: one CSV per source PDF, useful when feeding files "
+            "back into separate ledgers."
+        ),
+    )
+
     run = st.button("Extract", type="primary", disabled=not uploads)
     if run and uploads:
         try:
@@ -138,39 +158,65 @@ def _render_extract_mode() -> None:
         per_file_frames: list[pd.DataFrame] = []
         all_warnings: list[str] = []
         files_meta: list[dict] = []
-        progress = st.progress(0.0, text="Reading PDFs…")
-        for i, up in enumerate(uploads, start=1):
-            try:
-                pdf_bytes = up.read()
-                pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
-                df = apply_template(pages, tpl)
-                df.insert(0, "source_file", up.name)
-                per_file_frames.append(df)
-                files_meta.append({
-                    "file": up.name,
-                    "rows": len(df),
-                    "pages": len(pages),
-                })
-                for w in warns:
-                    all_warnings.append(f"[{up.name}] {w}")
-            except Exception as e:
-                all_warnings.append(
-                    f"[{up.name}] extraction failed: "
-                    f"{type(e).__name__}: {e}"
-                )
-                files_meta.append({
-                    "file": up.name, "rows": 0, "pages": 0, "error": str(e),
-                })
-            progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}")
-        progress.empty()
+        with st.status(
+            f"Extracting {len(uploads)} file(s)…",
+            expanded=True,
+        ) as status:
+            for i, up in enumerate(uploads, start=1):
+                st.write(f"**{i}/{len(uploads)}** · {up.name}")
+                try:
+                    pdf_bytes = up.read()
+                    pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
+                    df = apply_template(pages, tpl)
+                    df.insert(0, "source_file", up.name)
+                    per_file_frames.append(df)
+                    files_meta.append({
+                        "file": up.name,
+                        "pages": len(pages),
+                        "rows": len(df),
+                        "warnings": len(warns),
+                        "status": "ok" if len(df) else "no rows",
+                    })
+                    for w in warns:
+                        all_warnings.append(f"[{up.name}] {w}")
+                except Exception as e:
+                    all_warnings.append(
+                        f"[{up.name}] extraction failed: "
+                        f"{type(e).__name__}: {e}"
+                    )
+                    files_meta.append({
+                        "file": up.name,
+                        "pages": 0,
+                        "rows": 0,
+                        "warnings": 1,
+                        "status": f"error: {type(e).__name__}",
+                    })
+            ok_count = sum(1 for m in files_meta if m["status"] == "ok")
+            status.update(
+                label=f"Done · {ok_count}/{len(uploads)} extracted",
+                state="complete" if ok_count == len(uploads) else "error",
+                expanded=False,
+            )
 
         if per_file_frames:
             combined = pd.concat(per_file_frames, ignore_index=True)
+            if sort_by_date and "date" in combined.columns:
+                combined = combined.sort_values(
+                    by=["date", "source_file"],
+                    kind="mergesort",
+                    na_position="last",
+                ).reset_index(drop=True)
         else:
             combined = pd.DataFrame()
         st.session_state[K_EXTRACT_DF] = combined
         st.session_state[K_EXTRACT_WARNINGS] = all_warnings
         st.session_state[K_EXTRACT_FILES] = files_meta
+        st.session_state["pdf_extract_output_shape"] = output_shape
+        st.session_state["pdf_extract_per_file"] = [
+            (m["file"], per_file_frames[i])
+            for i, m in enumerate(files_meta)
+            if m["status"] == "ok"
+        ]
 
         log_event(
             "tool_run",
@@ -179,6 +225,7 @@ def _render_extract_mode() -> None:
             template=slug,
             files=len(uploads),
             rows=len(combined),
+            output_shape=output_shape,
         )
 
     df = st.session_state.get(K_EXTRACT_DF)
@@ -206,15 +253,42 @@ def _render_extract_mode() -> None:
         else:
             st.markdown(f"#### Extracted rows ({len(df):,})")
             st.dataframe(df, hide_index=True, use_container_width=True)
-            csv_bytes = df.to_csv(index=False).encode("utf-8")
             ts = datetime.now().strftime("%Y%m%d-%H%M%S")
-            st.download_button(
-                "Download CSV",
-                data=csv_bytes,
-                file_name=f"transactions-{slug}-{ts}.csv",
-                mime="text/csv",
-                type="primary",
+            output_shape = st.session_state.get(
+                "pdf_extract_output_shape", "Combined CSV",
             )
+            if output_shape == "ZIP of per-PDF CSVs":
+                import zipfile
+                per_file = st.session_state.get("pdf_extract_per_file") or []
+                if not per_file:
+                    st.warning("No per-file CSVs to bundle.")
+                else:
+                    buf = io.BytesIO()
+                    with zipfile.ZipFile(
+                        buf, "w", zipfile.ZIP_DEFLATED,
+                    ) as zf:
+                        for name, sub_df in per_file:
+                            stem = Path(name).stem or "transactions"
+                            zf.writestr(
+                                f"{stem}.csv",
+                                sub_df.to_csv(index=False),
+                            )
+                    st.download_button(
+                        f"Download ZIP ({len(per_file)} files)",
+                        data=buf.getvalue(),
+                        file_name=f"transactions-{slug}-{ts}.zip",
+                        mime="application/zip",
+                        type="primary",
+                    )
+            else:
+                csv_bytes = df.to_csv(index=False).encode("utf-8")
+                st.download_button(
+                    "Download CSV",
+                    data=csv_bytes,
+                    file_name=f"transactions-{slug}-{ts}.csv",
+                    mime="text/csv",
+                    type="primary",
+                )
 
 
 # ===========================================================================