diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 5111390..9fa42ae 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -127,6 +127,26 @@ def _render_extract_mode() -> None: ), ) + c1, c2 = st.columns(2) + sort_by_date = c1.checkbox( + "Sort combined output by date", + value=True, + help=( + "Sorts the combined CSV ascending by the ``date`` column " + "after extraction. Off → preserve per-PDF order." + ), + ) + output_shape = c2.radio( + "Output", + ["Combined CSV", "ZIP of per-PDF CSVs"], + horizontal=True, + help=( + "Combined: one CSV with a ``source_file`` column. " + "ZIP: one CSV per source PDF, useful when feeding files " + "back into separate ledgers." + ), + ) + run = st.button("Extract", type="primary", disabled=not uploads) if run and uploads: try: @@ -138,39 +158,65 @@ def _render_extract_mode() -> None: per_file_frames: list[pd.DataFrame] = [] all_warnings: list[str] = [] files_meta: list[dict] = [] - progress = st.progress(0.0, text="Reading PDFs…") - for i, up in enumerate(uploads, start=1): - try: - pdf_bytes = up.read() - pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) - df = apply_template(pages, tpl) - df.insert(0, "source_file", up.name) - per_file_frames.append(df) - files_meta.append({ - "file": up.name, - "rows": len(df), - "pages": len(pages), - }) - for w in warns: - all_warnings.append(f"[{up.name}] {w}") - except Exception as e: - all_warnings.append( - f"[{up.name}] extraction failed: " - f"{type(e).__name__}: {e}" - ) - files_meta.append({ - "file": up.name, "rows": 0, "pages": 0, "error": str(e), - }) - progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}") - progress.empty() + with st.status( + f"Extracting {len(uploads)} file(s)…", + expanded=True, + ) as status: + for i, up in enumerate(uploads, start=1): + st.write(f"**{i}/{len(uploads)}** · {up.name}") + try: + pdf_bytes = up.read() + pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) + df = apply_template(pages, tpl) + df.insert(0, "source_file", up.name) + per_file_frames.append(df) + files_meta.append({ + "file": up.name, + "pages": len(pages), + "rows": len(df), + "warnings": len(warns), + "status": "ok" if len(df) else "no rows", + }) + for w in warns: + all_warnings.append(f"[{up.name}] {w}") + except Exception as e: + all_warnings.append( + f"[{up.name}] extraction failed: " + f"{type(e).__name__}: {e}" + ) + files_meta.append({ + "file": up.name, + "pages": 0, + "rows": 0, + "warnings": 1, + "status": f"error: {type(e).__name__}", + }) + ok_count = sum(1 for m in files_meta if m["status"] == "ok") + status.update( + label=f"Done · {ok_count}/{len(uploads)} extracted", + state="complete" if ok_count == len(uploads) else "error", + expanded=False, + ) if per_file_frames: combined = pd.concat(per_file_frames, ignore_index=True) + if sort_by_date and "date" in combined.columns: + combined = combined.sort_values( + by=["date", "source_file"], + kind="mergesort", + na_position="last", + ).reset_index(drop=True) else: combined = pd.DataFrame() st.session_state[K_EXTRACT_DF] = combined st.session_state[K_EXTRACT_WARNINGS] = all_warnings st.session_state[K_EXTRACT_FILES] = files_meta + st.session_state["pdf_extract_output_shape"] = output_shape + st.session_state["pdf_extract_per_file"] = [ + (m["file"], per_file_frames[i]) + for i, m in enumerate(files_meta) + if m["status"] == "ok" + ] log_event( "tool_run", @@ -179,6 +225,7 @@ def _render_extract_mode() -> None: template=slug, files=len(uploads), rows=len(combined), + output_shape=output_shape, ) df = st.session_state.get(K_EXTRACT_DF) @@ -206,15 +253,42 @@ def _render_extract_mode() -> None: else: st.markdown(f"#### Extracted rows ({len(df):,})") st.dataframe(df, hide_index=True, use_container_width=True) - csv_bytes = df.to_csv(index=False).encode("utf-8") ts = datetime.now().strftime("%Y%m%d-%H%M%S") - st.download_button( - "Download CSV", - data=csv_bytes, - file_name=f"transactions-{slug}-{ts}.csv", - mime="text/csv", - type="primary", + output_shape = st.session_state.get( + "pdf_extract_output_shape", "Combined CSV", ) + if output_shape == "ZIP of per-PDF CSVs": + import zipfile + per_file = st.session_state.get("pdf_extract_per_file") or [] + if not per_file: + st.warning("No per-file CSVs to bundle.") + else: + buf = io.BytesIO() + with zipfile.ZipFile( + buf, "w", zipfile.ZIP_DEFLATED, + ) as zf: + for name, sub_df in per_file: + stem = Path(name).stem or "transactions" + zf.writestr( + f"{stem}.csv", + sub_df.to_csv(index=False), + ) + st.download_button( + f"Download ZIP ({len(per_file)} files)", + data=buf.getvalue(), + file_name=f"transactions-{slug}-{ts}.zip", + mime="application/zip", + type="primary", + ) + else: + csv_bytes = df.to_csv(index=False).encode("utf-8") + st.download_button( + "Download CSV", + data=csv_bytes, + file_name=f"transactions-{slug}-{ts}.csv", + mime="text/csv", + type="primary", + ) # ===========================================================================