User asked to flip the default from YYYYMMDD to YYYY-MM-DD.

ISO is the better default for an accountant CSV workflow:
- Lexicographic sort = chronological sort (no parsing needed).
- Every spreadsheet tool the user might import into recognises it as a
  real date with no ambiguity (US vs EU readers can't disagree on the
  order).
- Hyphens make the year/month/day boundaries scannable by eye.

Concrete changes:
- New module constant ``DEFAULT_DATE_FORMAT = "%Y-%m-%d"``, used as the
  default for ``format_date()`` and the ``output_date_format`` keyword on
  ``scan_pdf_for_transactions``.
- Page's ``_DATE_FORMAT_CHOICES`` reordered so the ISO entry is first
  (index 0 = default Streamlit selection); YYYYMMDD drops to second.
- Custom-strftime input default also flips to ``%Y-%m-%d``.

Tests updated to reflect the new default
(``test_dates_formatted_iso_by_default``,
``test_short_dates_get_year_from_period``,
``test_compact_format_round_trip``, plus a new ``test_default_is_iso`` for
the format_date helper).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
698 lines
24 KiB
Python
698 lines
24 KiB
Python
"""PDF to CSV — heuristic transaction scanner.
|
|
|
|
Upload one or more bank-statement PDFs, scan for transaction-like
|
|
rows ([date] [description] [amount]), uncheck the rows you don't
|
|
want, download as CSV. No templates, no per-bank configuration,
|
|
no coordinate picking.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import streamlit as st
|
|
|
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
|
if str(_project_root) not in sys.path:
|
|
sys.path.insert(0, str(_project_root))
|
|
|
|
from src.audit import log_event, log_page_open
|
|
from src.gui.components import (
|
|
hide_streamlit_chrome,
|
|
html_download_button,
|
|
render_sticky_footer,
|
|
)
|
|
from src.pdf_extract import (
|
|
PdfDependencyMissing,
|
|
diagnose_pdf_lines,
|
|
format_amount,
|
|
ocr_available,
|
|
scan_pdf_for_transactions,
|
|
year_from_filename,
|
|
)
|
|
|
|
|
|
def _pdf_deps_status() -> tuple[bool, list[str]]:
|
|
"""Probe each runtime PDF dep without forcing the user to hit
|
|
the Scan button. Returns ``(ok, missing_names)``."""
|
|
missing: list[str] = []
|
|
for name in ("pdfplumber", "pypdfium2"):
|
|
try:
|
|
__import__(name)
|
|
except ImportError:
|
|
missing.append(name)
|
|
return (not missing), missing
|
|
|
|
|
|
# Record the page visit in the audit log; this is the first statement
# the script executes after its imports and helper definitions.
log_page_open("10_PDF_Extractor")

# Icon path is resolved relative to this file (``../assets``).
_ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png")
st.set_page_config(
    page_title="PDF to CSV · DataTools",
    page_icon=_ICON_PATH,
    layout="wide",
)
# Shared GUI helpers from ``src.gui.components``.
hide_streamlit_chrome()
render_sticky_footer()
|
|
|
|
# ---------------------------------------------------------------------------
# Session-state keys
# ---------------------------------------------------------------------------

# Merged list of row dicts produced by the most recent scan. Absent
# (``None`` on ``.get``) until a scan has run; an empty list means the
# scan ran but found nothing.
K_ROWS = "pdf_scan_rows"
# Warning strings collected during the most recent scan, each prefixed
# with ``[filename]``.
K_WARNINGS = "pdf_scan_warnings"
# Number of files that were in the stash when the most recent scan ran.
K_SOURCE_COUNT = "pdf_scan_source_count"
# Stamped once at scan time. The download button's file_name
# embeds this so the user gets a unique-per-scan filename — but
# crucially, the value is stable across reruns triggered by
# unrelated widget interactions (otherwise the html_download_button
# helper's session-state key drifts every second and the
# "Saved to <path>" banner never gets to render).
K_TIMESTAMP = "pdf_scan_timestamp"
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
# keyed by filename → {"bytes": ..., "size": ...}). It survives
# Streamlit reruns and navigation away from the page. The
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
# the custom file list / Clear-all button operate on it.
K_UPLOADS = "pdf_uploads"
# Bumped to force the file_uploader to re-instantiate (clear its
# internal state) when the user removes a file via the custom X or
# clicks Clear-all. Streamlit's widget state is keyed on the widget
# key, so changing the key resets the widget without us having to
# touch its session-state directly (which Streamlit disallows).
K_UPLOAD_COUNTER = "pdf_upload_counter"
|
|
|
|
|
|
def _format_size(n_bytes: int) -> str:
|
|
"""Human-friendly file size — KB / MB / GB."""
|
|
size = float(n_bytes)
|
|
for unit in ("B", "KB", "MB", "GB"):
|
|
if size < 1024:
|
|
if unit == "B":
|
|
return f"{int(size)} {unit}"
|
|
return f"{size:.1f} {unit}"
|
|
size /= 1024
|
|
return f"{size:.1f} TB"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Header + dep guard
# ---------------------------------------------------------------------------

st.markdown("# PDF to CSV")
st.caption(
    "Scan bank-statement PDFs for transaction rows "
    "(``[date] [description] [amount]``). Review the table, uncheck "
    "rows you don't want, edit any cell that needs fixing, then "
    "download as CSV. No per-bank setup."
)

# Probe the PDF libraries up front. If any is missing, show install
# instructions and halt the script so none of the widgets below render.
_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    st.error(
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n"
        "Then **fully restart the launcher** to pick up the new modules."
    )
    st.stop()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Options + upload
# ---------------------------------------------------------------------------

# Selectbox label → ``strftime`` pattern. Dict insertion order is the
# display order, and the selectbox below uses ``index=0``, so the first
# entry (ISO ``%Y-%m-%d``) is the default selection. ``"__custom__"`` is
# a sentinel that swaps in the free-text strftime input.
_DATE_FORMAT_CHOICES = {
    "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
    "YYYYMMDD (20260113)": "%Y%m%d",
    "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
    "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
    "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
    "Custom strftime…": "__custom__",
}

with st.expander("Scan options", expanded=False):
    c1, c2 = st.columns(2)
    negative_in_parens = c1.checkbox(
        "Treat (4.50) as negative",
        value=True,
        help=(
            "Bank statements commonly show withdrawals as ``(4.50)``. "
            "Off if your statements use a different convention."
        ),
    )
    # The OCR checkbox is pre-ticked when OCR is available and greyed
    # out (unticked) when it is not; the reason is surfaced in the help.
    _ocr_ok, _ocr_reason = ocr_available()
    use_ocr = c2.checkbox(
        "Use OCR for scanned pages",
        value=_ocr_ok,
        disabled=not _ocr_ok,
        help=(
            f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. "
            "Most modern bank PDFs are text-based and don't need OCR — "
            "only enable for image-based scans."
        ),
    )

    c3, c4 = st.columns(2)
    date_label = c3.selectbox(
        "Output date format",
        list(_DATE_FORMAT_CHOICES.keys()),
        index=0,
        help=(
            "Applied to the transaction date AND the statement "
            "period dates pulled from the header. Pick Custom to "
            "enter your own ``strftime`` string."
        ),
    )
    output_date_format = _DATE_FORMAT_CHOICES[date_label]
    # Only the Custom entry renders the extra text input; its value is
    # passed to the scanner as typed.
    if output_date_format == "__custom__":
        output_date_format = c4.text_input(
            "Custom strftime format",
            value="%Y-%m-%d",
            help=(
                "Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
                "2026-01-13, ``%Y%m%d`` for 20260113."
            ),
        )

    # Year override for short dates. Empty by default — the
    # scanner uses statement-period detection + filename year hint
    # automatically. Set this when the statement period regex
    # misses on a particular bank's layout, or when you want to
    # force a specific year (e.g., historical reconciliation).
    year_override_str = st.text_input(
        "Override year for short dates (optional)",
        value="",
        help=(
            "Short dates like ``01/13`` get bound to a year by the "
            "scanner — statement period first, then filename year, "
            "then this override. Leave blank for automatic. Enter "
            "a 4-digit year (e.g., 2025) to force every short date "
            "to that year. Won't affect dates that already have a "
            "year (``01/13/2025``)."
        ),
    )
    # Blank → ``None``. Non-numeric or out-of-range (outside 1900–2100)
    # input is reported with a warning and also falls back to ``None``.
    try:
        year_override = (
            int(year_override_str) if year_override_str.strip() else None
        )
        if year_override is not None and not (1900 <= year_override <= 2100):
            st.warning(
                f"Year override {year_override} looks wrong — using "
                "automatic detection instead."
            )
            year_override = None
    except ValueError:
        st.warning(
            f"Year override {year_override_str!r} isn't a number — "
            "using automatic detection instead."
        )
        year_override = None
|
|
|
|
# Persistent stash + rotating widget key. See the comments on the
# K_UPLOADS / K_UPLOAD_COUNTER constants for why the counter exists.
# ``setdefault`` returns the object stored in session state, so
# mutating ``pdf_uploads`` in place updates the stash.
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
uploader_key = f"pdf_upload_v{upload_counter}"


# Mirror the Home-page upload pattern: the Streamlit file_uploader
# is positioned off-screen via CSS (keeps its underlying ``<input
# type=file>`` reachable to JS), and the page renders a Home-style
# bordered file list with an "Add more files" button at the
# bottom. A small iframe-injected script wires that button to
# programmatically click the hidden uploader so the OS file picker
# opens. Same approach as ``_sync_uploader_to_home_uploads`` in
# ``src/gui/_home.py``.
st.markdown(
    '<style>[data-testid="stFileUploader"] {'
    'position:absolute!important;left:-10000px!important;'
    'width:1px!important;height:1px!important;overflow:hidden!important;'
    'pointer-events:none!important;}</style>',
    unsafe_allow_html=True,
)
|
|
|
|
|
|
def _sync_pdf_uploads() -> None:
    """``on_change`` callback for the hidden uploader widget.

    Copies every file currently held by the widget into the persistent
    ``pdf_uploads`` stash, skipping names that are already stashed, and
    writes one audit event per file added.  **Add-only** — removal is
    handled by the custom X buttons + counter bump, never by this
    callback.
    """
    current_files = st.session_state.get(uploader_key) or []
    for uploaded in current_files:
        if uploaded.name in pdf_uploads:
            # Already stashed under this filename; leave the entry alone.
            continue
        pdf_uploads[uploaded.name] = {
            "bytes": uploaded.getvalue(),
            "size": uploaded.size,
        }
        log_event(
            "upload",
            f"PDF: {uploaded.name}",
            filename=uploaded.name,
            bytes=uploaded.size,
            page="10_PDF_Extractor",
        )
|
|
|
|
|
|
# The uploader's return value is deliberately unused: files reach the
# persistent stash through the ``on_change`` callback. The widget key
# embeds the upload counter, so bumping the counter yields a fresh,
# empty widget.
st.file_uploader(
    "PDF file(s)",
    type=["pdf"],
    accept_multiple_files=True,
    key=uploader_key,
    on_change=_sync_pdf_uploads,
    label_visibility="collapsed",
    help="Drop one or more bank-statement PDFs. Multi-file batches "
         "are merged into a single table with a ``source_file`` column.",
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Files section (Home-style layout)
# ---------------------------------------------------------------------------

import html as _html

# Inline SVG icons embedded in the raw-HTML markup below: a document
# glyph for each file row and a plus sign for the "Add more files" button.
_DOC_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/>'
    '<path d="M14 2v6h6"/>'
    '</svg>'
)
_PLUS_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M12 5v14M5 12h14"/>'
    '</svg>'
)

# Section-header summary: "<n> file(s) · <total size> total", or a
# placeholder when the stash is empty.
n_files = len(pdf_uploads)
if n_files:
    total_bytes = sum(m["size"] for m in pdf_uploads.values())
    files_word = "file" if n_files == 1 else "files"
    meta_html = (
        f'{n_files} {files_word} · '
        f'{_html.escape(_format_size(total_bytes))} total'
    )
else:
    meta_html = "No files imported yet"

st.markdown(
    '<div class="dt-files-section-head">'
    '<h2>Files</h2>'
    f'<span class="dt-section-meta">{meta_html}</span>'
    '</div>',
    unsafe_allow_html=True,
)

# Single bordered card hosting the file rows + the in-card
# "Add more files" button at the bottom, matching the Home page.
# Two-phase remove pattern: walk all rows once, accumulate
# ``to_remove`` if any X was clicked, then mutate state + rerun
# ONCE after the loop so Streamlit doesn't see a half-mutated
# dict mid-render.
to_remove: str | None = None
with st.container(border=True):
    for name, meta in pdf_uploads.items():
        # Short hash of the filename keeps each remove-button key
        # unique per file without embedding the raw name in the key.
        digest = hashlib.sha1(
            name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
        if col_x.button(
            "✕",
            key=f"pdf_rm_{digest}",
            help=f"Remove {name}",
            type="tertiary",
        ):
            to_remove = name
        # The filename is user-supplied, so it is HTML-escaped before
        # being placed in raw markup.
        col_name.markdown(
            '<div class="dt-file-row">'
            f'<span class="dt-file-icon-chip">{_DOC_SVG}</span>'
            f'<span class="dt-file-name">{_html.escape(name)}</span>'
            '</div>',
            unsafe_allow_html=True,
        )
        col_size.markdown(
            f'<div style="text-align:right;">'
            f'<span class="dt-file-size">'
            f'{_html.escape(_format_size(meta["size"]))}'
            '</span></div>',
            unsafe_allow_html=True,
        )
    # In-card "Add more files" button. The HTML is rendered as-is
    # — Streamlit's sanitiser strips inline ``onclick``, so the
    # click wiring is done by the iframe script below.
    st.markdown(
        '<button class="dt-file-add" type="button">'
        f'{_PLUS_SVG} Add more files'
        '</button>',
        unsafe_allow_html=True,
    )

# Wire the in-card "Add more files" button to the off-screen
# ``stFileUploaderDropzoneInput``. Identical pattern to the
# Home page (see ``src/gui/_home.py``); a ``MutationObserver``
# re-wires after every Streamlit rerun in case the button got
# re-mounted.
st.iframe(
    """
    <script>
    (function () {
      function wire(doc) {
        var btn = doc.querySelector('button.dt-file-add');
        var input = doc.querySelector('[data-testid="stFileUploaderDropzoneInput"]');
        if (!btn || !input) return;
        if (btn.dataset.dtWired === '1') return;
        btn.dataset.dtWired = '1';
        btn.addEventListener('click', function (e) {
          e.preventDefault();
          input.click();
        });
      }
      var doc;
      try { doc = window.parent.document; }
      catch (e) { doc = document; }
      wire(doc);
      var win = doc.defaultView || window.parent || window;
      if ('MutationObserver' in win) {
        var raf = 0;
        try {
          new win.MutationObserver(function () {
            if (raf) return;
            raf = win.requestAnimationFrame(function () { raf = 0; wire(doc); });
          }).observe(doc.body, { childList: true, subtree: true });
        } catch (e) {}
      }
    })();
    </script>
    """,
    height=1,
)

# Second phase of the remove pattern: apply the removal once, after
# every row has been rendered, then rerun.
if to_remove is not None:
    log_event(
        "upload",
        f"PDF removed: {to_remove}",
        filename=to_remove,
        page="10_PDF_Extractor",
    )
    del pdf_uploads[to_remove]
    # Bump the uploader counter so the widget re-instantiates and
    # forgets the removed file.
    st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
    st.rerun()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Action buttons (Scan + Clear all) live below the Files card
# ---------------------------------------------------------------------------

# Both buttons are disabled while the stash is empty.
c_scan, c_clear, _spacer = st.columns([1, 1, 4])
with c_scan:
    scan_clicked = st.button(
        "Scan",
        type="primary",
        disabled=not pdf_uploads,
        use_container_width=True,
    )
with c_clear:
    if st.button(
        "Clear all files",
        type="secondary",
        disabled=not pdf_uploads,
        help="Removes all uploaded files and the last scan result.",
        use_container_width=True,
    ):
        # Replace the stash with a fresh dict, rotate the uploader key
        # so the widget forgets its files, and drop every scan-result
        # key before rerunning.
        st.session_state[K_UPLOADS] = {}
        st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
        for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT, K_TIMESTAMP):
            st.session_state.pop(k, None)
        # ``n_files`` is the stash size computed for the Files header,
        # i.e. the number of files being cleared.
        log_event(
            "upload",
            "PDF list cleared",
            page="10_PDF_Extractor",
            count=n_files,
        )
        st.rerun()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Scan
# ---------------------------------------------------------------------------

if scan_clicked and pdf_uploads:
    all_rows: list[dict] = []
    all_warnings: list[str] = []
    n_files = len(pdf_uploads)
    with st.status(
        f"Scanning {n_files} file(s)…",
        expanded=True,
    ) as status:
        for i, (name, meta) in enumerate(pdf_uploads.items(), start=1):
            st.write(f"**{i}/{n_files}** · {name}")
            # Per-file failures are collected as warnings so one bad
            # PDF does not abort the rest of the batch.
            try:
                raw = meta["bytes"]
                rows, warns = scan_pdf_for_transactions(
                    raw,
                    negative_in_parens=negative_in_parens,
                    allow_ocr=use_ocr,
                    output_date_format=output_date_format,
                    filename_year_hint=year_from_filename(name),
                    year_override=year_override,
                )
                # Tag every row with its originating file so merged
                # multi-file results stay traceable.
                for r in rows:
                    r["source_file"] = name
                all_rows.extend(rows)
                all_warnings.extend(f"[{name}] {w}" for w in warns)
            except PdfDependencyMissing as e:
                all_warnings.append(f"[{name}] {e}")
            except Exception as e:
                all_warnings.append(
                    f"[{name}] scan failed: {type(e).__name__}: {e}"
                )
        status.update(
            label=(
                f"Found {len(all_rows):,} candidate transactions "
                f"across {n_files} file(s)"
            ),
            state="complete",
            expanded=False,
        )

    # Persist the results so they survive reruns. The timestamp is
    # stamped once here and reused for the download filename.
    st.session_state[K_ROWS] = all_rows
    st.session_state[K_WARNINGS] = all_warnings
    st.session_state[K_SOURCE_COUNT] = n_files
    st.session_state[K_TIMESTAMP] = datetime.now().strftime("%Y%m%d-%H%M%S")

    log_event(
        "tool_run",
        "PDF scan",
        page="10_PDF_Extractor",
        files=n_files,
        rows=len(all_rows),
        warnings=len(all_warnings),
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Results — editable table + download
# ---------------------------------------------------------------------------

# ``rows`` is ``None`` when no scan has run yet, an empty list when the
# last scan found nothing, and a non-empty list otherwise; the three
# branches below correspond to those three states.
rows = st.session_state.get(K_ROWS)
warnings = st.session_state.get(K_WARNINGS) or []
source_count = st.session_state.get(K_SOURCE_COUNT, 0)

if warnings:
    with st.expander(f"Warnings ({len(warnings)})", expanded=False):
        for w in warnings:
            st.warning(w)

if rows is None:
    if pdf_uploads:
        st.info("Click **Scan** to detect transactions.")
    else:
        st.info("Upload one or more PDF files to begin.")

elif not rows:
    st.info(
        "No transaction rows detected. The scanner looks for lines "
        "containing a date and at least one amount. The diagnostic "
        "below shows every line the PDF reader could see — use the "
        "``has_date`` and ``has_amount`` columns to spot which "
        "pieces are missing (usually one or the other)."
    )
    if pdf_uploads:
        with st.expander(
            "Diagnostic: what the scanner saw",
            expanded=True,
        ):
            # One diagnostic table per stashed file; a failure on one
            # file is shown inline and the loop moves on to the next.
            for fname, meta in pdf_uploads.items():
                raw = meta["bytes"]
                st.markdown(f"**{fname}**")
                try:
                    lines, dwarns = diagnose_pdf_lines(
                        raw, allow_ocr=use_ocr, max_lines=200,
                    )
                except Exception as e:
                    st.error(f"Diagnostic failed: {type(e).__name__}: {e}")
                    continue
                for w in dwarns:
                    st.caption(w)
                if not lines:
                    st.warning(
                        "Zero text lines extracted. This is almost "
                        "certainly a scanned (image-based) PDF — "
                        "enable OCR in Scan options if available."
                    )
                    continue
                st.dataframe(
                    pd.DataFrame(lines),
                    hide_index=True,
                    use_container_width=True,
                    height=400,
                )
                # Summary counts taken from the per-line ``has_date`` /
                # ``has_amount`` flags in the diagnostic output.
                date_hits = sum(1 for ln in lines if ln["has_date"])
                amt_hits = sum(1 for ln in lines if ln["has_amount"])
                both = sum(
                    1 for ln in lines
                    if ln["has_date"] and ln["has_amount"]
                )
                st.caption(
                    f"{len(lines):,} lines · {date_hits:,} look like "
                    f"they contain a date · {amt_hits:,} look like "
                    f"they contain an amount · {both:,} have both "
                    "(those are the rows the scanner would have kept)."
                )

else:
    df = pd.DataFrame(rows)

    # Order columns so the user-facing fields are leftmost; raw +
    # internals are last and easy to scroll past or unselect at
    # download time. ``account_number`` sits with the transaction
    # detail since it's per-row context an accountant typically
    # wants alongside the amounts.
    front = [
        "date",
        "description",
    ]
    amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
    metadata_cols = ["account_number"]
    tail = ["source_file", "page", "raw"]
    ordered = [
        c for c in front + amount_cols + metadata_cols + tail
        if c in df.columns
    ]
    # Any column not named above is kept, appended after the known ones.
    extras = [c for c in df.columns if c not in ordered]
    df = df[ordered + extras]

    # Prepend the include checkbox.
    df.insert(0, "Include", True)

    st.markdown(
        f"#### {len(df):,} candidate transaction(s) "
        f"from {source_count} file(s)"
    )
    st.caption(
        "Uncheck rows to exclude. Edit any cell to fix a value the "
        "scanner got wrong. The ``raw`` column shows the original "
        "PDF text for that row."
    )

    # ``raw``, ``page`` and ``source_file`` are configured read-only;
    # every other column stays editable.
    column_config = {
        "Include": st.column_config.CheckboxColumn(
            "Include",
            default=True,
            help="Uncheck to drop this row from the CSV.",
        ),
        "raw": st.column_config.TextColumn(
            "raw",
            help="Original text line from the PDF (read-only reference).",
            disabled=True,
            width="large",
        ),
        "page": st.column_config.NumberColumn(
            "page", disabled=True, width="small",
        ),
    }
    if "source_file" in df.columns:
        column_config["source_file"] = st.column_config.TextColumn(
            "source_file", disabled=True,
        )
    # Force 2-decimal display on every amount column. Without this,
    # Streamlit / Pandas show floats with their raw repr ("4.5",
    # "12.0", "1000") and the precision looks inconsistent across
    # rows that all came from the same statement. Internal dtype
    # stays float for arithmetic accuracy; only the rendering and
    # CSV-export formatting force two-place precision.
    for amt_col in (c for c in df.columns if c.startswith("amount_")):
        column_config[amt_col] = st.column_config.NumberColumn(
            amt_col,
            format="%.2f",
            help="Two-decimal currency amount.",
        )

    edited = st.data_editor(
        df,
        hide_index=True,
        use_container_width=True,
        column_config=column_config,
        num_rows="fixed",
        key="pdf_results_editor",
    )

    # Keep only the ticked rows and drop the helper checkbox column.
    selected = edited[edited["Include"]].drop(columns=["Include"])

    c_dl, c_meta = st.columns([2, 3])
    with c_dl:
        if selected.empty:
            st.button("Download CSV", disabled=True)
        else:
            # Reuse the timestamp stamped when this scan finished —
            # stable across reruns so the download helper's button
            # key doesn't drift every second.
            ts = st.session_state.get(K_TIMESTAMP) or "results"
            # Default: drop the internal columns from the download.
            keep_default = [
                c for c in selected.columns
                if c not in ("page", "raw")
            ]
            with c_meta:
                keep = st.multiselect(
                    "Columns to include in CSV",
                    options=list(selected.columns),
                    default=keep_default,
                    help="``page`` and ``raw`` are kept off by default; "
                         "tick them if you want them in the file.",
                )
            # An empty selection falls back to exporting every column.
            export = (selected[keep] if keep else selected).copy()
            # Coerce every amount column to a fixed 2-decimal string
            # before serialising. Pandas' default float-to-CSV
            # writer drops trailing zeros (4.50 → 4.5) which an
            # accountant immediately notices in Excel; preserving
            # the two-decimal precision is the point of this step.
            for amt_col in (
                c for c in export.columns if c.startswith("amount_")
            ):
                export[amt_col] = export[amt_col].map(format_amount)
            csv_bytes = export.to_csv(index=False).encode("utf-8")
            # Save server-side (consistent with the other tools) —
            # writes to the user's Downloads folder and shows the
            # exact path. Avoids the st.download_button quirk where
            # the second-or-later button in a script pass silently
            # fails to fire.
            html_download_button(
                f"Download {len(export):,} rows as CSV",
                csv_bytes,
                file_name=f"transactions-{ts}.csv",
                mime="text/csv",
            )

    if not selected.empty:
        st.caption(
            f"{len(selected):,} of {len(df):,} rows selected."
        )
|