Files
datatools-dev/src/gui/pages/10_PDF_Extractor.py
Michael 904356f4e8 feat(gui): inline Help popover next to every tool's title
Adds a contextual Help button on each detail page, right of the title.
Clicking it opens a Streamlit popover with a one-shot how-to: when to
use, numbered steps, before→after examples, and an optional one-line
tip. Designed to be scannable — no paragraph prose.

Implementation:
- New ``render_tool_header(tool_id)`` helper in components replaces the
  bare ``st.title(...) + st.caption(...)`` block on each of the 11 tool
  pages. Title in the wide column, popover in a narrow right column;
  caption sits on its own line beneath.
- Help content is one markdown blob per tool stored in i18n under
  ``tools.<id>.help_md`` (en + es). Editors can tweak copy without
  touching Python.
- ``help.button_label`` and ``help.missing_body`` keys added to both
  packs for the popover trigger and the empty-tool fallback.

All 11 tool pages now use the same header pattern — including the
PDF Extractor and Reconciler which previously had hardcoded title/
caption pairs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 17:21:55 +00:00

693 lines
24 KiB
Python

"""PDF to CSV — heuristic transaction scanner.
Upload one or more bank-statement PDFs, scan for transaction-like
rows ([date] [description] [amount]), uncheck the rows you don't
want, download as CSV. No templates, no per-bank configuration,
no coordinate picking.
"""
from __future__ import annotations
import hashlib
import sys
from datetime import datetime
from pathlib import Path
import pandas as pd
import streamlit as st
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.audit import log_event, log_page_open
from src.gui.components import (
hide_streamlit_chrome,
html_download_button,
render_sticky_footer,
render_tool_header,
)
from src.pdf_extract import (
PdfDependencyMissing,
diagnose_pdf_lines,
format_amount,
ocr_available,
scan_pdf_for_transactions,
year_from_filename,
)
def _pdf_deps_status() -> tuple[bool, list[str]]:
"""Probe each runtime PDF dep without forcing the user to hit
the Scan button. Returns ``(ok, missing_names)``."""
missing: list[str] = []
for name in ("pdfplumber", "pypdfium2"):
try:
__import__(name)
except ImportError:
missing.append(name)
return (not missing), missing
# Audit hook from ``src.audit``, called with this page's identifier.
log_page_open("10_PDF_Extractor")
# The icon is resolved relative to this file: up from ``pages/`` to its
# parent directory, then into ``assets/``.
_ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png")
st.set_page_config(
    page_title="PDF to CSV · DataTools",
    page_icon=_ICON_PATH,
    layout="wide",
)
# Shared helpers imported from ``src.gui.components``.
hide_streamlit_chrome()
render_sticky_footer()
# ---------------------------------------------------------------------------
# Session-state keys
# ---------------------------------------------------------------------------
# Rows collected by the most recent scan (a list of dicts). Absent
# until the first scan; an empty list means a scan ran and found nothing.
K_ROWS = "pdf_scan_rows"
# Warning strings collected during the most recent scan.
K_WARNINGS = "pdf_scan_warnings"
# Number of stashed files at the moment the most recent scan ran.
K_SOURCE_COUNT = "pdf_scan_source_count"
# Stamped once at scan time. The download button's file_name
# embeds this so the user gets a unique-per-scan filename — but
# crucially, the value is stable across reruns triggered by
# unrelated widget interactions (otherwise the html_download_button
# helper's session-state key drifts every second and the
# "Saved to <path>" banner never gets to render).
K_TIMESTAMP = "pdf_scan_timestamp"
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
# keyed by filename → {"bytes": ..., "size": ...}). It survives
# Streamlit reruns and navigation away from the page. The
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
# the custom file list / Clear-all button operate on it.
K_UPLOADS = "pdf_uploads"
# Bumped to force the file_uploader to re-instantiate (clear its
# internal state) when the user removes a file via the custom X or
# clicks Clear-all. Streamlit's widget state is keyed on the widget
# key, so changing the key resets the widget without us having to
# touch its session-state directly (which Streamlit disallows).
K_UPLOAD_COUNTER = "pdf_upload_counter"
def _format_size(n_bytes: int) -> str:
"""Human-friendly file size — KB / MB / GB."""
size = float(n_bytes)
for unit in ("B", "KB", "MB", "GB"):
if size < 1024:
if unit == "B":
return f"{int(size)} {unit}"
return f"{size:.1f} {unit}"
size /= 1024
return f"{size:.1f} TB"
# ---------------------------------------------------------------------------
# Header + dep guard
# ---------------------------------------------------------------------------
# Shared header helper from ``src.gui.components``, keyed by tool id.
render_tool_header("10_pdf_extractor")
_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    # Name the modules that failed to import. The suggested pip command
    # also lists ``pytesseract``, which ``_pdf_deps_status`` does not
    # probe.
    st.error(
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n"
        "Then **fully restart the launcher** to pick up the new modules."
    )
    # Halt the script here so none of the upload / scan UI below is
    # rendered without the PDF libraries.
    st.stop()
# ---------------------------------------------------------------------------
# Options + upload
# ---------------------------------------------------------------------------
# Selectbox label → ``strftime`` pattern for the output date format.
# The ``"__custom__"`` value is a sentinel, not a real pattern: when it
# is selected, the Scan-options code swaps it for the text the user
# types into the "Custom strftime format" input.
_DATE_FORMAT_CHOICES = {
    "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
    "YYYYMMDD (20260113)": "%Y%m%d",
    "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
    "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
    "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
    "Custom strftime…": "__custom__",
}
# Collapsed-by-default panel holding every option that is later passed
# to ``scan_pdf_for_transactions``: ``negative_in_parens``, ``use_ocr``,
# ``output_date_format`` and ``year_override``.
with st.expander("Scan options", expanded=False):
    # Row 1: sign convention (left) and OCR toggle (right).
    c1, c2 = st.columns(2)
    negative_in_parens = c1.checkbox(
        "Treat (4.50) as negative",
        value=True,
        help=(
            "Bank statements commonly show withdrawals as ``(4.50)``. "
            "Off if your statements use a different convention."
        ),
    )
    # ``ocr_available`` returns a flag plus a reason string. The checkbox
    # defaults to on when OCR is usable, and is unchecked and disabled
    # when it is not; the reason is surfaced in the tooltip.
    _ocr_ok, _ocr_reason = ocr_available()
    use_ocr = c2.checkbox(
        "Use OCR for scanned pages",
        value=_ocr_ok,
        disabled=not _ocr_ok,
        help=(
            f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. "
            "Most modern bank PDFs are text-based and don't need OCR — "
            "only enable for image-based scans."
        ),
    )
    # Row 2: output date format (left) and, only when "Custom" is
    # chosen, the free-text strftime input (right).
    c3, c4 = st.columns(2)
    date_label = c3.selectbox(
        "Output date format",
        list(_DATE_FORMAT_CHOICES.keys()),
        index=0,
        help=(
            "Applied to the transaction date AND the statement "
            "period dates pulled from the header. Pick Custom to "
            "enter your own ``strftime`` string."
        ),
    )
    output_date_format = _DATE_FORMAT_CHOICES[date_label]
    if output_date_format == "__custom__":
        # Replace the sentinel with whatever the user typed.
        output_date_format = c4.text_input(
            "Custom strftime format",
            value="%Y-%m-%d",
            help=(
                "Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
                "2026-01-13, ``%Y%m%d`` for 20260113."
            ),
        )
    # Year override for short dates. Empty by default — the
    # scanner uses statement-period detection + filename year hint
    # automatically. Set this when the statement period regex
    # misses on a particular bank's layout, or when you want to
    # force a specific year (e.g., historical reconciliation).
    year_override_str = st.text_input(
        "Override year for short dates (optional)",
        value="",
        help=(
            "Short dates like ``01/13`` get bound to a year by the "
            "scanner — statement period first, then filename year, "
            "then this override. Leave blank for automatic. Enter "
            "a 4-digit year (e.g., 2025) to force every short date "
            "to that year. Won't affect dates that already have a "
            "year (``01/13/2025``)."
        ),
    )
    # Parse the override. Blank → ``None`` (automatic). A non-integer
    # or a year outside 1900–2100 produces a warning and also falls
    # back to ``None`` rather than blocking the scan.
    try:
        year_override = (
            int(year_override_str) if year_override_str.strip() else None
        )
        if year_override is not None and not (1900 <= year_override <= 2100):
            st.warning(
                f"Year override {year_override} looks wrong — using "
                "automatic detection instead."
            )
            year_override = None
    except ValueError:
        st.warning(
            f"Year override {year_override_str!r} isn't a number — "
            "using automatic detection instead."
        )
        year_override = None
# Persistent stash + rotating widget key. See the comments on
# K_UPLOADS / K_UPLOAD_COUNTER for why the counter exists.
# ``setdefault`` creates each entry on first use and otherwise returns
# the value already held in session state.
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
# The counter is baked into the widget key, so bumping it yields a
# brand-new uploader widget on the next run.
uploader_key = f"pdf_upload_v{upload_counter}"
# Mirror the Home-page upload pattern: the Streamlit file_uploader
# is positioned off-screen via CSS (keeps its underlying ``<input
# type=file>`` reachable to JS), and the page renders a Home-style
# bordered file list with an "Add more files" button at the
# bottom. A small iframe-injected script wires that button to
# programmatically click the hidden uploader so the OS file picker
# opens. Same approach as ``_sync_uploader_to_home_uploads`` in
# ``src/gui/_home.py``.
st.markdown(
    '<style>[data-testid="stFileUploader"] {'
    'position:absolute!important;left:-10000px!important;'
    'width:1px!important;height:1px!important;overflow:hidden!important;'
    'pointer-events:none!important;}</style>',
    unsafe_allow_html=True,
)
def _sync_pdf_uploads() -> None:
    """Copy newly-picked files from the uploader widget into the stash.

    Registered as the uploader's ``on_change`` callback. The callback
    is **add-only**: a file whose name is already in ``pdf_uploads`` is
    skipped, and nothing is ever removed here — removal goes through
    the custom X buttons and the counter bump instead. Each file that
    is added is also reported through ``log_event``.
    """
    for uploaded in st.session_state.get(uploader_key) or []:
        if uploaded.name in pdf_uploads:
            # Already stashed under this filename — leave it untouched.
            continue
        pdf_uploads[uploaded.name] = dict(
            bytes=uploaded.getvalue(),
            size=uploaded.size,
        )
        log_event(
            "upload",
            f"PDF: {uploaded.name}",
            filename=uploaded.name,
            bytes=uploaded.size,
            page="10_PDF_Extractor",
        )
# The real uploader widget. Its return value is ignored on purpose:
# picked files are copied into ``pdf_uploads`` by the ``on_change``
# callback, and the rest of the page reads only from that stash.
st.file_uploader(
    "PDF file(s)",
    type=["pdf"],
    accept_multiple_files=True,
    key=uploader_key,
    on_change=_sync_pdf_uploads,
    label_visibility="collapsed",
    help="Drop one or more bank-statement PDFs. Multi-file batches "
    "are merged into a single table with a ``source_file`` column.",
)
# ---------------------------------------------------------------------------
# Files section (Home-style layout)
# ---------------------------------------------------------------------------
import html as _html
# Inline SVG markup for the document icon shown in each file row.
_DOC_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/>'
    '<path d="M14 2v6h6"/>'
    '</svg>'
)
# Inline SVG markup for the plus sign on the "Add more files" button.
_PLUS_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M12 5v14M5 12h14"/>'
    '</svg>'
)
# Section heading: "Files" plus a one-line summary of the stash
# (file count and combined size, or a placeholder when it is empty).
n_files = len(pdf_uploads)
if not n_files:
    meta_html = "No files imported yet"
else:
    total_bytes = sum(entry["size"] for entry in pdf_uploads.values())
    files_word = "files" if n_files != 1 else "file"
    # The size label is escaped because it is interpolated into raw HTML.
    size_label = _html.escape(_format_size(total_bytes))
    meta_html = f"{n_files} {files_word} · {size_label} total"
section_head_html = "".join(
    [
        '<div class="dt-files-section-head">',
        "<h2>Files</h2>",
        f'<span class="dt-section-meta">{meta_html}</span>',
        "</div>",
    ]
)
st.markdown(section_head_html, unsafe_allow_html=True)
# Single bordered card hosting the file rows + the in-card
# "Add more files" button at the bottom, matching the Home page.
# Two-phase remove pattern: walk all rows once, accumulate
# ``to_remove`` if any X was clicked, then mutate state + rerun
# ONCE after the loop so Streamlit doesn't see a half-mutated
# dict mid-render.
# Filename whose X button was clicked during this run, if any. It is
# only recorded here; the stash itself is mutated after the card has
# finished rendering.
to_remove: str | None = None
with st.container(border=True):
    for name, meta in pdf_uploads.items():
        # Short hex digest of the filename, used to build a distinct
        # widget key for each row's remove button.
        digest = hashlib.sha1(
            name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        # Row layout: narrow remove button | filename | right-aligned size.
        col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
        if col_x.button(
            "",
            key=f"pdf_rm_{digest}",
            help=f"Remove {name}",
            type="tertiary",
        ):
            to_remove = name
        # The filename is HTML-escaped because it is rendered as raw markup.
        col_name.markdown(
            '<div class="dt-file-row">'
            f'<span class="dt-file-icon-chip">{_DOC_SVG}</span>'
            f'<span class="dt-file-name">{_html.escape(name)}</span>'
            '</div>',
            unsafe_allow_html=True,
        )
        col_size.markdown(
            f'<div style="text-align:right;">'
            f'<span class="dt-file-size">'
            f'{_html.escape(_format_size(meta["size"]))}'
            '</span></div>',
            unsafe_allow_html=True,
        )
    # In-card "Add more files" button. The HTML is rendered as-is
    # — Streamlit's sanitiser strips inline ``onclick``, so the
    # click wiring is done by the iframe script below.
    st.markdown(
        '<button class="dt-file-add" type="button">'
        f'{_PLUS_SVG} Add more files'
        '</button>',
        unsafe_allow_html=True,
    )
# Wire the in-card "Add more files" button to the off-screen
# ``stFileUploaderDropzoneInput``. Identical pattern to the
# Home page (see ``src/gui/_home.py``); a ``MutationObserver``
# re-wires after every Streamlit rerun in case the button got
# re-mounted.
#
# What the script does:
# - ``wire(doc)`` looks up the button and the hidden file input in the
#   given document and attaches a click handler that forwards the click
#   to the input. The button is flagged with ``data-dt-wired`` so the
#   handler is attached at most once per button element.
# - It targets ``window.parent.document`` and falls back to the iframe's
#   own document if accessing the parent throws.
# - The observer watches the whole body subtree and batches its re-wire
#   attempts through ``requestAnimationFrame``.
st.iframe(
    """
<script>
(function () {
function wire(doc) {
var btn = doc.querySelector('button.dt-file-add');
var input = doc.querySelector('[data-testid="stFileUploaderDropzoneInput"]');
if (!btn || !input) return;
if (btn.dataset.dtWired === '1') return;
btn.dataset.dtWired = '1';
btn.addEventListener('click', function (e) {
e.preventDefault();
input.click();
});
}
var doc;
try { doc = window.parent.document; }
catch (e) { doc = document; }
wire(doc);
var win = doc.defaultView || window.parent || window;
if ('MutationObserver' in win) {
var raf = 0;
try {
new win.MutationObserver(function () {
if (raf) return;
raf = win.requestAnimationFrame(function () { raf = 0; wire(doc); });
}).observe(doc.body, { childList: true, subtree: true });
} catch (e) {}
}
})();
</script>
""",
    height=1,
)
# Second phase of the remove pattern: apply the removal recorded while
# the file card was rendering, then restart the script so the page is
# redrawn from the updated stash.
if to_remove is not None:
    log_event(
        "upload",
        f"PDF removed: {to_remove}",
        filename=to_remove,
        page="10_PDF_Extractor",
    )
    del pdf_uploads[to_remove]
    # Bump the uploader counter so the widget re-instantiates and
    # forgets the removed file.
    st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
    st.rerun()
# ---------------------------------------------------------------------------
# Action buttons (Scan + Clear all) live below the Files card
# ---------------------------------------------------------------------------
# Two equal-width button columns followed by a wide spacer column.
c_scan, c_clear, _spacer = st.columns([1, 1, 4])
with c_scan:
    # Disabled until at least one file is in the stash.
    scan_clicked = st.button(
        "Scan",
        type="primary",
        disabled=not pdf_uploads,
        use_container_width=True,
    )
with c_clear:
    if st.button(
        "Clear all files",
        type="secondary",
        disabled=not pdf_uploads,
        help="Removes all uploaded files and the last scan result.",
        use_container_width=True,
    ):
        # Swap in a fresh empty stash and rotate the uploader key so the
        # widget forgets its files as well.
        st.session_state[K_UPLOADS] = {}
        st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
        # Drop every piece of state left over from the last scan.
        for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT, K_TIMESTAMP):
            st.session_state.pop(k, None)
        # ``n_files`` still holds the count from before the clear.
        log_event(
            "upload",
            "PDF list cleared",
            page="10_PDF_Extractor",
            count=n_files,
        )
        st.rerun()
# ---------------------------------------------------------------------------
# Scan
# ---------------------------------------------------------------------------
if scan_clicked and pdf_uploads:
    # Batch-wide accumulators: every file's rows and warnings are merged
    # into these two lists.
    all_rows: list[dict] = []
    all_warnings: list[str] = []
    n_files = len(pdf_uploads)
    progress_label = f"Scanning {n_files} file(s)…"
    with st.status(progress_label, expanded=True) as status:
        for position, pdf_name in enumerate(pdf_uploads, start=1):
            st.write(f"**{position}/{n_files}** · {pdf_name}")
            # A failure on one file is recorded as a warning so the
            # remaining files in the batch still get scanned.
            try:
                file_rows, file_warnings = scan_pdf_for_transactions(
                    pdf_uploads[pdf_name]["bytes"],
                    negative_in_parens=negative_in_parens,
                    allow_ocr=use_ocr,
                    output_date_format=output_date_format,
                    filename_year_hint=year_from_filename(pdf_name),
                    year_override=year_override,
                )
                # Tag each row with the file it came from before merging.
                for row in file_rows:
                    row["source_file"] = pdf_name
                all_rows.extend(file_rows)
                all_warnings += [
                    f"[{pdf_name}] {note}" for note in file_warnings
                ]
            except PdfDependencyMissing as exc:
                all_warnings.append(f"[{pdf_name}] {exc}")
            except Exception as exc:
                failure_kind = type(exc).__name__
                all_warnings.append(
                    f"[{pdf_name}] scan failed: {failure_kind}: {exc}"
                )
        summary_label = (
            f"Found {len(all_rows):,} candidate transactions "
            f"across {n_files} file(s)"
        )
        status.update(label=summary_label, state="complete", expanded=False)
    # Persist the outcome so the results section can redraw it on later
    # reruns without scanning again.
    st.session_state[K_ROWS] = all_rows
    st.session_state[K_WARNINGS] = all_warnings
    st.session_state[K_SOURCE_COUNT] = n_files
    scan_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    st.session_state[K_TIMESTAMP] = scan_stamp
    log_event(
        "tool_run",
        "PDF scan",
        page="10_PDF_Extractor",
        files=n_files,
        rows=len(all_rows),
        warnings=len(all_warnings),
    )
# ---------------------------------------------------------------------------
# Results — editable table + download
# ---------------------------------------------------------------------------
# Read the last scan's outcome back from session state. ``rows`` is
# ``None`` when no scan has been stored, and an empty list when a scan
# ran but found nothing — the branches below treat the two differently.
rows = st.session_state.get(K_ROWS)
warnings = st.session_state.get(K_WARNINGS) or []
source_count = st.session_state.get(K_SOURCE_COUNT, 0)
if warnings:
    # Collapsed by default; the count is shown in the expander title.
    with st.expander(f"Warnings ({len(warnings)})", expanded=False):
        for w in warnings:
            st.warning(w)
# Three display states:
#   * ``rows is None`` — nothing scanned yet: show a next-step hint.
#   * ``rows == []``   — scan found nothing: show per-file diagnostics.
#   * otherwise        — editable results table plus CSV download.
if rows is None:
    if pdf_uploads:
        st.info("Click **Scan** to detect transactions.")
    else:
        st.info("Upload one or more PDF files to begin.")
elif not rows:
    st.info(
        "No transaction rows detected. The scanner looks for lines "
        "containing a date and at least one amount. The diagnostic "
        "below shows every line the PDF reader could see — use the "
        "``has_date`` and ``has_amount`` columns to spot which "
        "pieces are missing (usually one or the other)."
    )
    if pdf_uploads:
        with st.expander(
            "Diagnostic: what the scanner saw",
            expanded=True,
        ):
            # One diagnostic section per stashed file. A failure or an
            # empty extraction for one file is reported inline and the
            # loop moves on to the next file.
            for fname, meta in pdf_uploads.items():
                raw = meta["bytes"]
                st.markdown(f"**{fname}**")
                try:
                    lines, dwarns = diagnose_pdf_lines(
                        raw, allow_ocr=use_ocr, max_lines=200,
                    )
                except Exception as e:
                    st.error(f"Diagnostic failed: {type(e).__name__}: {e}")
                    continue
                for w in dwarns:
                    st.caption(w)
                if not lines:
                    st.warning(
                        "Zero text lines extracted. This is almost "
                        "certainly a scanned (image-based) PDF — "
                        "enable OCR in Scan options if available."
                    )
                    continue
                st.dataframe(
                    pd.DataFrame(lines),
                    hide_index=True,
                    use_container_width=True,
                    height=400,
                )
                # Tally how many lines carry each flag, and how many
                # carry both, for the summary caption.
                date_hits = sum(1 for ln in lines if ln["has_date"])
                amt_hits = sum(1 for ln in lines if ln["has_amount"])
                both = sum(
                    1 for ln in lines
                    if ln["has_date"] and ln["has_amount"]
                )
                st.caption(
                    f"{len(lines):,} lines · {date_hits:,} look like "
                    f"they contain a date · {amt_hits:,} look like "
                    f"they contain an amount · {both:,} have both "
                    "(those are the rows the scanner would have kept)."
                )
else:
    df = pd.DataFrame(rows)
    # Order columns so the user-facing fields are leftmost; raw +
    # internals are last and easy to scroll past or unselect at
    # download time. ``account_number`` sits with the transaction
    # detail since it's per-row context an accountant typically
    # wants alongside the amounts.
    front = [
        "date",
        "description",
    ]
    amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
    metadata_cols = ["account_number"]
    tail = ["source_file", "page", "raw"]
    # Keep only the preferred columns that actually exist in this scan.
    ordered = [
        c for c in front + amount_cols + metadata_cols + tail
        if c in df.columns
    ]
    # Columns not named above are appended after the ordered ones.
    extras = [c for c in df.columns if c not in ordered]
    df = df[ordered + extras]
    # Prepend the include checkbox.
    df.insert(0, "Include", True)
    st.markdown(
        f"#### {len(df):,} candidate transaction(s) "
        f"from {source_count} file(s)"
    )
    st.caption(
        "Uncheck rows to exclude. Edit any cell to fix a value the "
        "scanner got wrong. The ``raw`` column shows the original "
        "PDF text for that row."
    )
    # Per-column editor settings. ``raw``, ``page`` and ``source_file``
    # are read-only; the remaining columns stay editable.
    column_config = {
        "Include": st.column_config.CheckboxColumn(
            "Include",
            default=True,
            help="Uncheck to drop this row from the CSV.",
        ),
        "raw": st.column_config.TextColumn(
            "raw",
            help="Original text line from the PDF (read-only reference).",
            disabled=True,
            width="large",
        ),
        "page": st.column_config.NumberColumn(
            "page", disabled=True, width="small",
        ),
    }
    if "source_file" in df.columns:
        column_config["source_file"] = st.column_config.TextColumn(
            "source_file", disabled=True,
        )
    # Force 2-decimal display on every amount column. Without this,
    # Streamlit / Pandas show floats with their raw repr ("4.5",
    # "12.0", "1000") and the precision looks inconsistent across
    # rows that all came from the same statement. Internal dtype
    # stays float for arithmetic accuracy; only the rendering and
    # CSV-export formatting force two-place precision.
    for amt_col in (c for c in df.columns if c.startswith("amount_")):
        column_config[amt_col] = st.column_config.NumberColumn(
            amt_col,
            format="%.2f",
            help="Two-decimal currency amount.",
        )
    # ``num_rows="fixed"``: cells can be edited but rows cannot be
    # added or deleted.
    edited = st.data_editor(
        df,
        hide_index=True,
        use_container_width=True,
        column_config=column_config,
        num_rows="fixed",
        key="pdf_results_editor",
    )
    # Keep only the ticked rows and drop the helper column.
    selected = edited[edited["Include"]].drop(columns=["Include"])
    c_dl, c_meta = st.columns([2, 3])
    with c_dl:
        if selected.empty:
            # Nothing ticked — show an inert placeholder button.
            st.button("Download CSV", disabled=True)
        else:
            # Reuse the timestamp stamped when this scan finished —
            # stable across reruns so the download helper's button
            # key doesn't drift every second.
            ts = st.session_state.get(K_TIMESTAMP) or "results"
            # Default: drop the internal columns from the download.
            keep_default = [
                c for c in selected.columns
                if c not in ("page", "raw")
            ]
            with c_meta:
                keep = st.multiselect(
                    "Columns to include in CSV",
                    options=list(selected.columns),
                    default=keep_default,
                    help="``page`` and ``raw`` are kept off by default; "
                    "tick them if you want them in the file.",
                )
            # An empty column selection falls back to exporting every
            # column rather than producing an empty file.
            export = (selected[keep] if keep else selected).copy()
            # Coerce every amount column to a fixed 2-decimal string
            # before serialising. Pandas' default float-to-CSV
            # writer drops trailing zeros (4.50 → 4.5) which an
            # accountant immediately notices in Excel; preserving
            # that precision is the purpose of this step.
            for amt_col in (
                c for c in export.columns if c.startswith("amount_")
            ):
                export[amt_col] = export[amt_col].map(format_amount)
            csv_bytes = export.to_csv(index=False).encode("utf-8")
            # Save server-side (consistent with the other tools) —
            # writes to the user's Downloads folder and shows the
            # exact path. Avoids the st.download_button quirk where
            # the second-or-later button in a script pass silently
            # fails to fire.
            html_download_button(
                f"Download {len(export):,} rows as CSV",
                csv_bytes,
                file_name=f"transactions-{ts}.csv",
                mime="text/csv",
            )
    if not selected.empty:
        st.caption(
            f"{len(selected):,} of {len(df):,} rows selected."
        )