diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
new file mode 100644
index 0000000..5111390
--- /dev/null
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -0,0 +1,619 @@
+"""PDF Extractor — extract bank-statement transactions to CSV.
+
+Three modes:
+
+- **Extract** (daily workflow): pick a saved template, upload a
+  PDF, get a CSV preview + download.
+- **Build template**: upload a sample PDF, configure how the
+  table is identified, save the template for reuse.
+- **Manage templates**: list / delete / export / import.
+
+The expensive step is ``extract_pages_auto`` (PDF I/O + word
+extraction + optional OCR). It runs only on explicit user action
+(clicking "Extract" or uploading a sample), and results are stashed
+in session_state so re-renders from form-field edits don't re-parse
+the PDF. This keeps heavy work off Streamlit's rerun-on-every-widget path.
+"""
+
+from __future__ import annotations
+
+import io
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+
+_project_root = Path(__file__).resolve().parent.parent.parent.parent
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
+
+from src.audit import log_event, log_page_open
+from src.gui.components import hide_streamlit_chrome, render_sticky_footer
+from src.pdf_extract import apply_template, extract_pages_auto
+from src.pdf_templates import (
+    SCHEMA_VERSION,
+    VALID_TARGETS,
+    delete_template,
+    list_templates,
+    load_template,
+    new_template,
+    save_template,
+    slugify,
+    template_from_json,
+    template_to_json,
+    validate_template,
+)
+
+log_page_open("10_PDF_Extractor")
+
+_ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png")
+st.set_page_config(
+    page_title="PDF to CSV · DataTools",
+    page_icon=_ICON_PATH,
+    layout="wide",
+)
+hide_streamlit_chrome()
+render_sticky_footer()
+
+
+# ---------------------------------------------------------------------------
+# Session-state keys (centralized so the build / extract flows agree on names)
+# ---------------------------------------------------------------------------
+
+K_MODE = "pdf_mode"
+K_CURRENT_TEMPLATE = "pdf_tpl_current"
+K_SAMPLE_BYTES = "pdf_tpl_sample_bytes"
+K_SAMPLE_NAME = "pdf_tpl_sample_name"
+K_SAMPLE_SIZE = "pdf_tpl_sample_size"
+K_SAMPLE_PAGES = "pdf_tpl_sample_pages"
+K_EXTRACT_DF = "pdf_extract_df"
+K_EXTRACT_WARNINGS = "pdf_extract_warnings"
+K_EXTRACT_FILES = "pdf_extract_files"
+K_EXTRACT_SLUG = "pdf_extract_slug"
+K_IMPORT_TOKEN = "pdf_tpl_import_token"
+
+
+def _get_or_init(key: str, default):
+    """Return session_state[key], storing *default* first if the key is unset."""
+    if key not in st.session_state:
+        st.session_state[key] = default
+    return st.session_state[key]
+
+
+# ---------------------------------------------------------------------------
+# Page header + mode selector
+# ---------------------------------------------------------------------------
+
+st.markdown("# PDF to CSV")
+st.caption(
+    "Extract transaction tables from bank-statement PDFs. Build one "
+    "template per source (bank + account type), then reuse it for "
+    "every statement that follows the same layout."
+)
+
+mode = st.radio(
+    "Mode",
+    ["Extract", "Build template", "Manage templates"],
+    horizontal=True,
+    key=K_MODE,
+)
+st.divider()
+
+
+# ===========================================================================
+# Extract mode
+# ===========================================================================
+
+
+def _render_extract_mode() -> None:
+    """Render the daily workflow: pick a template, upload PDFs, get a CSV."""
+    templates = list_templates()
+    if not templates:
+        st.info(
+            "No templates yet. Switch to **Build template** to create your "
+            "first one — you'll need a sample PDF from the source bank."
+        )
+        return
+
+    options = {f"{t['name']} · {t['slug']}": t["slug"] for t in templates}
+    label = st.selectbox("Template", list(options.keys()))
+    slug = options[label]
+
+    uploads = st.file_uploader(
+        "Statement PDF(s)",
+        type=["pdf"],
+        accept_multiple_files=True,
+        help=(
+            "Drop one or more statements from the same source. Rows from "
+            "every file are combined into a single CSV, tagged with the "
+            "source filename."
+        ),
+    )
+
+    run = st.button("Extract", type="primary", disabled=not uploads)
+    if run and uploads:
+        try:
+            tpl = load_template(slug)
+        except Exception as e:
+            st.error(f"Couldn't load template {slug!r}: {e}")
+            return
+
+        per_file_frames: list[pd.DataFrame] = []
+        all_warnings: list[str] = []
+        files_meta: list[dict] = []
+        progress = st.progress(0.0, text="Reading PDFs…")
+        for i, up in enumerate(uploads, start=1):
+            try:
+                # getvalue() ignores the stream position, so a repeated
+                # Extract click still sees the whole file.
+                pdf_bytes = up.getvalue()
+                pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
+                df = apply_template(pages, tpl)
+                df.insert(0, "source_file", up.name)
+                per_file_frames.append(df)
+                files_meta.append({
+                    "file": up.name,
+                    "rows": len(df),
+                    "pages": len(pages),
+                })
+                for w in warns:
+                    all_warnings.append(f"[{up.name}] {w}")
+            except Exception as e:
+                all_warnings.append(
+                    f"[{up.name}] extraction failed: "
+                    f"{type(e).__name__}: {e}"
+                )
+                files_meta.append({
+                    "file": up.name, "rows": 0, "pages": 0, "error": str(e),
+                })
+            progress.progress(i / len(uploads), text=f"Read {i}/{len(uploads)}")
+        progress.empty()
+
+        if per_file_frames:
+            combined = pd.concat(per_file_frames, ignore_index=True)
+        else:
+            combined = pd.DataFrame()
+        st.session_state[K_EXTRACT_DF] = combined
+        st.session_state[K_EXTRACT_WARNINGS] = all_warnings
+        st.session_state[K_EXTRACT_FILES] = files_meta
+        st.session_state[K_EXTRACT_SLUG] = slug
+
+        log_event(
+            "tool_run",
+            "PDF Extractor run",
+            page="10_PDF_Extractor",
+            template=slug,
+            files=len(uploads),
+            rows=len(combined),
+        )
+
+    df = st.session_state.get(K_EXTRACT_DF)
+    if isinstance(df, pd.DataFrame):
+        warnings = st.session_state.get(K_EXTRACT_WARNINGS, []) or []
+        files_meta = st.session_state.get(K_EXTRACT_FILES, []) or []
+        if files_meta:
+            st.markdown("#### Per-file summary")
+            st.dataframe(
+                pd.DataFrame(files_meta),
+                hide_index=True,
+                use_container_width=True,
+            )
+        if warnings:
+            with st.expander(f"Warnings ({len(warnings)})", expanded=False):
+                for w in warnings:
+                    st.warning(w)
+
+        if df.empty:
+            st.info(
+                "No rows were extracted. Re-check the template's header "
+                "text, column boundaries, and end markers in **Build "
+                "template** mode against a sample PDF."
+            )
+        else:
+            st.markdown(f"#### Extracted rows ({len(df):,})")
+            st.dataframe(df, hide_index=True, use_container_width=True)
+            csv_bytes = df.to_csv(index=False).encode("utf-8")
+            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+            # Name the file after the template that produced these rows,
+            # not whatever the selectbox currently shows.
+            used_slug = st.session_state.get(K_EXTRACT_SLUG, slug)
+            st.download_button(
+                "Download CSV",
+                data=csv_bytes,
+                file_name=f"transactions-{used_slug}-{ts}.csv",
+                mime="text/csv",
+                type="primary",
+            )
+
+
+# ===========================================================================
+# Build-template mode
+# ===========================================================================
+
+
+def _ensure_sample_loaded() -> bool:
+    """Uploader for the sample PDF. Returns True if a sample
+    is loaded and parsed (pages cached in session_state)."""
+    up = st.file_uploader(
+        "Sample statement",
+        type=["pdf"],
+        help=(
+            "Used to drive the live preview while you build the "
+            "template — pick a representative statement from this "
+            "source."
+        ),
+        key="pdf_tpl_sample_uploader",
+    )
+    # The name alone is not a reliable identity (two different statements
+    # can share a filename), so compare name *and* size before reusing.
+    cached = (
+        st.session_state.get(K_SAMPLE_NAME),
+        st.session_state.get(K_SAMPLE_SIZE),
+    )
+    if up is not None and (up.name, up.size) != cached:
+        pdf_bytes = up.getvalue()
+        try:
+            pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
+        except Exception as e:
+            st.error(f"Couldn't read PDF: {type(e).__name__}: {e}")
+            return False
+        st.session_state[K_SAMPLE_BYTES] = pdf_bytes
+        st.session_state[K_SAMPLE_NAME] = up.name
+        st.session_state[K_SAMPLE_SIZE] = up.size
+        st.session_state[K_SAMPLE_PAGES] = pages
+        for w in warns:
+            st.info(w)
+    return bool(st.session_state.get(K_SAMPLE_PAGES))
+
+
+def _render_columns_editor(tpl: dict) -> None:
+    """Edit the column mapping (source index → target field) and the
+    boundary x-positions in one place."""
+    st.markdown("##### Columns")
+    boundaries = list(tpl["table"].get("column_boundaries") or [])
+    bounds_text = st.text_input(
+        "Column boundaries (x-positions, comma-separated)",
+        value=", ".join(f"{float(b):g}" for b in boundaries),
+        help=(
+            "N boundaries create N+1 columns. The visual picker in "
+            "the next phase will set these for you — until then you "
+            "can read x-positions from the page-preview hover tip "
+            "below, or trial-and-error against the live preview."
+        ),
+    )
+    try:
+        tpl["table"]["column_boundaries"] = sorted(
+            float(x.strip()) for x in bounds_text.split(",") if x.strip()
+        )
+    except ValueError:
+        st.warning("Column boundaries must be numbers.")
+
+    n_cols = len(tpl["table"].get("column_boundaries") or []) + 1
+    st.caption(f"{n_cols} source column(s) defined.")
+
+    # Column mapping: one row per output column the user wants.
+    columns_state = tpl.get("columns") or []
+    if not columns_state:
+        # Seed a reasonable default the first time.
+        columns_state = [
+            {"source": 0, "target": "date"},
+            {"source": 1, "target": "description"},
+            {"source": 2, "target": "amount"},
+        ][:n_cols]
+
+    targets = ["date", "description", "amount", "amount_debit",
+               "amount_credit", "balance", "type"]
+    new_columns: list[dict] = []
+    # NOTE(review): widget keys are positional (src_0, tgt_0, ...), so
+    # dropping a row or loading another template may reuse stale widget
+    # state for the rows that shift into those indices — confirm.
+    for i, col in enumerate(columns_state):
+        c1, c2, c3 = st.columns([2, 3, 1])
+        src = c1.number_input(
+            f"Source #{i + 1}",
+            min_value=0,
+            max_value=max(n_cols - 1, 0),
+            value=min(int(col.get("source", 0)), max(n_cols - 1, 0)),
+            step=1,
+            key=f"src_{i}",
+        )
+        tgt_default = col.get("target", "")
+        if tgt_default not in targets:
+            targets_ext = targets + [tgt_default] if tgt_default else targets
+        else:
+            targets_ext = targets
+        tgt = c2.selectbox(
+            f"Target #{i + 1}",
+            targets_ext,
+            index=(targets_ext.index(tgt_default) if tgt_default in targets_ext else 0),
+            key=f"tgt_{i}",
+        )
+        keep = c3.checkbox("Keep", value=True, key=f"keep_{i}")
+        if keep:
+            new_columns.append({"source": int(src), "target": tgt})
+
+    if st.button("+ Add column", key="add_col"):
+        new_columns.append({"source": n_cols - 1 if n_cols else 0, "target": ""})
+        # st.rerun() stops the script right here, so persist the new row
+        # first or it is lost before the assignment below is reached.
+        tpl["columns"] = new_columns
+        st.rerun()
+    tpl["columns"] = new_columns
+
+
+def _render_build_form(tpl: dict) -> None:
+    """Render every editable field on the template, in tabs."""
+    t1, t2, t3, t4 = st.tabs(["Pages & table", "Columns", "Parsing", "Save"])
+
+    with t1:
+        c1, c2 = st.columns(2)
+        with c1:
+            tpl["name"] = st.text_input("Template name", value=tpl.get("name", ""))
+            tpl["slug"] = slugify(tpl["name"])
+            tpl["notes"] = st.text_area("Notes", value=tpl.get("notes", ""), height=70)
+            tpl["pages"]["range"] = st.text_input(
+                "Pages",
+                value=tpl["pages"].get("range", "all"),
+                help='"all", "1-3", "2,4", "3-" all work.',
+            )
+            tpl["pages"]["skip_matching"] = st.text_input(
+                "Skip pages matching (regex, optional)",
+                value=tpl["pages"].get("skip_matching", ""),
+                help='e.g. "Page \\d+ of" to skip cover pages.',
+            )
+        with c2:
+            tpl["table"]["header_text"] = st.text_input(
+                "Header text (transactions table)",
+                value=tpl["table"].get("header_text", ""),
+                help=(
+                    "Words from the header row of the transactions table, "
+                    "e.g. \"Date Description Amount Balance\". Extraction "
+                    "starts on the row AFTER this match."
+                ),
+            )
+            ends = "\n".join(tpl["table"].get("end_markers") or [])
+            new_ends = st.text_area(
+                "End markers (one regex per line)",
+                value=ends,
+                help='e.g. "Closing balance", "Page \\d+ of".',
+                height=80,
+            )
+            tpl["table"]["end_markers"] = [
+                line.strip() for line in new_ends.splitlines() if line.strip()
+            ]
+            skips = "\n".join(tpl["table"].get("skip_rows_matching") or [])
+            new_skips = st.text_area(
+                "Skip rows matching (one regex per line, optional)",
+                value=skips,
+                help='Common entries: "Total", "Subtotal", "^Page ".',
+                height=80,
+            )
+            tpl["table"]["skip_rows_matching"] = [
+                line.strip() for line in new_skips.splitlines() if line.strip()
+            ]
+            tpl["table"]["y_tolerance"] = st.number_input(
+                "Row y-tolerance (pts)",
+                min_value=0.5,
+                max_value=20.0,
+                value=float(tpl["table"].get("y_tolerance", 3.0)),
+                step=0.5,
+                help=(
+                    "How close two words' y-positions must be to be on the "
+                    "same row. Bump up if rows are getting split, down if "
+                    "rows are merging."
+                ),
+            )
+
+    with t2:
+        _render_columns_editor(tpl)
+
+    with t3:
+        c1, c2 = st.columns(2)
+        with c1:
+            tpl["parse"]["date_format"] = st.text_input(
+                "Date format",
+                value=tpl["parse"].get("date_format", "%m/%d/%Y"),
+                help=(
+                    "Python strftime format. Common: %m/%d/%Y (US), "
+                    "%d/%m/%Y (EU), %Y-%m-%d (ISO)."
+                ),
+            )
+            tpl["parse"]["currency_strip"] = st.text_input(
+                "Currency symbols to strip",
+                value=tpl["parse"].get("currency_strip", "$"),
+            )
+            tpl["parse"]["decimal_separator"] = st.text_input(
+                "Decimal separator",
+                value=tpl["parse"].get("decimal_separator", "."),
+                max_chars=1,
+            )
+            tpl["parse"]["thousands_separator"] = st.text_input(
+                "Thousands separator",
+                value=tpl["parse"].get("thousands_separator", ","),
+                max_chars=1,
+            )
+        with c2:
+            tpl["parse"]["amount_negative_in_parens"] = st.checkbox(
+                "Parens = negative amount",
+                value=bool(tpl["parse"].get("amount_negative_in_parens", True)),
+            )
+            tpl["parse"]["merge_multiline_description"] = st.checkbox(
+                "Merge multi-line descriptions",
+                value=bool(tpl["parse"].get("merge_multiline_description", True)),
+                help=(
+                    "Rows with no date attach to the previous row's "
+                    "description — handles wrapped vendor names."
+                ),
+            )
+
+    with t4:
+        ok, errors = validate_template(tpl)
+        if errors:
+            for err in errors:
+                st.error(err)
+        c1, c2 = st.columns([1, 3])
+        with c1:
+            save_btn = st.button("Save template", type="primary", disabled=not ok)
+        with c2:
+            st.caption(
+                f"Will save as: ``{tpl.get('slug') or '—'}`` "
+                f"(folder: ``~/.datatools/pdf_templates/``)"
+            )
+        if save_btn:
+            try:
+                slug = save_template(tpl)
+                st.success(f"Saved as **{slug}**. Switch to Extract mode to use it.")
+                log_event(
+                    "tool_run",
+                    "PDF Extractor template saved",
+                    page="10_PDF_Extractor",
+                    template=slug,
+                )
+            except Exception as e:
+                st.error(f"Save failed: {e}")
+
+
+def _render_preview(tpl: dict) -> None:
+    """Below-the-fold live preview against the cached sample pages."""
+    pages = st.session_state.get(K_SAMPLE_PAGES)
+    if not pages:
+        return
+    st.divider()
+    st.markdown("##### Live preview")
+    try:
+        df = apply_template(pages, tpl)
+    except Exception as e:
+        st.error(f"Preview failed: {type(e).__name__}: {e}")
+        return
+    if df.empty:
+        st.info(
+            "Template doesn't match any rows yet. Common fixes: tighten "
+            "the header text, add an end marker, adjust column "
+            "boundaries."
+        )
+    else:
+        st.caption(f"{len(df)} row(s) from {len(pages)} page(s)")
+        st.dataframe(df.head(50), hide_index=True, use_container_width=True)
+
+
+def _render_build_mode() -> None:
+    """Render the template builder: loader row, sample upload, form, preview."""
+    # Optionally load an existing template into the form
+    templates = list_templates()
+    c1, c2, c3 = st.columns([2, 2, 1])
+    with c1:
+        existing_label = "— start from scratch —"
+        choices = [existing_label] + [
+            f"{t['name']} · {t['slug']}" for t in templates
+        ]
+        picked = st.selectbox("Load existing", choices, key="build_load_pick")
+    with c2:
+        if st.button("Load", disabled=picked == existing_label, key="build_load_btn"):
+            slug = picked.split(" · ")[-1]
+            try:
+                st.session_state[K_CURRENT_TEMPLATE] = load_template(slug)
+                st.rerun()
+            except Exception as e:
+                st.error(f"Load failed: {e}")
+    with c3:
+        if st.button("New", key="build_new_btn"):
+            st.session_state[K_CURRENT_TEMPLATE] = new_template("New template")
+            st.rerun()
+
+    tpl = _get_or_init(K_CURRENT_TEMPLATE, new_template("New template"))
+
+    if not _ensure_sample_loaded():
+        st.info(
+            "Upload a sample statement from this source to drive the live "
+            "preview. Your template is built against the sample's layout."
+        )
+        return
+
+    _render_build_form(tpl)
+    _render_preview(tpl)
+
+
+# ===========================================================================
+# Manage-templates mode
+# ===========================================================================
+
+
+def _render_manage_mode() -> None:
+    """List saved templates with export / delete, plus a JSON import."""
+    st.markdown("##### Import a template")
+    up = st.file_uploader(
+        "Template JSON",
+        type=["json"],
+        key="manage_import_uploader",
+        help="Paste a colleague's exported JSON file here to add it to your library.",
+    )
+    if up is None:
+        # Clearing the uploader lets the same file be imported again later.
+        st.session_state.pop(K_IMPORT_TOKEN, None)
+    else:
+        # The uploader keeps its file across reruns, so import each upload
+        # only once; rerunning unconditionally here would loop forever.
+        token = (up.name, up.size)
+        if st.session_state.get(K_IMPORT_TOKEN) != token:
+            try:
+                imported = template_from_json(up.getvalue().decode("utf-8"))
+                save_template(imported)
+                st.success(
+                    f"Imported **{imported['name']}** (slug `{imported['slug']}`)."
+                )
+                st.session_state[K_IMPORT_TOKEN] = token
+            except Exception as e:
+                st.error(f"Import failed: {e}")
+
+    # Listed after the import so a just-imported template shows up at once.
+    templates = list_templates()
+    st.divider()
+    st.markdown("##### Existing templates")
+    if not templates:
+        st.caption("No templates yet — build one in **Build template** mode.")
+        return
+
+    for t in templates:
+        slug = t["slug"]
+        with st.container(border=True):
+            c1, c2, c3, c4 = st.columns([3, 3, 2, 2])
+            with c1:
+                st.markdown(f"**{t['name']}**")
+                st.caption(f"`{slug}`")
+            with c2:
+                st.caption(f"Updated: {t.get('updated_at', '—')}")
+                if t.get("notes"):
+                    st.caption(t["notes"])
+            with c3:
+                try:
+                    full = load_template(slug)
+                    payload = template_to_json(full)
+                    st.download_button(
+                        "Export",
+                        data=payload.encode("utf-8"),
+                        file_name=f"{slug}.json",
+                        mime="application/json",
+                        key=f"export_{slug}",
+                    )
+                except Exception as e:
+                    st.error(f"Read failed: {e}")
+            with c4:
+                if st.button("Delete", key=f"del_{slug}"):
+                    delete_template(slug)
+                    st.success(f"Deleted `{slug}`.")
+                    st.rerun()
+
+
+# ===========================================================================
+# Dispatch
+# ===========================================================================
+
+
+if mode == "Extract":
+    _render_extract_mode()
+elif mode == "Build template":
+    _render_build_mode()
+elif mode == "Manage templates":
+    _render_manage_mode()
diff --git a/src/gui/tools_registry.py b/src/gui/tools_registry.py
index dbfa93c..2f58bea 100644
--- a/src/gui/tools_registry.py
+++ b/src/gui/tools_registry.py
@@ -145,6 +145,18 @@ TOOLS: list[Tool] = [
         status="Ready",
         section="automations",
     ),
+    Tool(
+        tool_id="10_pdf_extractor",
+        icon=":material/picture_as_pdf:",
+        name="PDF to CSV",
+        description=(
+            "Extract bank-statement transactions from PDFs using reusable "
+            "per-source templates."
+        ),
+        page_slug="10_PDF_Extractor",
+        status="Ready",
+        section="transformations",
+    ),
 ]
 
 
diff --git a/src/i18n/packs/en.json b/src/i18n/packs/en.json
index 762ec86..020df22 100644
--- a/src/i18n/packs/en.json
+++ b/src/i18n/packs/en.json
@@ -158,6 +158,12 @@
       "description": "Chain tools in recommended order and pass output between steps.",
       "page_title": "Automated Workflows",
       "page_caption": "Chain DataTools cleaning steps into one repeatable workflow. The pipeline recommends an order; you stay in control."
+    },
+    "10_pdf_extractor": {
+      "name": "PDF to CSV",
+      "description": "Extract bank-statement transactions from PDFs using reusable per-source templates.",
+      "page_title": "PDF to CSV",
+      "page_caption": "Extract transaction tables from bank-statement PDFs. Build one template per source and reuse it for every statement that follows the same layout. Runs locally — your data never leaves this computer."
     }
   },
   "nav": {
diff --git a/src/i18n/packs/es.json b/src/i18n/packs/es.json
index 768e966..62ced16 100644
--- a/src/i18n/packs/es.json
+++ b/src/i18n/packs/es.json
@@ -158,6 +158,12 @@
       "description": "Encadena herramientas en el orden recomendado y pasa la salida entre pasos.",
       "page_title": "Flujos automatizados",
       "page_caption": "Encadena pasos de limpieza de DataTools en un flujo repetible. La canalización recomienda un orden; tú mantienes el control."
+    },
+    "10_pdf_extractor": {
+      "name": "PDF a CSV",
+      "description": "Extrae transacciones de extractos bancarios en PDF usando plantillas reutilizables por origen.",
+      "page_title": "PDF a CSV",
+      "page_caption": "Extrae tablas de transacciones de extractos bancarios en PDF. Crea una plantilla por origen y reutilízala para cada extracto que siga el mismo formato. Se ejecuta localmente — tus datos no salen de este equipo."
     }
   },
   "nav": {