feat(pdf): tool page with Extract / Build / Manage modes
Phase 3/6. Wires the PDF Extractor into the GUI as a new
"transformations" tool with three modes selected by a horizontal
radio at the top of the page:
**Extract** — pick a saved template, upload one or more
statement PDFs (single + batch shipping together to keep the
common case one-step), get a previewed DataFrame + CSV download.
Per-file row counts and warnings are surfaced; failures on one
file don't kill the whole batch. The combined CSV gets a
``source_file`` first column so the accountant can sort/filter
by statement.
**Build template** — load an existing template or start fresh,
upload a sample PDF, edit every schema field across four tabs
(Pages & table / Columns / Parsing / Save). A live preview below
re-runs ``apply_template`` against the sample on each re-render
so the user sees their changes hit rows immediately. The column-
boundary editor is a plain text input ("comma-separated x-positions")
for now; it will be replaced by the drawable-canvas visual picker in
commit 5.
**Manage templates** — list with rename / delete / export
(downloads the canonical JSON) / import (uploads someone else's
JSON, validated through ``template_from_json``).
Heavy work (``extract_pages_auto``) only runs on explicit user
action (Extract / a new sample upload), and the parsed Page list
is cached in ``st.session_state`` so widget-edit reruns don't
re-parse the PDF.
Logging: tool runs and template saves both hit the audit log via
``log_event("tool_run", …)``, matching every other tool's
instrumentation pattern.
Registered in ``tools_registry.py`` under ``transformations``
with status ``Ready`` and the picture-as-pdf Material icon. i18n
keys added for en + es ("PDF to CSV" / "PDF a CSV").
OCR is wired in this commit — ``extract_pages_auto`` already
falls back through ``pytesseract`` when the binary is available,
and the warning strings it returns surface as ``st.info`` /
``st.warning`` per-file. Commit 6 will polish the OCR UX with a
status row.
Next commits build on this page:
4 — batch progress + cancellation + per-file error grouping
5 — drawable-canvas visual picker replaces text x-positions
6 — OCR availability banner + scanned-page indicators
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
584
src/gui/pages/10_PDF_Extractor.py
Normal file
584
src/gui/pages/10_PDF_Extractor.py
Normal file
@@ -0,0 +1,584 @@
|
||||
"""PDF Extractor — extract bank-statement transactions to CSV.
|
||||
|
||||
Three modes:
|
||||
|
||||
- **Extract** (daily workflow): pick a saved template, upload a
|
||||
PDF, get a CSV preview + download.
|
||||
- **Build template**: upload a sample PDF, configure how the
|
||||
table is identified, save the template for reuse.
|
||||
- **Manage templates**: list / rename / delete / export / import.
|
||||
|
||||
The expensive step is ``extract_pages_auto`` (PDF I/O + word
|
||||
extraction + optional OCR). It runs only on explicit user action
|
||||
("Extract" / "Preview"), and results are stashed in session_state
|
||||
so re-renders from form-field edits don't re-parse the PDF. This keeps
heavy work off Streamlit's rerun-on-every-widget path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.audit import log_event, log_page_open
|
||||
from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||
from src.pdf_extract import apply_template, extract_pages_auto
|
||||
from src.pdf_templates import (
|
||||
SCHEMA_VERSION,
|
||||
VALID_TARGETS,
|
||||
delete_template,
|
||||
list_templates,
|
||||
load_template,
|
||||
new_template,
|
||||
save_template,
|
||||
slugify,
|
||||
template_from_json,
|
||||
template_to_json,
|
||||
validate_template,
|
||||
)
|
||||
|
||||
# Record the page visit in the audit log before any UI is drawn.
log_page_open("10_PDF_Extractor")

# The icon lives in ``assets/`` two directory levels above this file.
_ICON_PATH = str(
    Path(__file__).parents[1] / "assets" / "datatools_icon_256.png"
)

st.set_page_config(
    layout="wide",
    page_icon=_ICON_PATH,
    page_title="PDF to CSV · DataTools",
)

# Shared page furniture used by the other tool pages.
hide_streamlit_chrome()
render_sticky_footer()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Session-state keys (centralized so the build / extract flows agree on names)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Mode selector (radio at the top of the page).
K_MODE = "pdf_mode"

# Build-template flow: the template being edited plus the cached sample PDF.
K_CURRENT_TEMPLATE = "pdf_tpl_current"
K_SAMPLE_BYTES = "pdf_tpl_sample_bytes"
K_SAMPLE_NAME = "pdf_tpl_sample_name"
K_SAMPLE_PAGES = "pdf_tpl_sample_pages"

# Extract flow: results of the most recent extraction run.
K_EXTRACT_DF = "pdf_extract_df"
K_EXTRACT_WARNINGS = "pdf_extract_warnings"
K_EXTRACT_FILES = "pdf_extract_files"
|
||||
|
||||
|
||||
def _get_or_init(key: str, default):
    """Return ``st.session_state[key]``, storing *default* first if absent.

    Args:
        key: Session-state key to look up.
        default: Value stored under *key* when the key is not present yet.

    Returns:
        The value held in session state under *key* after the call.
    """
    state = st.session_state
    if key in state:
        return state[key]
    state[key] = default
    return state[key]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page header + mode selector
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.markdown("# PDF to CSV")

_PAGE_CAPTION = (
    "Extract transaction tables from bank-statement PDFs. Build one "
    "template per source (bank + account type), then reuse it for "
    "every statement that follows the same layout."
)
st.caption(_PAGE_CAPTION)

# The selected label drives the dispatch at the bottom of the file; the
# choice is kept in session state under K_MODE.
mode = st.radio(
    label="Mode",
    options=["Extract", "Build template", "Manage templates"],
    key=K_MODE,
    horizontal=True,
)

st.divider()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Extract mode
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
def _extract_uploads(
    tpl: dict, uploads: list
) -> tuple[pd.DataFrame, list[str], list[dict]]:
    """Apply *tpl* to every uploaded PDF and combine the rows.

    A failure on one file is recorded as a warning and a zero-row summary
    entry; it does not stop the remaining files from being processed.

    Args:
        tpl: Loaded template passed to ``apply_template``.
        uploads: Uploaded file objects from ``st.file_uploader``.

    Returns:
        ``(combined, warnings, files_meta)``: the concatenated rows (an empty
        DataFrame when no file succeeded), the per-file warning strings
        prefixed with ``[filename]``, and one summary dict per file.
    """
    frames: list[pd.DataFrame] = []
    warnings: list[str] = []
    files_meta: list[dict] = []
    total = len(uploads)

    progress = st.progress(0.0, text="Reading PDFs…")
    for done, up in enumerate(uploads, start=1):
        try:
            # getvalue() returns the whole buffer regardless of the stream
            # position, so a file object that was read before is not seen
            # as empty.
            pdf_bytes = up.getvalue()
            pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
            df = apply_template(pages, tpl)
            df.insert(0, "source_file", up.name)
            frames.append(df)
            files_meta.append({
                "file": up.name,
                "rows": len(df),
                "pages": len(pages),
            })
            for w in warns:
                warnings.append(f"[{up.name}] {w}")
        except Exception as e:
            warnings.append(
                f"[{up.name}] extraction failed: "
                f"{type(e).__name__}: {e}"
            )
            files_meta.append({
                "file": up.name, "rows": 0, "pages": 0, "error": str(e),
            })
        progress.progress(done / total, text=f"Read {done}/{total}")
    progress.empty()

    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return combined, warnings, files_meta


def _show_extract_results(df: pd.DataFrame, slug: str) -> None:
    """Render the per-file summary, warnings, row preview and CSV download.

    Args:
        df: Combined rows from the most recent extraction run.
        slug: Template slug used in the download file name.
    """
    warnings = st.session_state.get(K_EXTRACT_WARNINGS, []) or []
    files_meta = st.session_state.get(K_EXTRACT_FILES, []) or []

    if files_meta:
        st.markdown("#### Per-file summary")
        st.dataframe(
            pd.DataFrame(files_meta),
            hide_index=True,
            use_container_width=True,
        )
    if warnings:
        with st.expander(f"Warnings ({len(warnings)})", expanded=False):
            for w in warnings:
                st.warning(w)

    if df.empty:
        st.info(
            "No rows were extracted. Re-check the template's header "
            "text, column boundaries, and end markers in **Build "
            "template** mode against a sample PDF."
        )
        return

    st.markdown(f"#### Extracted rows ({len(df):,})")
    st.dataframe(df, hide_index=True, use_container_width=True)
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    st.download_button(
        "Download CSV",
        data=csv_bytes,
        file_name=f"transactions-{slug}-{ts}.csv",
        mime="text/csv",
        type="primary",
    )


def _render_extract_mode() -> None:
    """Extract mode: pick a template, upload PDFs, preview and download rows.

    Extraction runs only when the "Extract" button is pressed. Its results
    are kept in session state so later reruns redraw them without
    re-reading the PDFs.
    """
    templates = list_templates()
    if not templates:
        st.info(
            "No templates yet. Switch to **Build template** to create your "
            "first one — you'll need a sample PDF from the source bank."
        )
        return

    options = {f"{t['name']} · {t['slug']}": t["slug"] for t in templates}
    label = st.selectbox("Template", list(options.keys()))
    slug = options[label]

    uploads = st.file_uploader(
        "Statement PDF(s)",
        type=["pdf"],
        accept_multiple_files=True,
        help=(
            "Drop one or more statements from the same source. Rows from "
            "every file are combined into a single CSV, tagged with the "
            "source filename."
        ),
    )

    run = st.button("Extract", type="primary", disabled=not uploads)
    if run and uploads:
        try:
            tpl = load_template(slug)
        except Exception as e:
            st.error(f"Couldn't load template {slug!r}: {e}")
            return

        combined, all_warnings, files_meta = _extract_uploads(tpl, uploads)
        st.session_state[K_EXTRACT_DF] = combined
        st.session_state[K_EXTRACT_WARNINGS] = all_warnings
        st.session_state[K_EXTRACT_FILES] = files_meta

        log_event(
            "tool_run",
            "PDF Extractor run",
            page="10_PDF_Extractor",
            template=slug,
            files=len(uploads),
            rows=len(combined),
        )

    df = st.session_state.get(K_EXTRACT_DF)
    if isinstance(df, pd.DataFrame):
        # NOTE(review): ``slug`` is the template selected *now*; if the user
        # changes the selection after a run, the cached rows are still shown
        # and the download is named after the new selection. Consider
        # storing the slug alongside the results.
        _show_extract_results(df, slug)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Build-template mode
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
def _ensure_sample_loaded() -> bool:
    """Render the sample-PDF uploader and keep the parsed pages cached.

    The PDF is parsed with ``extract_pages_auto`` only when the uploaded
    file differs from the cached one, by name or by content. Comparing
    content as well means a different statement saved under the same
    filename is re-parsed and does not silently reuse stale pages.

    Returns:
        True when parsed sample pages are available in session state.
        False when nothing has been loaded yet or the upload could not be
        read (an error is shown in that case).
    """
    up = st.file_uploader(
        "Sample statement",
        type=["pdf"],
        help=(
            "Used to drive the live preview while you build the "
            "template — pick a representative statement from this "
            "source."
        ),
        key="pdf_tpl_sample_uploader",
    )
    if up is not None:
        # getvalue() does not depend on the stream position.
        pdf_bytes = up.getvalue()
        is_new_sample = (
            up.name != st.session_state.get(K_SAMPLE_NAME)
            or pdf_bytes != st.session_state.get(K_SAMPLE_BYTES)
        )
        if is_new_sample:
            try:
                pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True)
            except Exception as e:
                st.error(f"Couldn't read PDF: {type(e).__name__}: {e}")
                return False
            st.session_state[K_SAMPLE_BYTES] = pdf_bytes
            st.session_state[K_SAMPLE_NAME] = up.name
            st.session_state[K_SAMPLE_PAGES] = pages
            for w in warns:
                st.info(w)
    return bool(st.session_state.get(K_SAMPLE_PAGES))
|
||||
|
||||
|
||||
def _render_columns_editor(tpl: dict) -> None:
    """Edit the column boundaries and the source-index → target mapping.

    Mutates *tpl* in place: ``tpl["table"]["column_boundaries"]`` is
    replaced with the sorted floats parsed from the text input (left
    untouched when the input is not numeric), and ``tpl["columns"]`` is
    replaced with the rows whose "Keep" box is checked.

    Args:
        tpl: Template dict being edited; must contain a ``"table"`` dict.
    """

    def _format_boundary(value) -> str:
        # Show whole numbers without a trailing ".0" but keep fractional
        # positions intact, so editing does not silently truncate them.
        as_float = float(value)
        return str(int(as_float)) if as_float.is_integer() else repr(as_float)

    st.markdown("##### Columns")
    boundaries = list(tpl["table"].get("column_boundaries") or [])
    bounds_text = st.text_input(
        "Column boundaries (x-positions, comma-separated)",
        value=", ".join(_format_boundary(b) for b in boundaries),
        help=(
            "N boundaries create N+1 columns. The visual picker in "
            "the next phase will set these for you — until then you "
            "can read x-positions from the page-preview hover tip "
            "below, or trial-and-error against the live preview."
        ),
    )
    try:
        parsed = sorted(
            float(x.strip()) for x in bounds_text.split(",") if x.strip()
        )
    except ValueError:
        # Keep the previous boundaries so the rest of the editor still works.
        st.warning("Column boundaries must be numbers.")
    else:
        tpl["table"]["column_boundaries"] = parsed
        boundaries = parsed

    n_cols = len(boundaries) + 1
    st.caption(f"{n_cols} source column(s) defined.")

    # Column mapping: one row per output column the user wants.
    columns_state = tpl.get("columns") or []
    if not columns_state:
        # Seed a reasonable default the first time.
        columns_state = [
            {"source": 0, "target": "date"},
            {"source": 1, "target": "description"},
            {"source": 2, "target": "amount"},
        ][:n_cols]

    targets = ["date", "description", "amount", "amount_debit",
               "amount_credit", "balance", "type"]
    max_source = max(n_cols - 1, 0)
    new_columns: list[dict] = []
    # NOTE(review): widget keys are index-based, so when a row is dropped
    # via "Keep" the following rows inherit that row's widget state on the
    # next rerun (including its unchecked box). Stable per-row ids would
    # avoid this.
    for i, col in enumerate(columns_state):
        c1, c2, c3 = st.columns([2, 3, 1])
        src = c1.number_input(
            f"Source #{i + 1}",
            min_value=0,
            max_value=max_source,
            value=min(int(col.get("source", 0)), max_source),
            step=1,
            key=f"src_{i}",
        )
        tgt_default = col.get("target", "")
        if tgt_default not in targets:
            # Keep an unknown saved target selectable; do not drop it.
            targets_ext = targets + [tgt_default] if tgt_default else targets
        else:
            targets_ext = targets
        tgt = c2.selectbox(
            f"Target #{i + 1}",
            targets_ext,
            index=(targets_ext.index(tgt_default) if tgt_default in targets_ext else 0),
            key=f"tgt_{i}",
        )
        keep = c3.checkbox("Keep", value=True, key=f"keep_{i}")
        if keep:
            new_columns.append({"source": int(src), "target": tgt})

    add_clicked = st.button("+ Add column", key="add_col")
    if add_clicked:
        new_columns.append({"source": n_cols - 1 if n_cols else 0, "target": ""})

    # Persist BEFORE rerunning: st.rerun() stops the script immediately, so
    # anything assigned after it would be lost.
    tpl["columns"] = new_columns
    if add_clicked:
        st.rerun()
|
||||
|
||||
|
||||
def _pattern_lines_area(label: str, patterns, help_text: str) -> list[str]:
    """Edit a list of regex patterns as one-per-line text; return the list."""
    raw = st.text_area(
        label,
        value="\n".join(patterns or []),
        help=help_text,
        height=80,
    )
    return [entry.strip() for entry in raw.splitlines() if entry.strip()]


def _build_tab_pages_and_table(tpl: dict) -> None:
    """"Pages & table" tab: name, notes, page selection and table detection."""
    left, right = st.columns(2)

    with left:
        name = st.text_input("Template name", value=tpl.get("name", ""))
        tpl["name"] = name
        # The slug always follows the current name.
        tpl["slug"] = slugify(name)
        tpl["notes"] = st.text_area("Notes", value=tpl.get("notes", ""), height=70)

        pages_cfg = tpl["pages"]
        pages_cfg["range"] = st.text_input(
            "Pages",
            value=pages_cfg.get("range", "all"),
            help='"all", "1-3", "2,4", "3-" all work.',
        )
        pages_cfg["skip_matching"] = st.text_input(
            "Skip pages matching (regex, optional)",
            value=pages_cfg.get("skip_matching", ""),
            help='e.g. "Page \\d+ of" to skip cover pages.',
        )

    with right:
        table_cfg = tpl["table"]
        table_cfg["header_text"] = st.text_input(
            "Header text (transactions table)",
            value=table_cfg.get("header_text", ""),
            help=(
                "Words from the header row of the transactions table, "
                "e.g. \"Date Description Amount Balance\". Extraction "
                "starts on the row AFTER this match."
            ),
        )
        table_cfg["end_markers"] = _pattern_lines_area(
            "End markers (one regex per line)",
            table_cfg.get("end_markers"),
            'e.g. "Closing balance", "Page \\d+ of".',
        )
        table_cfg["skip_rows_matching"] = _pattern_lines_area(
            "Skip rows matching (one regex per line, optional)",
            table_cfg.get("skip_rows_matching"),
            'Common entries: "Total", "Subtotal", "^Page ".',
        )
        table_cfg["y_tolerance"] = st.number_input(
            "Row y-tolerance (pts)",
            min_value=0.5,
            max_value=20.0,
            value=float(table_cfg.get("y_tolerance", 3.0)),
            step=0.5,
            help=(
                "How close two words' y-positions must be to be on the "
                "same row. Bump up if rows are getting split, down if "
                "rows are merging."
            ),
        )


def _build_tab_parsing(tpl: dict) -> None:
    """"Parsing" tab: date format, number formatting and row-merging flags."""
    parse_cfg = tpl["parse"]
    left, right = st.columns(2)

    with left:
        parse_cfg["date_format"] = st.text_input(
            "Date format",
            value=parse_cfg.get("date_format", "%m/%d/%Y"),
            help=(
                "Python strftime format. Common: %m/%d/%Y (US), "
                "%d/%m/%Y (EU), %Y-%m-%d (ISO)."
            ),
        )
        parse_cfg["currency_strip"] = st.text_input(
            "Currency symbols to strip",
            value=parse_cfg.get("currency_strip", "$"),
        )
        parse_cfg["decimal_separator"] = st.text_input(
            "Decimal separator",
            value=parse_cfg.get("decimal_separator", "."),
            max_chars=1,
        )
        parse_cfg["thousands_separator"] = st.text_input(
            "Thousands separator",
            value=parse_cfg.get("thousands_separator", ","),
            max_chars=1,
        )

    with right:
        parse_cfg["amount_negative_in_parens"] = st.checkbox(
            "Parens = negative amount",
            value=bool(parse_cfg.get("amount_negative_in_parens", True)),
        )
        parse_cfg["merge_multiline_description"] = st.checkbox(
            "Merge multi-line descriptions",
            value=bool(parse_cfg.get("merge_multiline_description", True)),
            help=(
                "Rows with no date attach to the previous row's "
                "description — handles wrapped vendor names."
            ),
        )


def _build_tab_save(tpl: dict) -> None:
    """"Save" tab: show validation errors and save the template on request."""
    ok, errors = validate_template(tpl)
    for err in errors or ():
        st.error(err)

    button_col, hint_col = st.columns([1, 3])
    with button_col:
        save_btn = st.button("Save template", type="primary", disabled=not ok)
    with hint_col:
        st.caption(
            f"Will save as: ``{tpl.get('slug') or '—'}`` "
            f"(folder: ``~/.datatools/pdf_templates/``)"
        )

    if not save_btn:
        return
    try:
        slug = save_template(tpl)
        st.success(f"Saved as **{slug}**. Switch to Extract mode to use it.")
        log_event(
            "tool_run",
            "PDF Extractor template saved",
            page="10_PDF_Extractor",
            template=slug,
        )
    except Exception as e:
        st.error(f"Save failed: {e}")


def _render_build_form(tpl: dict) -> None:
    """Render every editable template field, grouped into four tabs.

    Each widget writes its current value straight back into *tpl*, so the
    dict reflects the form after every rerun.
    """
    tab_table, tab_columns, tab_parsing, tab_save = st.tabs(
        ["Pages & table", "Columns", "Parsing", "Save"]
    )
    with tab_table:
        _build_tab_pages_and_table(tpl)
    with tab_columns:
        _render_columns_editor(tpl)
    with tab_parsing:
        _build_tab_parsing(tpl)
    with tab_save:
        _build_tab_save(tpl)
|
||||
|
||||
|
||||
def _render_preview(tpl: dict) -> None:
    """Show a live preview of *tpl* applied to the cached sample pages.

    Renders nothing when no sample pages are cached. At most the first 50
    rows are displayed.
    """
    sample_pages = st.session_state.get(K_SAMPLE_PAGES)
    if sample_pages:
        st.divider()
        st.markdown("##### Live preview")
        try:
            preview = apply_template(sample_pages, tpl)
        except Exception as exc:
            st.error(f"Preview failed: {type(exc).__name__}: {exc}")
        else:
            if preview.empty:
                st.info(
                    "Template doesn't match any rows yet. Common fixes: tighten "
                    "the header text, add an end marker, adjust column "
                    "boundaries."
                )
            else:
                st.caption(
                    f"{len(preview)} row(s) from {len(sample_pages)} page(s)"
                )
                st.dataframe(
                    preview.head(50), hide_index=True, use_container_width=True
                )
|
||||
|
||||
|
||||
def _render_build_mode() -> None:
    """Build-template mode: choose or create a template, then edit it.

    The form and live preview are shown only once a sample PDF has been
    loaded; until then an explanatory hint is displayed.
    """
    scratch_label = "— start from scratch —"
    saved = list_templates()

    # Toolbar: optionally load an existing template, or reset to a new one.
    pick_col, load_col, new_col = st.columns([2, 2, 1])
    with pick_col:
        choices = [scratch_label]
        choices.extend(f"{t['name']} · {t['slug']}" for t in saved)
        picked = st.selectbox("Load existing", choices, key="build_load_pick")
    with load_col:
        load_clicked = st.button(
            "Load", disabled=picked == scratch_label, key="build_load_btn"
        )
        if load_clicked:
            # The slug is the part of the label after the last separator.
            slug = picked.rsplit(" · ", 1)[-1]
            try:
                st.session_state[K_CURRENT_TEMPLATE] = load_template(slug)
                st.rerun()
            except Exception as e:
                st.error(f"Load failed: {e}")
    with new_col:
        if st.button("New", key="build_new_btn"):
            st.session_state[K_CURRENT_TEMPLATE] = new_template("New template")
            st.rerun()

    tpl = _get_or_init(K_CURRENT_TEMPLATE, new_template("New template"))

    if _ensure_sample_loaded():
        _render_build_form(tpl)
        _render_preview(tpl)
    else:
        st.info(
            "Upload a sample statement from this source to drive the live "
            "preview. Your template is built against the sample's layout."
        )
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Manage-templates mode
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
def _render_manage_mode() -> None:
    """Manage-templates mode: import a template JSON, export or delete.

    An uploaded JSON file is imported once per distinct upload. The
    uploader keeps returning the same file on every rerun, so the last
    imported upload is remembered in session state. This avoids
    re-importing it (and re-triggering a rerun) forever.
    """
    # Session key remembering the (name, content) of the last import.
    last_import_key = "pdf_manage_last_import"

    st.markdown("##### Import a template")
    up = st.file_uploader(
        "Template JSON",
        type=["json"],
        key="manage_import_uploader",
        help="Paste a colleague's exported JSON file here to add it to your library.",
    )
    if up is None:
        # Uploader cleared: allow the same file to be imported again later.
        st.session_state.pop(last_import_key, None)
    else:
        raw = up.getvalue()
        marker = (up.name, raw)
        if st.session_state.get(last_import_key) != marker:
            try:
                imported = template_from_json(raw.decode("utf-8"))
                save_template(imported)
                message = (
                    f"Imported **{imported['name']}** (slug `{imported['slug']}`)."
                )
            except Exception as e:
                st.error(f"Import failed: {e}")
            else:
                st.session_state[last_import_key] = marker
                st.success(message)

    st.divider()
    st.markdown("##### Existing templates")

    # Listed after the import so a freshly imported template shows up
    # without needing a rerun.
    templates = list_templates()
    if not templates:
        st.caption("No templates yet — build one in **Build template** mode.")
        return

    for t in templates:
        slug = t["slug"]
        with st.container(border=True):
            c1, c2, c3, c4 = st.columns([3, 3, 2, 2])
            with c1:
                st.markdown(f"**{t['name']}**")
                st.caption(f"`{slug}`")
            with c2:
                st.caption(f"Updated: {t.get('updated_at', '—')}")
                if t.get("notes"):
                    st.caption(t["notes"])
            with c3:
                try:
                    full = load_template(slug)
                    payload = template_to_json(full)
                    st.download_button(
                        "Export",
                        data=payload.encode("utf-8"),
                        file_name=f"{slug}.json",
                        mime="application/json",
                        key=f"export_{slug}",
                    )
                except Exception as e:
                    st.error(f"Read failed: {e}")
            with c4:
                if st.button("Delete", key=f"del_{slug}"):
                    try:
                        delete_template(slug)
                    except Exception as e:
                        st.error(f"Delete failed: {e}")
                    else:
                        st.success(f"Deleted `{slug}`.")
                        # Rerun so the list is redrawn without this entry.
                        st.rerun()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Dispatch
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
# Map each radio label to the function that renders that mode.
_MODE_RENDERERS = {
    "Extract": _render_extract_mode,
    "Build template": _render_build_mode,
    "Manage templates": _render_manage_mode,
}

_render_selected_mode = _MODE_RENDERERS.get(mode)
if _render_selected_mode is not None:
    _render_selected_mode()
|
||||
@@ -145,6 +145,18 @@ TOOLS: list[Tool] = [
|
||||
status="Ready",
|
||||
section="automations",
|
||||
),
|
||||
Tool(
|
||||
tool_id="10_pdf_extractor",
|
||||
icon=":material/picture_as_pdf:",
|
||||
name="PDF to CSV",
|
||||
description=(
|
||||
"Extract bank-statement transactions from PDFs using reusable "
|
||||
"per-source templates."
|
||||
),
|
||||
page_slug="10_PDF_Extractor",
|
||||
status="Ready",
|
||||
section="transformations",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -158,6 +158,12 @@
|
||||
"description": "Chain tools in recommended order and pass output between steps.",
|
||||
"page_title": "Automated Workflows",
|
||||
"page_caption": "Chain DataTools cleaning steps into one repeatable workflow. The pipeline recommends an order; you stay in control."
|
||||
},
|
||||
"10_pdf_extractor": {
|
||||
"name": "PDF to CSV",
|
||||
"description": "Extract bank-statement transactions from PDFs using reusable per-source templates.",
|
||||
"page_title": "PDF to CSV",
|
||||
"page_caption": "Extract transaction tables from bank-statement PDFs. Build one template per source and reuse it for every statement that follows the same layout. Runs locally — your data never leaves this computer."
|
||||
}
|
||||
},
|
||||
"nav": {
|
||||
|
||||
@@ -158,6 +158,12 @@
|
||||
"description": "Encadena herramientas en el orden recomendado y pasa la salida entre pasos.",
|
||||
"page_title": "Flujos automatizados",
|
||||
"page_caption": "Encadena pasos de limpieza de DataTools en un flujo repetible. La canalización recomienda un orden; tú mantienes el control."
|
||||
},
|
||||
"10_pdf_extractor": {
|
||||
"name": "PDF a CSV",
|
||||
"description": "Extrae transacciones de extractos bancarios en PDF usando plantillas reutilizables por origen.",
|
||||
"page_title": "PDF a CSV",
|
||||
"page_caption": "Extrae tablas de transacciones de extractos bancarios en PDF. Crea una plantilla por origen y reutilízala para cada extracto que siga el mismo formato. Se ejecuta localmente — tus datos no salen de este equipo."
|
||||
}
|
||||
},
|
||||
"nav": {
|
||||
|
||||
Reference in New Issue
Block a user