diff --git a/build/datatools.spec b/build/datatools.spec index b5d3268..1469fea 100644 --- a/build/datatools.spec +++ b/build/datatools.spec @@ -58,15 +58,12 @@ hidden_imports += collect_submodules("charset_normalizer") hidden_imports += collect_submodules("openpyxl") hidden_imports += collect_submodules("loguru") -# PDF Extractor stack. ``streamlit_drawable_canvas`` and -# ``pypdfium2`` both have their own PyInstaller hooks under -# ``build/hooks/`` that pull in the native binary + frontend -# assets — keep the ``collect_submodules`` calls here for -# belt-and-braces. +# PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook +# under ``build/hooks/`` that pulls in the native PDFium binary — +# keep the ``collect_submodules`` calls here for belt-and-braces. hidden_imports += collect_submodules("pdfplumber") hidden_imports += collect_submodules("pdfminer") hidden_imports += collect_submodules("pypdfium2") -hidden_imports += collect_submodules("streamlit_drawable_canvas") hidden_imports += collect_submodules("PIL") hidden_imports += collect_submodules("pytesseract") @@ -91,13 +88,10 @@ datas += collect_data_files("phonenumbers", include_py_files=False) # PDF Extractor data files. ``pypdfium2`` ships a native PDFium # shared library (``.dll`` / ``.so`` / ``.dylib``) under its package -# dir; ``streamlit-drawable-canvas`` ships a built JS bundle that -# Streamlit serves from the package dir at runtime; pdfminer ships -# the Adobe CMap tables it uses for character mapping. Hooks -# under ``build/hooks/`` mirror these calls for explicit -# documentation and survive ``collect_data_files`` regressions. +# dir; ``pdfminer`` ships the Adobe CMap tables it uses for +# character mapping. The drawable-canvas frontend bundle is gone +# now that the visual picker was removed. datas += collect_data_files("pypdfium2", include_py_files=False) -datas += collect_data_files("streamlit_drawable_canvas") datas += collect_data_files("pdfminer", include_py_files=False) # Our application files. PyInstaller's bundler treats source as code diff --git a/build/hooks/hook-streamlit_drawable_canvas.py b/build/hooks/hook-streamlit_drawable_canvas.py deleted file mode 100644 index 17483ab..0000000 --- a/build/hooks/hook-streamlit_drawable_canvas.py +++ /dev/null @@ -1,19 +0,0 @@ -"""PyInstaller hook for streamlit-drawable-canvas. - -Streamlit components are Python packages that also ship a built -JavaScript/CSS bundle Streamlit serves from disk at component- -render time. Without those assets in the bundle the canvas -iframe loads blank — the user sees the page render fine but the -visual picker shows no image and no drawing controls. - -``collect_data_files`` covers the frontend bundle directory -(named ``frontend`` or ``frontend/build`` depending on the -component version). Hidden imports are picked up by the main -spec's ``collect_submodules`` call, repeated here for the same -belt-and-braces reason as ``hook-pypdfium2.py``. -""" - -from PyInstaller.utils.hooks import collect_data_files, collect_submodules - -datas = collect_data_files("streamlit_drawable_canvas") -hiddenimports = collect_submodules("streamlit_drawable_canvas") diff --git a/requirements.txt b/requirements.txt index 226b13d..26e662c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,10 +10,14 @@ phonenumbers>=8.13,<9 streamlit>=1.35,<2 cryptography>=41,<49 # PDF Extractor stack — pinned to exact tested versions so a future -# upstream release can't change the visual picker's coordinate model -# or pdfplumber's word-position behavior mid-build. Bump these +# upstream release can't quietly change pdfplumber's word-position +# behavior or pypdfium2's OCR rendering mid-build. Bump these # explicitly when re-testing against a new release. +# +# ``pypdfium2`` is here for the OCR fallback path only (rasterizing +# pages to images for Tesseract). The drawable-canvas dep was +# removed when the visual picker was ripped out — the scanner is +# pure heuristic now, no coordinate UI. pdfplumber==0.11.9 pypdfium2==5.8.0 pytesseract==0.3.13 -streamlit-drawable-canvas==0.9.3 diff --git a/src/gui/_drawable_canvas_compat.py b/src/gui/_drawable_canvas_compat.py deleted file mode 100644 index e6b2258..0000000 --- a/src/gui/_drawable_canvas_compat.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Compatibility shim for streamlit-drawable-canvas on modern Streamlit. - -``streamlit-drawable-canvas`` 0.9.3 (last release 2023) calls -``streamlit.elements.image.image_to_url(image, width, clamp, -channels, output_format, image_id)``. Streamlit ~1.30+ moved this -helper out of ``streamlit.elements.image`` and changed its -signature so the second positional argument is now a -``LayoutConfig`` dataclass instead of a plain ``int`` width. - -The canvas package hasn't been updated, so on modern Streamlit -its very first call fails with:: - - AttributeError: module 'streamlit.elements.image' - has no attribute 'image_to_url' - -This module re-attaches a wrapper at the old import path that -adapts the old call shape to the new function. Import it once -before any ``st_canvas`` call; idempotent. - -The shim is opt-in (not auto-installed at module import) so the -audit log of "I patched a third-party internal" is visible in -``grep`` rather than silently happening on every page load. -""" - -from __future__ import annotations - - -_PATCHED = False - - -def install() -> None: - """Install the ``image_to_url`` compatibility shim. - - Idempotent — safe to call multiple times. Returns silently - if the canvas package or Streamlit can't be imported (lets - the caller handle the "PDF deps missing" path on its own). - """ - global _PATCHED - if _PATCHED: - return - - try: - import streamlit.elements.image as _old_image_module - except ImportError: - return - - # Already present (old Streamlit, or already shimmed) — bail. - if hasattr(_old_image_module, "image_to_url"): - _PATCHED = True - return - - try: - from streamlit.elements.lib.image_utils import ( - image_to_url as _new_image_to_url, - ) - from streamlit.elements.lib.layout_utils import LayoutConfig - except ImportError: - # ``image_to_url`` is in some other location we don't know - # about yet — let the canvas surface its own error so we - # learn where to look. Don't fail silently. - return - - def _shim( - image, - width, - clamp, - channels, - output_format, - image_id, - ) -> str: - """Old API → new API. The old ``width=-1`` sentinel meant - "use the image's natural width", which is also the new - function's default behavior when ``LayoutConfig`` is left - unconfigured.""" - layout = LayoutConfig() - return _new_image_to_url( - image, - layout, - clamp, - channels, - output_format, - image_id, - ) - - _old_image_module.image_to_url = _shim - _PATCHED = True diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index aefb50f..e268ddb 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -1,23 +1,13 @@ -"""PDF Extractor — extract bank-statement transactions to CSV. +"""PDF to CSV — heuristic transaction scanner. -Three modes: - -- **Extract** (daily workflow): pick a saved template, upload a - PDF, get a CSV preview + download. -- **Build template**: upload a sample PDF, configure how the - table is identified, save the template for reuse. -- **Manage templates**: list / rename / delete / export / import. - -The expensive step is ``extract_pages_auto`` (PDF I/O + word -extraction + optional OCR). It runs only on explicit user action -("Extract" / "Preview"), and results are stashed in session_state -so re-renders from form-field edits don't re-parse the PDF. Heavy -work off Streamlit's rerun-on-every-widget path. +Upload one or more bank-statement PDFs, scan for transaction-like +rows ([date] [description] [amount]), uncheck the rows you don't +want, download as CSV. No templates, no per-bank configuration, +no coordinate picking. """ from __future__ import annotations -import io import sys from datetime import datetime from pathlib import Path @@ -30,26 +20,17 @@ if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.audit import log_event, log_page_open -from src.gui._drawable_canvas_compat import install as _install_canvas_compat from src.gui.components import hide_streamlit_chrome, render_sticky_footer from src.pdf_extract import ( PdfDependencyMissing, - apply_template, - extract_pages_auto, ocr_available, - render_page_image, + scan_pdf_for_transactions, ) -# streamlit-drawable-canvas 0.9.3 calls a Streamlit internal -# (``image_to_url``) that was relocated in Streamlit ~1.30+. The -# shim re-attaches the old import path with a signature adapter. -# See ``src/gui/_drawable_canvas_compat.py`` for the why. -_install_canvas_compat() - def _pdf_deps_status() -> tuple[bool, list[str]]: - """Probe each runtime PDF dep without forcing the user to hit the - extract button. Returns ``(ok, missing_names)``.""" + """Probe each runtime PDF dep without forcing the user to hit + the Scan button. Returns ``(ok, missing_names)``.""" missing: list[str] = [] for name in ("pdfplumber", "pypdfium2"): try: @@ -59,20 +40,6 @@ def _pdf_deps_status() -> tuple[bool, list[str]]: return (not missing), missing -from src.pdf_templates import ( - SCHEMA_VERSION, - VALID_TARGETS, - delete_template, - list_templates, - load_template, - new_template, - save_template, - slugify, - template_from_json, - template_to_json, - validate_template, -) - log_page_open("10_PDF_Extractor") _ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png") @@ -84,36 +51,25 @@ st.set_page_config( hide_streamlit_chrome() render_sticky_footer() - # --------------------------------------------------------------------------- -# Session-state keys (centralized so the build / extract flows agree on names) +# Session-state keys # --------------------------------------------------------------------------- -K_MODE = "pdf_mode" -K_CURRENT_TEMPLATE = "pdf_tpl_current" -K_SAMPLE_BYTES = "pdf_tpl_sample_bytes" -K_SAMPLE_NAME = "pdf_tpl_sample_name" -K_SAMPLE_PAGES = "pdf_tpl_sample_pages" -K_EXTRACT_DF = "pdf_extract_df" -K_EXTRACT_WARNINGS = "pdf_extract_warnings" -K_EXTRACT_FILES = "pdf_extract_files" - - -def _get_or_init(key: str, default): - if key not in st.session_state: - st.session_state[key] = default - return st.session_state[key] +K_ROWS = "pdf_scan_rows" +K_WARNINGS = "pdf_scan_warnings" +K_SOURCE_COUNT = "pdf_scan_source_count" # --------------------------------------------------------------------------- -# Page header + mode selector +# Header + dep guard # --------------------------------------------------------------------------- st.markdown("# PDF to CSV") st.caption( - "Extract transaction tables from bank-statement PDFs. Build one " - "template per source (bank + account type), then reuse it for " - "every statement that follows the same layout." + "Scan bank-statement PDFs for transaction rows " + "(``[date] [description] [amount]``). Review the table, uncheck " + "rows you don't want, edit any cell that needs fixing, then " + "download as CSV. No per-bank setup." ) _pdf_ok, _pdf_missing = _pdf_deps_status() @@ -122,1011 +78,218 @@ if not _pdf_ok: "**PDF dependencies are not installed.** " f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n" "Install them into the same Python that launches DataTools:\n\n" - "```\npip install pdfplumber pypdfium2 " - "streamlit-drawable-canvas pytesseract\n```\n\n" - "Then **fully restart the launcher** to pick up the new modules. " - "(Templates you've already saved are unaffected.)" + "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n" + "Then **fully restart the launcher** to pick up the new modules." ) st.stop() -_ocr_ok, _ocr_reason = ocr_available() -c_mode, c_ocr = st.columns([3, 2]) -with c_mode: - mode = st.radio( - "Mode", - ["Extract", "Build template", "Manage templates"], - horizontal=True, - key=K_MODE, - label_visibility="collapsed", - ) -with c_ocr: - if _ocr_ok: - st.caption("**OCR:** ready · scanned pages will be transcribed.") - else: - import platform as _platform - _os_name = _platform.system() - with st.expander("**OCR:** unavailable", expanded=False): - st.markdown( - f"**Reason:** {_ocr_reason or 'unknown'}.\n\n" - "Scanned (image-based) statements will fall through " - "with warnings. Most modern bank statements are text-" - "based and don't need OCR — only install Tesseract if " - "your statements actually come through as images." - ) - if _os_name == "Windows": - st.markdown( - "**Install on Windows:**\n" - "1. Download the installer from " - "[UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) " - "(look for ``tesseract-ocr-w64-setup-…``).\n" - "2. Run it. Keep the **\"Add tesseract to system " - "PATH\"** checkbox on during setup.\n" - "3. Restart the DataTools launcher.\n\n" - "If you installed without PATH and don't want to " - "reinstall, point DataTools at the binary directly " - "by setting the ``DATATOOLS_TESSERACT_PATH`` env " - "var to ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` " - "before launching." - ) - elif _os_name == "Darwin": - st.markdown( - "**Install on macOS:** ``brew install tesseract`` " - "(requires [Homebrew](https://brew.sh)). Restart " - "the DataTools launcher afterward." - ) - else: - st.markdown( - "**Install on Linux:** ``sudo apt install " - "tesseract-ocr`` (Debian/Ubuntu) or your distro's " - "equivalent (``dnf``, ``pacman``, …). Restart the " - "DataTools launcher afterward." - ) -st.divider() +# --------------------------------------------------------------------------- +# Options + upload +# --------------------------------------------------------------------------- - -# =========================================================================== -# Extract mode -# =========================================================================== - - -def _render_extract_mode() -> None: - templates = list_templates() - if not templates: - st.info( - "No templates yet. Switch to **Build template** to create your " - "first one — you'll need a sample PDF from the source bank." - ) - return - - options = {f"{t['name']} · {t['slug']}": t["slug"] for t in templates} - label = st.selectbox("Template", list(options.keys())) - slug = options[label] - - uploads = st.file_uploader( - "Statement PDF(s)", - type=["pdf"], - accept_multiple_files=True, - help=( - "Drop one or more statements from the same source. Rows from " - "every file are combined into a single CSV, tagged with the " - "source filename." - ), - ) - - c1, c2, c3 = st.columns(3) - sort_by_date = c1.checkbox( - "Sort combined output by date", +with st.expander("Scan options", expanded=False): + c1, c2 = st.columns(2) + negative_in_parens = c1.checkbox( + "Treat (4.50) as negative", value=True, help=( - "Sorts the combined CSV ascending by the ``date`` column " - "after extraction. Off → preserve per-PDF order." + "Bank statements commonly show withdrawals as ``(4.50)``. " + "Off if your statements use a different convention." ), ) - output_shape = c2.radio( - "Output", - ["Combined CSV", "ZIP of per-PDF CSVs"], - horizontal=True, - help=( - "Combined: one CSV with a ``source_file`` column. " - "ZIP: one CSV per source PDF, useful when feeding files " - "back into separate ledgers." - ), - ) - use_ocr = c3.checkbox( + _ocr_ok, _ocr_reason = ocr_available() + use_ocr = c2.checkbox( "Use OCR for scanned pages", value=_ocr_ok, disabled=not _ocr_ok, help=( - "When a page has no extractable text (typically a scan), " - "OCR it with Tesseract. Disabled when OCR isn't installed." + f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. " + "Most modern bank PDFs are text-based and don't need OCR — " + "only enable for image-based scans." ), ) - run = st.button("Extract", type="primary", disabled=not uploads) - if run and uploads: - try: - tpl = load_template(slug) - except Exception as e: - st.error(f"Couldn't load template {slug!r}: {e}") - return +uploads = st.file_uploader( + "PDF file(s)", + type=["pdf"], + accept_multiple_files=True, + help="Drop one or more bank-statement PDFs. Multi-file batches " + "are merged into a single table with a ``source_file`` column.", +) - per_file_frames: list[pd.DataFrame] = [] - all_warnings: list[str] = [] - files_meta: list[dict] = [] - with st.status( - f"Extracting {len(uploads)} file(s)…", - expanded=True, - ) as status: - for i, up in enumerate(uploads, start=1): - st.write(f"**{i}/{len(uploads)}** · {up.name}") - try: - pdf_bytes = up.read() - pages, warns = extract_pages_auto( - pdf_bytes, allow_ocr=use_ocr, - ) - df = apply_template(pages, tpl) - df.insert(0, "source_file", up.name) - per_file_frames.append(df) - files_meta.append({ - "file": up.name, - "pages": len(pages), - "rows": len(df), - "warnings": len(warns), - "status": "ok" if len(df) else "no rows", - }) - for w in warns: - all_warnings.append(f"[{up.name}] {w}") - except Exception as e: - all_warnings.append( - f"[{up.name}] extraction failed: " - f"{type(e).__name__}: {e}" - ) - files_meta.append({ - "file": up.name, - "pages": 0, - "rows": 0, - "warnings": 1, - "status": f"error: {type(e).__name__}", - }) - ok_count = sum(1 for m in files_meta if m["status"] == "ok") - status.update( - label=f"Done · {ok_count}/{len(uploads)} extracted", - state="complete" if ok_count == len(uploads) else "error", - expanded=False, - ) +scan_clicked = st.button( + "Scan", type="primary", disabled=not uploads, +) - if per_file_frames: - combined = pd.concat(per_file_frames, ignore_index=True) - if sort_by_date and "date" in combined.columns: - combined = combined.sort_values( - by=["date", "source_file"], - kind="mergesort", - na_position="last", - ).reset_index(drop=True) - else: - combined = pd.DataFrame() - st.session_state[K_EXTRACT_DF] = combined - st.session_state[K_EXTRACT_WARNINGS] = all_warnings - st.session_state[K_EXTRACT_FILES] = files_meta - st.session_state["pdf_extract_output_shape"] = output_shape - st.session_state["pdf_extract_per_file"] = [ - (m["file"], per_file_frames[i]) - for i, m in enumerate(files_meta) - if m["status"] == "ok" - ] - log_event( - "tool_run", - "PDF Extractor run", - page="10_PDF_Extractor", - template=slug, - files=len(uploads), - rows=len(combined), - output_shape=output_shape, - ) +# --------------------------------------------------------------------------- +# Scan +# --------------------------------------------------------------------------- - df = st.session_state.get(K_EXTRACT_DF) - if isinstance(df, pd.DataFrame): - warnings = st.session_state.get(K_EXTRACT_WARNINGS, []) or [] - files_meta = st.session_state.get(K_EXTRACT_FILES, []) or [] - if files_meta: - st.markdown("#### Per-file summary") - st.dataframe( - pd.DataFrame(files_meta), - hide_index=True, - use_container_width=True, - ) - if warnings: - with st.expander(f"Warnings ({len(warnings)})", expanded=False): - for w in warnings: - st.warning(w) - - if df.empty: - st.info( - "No rows were extracted. Re-check the template's header " - "text, column boundaries, and end markers in **Build " - "template** mode against a sample PDF." - ) - else: - st.markdown(f"#### Extracted rows ({len(df):,})") - st.dataframe(df, hide_index=True, use_container_width=True) - ts = datetime.now().strftime("%Y%m%d-%H%M%S") - output_shape = st.session_state.get( - "pdf_extract_output_shape", "Combined CSV", - ) - if output_shape == "ZIP of per-PDF CSVs": - import zipfile - per_file = st.session_state.get("pdf_extract_per_file") or [] - if not per_file: - st.warning("No per-file CSVs to bundle.") - else: - buf = io.BytesIO() - with zipfile.ZipFile( - buf, "w", zipfile.ZIP_DEFLATED, - ) as zf: - for name, sub_df in per_file: - stem = Path(name).stem or "transactions" - zf.writestr( - f"{stem}.csv", - sub_df.to_csv(index=False), - ) - st.download_button( - f"Download ZIP ({len(per_file)} files)", - data=buf.getvalue(), - file_name=f"transactions-{slug}-{ts}.zip", - mime="application/zip", - type="primary", - ) - else: - csv_bytes = df.to_csv(index=False).encode("utf-8") - st.download_button( - "Download CSV", - data=csv_bytes, - file_name=f"transactions-{slug}-{ts}.csv", - mime="text/csv", - type="primary", +if scan_clicked and uploads: + all_rows: list[dict] = [] + all_warnings: list[str] = [] + with st.status( + f"Scanning {len(uploads)} file(s)…", + expanded=True, + ) as status: + for i, up in enumerate(uploads, start=1): + st.write(f"**{i}/{len(uploads)}** · {up.name}") + try: + rows, warns = scan_pdf_for_transactions( + up.read(), + negative_in_parens=negative_in_parens, + allow_ocr=use_ocr, ) - - -# =========================================================================== -# Build-template mode -# =========================================================================== - - -def _ensure_sample_loaded() -> bool: - """Side-bar uploader for the sample PDF. Returns True if a sample - is loaded and parsed (pages cached in session_state).""" - up = st.file_uploader( - "Sample statement", - type=["pdf"], - help=( - "Used to drive the live preview while you build the " - "template — pick a representative statement from this " - "source." - ), - key="pdf_tpl_sample_uploader", - ) - if up is not None and up.name != st.session_state.get(K_SAMPLE_NAME): - pdf_bytes = up.read() - try: - pages, warns = extract_pages_auto(pdf_bytes, allow_ocr=True) - except Exception as e: - st.error(f"Couldn't read PDF: {type(e).__name__}: {e}") - return False - st.session_state[K_SAMPLE_BYTES] = pdf_bytes - st.session_state[K_SAMPLE_NAME] = up.name - st.session_state[K_SAMPLE_PAGES] = pages - for w in warns: - st.info(w) - return bool(st.session_state.get(K_SAMPLE_PAGES)) - - -def _render_columns_editor(tpl: dict) -> None: - """Edit the column mapping (source index → target field) and the - boundary x-positions in one place.""" - st.markdown("##### Columns") - boundaries = list(tpl["table"].get("column_boundaries") or []) - bounds_text = st.text_input( - "Column boundaries (x-positions, comma-separated)", - value=", ".join(str(int(b)) for b in boundaries), - help=( - "N boundaries create N+1 columns. The visual picker in " - "the next phase will set these for you — until then you " - "can read x-positions from the page-preview hover tip " - "below, or trial-and-error against the live preview." - ), - ) - try: - tpl["table"]["column_boundaries"] = sorted( - float(x.strip()) for x in bounds_text.split(",") if x.strip() - ) - except ValueError: - st.warning("Column boundaries must be numbers.") - - n_cols = len(tpl["table"]["column_boundaries"]) + 1 - st.caption(f"{n_cols} source column(s) defined.") - - # Column mapping: one row per output column the user wants. - columns_state = tpl.get("columns") or [] - if not columns_state: - # Seed a reasonable default the first time. - columns_state = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ][:n_cols] - - targets = ["date", "description", "amount", "amount_debit", - "amount_credit", "balance", "type"] - new_columns: list[dict] = [] - for i, col in enumerate(columns_state): - c1, c2, c3 = st.columns([2, 3, 1]) - src = c1.number_input( - f"Source #{i + 1}", - min_value=0, - max_value=max(n_cols - 1, 0), - value=min(int(col.get("source", 0)), max(n_cols - 1, 0)), - step=1, - key=f"src_{i}", - ) - tgt_default = col.get("target", "") - if tgt_default not in targets: - targets_ext = targets + [tgt_default] if tgt_default else targets - else: - targets_ext = targets - tgt = c2.selectbox( - f"Target #{i + 1}", - targets_ext, - index=(targets_ext.index(tgt_default) if tgt_default in targets_ext else 0), - key=f"tgt_{i}", - ) - keep = c3.checkbox("Keep", value=True, key=f"keep_{i}") - if keep: - new_columns.append({"source": int(src), "target": tgt}) - - if st.button("+ Add column", key="add_col"): - new_columns.append({"source": n_cols - 1 if n_cols else 0, "target": ""}) - st.rerun() - tpl["columns"] = new_columns - - -def _render_visual_picker(tpl: dict) -> None: - """Drawable-canvas overlay on a rendered sample page. - - The user draws (mostly) vertical lines where columns should - split. We harvest each line's x-midpoint and write that into - ``tpl["table"]["column_boundaries"]`` (in PDF point space). An - optional rectangle becomes ``tpl["visual"]["table_bbox"]`` (in - PDF points), preserved for round-trip but not yet used by - extraction — the header/end-marker pair is enough to slice - the row band in practice. - """ - from streamlit_drawable_canvas import st_canvas - - pdf_bytes = st.session_state.get(K_SAMPLE_BYTES) - pages = st.session_state.get(K_SAMPLE_PAGES) or [] - if not pdf_bytes or not pages: - st.info("Upload a sample PDF above to use the visual picker.") - return - - max_page = len(pages) - sample_page = int(tpl.get("visual", {}).get("sample_page", 1)) - sample_page = st.number_input( - "Sample page", - min_value=1, - max_value=max_page, - value=min(sample_page, max_page), - step=1, - help="Pick a page that contains the transactions table.", - ) - tpl.setdefault("visual", {})["sample_page"] = int(sample_page) - - try: - pil_image, scale = render_page_image(pdf_bytes, int(sample_page)) - except Exception as e: - st.error(f"Couldn't render page {sample_page}: {type(e).__name__}: {e}") - return - - tpl["visual"]["page_width"] = pil_image.width / scale - tpl["visual"]["page_height"] = pil_image.height / scale - - c_left, c_right = st.columns([2, 1]) - with c_right: - st.markdown("**How to use**") - st.caption( - "• **Lines** mode: drag short vertical strokes where you " - "want columns to split. Each stroke contributes one " - "x-boundary.\n" - "• **Rect** mode: drag a box around the transactions " - "table to crop the working region.\n" - "• Use the trash icon (top-right of the canvas) to " - "remove the last shape, or the X to clear all." - ) - drawing_mode = st.radio( - "Draw", - ["line", "rect", "transform"], - horizontal=True, - help=( - "transform lets you move/resize already-drawn shapes." + for r in rows: + r["source_file"] = up.name + all_rows.extend(rows) + all_warnings.extend(f"[{up.name}] {w}" for w in warns) + except PdfDependencyMissing as e: + all_warnings.append(f"[{up.name}] {e}") + except Exception as e: + all_warnings.append( + f"[{up.name}] scan failed: {type(e).__name__}: {e}" + ) + status.update( + label=( + f"Found {len(all_rows):,} candidate transactions " + f"across {len(uploads)} file(s)" ), + state="complete", + expanded=False, ) - initial_objects = _boundaries_to_canvas_lines( - tpl["table"].get("column_boundaries", []), - scale=scale, - image_height=pil_image.height, + st.session_state[K_ROWS] = all_rows + st.session_state[K_WARNINGS] = all_warnings + st.session_state[K_SOURCE_COUNT] = len(uploads) + + log_event( + "tool_run", + "PDF scan", + page="10_PDF_Extractor", + files=len(uploads), + rows=len(all_rows), + warnings=len(all_warnings), ) - bbox = tpl["visual"].get("table_bbox") - if bbox: - initial_objects.append(_bbox_to_canvas_rect(bbox, scale)) - - with c_left: - canvas_state = st_canvas( - fill_color="rgba(255, 165, 0, 0.15)", - stroke_width=2, - stroke_color="#d62728", - background_image=pil_image, - update_streamlit=True, - height=pil_image.height, - width=pil_image.width, - drawing_mode=drawing_mode, - initial_drawing={"version": "4.4.0", "objects": initial_objects}, - key=f"pdf_canvas_p{sample_page}", - ) - - new_bounds, new_bbox = _harvest_canvas(canvas_state, scale) - if new_bounds is not None: - tpl["table"]["column_boundaries"] = new_bounds - if new_bbox is not None: - tpl["visual"]["table_bbox"] = new_bbox - - if tpl["table"].get("column_boundaries"): - st.caption( - "Boundaries (PDF pts): " - + ", ".join( - f"{b:.0f}" for b in tpl["table"]["column_boundaries"] - ) - ) -def _boundaries_to_canvas_lines( - boundaries: list[float], - *, - scale: float, - image_height: int, -) -> list[dict]: - """Seed the canvas with full-height vertical lines for any - boundaries already on the template, so the user sees their - saved state when re-entering build mode.""" - out: list[dict] = [] - for b in boundaries: - x_px = float(b) * scale - out.append({ - "type": "line", - "left": x_px, - "top": 0, - "width": 0, - "height": image_height, - "x1": 0, "y1": 0, - "x2": 0, "y2": image_height, - "stroke": "#1f77b4", - "strokeWidth": 2, - "fill": "#1f77b4", - "selectable": True, - }) - return out +# --------------------------------------------------------------------------- +# Results — editable table + download +# --------------------------------------------------------------------------- +rows = st.session_state.get(K_ROWS) +warnings = st.session_state.get(K_WARNINGS) or [] +source_count = st.session_state.get(K_SOURCE_COUNT, 0) -def _bbox_to_canvas_rect(bbox: list[float], scale: float) -> dict: - x0, top, x1, bottom = bbox - return { - "type": "rect", - "left": x0 * scale, - "top": top * scale, - "width": (x1 - x0) * scale, - "height": (bottom - top) * scale, - "stroke": "#d62728", - "strokeWidth": 1, - "fill": "rgba(255, 165, 0, 0.10)", +if warnings: + with st.expander(f"Warnings ({len(warnings)})", expanded=False): + for w in warnings: + st.warning(w) + +if rows is None: + if uploads: + st.info("Click **Scan** to detect transactions.") + else: + st.info("Upload one or more PDF files to begin.") + +elif not rows: + st.info( + "No transaction rows detected. The scanner looks for lines " + "containing a date and at least one amount. Check the " + "warnings expander above for clues — most often the PDF is " + "scanned (image-only) and OCR isn't available." + ) + +else: + df = pd.DataFrame(rows) + + # Order columns so the user-facing fields are leftmost; raw + + # internals are last and easy to scroll past or unselect at + # download time. + front = ["date", "description"] + amount_cols = sorted(c for c in df.columns if c.startswith("amount_")) + tail = ["source_file", "page", "raw"] + ordered = [c for c in front + amount_cols + tail if c in df.columns] + extras = [c for c in df.columns if c not in ordered] + df = df[ordered + extras] + + # Prepend the include checkbox. + df.insert(0, "Include", True) + + st.markdown( + f"#### {len(df):,} candidate transaction(s) " + f"from {source_count} file(s)" + ) + st.caption( + "Uncheck rows to exclude. Edit any cell to fix a value the " + "scanner got wrong. The ``raw`` column shows the original " + "PDF text for that row." + ) + + column_config = { + "Include": st.column_config.CheckboxColumn( + "Include", + default=True, + help="Uncheck to drop this row from the CSV.", + ), + "raw": st.column_config.TextColumn( + "raw", + help="Original text line from the PDF (read-only reference).", + disabled=True, + width="large", + ), + "page": st.column_config.NumberColumn( + "page", disabled=True, width="small", + ), } - - -def _harvest_canvas(canvas_state, scale: float): - """Pull boundaries + bbox out of a ``st_canvas`` return value. - - Returns ``(boundaries_or_None, bbox_or_None)`` where ``None`` - means "no change" (so the existing template values stay put).""" - if canvas_state is None or canvas_state.json_data is None: - return None, None - objects = canvas_state.json_data.get("objects") or [] - - bounds: list[float] = [] - bbox: list[float] | None = None - for obj in objects: - kind = obj.get("type") - left = float(obj.get("left", 0)) - width = float(obj.get("width", 0)) - if kind == "line": - # Take the line's x-midpoint as the boundary x-position. - bounds.append((left + width / 2) / scale) - elif kind == "rect": - top = float(obj.get("top", 0)) - height = float(obj.get("height", 0)) - bbox = [ - left / scale, - top / scale, - (left + width) / scale, - (top + height) / scale, - ] - return sorted(bounds), bbox - - -def _render_build_form(tpl: dict) -> None: - """Mode-aware editor. - - Default mode (``row_heuristic``) presents simple form fields - for the parsing rules and a live preview of detected - transactions. The visual picker only shows when the user - explicitly switches to ``column_visual`` in the Advanced tab — - most users never need to go there. - """ - # Header: name + mode switcher - c_name, c_mode = st.columns([3, 2]) - with c_name: - tpl["name"] = st.text_input( - "Template name", - value=tpl.get("name", ""), - help="What this source is called, e.g. 'Chase Personal Checking'.", + if "source_file" in df.columns: + column_config["source_file"] = st.column_config.TextColumn( + "source_file", disabled=True, ) - tpl["slug"] = slugify(tpl["name"]) - with c_mode: - current_mode = tpl.get("mode", "row_heuristic") - new_mode = st.radio( - "Detection mode", - ["row_heuristic", "column_visual"], - index=["row_heuristic", "column_visual"].index(current_mode), - format_func=lambda m: { - "row_heuristic": "Auto-detect (recommended)", - "column_visual": "Visual columns (advanced)", - }.get(m, m), - help=( - "Auto-detect finds rows by date+amount patterns — no " - "coordinates needed; survives layout changes between " - "statements. Visual columns uses x-position boundaries " - "you draw — useful only when auto-detect fails to find " - "the table." - ), - horizontal=False, - ) - if new_mode != current_mode: - tpl["mode"] = new_mode - st.rerun() - if tpl.get("mode", "row_heuristic") == "row_heuristic": - _render_build_form_row_heuristic(tpl) - else: - _render_build_form_column_visual(tpl) - - -def _render_build_form_row_heuristic(tpl: dict) -> None: - """Simple form for the row-heuristic mode.""" - tab_amount, tab_filters, tab_save = st.tabs( - ["Amount layout", "Filters & date", "Save"] + edited = st.data_editor( + df, + hide_index=True, + use_container_width=True, + column_config=column_config, + num_rows="fixed", + key="pdf_results_editor", ) - tpl.setdefault("row_detection", {}) - tpl.setdefault("amounts", {}) - tpl.setdefault("date", {}) - tpl.setdefault("pages", {}) + selected = edited[edited["Include"]].drop(columns=["Include"]) - with tab_amount: - st.caption( - "Tell us how many amount columns each transaction row has, " - "and how negatives are written. The detector handles the " - "rest — no x-positions needed." - ) - shape_labels = { - "single": "One amount per row (sign in the number)", - "txn_balance": "Two amounts: transaction + running balance", - "debit_credit": "Two columns: separate debit and credit", - "debit_credit_balance": "Three: debit, credit, balance", - } - current_shape = tpl["amounts"].get("shape", "single") - shape = st.selectbox( - "Amount layout", - list(shape_labels.keys()), - index=list(shape_labels.keys()).index( - current_shape if current_shape in shape_labels else "single" - ), - format_func=lambda s: shape_labels[s], - ) - tpl["amounts"]["shape"] = shape - - c1, c2 = st.columns(2) - with c1: - tpl["amounts"]["negative_in_parens"] = st.checkbox( - "Parens (4.50) = negative", - value=bool(tpl["amounts"].get("negative_in_parens", True)), - ) - tpl["amounts"]["currency_strip"] = st.text_input( - "Currency symbols to strip", - value=tpl["amounts"].get("currency_strip", "$"), - max_chars=4, - ) - with c2: - tpl["amounts"]["decimal_separator"] = st.text_input( - "Decimal separator", - value=tpl["amounts"].get("decimal_separator", "."), - max_chars=1, - ) - tpl["amounts"]["thousands_separator"] = st.text_input( - "Thousands separator", - value=tpl["amounts"].get("thousands_separator", ","), - max_chars=1, - ) - - # Auto-derive min/max amounts from the chosen shape unless - # user has set non-default values explicitly. - shape_to_min_max = { - "single": (1, 1), - "txn_balance": (2, 2), - "debit_credit": (1, 2), - "debit_credit_balance": (2, 3), - } - cur_min = tpl["row_detection"].get("min_amounts_per_row") - cur_max = tpl["row_detection"].get("max_amounts_per_row") - derived_min, derived_max = shape_to_min_max.get(shape, (1, 3)) - if cur_min is None or cur_max is None: - tpl["row_detection"]["min_amounts_per_row"] = derived_min - tpl["row_detection"]["max_amounts_per_row"] = derived_max - - with st.expander("Advanced: tune amount-count range", expanded=False): - tpl["row_detection"]["min_amounts_per_row"] = st.number_input( - "Minimum amounts per transaction row", - min_value=1, max_value=10, - value=int(tpl["row_detection"].get("min_amounts_per_row", derived_min)), - step=1, - ) - tpl["row_detection"]["max_amounts_per_row"] = st.number_input( - "Maximum amounts per transaction row", - min_value=1, max_value=10, - value=int(tpl["row_detection"].get("max_amounts_per_row", derived_max)), - step=1, - ) - - with tab_filters: - c1, c2 = st.columns(2) - with c1: - tpl["date"]["format"] = st.text_input( - "Date format", - value=tpl["date"].get("format", "%m/%d/%Y"), - help=( - "Python strftime format. Common: %m/%d/%Y (US), " - "%d/%m/%Y (EU), %Y-%m-%d (ISO). Leave default to " - "try common formats automatically." - ), - ) - tpl["pages"]["range"] = st.text_input( - "Pages", - value=tpl["pages"].get("range", "all"), - help='"all", "1-3", "2,4", "3-" all work.', - ) - with c2: - tpl["row_detection"]["merge_multiline_description"] = st.checkbox( - "Merge multi-line descriptions", - value=bool( - tpl["row_detection"].get("merge_multiline_description", True) - ), - help=( - "Lines without a date attach to the previous " - "row's description — handles wrapped vendor names." - ), - ) - tpl["row_detection"]["y_tolerance"] = st.number_input( - "Row y-tolerance (pts)", - min_value=0.5, - max_value=20.0, - value=float(tpl["row_detection"].get("y_tolerance", 3.0)), - step=0.5, - help=( - "How close two words' y-positions must be to be on " - "the same row. Adjust if rows are splitting or merging." - ), - ) - - skips = "\n".join(tpl["row_detection"].get("skip_rows_matching") or []) - new_skips = st.text_area( - "Skip rows matching (one regex per line, optional)", - value=skips, - help=( - "Lines whose text matches any of these regexes are " - 'excluded. Common: "Total", "Subtotal", "^Page ".' - ), - height=80, - ) - tpl["row_detection"]["skip_rows_matching"] = [ - line.strip() for line in new_skips.splitlines() if line.strip() - ] - - with tab_save: - tpl["notes"] = st.text_area( - "Notes (optional)", value=tpl.get("notes", ""), height=70, - ) - ok, errors = validate_template(tpl) - if errors: - for err in errors: - st.error(err) - c1, c2 = st.columns([1, 3]) - with c1: - save_btn = st.button( - "Save template", type="primary", disabled=not ok, - ) - with c2: - st.caption( - f"Will save as: ``{tpl.get('slug') or '—'}`` " - f"(folder: ``~/.datatools/pdf_templates/``)" - ) - if save_btn: - try: - slug = save_template(tpl) - st.success( - f"Saved as **{slug}**. Switch to Extract mode to use it." - ) - log_event( - "tool_run", - "PDF Extractor template saved", - page="10_PDF_Extractor", - template=slug, - mode=tpl.get("mode"), - ) - except Exception as e: - st.error(f"Save failed: {e}") - - -def _render_build_form_column_visual(tpl: dict) -> None: - """Legacy column-visual editor. Reached via the Detection mode - radio when the user opts into the advanced flow.""" - st.warning( - "**Advanced mode.** Column-x-position templates depend on " - "every statement from this source having identical layout. " - "If your statements drift between months, switch back to " - "Auto-detect." - ) - - t0, t1, t2, t3, t4 = st.tabs( - ["Visual picker", "Pages & table", "Columns", "Parsing", "Save"] - ) - - tpl.setdefault("table", {}) - tpl.setdefault("parse", {}) - tpl.setdefault("pages", {}) - tpl.setdefault("columns", []) - - with t0: - _render_visual_picker(tpl) - - with t1: - c1, c2 = st.columns(2) - with c1: - tpl["notes"] = st.text_area( - "Notes", value=tpl.get("notes", ""), height=70, - ) - tpl["pages"]["range"] = st.text_input( - "Pages", - value=tpl["pages"].get("range", "all"), - help='"all", "1-3", "2,4", "3-" all work.', - ) - tpl["pages"]["skip_matching"] = st.text_input( - "Skip pages matching (regex, optional)", - value=tpl["pages"].get("skip_matching", ""), - ) - with c2: - tpl["table"]["header_text"] = st.text_input( - "Header text", - value=tpl["table"].get("header_text", ""), - ) - ends = "\n".join(tpl["table"].get("end_markers") or []) - new_ends = st.text_area( - "End markers (one regex per line)", - value=ends, - height=80, - ) - tpl["table"]["end_markers"] = [ - line.strip() for line in new_ends.splitlines() if line.strip() - ] - skips = "\n".join(tpl["table"].get("skip_rows_matching") or []) - new_skips = st.text_area( - "Skip rows matching (one regex per line)", - value=skips, - height=80, - ) - tpl["table"]["skip_rows_matching"] = [ - line.strip() for line in new_skips.splitlines() if line.strip() - ] - - with t2: - _render_columns_editor(tpl) - - with t3: - tpl["parse"]["date_format"] = st.text_input( - "Date format", - value=tpl["parse"].get("date_format", "%m/%d/%Y"), - ) - tpl["parse"]["currency_strip"] = st.text_input( - "Currency symbols", value=tpl["parse"].get("currency_strip", "$"), - ) - tpl["parse"]["amount_negative_in_parens"] = st.checkbox( - "Parens = negative", - value=bool(tpl["parse"].get("amount_negative_in_parens", True)), - ) - tpl["parse"]["merge_multiline_description"] = st.checkbox( - "Merge multi-line descriptions", - value=bool(tpl["parse"].get("merge_multiline_description", True)), - ) - - with t4: - ok, errors = validate_template(tpl) - if errors: - for err in errors: - st.error(err) - save_btn = st.button( - "Save template", type="primary", disabled=not ok, key="cv_save", - ) - if save_btn: - try: - slug = save_template(tpl) - st.success(f"Saved as **{slug}**.") - log_event( - "tool_run", - "PDF Extractor template saved", - page="10_PDF_Extractor", - template=slug, - mode=tpl.get("mode"), - ) - except Exception as e: - st.error(f"Save failed: {e}") - - -def _render_preview(tpl: dict) -> None: - """Below-the-fold live preview against the cached sample pages.""" - pages = st.session_state.get(K_SAMPLE_PAGES) - if not pages: - return - st.divider() - st.markdown("##### Live preview") - try: - df = apply_template(pages, tpl) - except Exception as e: - st.error(f"Preview failed: {type(e).__name__}: {e}") - return - mode = tpl.get("mode", "row_heuristic") - if df.empty: - if mode == "row_heuristic": - st.info( - "No transaction rows detected yet. Check that the date " - "format matches your statements, and try widening the " - "amount-count range under \"Advanced\" if your rows have " - "balance or extra columns." - ) + c_dl, c_meta = st.columns([2, 3]) + with c_dl: + if selected.empty: + st.button("Download CSV", disabled=True) else: - st.info( - "Template doesn't match any rows yet. Tighten the header " - "text, add an end marker, or adjust column boundaries." + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + # Default: drop the internal columns from the download. + keep_default = [ + c for c in selected.columns + if c not in ("page", "raw") + ] + with c_meta: + keep = st.multiselect( + "Columns to include in CSV", + options=list(selected.columns), + default=keep_default, + help="``page`` and ``raw`` are kept off by default; " + "tick them if you want them in the file.", + ) + export = selected[keep] if keep else selected + csv_bytes = export.to_csv(index=False).encode("utf-8") + st.download_button( + f"Download {len(export):,} rows as CSV", + data=csv_bytes, + file_name=f"transactions-{ts}.csv", + mime="text/csv", + type="primary", ) - else: + + if not selected.empty: st.caption( - f"{len(df)} row(s) from {len(pages)} page(s) " - f"· mode: {mode}" + f"{len(selected):,} of {len(df):,} rows selected." ) - st.dataframe(df.head(100), hide_index=True, use_container_width=True) - - -def _render_build_mode() -> None: - # Optionally load an existing template into the form - templates = list_templates() - c1, c2, c3 = st.columns([2, 2, 1]) - with c1: - existing_label = "— start from scratch —" - choices = [existing_label] + [ - f"{t['name']} · {t['slug']}" for t in templates - ] - picked = st.selectbox("Load existing", choices, key="build_load_pick") - with c2: - if st.button("Load", disabled=picked == existing_label, key="build_load_btn"): - slug = picked.split(" · ")[-1] - try: - st.session_state[K_CURRENT_TEMPLATE] = load_template(slug) - st.rerun() - except Exception as e: - st.error(f"Load failed: {e}") - with c3: - if st.button("New", key="build_new_btn"): - st.session_state[K_CURRENT_TEMPLATE] = new_template("New template") - st.rerun() - - tpl = _get_or_init(K_CURRENT_TEMPLATE, new_template("New template")) - - if not _ensure_sample_loaded(): - st.info( - "Upload a sample statement from this source to drive the live " - "preview. Your template is built against the sample's layout." - ) - return - - _render_build_form(tpl) - _render_preview(tpl) - - -# =========================================================================== -# Manage-templates mode -# =========================================================================== - - -def _render_manage_mode() -> None: - templates = list_templates() - - st.markdown("##### Import a template") - up = st.file_uploader( - "Template JSON", - type=["json"], - key="manage_import_uploader", - help="Paste a colleague's exported JSON file here to add it to your library.", - ) - if up is not None: - try: - imported = template_from_json(up.read().decode("utf-8")) - save_template(imported) - st.success(f"Imported **{imported['name']}** (slug `{imported['slug']}`).") - st.rerun() - except Exception as e: - st.error(f"Import failed: {e}") - - st.divider() - st.markdown("##### Existing templates") - if not templates: - st.caption("No templates yet — build one in **Build template** mode.") - return - - for t in templates: - slug = t["slug"] - with st.container(border=True): - c1, c2, c3, c4 = st.columns([3, 3, 2, 2]) - with c1: - st.markdown(f"**{t['name']}**") - st.caption(f"`{slug}`") - with c2: - st.caption(f"Updated: {t.get('updated_at', '—')}") - if t.get("notes"): - st.caption(t["notes"]) - with c3: - try: - full = load_template(slug) - payload = template_to_json(full) - st.download_button( - "Export", - data=payload.encode("utf-8"), - file_name=f"{slug}.json", - mime="application/json", - key=f"export_{slug}", - ) - except Exception as e: - st.error(f"Read failed: {e}") - with c4: - if st.button("Delete", key=f"del_{slug}"): - delete_template(slug) - st.success(f"Deleted `{slug}`.") - st.rerun() - - -# =========================================================================== -# Dispatch -# =========================================================================== - - -if mode == "Extract": - _render_extract_mode() -elif mode == "Build template": - _render_build_mode() -elif mode == "Manage templates": - _render_manage_mode() diff --git a/src/pdf_extract.py b/src/pdf_extract.py index cd35143..4c375e4 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -1,64 +1,51 @@ -"""PDF transaction extraction. +"""Heuristic PDF transaction scanner. -Pure module — no Streamlit, no user-config I/O. Reads PDF bytes, -produces a ``pandas.DataFrame`` of rows according to a template -dict. The accountant-facing use case is extracting transaction -tables from bank statements (different banks = different -templates, reused across statements that share a format). +Single public entry point: ``scan_pdf_for_transactions(pdf_bytes)`` +returns a list of dicts shaped like ``[date] [description] [amount]``, +plus a list of warning strings. The GUI renders those rows in an +editable table and lets the user pick which to keep before +exporting to CSV. -Strategy: +There are no templates, no per-bank configuration files, and no +coordinate dependencies. A transaction row is "any extracted text +line containing a date pattern AND at least one amount pattern." +Multi-amount rows surface every detected amount as ``amount_1``, +``amount_2``, ... — the user labels and reshapes in their CSV +editor of choice. -- ``pdfplumber`` for text + word positions. Bank-statement tables - rarely have visible cell borders, so we don't rely on table-line - detection — instead the template carries explicit column - x-position boundaries (set by the visual picker UI). -- Rows are detected by clustering word ``top`` (y-position) values - within a small tolerance — words on the same baseline. -- Multi-line descriptions: rows whose first column (date) is empty - are merged into the previous row's description column. -- Signed amounts: parenthesized values (``(123.45)``) parse negative. - Single signed amount column passes through. Separate debit/credit - columns are combined into one signed amount column with credits - positive and debits negative (accounting register convention — - matches QuickBooks/Xero import expectations). -- Optional OCR: pages with no extractable text fall through to - ``pytesseract`` IF the binding + Tesseract binary are both - available. Otherwise the page is skipped with a warning row. - -The template is a plain dict matching the schema documented in -``src/pdf_templates.py``. This module reads it; ``pdf_templates`` -manages its persistence and validation. +Optional OCR fallback for scanned PDFs via ``pytesseract`` + +``pypdfium2``. Robust to missing system Tesseract — returns a +clear reason string instead of raising. """ from __future__ import annotations import io +import os +import platform import re from dataclasses import dataclass, field from datetime import datetime +from pathlib import Path from typing import Any -import pandas as pd - -# Lazy imports for the heavy PDF deps so a fresh ``pip`` that hasn't -# picked up the new ``requirements.txt`` lines yet doesn't crash the -# module-import path. The GUI page surfaces a friendly install message -# when these come back missing instead of throwing an ImportError -# traceback over the whole tool. Pure helpers (parse_amount, parse_date, -# cluster_rows, …) keep working with no PDF dep installed. +# --------------------------------------------------------------------------- +# Dependency guards +# --------------------------------------------------------------------------- class PdfDependencyMissing(ImportError): - """Raised when a runtime PDF dependency is missing. + """A runtime PDF dependency is missing. - Carries an actionable ``hint`` for the GUI to show to the user.""" + Carries an actionable install hint that the GUI surfaces. + """ def __init__(self, missing: str, hint: str = ""): self.missing = missing self.hint = hint or ( - f"Install the PDF dependencies: ``pip install " - f"pdfplumber pypdfium2 streamlit-drawable-canvas pytesseract``" + "Install the PDF dependencies: ``pip install " + "pdfplumber pypdfium2 pytesseract``" ) super().__init__(f"{missing} is not installed. {self.hint}") @@ -99,24 +86,123 @@ class WordBox: @dataclass class Page: """One PDF page's text + word positions.""" - page_no: int # 1-indexed + page_no: int width: float height: float text: str words: list[WordBox] = field(default_factory=list) +# --------------------------------------------------------------------------- +# Value parsing +# --------------------------------------------------------------------------- + + +_DATE_RES = [ + re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"), + re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"), + re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"), + re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2},?\s+\d{2,4})\b"), + re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"), +] + +_DATE_FORMATS_FALLBACK = [ + "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d/%m/%Y", "%d/%m/%y", + "%b %d %Y", "%b %d, %Y", "%d %b %Y", "%d-%b-%Y", + "%m-%d-%Y", "%m-%d-%y", +] + +# Amount tokens: optional $/€/£, optional leading -, optional parens, +# 1-3 digits before grouping with comma-thousand groups, optional +# decimal portion. Trailing minus also captured. +_AMOUNT_RE = re.compile( + r"(? float | None: + """Parse a money string to a signed float, or ``None`` if it + doesn't parse. + + Handles: currency prefixes (configurable), thousands separators, + parenthesized negatives, trailing minus signs ("123.45-"), + leading minus, and bare blanks. + """ + if text is None: + return None + s = str(text).strip() + if not s: + return None + + negative = False + if negative_in_parens and s.startswith("(") and s.endswith(")"): + negative = True + s = s[1:-1].strip() + if s.endswith("-"): + negative = True + s = s[:-1].strip() + if s.startswith("-"): + negative = True + s = s[1:].strip() + for ch in currency_strip: + s = s.replace(ch, "") + s = s.replace(" ", "") + if thousands: + s = s.replace(thousands, "") + if decimal != ".": + s = s.replace(decimal, ".") + + if not s or not re.match(r"^\d+(\.\d+)?$", s): + return None + val = float(s) + return -val if negative else val + + +def parse_date( + text: str, + formats: list[str] | None = None, +) -> str | None: + """Parse a date string and return ISO ``YYYY-MM-DD``. + + Tries *formats* first, then a list of common formats. Returns + ``None`` if no format matches. Caller is responsible for + preserving the raw text alongside the parsed value so the user + can correct mis-detections in the editor. + """ + if text is None: + return None + s = str(text).strip() + if not s: + return None + tries = list(formats or []) + _DATE_FORMATS_FALLBACK + for fmt in tries: + try: + return datetime.strptime(s, fmt).strftime("%Y-%m-%d") + except ValueError: + continue + return None + + # --------------------------------------------------------------------------- # PDF reading # --------------------------------------------------------------------------- def extract_pages(pdf_bytes: bytes) -> list[Page]: - """Parse a PDF blob into our internal ``Page`` representation. + """Parse a PDF blob into ``Page`` records with word positions. - Each page carries every word's bounding box; downstream code - groups them into rows by ``top`` clustering and into columns - by template-defined x-boundaries. + Word positions are kept so the row clusterer can group by + y-coordinate, but no x-position information is used downstream + — the detector only looks at text content. """ pdfplumber = _require_pdfplumber() out: list[Page] = [] @@ -149,102 +235,15 @@ def extract_pages(pdf_bytes: bytes) -> list[Page]: return out -# --------------------------------------------------------------------------- -# Value parsing -# --------------------------------------------------------------------------- - - -_AMOUNT_DEFAULTS = { - "decimal_separator": ".", - "thousands_separator": ",", - "currency_strip": "$", - "negative_in_parens": True, -} - -_DATE_FORMATS_FALLBACK = [ - "%m/%d/%Y", "%m/%d/%y", "%Y-%m-%d", "%d/%m/%Y", "%d/%m/%y", - "%b %d %Y", "%d %b %Y", "%d-%b-%Y", "%m-%d-%Y", "%m-%d-%y", -] - - -def parse_amount(text: str, opts: dict[str, Any] | None = None) -> float | None: - """Parse a money string to a signed float, or ``None`` if it doesn't parse. - - Handles: currency prefixes, thousands separators, parenthesized - negatives, trailing minus signs ("123.45-"), and bare blanks. - """ - if text is None: - return None - s = text.strip() - if not s: - return None - o = {**_AMOUNT_DEFAULTS, **(opts or {})} - - negative = False - if o["negative_in_parens"] and s.startswith("(") and s.endswith(")"): - negative = True - s = s[1:-1].strip() - if s.endswith("-"): - negative = True - s = s[:-1].strip() - if s.startswith("-"): - negative = True - s = s[1:].strip() - currency = o.get("currency_strip") or "" - if currency: - for ch in currency: - s = s.replace(ch, "") - s = s.replace(" ", "") - if o["thousands_separator"]: - s = s.replace(o["thousands_separator"], "") - if o["decimal_separator"] != ".": - s = s.replace(o["decimal_separator"], ".") - - if not s or not re.match(r"^\d+(\.\d+)?$", s): - return None - val = float(s) - return -val if negative else val - - -def parse_date( - text: str, - formats: list[str] | None = None, -) -> str | None: - """Parse a date string against the provided formats and return ISO ``YYYY-MM-DD``. - - Falls back to a list of common formats if *formats* is empty. - Returns ``None`` if no format matches. - """ - if text is None: - return None - s = text.strip() - if not s: - return None - tries = list(formats or []) + _DATE_FORMATS_FALLBACK - for fmt in tries: - try: - dt = datetime.strptime(s, fmt) - return dt.strftime("%Y-%m-%d") - except ValueError: - continue - return None - - -# --------------------------------------------------------------------------- -# Row + column structure -# --------------------------------------------------------------------------- - - def cluster_rows( words: list[WordBox], y_tolerance: float = 3.0, ) -> list[list[WordBox]]: - """Group word boxes into rows by ``top`` coordinate. + """Group word boxes into visual rows by ``top`` coordinate. - Words whose ``top`` is within *y_tolerance* of an existing row's - median are added to that row. Otherwise a new row is started. - Output rows are sorted top-to-bottom; within a row, words are - sorted left-to-right. + Words whose ``top`` is within *y_tolerance* of the current + cluster's first word join that cluster. Output rows are sorted + top-to-bottom and words within a row are sorted left-to-right. """ if not words: return [] @@ -263,679 +262,6 @@ def cluster_rows( return rows -def assign_columns( - row_words: list[WordBox], - boundaries: list[float], -) -> list[str]: - """Bucket the words of a single row into columns. - - ``boundaries`` are the *interior* x-positions between adjacent - columns. N boundaries → N+1 columns. A word's column is decided - by its horizontal midpoint; words within a column are joined - with single spaces in left-to-right order. - """ - n_cols = len(boundaries) + 1 - buckets: list[list[WordBox]] = [[] for _ in range(n_cols)] - sorted_bounds = sorted(boundaries) - for w in row_words: - mid = (w.x0 + w.x1) / 2 - col = 0 - for i, b in enumerate(sorted_bounds): - if mid >= b: - col = i + 1 - buckets[col].append(w) - return [ - " ".join(w.text for w in sorted(bucket, key=lambda w: w.x0)) - for bucket in buckets - ] - - -# --------------------------------------------------------------------------- -# Template application -# --------------------------------------------------------------------------- - - -def _pages_in_range(pages: list[Page], range_spec: str) -> list[Page]: - """Filter *pages* by a range spec like ``"all"``, ``"2-"``, ``"1,3-5"``. - - Empty / ``"all"`` returns all pages. Bad specs return all pages - (template author can fix at preview time).""" - s = (range_spec or "").strip().lower() - if not s or s == "all": - return pages - keep: set[int] = set() - for chunk in s.split(","): - chunk = chunk.strip() - if not chunk: - continue - if "-" in chunk: - a, b = chunk.split("-", 1) - a_i = int(a) if a.strip().isdigit() else 1 - b_i = int(b) if b.strip().isdigit() else len(pages) - for i in range(a_i, b_i + 1): - keep.add(i) - elif chunk.isdigit(): - keep.add(int(chunk)) - return [p for p in pages if p.page_no in keep] if keep else pages - - -def _within_table_window( - rows: list[list[WordBox]], - header_text: str, - end_markers: list[str], -) -> list[list[WordBox]]: - """Slice *rows* to the band between the header line and the end marker. - - Header match: the first row whose joined text contains every word - of ``header_text`` (case-insensitive). The header row itself is - excluded. End match: any row whose joined text matches one of the - ``end_markers`` regex patterns; that row and below are excluded. - - Empty ``header_text`` keeps from the first row; empty - ``end_markers`` keeps through the last row. - """ - if not rows: - return [] - needle_words = [w.lower() for w in (header_text or "").split() if w] - end_res = [re.compile(p, re.IGNORECASE) for p in end_markers if p] - - start = 0 - if needle_words: - start = -1 - for i, row in enumerate(rows): - joined = " ".join(w.text for w in row).lower() - if all(nw in joined for nw in needle_words): - start = i + 1 - break - if start == -1: - return [] - - end = len(rows) - for i in range(start, len(rows)): - joined = " ".join(w.text for w in rows[i]) - if any(rx.search(joined) for rx in end_res): - end = i - break - return rows[start:end] - - -def _row_is_continuation(cells: list[str]) -> bool: - """A row whose first column is empty is treated as a continuation - of the previous row's description (multi-line wrap).""" - return bool(cells) and not cells[0].strip() - - -def _coerce_amount_columns( - record: dict[str, str], - column_map: list[dict[str, Any]], - parse_opts: dict[str, Any], -) -> dict[str, Any]: - """Convert source-column text into typed output fields. - - Supports three amount shapes: - - 1. A single column mapped to ``amount`` — passes through with sign. - 2. Two columns mapped to ``amount_debit`` + ``amount_credit`` — - combined into a signed ``amount`` (credit positive, debit - negative — accounting register convention). - 3. A column mapped to ``balance`` — parsed as signed number. - - The ``date`` target is parsed against the template's date format. - Other targets pass through as text. - """ - out: dict[str, Any] = {} - debit_val: float | None = None - credit_val: float | None = None - - for col in column_map: - target = col.get("target", "") - source_text = record.get(target, "") if target else "" - if target == "date": - iso = parse_date(source_text, parse_opts.get("date_formats") or []) - out["date"] = iso or source_text - elif target == "description": - out["description"] = source_text - elif target == "amount": - out["amount"] = parse_amount(source_text, parse_opts) - elif target == "amount_debit": - debit_val = parse_amount(source_text, parse_opts) - elif target == "amount_credit": - credit_val = parse_amount(source_text, parse_opts) - elif target == "balance": - out["balance"] = parse_amount(source_text, parse_opts) - elif target: - out[target] = source_text - - if "amount" not in out and (debit_val is not None or credit_val is not None): - amt = 0.0 - if credit_val: - amt += credit_val - if debit_val: - amt -= debit_val - out["amount"] = amt - out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "") - return out - - -# --------------------------------------------------------------------------- -# Row-heuristic extraction (mode = "row_heuristic", default for new templates) -# --------------------------------------------------------------------------- - - -_DATE_RES = [ - re.compile(r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b"), - re.compile(r"\b(\d{1,2}-\d{1,2}-\d{2,4})\b"), - re.compile(r"\b(\d{4}-\d{2}-\d{2})\b"), - re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2},?\s+\d{2,4})\b"), - re.compile(r"\b(\d{1,2}\s+[A-Z][a-z]{2}\s+\d{2,4})\b"), - # Short month-day (e.g. "Jan 15") — sometimes used when year is - # implied by the statement period. Lower-priority match. - re.compile(r"\b([A-Z][a-z]{2}\s+\d{1,2})\b"), -] - -# Amount tokens: optional $/€/£, optional leading -, optional parens, -# 1-3 digits before grouping with comma-thousand groups, optional -# decimal portion. Trailing minus also captured. -_AMOUNT_RE = re.compile( - r"(? bool: - """Reject tokens that match the amount regex but are obviously - not money — e.g. a bare year or a page number. Real amounts - have at least one of: currency symbol, decimal point, parens, - minus sign, or a thousand separator.""" - if not token: - return False - return bool(re.search(r"[\$€£.,()\-]", token)) - - -def _find_dates_in_words( - row_words: list[WordBox], -) -> list[tuple[int, str]]: - """Find the FIRST date-like substring on this row. - - Returns ``[(word_index, date_text)]`` or empty list. Searches - word-by-word so we can identify which word(s) constitute the - date and exclude them from the description.""" - for i, w in enumerate(row_words): - # Stitch the next few words together — some date formats - # like "Jan 15, 2026" span 3 word tokens. - for window in (3, 2, 1): - chunk = " ".join(x.text for x in row_words[i : i + window]) - for rx in _DATE_RES: - m = rx.search(chunk) - if m: - return [(i, m.group(1))] - return [] - - -def _find_amount_tokens( - row_words: list[WordBox], -) -> list[tuple[int, WordBox, str]]: - """Find amount-shaped tokens on this row, keeping their position. - - Returns ``[(word_index, wordbox, normalized_text)]``. The - word_index lets the caller exclude these from description text; - the wordbox preserves the x-position so we can cluster amount - columns later without templated coordinates.""" - out: list[tuple[int, WordBox, str]] = [] - for i, w in enumerate(row_words): - # Each word might contain multiple amount tokens if the PDF - # extractor merged things, but in practice one match per word. - m = _AMOUNT_RE.search(w.text) - if m and _looks_like_amount(m.group(1)): - out.append((i, w, m.group(1))) - return out - - -def _row_is_transaction( - row_words: list[WordBox], - *, - min_amounts: int, - max_amounts: int, -) -> bool: - """A transaction line has at least one date AND enough amount - tokens to satisfy the configured shape.""" - if not _find_dates_in_words(row_words): - return False - amounts = _find_amount_tokens(row_words) - return min_amounts <= len(amounts) <= max_amounts - - -def _description_from_row( - row_words: list[WordBox], - date_idx: int, - amount_idxs: set[int], -) -> str: - """Stitch the row's description: everything between the date - word and the first amount token, plus anything after the last - amount that isn't itself an amount.""" - keep: list[str] = [] - seen_first_amount = False - last_amount_idx = max(amount_idxs) if amount_idxs else -1 - for i, w in enumerate(row_words): - if i == date_idx: - continue - if i in amount_idxs: - seen_first_amount = True - continue - # After the last amount, trailing tokens are usually a - # check number or memo — keep them too. - if seen_first_amount and i < last_amount_idx: - continue - keep.append(w.text) - return " ".join(keep).strip() - - -def _assign_amounts_by_shape( - amount_tokens: list[tuple[int, WordBox, str]], - shape: str, - parse_opts: dict[str, Any], - column_centers: list[float] | None = None, -) -> dict[str, Any]: - """Map raw amount tokens to typed CSV fields per the shape. - - Shapes: - ``single`` → first amount is ``amount`` (sign in value) - ``txn_balance`` → leftmost is ``amount``, rightmost is - ``balance`` - ``debit_credit`` → if one token, assign to debit or credit by - x-position (uses ``column_centers``); if two, leftmost is - debit, next is credit. Combine into signed ``amount``. - ``debit_credit_balance`` → leftmost is debit, middle is - credit, rightmost is balance. - """ - out: dict[str, Any] = {} - if not amount_tokens: - return out - txt = [t[2] for t in amount_tokens] - boxes = [t[1] for t in amount_tokens] - - if shape == "single": - out["amount"] = parse_amount(txt[0], parse_opts) - - elif shape == "txn_balance": - out["amount"] = parse_amount(txt[0], parse_opts) - if len(txt) >= 2: - out["balance"] = parse_amount(txt[-1], parse_opts) - - elif shape == "debit_credit": - debit_val: float | None = None - credit_val: float | None = None - if len(txt) == 1 and column_centers and len(column_centers) >= 2: - # Decide debit vs credit by which column-center the token's - # midpoint is closest to. - mid = (boxes[0].x0 + boxes[0].x1) / 2 - distances = [abs(mid - c) for c in column_centers[:2]] - if distances[0] <= distances[1]: - debit_val = parse_amount(txt[0], parse_opts) - else: - credit_val = parse_amount(txt[0], parse_opts) - else: - # Two tokens: leftmost = debit, rightmost = credit. - if len(txt) >= 1: - debit_val = parse_amount(txt[0], parse_opts) - if len(txt) >= 2: - credit_val = parse_amount(txt[1], parse_opts) - amt = 0.0 - if credit_val: - amt += credit_val - if debit_val: - amt -= debit_val - out["amount"] = amt - out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "") - - elif shape == "debit_credit_balance": - debit_val = None - credit_val = None - if len(txt) == 2 and column_centers and len(column_centers) >= 3: - # Two tokens but the shape expects three — fall through - # to x-position assignment using the configured columns. - mids = [(b.x0 + b.x1) / 2 for b in boxes] - assigned: list[int | None] = [None, None, None] - for k, m in enumerate(mids): - col = min( - range(3), - key=lambda c, m=m: abs(m - column_centers[c]), - ) - assigned[col] = k - if assigned[0] is not None: - debit_val = parse_amount(txt[assigned[0]], parse_opts) - if assigned[1] is not None: - credit_val = parse_amount(txt[assigned[1]], parse_opts) - if assigned[2] is not None: - out["balance"] = parse_amount(txt[assigned[2]], parse_opts) - else: - if len(txt) >= 1: - debit_val = parse_amount(txt[0], parse_opts) - if len(txt) >= 2: - credit_val = parse_amount(txt[1], parse_opts) - if len(txt) >= 3: - out["balance"] = parse_amount(txt[2], parse_opts) - amt = 0.0 - if credit_val: - amt += credit_val - if debit_val: - amt -= debit_val - out["amount"] = amt - out["type"] = "credit" if amt > 0 else ("debit" if amt < 0 else "") - else: - # Unknown shape — fall back to the simplest interpretation. - out["amount"] = parse_amount(txt[0], parse_opts) - return out - - -def _infer_amount_column_centers( - rows: list[list[WordBox]], - *, - expected: int, - min_amounts: int, - max_amounts: int, -) -> list[float]: - """Cluster amount-token x-midpoints across all transaction rows - to find natural column centers. Returns up to *expected* centers - sorted left-to-right. - - Avoids re-introducing user-drawn coordinates: the columns are - inferred from the data itself. We can't run k-means without - scikit-learn, so use a simple sorted-midpoints + greedy bucket - by proximity tolerance approach. - """ - midpoints: list[float] = [] - for row_words in rows: - if not _row_is_transaction( - row_words, min_amounts=min_amounts, max_amounts=max_amounts, - ): - continue - for _, w, _ in _find_amount_tokens(row_words): - midpoints.append((w.x0 + w.x1) / 2) - if not midpoints: - return [] - midpoints.sort() - # Bucket by adjacency: any gap > 30pt starts a new bucket. - # 30pt ≈ 4x the typical inter-column spacing on bank statements. - buckets: list[list[float]] = [[midpoints[0]]] - for m in midpoints[1:]: - if m - buckets[-1][-1] <= 30: - buckets[-1].append(m) - else: - buckets.append([m]) - centers = [sum(b) / len(b) for b in buckets] - if len(centers) <= expected: - return centers - # More buckets than expected — keep the *expected* most-populated. - by_pop = sorted( - zip(centers, (len(b) for b in buckets)), - key=lambda x: x[1], - reverse=True, - )[:expected] - return sorted(c for c, _ in by_pop) - - -def find_transaction_rows( - pages: list[Page], - template: dict[str, Any], -) -> list[dict[str, Any]]: - """Heuristic row detector. Returns a list of preview records - suitable for rendering in the build-mode preview table. - - Each record carries the raw text + parsed fields; the GUI - surfaces these so the user can confirm or tune the template - before extraction commits to disk. - """ - rd = template.get("row_detection", {}) or {} - amt_cfg = template.get("amounts", {}) or {} - date_cfg = template.get("date", {}) or {} - pages_cfg = template.get("pages", {}) or {} - - pages_used = _pages_in_range(pages, pages_cfg.get("range", "all")) - skip_pages_re = pages_cfg.get("skip_matching") or "" - if skip_pages_re: - skip_re = re.compile(skip_pages_re, re.IGNORECASE) - pages_used = [p for p in pages_used if not skip_re.search(p.text)] - - min_amounts = int(rd.get("min_amounts_per_row", 1)) - max_amounts = int(rd.get("max_amounts_per_row", 3)) - skip_row_res = [ - re.compile(p, re.IGNORECASE) - for p in (rd.get("skip_rows_matching") or []) - ] - shape = amt_cfg.get("shape", "single") - expected_amount_cols = { - "single": 1, - "txn_balance": 2, - "debit_credit": 2, - "debit_credit_balance": 3, - }.get(shape, 1) - - parse_opts = { - "decimal_separator": amt_cfg.get("decimal_separator", "."), - "thousands_separator": amt_cfg.get("thousands_separator", ","), - "currency_strip": amt_cfg.get("currency_strip", "$"), - "negative_in_parens": amt_cfg.get("negative_in_parens", True), - } - date_formats: list[str] = list(date_cfg.get("formats_fallback") or []) - if date_cfg.get("format"): - date_formats = [date_cfg["format"]] + date_formats - - # First pass per page: gather rows so we can also infer amount - # column centers across the whole document. - all_rows: list[tuple[Page, list[list[WordBox]]]] = [] - for page in pages_used: - rows = cluster_rows( - page.words, - y_tolerance=float(rd.get("y_tolerance", 3.0)), - ) - all_rows.append((page, rows)) - - flat_rows = [r for _, rows in all_rows for r in rows] - column_centers = _infer_amount_column_centers( - flat_rows, - expected=expected_amount_cols, - min_amounts=min_amounts, - max_amounts=max_amounts, - ) - - out: list[dict[str, Any]] = [] - merge_multi = bool(rd.get("merge_multiline_description", True)) - prev: dict[str, Any] | None = None - - for page, rows in all_rows: - for row_words in rows: - line = " ".join(w.text for w in row_words) - if not line.strip(): - continue - if any(rx.search(line) for rx in skip_row_res): - continue - - dates = _find_dates_in_words(row_words) - amount_tokens = _find_amount_tokens(row_words) - - is_txn = bool(dates) and ( - min_amounts <= len(amount_tokens) <= max_amounts - ) - - if not is_txn: - # Possible multi-line description continuation — - # a no-date, no-amount line directly following a - # transaction. - if ( - merge_multi - and prev is not None - and not amount_tokens - and not dates - ): - prev["description"] = ( - (prev.get("description") or "") + " " + line - ).strip() - continue - - date_idx, date_text = dates[0] - amount_idxs = {idx for idx, _, _ in amount_tokens} - desc = _description_from_row(row_words, date_idx, amount_idxs) - - record: dict[str, Any] = { - "date": parse_date(date_text, date_formats) or date_text, - "description": desc, - "_page": page.page_no, - "_raw_line": line, - } - record.update(_assign_amounts_by_shape( - amount_tokens, shape, parse_opts, column_centers, - )) - out.append(record) - prev = record - - return out - - -def apply_template_row_heuristic( - pages: list[Page], - template: dict[str, Any], -) -> pd.DataFrame: - """Row-heuristic counterpart to ``apply_template``. Same return - shape (a DataFrame) so callers don't care which mode produced it.""" - rows = find_transaction_rows(pages, template) - if not rows: - return pd.DataFrame() - df = pd.DataFrame(rows) - # Drop internal helper columns from the user-facing output. - if "_raw_line" in df.columns: - df = df.drop(columns=["_raw_line"]) - preferred = ["date", "description", "amount", "type", "balance"] - cols = [c for c in preferred if c in df.columns] - extras = [c for c in df.columns if c not in cols and c != "_page"] - df = df[cols + extras + (["_page"] if "_page" in df.columns else [])] - return df - - -def apply_template( - pages: list[Page], - template: dict[str, Any], -) -> pd.DataFrame: - """Dispatch by template mode and return the extracted DataFrame. - - ``mode="row_heuristic"`` (default for new templates): no - coordinates needed — finds transaction lines by date+amount - pattern matching. Robust to layout drift between statements. - - ``mode="column_visual"`` (legacy): uses x-position boundaries - from the visual picker. Kept for templates saved before the - row-heuristic shift. - - Templates without a mode key default to ``column_visual`` for - backward compatibility with schema_version=1 templates. - """ - mode = template.get("mode", "column_visual") - if mode == "row_heuristic": - return apply_template_row_heuristic(pages, template) - return _apply_template_column_visual(pages, template) - - -def _apply_template_column_visual( - pages: list[Page], - template: dict[str, Any], -) -> pd.DataFrame: - """Original column-x-position pipeline. Now the legacy code - path; kept for any v1 templates and as the manual-override - advanced mode in the build UI.""" - pages_cfg = template.get("pages", {}) or {} - table_cfg = template.get("table", {}) or {} - columns_cfg = template.get("columns", []) or [] - parse_cfg = template.get("parse", {}) or {} - - pages_used = _pages_in_range(pages, pages_cfg.get("range", "all")) - skip_pages_re = pages_cfg.get("skip_matching") or "" - if skip_pages_re: - skip_re = re.compile(skip_pages_re, re.IGNORECASE) - pages_used = [p for p in pages_used if not skip_re.search(p.text)] - - boundaries = list(table_cfg.get("column_boundaries", []) or []) - header_text = table_cfg.get("header_text", "") or "" - end_markers = list(table_cfg.get("end_markers", []) or []) - skip_rows_res = [ - re.compile(p, re.IGNORECASE) - for p in (table_cfg.get("skip_rows_matching") or []) - ] - merge_multiline = bool(parse_cfg.get("merge_multiline_description", True)) - - target_names = [c.get("target") for c in columns_cfg if c.get("target")] - if not target_names: - target_names = [f"col_{i}" for i in range(len(boundaries) + 1)] - - parse_opts = { - "decimal_separator": parse_cfg.get("decimal_separator", "."), - "thousands_separator": parse_cfg.get("thousands_separator", ","), - "currency_strip": parse_cfg.get("currency_strip", "$"), - "negative_in_parens": parse_cfg.get("amount_negative_in_parens", True), - "date_formats": parse_cfg.get("date_formats") - or ([parse_cfg["date_format"]] if parse_cfg.get("date_format") else []), - } - - out_rows: list[dict[str, Any]] = [] - for page in pages_used: - rows = cluster_rows( - page.words, - y_tolerance=float(table_cfg.get("y_tolerance", 3.0)), - ) - rows = _within_table_window(rows, header_text, end_markers) - - prev_record: dict[str, Any] | None = None - for row_words in rows: - if not boundaries: - cells = [" ".join(w.text for w in row_words)] - else: - cells = assign_columns(row_words, boundaries) - joined = " ".join(c.strip() for c in cells if c.strip()) - if not joined: - continue - if any(rx.search(joined) for rx in skip_rows_res): - continue - - if merge_multiline and _row_is_continuation(cells) and prev_record: - # Glue the non-empty columns into the previous record's - # description (the natural sink for wrapped text). - extra = " ".join(c.strip() for c in cells if c.strip()) - if extra: - prev_record["description"] = ( - (prev_record.get("description") or "") - + " " - + extra - ).strip() - continue - - record_src: dict[str, str] = {} - for col_cfg in columns_cfg: - src_idx = col_cfg.get("source") - tgt = col_cfg.get("target") - if ( - isinstance(src_idx, int) - and 0 <= src_idx < len(cells) - and tgt - ): - record_src[tgt] = cells[src_idx] - - record_src.setdefault("_page", str(page.page_no)) - record = _coerce_amount_columns(record_src, columns_cfg, parse_opts) - record["_page"] = page.page_no - out_rows.append(record) - prev_record = record - - if not out_rows: - return pd.DataFrame() - df = pd.DataFrame(out_rows) - - preferred = ["date", "description", "amount", "type", "balance"] - cols = [c for c in preferred if c in df.columns] - extras = [c for c in df.columns if c not in cols and c != "_page"] - df = df[cols + extras + (["_page"] if "_page" in df.columns else [])] - return df - - # --------------------------------------------------------------------------- # OCR fallback (optional) # --------------------------------------------------------------------------- @@ -943,38 +269,25 @@ def _apply_template_column_visual( def page_has_extractable_text(page: Page, min_words: int = 5) -> bool: """Heuristic: a scanned page typically yields zero or near-zero - words. ``min_words`` of 5 catches title/logo-only pages too.""" + words. ``min_words=5`` catches title/logo-only pages too.""" return len(page.words) >= min_words def _autodetect_tesseract_path() -> str | None: - """Probe well-known install locations for ``tesseract.exe``. - - UB-Mannheim's Windows installer drops Tesseract at one of two - paths by default. Auto-detecting them lets ``ocr_available`` - succeed even when the user (or their installer) skipped the - "Add to PATH" step — the most common Windows install - snag based on real user reports. - - No-op on non-Windows: macOS/Linux package managers - always put ``tesseract`` on PATH, so PATH-based discovery is - sufficient. - """ - import os as _os - import platform as _platform - from pathlib import Path as _Path - - if _platform.system() != "Windows": + """Probe well-known install locations for ``tesseract.exe`` on + Windows. No-op on macOS/Linux where Tesseract is on PATH via + the system package manager.""" + if platform.system() != "Windows": return None candidates = [ r"C:\Program Files\Tesseract-OCR\tesseract.exe", r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe", - _os.path.expandvars( + os.path.expandvars( r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe" ), ] for p in candidates: - if p and _Path(p).exists(): + if p and Path(p).exists(): return p return None @@ -982,27 +295,16 @@ def _autodetect_tesseract_path() -> str | None: def ocr_available() -> tuple[bool, str]: """Return ``(available, reason)`` — is OCR usable right now? - Checks both the Python binding (``pytesseract``) and the - Tesseract binary. The reason string is suitable for surfacing - to the user when OCR is unavailable. - - Discovery order for the Tesseract binary: - - 1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override, - wins over everything else. Useful for portable installs. - 2. Whatever's on PATH (``pytesseract``'s default). - 3. ``_autodetect_tesseract_path`` — known Windows install - locations. Sets ``pytesseract.pytesseract.tesseract_cmd`` - so subsequent ``image_to_data`` calls use the same binary. + Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override, + then PATH-based lookup, then well-known Windows install + locations. """ - import os as _os - try: - import pytesseract # noqa: F401, PLC0415 + import pytesseract # noqa: PLC0415 except ImportError: return False, "pytesseract is not installed." - override = _os.environ.get("DATATOOLS_TESSERACT_PATH") + override = os.environ.get("DATATOOLS_TESSERACT_PATH") if override: pytesseract.pytesseract.tesseract_cmd = override @@ -1010,7 +312,6 @@ def ocr_available() -> tuple[bool, str]: pytesseract.get_tesseract_version() return True, "" except Exception as e_path: - # Fallback: probe known install locations. candidate = _autodetect_tesseract_path() if candidate: pytesseract.pytesseract.tesseract_cmd = candidate @@ -1025,56 +326,18 @@ def ocr_available() -> tuple[bool, str]: return False, f"Tesseract binary not found on PATH: {e_path}" -def render_page_image( - pdf_bytes: bytes, - page_no: int, - *, - target_width: int = 900, -) -> tuple["Any", float]: - """Rasterize one page of *pdf_bytes* (1-indexed) to a PIL image. - - Returns ``(pil_image, scale)`` where ``scale`` is the - pixels-per-PDF-point factor. The caller uses ``scale`` to map - canvas coordinates (pixels) back to PDF coordinates (points). - - ``target_width`` caps the rendered width so the image is a - sensible size for the visual picker — bank statements at 100% - can be 800–1200 pts wide; we want ~900px on screen. - """ - pdfium = _require_pdfium() - - pdf = pdfium.PdfDocument(pdf_bytes) - try: - idx = max(0, min(page_no - 1, len(pdf) - 1)) - page = pdf[idx] - # Width in PDF points → pixels-per-point scale. - pdf_width = page.get_width() - scale = target_width / pdf_width if pdf_width else 2.0 - # Cap scale so big A3-style scans don't blow up. - scale = min(scale, 3.0) - bitmap = page.render(scale=scale) - return bitmap.to_pil(), scale - finally: - pdf.close() - - def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]: - """Run Tesseract over each page of *pdf_bytes* and return a - word-position-rich ``Page`` list, parallel to ``extract_pages``. + """OCR every page of *pdf_bytes* and return word-position-rich + ``Page`` records, parallel to ``extract_pages``. - Caller is responsible for first checking ``ocr_available()``. - Uses pypdfium2 to rasterize and pytesseract's ``image_to_data`` - to recover per-word bounding boxes so the same column-assignment - pipeline keeps working. + Caller must check ``ocr_available()`` first. """ pdfium = _require_pdfium() import pytesseract # noqa: PLC0415 - from PIL import Image # noqa: F401, PLC0415 (transitively required) pages: list[Page] = [] pdf = pdfium.PdfDocument(pdf_bytes) try: - # PDF points-per-inch is 72; scale renders into pixels. scale = dpi / 72.0 for i in range(len(pdf)): pil_image = pdf[i].render(scale=scale).to_pil() @@ -1091,9 +354,6 @@ def ocr_pdf_to_pages(pdf_bytes: bytes, dpi: int = 200) -> list[Page]: top = float(data["top"][j]) width = float(data["width"][j]) height = float(data["height"][j]) - # Convert pixel coords back to PDF points so column - # boundaries from the template (in PDF points) keep - # working when an OCR page is mixed with text pages. words.append(WordBox( x0=left / scale, top=top / scale, @@ -1119,11 +379,10 @@ def extract_pages_auto( *, allow_ocr: bool = True, ) -> tuple[list[Page], list[str]]: - """Try text extraction first; OCR the pages that come back empty. + """Text extraction first; OCR the pages that come back empty. - Returns ``(pages, warnings)``. ``warnings`` is a list of human- - readable strings — e.g. "Pages 3, 4 had no text and OCR is - unavailable; they were skipped." Caller surfaces these in the UI. + Returns ``(pages, warnings)`` — human-readable warning strings + the caller surfaces in the UI. """ warnings: list[str] = [] pages = extract_pages(pdf_bytes) @@ -1146,7 +405,6 @@ def extract_pages_auto( return pages, warnings ocr_pages = ocr_pdf_to_pages(pdf_bytes) - # Splice OCR results into the original list for the blank pages. by_no = {p.page_no: p for p in ocr_pages} merged: list[Page] = [] for p in pages: @@ -1160,3 +418,175 @@ def extract_pages_auto( f"OCR was used for {len(blank)} page(s) with no extractable text." ) return merged, warnings + + +# --------------------------------------------------------------------------- +# Row detection (the only thing the GUI actually calls) +# --------------------------------------------------------------------------- + + +def _find_dates_in_words( + row_words: list[WordBox], +) -> list[tuple[int, str]]: + """Return ``[(word_index, date_text)]`` for the first date-like + substring on this row, or ``[]`` if none. The index lets the + caller exclude the date words from the description text. + + Multi-word formats like ``Jan 15, 2026`` are handled by stitching + up to three adjacent words before matching. + """ + for i, w in enumerate(row_words): + for window in (3, 2, 1): + chunk = " ".join(x.text for x in row_words[i : i + window]) + for rx in _DATE_RES: + m = rx.search(chunk) + if m: + return [(i, m.group(1))] + return [] + + +def _find_amount_tokens( + row_words: list[WordBox], +) -> list[tuple[int, WordBox, str]]: + """Return ``[(word_index, wordbox, normalized_text)]`` for each + amount-shaped token on this row, left-to-right. + + Filters out tokens that match the regex but lack real money + markers (currency symbol, decimal point, parens, sign, + thousand separator) — keeps bare years and page numbers out. + """ + out: list[tuple[int, WordBox, str]] = [] + for i, w in enumerate(row_words): + m = _AMOUNT_RE.search(w.text) + if not m: + continue + token = m.group(1) + if not re.search(r"[\$€£.,()\-]", token): + continue + out.append((i, w, token)) + return out + + +def _description_from_row( + row_words: list[WordBox], + date_idx: int, + amount_idxs: set[int], +) -> str: + """Stitch the description from the row's non-date, non-amount + tokens. Keeps tokens before the first amount and after the last + amount (trailing check numbers and memos); drops words between + amount tokens (usually whitespace artifacts in column gaps).""" + keep: list[str] = [] + seen_first_amount = False + last_amount_idx = max(amount_idxs) if amount_idxs else -1 + for i, w in enumerate(row_words): + if i == date_idx: + continue + if i in amount_idxs: + seen_first_amount = True + continue + if seen_first_amount and i < last_amount_idx: + continue + keep.append(w.text) + return " ".join(keep).strip() + + +def scan_pdf_for_transactions( + pdf_bytes: bytes, + *, + negative_in_parens: bool = True, + allow_ocr: bool = True, + date_formats: list[str] | None = None, + y_tolerance: float = 3.0, + merge_multiline_descriptions: bool = True, +) -> tuple[list[dict[str, Any]], list[str]]: + """Scan *pdf_bytes* for transaction-like rows. + + A row qualifies if it contains a date pattern AND at least one + amount pattern. Each returned record looks like:: + + { + "date": "2026-01-15", # ISO, or raw text if unparsable + "description": "...", + "amount_1": 4.50, # always present + "amount_2": 1000.00, # if a second amount was found + "amount_3": ..., # if a third was found + "page": 1, + "raw": "01/15/2026 Coffee $4.50", + } + + Multi-line descriptions (rows with no date and no amount) attach + to the most recent transaction row when + ``merge_multiline_descriptions=True`` (default). + + Returns ``(rows, warnings)``. Warnings are human-readable + strings the GUI surfaces in an expander. + """ + pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr) + + out_rows: list[dict[str, Any]] = [] + prev: dict[str, Any] | None = None + + for page in pages: + rows = cluster_rows(page.words, y_tolerance=y_tolerance) + for row_words in rows: + line = " ".join(w.text for w in row_words).strip() + if not line: + continue + + dates = _find_dates_in_words(row_words) + amount_tokens = _find_amount_tokens(row_words) + + if not dates or not amount_tokens: + # Continuation candidate — a line on a transaction + # page that has neither a date nor an amount of its + # own. Attach to the previous row's description. + if ( + merge_multiline_descriptions + and prev is not None + and not dates + and not amount_tokens + ): + prev["description"] = ( + (prev["description"] + " " + line).strip() + ) + continue + + date_idx, date_text = dates[0] + amount_idxs = {idx for idx, _, _ in amount_tokens} + desc = _description_from_row(row_words, date_idx, amount_idxs) + + record: dict[str, Any] = { + "date": parse_date(date_text, date_formats) or date_text, + "description": desc, + "page": page.page_no, + "raw": line, + } + for k, (_, _, txt) in enumerate(amount_tokens, start=1): + parsed = parse_amount( + txt, negative_in_parens=negative_in_parens, + ) + # Fall back to the raw text if the parser fails so + # the user sees something to fix in the editor + # rather than a silent NaN. + record[f"amount_{k}"] = ( + parsed if parsed is not None else txt + ) + out_rows.append(record) + prev = record + + return out_rows, warnings + + +__all__ = [ + "PdfDependencyMissing", + "Page", + "WordBox", + "cluster_rows", + "extract_pages", + "extract_pages_auto", + "ocr_available", + "parse_amount", + "parse_date", + "scan_pdf_for_transactions", +] diff --git a/src/pdf_templates.py b/src/pdf_templates.py deleted file mode 100644 index 0339c6d..0000000 --- a/src/pdf_templates.py +++ /dev/null @@ -1,508 +0,0 @@ -"""PDF extract template storage. - -Templates encode "how to read this bank's statements" — page -range, table window markers, column x-positions, target field -mapping, amount/date parse options. They live as JSON files in -``~/.datatools/pdf_templates/`` so an accountant can build one -per source and reuse it for every statement that follows the -same layout. Templates are portable: the ``export`` / ``import`` -flow is just a file copy of the JSON. - -The schema is intentionally a plain dict (not a frozen dataclass) -because the GUI mutates it incrementally during the build flow. -``validate_template`` enforces the contract at save time. - -Schema (``schema_version: 1``):: - - { - "schema_version": 1, - "slug": "chase-personal-checking", - "name": "Chase Personal Checking", - "notes": "", - "created_at": "", - "updated_at": "", - "pages": { - "range": "all" | "1-3" | "2,4,6-", - "skip_matching": "" - }, - "table": { - "header_text": "", - "end_markers": ["", ...], - "column_boundaries": [x0, x1, ...], - "y_tolerance": 3.0, - "skip_rows_matching": ["", ...] - }, - "columns": [ - {"source": 0, "target": "date"}, - ... - # ``target`` is one of: date | description | amount | - # amount_debit | amount_credit | balance | - ], - "parse": { - "date_format": "%m/%d/%Y", - "date_formats": [], - "decimal_separator": ".", - "thousands_separator": ",", - "currency_strip": "$", - "amount_negative_in_parens": true, - "merge_multiline_description": true - }, - "visual": { - "page_width": 612.0, - "page_height": 792.0, - "sample_page": 1, - "table_bbox": [x0, top, x1, bottom] | null - } - } - -The ``visual`` block is preserved across save/load so the build -UI can round-trip the user's last visual-picker state. -""" - -from __future__ import annotations - -import json -import os -import re -import tempfile -from datetime import datetime, timezone -from pathlib import Path -from typing import Any - - -SCHEMA_VERSION = 2 - -# Backward-compatible versions ``load_template`` will accept. -# v1 templates predate the row-heuristic shift and are loaded as -# ``mode="column_visual"``; they're not auto-migrated on disk, so -# the user keeps their canonical original until they re-save. -_LOAD_SUPPORTED_VERSIONS = frozenset({1, 2}) - -# Extraction modes. ``row_heuristic`` is the default for new -# templates — finds transactions by date+amount pattern matching -# with no coordinate dependency. ``column_visual`` is the legacy -# x-position-boundary approach, kept for old templates and for -# the "Advanced" build-mode fallback when the heuristic misfires. -VALID_MODES = frozenset({"row_heuristic", "column_visual"}) - -# Amount shapes for row_heuristic mode. The GUI offers these as a -# dropdown; the parser uses them to assign amount tokens to fields. -VALID_AMOUNT_SHAPES = frozenset({ - "single", - "txn_balance", - "debit_credit", - "debit_credit_balance", -}) - -VALID_TARGETS = frozenset({ - "date", - "description", - "amount", - "amount_debit", - "amount_credit", - "balance", - "type", -}) - - -# --------------------------------------------------------------------------- -# Filesystem layout -# --------------------------------------------------------------------------- - - -def templates_dir() -> Path: - """Return ``~/.datatools/pdf_templates/``. Override via the - ``DATATOOLS_PDF_TEMPLATES_DIR`` env var (used by tests).""" - override = os.environ.get("DATATOOLS_PDF_TEMPLATES_DIR") - if override: - return Path(override) - try: - return Path.home() / ".datatools" / "pdf_templates" - except Exception: - return Path(tempfile.gettempdir()) / "datatools-pdf-templates" - - -def template_path(slug: str) -> Path: - """Resolve *slug* to its on-disk JSON path.""" - return templates_dir() / f"{slug}.json" - - -# --------------------------------------------------------------------------- -# Slugify -# --------------------------------------------------------------------------- - - -_SLUG_STRIP = re.compile(r"[^a-z0-9]+") - - -def slugify(name: str) -> str: - """Make a filesystem-safe slug from a human-friendly name.""" - s = (name or "").strip().lower() - s = _SLUG_STRIP.sub("-", s).strip("-") - return s or "untitled" - - -# --------------------------------------------------------------------------- -# Construction + defaults -# --------------------------------------------------------------------------- - - -def new_template(name: str) -> dict[str, Any]: - """Build a blank template with sensible defaults. - - Defaults to ``mode="row_heuristic"`` — the simpler, more - robust approach. The GUI's build flow lets the user switch to - ``mode="column_visual"`` if the heuristic doesn't fit their - statement layout. - """ - now = datetime.now(tz=timezone.utc).isoformat(timespec="seconds") - slug = slugify(name) - return { - "schema_version": SCHEMA_VERSION, - "slug": slug, - "name": name or slug, - "notes": "", - "mode": "row_heuristic", - "created_at": now, - "updated_at": now, - "pages": { - "range": "all", - "skip_matching": "", - }, - # Row-heuristic config (primary path). - "row_detection": { - "min_amounts_per_row": 1, - "max_amounts_per_row": 3, - "y_tolerance": 3.0, - "skip_rows_matching": [], - "merge_multiline_description": True, - }, - "amounts": { - "shape": "single", - "negative_in_parens": True, - "decimal_separator": ".", - "thousands_separator": ",", - "currency_strip": "$", - }, - "date": { - "format": "%m/%d/%Y", - "formats_fallback": [], - }, - # Column-visual config (legacy / Advanced fallback). Empty - # placeholders so the GUI can populate when the user - # switches modes without inserting keys at runtime. - "table": { - "header_text": "", - "end_markers": [], - "column_boundaries": [], - "y_tolerance": 3.0, - "skip_rows_matching": [], - }, - "columns": [], - "parse": { - "date_format": "%m/%d/%Y", - "date_formats": [], - "decimal_separator": ".", - "thousands_separator": ",", - "currency_strip": "$", - "amount_negative_in_parens": True, - "merge_multiline_description": True, - }, - "visual": { - "page_width": 612.0, - "page_height": 792.0, - "sample_page": 1, - "table_bbox": None, - }, - } - - -# --------------------------------------------------------------------------- -# Validation -# --------------------------------------------------------------------------- - - -def validate_template(template: dict[str, Any]) -> tuple[bool, list[str]]: - """Check the template before saving. Returns ``(ok, errors)``. - - Mode-aware: row-heuristic templates and column-visual - templates have different required fields. The GUI shows the - errors next to the Save button; nothing silent here.""" - errors: list[str] = [] - if not isinstance(template, dict): - return False, ["Template must be a JSON object."] - - sv = template.get("schema_version") - if sv != SCHEMA_VERSION: - errors.append( - f"Unsupported schema_version {sv!r} (expected {SCHEMA_VERSION})." - ) - - name = template.get("name", "") - if not isinstance(name, str) or not name.strip(): - errors.append("name is required.") - - slug = template.get("slug") or slugify(name) - if not re.match(r"^[a-z0-9][a-z0-9-]{0,63}$", slug or ""): - errors.append( - "slug must be lowercase alphanumeric + hyphens, " - "1–64 chars, starting with a letter or digit." - ) - - mode = template.get("mode", "row_heuristic") - if mode not in VALID_MODES: - errors.append( - f"mode {mode!r} must be one of: {sorted(VALID_MODES)}." - ) - - if mode == "row_heuristic": - amounts = template.get("amounts", {}) or {} - shape = amounts.get("shape", "single") - if shape not in VALID_AMOUNT_SHAPES: - errors.append( - f"amounts.shape {shape!r} must be one of: " - f"{sorted(VALID_AMOUNT_SHAPES)}." - ) - rd = template.get("row_detection", {}) or {} - min_a = rd.get("min_amounts_per_row", 1) - max_a = rd.get("max_amounts_per_row", 3) - if not (isinstance(min_a, int) and isinstance(max_a, int)): - errors.append( - "row_detection.min_amounts_per_row and " - "max_amounts_per_row must be integers." - ) - elif min_a < 1 or max_a < min_a: - errors.append( - "row_detection.min_amounts_per_row must be ≥1 and ≤ " - "max_amounts_per_row." - ) - - elif mode == "column_visual": - columns = template.get("columns", []) - if not isinstance(columns, list) or len(columns) < 2: - errors.append( - "column_visual mode: at least two output columns " - "are required." - ) - else: - seen_targets: list[str] = [] - for i, col in enumerate(columns): - if not isinstance(col, dict): - errors.append(f"columns[{i}] must be an object.") - continue - src = col.get("source") - tgt = col.get("target") - if not isinstance(src, int) or src < 0: - errors.append( - f"columns[{i}].source must be a non-negative " - f"integer." - ) - if not isinstance(tgt, str) or not tgt: - errors.append( - f"columns[{i}].target must be a non-empty string." - ) - else: - seen_targets.append(tgt) - if "date" not in seen_targets: - errors.append( - "column_visual mode: at least one column must map " - "to 'date'." - ) - if ( - "amount" not in seen_targets - and not ( - "amount_debit" in seen_targets - and "amount_credit" in seen_targets - ) - ): - errors.append( - "column_visual mode: either an 'amount' column or " - "both 'amount_debit' + 'amount_credit' columns " - "are required." - ) - - table = template.get("table", {}) or {} - boundaries = table.get("column_boundaries", []) - if not isinstance(boundaries, list): - errors.append("table.column_boundaries must be a list.") - - return (not errors), errors - - -# --------------------------------------------------------------------------- -# Persistence -# --------------------------------------------------------------------------- - - -def _atomic_write(path: Path, payload: str) -> None: - """Write *payload* to *path* via a temp file + rename. - - Avoids leaving a half-written JSON if the process dies mid-save — - the GUI saves on every visual-picker change, and a corrupt - template file would be hostile to recover from. - """ - path.parent.mkdir(parents=True, exist_ok=True) - fd, tmp_path = tempfile.mkstemp( - prefix=f".{path.name}.", - suffix=".tmp", - dir=str(path.parent), - ) - try: - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write(payload) - os.replace(tmp_path, path) - except Exception: - try: - os.unlink(tmp_path) - except FileNotFoundError: - pass - raise - - -def save_template(template: dict[str, Any]) -> str: - """Persist *template* to disk; return the slug it was saved as. - - Stamps ``updated_at``. Atomic via temp-file + rename. - Raises ``ValueError`` with a multi-line error list if validation - fails — caller should surface that to the user. - """ - ok, errors = validate_template(template) - if not ok: - raise ValueError("\n".join(errors)) - template = dict(template) - template["updated_at"] = datetime.now(tz=timezone.utc).isoformat( - timespec="seconds" - ) - slug = template["slug"] - payload = json.dumps(template, indent=2, ensure_ascii=False) - _atomic_write(template_path(slug), payload) - return slug - - -def load_template(slug: str) -> dict[str, Any]: - """Read the template at *slug*. Raises ``FileNotFoundError`` if - missing, ``ValueError`` if the JSON is corrupt or the schema - version is unsupported. - - v1 templates (pre row-heuristic) are accepted and migrated - in-memory to v2 shape with ``mode="column_visual"``. The file - on disk is NOT rewritten — the user's canonical original stays - intact until they explicitly re-save, so a buggy migration - can't silently corrupt their template library. - """ - p = template_path(slug) - try: - raw = p.read_text(encoding="utf-8") - except FileNotFoundError: - raise - try: - data = json.loads(raw) - except json.JSONDecodeError as e: - raise ValueError(f"Corrupt template {slug!r}: {e}") from e - sv = data.get("schema_version") - if sv not in _LOAD_SUPPORTED_VERSIONS: - raise ValueError( - f"Template {slug!r} has unsupported schema_version {sv!r}; " - f"this build supports {sorted(_LOAD_SUPPORTED_VERSIONS)}." - ) - return _migrate_to_current(data) - - -def _migrate_to_current(data: dict[str, Any]) -> dict[str, Any]: - """In-memory migration of older schemas to the current shape. - - v1 → v2 adds a ``mode`` key defaulting to ``"column_visual"`` - (since v1 was the column-x-position approach) and stamps - ``schema_version`` to the current value. All v1 keys keep - their original meaning.""" - if data.get("schema_version") == 1: - data = dict(data) - data["schema_version"] = SCHEMA_VERSION - data.setdefault("mode", "column_visual") - return data - - -def delete_template(slug: str) -> bool: - """Remove the template file; returns ``True`` if it existed.""" - p = template_path(slug) - try: - p.unlink() - return True - except FileNotFoundError: - return False - - -def list_templates() -> list[dict[str, Any]]: - """Return a sorted list of ``{slug, name, updated_at}`` summaries. - - Skips files that fail to parse — surfaces them in the manage UI - as warnings rather than crashing the list view. - """ - d = templates_dir() - if not d.exists(): - return [] - out: list[dict[str, Any]] = [] - for p in sorted(d.glob("*.json")): - try: - data = json.loads(p.read_text(encoding="utf-8")) - except Exception: - continue - if not isinstance(data, dict): - continue - out.append({ - "slug": data.get("slug") or p.stem, - "name": data.get("name") or p.stem, - "updated_at": data.get("updated_at", ""), - "notes": data.get("notes", ""), - }) - out.sort(key=lambda r: r["updated_at"] or r["name"], reverse=True) - return out - - -# --------------------------------------------------------------------------- -# Import / export -# --------------------------------------------------------------------------- - - -def template_to_json(template: dict[str, Any]) -> str: - """Serialize a template for download. Pretty-printed for human - inspection / diffing.""" - return json.dumps(template, indent=2, ensure_ascii=False) - - -def template_from_json(payload: str) -> dict[str, Any]: - """Deserialize uploaded template JSON. Validates schema version - but does NOT save — caller decides whether to ``save_template`` - or merge into the current build. - - Raises ``ValueError`` on malformed input.""" - try: - data = json.loads(payload) - except json.JSONDecodeError as e: - raise ValueError(f"Not valid JSON: {e}") from e - if not isinstance(data, dict): - raise ValueError("Top-level JSON must be an object.") - sv = data.get("schema_version") - if sv != SCHEMA_VERSION: - raise ValueError( - f"Imported template has schema_version {sv!r}; " - f"this build expects {SCHEMA_VERSION}." - ) - return data - - -__all__ = [ - "SCHEMA_VERSION", - "VALID_TARGETS", - "delete_template", - "list_templates", - "load_template", - "new_template", - "save_template", - "slugify", - "template_from_json", - "template_path", - "template_to_json", - "templates_dir", - "validate_template", -] diff --git a/tests/test_drawable_canvas_compat.py b/tests/test_drawable_canvas_compat.py deleted file mode 100644 index 22d90ba..0000000 --- a/tests/test_drawable_canvas_compat.py +++ /dev/null @@ -1,116 +0,0 @@ -"""Tests for the streamlit-drawable-canvas compatibility shim. - -The shim re-attaches ``image_to_url`` to ``streamlit.elements.image`` -on modern Streamlit where the helper was relocated to -``streamlit.elements.lib.image_utils`` and given a new signature -(takes a ``LayoutConfig`` dataclass instead of a plain ``int`` -width). - -If this test ever fails on a Streamlit upgrade, it almost -certainly means the ``image_to_url`` function moved AGAIN — the -shim's fallback message points to where to look. Update -``_drawable_canvas_compat.py`` to find the new location. -""" - -from __future__ import annotations - -import sys -import types - - -def test_shim_attaches_image_to_url(): - """After ``install()`` the old import path resolves to a - callable, even on modern Streamlit where the original was - relocated.""" - # Force a fresh import so the module-level _PATCHED guard - # doesn't short-circuit between tests. - sys.modules.pop("src.gui._drawable_canvas_compat", None) - from src.gui._drawable_canvas_compat import install - install() - import streamlit.elements.image as old_loc - assert hasattr(old_loc, "image_to_url") - assert callable(old_loc.image_to_url) - - -def test_shim_is_idempotent(): - """Calling ``install()`` twice doesn't double-wrap or break - anything — important because the page module imports + calls - it once, and a Streamlit script-rerun re-executes the page - module top-to-bottom.""" - sys.modules.pop("src.gui._drawable_canvas_compat", None) - from src.gui._drawable_canvas_compat import install - install() - import streamlit.elements.image as old_loc - first = old_loc.image_to_url - install() - second = old_loc.image_to_url - assert first is second - - -def test_shim_no_op_when_image_to_url_already_present(): - """If a future Streamlit restores ``image_to_url`` at the old - location, the shim must not overwrite it — leave the upstream - function in place so the canvas package gets the official - version, not our compatibility wrapper.""" - sys.modules.pop("src.gui._drawable_canvas_compat", None) - import streamlit.elements.image as old_loc - - sentinel = lambda *a, **kw: "sentinel-url" # noqa: E731 - old_loc.image_to_url = sentinel - try: - from src.gui._drawable_canvas_compat import install - install() - assert old_loc.image_to_url is sentinel, ( - "Shim must not clobber an existing image_to_url." - ) - finally: - # Tidy up so subsequent tests see a clean module. - delattr(old_loc, "image_to_url") - sys.modules.pop("src.gui._drawable_canvas_compat", None) - - -def test_shim_calls_new_function_with_layout_config(): - """The shim's wrapper must translate the old ``(image, width, - clamp, channels, output_format, image_id)`` call into the new - ``(image, layout_config, …)`` signature without breaking.""" - sys.modules.pop("src.gui._drawable_canvas_compat", None) - import streamlit.elements.image as old_loc - if hasattr(old_loc, "image_to_url"): - delattr(old_loc, "image_to_url") - - # Replace the new function with a recorder so we can inspect - # what arguments the shim passed through. - from streamlit.elements.lib import image_utils - captured: dict = {} - original = image_utils.image_to_url - - def recorder(image, layout_config, clamp, channels, output_format, image_id): - captured["image"] = image - captured["layout_config"] = layout_config - captured["clamp"] = clamp - captured["channels"] = channels - captured["output_format"] = output_format - captured["image_id"] = image_id - return "fake-url" - - image_utils.image_to_url = recorder - try: - from src.gui._drawable_canvas_compat import install - install() - result = old_loc.image_to_url( - "fake-image", -1, False, "RGB", "PNG", "test-id", - ) - assert result == "fake-url" - assert captured["image"] == "fake-image" - assert captured["clamp"] is False - assert captured["channels"] == "RGB" - assert captured["output_format"] == "PNG" - assert captured["image_id"] == "test-id" - # The shim wraps the int width into a LayoutConfig. - from streamlit.elements.lib.layout_utils import LayoutConfig - assert isinstance(captured["layout_config"], LayoutConfig) - finally: - image_utils.image_to_url = original - if hasattr(old_loc, "image_to_url"): - delattr(old_loc, "image_to_url") - sys.modules.pop("src.gui._drawable_canvas_compat", None) diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py index 0f72aed..93abb2f 100644 --- a/tests/test_pdf_extract.py +++ b/tests/test_pdf_extract.py @@ -1,36 +1,33 @@ -"""Tests for the pure PDF-extraction pipeline. +"""Tests for the minimal PDF transaction scanner. -Real PDF parsing (``extract_pages``) is a thin wrapper around -``pdfplumber`` and is exercised by hand on real bank statements. -These tests pin the meaty bits — value parsing, row clustering, -column assignment, template-driven extraction — against synthetic -``WordBox`` data so they run fast and have no PDF dependency. +The public API is one function: ``scan_pdf_for_transactions``. +These tests cover the value-parsing helpers, the row clusterer, +the date/amount token finders, and the end-to-end scanner +against synthetic ``Page`` objects with no real PDF involved. + +End-to-end-on-a-real-PDF coverage lives in +``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate +a fixture statement at test time. """ from __future__ import annotations -import pandas as pd - from src.pdf_extract import ( Page, WordBox, - apply_template, - assign_columns, + _find_amount_tokens, + _find_dates_in_words, cluster_rows, parse_amount, parse_date, - _pages_in_range, - _within_table_window, ) def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox: - """Convenience constructor — heights and exact x1 don't matter - for the tests we write.""" return WordBox( x0=x0, top=top, - x1=x1 if x1 is not None else x0 + 10 * len(text), + x1=x1 if x1 is not None else x0 + 8 * len(text), bottom=top + 10, text=text, ) @@ -61,13 +58,18 @@ class TestParseAmount: assert parse_amount("not a number") is None def test_european_decimal(self): - opts = { - "decimal_separator": ",", - "thousands_separator": ".", - "currency_strip": "€", - "negative_in_parens": True, - } - assert parse_amount("€1.234,56", opts) == 1234.56 + assert parse_amount( + "€1.234,56", + decimal=",", + thousands=".", + currency_strip="€", + ) == 1234.56 + + def test_parens_off_disables_paren_negative(self): + # With parens off, (4.50) won't be treated as negative — + # but it also won't parse cleanly since "(4.50)" isn't a + # plain number. Verify the off-path is non-flipping. + assert parse_amount("(4.50)", negative_in_parens=False) is None class TestParseDate: @@ -78,7 +80,7 @@ class TestParseDate: assert parse_date("2026-01-15", ["%Y-%m-%d"]) == "2026-01-15" def test_fallback_format(self): - # Not in the supplied list — should still parse via fallback. + # Not in supplied list — should still parse via fallback. assert parse_date("01/15/26") == "2026-01-15" def test_invalid(self): @@ -88,199 +90,74 @@ class TestParseDate: class TestClusterRows: def test_groups_close_y(self): words = [ - _w("A", x0=0, top=100), - _w("B", x0=20, top=101), - _w("C", x0=40, top=102), + _w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102), ] - rows = cluster_rows(words, y_tolerance=3.0) + rows = cluster_rows(words) assert len(rows) == 1 assert [w.text for w in rows[0]] == ["A", "B", "C"] def test_separates_far_y(self): - words = [ - _w("A", x0=0, top=100), - _w("B", x0=0, top=120), - ] - rows = cluster_rows(words, y_tolerance=3.0) - assert [[w.text for w in r] for r in rows] == [["A"], ["B"]] + words = [_w("A", 0, 100), _w("B", 0, 120)] + assert [ + [w.text for w in r] for r in cluster_rows(words) + ] == [["A"], ["B"]] def test_sorts_left_to_right_within_row(self): - words = [ - _w("C", x0=40, top=100), - _w("A", x0=0, top=100), - _w("B", x0=20, top=100), - ] - rows = cluster_rows(words) - assert [w.text for w in rows[0]] == ["A", "B", "C"] + words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)] + assert [w.text for w in cluster_rows(words)[0]] == ["A", "B", "C"] def test_empty(self): assert cluster_rows([]) == [] -class TestAssignColumns: - def test_three_columns(self): - # boundaries at x=100, 200 → columns [0,100), [100,200), [200,∞) - row = [ - _w("Jan", x0=10, top=0, x1=40), # col 0 - _w("1", x0=45, top=0, x1=55), # col 0 - _w("Deposit", x0=110, top=0, x1=180), # col 1 - _w("250.00", x0=210, top=0, x1=260), # col 2 - ] - cells = assign_columns(row, [100, 200]) - assert cells[0] == "Jan 1" - assert cells[1] == "Deposit" - assert cells[2] == "250.00" +class TestFindDatesInWords: + def test_us_slash(self): + row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)] + assert _find_dates_in_words(row) == [(0, "01/15/2026")] - def test_no_boundaries_one_column(self): - row = [_w("A", 0, 0), _w("B", 20, 0)] - cells = assign_columns(row, []) - assert cells == ["A B"] + def test_two_digit_year(self): + row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)] + result = _find_dates_in_words(row) + assert result and result[0][1] == "01/15/26" + + def test_iso(self): + row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)] + assert _find_dates_in_words(row) == [(0, "2026-01-15")] + + def test_month_name(self): + row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)] + result = _find_dates_in_words(row) + assert result and "Jan 15" in result[0][1] + + def test_no_date(self): + row = [_w("Just", 0, 0), _w("text", 50, 0)] + assert _find_dates_in_words(row) == [] -class TestPagesInRange: - def _mk(self, n): - return [Page(page_no=i + 1, width=600, height=800, text="", words=[]) for i in range(n)] +class TestFindAmountTokens: + def test_currency_format(self): + row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)] + out = _find_amount_tokens(row) + assert len(out) == 1 + assert out[0][2] == "$4.50" - def test_all(self): - pages = self._mk(5) - assert len(_pages_in_range(pages, "all")) == 5 - assert len(_pages_in_range(pages, "")) == 5 + def test_parens_negative(self): + row = [_w("(123.45)", 0, 0)] + out = _find_amount_tokens(row) + assert out and out[0][2] == "(123.45)" - def test_explicit_list(self): - pages = self._mk(5) - got = [p.page_no for p in _pages_in_range(pages, "1,3,5")] - assert got == [1, 3, 5] + def test_no_amount_on_pure_text(self): + row = [_w("Hello", 0, 0), _w("World", 50, 0)] + assert _find_amount_tokens(row) == [] - def test_range(self): - pages = self._mk(5) - got = [p.page_no for p in _pages_in_range(pages, "2-4")] - assert got == [2, 3, 4] - - def test_open_ended(self): - pages = self._mk(5) - got = [p.page_no for p in _pages_in_range(pages, "3-")] - assert got == [3, 4, 5] + def test_rejects_bare_year(self): + # A bare 4-digit year matches the digit pattern but lacks + # any money marker — should be filtered out. + row = [_w("2026", 0, 0)] + assert _find_amount_tokens(row) == [] -class TestWithinTableWindow: - def test_header_skipped_end_excluded(self): - rows = [ - [_w("STATEMENT", 0, 0)], - [_w("Date", 0, 20), _w("Description", 50, 20), _w("Amount", 200, 20)], - [_w("01/15", 0, 40), _w("Coffee", 50, 40), _w("4.50", 200, 40)], - [_w("01/16", 0, 60), _w("Refund", 50, 60), _w("12.00", 200, 60)], - [_w("Closing", 0, 80), _w("balance", 50, 80)], - [_w("Page", 0, 100), _w("1", 50, 100)], - ] - out = _within_table_window(rows, "Date Description Amount", ["Closing balance"]) - # Should keep just the two transaction rows. - assert len(out) == 2 - assert out[0][0].text == "01/15" - assert out[1][0].text == "01/16" - - def test_no_header_returns_empty_when_required(self): - rows = [[_w("foo", 0, 0)]] - assert _within_table_window(rows, "Date Description Amount", []) == [] - - def test_blank_header_passes_through(self): - rows = [[_w("x", 0, 0)], [_w("y", 0, 20)]] - assert _within_table_window(rows, "", []) == rows - - -class TestApplyTemplate: - """End-to-end on synthetic ``Page`` objects.""" - - def _statement_page(self) -> Page: - # Mock layout: 3 columns at x=0/100/200, header at y=20, data at 40+. - words = [ - _w("STATEMENT", 0, 0), - # Header - _w("Date", 5, 20), _w("Description", 105, 20), _w("Amount", 205, 20), - # Row 1 - _w("01/15/2026", 5, 40), _w("Coffee", 105, 40), - _w("Shop", 140, 40), _w("(4.50)", 205, 40), - # Row 2 - _w("01/16/2026", 5, 60), _w("Refund", 105, 60), _w("$12.00", 205, 60), - # Continuation row (no date) — should merge into row 2 - _w("from", 105, 80), _w("vendor", 140, 80), - # End marker - _w("Closing", 5, 100), _w("balance", 105, 100), _w("$1,000.00", 205, 100), - ] - return Page(page_no=1, width=300, height=120, text="", words=words) - - def _template(self) -> dict: - return { - "pages": {"range": "all"}, - "table": { - "header_text": "Date Description Amount", - "end_markers": ["Closing balance"], - "column_boundaries": [100, 200], - "y_tolerance": 3.0, - "skip_rows_matching": [], - }, - "columns": [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ], - "parse": { - "date_format": "%m/%d/%Y", - "amount_negative_in_parens": True, - "merge_multiline_description": True, - }, - } - - def test_basic_extraction(self): - df = apply_template([self._statement_page()], self._template()) - assert isinstance(df, pd.DataFrame) - assert len(df) == 2 - assert list(df["date"]) == ["2026-01-15", "2026-01-16"] - # Parens-negative - assert df.iloc[0]["amount"] == -4.50 - # Plain positive with currency strip - assert df.iloc[1]["amount"] == 12.00 - # Multi-line description merged - assert "from vendor" in df.iloc[1]["description"] - - def test_debit_credit_split_columns(self): - # Layout: date | description | debit | credit columns - page = Page( - page_no=1, width=400, height=80, text="", - words=[ - _w("Date", 5, 0), _w("Desc", 105, 0), - _w("Debit", 205, 0), _w("Credit", 305, 0), - _w("01/15/2026", 5, 20), _w("Coffee", 105, 20), _w("4.50", 205, 20), - _w("01/16/2026", 5, 40), _w("Refund", 105, 40), - _w("", 205, 40), # no debit - _w("12.00", 305, 40), - ], - ) - tpl = { - "table": { - "header_text": "Date Desc Debit Credit", - "column_boundaries": [100, 200, 300], - }, - "columns": [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount_debit"}, - {"source": 3, "target": "amount_credit"}, - ], - "parse": {"date_format": "%m/%d/%Y"}, - } - df = apply_template([page], tpl) - assert list(df["amount"]) == [-4.50, 12.00] - assert list(df["type"]) == ["debit", "credit"] - - def test_skip_rows_matching(self): - page = self._statement_page() - tpl = self._template() - tpl["table"]["skip_rows_matching"] = ["Refund"] - df = apply_template([page], tpl) - # Refund row is dropped — only one transaction left - assert len(df) == 1 - assert df.iloc[0]["amount"] == -4.50 - - def test_empty_pages_returns_empty_df(self): - df = apply_template([], self._template()) - assert df.empty +# End-to-end tests against synthetic Page objects are in the smoke +# test module — they need ``scan_pdf_for_transactions`` which in +# turn uses ``extract_pages_auto``. The unit-test layer here pins +# the building blocks; smoke tests pin the wiring. diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index f6c4004..f648871 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -1,55 +1,43 @@ -"""End-to-end smoke tests for the PDF extraction stack. +"""End-to-end smoke tests for the PDF transaction scanner. -These tests run real ``pdfplumber`` + ``pypdfium2`` calls against -a small PDF generated in-memory with ``fpdf2``. They exist to -catch the failure mode the user hit on first install — a missing -or mismatched native dependency that doesn't show up until the -extractor actually tries to open a PDF. +These run real ``pdfplumber`` + ``pypdfium2`` (when OCR is in play) +calls against a small statement-shaped PDF generated in memory +with ``fpdf2``. They catch the failure modes most likely to bite +an end-user installer build: missing native lib, broken hook +bundling, pin/installed mismatch. -Per ``project-pdf-extractor`` memory: ``test_pdf_extract.py`` -covers the parsing logic on synthetic ``WordBox`` data with no -PDF dep involved. This file is the layer above: it confirms the -deps themselves work, that hooks bundled them correctly (the -versions pinned in ``requirements.txt`` matter here), and that -the extractor's pipeline survives a round-trip through real -``pdfplumber.extract_words`` and real ``pypdfium2.render``. - -Generation note: ``fpdf2`` is a test-only dep listed in +Generation note: ``fpdf2`` is a test-only dep in ``requirements-dev.txt``. We don't ship it. """ from __future__ import annotations -import io - import pytest def _build_tiny_statement_pdf() -> bytes: - """Render a one-page PDF that looks roughly like the simplest - possible bank statement: a header line + three transaction - rows + a closing-balance footer. Word positions are stable - enough that the parser can identify columns by x-position.""" + """One-page PDF: header line + three transaction rows + a + closing-balance footer. The scanner should pick up exactly the + three transactions.""" from fpdf import FPDF pdf = FPDF(orientation="P", unit="pt", format="letter") pdf.add_page() pdf.set_font("Helvetica", size=12) - # Header pdf.set_xy(40, 50) pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT") - # Transaction-table header row + # Header row (not a transaction — no amount) pdf.set_xy(40, 100) pdf.cell(120, 14, "Date") pdf.set_xy(160, 100) pdf.cell(200, 14, "Description") pdf.set_xy(360, 100) pdf.cell(80, 14, "Amount") - # Three rows + # Three transactions rows = [ - ("01/15/2026", "Coffee Shop", "(4.50)"), - ("01/16/2026", "Refund Vendor", "$12.00"), - ("01/17/2026", "ATM Withdrawal","(40.00)"), + ("01/15/2026", "Coffee Shop", "(4.50)"), + ("01/16/2026", "Refund Vendor", "$12.00"), + ("01/17/2026", "ATM Withdrawal", "(40.00)"), ] y = 130 for date, desc, amt in rows: @@ -60,7 +48,7 @@ def _build_tiny_statement_pdf() -> bytes: pdf.set_xy(360, y) pdf.cell(80, 14, amt) y += 20 - # Closing-balance footer + # Footer — has a date-like number maybe but no real txn shape pdf.set_xy(40, y + 20) pdf.cell(0, 14, "Closing balance: $1,000.00") return bytes(pdf.output()) @@ -72,12 +60,8 @@ def _build_tiny_statement_pdf() -> bytes: class TestDependencyImports: - """Each runtime PDF dep must be importable. - - These tests will fail fast on a stripped/broken install — most - valuable as a CI gate when the requirements.txt pins are - bumped, so we know the new pin still installs cleanly across - the matrix.""" + """Each runtime PDF dep must be importable. Fails fast on a + stripped install or a missing CI pin.""" def test_pdfplumber(self): import pdfplumber # noqa: F401 @@ -85,130 +69,135 @@ class TestDependencyImports: def test_pypdfium2(self): import pypdfium2 # noqa: F401 - def test_streamlit_drawable_canvas(self): - # Don't instantiate the canvas — that needs a Streamlit - # script-run context. Just confirm the module loads. - import streamlit_drawable_canvas # noqa: F401 - def test_pytesseract(self): - # The Python binding must import even when the Tesseract - # binary isn't installed — the OCR availability check - # handles binary absence separately. import pytesseract # noqa: F401 def test_PIL(self): - # Transitively required by pdfplumber + pypdfium2 + canvas. - # Pinning explicit confirms hooks pull it through. from PIL import Image # noqa: F401 # --------------------------------------------------------------------------- -# Real-PDF round-trip +# End-to-end against a real PDF # --------------------------------------------------------------------------- -class TestRealPdfRoundTrip: - """``extract_pages`` + ``apply_template`` against a real PDF.""" - +class TestScanPdfForTransactions: @pytest.fixture def pdf_bytes(self) -> bytes: return _build_tiny_statement_pdf() - def test_extract_pages_returns_words(self, pdf_bytes): - from src.pdf_extract import extract_pages - pages = extract_pages(pdf_bytes) - assert len(pages) == 1 - assert pages[0].width > 0 and pages[0].height > 0 - # At minimum we should have the words from the header and - # one transaction row — proves pdfplumber wired up. - all_text = " ".join(w.text for w in pages[0].words) - assert "ACME" in all_text - assert "Coffee" in all_text - assert "01/15/2026" in all_text + def test_finds_three_transactions(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, warnings = scan_pdf_for_transactions(pdf_bytes) + # The PDF has 3 transactions plus a header and a closing- + # balance footer. Header has no amount; closing-balance has + # no date in the same line — neither qualifies as a txn. + assert len(rows) == 3, ( + f"expected 3 rows, got {len(rows)}:\n" + f"{[r.get('raw') for r in rows]}" + ) - def test_apply_template_extracts_three_rows(self, pdf_bytes): - from src.pdf_extract import apply_template, extract_pages - # The template's column boundaries are tuned to fpdf2's - # x-coordinates above (40 / 160 / 360 pt). - tpl = { - "pages": {"range": "all"}, - "table": { - "header_text": "Date Description Amount", - "end_markers": ["Closing balance"], - "column_boundaries": [150, 350], - "y_tolerance": 3.0, - }, - "columns": [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ], - "parse": { - "date_format": "%m/%d/%Y", - "amount_negative_in_parens": True, - "merge_multiline_description": True, - }, - } - pages = extract_pages(pdf_bytes) - df = apply_template(pages, tpl) - assert len(df) == 3, f"expected 3 rows, got {len(df)}:\n{df}" - assert list(df["date"]) == [ + def test_parses_dates_to_iso(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + assert [r["date"] for r in rows] == [ "2026-01-15", "2026-01-16", "2026-01-17", ] - # Parens-negative + currency-positive both round-trip - assert df.iloc[0]["amount"] == -4.50 - assert df.iloc[1]["amount"] == 12.00 - assert df.iloc[2]["amount"] == -40.00 + + def test_parses_amounts_with_signs(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + assert rows[0]["amount_1"] == -4.50 + assert rows[1]["amount_1"] == 12.00 + assert rows[2]["amount_1"] == -40.00 + + def test_preserves_raw_line(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + # Raw line lets the user verify what was matched. + assert all("raw" in r and r["raw"] for r in rows) + assert "Coffee" in rows[0]["raw"] + + def test_page_tagged(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + assert all(r["page"] == 1 for r in rows) + + def test_negative_in_parens_off(self, pdf_bytes): + """With parens-negative off, the parser can't decode + ``(4.50)`` and falls back to the raw text — the row still + surfaces, just with the unparsed string in the amount slot + so the user can see and fix it in the editor.""" + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions( + pdf_bytes, negative_in_parens=False, + ) + # Row 0 had "(4.50)" — without parens-negative, parse_amount + # returns None and the scanner keeps the raw token. + assert rows[0]["amount_1"] == "(4.50)" + # Row 1 had "$12.00" — still parses to positive. + assert rows[1]["amount_1"] == 12.00 # --------------------------------------------------------------------------- -# pypdfium2 rendering (powers the visual picker) +# Multi-line description merging # --------------------------------------------------------------------------- -class TestRenderPageImage: - """``render_page_image`` is what feeds the drawable canvas. +class TestMultilineDescription: + def test_continuation_line_merges(self): + """A line with no date and no amount, sitting between two + transaction rows, attaches to the previous transaction's + description.""" + from src.pdf_extract import ( + Page, + WordBox, + scan_pdf_for_transactions, + ) + # Build a synthetic page through the public entry point by + # going through extract_pages_auto's intermediate? Easier: + # call the internals directly via a fake PDF. For unit + # coverage of the merge behavior, route through the helper: + from src import pdf_extract as mod - Catches the most common installer-bug: native PDFium .dll/.so - missing from the bundle. If this test crashes with a - ``FileNotFoundError`` it almost always means the - ``hook-pypdfium2.py`` didn't pick up the shared lib.""" + original = mod.extract_pages_auto - def test_renders_a_real_pil_image(self): - from src.pdf_extract import render_page_image - pdf_bytes = _build_tiny_statement_pdf() - image, scale = render_page_image(pdf_bytes, page_no=1) - # Letter-size at scale ≈ 900/612 ≈ 1.47 → ~900px wide. - assert image.width > 800 - assert image.height > 800 - assert scale > 0 - # PIL Image is duck-typed; check the attrs we depend on. - assert hasattr(image, "save") - assert hasattr(image, "tobytes") + def fake(_pdf_bytes, *, allow_ocr=True): + words = [ + WordBox(x0=0, top=0, x1=80, bottom=10, text="01/15/2026"), + WordBox(x0=100, top=0, x1=160, bottom=10, text="Coffee"), + WordBox(x0=200, top=0, x1=240, bottom=10, text="$4.50"), + # Continuation: no date, no amount + WordBox(x0=100, top=20, x1=160, bottom=30, text="Vendor"), + WordBox(x0=170, top=20, x1=230, bottom=30, text="memo"), + # Next transaction + WordBox(x0=0, top=40, x1=80, bottom=50, text="01/16/2026"), + WordBox(x0=100, top=40, x1=160, bottom=50, text="Other"), + WordBox(x0=200, top=40, x1=240, bottom=50, text="$10.00"), + ] + return [Page( + page_no=1, width=300, height=100, text="", words=words, + )], [] - def test_invalid_page_number_clamps(self): - from src.pdf_extract import render_page_image - pdf_bytes = _build_tiny_statement_pdf() - # PDF has 1 page; page_no=99 should clamp, not raise. - image, scale = render_page_image(pdf_bytes, page_no=99) - assert image.width > 0 + mod.extract_pages_auto = fake + try: + rows, _ = scan_pdf_for_transactions(b"") + finally: + mod.extract_pages_auto = original + + assert len(rows) == 2 + assert "Vendor memo" in rows[0]["description"] + assert rows[1]["description"] == "Other" # --------------------------------------------------------------------------- -# Graceful-fallback behavior +# Graceful fallback when deps absent # --------------------------------------------------------------------------- class TestPdfDependencyMissing: - """The page should see a clean exception when a dep is absent, - not a raw ``ImportError`` that leaks into the Streamlit traceback.""" - def test_require_pdfplumber_raises_typed_on_absence(self, monkeypatch): from src import pdf_extract - # Simulate "pdfplumber not installed" without uninstalling. - # ``_require_pdfplumber`` does its own ``import pdfplumber`` - # at call time; patch ``__import__`` to throw for that one - # name only. import builtins real_import = builtins.__import__ @@ -218,10 +207,10 @@ class TestPdfDependencyMissing: return real_import(name, *a, **kw) monkeypatch.setattr(builtins, "__import__", fake_import) - with pytest.raises(pdf_extract.PdfDependencyMissing) as exc_info: + with pytest.raises(pdf_extract.PdfDependencyMissing) as exc: pdf_extract._require_pdfplumber() - assert "pdfplumber" in str(exc_info.value) - assert exc_info.value.hint # actionable hint must be populated + assert "pdfplumber" in str(exc.value) + assert exc.value.hint def test_require_pdfium_raises_typed_on_absence(self, monkeypatch): from src import pdf_extract @@ -239,17 +228,13 @@ class TestPdfDependencyMissing: # --------------------------------------------------------------------------- -# Requirements-pin consistency +# Requirements pin consistency # --------------------------------------------------------------------------- class TestPinnedVersionsMatchInstalled: """If someone bumps the pin in ``requirements.txt`` without - actually reinstalling, this test points it out before CI does. - - Uses ``importlib.metadata`` rather than each library's - ``__version__`` attribute because not every PDF dep exposes - one (``pypdfium2`` keeps version info on a submodule).""" + actually reinstalling, this test points it out before CI does.""" def _parse_pins(self) -> dict[str, str]: from pathlib import Path @@ -266,21 +251,17 @@ class TestPinnedVersionsMatchInstalled: pins[name.strip()] = version.strip() return pins - def _installed(self, dist_name: str) -> str: - import importlib.metadata as md - return md.version(dist_name) - @pytest.mark.parametrize("dist_name", [ "pdfplumber", "pypdfium2", "pytesseract", - "streamlit-drawable-canvas", ]) def test_pin_matches_installed(self, dist_name): + import importlib.metadata as md pins = self._parse_pins() if dist_name not in pins: pytest.skip(f"{dist_name} not exact-pinned in requirements.txt") - installed = self._installed(dist_name) + installed = md.version(dist_name) assert installed == pins[dist_name], ( f"installed {dist_name}=={installed} but requirements.txt " f"pins {pins[dist_name]} — bump the pin, or reinstall." @@ -288,79 +269,52 @@ class TestPinnedVersionsMatchInstalled: # --------------------------------------------------------------------------- -# OCR availability runtime probe +# OCR availability # --------------------------------------------------------------------------- class TestOcrAvailability: - """``ocr_available`` is the linchpin of the UI's OCR banner. - Returns ``(bool, str)`` — both branches must round-trip.""" - def test_returns_a_tuple(self): from src.pdf_extract import ocr_available result = ocr_available() - assert isinstance(result, tuple) - assert len(result) == 2 + assert isinstance(result, tuple) and len(result) == 2 ok, reason = result assert isinstance(ok, bool) assert isinstance(reason, str) def test_extract_pages_auto_skips_ocr_when_disabled(self): from src.pdf_extract import extract_pages_auto - # With allow_ocr=False, no OCR even if pages are blank. pdf_bytes = _build_tiny_statement_pdf() pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=False) assert len(pages) == 1 - # No OCR-disabled warning on a text PDF, since pages have text. assert not any("OCR is disabled" in w for w in warnings) class TestTesseractDiscovery: - """Windows install paths + env-var override are how a real user - (no PATH munging) gets OCR working. Cover the discovery logic - even on Linux/macOS test runners by mocking out the OS check - and ``Path.exists``.""" - def test_autodetect_returns_none_on_non_windows(self, monkeypatch): from src import pdf_extract - monkeypatch.setattr( - "platform.system", - lambda: "Linux", - ) + monkeypatch.setattr("platform.system", lambda: "Linux") assert pdf_extract._autodetect_tesseract_path() is None def test_autodetect_finds_program_files_on_windows(self, monkeypatch): from src import pdf_extract monkeypatch.setattr("platform.system", lambda: "Windows") - target = r"C:\Program Files\Tesseract-OCR\tesseract.exe" def fake_exists(self): return str(self) == target - monkeypatch.setattr( - "pathlib.Path.exists", - fake_exists, - ) + monkeypatch.setattr("pathlib.Path.exists", fake_exists) assert pdf_extract._autodetect_tesseract_path() == target - def test_autodetect_returns_none_when_nothing_installed( - self, monkeypatch, - ): + def test_autodetect_returns_none_when_nothing_installed(self, monkeypatch): from src import pdf_extract monkeypatch.setattr("platform.system", lambda: "Windows") monkeypatch.setattr("pathlib.Path.exists", lambda self: False) assert pdf_extract._autodetect_tesseract_path() is None def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path): - """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a - portable install at a non-default path works without - relying on PATH.""" from src import pdf_extract - # Point the override at a path that doesn't exist — - # ocr_available will try it and report the failure, but - # importantly the cmd attribute is set BEFORE the call, - # which is what we're verifying. fake_bin = str(tmp_path / "fake-tesseract.exe") monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin) pdf_extract.ocr_available() diff --git a/tests/test_pdf_row_heuristic.py b/tests/test_pdf_row_heuristic.py deleted file mode 100644 index c9f06b9..0000000 --- a/tests/test_pdf_row_heuristic.py +++ /dev/null @@ -1,280 +0,0 @@ -"""Tests for the row-heuristic extraction pipeline. - -This is now the primary extraction mode — uses date + amount -pattern matching to find transaction lines, with no dependency -on x-position column boundaries. Robust to layout drift across -statements from the same bank. - -The legacy column-visual pipeline keeps its own tests in -``test_pdf_extract.py``. -""" - -from __future__ import annotations - -import pandas as pd - -from src.pdf_extract import ( - Page, - WordBox, - apply_template, - apply_template_row_heuristic, - find_transaction_rows, - _find_amount_tokens, - _find_dates_in_words, - _infer_amount_column_centers, -) - - -def _w(text: str, x0: float, top: float) -> WordBox: - return WordBox( - x0=x0, - top=top, - x1=x0 + 8 * len(text), - bottom=top + 10, - text=text, - ) - - -class TestFindDatesInRow: - def test_us_slash(self): - row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)] - assert _find_dates_in_words(row) == [(0, "01/15/2026")] - - def test_two_digit_year(self): - row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)] - result = _find_dates_in_words(row) - assert result and result[0][1] == "01/15/26" - - def test_iso(self): - row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)] - assert _find_dates_in_words(row) == [(0, "2026-01-15")] - - def test_month_name(self): - # "Jan 15, 2026" — three word tokens, should stitch. - row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)] - result = _find_dates_in_words(row) - assert result, "Multi-word month-day-year should match" - assert "Jan 15" in result[0][1] - - def test_no_date(self): - row = [_w("Just", 0, 0), _w("text", 50, 0)] - assert _find_dates_in_words(row) == [] - - -class TestFindAmountTokens: - def test_currency_format(self): - row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)] - out = _find_amount_tokens(row) - assert len(out) == 1 - assert out[0][2] == "$4.50" - - def test_parens_negative(self): - row = [_w("(123.45)", 0, 0)] - out = _find_amount_tokens(row) - assert out and out[0][2] == "(123.45)" - - def test_no_amount_on_pure_text(self): - row = [_w("Hello", 0, 0), _w("World", 50, 0)] - assert _find_amount_tokens(row) == [] - - def test_rejects_bare_year(self): - # "2026" matches the digit pattern but lacks $/decimal/etc., - # so the looks-like-amount filter should drop it. - row = [_w("2026", 0, 0)] - # Bare integer can pass the regex but not the heuristic. - out = _find_amount_tokens(row) - # Either filtered out OR included — both are defensible. - # If included, it'd be missed-amount territory not a false- - # positive. Pin the conservative behavior: NO match. - assert out == [], "Bare 4-digit year should not register as amount" - - -class TestInferAmountColumnCenters: - def test_two_clear_columns(self): - # 5 rows, each with two amounts at roughly x=300 and x=450. - rows = [] - for top in range(0, 100, 20): - rows.append([ - _w("01/15/2026", 20, top), - _w("Item", 100, top), - _w("$10.00", 300, top), - _w("$1,000.00", 450, top), - ]) - centers = _infer_amount_column_centers( - rows, expected=2, min_amounts=2, max_amounts=2, - ) - assert len(centers) == 2 - # Left center ≈ 300 + 8*len("$10.00")/2 = 300+24 = 324 - assert 310 < centers[0] < 340 - assert 460 < centers[1] < 490 - - def test_no_transactions_returns_empty(self): - rows = [[_w("just", 0, 0), _w("text", 50, 0)]] - assert _infer_amount_column_centers( - rows, expected=2, min_amounts=1, max_amounts=3, - ) == [] - - -class TestRowHeuristicEndToEnd: - """Synthetic ``Page`` objects exercise the full row-heuristic - pipeline end-to-end without a real PDF.""" - - def _page_single_amount(self) -> Page: - words = [ - _w("ACME BANK STATEMENT", 20, 0), - _w("01/15/2026", 20, 30), _w("Coffee", 100, 30), - _w("Shop", 150, 30), _w("$4.50", 400, 30), - _w("01/16/2026", 20, 50), _w("Refund", 100, 50), - _w("from", 100, 70), _w("vendor", 140, 70), # continuation - _w("Vendor", 140, 50), _w("$12.00", 400, 50), - _w("Page", 20, 90), _w("1", 60, 90), # not a txn - ] - return Page(page_no=1, width=600, height=120, text="", words=words) - - def test_extracts_two_rows_single_amount(self): - tpl = { - "mode": "row_heuristic", - "row_detection": { - "min_amounts_per_row": 1, - "max_amounts_per_row": 1, - "merge_multiline_description": True, - }, - "amounts": {"shape": "single", "negative_in_parens": True}, - "date": {"format": "%m/%d/%Y"}, - } - df = apply_template_row_heuristic([self._page_single_amount()], tpl) - assert len(df) == 2 - assert list(df["date"]) == ["2026-01-15", "2026-01-16"] - # Multi-line description merged - assert "from vendor" in df.iloc[1]["description"] - - def test_dispatches_through_apply_template(self): - tpl = { - "mode": "row_heuristic", - "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1}, - "amounts": {"shape": "single"}, - "date": {"format": "%m/%d/%Y"}, - } - df = apply_template([self._page_single_amount()], tpl) - assert isinstance(df, pd.DataFrame) - assert len(df) == 2 - - def test_txn_balance_shape(self): - page = Page( - page_no=1, width=600, height=100, text="", words=[ - _w("01/15/2026", 20, 0), _w("Coffee", 100, 0), - _w("(4.50)", 300, 0), _w("1,000.00", 450, 0), - _w("01/16/2026", 20, 20), _w("Refund", 100, 20), - _w("12.00", 300, 20), _w("1,012.00", 450, 20), - ], - ) - tpl = { - "mode": "row_heuristic", - "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 2}, - "amounts": {"shape": "txn_balance", "negative_in_parens": True}, - "date": {"format": "%m/%d/%Y"}, - } - df = apply_template([page], tpl) - assert len(df) == 2 - assert df.iloc[0]["amount"] == -4.50 - assert df.iloc[0]["balance"] == 1000.00 - assert df.iloc[1]["amount"] == 12.00 - assert df.iloc[1]["balance"] == 1012.00 - - def test_debit_credit_balance_shape(self): - page = Page( - page_no=1, width=600, height=100, text="", words=[ - _w("01/15/2026", 20, 0), _w("Coffee", 100, 0), - _w("4.50", 300, 0), _w("1,000.00", 450, 0), - _w("01/16/2026", 20, 20), _w("Refund", 100, 20), - _w("12.00", 380, 20), _w("1,012.00", 450, 20), - ], - ) - tpl = { - "mode": "row_heuristic", - "row_detection": {"min_amounts_per_row": 2, "max_amounts_per_row": 3}, - "amounts": {"shape": "debit_credit_balance"}, - "date": {"format": "%m/%d/%Y"}, - } - df = apply_template([page], tpl) - assert len(df) == 2 - # Row 0: amount at x=300 (debit column) → debit, balance at 450 - assert df.iloc[0]["amount"] == -4.50 - assert df.iloc[0]["type"] == "debit" - # Row 1: amount at x=380 (credit column) → credit, balance at 450 - assert df.iloc[1]["amount"] == 12.00 - assert df.iloc[1]["type"] == "credit" - - def test_skip_rows_matching(self): - page = self._page_single_amount() - tpl = { - "mode": "row_heuristic", - "row_detection": { - "min_amounts_per_row": 1, - "max_amounts_per_row": 1, - "skip_rows_matching": ["Refund"], - }, - "amounts": {"shape": "single"}, - "date": {"format": "%m/%d/%Y"}, - } - df = apply_template_row_heuristic([page], tpl) - assert len(df) == 1 - assert df.iloc[0]["date"] == "2026-01-15" - - def test_layout_drift_doesnt_matter(self): - """The whole point of row-heuristic: same template works - on pages of different sizes / different column x-positions.""" - # Page A: amounts at x=400 - page_a = Page( - page_no=1, width=600, height=80, text="", words=[ - _w("01/15/2026", 20, 0), _w("Coffee", 100, 0), - _w("$4.50", 400, 0), - ], - ) - # Page B: amounts shifted to x=520 (different layout) - page_b = Page( - page_no=1, width=720, height=80, text="", words=[ - _w("01/15/2026", 50, 0), _w("Coffee", 150, 0), - _w("$4.50", 520, 0), - ], - ) - tpl = { - "mode": "row_heuristic", - "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1}, - "amounts": {"shape": "single"}, - "date": {"format": "%m/%d/%Y"}, - } - df_a = apply_template([page_a], tpl) - df_b = apply_template([page_b], tpl) - # Both should extract — proves no coordinate dependency. - assert len(df_a) == 1 - assert len(df_b) == 1 - assert df_a.iloc[0]["amount"] == df_b.iloc[0]["amount"] == 4.50 - - -class TestFindTransactionRows: - """The pre-DataFrame stage — returns dict records the build UI - uses to render a preview before the user commits.""" - - def test_returns_records(self): - page = Page( - page_no=1, width=600, height=80, text="", words=[ - _w("01/15/2026", 20, 0), _w("Coffee", 100, 0), - _w("$4.50", 400, 0), - ], - ) - tpl = { - "mode": "row_heuristic", - "row_detection": {"min_amounts_per_row": 1, "max_amounts_per_row": 1}, - "amounts": {"shape": "single"}, - "date": {"format": "%m/%d/%Y"}, - } - rows = find_transaction_rows([page], tpl) - assert len(rows) == 1 - r = rows[0] - assert r["date"] == "2026-01-15" - assert r["description"] == "Coffee" - assert r["amount"] == 4.50 - assert r["_page"] == 1 - # Raw line is preserved so the GUI can show "what we saw" - assert "_raw_line" in r diff --git a/tests/test_pdf_templates.py b/tests/test_pdf_templates.py deleted file mode 100644 index 551dab6..0000000 --- a/tests/test_pdf_templates.py +++ /dev/null @@ -1,316 +0,0 @@ -"""Tests for the PDF template storage layer.""" - -from __future__ import annotations - -import json - -import pytest - -from src.pdf_templates import ( - SCHEMA_VERSION, - delete_template, - list_templates, - load_template, - new_template, - save_template, - slugify, - template_from_json, - template_path, - templates_dir, - template_to_json, - validate_template, -) - - -@pytest.fixture -def isolated_templates(monkeypatch, tmp_path): - """Redirect the templates directory into ``tmp_path``.""" - monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path)) - yield tmp_path - - -class TestSlugify: - def test_basic(self): - assert slugify("Chase Personal Checking") == "chase-personal-checking" - - def test_strips_punctuation(self): - assert slugify("BofA: Business (USD)") == "bofa-business-usd" - - def test_empty_falls_back(self): - assert slugify("") == "untitled" - assert slugify(" ") == "untitled" - - -class TestNewTemplate: - def test_has_schema_version(self): - t = new_template("Sample") - assert t["schema_version"] == SCHEMA_VERSION - - def test_slug_derived_from_name(self): - t = new_template("Sample Bank") - assert t["slug"] == "sample-bank" - assert t["name"] == "Sample Bank" - - def test_timestamps_present(self): - t = new_template("X") - assert t["created_at"] - assert t["updated_at"] - - -class TestValidateTemplateRowHeuristic: - """Row-heuristic mode is the v2 default.""" - - def _valid(self) -> dict: - return { - "schema_version": SCHEMA_VERSION, - "slug": "x", - "name": "X", - "mode": "row_heuristic", - "row_detection": { - "min_amounts_per_row": 1, - "max_amounts_per_row": 3, - }, - "amounts": {"shape": "single"}, - "date": {"format": "%m/%d/%Y"}, - } - - def test_valid_passes(self): - ok, errs = validate_template(self._valid()) - assert ok, errs - - def test_missing_name_fails(self): - t = self._valid() - t["name"] = "" - ok, errs = validate_template(t) - assert not ok - - def test_bad_mode_fails(self): - t = self._valid() - t["mode"] = "magic" - ok, errs = validate_template(t) - assert not ok - assert any("mode" in e for e in errs) - - def test_bad_shape_fails(self): - t = self._valid() - t["amounts"]["shape"] = "telepathic" - ok, errs = validate_template(t) - assert not ok - assert any("shape" in e for e in errs) - - def test_inverted_amount_range_fails(self): - t = self._valid() - t["row_detection"]["min_amounts_per_row"] = 5 - t["row_detection"]["max_amounts_per_row"] = 2 - ok, errs = validate_template(t) - assert not ok - - def test_does_not_require_columns_in_row_mode(self): - """Key point: row mode doesn't need ``columns`` populated. - That's what makes the GUI's primary path simpler than v1.""" - t = self._valid() - # No columns key at all. - ok, errs = validate_template(t) - assert ok, errs - - -class TestValidateTemplateColumnVisual: - """Legacy column-visual mode keeps its own contract.""" - - def _valid(self) -> dict: - return { - "schema_version": SCHEMA_VERSION, - "slug": "x", - "name": "X", - "mode": "column_visual", - "pages": {"range": "all"}, - "table": {"column_boundaries": [100, 200]}, - "columns": [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ], - "parse": {}, - } - - def test_valid_passes(self): - ok, errs = validate_template(self._valid()) - assert ok, errs - - def test_requires_date_column(self): - t = self._valid() - t["columns"] = [ - {"source": 0, "target": "description"}, - {"source": 1, "target": "amount"}, - ] - ok, errs = validate_template(t) - assert not ok - assert any("date" in e for e in errs) - - def test_requires_amount_or_debit_credit(self): - t = self._valid() - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - ] - ok, errs = validate_template(t) - assert not ok - assert any("amount" in e for e in errs) - - def test_debit_credit_pair_is_valid(self): - t = self._valid() - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount_debit"}, - {"source": 3, "target": "amount_credit"}, - ] - t["table"]["column_boundaries"] = [100, 200, 300] - ok, errs = validate_template(t) - assert ok, errs - - -class TestV1Migration: - """v1 templates load with mode='column_visual' auto-injected; - the file on disk stays v1 until the user re-saves.""" - - def test_loads_v1_template(self, isolated_templates, tmp_path): - import json - v1_payload = { - "schema_version": 1, - "slug": "legacy", - "name": "Legacy Bank", - "pages": {"range": "all"}, - "table": {"column_boundaries": [100, 200]}, - "columns": [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ], - "parse": {}, - } - (tmp_path / "legacy.json").write_text( - json.dumps(v1_payload), encoding="utf-8", - ) - loaded = load_template("legacy") - # In-memory migration adds mode + bumps schema_version - assert loaded["mode"] == "column_visual" - assert loaded["schema_version"] == SCHEMA_VERSION - # Original keys still intact - assert loaded["columns"][0]["target"] == "date" - - -class TestPersistence: - def test_round_trip(self, isolated_templates): - t = new_template("Round Trip Bank") - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "description"}, - {"source": 2, "target": "amount"}, - ] - t["table"]["column_boundaries"] = [100, 200] - slug = save_template(t) - assert slug == "round-trip-bank" - - path = template_path(slug) - assert path.exists() - loaded = load_template(slug) - assert loaded["name"] == "Round Trip Bank" - assert loaded["columns"][0]["target"] == "date" - - def test_save_rejects_invalid(self, isolated_templates): - with pytest.raises(ValueError): - save_template({"schema_version": 1, "name": ""}) - - def test_load_missing_raises(self, isolated_templates): - with pytest.raises(FileNotFoundError): - load_template("does-not-exist") - - def test_load_corrupt_raises(self, isolated_templates, tmp_path): - bad = tmp_path / "bad.json" - bad.write_text("not json", encoding="utf-8") - with pytest.raises(ValueError): - load_template("bad") - - def test_delete(self, isolated_templates): - t = new_template("To Delete") - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "amount"}, - ] - t["table"]["column_boundaries"] = [100] - save_template(t) - assert delete_template("to-delete") is True - assert delete_template("to-delete") is False - - def test_list_returns_summaries(self, isolated_templates): - for name in ["Alpha", "Bravo"]: - t = new_template(name) - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "amount"}, - ] - t["table"]["column_boundaries"] = [100] - save_template(t) - rows = list_templates() - assert {r["slug"] for r in rows} == {"alpha", "bravo"} - - def test_list_skips_corrupt(self, isolated_templates, tmp_path): - (tmp_path / "broken.json").write_text("nope", encoding="utf-8") - # Even with a broken file present, list still returns [] - rows = list_templates() - assert rows == [] - - def test_atomic_save_no_partial_file_on_failure( - self, isolated_templates, monkeypatch - ): - """If the write step fails mid-way, no half-written JSON survives - at the target path. Tests the temp-file-rename safety pattern.""" - t = new_template("Atomic") - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "amount"}, - ] - t["table"]["column_boundaries"] = [100] - - # Make json.dumps blow up to simulate a failure during write. - # save_template already validated before this step, so the - # crash is "after validation, during write". - import src.pdf_templates as mod - original_dumps = mod.json.dumps - - def boom(*a, **kw): - raise IOError("disk full") - - monkeypatch.setattr(mod.json, "dumps", boom) - with pytest.raises(IOError): - save_template(t) - monkeypatch.setattr(mod.json, "dumps", original_dumps) - - assert not template_path("atomic").exists() - - -class TestImportExport: - def test_round_trip_via_json(self): - t = new_template("Exported") - t["columns"] = [ - {"source": 0, "target": "date"}, - {"source": 1, "target": "amount"}, - ] - payload = template_to_json(t) - loaded = template_from_json(payload) - assert loaded["name"] == "Exported" - - def test_import_rejects_bad_schema(self): - bad = json.dumps({"schema_version": 999, "name": "X"}) - with pytest.raises(ValueError): - template_from_json(bad) - - def test_import_rejects_non_object(self): - with pytest.raises(ValueError): - template_from_json('["not", "an", "object"]') - - -def test_templates_dir_env_override(monkeypatch, tmp_path): - monkeypatch.setenv("DATATOOLS_PDF_TEMPLATES_DIR", str(tmp_path)) - assert templates_dir() == tmp_path