datatools-dev/src/gui/pages/10_PDF_Extractor.py

"""PDF to CSV — heuristic transaction scanner.

Upload one or more bank-statement PDFs, scan for transaction-like
rows ([date] [description] [amount]), uncheck the rows you don't
want, download as CSV. No templates, no per-bank configuration,
no coordinate picking.
"""

from __future__ import annotations

import hashlib
import sys
from datetime import datetime
from pathlib import Path

import pandas as pd
import streamlit as st

_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.audit import log_event, log_page_open
from src.gui.components import (
    hide_streamlit_chrome,
    html_download_button,
    render_sticky_footer,
    render_tool_header,
)
from src.pdf_extract import (
    PdfDependencyMissing,
    diagnose_pdf_lines,
    format_amount,
    ocr_available,
    scan_pdf_for_transactions,
    year_from_filename,
)


def _pdf_deps_status() -> tuple[bool, list[str]]:
    """Probe each runtime PDF dep without forcing the user to hit
    the Scan button. Returns ``(ok, missing_names)``."""
    missing: list[str] = []
    for name in ("pdfplumber", "pypdfium2"):
        try:
            __import__(name)
        except ImportError:
            missing.append(name)
    return (not missing), missing


# Record the page visit in the audit log.
log_page_open("10_PDF_Extractor")

# The page icon is resolved relative to this file: ``../assets`` from
# the ``pages`` directory.
_ICON_PATH = str(Path(__file__).parent.parent / "assets" / "datatools_icon_256.png")
# Page-level configuration: browser-tab title, icon, and wide layout.
st.set_page_config(
    page_title="PDF to CSV · DataTools",
    page_icon=_ICON_PATH,
    layout="wide",
)
# Shared GUI helpers imported from ``src.gui.components``.
hide_streamlit_chrome()
render_sticky_footer()

# ---------------------------------------------------------------------------
# Session-state keys
# ---------------------------------------------------------------------------

# Transaction rows (list of dicts) produced by the most recent scan.
K_ROWS = "pdf_scan_rows"
# Warning strings collected during the most recent scan.
K_WARNINGS = "pdf_scan_warnings"
# Number of files that went into the most recent scan.
K_SOURCE_COUNT = "pdf_scan_source_count"
# Stamped once at scan time. The download button's file_name
# embeds this so the user gets a unique-per-scan filename — but
# crucially, the value is stable across reruns triggered by
# unrelated widget interactions (otherwise the html_download_button
# helper's session-state key drifts every second and the
# "Saved to <path>" banner never gets to render).
K_TIMESTAMP = "pdf_scan_timestamp"
# ``pdf_uploads`` is the persistent stash of uploaded PDFs (dict
# keyed by filename → {"bytes": ..., "size": ...}). It survives
# Streamlit reruns and navigation away from the page. The
# uploader widget feeds this stash via ``_sync_pdf_uploads`` and
# the custom file list / Clear-all button operate on it.
K_UPLOADS = "pdf_uploads"
# Bumped to force the file_uploader to re-instantiate (clear its
# internal state) when the user removes a file via the custom X or
# clicks Clear-all. Streamlit's widget state is keyed on the widget
# key, so changing the key resets the widget without us having to
# touch its session-state directly (which Streamlit disallows).
K_UPLOAD_COUNTER = "pdf_upload_counter"


def _format_size(n_bytes: int) -> str:
    """Human-friendly file size — KB / MB / GB."""
    size = float(n_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            if unit == "B":
                return f"{int(size)} {unit}"
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


# ---------------------------------------------------------------------------
# Header + dep guard
# ---------------------------------------------------------------------------

# Shared tool header, looked up by this page's identifier.
render_tool_header("10_pdf_extractor")

# Dependency guard: when a required PDF module cannot be imported,
# show install instructions and halt this script run so none of the
# upload / scan UI below is rendered.
_pdf_ok, _pdf_missing = _pdf_deps_status()
if not _pdf_ok:
    # NOTE: the suggested pip command also lists ``pytesseract``,
    # which is not one of the modules probed by ``_pdf_deps_status``.
    st.error(
        "**PDF dependencies are not installed.** "
        f"Missing module(s): `{', '.join(_pdf_missing)}`.\n\n"
        "Install them into the same Python that launches DataTools:\n\n"
        "```\npip install pdfplumber pypdfium2 pytesseract\n```\n\n"
        "Then **fully restart the launcher** to pick up the new modules."
    )
    st.stop()


# ---------------------------------------------------------------------------
# Options + upload
# ---------------------------------------------------------------------------

# Selectbox label → ``strftime`` pattern. The ``"__custom__"`` sentinel
# means "ask the user for a pattern in a separate text input".
_DATE_FORMAT_CHOICES = {
    "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
    "YYYYMMDD (20260113)": "%Y%m%d",
    "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
    "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
    "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
    "Custom strftime…": "__custom__",
}

# Everything in this expander produces the module-level option values
# consumed by the Scan step: ``negative_in_parens``, ``use_ocr``,
# ``output_date_format`` and ``year_override``.
with st.expander("Scan options", expanded=False):
    c1, c2 = st.columns(2)
    negative_in_parens = c1.checkbox(
        "Treat (4.50) as negative",
        value=True,
        help=(
            "Bank statements commonly show withdrawals as ``(4.50)``. "
            "Off if your statements use a different convention."
        ),
    )
    # The OCR checkbox defaults to on only when OCR is usable, and is
    # disabled (with the reason in the tooltip) when it is not.
    _ocr_ok, _ocr_reason = ocr_available()
    use_ocr = c2.checkbox(
        "Use OCR for scanned pages",
        value=_ocr_ok,
        disabled=not _ocr_ok,
        help=(
            f"OCR status: {'ready' if _ocr_ok else _ocr_reason or 'unavailable'}. "
            "Most modern bank PDFs are text-based and don't need OCR — "
            "only enable for image-based scans."
        ),
    )

    c3, c4 = st.columns(2)
    date_label = c3.selectbox(
        "Output date format",
        list(_DATE_FORMAT_CHOICES.keys()),
        index=0,
        help=(
            "Applied to the transaction date AND the statement "
            "period dates pulled from the header. Pick Custom to "
            "enter your own ``strftime`` string."
        ),
    )
    output_date_format = _DATE_FORMAT_CHOICES[date_label]
    if output_date_format == "__custom__":
        output_date_format = c4.text_input(
            "Custom strftime format",
            value="%Y-%m-%d",
            help=(
                "Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
                "2026-01-13, ``%Y%m%d`` for 20260113."
            ),
        )
        # ``strftime`` with an empty pattern yields an empty string, so
        # a cleared text box would presumably blank out every date in
        # the output. Fall back to the input's own default instead,
        # with a visible warning (same pattern as the year override).
        if not output_date_format.strip():
            st.warning(
                "Custom date format is empty — using ``%Y-%m-%d`` "
                "instead."
            )
            output_date_format = "%Y-%m-%d"

    # Year override for short dates. Empty by default — the
    # scanner uses statement-period detection + filename year hint
    # automatically. Set this when the statement period regex
    # misses on a particular bank's layout, or when you want to
    # force a specific year (e.g., historical reconciliation).
    year_override_str = st.text_input(
        "Override year for short dates (optional)",
        value="",
        help=(
            "Short dates like ``01/13`` get bound to a year by the "
            "scanner — statement period first, then filename year, "
            "then this override. Leave blank for automatic. Enter "
            "a 4-digit year (e.g., 2025) to force every short date "
            "to that year. Won't affect dates that already have a "
            "year (``01/13/2025``)."
        ),
    )
    # Blank → None (automatic). Non-numeric or out-of-range input is
    # reported to the user and also treated as None rather than
    # aborting the page.
    try:
        year_override = (
            int(year_override_str) if year_override_str.strip() else None
        )
        if year_override is not None and not (1900 <= year_override <= 2100):
            st.warning(
                f"Year override {year_override} looks wrong — using "
                "automatic detection instead."
            )
            year_override = None
    except ValueError:
        st.warning(
            f"Year override {year_override_str!r} isn't a number — "
            "using automatic detection instead."
        )
        year_override = None

# Persistent stash + rotating widget key. See the comments on the
# K_UPLOADS / K_UPLOAD_COUNTER constants for why the counter exists.
# ``setdefault`` creates both entries on the first run of the session
# and returns the existing values on every later rerun.
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
upload_counter: int = st.session_state.setdefault(K_UPLOAD_COUNTER, 0)
# The uploader's widget key embeds the counter, so bumping the counter
# yields a different key on the next run.
uploader_key = f"pdf_upload_v{upload_counter}"


# Mirror the Home-page upload pattern: the Streamlit file_uploader
# is positioned off-screen via CSS (keeps its underlying ``<input
# type=file>`` reachable to JS), and the page renders a Home-style
# bordered file list with an "Add more files" button at the
# bottom. A small iframe-injected script wires that button to
# programmatically click the hidden uploader so the OS file picker
# opens. Same approach as ``_sync_uploader_to_home_uploads`` in
# ``src/gui/_home.py``.
st.markdown(
    '<style>[data-testid="stFileUploader"] {'
    'position:absolute!important;left:-10000px!important;'
    'width:1px!important;height:1px!important;overflow:hidden!important;'
    'pointer-events:none!important;}</style>',
    unsafe_allow_html=True,
)


def _sync_pdf_uploads() -> None:
    """Uploader ``on_change`` hook: copy new files into the stash.

    Reads the uploader's current value from session state and stores
    every file whose name is not already a key in ``pdf_uploads``,
    logging one ``upload`` event per file added. This hook never
    removes anything — removal is handled by the custom X buttons and
    the counter bump, not here.
    """
    for uploaded in st.session_state.get(uploader_key) or []:
        # Names already in the stash are left untouched.
        if uploaded.name in pdf_uploads:
            continue
        pdf_uploads[uploaded.name] = {
            "bytes": uploaded.getvalue(),
            "size": uploaded.size,
        }
        log_event(
            "upload",
            f"PDF: {uploaded.name}",
            filename=uploaded.name,
            bytes=uploaded.size,
            page="10_PDF_Extractor",
        )


# The real uploader widget. It is moved off-screen by the CSS injected
# earlier on this page; its return value is ignored because new files
# reach the persistent stash through the ``on_change`` callback. The
# key rotates with the upload counter.
st.file_uploader(
    "PDF file(s)",
    type=["pdf"],
    accept_multiple_files=True,
    key=uploader_key,
    on_change=_sync_pdf_uploads,
    label_visibility="collapsed",
    help="Drop one or more bank-statement PDFs. Multi-file batches "
    "are merged into a single table with a ``source_file`` column.",
)


# ---------------------------------------------------------------------------
# Files section (Home-style layout)
# ---------------------------------------------------------------------------

import html as _html

# Inline SVG icons embedded in the raw HTML below: a document glyph
# for each file row and a plus sign for the "Add more files" button.
_DOC_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/>'
    '<path d="M14 2v6h6"/>'
    '</svg>'
)
_PLUS_SVG = (
    '<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">'
    '<path d="M12 5v14M5 12h14"/>'
    '</svg>'
)

# Section-header summary: file count plus combined size, or a
# placeholder when the stash is empty.
n_files = len(pdf_uploads)
if n_files:
    total_bytes = sum(m["size"] for m in pdf_uploads.values())
    files_word = "file" if n_files == 1 else "files"
    meta_html = (
        f'{n_files} {files_word} · '
        f'{_html.escape(_format_size(total_bytes))} total'
    )
else:
    meta_html = "No files imported yet"

st.markdown(
    '<div class="dt-files-section-head">'
    '<h2>Files</h2>'
    f'<span class="dt-section-meta">{meta_html}</span>'
    '</div>',
    unsafe_allow_html=True,
)

# Single bordered card hosting the file rows + the in-card
# "Add more files" button at the bottom, matching the Home page.
# Two-phase remove pattern: walk all rows once, accumulate
# ``to_remove`` if any X was clicked, then mutate state + rerun
# ONCE after the loop so Streamlit doesn't see a half-mutated
# dict mid-render.
to_remove: str | None = None
with st.container(border=True):
    for name, meta in pdf_uploads.items():
        # Button keys are derived from a short hash of the filename
        # rather than the filename itself.
        digest = hashlib.sha1(
            name.encode("utf-8"), usedforsecurity=False,
        ).hexdigest()[:10]
        col_x, col_name, col_size = st.columns([0.55, 8, 1.6])
        if col_x.button(
            "✕",
            key=f"pdf_rm_{digest}",
            help=f"Remove {name}",
            type="tertiary",
        ):
            to_remove = name
        # The filename is user-supplied, so it is HTML-escaped before
        # being interpolated into raw markup.
        col_name.markdown(
            '<div class="dt-file-row">'
            f'<span class="dt-file-icon-chip">{_DOC_SVG}</span>'
            f'<span class="dt-file-name">{_html.escape(name)}</span>'
            '</div>',
            unsafe_allow_html=True,
        )
        col_size.markdown(
            f'<div style="text-align:right;">'
            f'<span class="dt-file-size">'
            f'{_html.escape(_format_size(meta["size"]))}'
            '</span></div>',
            unsafe_allow_html=True,
        )
    # In-card "Add more files" button. The HTML is rendered as-is
    # — Streamlit's sanitiser strips inline ``onclick``, so the
    # click wiring is done by the iframe script below.
    st.markdown(
        '<button class="dt-file-add" type="button">'
        f'{_PLUS_SVG} Add more files'
        '</button>',
        unsafe_allow_html=True,
    )

# Wire the in-card "Add more files" button to the off-screen
# ``stFileUploaderDropzoneInput``. Same idea as the Home page
# (see ``src/gui/_home.py``); a ``MutationObserver`` re-wires after
# every Streamlit rerun in case the button got re-mounted.
#
# The file input is looked up at CLICK time, not at wire time. The
# uploader's widget key rotates whenever a file is removed or the
# list is cleared, which re-instantiates the uploader; a button node
# that survives the rerun (and is therefore already marked
# ``dtWired``) would otherwise keep clicking the stale input captured
# in its closure. Resolving the input lazily also lets the button be
# wired even if the uploader has not mounted yet.
st.iframe(
    """
<script>
  (function () {
    var INPUT_SELECTOR = '[data-testid="stFileUploaderDropzoneInput"]';
    function wire(doc) {
      var btn = doc.querySelector('button.dt-file-add');
      if (!btn) return;
      if (btn.dataset.dtWired === '1') return;
      btn.dataset.dtWired = '1';
      btn.addEventListener('click', function (e) {
        e.preventDefault();
        var input = doc.querySelector(INPUT_SELECTOR);
        if (input) input.click();
      });
    }
    var doc;
    try { doc = window.parent.document; }
    catch (e) { doc = document; }
    wire(doc);
    var win = doc.defaultView || window.parent || window;
    if ('MutationObserver' in win) {
      var raf = 0;
      try {
        new win.MutationObserver(function () {
          if (raf) return;
          raf = win.requestAnimationFrame(function () { raf = 0; wire(doc); });
        }).observe(doc.body, { childList: true, subtree: true });
      } catch (e) {}
    }
  })();
</script>
""",
    height=1,
)

# Second phase of the two-phase remove: apply the removal requested by
# an X button (if any), then restart the script run so the file list
# is re-rendered from the updated stash.
if to_remove is not None:
    log_event(
        "upload",
        f"PDF removed: {to_remove}",
        filename=to_remove,
        page="10_PDF_Extractor",
    )
    del pdf_uploads[to_remove]
    # Bump the uploader counter so the widget re-instantiates and
    # forgets the removed file.
    st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
    st.rerun()


# ---------------------------------------------------------------------------
# Action buttons (Scan + Clear all) live below the Files card
# ---------------------------------------------------------------------------

# Two narrow button columns plus a wide spacer so the buttons stay
# left-aligned. Both buttons are disabled while the stash is empty.
c_scan, c_clear, _spacer = st.columns([1, 1, 4])
with c_scan:
    scan_clicked = st.button(
        "Scan",
        type="primary",
        disabled=not pdf_uploads,
        use_container_width=True,
    )
with c_clear:
    if st.button(
        "Clear all files",
        type="secondary",
        disabled=not pdf_uploads,
        help="Removes all uploaded files and the last scan result.",
        use_container_width=True,
    ):
        # Replace the stash with a fresh dict, rotate the uploader key
        # so the widget forgets its files, and drop every piece of
        # state left over from the previous scan.
        st.session_state[K_UPLOADS] = {}
        st.session_state[K_UPLOAD_COUNTER] = upload_counter + 1
        for k in (K_ROWS, K_WARNINGS, K_SOURCE_COUNT, K_TIMESTAMP):
            st.session_state.pop(k, None)
        # ``n_files`` was computed before the clear, so the logged
        # count is the number of files that were removed.
        log_event(
            "upload",
            "PDF list cleared",
            page="10_PDF_Extractor",
            count=n_files,
        )
        st.rerun()


# ---------------------------------------------------------------------------
# Scan
# ---------------------------------------------------------------------------

# Runs only on the script pass in which Scan was clicked. Results are
# written to session state so they remain available on later reruns.
if scan_clicked and pdf_uploads:
    all_rows: list[dict] = []
    all_warnings: list[str] = []
    n_files = len(pdf_uploads)
    with st.status(
        f"Scanning {n_files} file(s)…",
        expanded=True,
    ) as status:
        for i, (name, meta) in enumerate(pdf_uploads.items(), start=1):
            st.write(f"**{i}/{n_files}** · {name}")
            # Failures are captured per file and surfaced as warnings,
            # so one bad PDF does not abort the rest of the batch.
            try:
                raw = meta["bytes"]
                rows, warns = scan_pdf_for_transactions(
                    raw,
                    negative_in_parens=negative_in_parens,
                    allow_ocr=use_ocr,
                    output_date_format=output_date_format,
                    filename_year_hint=year_from_filename(name),
                    year_override=year_override,
                )
                # Tag every row with its originating file so merged
                # multi-file results stay traceable.
                for r in rows:
                    r["source_file"] = name
                all_rows.extend(rows)
                all_warnings.extend(f"[{name}] {w}" for w in warns)
            except PdfDependencyMissing as e:
                all_warnings.append(f"[{name}] {e}")
            except Exception as e:
                all_warnings.append(
                    f"[{name}] scan failed: {type(e).__name__}: {e}"
                )
        status.update(
            label=(
                f"Found {len(all_rows):,} candidate transactions "
                f"across {n_files} file(s)"
            ),
            state="complete",
            expanded=False,
        )

    st.session_state[K_ROWS] = all_rows
    st.session_state[K_WARNINGS] = all_warnings
    st.session_state[K_SOURCE_COUNT] = n_files
    # Stamped once per scan; the download filename reuses this value.
    st.session_state[K_TIMESTAMP] = datetime.now().strftime("%Y%m%d-%H%M%S")

    log_event(
        "tool_run",
        "PDF scan",
        page="10_PDF_Extractor",
        files=n_files,
        rows=len(all_rows),
        warnings=len(all_warnings),
    )


# ---------------------------------------------------------------------------
# Results — editable table + download
# ---------------------------------------------------------------------------

# Read the last scan's results back from session state. ``rows`` is
# ``None`` when no scan result is stored (never scanned, or cleared),
# and an empty list when a scan ran but produced no rows.
rows = st.session_state.get(K_ROWS)
warnings = st.session_state.get(K_WARNINGS) or []
source_count = st.session_state.get(K_SOURCE_COUNT, 0)

# Warnings are collapsed by default and shown regardless of whether
# any rows were found.
if warnings:
    with st.expander(f"Warnings ({len(warnings)})", expanded=False):
        for w in warnings:
            st.warning(w)

# Three result states:
#   * ``rows is None``  — nothing scanned yet: show a next-step hint.
#   * ``rows == []``    — scan found nothing: show per-file diagnostics.
#   * otherwise         — editable table plus CSV download.
if rows is None:
    if pdf_uploads:
        st.info("Click **Scan** to detect transactions.")
    else:
        st.info("Upload one or more PDF files to begin.")

elif not rows:
    st.info(
        "No transaction rows detected. The scanner looks for lines "
        "containing a date and at least one amount. The diagnostic "
        "below shows every line the PDF reader could see — use the "
        "``has_date`` and ``has_amount`` columns to spot which "
        "pieces are missing (usually one or the other)."
    )
    if pdf_uploads:
        with st.expander(
            "Diagnostic: what the scanner saw",
            expanded=True,
        ):
            # One diagnostic table per stashed file; a failure on one
            # file is reported inline and the loop moves on.
            for fname, meta in pdf_uploads.items():
                raw = meta["bytes"]
                st.markdown(f"**{fname}**")
                try:
                    lines, dwarns = diagnose_pdf_lines(
                        raw, allow_ocr=use_ocr, max_lines=200,
                    )
                except Exception as e:
                    st.error(f"Diagnostic failed: {type(e).__name__}: {e}")
                    continue
                for w in dwarns:
                    st.caption(w)
                if not lines:
                    st.warning(
                        "Zero text lines extracted. This is almost "
                        "certainly a scanned (image-based) PDF — "
                        "enable OCR in Scan options if available."
                    )
                    continue
                st.dataframe(
                    pd.DataFrame(lines),
                    hide_index=True,
                    use_container_width=True,
                    height=400,
                )
                # Summary counts over the ``has_date`` / ``has_amount``
                # flags carried by each diagnostic line.
                date_hits = sum(1 for ln in lines if ln["has_date"])
                amt_hits = sum(1 for ln in lines if ln["has_amount"])
                both = sum(
                    1 for ln in lines
                    if ln["has_date"] and ln["has_amount"]
                )
                st.caption(
                    f"{len(lines):,} lines · {date_hits:,} look like "
                    f"they contain a date · {amt_hits:,} look like "
                    f"they contain an amount · {both:,} have both "
                    "(those are the rows the scanner would have kept)."
                )

else:
    df = pd.DataFrame(rows)

    # Order columns so the user-facing fields are leftmost; raw +
    # internals are last and easy to scroll past or unselect at
    # download time. ``account_number`` sits with the transaction
    # detail since it's per-row context an accountant typically
    # wants alongside the amounts.
    front = [
        "date",
        "description",
    ]
    amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
    metadata_cols = ["account_number"]
    tail = ["source_file", "page", "raw"]
    # Keep only the preferred columns that actually exist, then append
    # any remaining columns so nothing the scanner returned is dropped.
    ordered = [
        c for c in front + amount_cols + metadata_cols + tail
        if c in df.columns
    ]
    extras = [c for c in df.columns if c not in ordered]
    df = df[ordered + extras]

    # Prepend the include checkbox.
    df.insert(0, "Include", True)

    st.markdown(
        f"#### {len(df):,} candidate transaction(s) "
        f"from {source_count} file(s)"
    )
    st.caption(
        "Uncheck rows to exclude. Edit any cell to fix a value the "
        "scanner got wrong. The ``raw`` column shows the original "
        "PDF text for that row."
    )

    # Per-column editor configuration: ``raw``, ``page`` and
    # ``source_file`` are read-only; everything else stays editable.
    column_config = {
        "Include": st.column_config.CheckboxColumn(
            "Include",
            default=True,
            help="Uncheck to drop this row from the CSV.",
        ),
        "raw": st.column_config.TextColumn(
            "raw",
            help="Original text line from the PDF (read-only reference).",
            disabled=True,
            width="large",
        ),
        "page": st.column_config.NumberColumn(
            "page", disabled=True, width="small",
        ),
    }
    if "source_file" in df.columns:
        column_config["source_file"] = st.column_config.TextColumn(
            "source_file", disabled=True,
        )
    # Force 2-decimal display on every amount column. Without this,
    # Streamlit / Pandas show floats with their raw repr ("4.5",
    # "12.0", "1000") and the precision looks inconsistent across
    # rows that all came from the same statement. Internal dtype
    # stays float for arithmetic accuracy; only the rendering and
    # CSV-export formatting force two-place precision.
    for amt_col in (c for c in df.columns if c.startswith("amount_")):
        column_config[amt_col] = st.column_config.NumberColumn(
            amt_col,
            format="%.2f",
            help="Two-decimal currency amount.",
        )

    # ``num_rows="fixed"`` — the user can edit cells but not add or
    # delete rows; exclusion is done through the Include checkbox.
    edited = st.data_editor(
        df,
        hide_index=True,
        use_container_width=True,
        column_config=column_config,
        num_rows="fixed",
        key="pdf_results_editor",
    )

    # Rows still ticked, with the helper column stripped off.
    selected = edited[edited["Include"]].drop(columns=["Include"])

    c_dl, c_meta = st.columns([2, 3])
    with c_dl:
        if selected.empty:
            # Nothing to export: show an inert placeholder button.
            st.button("Download CSV", disabled=True)
        else:
            # Reuse the timestamp stamped when this scan finished —
            # stable across reruns so the download helper's button
            # key doesn't drift every second.
            ts = st.session_state.get(K_TIMESTAMP) or "results"
            # Default: drop the internal columns from the download.
            keep_default = [
                c for c in selected.columns
                if c not in ("page", "raw")
            ]
            with c_meta:
                keep = st.multiselect(
                    "Columns to include in CSV",
                    options=list(selected.columns),
                    default=keep_default,
                    help="``page`` and ``raw`` are kept off by default; "
                    "tick them if you want them in the file.",
                )
            # An empty column selection falls back to exporting every
            # column rather than producing an empty file.
            export = (selected[keep] if keep else selected).copy()
            # Coerce every amount column to a fixed 2-decimal string
            # before serialising. Pandas' default float-to-CSV
            # writer drops trailing zeros (4.50 → 4.5) which an
            # accountant immediately notices in Excel, so the
            # precision is preserved explicitly here.
            for amt_col in (
                c for c in export.columns if c.startswith("amount_")
            ):
                export[amt_col] = export[amt_col].map(format_amount)
            csv_bytes = export.to_csv(index=False).encode("utf-8")
            # Save server-side (consistent with the other tools) —
            # writes to the user's Downloads folder and shows the
            # exact path. Avoids the st.download_button quirk where
            # the second-or-later button in a script pass silently
            # fails to fire.
            html_download_button(
                f"Download {len(export):,} rows as CSV",
                csv_bytes,
                file_name=f"transactions-{ts}.csv",
                mime="text/csv",
            )

    if not selected.empty:
        st.caption(
            f"{len(selected):,} of {len(df):,} rows selected."
        )