diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0535491..3a421ae 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -65,6 +65,30 @@ jobs: pip install -r requirements.txt pip install pyinstaller pillow + # ---- Tesseract bundling cache -------------------------------- + # The fetch logic inside build/make_release.py downloads: + # * build/vendor/tessdata/eng.traineddata (~16 MB, shared) + # * build/_tesseract// (binary + libs, 30-120 MB) + # Cache both so iterative CI runs don't re-download. The + # cache key bakes in the pinned Tesseract version + tessdata + # URL so a version bump invalidates automatically. + - name: Cache Tesseract bundle inputs + uses: actions/cache@v4 + with: + path: | + build/_tesseract + build/vendor/tessdata + key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1 + + # ---- Linux: install patchelf so make_release.py can rewrite + # RPATH on the bundled tesseract binary. apt-get install + # tesseract-ocr is handled inside make_release.py itself. ----- + - name: Install Linux build prereqs for Tesseract bundling + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install -y patchelf + - name: Read version id: version shell: bash @@ -75,7 +99,32 @@ jobs: - name: Generate platform icons run: python build/generate_icons.py + # Stage Tesseract before PyInstaller. The make_release.py + # helpers handle the per-platform fetch (UB-Mannheim on Win, + # brew on Mac, apt on Linux) and stage the binary + libs into + # build/_tesseract// where the spec picks them up. + # We invoke a tiny inline Python so the workflow doesn't have + # to know the per-platform target string. 
+ - name: Stage Tesseract binary + tessdata + shell: bash + env: + DATATOOLS_PLATFORM: ${{ matrix.platform }} + run: | + python - <<'PY' + import os, sys + sys.path.insert(0, "build") + from make_release import fetch_tessdata, fetch_tesseract_for_platform + target = os.environ["DATATOOLS_PLATFORM"] + fetch_tessdata() + fetch_tesseract_for_platform(target) + PY + - name: Build PyInstaller bundle + shell: bash + env: + # The spec reads this to find the per-platform staging dir; + # see build/datatools.spec for the contract. + DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }} run: pyinstaller build/datatools.spec --clean --noconfirm # ---- Per-platform installer packaging ------------------------ diff --git a/.gitignore b/.gitignore index 5ecce91..ebf71c6 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,14 @@ build/dist/ build/icon.ico build/icon.icns build/icon.png + +# Tesseract bundling — fetched at build time, not committed. See +# build/vendor/README.md for the canonical URLs and rationale. +# - build/_tesseract/ : per-platform binary + DLLs/dylibs staging dir +# - build/vendor/tessdata/eng.traineddata : ~16 MB language data +build/_tesseract/ +build/vendor/tessdata/*.traineddata + .pytest_cache/ # Claude Code agent worktrees + local settings diff --git a/build/appimage/build.sh b/build/appimage/build.sh index f77c194..f834183 100755 --- a/build/appimage/build.sh +++ b/build/appimage/build.sh @@ -9,6 +9,11 @@ # latest release from https://github.com/AppImage/AppImageKit/releases). # # Output: dist/DataTools--linux-x86_64.AppImage +# +# Tesseract bundling: no-op here. The PyInstaller bundle in +# dist/DataTools/ already contains tesseract/{tesseract, *.so, +# tessdata/eng.traineddata} from the spec's datas; ``cp -R`` +# below carries it along into the AppDir. 
# ----- Tesseract OCR bundle ----------------------------------------
# ``build/make_release.py`` stages the per-platform Tesseract binary
# + its runtime libs (DLLs/dylibs/sos) into
# ``build/_tesseract/{target}/`` and the shared eng.traineddata into
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
# drops them at the path the runtime expects:
#
#   {MEIPASS}/tesseract/tesseract[.exe]
#   {MEIPASS}/tesseract/{runtime libs}
#   {MEIPASS}/tesseract/tessdata/eng.traineddata
#
# The runtime discovery code in src/pdf_extract.py reads this layout
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
#
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
# the right per-platform dir before invoking PyInstaller. For ad-hoc
# `pyinstaller build/datatools.spec` runs without the orchestrator,
# fall back to the canonical staging path.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if _tess_staging_env:
    _tess_staging = Path(_tess_staging_env)
    # The CI workflow passes a repo-relative value
    # (``build/_tesseract/{platform}``). Anchor relative values at the
    # repo root so the result does not depend on the directory
    # PyInstaller was launched from; absolute values (what
    # make_release.py passes) are used untouched.
    if not _tess_staging.is_absolute():
        _tess_staging = REPO / _tess_staging
else:
    # Pick the obvious per-host staging dir as a fallback so spec-only
    # builds (without the orchestrator) still work in dev.
    import sys as _sys_for_target
    _target_guess = (
        "win" if _sys_for_target.platform.startswith("win")
        else "mac" if _sys_for_target.platform == "darwin"
        else "linux"
    )
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess

_tessdata = REPO / "build" / "vendor" / "tessdata"

if _tess_staging.is_dir() and any(_tess_staging.iterdir()):
    # Drop every file in the staging dir directly under
    # ``{MEIPASS}/tesseract/`` (binary + DLL/dylib/so siblings).
    datas += [(str(_tess_staging), "tesseract")]
else:
    # Don't hard-fail spec parse — useful for first-time devs running
    # PyInstaller before fetching binaries. Surface a loud warning
    # though, since the OCR feature will silently fail at runtime.
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/make_release.py (which "
        "calls fetch_tesseract_for_platform) before pyinstaller, or "
        "pre-stage the binary manually."
    )

if (_tessdata / "eng.traineddata").exists():
    # The whole tessdata directory is bundled, not just the one file.
    datas += [(str(_tessdata), "tesseract/tessdata")]
else:
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/make_release.py "
        "or fetch manually per build/vendor/README.md."
    )

# Bundle the Apache-2.0 LICENSE text alongside the binary. The file is
# expected at the repo root as LICENSE_TESSERACT.txt; PyInstaller
# drops it at the bundle root next to DataTools[.exe].
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if _tess_license.exists():
    datas += [(str(_tess_license), ".")]
else:
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )
+# +# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire +# COLLECT output (binaries + datas) into the .app's +# Contents/Resources tree, so the ``tesseract/`` subdir we built up +# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/`` +# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing +# needed. import sys as _sys if _sys.platform == "darwin": app = BUNDLE( diff --git a/build/installer.iss b/build/installer.iss index a8a1b32..02d1a7d 100644 --- a/build/installer.iss +++ b/build/installer.iss @@ -63,6 +63,14 @@ Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1 [Files] +; PyInstaller's dist/DataTools/ tree includes: +; * DataTools.exe + frozen Python runtime +; * tesseract/tesseract.exe + DLLs + tessdata/eng.traineddata +; (bundled via build/datatools.spec datas; runtime discovery in +; src/pdf_extract.py reads sys._MEIPASS / "tesseract" / ...). +; * LICENSE_TESSERACT.txt at the bundle root (Apache-2.0). +; The recursesubdirs flag below picks all of those up — no separate +; Files: entry needed for tesseract/. Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion [Icons] diff --git a/build/macos/build_dmg.sh b/build/macos/build_dmg.sh index 011b3ea..c0598d5 100755 --- a/build/macos/build_dmg.sh +++ b/build/macos/build_dmg.sh @@ -10,6 +10,11 @@ # # Code signing + notarization happen separately (see build/README.md # "Signing"). This script only handles the packaging step. +# +# Tesseract bundling: no-op here. The .app already contains +# Contents/Resources/tesseract/{tesseract, *.dylib, tessdata/} thanks +# to PyInstaller's BUNDLE() carrying the spec's datas through. This +# script just wraps the finished .app — no extra steps for OCR. 
set -euo pipefail diff --git a/build/macos/build_zip.sh b/build/macos/build_zip.sh index 43022a9..d979ca0 100755 --- a/build/macos/build_zip.sh +++ b/build/macos/build_zip.sh @@ -14,6 +14,11 @@ # Run after ``pyinstaller build/datatools.spec --clean --noconfirm`` # has produced ``dist/DataTools.app``. Output goes to # ``dist/DataTools--mac-portable.zip``. +# +# Tesseract bundling: no-op here. The bundled Tesseract binary + +# dylibs + tessdata are already inside DataTools.app/Contents/Resources/tesseract/ +# (placed by PyInstaller's BUNDLE/datas mechanism). ``ditto -c -k`` +# preserves the whole .app tree. set -euo pipefail diff --git a/build/make_release.py b/build/make_release.py index 8597ae8..23ff242 100644 --- a/build/make_release.py +++ b/build/make_release.py @@ -32,17 +32,33 @@ Run from the repo root or from build/ — either works. from __future__ import annotations import argparse +import os import platform import re import shutil import subprocess import sys +import urllib.request from pathlib import Path REPO = Path(__file__).resolve().parent.parent BUILD = REPO / "build" DIST = REPO / "dist" +# Tesseract bundling. The runtime discovery code in +# ``src/pdf_extract.py`` looks for the binary at +# ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and tessdata +# at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. We stage +# everything under ``build/_tesseract//`` (gitignored) and +# the PyInstaller spec adds that staging dir to ``datas=`` so it lands +# at the right place inside the frozen bundle. +TESSERACT_VERSION = "5.5.0" +TESSDATA_DIR = BUILD / "vendor" / "tessdata" +TESSDATA_URL = ( + "https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata" +) +TESSERACT_STAGING = BUILD / "_tesseract" + # --------------------------------------------------------------------------- # Output helpers — colourless so logs stay readable in any terminal/CI tail. 
# ---------------------------------------------------------------------------
# Tesseract bundling — fetch the binary + tessdata at build time.
#
# We download (not vendor) because:
#   * Binaries are large (tens of MB per platform) and awkward to keep
#     current in git.
#   * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
#     bloats clones for contributors who don't touch OCR.
#
# Caching layout:
#   build/_tesseract/win/tesseract.exe + DLLs
#   build/_tesseract/mac/tesseract + dylibs
#   build/_tesseract/linux/tesseract + libs
#   build/vendor/tessdata/eng.traineddata   (shared across platforms)
#
# The PyInstaller spec reads ``build/_tesseract/{target}/`` and the
# tessdata dir, then bundles them under ``{MEIPASS}/tesseract/``.
# ---------------------------------------------------------------------------

# Install prefixes under which Homebrew places dylibs (Apple Silicon and
# Intel respectively). Anything outside these is treated as OS-provided.
_BREW_PREFIXES = ("/opt/homebrew/", "/usr/local/")


def _download(
    url: str,
    dest: Path,
    *,
    expected_min_bytes: int = 1024,
    sha256: str | None = None,
) -> None:
    """Download *url* to *dest* atomically and sanity-check the result.

    The payload is streamed into ``<dest>.part`` and only renamed onto
    *dest* once every check has passed, so *dest* never holds a partial
    or rejected file.

    Args:
        url: Location to fetch.
        dest: Final path of the downloaded file; parents are created.
        expected_min_bytes: Reject anything smaller (catches HTML error
            pages served with a 200 status).
        sha256: Optional hex digest the payload must match. ``None``
            (the default) skips the check, preserving the old behaviour.

    Raises:
        RuntimeError: The payload is too small or its digest differs.
        Exception: Any network/file error from the transfer, re-raised
            after being logged.
    """
    import hashlib  # function-scope so the file's import block is untouched

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".part")
    print(f" GET {url}", flush=True)
    digest = hashlib.sha256()
    size = 0
    try:
        try:
            with urllib.request.urlopen(url, timeout=120) as r, open(tmp, "wb") as f:
                while chunk := r.read(1024 * 1024):
                    f.write(chunk)
                    digest.update(chunk)
                    size += len(chunk)
        except Exception as e:  # noqa: BLE001 — bubble any network error up
            _err(f"download failed: {url}\n {e}")
            raise
        if size < expected_min_bytes:
            raise RuntimeError(
                f"downloaded file too small ({size} bytes < {expected_min_bytes}); "
                f"the URL probably 404'd into an HTML error page."
            )
        if sha256 is not None and digest.hexdigest() != sha256.lower():
            raise RuntimeError(
                f"sha256 mismatch for {url}: expected {sha256.lower()}, "
                f"got {digest.hexdigest()}"
            )
        tmp.replace(dest)
    finally:
        # No-op after a successful rename; removes the partial file on
        # every failure path (network error, size check, digest check).
        tmp.unlink(missing_ok=True)
    _ok(f"downloaded {dest.name} ({size / (1024 * 1024):.1f} MB)")


def _copy_writable(src: str | Path, dest: Path) -> None:
    """Copy *src* (following symlinks) to *dest* and make it owner-writable.

    ``shutil.copy2`` preserves the source mode, so a read-only source
    yields a read-only copy that ``install_name_tool`` / ``patchelf``
    cannot edit and that a later re-run cannot overwrite. Removing any
    stale copy first and adding the owner-write bit avoids both.
    """
    dest.unlink(missing_ok=True)
    shutil.copy2(src, dest)
    dest.chmod(dest.stat().st_mode | 0o200)


def _brew_dylib_deps(macho: str | Path) -> list[str]:
    """Return the Homebrew-prefixed paths listed by ``otool -L`` for *macho*.

    The first output line (the file name) is skipped. For a dylib the
    list may include the library's own install name; callers tolerate
    that.

    Raises:
        RuntimeError: ``otool`` exited non-zero.
    """
    try:
        otool = subprocess.run(
            ["otool", "-L", str(macho)],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e
    deps = []
    for line in otool.stdout.splitlines()[1:]:
        path = line.strip().split(" ", 1)[0]
        if path.startswith(_BREW_PREFIXES):
            deps.append(path)
    return deps


def fetch_tessdata() -> Path:
    """Ensure ``build/vendor/tessdata/eng.traineddata`` exists; return its path.

    Shared across platforms. Downloaded once and cached: an existing
    file larger than 1 MB is reused as-is. The runtime expects this
    file at ``{MEIPASS}/tesseract/tessdata/eng.traineddata``; the
    PyInstaller spec handles the placement.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"
    if target.exists() and target.stat().st_size > 1_000_000:
        _ok(f"already cached: {target.relative_to(REPO)} "
            f"({target.stat().st_size / (1024 * 1024):.1f} MB)")
        return target
    # ~16 MB on disk for the "best" model. Allow some slack on the
    # min-bytes check (3 MB) so we still catch HTML 404 pages.
    _download(TESSDATA_URL, target, expected_min_bytes=3 * 1024 * 1024)
    return target


def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage tesseract.exe + DLLs into *staging*.

    Strategy (UB-Mannheim ships the canonical Windows builds as a
    self-extracting installer rather than a plain archive):

    1. Download the installer .exe from the UB-Mannheim mirror (kept
       next to the staging dirs so re-runs skip the download).
    2. Extract it with 7-Zip, looked up on PATH and then at
       ``C:\\Program Files\\7-Zip\\7z.exe``.
    3. Copy tesseract.exe + every DLL found anywhere in the extraction
       into ``staging/``. tessdata is NOT taken from the installer; the
       shared copy from ``fetch_tessdata()`` is used instead.
    4. Delete the extraction tree so it does not bloat the cached
       ``build/_tesseract`` directory.

    NOTE(review): the installer was originally described here as an
    Inno Setup package — confirm the format, since 7-Zip's extraction
    support differs between installer types.

    Raises:
        FileNotFoundError: 7-Zip could not be located.
        RuntimeError: The extraction contains no tesseract.exe.
    """
    # UB-Mannheim posts builds under a versioned filename; the exact
    # build revision changes (5.5.0.20241111 at time of writing).
    # We pin a specific rev so reproducible builds don't drift.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    fname = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    url = f"https://digi.bib.uni-mannheim.de/tesseract/{fname}"

    cache = TESSERACT_STAGING / fname
    if not cache.exists():
        _download(url, cache, expected_min_bytes=20 * 1024 * 1024)

    sevenz = shutil.which("7z") or shutil.which("7z.exe")
    if sevenz is None:
        default_7z = Path(r"C:\Program Files\7-Zip\7z.exe")
        if default_7z.exists():
            sevenz = str(default_7z)
    if sevenz is None:
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{cache} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    extract = TESSERACT_STAGING / "win_extract"
    if extract.exists():
        shutil.rmtree(extract)
    extract.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{extract}", str(cache)])

    staging.mkdir(parents=True, exist_ok=True)
    # Recursively grab tesseract.exe + DLLs from wherever the payload
    # landed inside the extraction.
    found_exe = False
    for root, _dirs, files in os.walk(extract):
        for f in files:
            src = Path(root) / f
            if f.lower() == "tesseract.exe":
                shutil.copy2(src, staging / "tesseract.exe")
                found_exe = True
            elif f.lower().endswith(".dll"):
                shutil.copy2(src, staging / f)
    if not found_exe:
        # Leave the extraction in place so it can be inspected.
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {extract}"
        )
    # Only the staged files are needed from here on.
    shutil.rmtree(extract, ignore_errors=True)
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")


def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: ``brew install tesseract``, then copy the binary and —
    transitively — every Homebrew dylib it links against into the
    staging dir. All Homebrew load paths, in the binary AND in every
    copied dylib, are rewritten to ``@loader_path/...`` so the set
    works after relocation into the .app bundle. Edited files are then
    ad-hoc re-signed on a best-effort basis, because editing a Mach-O
    invalidates any existing signature.

    The Homebrew version is not pinned; a warning is emitted when the
    installed binary does not report ``TESSERACT_VERSION``.

    Limitation: only references that ``otool -L`` reports as absolute
    paths under /opt/homebrew/ or /usr/local/ are handled; ``@rpath/``
    style references are left as they are.

    Raises:
        FileNotFoundError: ``brew`` is not on PATH.
        RuntimeError: tesseract is not on PATH after install, or
            ``otool`` fails on the staged binary.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # Run on every build; brew picks the version from its own taps.
    _run(["brew", "install", "tesseract"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    # Warn-only version check, since brew cannot be pinned from here.
    try:
        probe = subprocess.run(
            [tess_path, "--version"], capture_output=True, text=True, check=False,
        )
        banner = (probe.stdout + probe.stderr).strip().splitlines()
    except OSError:
        banner = []
    if not banner or TESSERACT_VERSION not in banner[0]:
        _warn(
            f"Homebrew tesseract reports {banner[0] if banner else 'no version'}; "
            f"expected {TESSERACT_VERSION}"
        )

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    _copy_writable(tess_path, bin_path)
    bin_deps = _brew_dylib_deps(bin_path)

    # Phase 1: copy the transitive closure of Homebrew dylibs. Nothing
    # is edited yet, so a library reached via two different paths can
    # safely be copied twice.
    staged: dict[str, Path] = {}        # original path -> staged copy
    lib_deps: dict[Path, set[str]] = {}  # staged copy -> its Homebrew deps
    pending = list(bin_deps)
    while pending:
        libpath = pending.pop()
        if libpath in staged or not Path(libpath).exists():
            continue
        dest = staging / Path(libpath).name
        _copy_writable(libpath, dest)
        staged[libpath] = dest
        try:
            deps = _brew_dylib_deps(libpath)
        except RuntimeError as e:
            # Best effort for libraries: keep the copy, skip its deps.
            _warn(f"{e} — not following dependencies of {libpath}")
            deps = []
        lib_deps.setdefault(dest, set()).update(deps)
        pending.extend(deps)

    def _rewrite(*args: str) -> None:
        """Run install_name_tool; report (rather than hide) a failure."""
        try:
            subprocess.run(
                ["install_name_tool", *args],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(f"install_name_tool {' '.join(args)} failed: {e.stderr.strip()}")

    # Phase 2: make every staged file refer to its siblings. Each dylib
    # gets a relocatable ID and has its own Homebrew references
    # redirected; without the latter a staged library would still try
    # to load its dependencies from the build machine's Homebrew prefix.
    for dest, deps in lib_deps.items():
        _rewrite("-id", f"@loader_path/{dest.name}", str(dest))
        for dep in sorted(deps):
            _rewrite("-change", dep, f"@loader_path/{Path(dep).name}", str(dest))
    for dep in bin_deps:
        _rewrite("-change", dep, f"@loader_path/{Path(dep).name}", str(bin_path))

    # Phase 3: ad-hoc re-sign whatever was edited. A later release
    # signing step may replace these signatures.
    if shutil.which("codesign"):
        for macho in [*lib_deps, bin_path]:
            try:
                subprocess.run(
                    ["codesign", "--force", "--sign", "-", str(macho)],
                    check=True, capture_output=True, text=True,
                )
            except subprocess.CalledProcessError as e:
                _warn(f"ad-hoc codesign of {macho.name} failed: {e.stderr.strip()}")

    _ok(f"staged macOS tesseract + {len(lib_deps)} dylibs into {staging.relative_to(REPO)}")


def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage tesseract + .so files into *staging* on Linux.

    Strategy: use the ``tesseract`` already on PATH, or install
    ``tesseract-ocr libtesseract5`` through ``apt-get`` when it is
    absent. Then copy the binary plus every shared object ``ldd``
    resolves for it (minus the core C runtime) into staging. When
    ``patchelf`` is available, the binary and each copied library get
    a run path of ``$ORIGIN`` so they locate each other after
    relocation.

    Raises:
        FileNotFoundError: Neither apt-get nor tesseract is available.
        RuntimeError: tesseract is still not on PATH, or ``ldd`` fails.
    """
    if not shutil.which("apt-get") and not shutil.which("tesseract"):
        _err(
            "Neither apt-get nor a pre-installed tesseract found. On "
            "ubuntu-latest runners both are present. On other distros "
            "install tesseract-ocr via your package manager and re-run "
            "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
        )
        raise FileNotFoundError("tesseract")

    if shutil.which("apt-get") and not shutil.which("tesseract"):
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    _copy_writable(tess_path, bin_path)

    # Collect .so dependencies via ldd. Skip the dynamic linker and the
    # core glibc pieces — shipping them can cause GLIBC mismatch errors
    # on older distros.
    # NOTE(review): libstdc++ / libgcc_s are NOT in this list and are
    # therefore bundled, although an earlier comment said they were
    # skipped — confirm which behaviour is intended.
    SKIP_PREFIXES = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(bin_path)],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    staged_libs: list[Path] = []
    for line in ldd.stdout.splitlines():
        # Format: " libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(SKIP_PREFIXES):
            continue
        resolved = parts[1].strip()
        if resolved.startswith("not found"):
            # Previously dropped without a trace; the bundle would then
            # miss the library too.
            _warn(f"ldd could not resolve {soname}; it will be missing from the bundle")
            continue
        path_part = resolved.split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            continue
        lib_dest = staging / Path(path_part).name
        _copy_writable(path_part, lib_dest)
        staged_libs.append(lib_dest)

    # patchelf is optional. ``--set-rpath`` writes a RUNPATH, which the
    # loader consults only for an object's *direct* dependencies, so
    # the copied libraries each need their own ``$ORIGIN`` entry — the
    # executable's alone does not cover libtesseract's dependencies.
    # Without patchelf (or when it fails) the bundle depends on the
    # library search path in effect at runtime.
    if shutil.which("patchelf"):
        patch_failed = False
        for elf in [bin_path, *staged_libs]:
            try:
                subprocess.run(
                    ["patchelf", "--set-rpath", "$ORIGIN", str(elf)],
                    check=True, capture_output=True, text=True,
                )
            except (subprocess.CalledProcessError, OSError):
                patch_failed = True
        if patch_failed:
            _warn("patchelf rpath rewrite failed — relying on LD_LIBRARY_PATH at runtime")

    _ok(f"staged Linux tesseract + {len(staged_libs)} .so files into {staging.relative_to(REPO)}")


def fetch_tesseract_for_platform(target: str) -> Path:
    """Stage the per-platform Tesseract binary + libs into ``build/_tesseract/{target}/``.

    Returns the staging dir path. The PyInstaller spec adds this dir
    (plus tessdata) to its ``datas=`` so the bundle ends up with
    everything under ``{MEIPASS}/tesseract/`` where the runtime
    discovery code expects it.

    Honours ``TESSERACT_SKIP_FETCH=1`` — set this when you've
    pre-staged the binary by hand (offline build, behind a proxy,
    custom build of tesseract, etc.). The script still verifies the
    binary is present and surfaces a helpful error if not.

    The presence of the executable in the staging dir is what marks a
    target as "already staged". If a platform fetcher fails midway,
    the executable is therefore removed again so the next run retries
    instead of reusing an incomplete set of libraries.

    Args:
        target: One of ``"win"``, ``"mac"`` or ``"linux"``.

    Raises:
        SystemExit: Unknown target (code 2), or the binary is missing
            after the fetch / with ``TESSERACT_SKIP_FETCH=1`` (code 1).
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_name = "tesseract.exe" if target == "win" else "tesseract"
    exe_path = staging / exe_name

    if os.environ.get("TESSERACT_SKIP_FETCH") == "1":
        if not exe_path.exists():
            _err(
                f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
                "Pre-stage the binary + its libs into that dir, then re-run."
            )
            sys.exit(1)
        _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
        return staging

    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetch = fetchers.get(target)
    if fetch is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)

    try:
        fetch(staging)
    except BaseException:
        # Includes SystemExit, which helper commands may raise. Drop
        # the "already staged" marker, then let the failure propagate.
        try:
            exe_path.unlink(missing_ok=True)
        except OSError:
            pass  # cleanup is best-effort; the original error matters more
        raise

    if not exe_path.exists():
        _err(
            f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
            "Inspect the logs above; you may need to pre-stage the binary manually."
        )
        sys.exit(1)
    return staging
Passing it via env keeps the spec file + # platform-agnostic — the spec doesn't need to detect win/mac/linux + # itself; the orchestrator already did. + env = os.environ.copy() + if target: + env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target) + _run(cmd, env=env) def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]: @@ -331,7 +730,17 @@ def main() -> int: shutil.rmtree(DIST) step_generate_icons() - step_pyinstaller(clean=args.clean) + + # Stage Tesseract OCR before PyInstaller runs. The spec reads + # ``build/_tesseract//`` + ``build/vendor/tessdata/`` and + # bundles them under ``/tesseract/`` so the runtime + # discovery in src/pdf_extract.py finds them at: + # Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]" + # Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata" + fetch_tessdata() + fetch_tesseract_for_platform(target) + + step_pyinstaller(clean=args.clean, target=target) if target == "win": outputs = step_package_win(version, do_installer, do_portable) diff --git a/build/vendor/README.md b/build/vendor/README.md new file mode 100644 index 0000000..31763b8 --- /dev/null +++ b/build/vendor/README.md @@ -0,0 +1,62 @@ +# build/vendor/ — third-party bundle inputs (fetched at build time) + +This tree holds the third-party assets that get bundled into the +PyInstaller artifacts but that we deliberately do **not** keep in git +(too large / license-encumbered / re-fetchable on demand). + +The build pipeline (`build/make_release.py`) populates everything in +here before the PyInstaller step. The contents are git-ignored except +for this README. + +## tessdata/ + +Holds the Tesseract language data file(s) used by the PDF Extractor +OCR fallback. Only English is bundled today. 
+ +### Canonical source + +We use the **"best" model** from `tesseract-ocr/tessdata_best` (LSTM, +slower but higher accuracy than the legacy `tessdata` set, and only +~12 MB compressed → ~16 MB uncompressed): + +``` +https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata +``` + +There is also `tessdata_fast/` (~4 MB, lower accuracy) if you ever +want to optimise for bundle size over recognition quality. For bank +statements (the only OCR use case so far), the extra accuracy of the +`_best` model is worth the extra ~12 MB. + +### Why we don't vendor it in git + +* ~16 MB binary file — bloats clone times for everyone, including + contributors who never touch the OCR code path. +* Apache-2.0-licensed and stable; the file rarely changes upstream + (last touched 2021), so a build-time fetch is safe. +* The Tesseract project already distributes these via GitHub + raw URLs, so there is no need to keep a second copy in this + repository. + +### How it gets populated + +`build/make_release.py::fetch_tessdata()` checks for +`build/vendor/tessdata/eng.traineddata` on every run. If it's +missing, the script downloads it from the canonical URL above and +caches it here. Subsequent builds reuse the cached file. + +On CI, the directory is restored from the GitHub Actions cache so we +don't pay the download cost on every run (`.github/workflows/build.yml` +caches `build/vendor/tessdata/` under the hand-maintained key +`tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1`; bump that key +whenever the pinned Tesseract version or the URL above changes). + +## Manual one-time fetch (if the build machine cannot reach GitHub) + +```bash +mkdir -p build/vendor/tessdata +curl -L -o build/vendor/tessdata/eng.traineddata \ + https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata +``` + +Verify the file is several megabytes in size rather than a small HTML +error page. The script only performs a minimum-size sanity check; it +does not validate the file format. diff --git a/build/vendor/tessdata/.gitkeep b/build/vendor/tessdata/.gitkeep new file mode 100644 index 0000000..e69de29