"""Tesseract bundling helpers for the release build. PDF Extractor OCR ships a per-platform Tesseract binary plus the English ``eng.traineddata`` model inside the frozen PyInstaller bundle so scanned PDFs work without a separate user install. These helpers fetch the binary and tessdata at build time; the GitHub Actions workflow (``.github/workflows/build.yml``) imports ``fetch_tessdata`` and ``fetch_tesseract_for_platform`` and runs them before PyInstaller. Everything is staged under ``build/_tesseract//`` (gitignored). The PyInstaller spec (``build/datatools.spec``) reads that staging dir plus ``build/vendor/tessdata/`` and bundles them under ``/tesseract/``, where the runtime discovery code in ``src/pdf_extract.py`` expects: Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]" Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata" """ from __future__ import annotations import os import shutil import subprocess import sys import urllib.request from pathlib import Path REPO = Path(__file__).resolve().parent.parent BUILD = REPO / "build" # Tesseract bundling. The runtime discovery code in # ``src/pdf_extract.py`` looks for the binary at # ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and tessdata # at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. We stage # everything under ``build/_tesseract//`` (gitignored) and # the PyInstaller spec adds that staging dir to ``datas=`` so it lands # at the right place inside the frozen bundle. TESSERACT_VERSION = "5.5.0" TESSDATA_DIR = BUILD / "vendor" / "tessdata" TESSDATA_URL = ( "https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata" ) TESSERACT_STAGING = BUILD / "_tesseract" # --------------------------------------------------------------------------- # Output helpers — colourless so logs stay readable in any terminal/CI tail. 
# ---------------------------------------------------------------------------


def _step(msg: str) -> None:
    """Print a banner line announcing the start of a build step."""
    print(f"\n==> {msg}", flush=True)


def _ok(msg: str) -> None:
    """Print a success line to stdout."""
    print(f" ok: {msg}", flush=True)


def _warn(msg: str) -> None:
    """Print a non-fatal warning line to stdout."""
    print(f" warn: {msg}", flush=True)


def _err(msg: str) -> None:
    """Print an error line to stderr (does not exit by itself)."""
    print(f" ERROR: {msg}", file=sys.stderr, flush=True)


def _run(cmd: list[str], cwd: Path | None = None, env: dict | None = None) -> None:
    """Run *cmd*, stream output, exit on failure with a useful banner.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` (no shell).
        cwd: Working directory; defaults to the repository root ``REPO``.
        env: Optional replacement environment for the child process.

    Raises:
        SystemExit: With the child's exit status when it fails, or with
            127 when the executable cannot be found.
    """
    printable = " ".join(map(str, cmd))
    print(f" $ {printable}", flush=True)
    try:
        subprocess.run(cmd, check=True, cwd=cwd or REPO, env=env)
    except subprocess.CalledProcessError as e:
        _err(f"command failed (exit {e.returncode}): {printable}")
        sys.exit(e.returncode)
    except FileNotFoundError:
        _err(f"command not found: {cmd[0]}")
        sys.exit(127)


# ---------------------------------------------------------------------------
# Tesseract bundling — fetch the binary + tessdata at build time.
#
# We download (not vendor) because:
#   * Binaries are large (5-40 MB per platform) and license-encumbered
#     to keep current in git.
#   * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
#     bloats clones for contributors who don't touch OCR.
#
# Caching layout:
#   build/_tesseract/win/tesseract.exe + DLLs
#   build/_tesseract/mac/tesseract + dylibs
#   build/_tesseract/linux/tesseract + libs
#   build/vendor/tessdata/eng.traineddata (shared across platforms)
#
# The PyInstaller spec reads ``build/_tesseract/<target>/`` and the
# tessdata dir, then bundles them under ``<_MEIPASS>/tesseract/``.
# ---------------------------------------------------------------------------


def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
    """Download *url* to *dest* atomically. Sanity-check the size.

    The payload is first written to ``<dest>.part`` and only renamed onto
    *dest* after the size check passes, so a failed or truncated download
    never leaves a file at *dest*.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        dest: Final path of the downloaded file; parent dirs are created.
        expected_min_bytes: Minimum acceptable size of the payload.

    Raises:
        RuntimeError: If the payload is smaller than *expected_min_bytes*.
        Exception: Any error raised while fetching or writing is logged
            and re-raised after the partial file is removed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".part")
    print(f" GET {url}", flush=True)
    try:
        with urllib.request.urlopen(url, timeout=120) as r, open(tmp, "wb") as f:
            shutil.copyfileobj(r, f)
    except Exception as e:  # noqa: BLE001 — bubble any network error up
        if tmp.exists():
            tmp.unlink()
        _err(f"download failed: {url}\n {e}")
        raise
    size = tmp.stat().st_size
    if size < expected_min_bytes:
        tmp.unlink()
        raise RuntimeError(
            f"downloaded file too small ({size} bytes < {expected_min_bytes}); "
            f"the URL probably 404'd into an HTML error page."
        )
    tmp.replace(dest)
    _ok(f"downloaded {dest.name} ({size / (1024 * 1024):.1f} MB)")


def fetch_tessdata() -> Path:
    """Ensure ``build/vendor/tessdata/eng.traineddata`` exists; return its path.

    Shared across platforms. Downloaded once and cached: an existing file
    larger than 1 000 000 bytes is reused without touching the network.
    The runtime expects this file at
    ``<_MEIPASS>/tesseract/tessdata/eng.traineddata``; the PyInstaller
    spec handles the placement.

    Returns:
        Path of the (cached or freshly downloaded) model file.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"
    if target.exists() and target.stat().st_size > 1_000_000:
        _ok(f"already cached: {target.relative_to(REPO)} "
            f"({target.stat().st_size / (1024 * 1024):.1f} MB)")
        return target
    # ~16 MB on disk for the "best" model. Allow some slack on the
    # min-bytes check (3 MB) so we still catch HTML 404 pages.
    _download(TESSDATA_URL, target, expected_min_bytes=3 * 1024 * 1024)
    return target


def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage tesseract.exe + DLLs into *staging*.

    Strategy (no easy stand-alone Windows tarball exists — UB-Mannheim
    ships the canonical Windows builds as Inno Setup installers):

    1. Download the installer .exe from the UB-Mannheim mirror (cached
       under ``build/_tesseract/`` between runs).
    2. Extract it with 7-Zip (which can read Inno Setup archives via
       the {app} group). 7-Zip is preinstalled on ``windows-latest``
       GitHub Actions runners (`C:\\Program Files\\7-Zip\\7z.exe`).
    3. Copy tesseract.exe + every DLL found anywhere in the extraction
       into ``staging/`` (flat). The installer's tessdata dir is NOT
       copied by this function.

    The DLL set tesseract.exe needs at runtime (per UB-Mannheim's Inno
    Setup script):
    libtesseract-5.dll, libleptonica-6.dll, libgomp-1.dll,
    libstdc++-6.dll, libwinpthread-1.dll, libgcc_s_seh-1.dll,
    liblz4.dll, libjpeg-8.dll, libpng16-16.dll, libtiff-6.dll,
    libwebp-7.dll, libwebpmux-3.dll, libopenjp2-7.dll, zlib1.dll

    The whole {app} tree from the installer is ~120 MB; we copy just
    the .exe + .dll files (~50 MB) since the runtime only needs the
    binary and its direct deps.

    Raises:
        FileNotFoundError: If no 7-Zip executable can be located.
        RuntimeError: If the extraction contains no ``tesseract.exe``.
        SystemExit: If the 7-Zip extraction command fails (via ``_run``).
    """
    # UB-Mannheim posts builds under a versioned filename; the exact
    # build revision changes (5.5.0.20241111 at time of writing).
    # We pin a specific rev so reproducible builds don't drift.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    fname = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    url = f"https://digi.bib.uni-mannheim.de/tesseract/{fname}"
    cache = TESSERACT_STAGING / fname
    if not cache.exists():
        _download(url, cache, expected_min_bytes=20 * 1024 * 1024)

    # 7-Zip is preinstalled on windows-latest runners; on a dev box
    # the user installs it (choco install 7zip) or substitutes
    # innoextract. Locate it.
    sevenz = (
        shutil.which("7z")
        or shutil.which("7z.exe")
        or r"C:\Program Files\7-Zip\7z.exe"
    )
    if not Path(sevenz).exists() and not shutil.which("7z"):
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{cache} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    # Always extract into a fresh directory so stale files from a
    # previous run cannot leak into the staging dir.
    extract = TESSERACT_STAGING / "win_extract"
    if extract.exists():
        shutil.rmtree(extract)
    extract.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{extract}", str(cache)])

    staging.mkdir(parents=True, exist_ok=True)
    # The Inno Setup payload lands under ``{app}/`` inside the
    # extraction. Recursively grab tesseract.exe + DLLs.
    found_exe = False
    for root, _dirs, files in os.walk(extract):
        for f in files:
            src = Path(root) / f
            if f.lower() == "tesseract.exe":
                shutil.copy2(src, staging / "tesseract.exe")
                found_exe = True
            elif f.lower().endswith(".dll"):
                shutil.copy2(src, staging / f)
    if not found_exe:
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {extract}"
        )
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")


# Install prefixes Homebrew uses: /opt/homebrew/ (Apple Silicon) and
# /usr/local/ (Intel). Anything else in ``otool -L`` output (/usr/lib/*,
# /System/*) is treated as system-provided and is not bundled.
_HOMEBREW_PREFIXES = ("/opt/homebrew/", "/usr/local/")


def _otool_homebrew_deps(otool_output: str) -> list[str]:
    """Return the Homebrew library paths listed in ``otool -L`` output.

    The first output line (the name of the inspected file) is skipped.
    For every remaining line, the text before the first space is taken
    as the path and kept when it starts with a Homebrew prefix. Order
    and duplicates are preserved.
    """
    paths = (
        line.strip().split(" ", 1)[0]
        for line in otool_output.splitlines()[1:]
    )
    return [p for p in paths if p.startswith(_HOMEBREW_PREFIXES)]


def _make_writable(path: Path) -> None:
    """Add the owner-write permission bit to *path*."""
    path.chmod(path.stat().st_mode | 0o200)


def _relink_to_loader_path(target: Path, deps: list[str]) -> None:
    """Point *target*'s references to *deps* at ``@loader_path/<basename>``.

    Runs ``install_name_tool -change`` once per dependency. A failing
    rewrite is reported with a warning and does not abort the build.
    """
    for dep in deps:
        try:
            subprocess.run(
                ["install_name_tool", "-change", dep,
                 f"@loader_path/{Path(dep).name}", str(target)],
                check=True,
                capture_output=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(f"install_name_tool -change {dep} failed on {target.name} "
                  f"(exit {e.returncode})")


def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: use Homebrew. ``brew install tesseract`` is the sanctioned
    macOS path and the binary it installs is the same one every guide on
    the internet points at. We copy the binary + every Homebrew dylib it
    links against (transitively) into the flat staging dir, then run
    ``install_name_tool`` to rewrite the load paths of the binary AND of
    every staged dylib to ``@loader_path/...`` so the set works after
    relocation into the .app bundle.

    Caveat: ``brew`` must be on PATH (it is on ``macos-latest``
    runners). If it isn't, we surface a helpful error rather than fail
    mysteriously.

    Raises:
        FileNotFoundError: If ``brew`` is not on PATH.
        RuntimeError: If ``tesseract`` is not on PATH after the install,
            or if ``otool`` fails on the staged binary.
        SystemExit: If ``brew install`` fails (via ``_run``).
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # ``brew install`` is idempotent — fine to run on every build. We
    # don't pin the version through brew because brew tracks its own
    # taps.
    # NOTE(review): nothing here compares the installed version against
    # TESSERACT_VERSION, so the bundled version is whatever brew ships.
    _run(["brew", "install", "tesseract"])

    # Find the binary brew just installed.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)
    # copy2 preserves the source mode; a read-only source would leave a
    # read-only copy that install_name_tool cannot modify.
    _make_writable(bin_path)

    # Collect every non-system dylib the binary links against. We skip
    # /usr/lib/* and /System/* (Apple-shipped, present on every Mac).
    try:
        otool = subprocess.run(
            ["otool", "-L", str(bin_path)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e
    deps = _otool_homebrew_deps(otool.stdout)

    # Basenames already staged. The staging dir is flat, so the basename
    # (not the full path) identifies a staged file; this also stops a
    # library reached through two different paths being copied twice.
    copied: set[str] = set()

    def _copy_with_deps(libpath: str) -> None:
        """Stage *libpath* and, recursively, its Homebrew dependencies."""
        name = Path(libpath).name
        if name in copied or not Path(libpath).exists():
            return
        copied.add(name)
        dest = staging / name
        shutil.copy2(libpath, dest)
        _make_writable(dest)
        # Rewrite the dest's own install name to @loader_path so the
        # bundle is relocatable.
        try:
            subprocess.run(
                ["install_name_tool", "-id", f"@loader_path/{name}", str(dest)],
                check=True,
                capture_output=True,
            )
        except subprocess.CalledProcessError as e:
            # Not fatal — consumers are rewritten to @loader_path/<name>
            # below, which is what dyld uses to locate the file.
            _warn(f"install_name_tool -id failed on {name} (exit {e.returncode})")
        # Walk this lib's own deps.
        try:
            sub = subprocess.run(
                ["otool", "-L", libpath],
                check=True,
                capture_output=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(f"otool -L failed on {libpath} (exit {e.returncode}); "
                  "its dependencies were not staged")
            return
        # A dylib's otool listing includes its own install name; drop it.
        sub_deps = [
            d for d in _otool_homebrew_deps(sub.stdout)
            if Path(d).name != name
        ]
        for sub_path in sub_deps:
            _copy_with_deps(sub_path)
        # Without this the staged dylib would still reference its
        # dependencies by their absolute Homebrew paths.
        _relink_to_loader_path(dest, sub_deps)

    for dep in deps:
        _copy_with_deps(dep)

    # Rewrite the tesseract binary's references to point at
    # @loader_path/ so it can find its deps inside the bundle.
    _relink_to_loader_path(bin_path, deps)
    _ok(f"staged macOS tesseract + {len(copied)} dylibs into {staging.relative_to(REPO)}")


def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage tesseract + .so files into *staging* on Linux.

    Strategy: if no ``tesseract`` is on PATH and ``apt-get`` is
    available, run ``apt-get install tesseract-ocr libtesseract5``
    (an already-installed tesseract is used as-is). Then copy the
    binary + every .so that ``ldd`` resolves for it (minus the
    skip-list below) into staging. ``patchelf``, when available,
    rewrites RPATH so the bundle is relocatable.

    Raises:
        FileNotFoundError: If neither ``apt-get`` nor ``tesseract`` exists.
        RuntimeError: If ``tesseract`` is still not on PATH after the
            install, or if ``ldd`` fails on the staged binary.
        SystemExit: If an ``apt-get`` command fails (via ``_run``).
    """
    if not shutil.which("apt-get") and not shutil.which("tesseract"):
        _err(
            "Neither apt-get nor a pre-installed tesseract found. On "
            "ubuntu-latest runners both are present. On other distros "
            "install tesseract-ocr via your package manager and re-run "
            "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
        )
        raise FileNotFoundError("tesseract")

    if shutil.which("apt-get") and not shutil.which("tesseract"):
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    shutil.copy2(tess_path, staging / "tesseract")

    # Collect .so dependencies via ldd. Skip the dynamic linker and
    # libc/libdl/libpthread/libm/librt/libnsl/libutil — those are
    # expected to exist on every Linux target and shipping them can
    # cause GLIBC mismatch errors on older distros. The interesting
    # tesseract-specific deps are libtesseract, libleptonica, and the
    # image format libs (libpng, libjpeg, libtiff, libwebp, libgif).
    # NOTE(review): libstdc++ and libgcc_s are NOT in this list, so they
    # are copied when ldd resolves them — confirm that is intended.
    SKIP_PREFIXES = (
        "linux-vdso",
        "/lib64/ld-linux",
        "/lib/ld-linux",
        "libc.so",
        "libdl.so",
        "libpthread.so",
        "libm.so",
        "librt.so",
        "libnsl.so",
        "libutil.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(staging / "tesseract")],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    copied = 0
    for line in ldd.stdout.splitlines():
        # Format: " libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        # Lines without exactly one "=>" are ignored.
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(SKIP_PREFIXES):
            continue
        path_part = parts[1].strip().split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            continue
        shutil.copy2(path_part, staging / Path(path_part).name)
        copied += 1

    # patchelf is optional — if present, rewrite RPATH to $ORIGIN so
    # the binary finds its bundled .so files. If absent (or if it
    # fails), the build relies on LD_LIBRARY_PATH being set suitably at
    # runtime. NOTE(review): that runtime behaviour lives outside this
    # file — confirm.
    if shutil.which("patchelf"):
        try:
            _run(["patchelf", "--set-rpath", "$ORIGIN", str(staging / "tesseract")])
        except SystemExit:
            # _run exits on failure; downgrade that to a warning here.
            _warn("patchelf rpath rewrite failed — relying on LD_LIBRARY_PATH at runtime")

    _ok(f"staged Linux tesseract + {copied} .so files into {staging.relative_to(REPO)}")


def fetch_tesseract_for_platform(target: str) -> Path:
    """Stage the per-platform Tesseract binary + libs into ``build/_tesseract/<target>/``.

    Returns the staging dir path. The PyInstaller spec adds this dir
    (plus tessdata) to its ``datas=`` so the bundle ends up with
    everything under ``<_MEIPASS>/tesseract/`` where the runtime
    discovery code expects it.

    Honours ``TESSERACT_SKIP_FETCH=1`` — set this when you've pre-staged
    the binary by hand (offline build, behind a proxy, custom build of
    tesseract, etc.). The script still verifies the binary is present
    and surfaces a helpful error if not.

    Args:
        target: ``"win"``, ``"mac"`` or ``"linux"``. It is also the name
            of the staging sub-directory.

    Returns:
        The staging directory ``build/_tesseract/<target>``.

    Raises:
        SystemExit: Status 1 if the binary is missing (either with
            ``TESSERACT_SKIP_FETCH=1`` or after the fetch step), status
            2 for an unknown *target* that is not already staged.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_name = "tesseract.exe" if target == "win" else "tesseract"
    exe_path = staging / exe_name

    if os.environ.get("TESSERACT_SKIP_FETCH") == "1":
        if not exe_path.exists():
            _err(
                f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
                "Pre-stage the binary + its libs into that dir, then re-run."
            )
            sys.exit(1)
        _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
        return staging

    # A previously staged binary short-circuits the fetch entirely.
    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    if target == "win":
        _fetch_tesseract_windows(staging)
    elif target == "mac":
        _fetch_tesseract_macos(staging)
    elif target == "linux":
        _fetch_tesseract_linux(staging)
    else:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)

    if not exe_path.exists():
        _err(
            f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
            "Inspect the logs above; you may need to pre-stage the binary manually."
        )
        sys.exit(1)
    return staging