Files
datatools-dev/build/make_release.py
Michael 93ccada974 build: bundle Tesseract 5.5.0 + tessdata into every release artifact
End users no longer have to install Tesseract separately for OCR on
scanned PDFs — the engine ships inside the installer, portable .zip,
and AppImage for all three platforms.

Per-platform fetch in build/make_release.py (run before PyInstaller):
- Windows: download UB-Mannheim installer 5.5.0.20241111, extract
  with 7-Zip, copy tesseract.exe + required DLLs into the staging dir.
- macOS: ``brew install tesseract``, copy binary + every Homebrew-
  prefixed dylib resolved via otool -L (walked recursively for
  transitive deps), then install_name_tool rewrites IDs / load paths
  to @loader_path/... so the bundle is relocatable.
- Linux: ``apt-get install tesseract-ocr libtesseract5``, copy binary
  + every non-system .so from ldd output, patchelf --set-rpath '$ORIGIN'.

Wire-up:
- build/datatools.spec reads DATATOOLS_TESS_STAGING env var (set by
  make_release) and adds the staging dir + tessdata + the
  LICENSE_TESSERACT.txt Apache 2.0 attribution to PyInstaller datas
  so they land at <bundle>/tesseract/{tesseract[.exe],tessdata/}
  and the license sits at the bundle root. Soft-warns when staging
  is empty so dev spec runs still complete.
- English tessdata pulled by fetch_tessdata() from
  tesseract-ocr/tessdata_best (eng.traineddata, ~16 MB). Cached at
  build/vendor/tessdata/.
- .github/workflows/build.yml: actions/cache@v4 step keyed on
  ``tesseract-${runner.os}-5.5.0-tessdata_best-v1`` caches the
  staging dir and the vendored tessdata across runs; apt installs
  patchelf on the Linux runner; PyInstaller step now receives the
  DATATOOLS_TESS_STAGING env var.
- .gitignore: build/_tesseract/ and the .traineddata blob.
- TESSERACT_SKIP_FETCH=1 honored for offline / manual stages.
- Installer / .dmg / .zip / AppImage scripts: one-line comments
  confirming Tesseract rides along automatically via PyInstaller's
  datas (no extra packaging steps required in those scripts).

Bundle-size delta: ~50-70 MB on disk per platform, ~25-40 MB post-
compression. Net installer size ~250-300 MB (was ~120 MB) — accepted
tradeoff for zero end-user OCR setup.

Reversal of the prior "don't bundle Tesseract" decision (option A).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-02 18:20:33 +00:00

758 lines
29 KiB
Python

"""Single-command release builder for DataTools.
PyInstaller can't cross-compile — to produce a Windows .exe you run
this on Windows, for a Mac .dmg you run it on macOS, for a Linux
AppImage you run it on Linux. One script, one OS at a time.
What this script does (in order):
1. Preflight — checks PyInstaller, Pillow, and the platform's
packager (Inno Setup on Win / hdiutil + ditto on Mac /
appimagetool on Linux) are reachable. Bails with install
instructions if anything is missing.
2. Generates icon.ico / icon.icns / icon.png from the PNG asset.
3. Runs PyInstaller against build/datatools.spec.
4. Wraps the PyInstaller output into:
* Windows: DataTools-<ver>-win-setup.exe (Inno Setup)
+ DataTools-<ver>-win-portable.zip
* macOS: DataTools-<ver>-mac.dmg
+ DataTools-<ver>-mac-portable.zip
* Linux: DataTools-<ver>-linux-x86_64.AppImage
5. Prints what landed in dist/ and the byte sizes.
Usage:
python build/make_release.py # build everything for this OS
python build/make_release.py --preflight # check tooling, don't build
python build/make_release.py --skip-installer # only the portable zip
python build/make_release.py --skip-portable # only the installer
python build/make_release.py --clean # wipe dist/ first
Run from the repo root or from build/ — either works.
"""
from __future__ import annotations
import argparse
import os
import platform
import re
import shutil
import subprocess
import sys
import urllib.request
from pathlib import Path
# Repository root: two directory levels above this file (build/make_release.py).
REPO = Path(__file__).resolve().parent.parent
# Build scripts / specs live here; release artifacts are written to DIST.
BUILD = REPO / "build"
DIST = REPO / "dist"
# Tesseract bundling. The runtime discovery code in
# ``src/pdf_extract.py`` looks for the binary at
# ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and tessdata
# at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. We stage
# everything under ``build/_tesseract/<platform>/`` (gitignored) and
# the PyInstaller spec adds that staging dir to ``datas=`` so it lands
# at the right place inside the frozen bundle.
# Used below to build the Windows installer file name.
TESSERACT_VERSION = "5.5.0"
# Download cache for the trained-data file; shared by every platform.
TESSDATA_DIR = BUILD / "vendor" / "tessdata"
# NOTE(review): this URL follows the ``main`` branch, so the downloaded
# model can change between builds — consider pinning a tag or commit if
# reproducible artifacts matter.
TESSDATA_URL = (
"https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
)
# Parent of the per-platform staging dirs (``win`` / ``mac`` / ``linux``).
TESSERACT_STAGING = BUILD / "_tesseract"
# ---------------------------------------------------------------------------
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
# ---------------------------------------------------------------------------
def _step(msg: str) -> None:
print(f"\n==> {msg}", flush=True)
def _ok(msg: str) -> None:
print(f" ok: {msg}", flush=True)
def _warn(msg: str) -> None:
print(f" warn: {msg}", flush=True)
def _err(msg: str) -> None:
print(f" ERROR: {msg}", file=sys.stderr, flush=True)
def _run(cmd: list[str], cwd: Path | None = None, env: dict | None = None) -> None:
    """Echo and execute *cmd*, letting the child write straight to the console.

    Runs in *cwd* (the repo root when not given) with the optional *env*.
    Never raises for a failed command: it prints an error banner and
    exits the process — with the child's exit status on a non-zero
    return, or 127 when the executable cannot be found.
    """
    shown = " ".join(str(part) for part in cmd)
    print(f" $ {shown}", flush=True)
    workdir = cwd or REPO
    try:
        subprocess.run(cmd, check=True, cwd=workdir, env=env)
    except FileNotFoundError:
        _err(f"command not found: {cmd[0]}")
        sys.exit(127)
    except subprocess.CalledProcessError as failure:
        _err(f"command failed (exit {failure.returncode}): {shown}")
        sys.exit(failure.returncode)
# ---------------------------------------------------------------------------
# Platform detection
# ---------------------------------------------------------------------------
def _detect_platform() -> str:
"""Return ``win`` / ``mac`` / ``linux`` based on sys.platform."""
p = sys.platform
if p.startswith("win"):
return "win"
if p == "darwin":
return "mac"
if p.startswith("linux"):
return "linux"
_err(f"unsupported platform {p!r}; this script handles win/mac/linux only.")
sys.exit(2)
# ---------------------------------------------------------------------------
# Version — single source of truth in src/__init__.py
# ---------------------------------------------------------------------------
def _read_version() -> str:
    """Return the ``__version__`` string assigned in ``src/__init__.py``.

    Prints an error and exits with status 1 when no such assignment
    can be found in the file.
    """
    source = (REPO / "src" / "__init__.py").read_text(encoding="utf-8")
    found = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', source)
    if found is not None:
        return found.group(1)
    _err("could not parse __version__ from src/__init__.py")
    sys.exit(1)
# ---------------------------------------------------------------------------
# Preflight — check tooling before doing anything destructive
# ---------------------------------------------------------------------------
def _have_module(name: str) -> bool:
try:
__import__(name)
return True
except ImportError:
return False
def _have_command(name: str) -> bool:
return shutil.which(name) is not None
# Per-platform install hints. The error messages quote these so someone
# building from source isn't left guessing what to install next.
# Keys are lowercase: ``preflight`` looks Python modules up by their
# lowercased import name and packager tools by their command name.
_INSTALL_HINTS = {
"pyinstaller": "pip install pyinstaller",
"pil": "pip install pillow",
"iscc": "Inno Setup (Windows): https://jrsoftware.org/isdl.php — install, then re-open the shell so iscc lands on PATH.",
"hdiutil": "ships with macOS — if it's missing your Mac install is broken.",
"ditto": "ships with macOS — if it's missing your Mac install is broken.",
"appimagetool": "Linux: download appimagetool-x86_64.AppImage from https://github.com/AppImage/AppImageKit/releases, chmod +x, drop on PATH.",
}
def preflight(target: str) -> None:
    """Check that the tooling needed to build *target* is available.

    Collects every missing prerequisite (Python modules first, then the
    platform's packager commands), prints them with their install hints
    on stderr and exits with status 1. Returns normally only when
    nothing is missing. An unrecognised *target* has no packager checks.
    """
    _step(f"preflight ({target})")
    missing: list[tuple[str, str]] = []

    # Python-side deps are the same on every platform. Hints are keyed
    # by the lowercased module name.
    for module_name in ("PyInstaller", "PIL"):
        if _have_module(module_name):
            _ok(f"{module_name} importable")
            continue
        key = module_name.lower()
        missing.append((key, _INSTALL_HINTS.get(key, f"pip install {module_name}")))

    # A missing ``pyinstaller`` launcher is only a warning: the build
    # can still go through ``python -m PyInstaller``.
    if not _have_command("pyinstaller"):
        _warn("pyinstaller binary not on PATH — will fall back to `python -m PyInstaller`")
    else:
        _ok("pyinstaller on PATH")

    # Packager commands per target, paired with their success message.
    packagers = {
        "win": (("iscc", "Inno Setup (iscc) on PATH"),),
        "mac": (("hdiutil", "hdiutil on PATH"), ("ditto", "ditto on PATH")),
        "linux": (("appimagetool", "appimagetool on PATH"),),
    }
    for tool, ok_message in packagers.get(target, ()):
        if _have_command(tool):
            _ok(ok_message)
        else:
            missing.append((tool, _INSTALL_HINTS[tool]))

    if missing:
        _err("missing prerequisites:")
        for name, hint in missing:
            print(f" - {name}: {hint}", file=sys.stderr)
        sys.exit(1)
    _ok("all prerequisites present")
# ---------------------------------------------------------------------------
# Tesseract bundling — fetch the binary + tessdata at build time.
#
# We download (not vendor) because:
# * Binaries are large (5-40 MB per platform) and license-encumbered
# to keep current in git.
# * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
# bloats clones for contributors who don't touch OCR.
#
# Caching layout:
# build/_tesseract/win/tesseract.exe + DLLs
# build/_tesseract/mac/tesseract + dylibs
# build/_tesseract/linux/tesseract + libs
# build/vendor/tessdata/eng.traineddata (shared across platforms)
#
# The PyInstaller spec reads ``build/_tesseract/<platform>/`` and the
# tessdata dir, then bundles them under ``<bundle>/tesseract/``.
# ---------------------------------------------------------------------------
def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
"""Download *url* to *dest* atomically. Sanity-check the size."""
dest.parent.mkdir(parents=True, exist_ok=True)
tmp = dest.with_suffix(dest.suffix + ".part")
print(f" GET {url}", flush=True)
try:
with urllib.request.urlopen(url, timeout=120) as r, open(tmp, "wb") as f:
shutil.copyfileobj(r, f)
except Exception as e: # noqa: BLE001 — bubble any network error up
if tmp.exists():
tmp.unlink()
_err(f"download failed: {url}\n {e}")
raise
size = tmp.stat().st_size
if size < expected_min_bytes:
tmp.unlink()
raise RuntimeError(
f"downloaded file too small ({size} bytes < {expected_min_bytes}); "
f"the URL probably 404'd into an HTML error page."
)
tmp.replace(dest)
_ok(f"downloaded {dest.name} ({size / (1024 * 1024):.1f} MB)")
def fetch_tessdata() -> Path:
    """Ensure ``build/vendor/tessdata/eng.traineddata`` exists; return its path.

    Shared across platforms. Downloaded once and cached. The runtime
    expects this file at ``<bundle>/tesseract/tessdata/eng.traineddata``;
    the PyInstaller spec handles the placement.

    A cached file is only trusted when it passes the same minimum-size
    check as a fresh download; a smaller (truncated) file is fetched
    again.

    Raises:
        RuntimeError: The downloaded file is smaller than the minimum.
        Exception: Any download error propagated by ``_download``.
    """
    _step("fetch tessdata (eng.traineddata)")
    # The "best" model is ~16 MB on disk. 3 MB leaves plenty of slack
    # while still rejecting HTML error pages and truncated files. The
    # same threshold guards both the cache and the download.
    min_bytes = 3 * 1024 * 1024
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"
    if target.exists():
        cached_size = target.stat().st_size
        if cached_size >= min_bytes:
            _ok(f"already cached: {target.relative_to(REPO)} "
                f"({cached_size / (1024 * 1024):.1f} MB)")
            return target
        _warn(
            f"cached {target.name} is only {cached_size} bytes "
            f"(< {min_bytes}); downloading again"
        )
    _download(TESSDATA_URL, target, expected_min_bytes=min_bytes)
    return target
def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage ``tesseract.exe`` and its DLLs into *staging*.

    The Windows build is obtained as a pinned UB-Mannheim installer:

    1. Download the installer into ``build/_tesseract/`` unless a copy
       is already cached there.
    2. Unpack it with 7-Zip into ``build/_tesseract/win_extract/``
       (the directory is recreated on every run).
    3. Walk the extraction and copy ``tesseract.exe`` plus every
       ``*.dll`` — wherever it sits in the tree — flat into *staging*.
       Nothing else from the installer is copied.

    Raises:
        FileNotFoundError: 7-Zip cannot be located.
        RuntimeError: The extraction contains no ``tesseract.exe``.
    """
    # Pin the exact installer revision so the build input doesn't drift.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    fname = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    cache = TESSERACT_STAGING / fname
    if not cache.exists():
        url = f"https://digi.bib.uni-mannheim.de/tesseract/{fname}"
        _download(url, cache, expected_min_bytes=20 * 1024 * 1024)

    # Prefer whatever is on PATH; otherwise try the default install
    # location. ``shutil.which`` only returns paths that exist, so a
    # single existence check covers all three candidates.
    sevenz = shutil.which("7z") or shutil.which("7z.exe")
    if not sevenz:
        sevenz = r"C:\Program Files\7-Zip\7z.exe"
    if not Path(sevenz).exists():
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{cache} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    # Always extract into a fresh directory.
    extract = TESSERACT_STAGING / "win_extract"
    if extract.exists():
        shutil.rmtree(extract)
    extract.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{extract}", str(cache)])

    staging.mkdir(parents=True, exist_ok=True)
    # The payload's layout inside the extraction isn't assumed: match
    # files by name (case-insensitively) anywhere in the tree.
    found_exe = False
    for folder, _subdirs, filenames in os.walk(extract):
        for filename in filenames:
            lowered = filename.lower()
            source = Path(folder, filename)
            if lowered == "tesseract.exe":
                shutil.copy2(source, staging / "tesseract.exe")
                found_exe = True
            elif lowered.endswith(".dll"):
                shutil.copy2(source, staging / filename)
    if not found_exe:
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {extract}"
        )
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")
def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage the Homebrew ``tesseract`` binary and its dylibs into *staging*.

    Steps:

    1. ``brew install tesseract`` and locate the binary on PATH.
    2. Copy the binary, then every Homebrew-prefixed dylib it links
       against (under ``/opt/homebrew/`` or ``/usr/local/``), following
       transitive dependencies recursively via ``otool -L``. Libraries
       under other prefixes (``/usr/lib``, ``/System``) are left alone.
    3. Point every staged file at its siblings: each dylib's install
       name and each Homebrew load command — in the binary *and* in the
       dylibs themselves — is rewritten to ``@loader_path/<name>`` with
       ``install_name_tool``, so nothing staged still refers to the
       Homebrew prefix.

    Individual ``install_name_tool`` / per-dylib ``otool`` failures are
    reported as warnings and do not abort the build.

    Raises:
        FileNotFoundError: ``brew`` is not on PATH.
        RuntimeError: tesseract is not on PATH after the install, or
            ``otool`` fails on the staged binary.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")
    # ``brew install`` is fine to run on every build. The version is
    # not pinned through brew; it is compared against TESSERACT_VERSION
    # below and a mismatch is reported as a warning.
    _run(["brew", "install", "tesseract"])
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    # Soft version check: warn (don't fail) when brew's tesseract is not
    # the version this release claims to bundle.
    try:
        probe = subprocess.run(
            [tess_path, "--version"], capture_output=True, text=True, check=False,
        )
        banner_lines = (probe.stdout or probe.stderr).splitlines()
    except OSError:
        banner_lines = []
    banner = banner_lines[0] if banner_lines else ""
    if TESSERACT_VERSION not in banner:
        _warn(
            f"Homebrew tesseract reports {banner or 'no version'!r}; "
            f"expected {TESSERACT_VERSION}"
        )

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)

    homebrew_prefixes = ("/opt/homebrew/", "/usr/local/")

    def _homebrew_deps(macho: Path) -> list[str]:
        """Return the Homebrew-prefixed paths ``otool -L`` lists for *macho*."""
        listing = subprocess.run(
            ["otool", "-L", str(macho)], check=True, capture_output=True, text=True,
        ).stdout
        # First line is the file name itself; each following line is
        # "<path> (compatibility version ...)".
        paths = (line.strip().split(" ", 1)[0] for line in listing.splitlines()[1:])
        return [p for p in paths if p.startswith(homebrew_prefixes)]

    def _make_writable(path: Path) -> None:
        """Add owner-write permission so install_name_tool can edit *path*."""
        # copy2 preserves the source mode, and files copied out of a
        # Homebrew prefix can be read-only.
        path.chmod(path.stat().st_mode | 0o200)

    def _install_name_tool(*args: str) -> None:
        """Run install_name_tool best-effort, warning on failure."""
        try:
            subprocess.run(
                ["install_name_tool", *args],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(f"install_name_tool {' '.join(args)} failed: {(e.stderr or '').strip()}")

    # Staged dylib file name -> Homebrew path it was copied from. Keyed
    # by file name because that is the identity inside the flat staging
    # dir: the same library can be referenced through several paths
    # (e.g. Cellar vs opt) and must only be copied and patched once.
    staged: dict[str, str] = {}

    def _copy_with_deps(libpath: str) -> None:
        """Copy *libpath* into staging, patch it, and recurse into its deps."""
        name = Path(libpath).name
        if name in staged or not Path(libpath).exists():
            return
        staged[name] = libpath
        dest = staging / name
        shutil.copy2(libpath, dest)
        _make_writable(dest)
        _install_name_tool("-id", f"@loader_path/{name}", str(dest))
        try:
            sub_deps = _homebrew_deps(Path(libpath))
        except subprocess.CalledProcessError as e:
            _warn(f"otool failed on {libpath}: {(e.stderr or '').strip()}")
            return
        for sub_path in sub_deps:
            sub_name = Path(sub_path).name
            if sub_name == name:
                continue  # the dylib's own install-name line
            _copy_with_deps(sub_path)
            if sub_name in staged:
                # Without this the staged dylib would still load its
                # dependency from the Homebrew prefix at runtime.
                _install_name_tool(
                    "-change", sub_path, f"@loader_path/{sub_name}", str(dest),
                )

    try:
        deps = _homebrew_deps(bin_path)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e

    _make_writable(bin_path)
    for dep in deps:
        _copy_with_deps(dep)
        dep_name = Path(dep).name
        if dep_name in staged:
            _install_name_tool("-change", dep, f"@loader_path/{dep_name}", str(bin_path))

    # NOTE(review): editing a signed Mach-O can invalidate its code
    # signature; confirm the PyInstaller / packaging step re-signs the
    # staged files before shipping.
    _ok(f"staged macOS tesseract + {len(staged)} dylibs into {staging.relative_to(REPO)}")
def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage the system ``tesseract`` binary and its shared libs into *staging*.

    Steps:

    1. Use the tesseract already on PATH; when there is none and
       ``apt-get`` is available, install ``tesseract-ocr`` and
       ``libtesseract5`` first.
    2. Copy the binary, then every library ``ldd`` resolves for it,
       except the core runtime libraries in the skip list below.
    3. When ``patchelf`` is available, set the run path of the binary
       *and of every bundled library* to ``$ORIGIN`` so they find each
       other inside the staging dir.

    Raises:
        FileNotFoundError: Neither ``apt-get`` nor ``tesseract`` exists.
        RuntimeError: tesseract is still not on PATH after the install,
            or ``ldd`` fails on the staged binary.
    """
    if not shutil.which("tesseract"):
        if not shutil.which("apt-get"):
            _err(
                "Neither apt-get nor a pre-installed tesseract found. On "
                "ubuntu-latest runners both are present. On other distros "
                "install tesseract-ocr via your package manager and re-run "
                "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
            )
            raise FileNotFoundError("tesseract")
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")
    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)

    # Never bundle the dynamic linker or the core C/C++ runtime
    # (libc, libm, libdl, libpthread, libstdc++, libgcc_s, ...): the
    # target system provides them, and shipping copies built against a
    # newer glibc breaks startup on older distros. What remains are the
    # tesseract-specific deps (libtesseract, libleptonica, image libs).
    skip_prefixes = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
        "libstdc++.so", "libgcc_s.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(bin_path)],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    bundled: list[Path] = []
    for line in ldd.stdout.splitlines():
        # Format: " libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(skip_prefixes):
            continue
        resolved = parts[1].strip()
        if resolved.startswith("not found"):
            _warn(f"ldd could not resolve {soname}; it will be missing from the bundle")
            continue
        path_part = resolved.split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            continue
        dest = staging / Path(path_part).name
        shutil.copy2(path_part, dest)
        bundled.append(dest)

    # patchelf is optional. ``--set-rpath`` writes DT_RUNPATH, which the
    # loader consults only for the object's own direct dependencies, so
    # the bundled libraries need the ``$ORIGIN`` entry too — patching
    # the executable alone does not let libtesseract find libleptonica.
    # NOTE(review): without patchelf this presumably relies on the
    # launcher's LD_LIBRARY_PATH covering the staging dir — confirm.
    if shutil.which("patchelf"):
        for elf in (bin_path, *bundled):
            try:
                _run(["patchelf", "--set-rpath", "$ORIGIN", str(elf)])
            except SystemExit:
                # _run exits on failure; treat it as non-fatal here.
                _warn(
                    f"patchelf rpath rewrite failed for {elf.name} — "
                    "relying on LD_LIBRARY_PATH at runtime"
                )
    _ok(f"staged Linux tesseract + {len(bundled)} .so files into {staging.relative_to(REPO)}")
def fetch_tesseract_for_platform(target: str) -> Path:
    """Make sure ``build/_tesseract/<target>/`` holds a Tesseract binary.

    Returns the staging directory. Resolution order:

    * ``TESSERACT_SKIP_FETCH=1`` — never fetch; the binary must already
      be staged, otherwise exit with status 1.
    * Binary already staged — reuse it as is.
    * Otherwise run the fetcher for *target* (``win`` / ``mac`` /
      ``linux``); any other value exits with status 2. If the binary is
      still missing once the fetcher returns, exit with status 1.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_path = staging / ("tesseract.exe" if target == "win" else "tesseract")
    already_staged = exe_path.exists()

    if os.environ.get("TESSERACT_SKIP_FETCH") == "1":
        if already_staged:
            _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
            return staging
        _err(
            f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
            "Pre-stage the binary + its libs into that dir, then re-run."
        )
        sys.exit(1)

    if already_staged:
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetch = fetchers.get(target)
    if fetch is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)
    fetch(staging)

    # Trust but verify: the fetcher must have produced the binary.
    if exe_path.exists():
        return staging
    _err(
        f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
        "Inspect the logs above; you may need to pre-stage the binary manually."
    )
    sys.exit(1)
# ---------------------------------------------------------------------------
# Build steps
# ---------------------------------------------------------------------------
def step_generate_icons() -> None:
    """Run ``build/generate_icons.py`` with the current interpreter."""
    _step("generate icons")
    script = BUILD / "generate_icons.py"
    _run([sys.executable, str(script)])
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
    """Run PyInstaller on ``build/datatools.spec``.

    Args:
        clean: Also pass ``--clean`` to PyInstaller.
        target: When given, the child process receives
            ``DATATOOLS_TESS_STAGING`` pointing at that platform's
            Tesseract staging dir, so the spec doesn't have to detect
            the platform itself.
    """
    _step("pyinstaller bundle")
    spec = BUILD / "datatools.spec"
    # Invoke through ``python -m`` so the build works even when the
    # ``pyinstaller`` launcher isn't on PATH.
    cmd = [sys.executable, "-m", "PyInstaller", str(spec), "--noconfirm"]
    cmd += ["--clean"] if clean else []
    child_env = dict(os.environ)
    if target:
        child_env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)
    _run(cmd, env=child_env)
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested Windows artifacts; return their expected paths.

    The installer is produced by ``iscc`` from ``build/installer.iss``,
    the portable zip by ``build/build_portable_zip.py``.
    """
    artifacts: list[Path] = []
    if do_installer:
        _step("Windows installer (Inno Setup)")
        iss_script = BUILD / "installer.iss"
        _run(["iscc", f"/DAppVersion={version}", str(iss_script)])
        artifacts += [DIST / f"DataTools-{version}-win-setup.exe"]
    if do_portable:
        _step("Windows portable .zip")
        zip_script = BUILD / "build_portable_zip.py"
        _run([sys.executable, str(zip_script), "win", version])
        artifacts += [DIST / f"DataTools-{version}-win-portable.zip"]
    return artifacts
def step_package_mac(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested macOS artifacts; return their expected paths.

    Both artifacts come from bash scripts under ``build/macos/``:
    ``build_dmg.sh`` for the DMG and ``build_zip.sh`` for the zip.
    """
    scripts_dir = BUILD / "macos"
    artifacts: list[Path] = []
    if do_installer:
        _step("macOS DMG (installer)")
        _run(["bash", str(scripts_dir / "build_dmg.sh"), version])
        artifacts += [DIST / f"DataTools-{version}-mac.dmg"]
    if do_portable:
        _step("macOS portable .zip")
        _run(["bash", str(scripts_dir / "build_zip.sh"), version])
        artifacts += [DIST / f"DataTools-{version}-mac-portable.zip"]
    return artifacts
def step_package_linux(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the Linux AppImage; return its expected path in a list.

    The AppImage serves as both installer and portable build, so a
    single file is produced whenever at least one of the two flags is
    set. With both flags off nothing is built and the list is empty.
    """
    wanted = do_installer or do_portable
    if wanted:
        _step("Linux AppImage")
        build_script = BUILD / "appimage" / "build.sh"
        _run(["bash", str(build_script), version])
        return [DIST / f"DataTools-{version}-linux-x86_64.AppImage"]
    return []
# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------
def _summarise(outputs: list[Path]) -> None:
    """Print each expected artifact with its size, warning about absent ones."""
    _step("done — outputs")
    if not outputs:
        _warn("no files produced (everything skipped via flags)")
        return
    for artifact in outputs:
        shown = artifact.relative_to(REPO)
        if not artifact.exists():
            _warn(f"expected output missing: {shown}")
            continue
        megabytes = artifact.stat().st_size / (1024 * 1024)
        print(f" {shown} ({megabytes:.1f} MB)")
def main() -> int:
    """Parse the command line and run the release pipeline; return 0.

    Order of work: print the build summary, preflight the tooling
    (stopping there for ``--preflight``), optionally wipe ``dist/``,
    generate icons, stage tessdata and the Tesseract binary, run
    PyInstaller, run the packager for the target, and list the outputs.
    Failures inside the steps exit the process directly.
    """
    parser = argparse.ArgumentParser(
        prog="make_release.py",
        description=(
            "Build the installer + portable zip for the current OS. "
            "Cross-compilation isn't supported by PyInstaller — run "
            "this once per platform you want to target."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--platform", choices=("auto", "win", "mac", "linux"), default="auto",
        help="Override OS detection (mostly for testing). Default: auto.",
    )
    # The remaining options are plain on/off switches.
    switches = (
        ("--preflight", "Check tooling and exit without building."),
        ("--clean", "Wipe dist/ before building."),
        ("--skip-installer", "Don't build the OS installer (.exe / .dmg)."),
        ("--skip-portable", "Don't build the portable .zip."),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()

    if args.platform == "auto":
        target = _detect_platform()
    else:
        target = args.platform
    version = _read_version()
    do_installer = not args.skip_installer
    do_portable = not args.skip_portable

    print("DataTools release builder")
    summary = (
        ("target", f"{target} (host: {platform.platform()})"),
        ("version", version),
        ("installer", "yes" if do_installer else "no"),
        ("portable", "yes" if do_portable else "no"),
        ("dist dir", DIST),
    )
    for label, value in summary:
        print(f" {label}: {value}")

    host = _detect_platform()
    if target != host:
        _warn(
            f"--platform {target} but host is {host}. "
            "PyInstaller can't cross-compile — the bundle will be for "
            "the HOST, only the packaging step will follow your override. "
            "Useful only for testing the packager paths."
        )

    preflight(target)
    if args.preflight:
        return 0

    if args.clean and DIST.exists():
        _step(f"cleaning {DIST}")
        shutil.rmtree(DIST)

    step_generate_icons()
    # Tesseract must be staged before PyInstaller runs: the spec picks
    # up ``build/_tesseract/<target>/`` and ``build/vendor/tessdata/``
    # and places them under ``<bundle>/tesseract/``.
    fetch_tessdata()
    fetch_tesseract_for_platform(target)
    step_pyinstaller(clean=args.clean, target=target)

    # Anything that isn't win/mac is packaged as Linux.
    packagers = {"win": step_package_win, "mac": step_package_mac}
    package = packagers.get(target, step_package_linux)
    _summarise(package(version, do_installer, do_portable))
    return 0
# Script entry point: main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())