Files
datatools-dev/build/datatools.spec
Michael fd9606c67b build: drop the local Python release method, return to CI-only installer builds
Removes the single-command Python packaging method (build/make_release.py
+ build/build_portable_zip.py + build/macos/build_zip.sh) and the portable
.zip artifacts it produced. Release builds go back to the original GitHub
Actions process: the CI matrix builds one installer per platform (.dmg /
.exe / .AppImage) on tag push and attaches them to a GitHub Release.

Tesseract OCR bundling is preserved: the fetch helpers the workflow depends
on (fetch_tessdata, fetch_tesseract_for_platform) are extracted into a
standalone build/tesseract.py, which build.yml now imports.

Docs (README, build/README, DEVELOPER, TECHNICAL, USER-GUIDE, vendor README,
es translations) updated to drop the portable-zip flavor and point at the
new module.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-22 17:47:36 +00:00

259 lines
9.8 KiB
Python

# PyInstaller spec for DataTools.
#
# Build (from the repo root, after ``pip install pyinstaller``):
#
# pyinstaller build/datatools.spec
#
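# Output: ``dist/DataTools/`` (folder mode), which contains the
# ``DataTools.exe`` launcher on Windows (or the platform-equivalent
# ``DataTools`` executable elsewhere), plus ``dist/DataTools.app`` on
# macOS, produced by the ``BUNDLE`` step at the bottom of this file.
# Note: the command-line ``--target-arch universal2`` flag is not
# honoured when building from a spec file; a universal2 build needs
# ``target_arch`` set in this spec instead. See ``build/README.md``
# for the full per-platform recipe.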
#
# Why folder-mode (one-dir) is the default:
# * Streamlit's static assets + Python interpreter + ~300 MB of deps
# compress poorly into onefile. Onefile mode unpacks every launch
# to a temp dir — adds 5-15 s startup latency that confuses
# non-technical buyers ("did it crash?").
# * Folder mode lets the installer (Inno Setup on Win, .dmg on Mac)
# run a one-time copy. Subsequent launches are instant.
#
# Cross-platform note: this single spec file is built ON each target
# platform. Cross-compilation isn't supported — Mac builds need a
# Mac, Windows builds need a Windows machine (or a Windows GitHub
# Actions runner). See build/README.md for the matrix recipe.
# -*- mode: python ; coding: utf-8 -*-
import os
from pathlib import Path
from PyInstaller.utils.hooks import (
collect_all,
collect_data_files,
collect_submodules,
)
# Repo root from this spec's location. PyInstaller injects SPECPATH into
# the spec's namespace; this spec lives in ``build/``, so the repo root
# is one directory up.
REPO = Path(SPECPATH).resolve().parent

# Single source of truth for the version string. Read directly from
# src/__init__.py instead of importing src/ — importing pulls in
# heavy deps (pandas etc) that PyInstaller's spec parser doesn't need.
import re as _re

_init_py = (REPO / "src" / "__init__.py").read_text(encoding="utf-8")

# Anchor the pattern to the start of a line (re.MULTILINE) so only a real
# assignment matches — not a mention of ``__version__ = "..."`` in a
# trailing comment or in the middle of a docstring sentence.
_m = _re.search(
    r'^\s*__version__\s*=\s*["\']([^"\']+)["\']',
    _init_py,
    _re.MULTILINE,
)
if _m:
    VERSION = _m.group(1)
else:
    # Keep the "0.0.0" fallback so spec parsing never hard-fails, but
    # surface it the same way the other degraded paths in this spec do:
    # a silently mislabelled bundle is hard to trace after release.
    VERSION = "0.0.0"
    print(
        "WARNING: could not find a __version__ assignment in "
        "src/__init__.py — falling back to version 0.0.0 for the "
        "bundle metadata."
    )
# ----- Hidden imports ------------------------------------------------
# Streamlit reaches much of its code through ``importlib`` and our app
# uses per-tool registries; PyInstaller's static analyser follows
# neither. Every submodule of each package listed below is therefore
# collected wholesale — better a 50 MB-bigger bundle than a runtime
# ImportError on the buyer's machine.
#
# Collection order matches the order of this tuple.
_HIDDEN_IMPORT_PACKAGES = (
    # Libraries that bridge user code to runtime.
    "streamlit",
    "pandas",
    "phonenumbers",
    "rapidfuzz",
    "charset_normalizer",
    "openpyxl",
    "loguru",
    # PDF Extractor stack. ``pypdfium2`` also has its own PyInstaller
    # hook under ``build/hooks/`` that pulls in the native PDFium
    # binary; listing it here as well is belt-and-braces.
    "pdfplumber",
    "pdfminer",
    "pypdfium2",
    "PIL",
    "pytesseract",
    # Our own engine + GUI modules. ``launcher.py`` / ``app.py`` import
    # them directly, but the Streamlit session-state and per-page
    # discovery layers re-import via names PyInstaller doesn't see.
    "src",
)

hidden_imports: list[str] = []
for _hidden_pkg in _HIDDEN_IMPORT_PACKAGES:
    hidden_imports.extend(collect_submodules(_hidden_pkg))
# ----- Data files ---------------------------------------------------
# Non-Python resources that PyInstaller can't auto-find. Each entry in
# ``datas`` is a ``(source path, destination dir inside the bundle)``
# pair.
#
# Packages whose data files are collected (``.py`` files excluded):
#   * streamlit    — the JS / CSS / fonts the browser fetches from the
#                    bundled HTTP server.
#   * phonenumbers — country/area-code metadata shipped as resources.
#   * pypdfium2    — ships a native PDFium shared library
#                    (``.dll`` / ``.so`` / ``.dylib``) under its
#                    package dir.
#   * pdfminer     — the Adobe CMap tables used for character mapping.
# The drawable-canvas frontend bundle is no longer listed: the visual
# picker it served was removed.
_DATA_FILE_PACKAGES = (
    "streamlit",
    "phonenumbers",
    "pypdfium2",
    "pdfminer",
)

datas: list[tuple[str, str]] = []
for _data_pkg in _DATA_FILE_PACKAGES:
    datas.extend(collect_data_files(_data_pkg, include_py_files=False))

# Our application files. PyInstaller's bundler treats source as code
# (.pyc) by default; the tree is added again as data so the launcher's
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
datas.extend(
    [
        (str(REPO / "src"), "src"),
        (str(REPO / "samples" / "demo"), "samples/demo"),
        (str(REPO / ".streamlit" / "config.toml"), ".streamlit"),
    ]
)
# ----- Tesseract OCR bundle ----------------------------------------
# ``build/tesseract.py`` stages the per-platform Tesseract binary and
# its runtime libs (DLLs/dylibs/sos) into
# ``build/_tesseract/<target>/``, and the shared eng.traineddata into
# ``build/vendor/tessdata/``. Both are appended to ``datas`` so that
# PyInstaller lays them out where the runtime expects them:
#
#   <bundle>/tesseract/tesseract[.exe]
#   <bundle>/tesseract/<all dll/dylib/so deps>
#   <bundle>/tesseract/tessdata/eng.traineddata
#
# The runtime discovery code in src/pdf_extract.py reads this layout
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. The two ends must
# stay in sync — renaming "tesseract" here means updating
# pdf_extract.py too.
#
# CI (.github/workflows/build.yml) exports DATATOOLS_TESS_STAGING with
# the correct per-platform dir before invoking PyInstaller. Ad-hoc
# `pyinstaller build/datatools.spec` runs without that variable fall
# back to the canonical staging path for the host platform.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if not _tess_staging_env:
    # No (or empty) override: guess the staging dir from the host OS so
    # spec-only dev builds still work.
    import sys as _sys_for_target

    if _sys_for_target.platform.startswith("win"):
        _target_guess = "win"
    elif _sys_for_target.platform == "darwin":
        _target_guess = "mac"
    else:
        _target_guess = "linux"
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess
else:
    _tess_staging = Path(_tess_staging_env)

_tessdata = REPO / "build" / "vendor" / "tessdata"

# Binary + sibling libs. A missing or empty staging dir is reported but
# does not abort spec parsing, so first-time devs can run PyInstaller
# before fetching binaries; the warning is loud because OCR will
# otherwise fail silently at runtime.
_tess_staging_populated = _tess_staging.is_dir() and any(_tess_staging.iterdir())
if not _tess_staging_populated:
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/tesseract.py's "
        "fetch_tesseract_for_platform before pyinstaller, or "
        "pre-stage the binary manually."
    )
else:
    # Every file in the staging dir lands directly under
    # ``<bundle>/tesseract/``.
    datas.append((str(_tess_staging), "tesseract"))

# Language data.
if not (_tessdata / "eng.traineddata").exists():
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/tesseract.py's "
        "fetch_tessdata or fetch manually per build/vendor/README.md."
    )
else:
    datas.append((str(_tessdata), "tesseract/tessdata"))

# Apache-2.0 LICENSE text, shipped alongside the binary. The docs agent
# maintains LICENSE_TESSERACT.txt at the repo root; destination "."
# places it at the bundle root next to DataTools[.exe].
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if not _tess_license.exists():
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )
else:
    datas.append((str(_tess_license), "."))
# ----- Analysis ------------------------------------------------------
# Entry point is build/launcher.py; project-specific hooks live in
# build/hooks/.
_entry_script = REPO / "build" / "launcher.py"
_hooks_dir = REPO / "build" / "hooks"

# Ship-trim — PyInstaller pulls these in but we never need them, and
# they add ~80 MB combined.
_excluded_modules = [
    "tkinter",
    "matplotlib",
    "scipy",
    "IPython",
    "jupyter",
    "notebook",
    "test",
    "tests",
]

a = Analysis(
    [str(_entry_script)],
    pathex=[str(REPO)],
    binaries=[],
    datas=datas,
    hiddenimports=hidden_imports,
    hookspath=[str(_hooks_dir)],
    hooksconfig={},
    runtime_hooks=[],
    excludes=_excluded_modules,
    noarchive=False,
)
# Archive of the pure-Python modules found by the analysis.
pyz = PYZ(a.pure)

# Use build/icon.icns as the executable icon when it exists; otherwise
# pass None and let PyInstaller apply its default.
_exe_icon_file = REPO / "build" / "icon.icns"
_exe_icon = str(_exe_icon_file) if _exe_icon_file.exists() else None

# Launcher executable only: ``exclude_binaries=True`` leaves binaries
# and data out of the EXE so the COLLECT step below gathers them into
# the output folder instead (folder / one-dir mode).
exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name="DataTools",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    # GUI app — no terminal window on Win/Mac.
    console=False,
    disable_windowed_traceback=False,
    icon=_exe_icon,
)

# Gather the launcher plus all binaries and data files into the
# "DataTools" output folder.
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="DataTools",
)
# macOS .app bundle wrapper. Only built when the host is macOS; on
# Win/Linux the guard below makes this whole section a no-op.
#
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
# COLLECT output (binaries + datas) into the .app's Contents/Resources
# tree, so the ``tesseract/`` subdir assembled in ``datas`` ends up at
# ``DataTools.app/Contents/Resources/tesseract/`` and the runtime
# ``sys._MEIPASS`` resolves there. No extra plumbing needed.
import sys as _sys

if _sys.platform == "darwin":
    # Same optional icon as the executable: used only if the file exists.
    _bundle_icon_file = REPO / "build" / "icon.icns"
    _bundle_icon = str(_bundle_icon_file) if _bundle_icon_file.exists() else None

    _info_plist = {
        "CFBundleDisplayName": "DataTools",
        "CFBundleVersion": VERSION,
        "CFBundleShortVersionString": VERSION,
        "NSHighResolutionCapable": True,
        # Setting this to True would keep the app out of the dock. We
        # want the dock icon so the buyer can see the app is running
        # while the browser tab is open.
        "LSUIElement": False,
    }

    app = BUNDLE(
        coll,
        name="DataTools.app",
        icon=_bundle_icon,
        bundle_identifier="com.datatools.desktop",
        info_plist=_info_plist,
    )