datatools-dev/build/datatools.spec

# PyInstaller spec for DataTools.
#
# Build (from the repo root, after ``pip install pyinstaller``):
#
#     pyinstaller build/datatools.spec
#
# Output: ``dist/DataTools/`` (folder mode), which contains the
# ``DataTools.exe`` launcher (or platform equivalent); on macOS the
# BUNDLE step at the end of this spec also writes ``dist/DataTools.app``.
# For universal2 and other per-platform details see ``build/README.md``,
# which has the full per-platform recipe.
#
# Why folder-mode (one-dir) is the default:
#   * Streamlit's static assets + Python interpreter + ~300 MB of deps
#     compress poorly into onefile. Onefile mode unpacks every launch
#     to a temp dir — adds 5-15 s startup latency that confuses
#     non-technical buyers ("did it crash?").
#   * Folder mode lets the installer (Inno Setup on Win, .dmg on Mac)
#     run a one-time copy. Subsequent launches are instant.
#
# Cross-platform note: this single spec file is built ON each target
# platform. Cross-compilation isn't supported — Mac builds need a
# Mac, Windows builds need a Windows machine (or a Windows GitHub
# Actions runner). See build/README.md for the matrix recipe.

# -*- mode: python ; coding: utf-8 -*-

from pathlib import Path
from PyInstaller.utils.hooks import (
    collect_all,
    collect_data_files,
    collect_submodules,
)

# Repo root: the parent of the directory that holds this spec.
# PyInstaller sets SPECPATH in the spec's namespace.
REPO = Path(SPECPATH).resolve().parent

# Version string, scraped from src/__init__.py so there is a single
# source of truth. The file is read as text rather than imported —
# importing src/ would pull in heavy deps (pandas etc) that evaluating
# this spec doesn't need.
import re as _re
_init_py = REPO.joinpath("src", "__init__.py").read_text(encoding="utf-8")
_m = _re.compile(r'__version__\s*=\s*["\']([^"\']+)["\']').search(_init_py)
if _m is None:
    # No ``__version__ = "..."`` assignment was found; use a placeholder.
    VERSION = "0.0.0"
else:
    VERSION = _m.group(1)

# ----- Hidden imports ------------------------------------------------
# PyInstaller's static analyser misses everything Streamlit reaches
# through ``importlib`` and the per-tool registries our app uses. We
# exhaustively pull every submodule of the libraries that bridge
# user code to runtime — better a 50 MB-bigger bundle than a runtime
# ImportError on the buyer's machine.

# Packages whose submodules are all force-included. The tuple order is
# the order in which ``collect_submodules`` is called.
_HIDDEN_IMPORT_PACKAGES = (
    "streamlit",
    "pandas",
    "phonenumbers",
    "rapidfuzz",
    "charset_normalizer",
    "openpyxl",
    "loguru",
    # PDF Extractor stack. ``pypdfium2`` also has its own PyInstaller
    # hook under ``build/hooks/`` that pulls in the native PDFium
    # binary; listing it here as well is belt-and-braces.
    "pdfplumber",
    "pdfminer",
    "pypdfium2",
    "PIL",
    "pytesseract",
    # Our own engine + GUI modules. ``launcher.py`` / ``app.py`` import
    # them directly, but Streamlit's session-state and page-discovery
    # layers re-import them via names that PyInstaller doesn't see.
    "src",
)

# Flatten every package's submodule list into one list of module names.
hidden_imports: list[str] = [
    module
    for package in _HIDDEN_IMPORT_PACKAGES
    for module in collect_submodules(package)
]

# ----- Data files ---------------------------------------------------
# Streamlit's static assets (the JS / CSS / fonts the browser fetches
# from the bundled HTTP server) are NOT Python files; PyInstaller
# can't auto-find them.

# Third-party packages whose non-Python resources must be bundled:
#   * streamlit    — runtime assets served to the browser.
#   * phonenumbers — country/area-code metadata shipped as resources.
#   * pypdfium2    — ships a native PDFium shared library (``.dll`` /
#                    ``.so`` / ``.dylib``) under its package dir.
#   * pdfminer     — the Adobe CMap tables it uses for character mapping.
# The drawable-canvas frontend bundle is no longer listed: the visual
# picker was removed.
_DATA_FILE_PACKAGES = (
    "streamlit",
    "phonenumbers",
    "pypdfium2",
    "pdfminer",
)

# Each entry is a ``(source path, destination dir inside the bundle)`` pair.
datas: list[tuple[str, str]] = [
    entry
    for package in _DATA_FILE_PACKAGES
    for entry in collect_data_files(package, include_py_files=False)
]

# Our application files. PyInstaller treats source as code (.pyc) by
# default; it is added again as data so the launcher's
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
datas.extend(
    [
        (str(REPO / "src"), "src"),
        (str(REPO / "samples" / "demo"), "samples/demo"),
        (str(REPO / ".streamlit" / "config.toml"), ".streamlit"),
    ]
)

# ----- Analysis ------------------------------------------------------

# The launcher script and the project-local hooks both live in build/.
_BUILD_DIR = REPO / "build"

# Ship-trim — PyInstaller pulls these in but we never need them, and
# they add ~80 MB combined.
_EXCLUDED_MODULES = [
    "tkinter",
    "matplotlib",
    "scipy",
    "IPython",
    "jupyter",
    "notebook",
    "test",
    "tests",
]

a = Analysis(
    [str(_BUILD_DIR / "launcher.py")],      # entry-point script
    pathex=[str(REPO)],                     # repo root on the search path
    hookspath=[str(_BUILD_DIR / "hooks")],  # project-local hooks
    hiddenimports=hidden_imports,
    datas=datas,
    binaries=[],
    excludes=_EXCLUDED_MODULES,
    hooksconfig={},
    runtime_hooks=[],
    noarchive=False,
)

# Archive of the pure-Python modules found by the analysis.
pyz = PYZ(a.pure)

# The icon is optional: pass it only when the file exists in build/.
_exe_icon = REPO / "build" / "icon.icns"

exe = EXE(
    pyz,
    a.scripts,
    [],
    name="DataTools",
    exclude_binaries=True,  # folder mode: COLLECT below gathers binaries
    console=False,          # GUI app — no terminal window on Win/Mac
    debug=False,
    bootloader_ignore_signals=False,
    disable_windowed_traceback=False,
    strip=False,
    upx=True,
    icon=str(_exe_icon) if _exe_icon.exists() else None,
)

# Gather the executable with the analysed binaries and data files into
# the one-dir output named "DataTools".
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    name="DataTools",
    strip=False,
    upx=True,
    upx_exclude=[],
)

# macOS .app bundle wrapper. The BUNDLE step only runs when the build
# host is a Mac; on Win/Linux this block does nothing.
import sys as _sys

if _sys.platform == "darwin":
    # The icon is optional: pass it only when the file exists in build/.
    _bundle_icon = REPO / "build" / "icon.icns"

    _info_plist = {
        "CFBundleDisplayName": "DataTools",
        # Both version keys come from the single VERSION string.
        "CFBundleVersion": VERSION,
        "CFBundleShortVersionString": VERSION,
        "NSHighResolutionCapable": True,
        # If this were True, macOS would not show the app in the Dock.
        # We want the Dock icon so the buyer can see the app is
        # running while the browser tab is open.
        "LSUIElement": False,
    }

    app = BUNDLE(
        coll,
        name="DataTools.app",
        bundle_identifier="com.datatools.desktop",
        icon=str(_bundle_icon) if _bundle_icon.exists() else None,
        info_plist=_info_plist,
    )