# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for DataTools.
#
# Build (from the repo root, after ``pip install pyinstaller``):
#
#     pyinstaller build/datatools.spec
#
# Output: ``dist/DataTools/`` (folder mode), which contains the
# ``DataTools.exe`` launcher (or the platform-equivalent executable).
# On macOS the BUNDLE step at the bottom of this file additionally
# wraps that folder into ``dist/DataTools.app``. See ``build/README.md``
# for the full per-platform recipe (including ``universal2`` packaging).
#
# Why folder-mode (one-dir) is the default:
#   * Streamlit's static assets + Python interpreter + ~300 MB of deps
#     compress poorly into onefile. Onefile mode unpacks on every launch
#     to a temp dir — this adds 5-15 s of startup latency that confuses
#     non-technical buyers ("did it crash?").
#   * Folder mode lets the installer (Inno Setup on Win, .dmg on Mac)
#     run a one-time copy. Subsequent launches are instant.
#
# Cross-platform note: this single spec file is built ON each target
# platform. Cross-compilation isn't supported — Mac builds need a
# Mac, Windows builds need a Windows machine (or a Windows GitHub
# Actions runner). See build/README.md for the matrix recipe.
#
# Names such as SPECPATH, Analysis, PYZ, EXE, COLLECT and BUNDLE are
# injected into this file's namespace by PyInstaller when it executes
# the spec; they are intentionally not imported here.

import os
import re
import sys
from pathlib import Path

from PyInstaller.utils.hooks import (
    collect_all,
    collect_data_files,
    collect_submodules,
)

# Repo root from this spec's location (PyInstaller sets SPECPATH).
REPO = Path(SPECPATH).resolve().parent

# Single source of truth for the version string. Read directly from
# src/__init__.py instead of importing src/ — importing pulls in
# heavy deps (pandas etc) that PyInstaller's spec parser doesn't need.
_init_py = (REPO / "src" / "__init__.py").read_text(encoding="utf-8")
_version_match = re.search(
    r'__version__\s*=\s*["\']([^"\']+)["\']', _init_py
)
if _version_match:
    VERSION = _version_match.group(1)
else:
    # Keep building (same fallback value as before) but say so loudly:
    # a silent "0.0.0" would otherwise end up in the macOS Info.plist.
    VERSION = "0.0.0"
    print(
        "WARNING: could not find __version__ in src/__init__.py — "
        f"falling back to version {VERSION!r}."
    )

# ----- Hidden imports ------------------------------------------------
# PyInstaller's static analyser misses everything Streamlit reaches
# through ``importlib`` and the per-tool registries our app uses. We
# exhaustively pull every submodule of the libraries that bridge
# user code to runtime — better a 50 MB-bigger bundle than a runtime
# ImportError on the buyer's machine.
hidden_imports: list[str] = []
for _pkg in (
    "streamlit",
    "pandas",
    "phonenumbers",
    "rapidfuzz",
    "charset_normalizer",
    "openpyxl",
    "loguru",
    # PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook
    # under ``build/hooks/`` that pulls in the native PDFium binary —
    # keep the ``collect_submodules`` calls here for belt-and-braces.
    "pdfplumber",
    "pdfminer",
    "pypdfium2",
    "PIL",
    "pytesseract",
    # Our own engine + GUI modules. Even though we import them directly
    # at the top of ``launcher.py`` / ``app.py``, the Streamlit
    # session-state and per-page page discovery layers re-import via
    # names that PyInstaller doesn't see.
    "src",
):
    hidden_imports += collect_submodules(_pkg)

# ----- Data files ---------------------------------------------------
# Streamlit's static assets (the JS / CSS / fonts the browser fetches
# from the bundled HTTP server) are NOT Python files; PyInstaller
# can't auto-find them.
datas: list[tuple[str, str]] = []
for _pkg in (
    # Streamlit's runtime assets.
    "streamlit",
    # phonenumbers ships its country/area-code metadata as resources.
    "phonenumbers",
    # PDF Extractor data files. ``pypdfium2`` ships a native PDFium
    # shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
    # dir; ``pdfminer`` ships the Adobe CMap tables it uses for
    # character mapping. The drawable-canvas frontend bundle is gone
    # now that the visual picker was removed.
    "pypdfium2",
    "pdfminer",
):
    datas += collect_data_files(_pkg, include_py_files=False)

# Our application files. PyInstaller's bundler treats source as code
# (.pyc) by default; we add it again as data so the launcher's
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
datas += [
    (str(REPO / "src"), "src"),
    (str(REPO / "samples" / "demo"), "samples/demo"),
    (str(REPO / ".streamlit" / "config.toml"), ".streamlit"),
]

# ----- Tesseract OCR bundle ----------------------------------------
# ``build/make_release.py`` stages the per-platform Tesseract binary
# + its runtime libs (DLLs/dylibs/sos) into
# ``build/_tesseract/{target}/`` and the shared eng.traineddata into
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
# drops them at the path the runtime expects:
#
#   {bundle}/tesseract/tesseract[.exe]
#   {bundle}/tesseract/{runtime libs}
#   {bundle}/tesseract/tessdata/eng.traineddata
#
# The runtime discovery code in src/pdf_extract.py reads this layout
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
#
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
# the right per-platform dir before invoking PyInstaller. For ad-hoc
# `pyinstaller build/datatools.spec` runs without the orchestrator,
# fall back to the canonical staging path.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if _tess_staging_env:
    _tess_staging = Path(_tess_staging_env)
else:
    # Pick the obvious per-host staging dir as a fallback so spec-only
    # builds (without the orchestrator) still work in dev.
    if sys.platform.startswith("win"):
        _target_guess = "win"
    elif sys.platform == "darwin":
        _target_guess = "mac"
    else:
        _target_guess = "linux"
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess

_tessdata = REPO / "build" / "vendor" / "tessdata"

if _tess_staging.is_dir() and any(_tess_staging.iterdir()):
    # Drop every file in the staging dir directly under
    # ``{bundle}/tesseract/`` (binary + DLL/dylib/so siblings).
    datas += [(str(_tess_staging), "tesseract")]
else:
    # Don't hard-fail spec parse — useful for first-time devs running
    # PyInstaller before fetching binaries. Surface a loud warning
    # though, since the OCR feature will silently fail at runtime.
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/make_release.py (which "
        "calls fetch_tesseract_for_platform) before pyinstaller, or "
        "pre-stage the binary manually."
    )

if (_tessdata / "eng.traineddata").exists():
    datas += [(str(_tessdata), "tesseract/tessdata")]
else:
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/make_release.py "
        "or fetch manually per build/vendor/README.md."
    )

# Bundle the Apache-2.0 LICENSE text alongside the binary. The docs
# agent maintains LICENSE_TESSERACT.txt at the repo root; PyInstaller
# drops it at the bundle root next to DataTools[.exe].
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if _tess_license.exists():
    datas += [(str(_tess_license), ".")]
else:
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )

# ----- Icon ----------------------------------------------------------
# Resolve the icon once and share it between EXE and BUNDLE. On Windows
# a native ``build/icon.ico`` is preferred when it exists; otherwise
# (and on every other platform) we use ``build/icon.icns`` exactly as
# before. ``None`` means "no custom icon" and lets PyInstaller use its
# default.
if sys.platform.startswith("win"):
    _icon_names = ("icon.ico", "icon.icns")
else:
    _icon_names = ("icon.icns",)

_icon = None
for _icon_name in _icon_names:
    _icon_candidate = REPO / "build" / _icon_name
    if _icon_candidate.exists():
        _icon = str(_icon_candidate)
        break

# ----- Analysis ------------------------------------------------------
a = Analysis(
    [str(REPO / "build" / "launcher.py")],
    pathex=[str(REPO)],
    binaries=[],
    datas=datas,
    hiddenimports=hidden_imports,
    hookspath=[str(REPO / "build" / "hooks")],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[
        # Ship-trim — PyInstaller pulls these in but we never need
        # them, and they add ~80 MB combined.
        "tkinter",
        "matplotlib",
        "scipy",
        "IPython",
        "jupyter",
        "notebook",
        "test",
        "tests",
    ],
    noarchive=False,
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    # Folder (one-dir) mode: binaries are gathered by COLLECT below
    # rather than being embedded in the executable.
    exclude_binaries=True,
    name="DataTools",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,  # GUI app — no terminal window on Win/Mac
    disable_windowed_traceback=False,
    icon=_icon,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="DataTools",
)

# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
# this block is a no-op on Win/Linux.
#
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
# COLLECT output (binaries + datas) into the .app's
# Contents/Resources tree, so the ``tesseract/`` subdir we built up
# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/``
# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing
# needed.
if sys.platform == "darwin":
    app = BUNDLE(
        coll,
        name="DataTools.app",
        icon=_icon,
        bundle_identifier="com.datatools.desktop",
        info_plist={
            "CFBundleDisplayName": "DataTools",
            "CFBundleVersion": VERSION,
            "CFBundleShortVersionString": VERSION,
            "NSHighResolutionCapable": True,
            # Buyer's macOS will not show the app's window in the dock
            # if this is True. We want the dock icon so the buyer can
            # see the app is running while the browser tab is open.
            "LSUIElement": False,
        },
    )