# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for DataTools.
#
# Build (from the repo root, after ``pip install pyinstaller``):
#
#     pyinstaller build/datatools.spec
#
# Output (folder mode): ``dist/DataTools/``, which contains the
# ``DataTools`` executable (``DataTools.exe`` on Windows, or the
# platform equivalent) next to its dependencies. On macOS the BUNDLE
# step at the bottom of this file additionally produces
# ``dist/DataTools.app``. A universal2 build has to be requested via
# ``target_arch`` in the EXE() call below: command-line options that
# shape the spec (such as ``--target-arch``) do not apply when
# building from an existing spec file. See ``build/README.md``
# for the full per-platform recipe.
#
# Why folder-mode (one-dir) is the default:
#   * Streamlit's static assets + Python interpreter + ~300 MB of deps
#     compress poorly into onefile. Onefile mode unpacks every launch
#     to a temp dir — adds 5-15 s startup latency that confuses
#     non-technical buyers ("did it crash?").
#   * Folder mode lets the installer (Inno Setup on Win, .dmg on Mac)
#     run a one-time copy. Subsequent launches are instant.
#
# Cross-platform note: this single spec file is built ON each target
# platform. Cross-compilation isn't supported — Mac builds need a
# Mac, Windows builds need a Windows machine (or a Windows GitHub
# Actions runner). See build/README.md for the matrix recipe.

import re as _re
import sys as _sys
from pathlib import Path

from PyInstaller.utils.hooks import (
    collect_all,
    collect_data_files,
    collect_submodules,
)

# Repo root from this spec's location (PyInstaller sets SPECPATH).
REPO = Path(SPECPATH).resolve().parent

# Single source of truth for the version string. Read directly from
# src/__init__.py instead of importing src/ — importing pulls in
# heavy deps (pandas etc) that PyInstaller's spec parser doesn't need.
_init_py = (REPO / "src" / "__init__.py").read_text(encoding="utf-8")
_m = _re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', _init_py)
if _m:
    VERSION = _m.group(1)
else:
    # Keep the build going, but make the fallback visible: a bundle
    # stamped 0.0.0 should never ship unnoticed.
    VERSION = "0.0.0"
    print(
        "WARNING: __version__ not found in src/__init__.py; "
        "falling back to 0.0.0",
        file=_sys.stderr,
    )

# Application icon, resolved once so EXE and BUNDLE always agree.
# ``None`` lets PyInstaller fall back to its default icon.
# NOTE(review): the .icns file is also handed to EXE on Windows/Linux.
# PyInstaller's docs say a non-native icon format is converted via
# Pillow when Pillow is installed — confirm the build environment has
# it, or add a platform-specific ``icon.ico``.
_icon_file = REPO / "build" / "icon.icns"
ICON = str(_icon_file) if _icon_file.exists() else None

# ----- Hidden imports ------------------------------------------------
# PyInstaller's static analyser misses everything Streamlit reaches
# through ``importlib`` and the per-tool registries our app uses. We
# exhaustively pull every submodule of the libraries that bridge
# user code to runtime — better a 50 MB-bigger bundle than a runtime
# ImportError on the buyer's machine.
hidden_imports: list[str] = []
for _pkg in (
    "streamlit",
    "pandas",
    "phonenumbers",
    "rapidfuzz",
    "charset_normalizer",
    "openpyxl",
    "loguru",
    # PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook
    # under ``build/hooks/`` that pulls in the native PDFium binary —
    # keep the ``collect_submodules`` calls here for belt-and-braces.
    "pdfplumber",
    "pdfminer",
    "pypdfium2",
    "PIL",
    "pytesseract",
    # Our own engine + GUI modules. Even though we import them directly
    # at the top of ``launcher.py`` / ``app.py``, the Streamlit
    # session-state and per-page page discovery layers re-import via
    # names that PyInstaller doesn't see.
    "src",
):
    hidden_imports += collect_submodules(_pkg)

# ----- Data files ---------------------------------------------------
# Streamlit's static assets (the JS / CSS / fonts the browser fetches
# from the bundled HTTP server) are NOT Python files; PyInstaller
# can't auto-find them.
datas: list[tuple[str, str]] = []
for _pkg in (
    # Streamlit's runtime assets.
    "streamlit",
    # phonenumbers ships its country/area-code metadata as resources.
    "phonenumbers",
    # PDF Extractor data files. ``pypdfium2`` ships a native PDFium
    # shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
    # dir; ``pdfminer`` ships the Adobe CMap tables it uses for
    # character mapping. The drawable-canvas frontend bundle is gone
    # now that the visual picker was removed.
    "pypdfium2",
    "pdfminer",
):
    datas += collect_data_files(_pkg, include_py_files=False)

# Our application files. PyInstaller's bundler treats source as code
# (.pyc) by default; we add it again as data so the launcher's
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
datas += [
    (str(REPO / "src"), "src"),
    (str(REPO / "samples" / "demo"), "samples/demo"),
    (str(REPO / ".streamlit" / "config.toml"), ".streamlit"),
]

# ----- Analysis ------------------------------------------------------
a = Analysis(
    [str(REPO / "build" / "launcher.py")],
    pathex=[str(REPO)],
    binaries=[],
    datas=datas,
    hiddenimports=hidden_imports,
    hookspath=[str(REPO / "build" / "hooks")],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[
        # Ship-trim — PyInstaller pulls these in but we never need
        # them, and they add ~80 MB combined.
        "tkinter",
        "matplotlib",
        "scipy",
        "IPython",
        "jupyter",
        "notebook",
        "test",
        "tests",
    ],
    noarchive=False,
)

pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    # Folder mode: binaries and data are gathered by COLLECT below
    # instead of being packed into the executable.
    exclude_binaries=True,
    name="DataTools",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,  # GUI app — no terminal window on Win/Mac
    disable_windowed_traceback=False,
    # None builds for the architecture of the running Python. Set to
    # "universal2" here (not on the command line) for a fat macOS binary.
    target_arch=None,
    icon=ICON,
)

coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="DataTools",
)

# macOS .app bundle wrapper. Only created when building on a Mac;
# this block is a no-op on Win/Linux.
if _sys.platform == "darwin":
    app = BUNDLE(
        coll,
        name="DataTools.app",
        icon=ICON,
        bundle_identifier="com.datatools.desktop",
        info_plist={
            "CFBundleDisplayName": "DataTools",
            "CFBundleVersion": VERSION,
            "CFBundleShortVersionString": VERSION,
            "NSHighResolutionCapable": True,
            # macOS will not show the app in the Dock if this is True.
            # We want the dock icon so the buyer can see the app is
            # running while the browser tab is open.
            "LSUIElement": False,
        },
    )