Compare commits
3 Commits
4d8513b1a3
...
b703911df3
| Author | SHA1 | Date | |
|---|---|---|---|
| b703911df3 | |||
| 93ccada974 | |||
| 17faf84aed |
49
.github/workflows/build.yml
vendored
49
.github/workflows/build.yml
vendored
@@ -65,6 +65,30 @@ jobs:
|
||||
pip install -r requirements.txt
|
||||
pip install pyinstaller pillow
|
||||
|
||||
# ---- Tesseract bundling cache --------------------------------
|
||||
# The fetch logic inside build/make_release.py downloads:
|
||||
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
||||
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
||||
# Cache both so iterative CI runs don't re-download. The
|
||||
# cache key hard-codes the pinned Tesseract version + tessdata
|
||||
# source, so bump it by hand whenever the pin in make_release.py changes.
|
||||
- name: Cache Tesseract bundle inputs
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
build/_tesseract
|
||||
build/vendor/tessdata
|
||||
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
||||
|
||||
# ---- Linux: install patchelf so make_release.py can rewrite
|
||||
# RPATH on the bundled tesseract binary. apt-get install
|
||||
# tesseract-ocr is handled inside make_release.py itself. -----
|
||||
- name: Install Linux build prereqs for Tesseract bundling
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y patchelf
|
||||
|
||||
- name: Read version
|
||||
id: version
|
||||
shell: bash
|
||||
@@ -75,7 +99,32 @@ jobs:
|
||||
- name: Generate platform icons
|
||||
run: python build/generate_icons.py
|
||||
|
||||
# Stage Tesseract before PyInstaller. The make_release.py
|
||||
# helpers handle the per-platform fetch (UB-Mannheim on Win,
|
||||
# brew on Mac, apt on Linux) and stage the binary + libs into
|
||||
# build/_tesseract/<platform>/ where the spec picks them up.
|
||||
# We invoke a tiny inline Python so the workflow doesn't have
|
||||
# to know the per-platform target string.
|
||||
- name: Stage Tesseract binary + tessdata
|
||||
shell: bash
|
||||
env:
|
||||
DATATOOLS_PLATFORM: ${{ matrix.platform }}
|
||||
run: |
|
||||
python - <<'PY'
|
||||
import os, sys
|
||||
sys.path.insert(0, "build")
|
||||
from make_release import fetch_tessdata, fetch_tesseract_for_platform
|
||||
target = os.environ["DATATOOLS_PLATFORM"]
|
||||
fetch_tessdata()
|
||||
fetch_tesseract_for_platform(target)
|
||||
PY
|
||||
|
||||
- name: Build PyInstaller bundle
|
||||
shell: bash
|
||||
env:
|
||||
# The spec reads this to find the per-platform staging dir;
|
||||
# see build/datatools.spec for the contract.
|
||||
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
||||
run: pyinstaller build/datatools.spec --clean --noconfirm
|
||||
|
||||
# ---- Per-platform installer packaging ------------------------
|
||||
|
||||
8
.gitignore
vendored
8
.gitignore
vendored
@@ -16,6 +16,14 @@ build/dist/
|
||||
build/icon.ico
|
||||
build/icon.icns
|
||||
build/icon.png
|
||||
|
||||
# Tesseract bundling — fetched at build time, not committed. See
|
||||
# build/vendor/README.md for the canonical URLs and rationale.
|
||||
# - build/_tesseract/ : per-platform binary + DLLs/dylibs staging dir
|
||||
# - build/vendor/tessdata/eng.traineddata : ~16 MB language data
|
||||
build/_tesseract/
|
||||
build/vendor/tessdata/*.traineddata
|
||||
|
||||
.pytest_cache/
|
||||
|
||||
# Claude Code agent worktrees + local settings
|
||||
|
||||
220
LICENSE_TESSERACT.txt
Normal file
220
LICENSE_TESSERACT.txt
Normal file
@@ -0,0 +1,220 @@
|
||||
This license applies to the bundled Tesseract OCR binary distributed
|
||||
inside DataTools installer artifacts (Windows .exe, macOS .dmg, Linux
|
||||
.AppImage) and the corresponding portable .zip downloads.
|
||||
|
||||
Tesseract OCR upstream: https://github.com/tesseract-ocr/tesseract
|
||||
Copyright (C) 2006-2024 Google Inc. and the Tesseract OCR contributors
|
||||
|
||||
The Tesseract OCR binary is distributed under the Apache License,
|
||||
Version 2.0, the full text of which is reproduced verbatim below.
|
||||
|
||||
The bundled `eng.traineddata` data file is the "best" English model
|
||||
from https://github.com/tesseract-ocr/tessdata_best and is licensed
|
||||
under the Apache License, Version 2.0 as well.
|
||||
|
||||
DataTools itself is proprietary and is NOT covered by this license;
|
||||
see LICENSE.txt at the repository root for DataTools' own license.
|
||||
|
||||
================================================================================
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the origin of the Work and
|
||||
reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer, and charge a
|
||||
fee for, acceptance of support, warranty, indemnity, or other
|
||||
liability obligations and/or rights consistent with this License.
|
||||
However, in accepting such obligations, You may act only on Your
|
||||
own behalf and on Your sole responsibility, not on behalf of any
|
||||
other Contributor, and only if You agree to indemnify, defend,
|
||||
and hold each Contributor harmless for any liability incurred by,
|
||||
or claims asserted against, such Contributor by reason of your
|
||||
accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied. See the License for the specific language governing
|
||||
permissions and limitations under the License.
|
||||
@@ -30,7 +30,9 @@ Paquetes precompilados — sin instalar Python, sin permisos de administrador, s
|
||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
||||
|
||||
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~200 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
||||
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
||||
|
||||
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||
|
||||
**Avisos del primer arranque (una sola vez):**
|
||||
- **macOS** sin firma: clic derecho → **Abrir** → confirma. (Las compilaciones firmadas se lo saltan.)
|
||||
|
||||
@@ -30,7 +30,9 @@ Pre-built bundles — no Python install, no admin rights, no internet at runtime
|
||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
||||
|
||||
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~200 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
||||
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
||||
|
||||
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||
|
||||
**First-launch warnings (one-time):**
|
||||
- **macOS** unsigned builds: right-click → **Open** → confirm. (Signed builds skip this.)
|
||||
|
||||
@@ -54,8 +54,11 @@ for buyers (or IT-locked-down machines) that can't run installers:
|
||||
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
||||
|
||||
All six outputs are self-contained: every dependency (Python, pandas,
|
||||
streamlit, pdfplumber, the lot) is frozen into the bundle. The buyer
|
||||
does not need to install Python, pip, or anything else first.
|
||||
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
||||
is frozen into the bundle. The buyer does not need to install Python,
|
||||
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
||||
artifact is roughly **250–300 MB** on disk (up from ~120 MB pre-OCR);
|
||||
unpacked installs run ~300–400 MB once scratch space is counted.
|
||||
|
||||
## Easy-launch surface
|
||||
|
||||
@@ -287,6 +290,56 @@ Mac code-signing in CI requires the cert + private key as a GitHub
|
||||
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
||||
later doc — for v1, sign locally and upload to GitHub Releases.
|
||||
|
||||
## Tesseract bundling (PDF Extractor OCR)
|
||||
|
||||
Frozen artifacts ship a per-platform Tesseract binary plus the English
|
||||
`eng.traineddata` model so scanned-PDF support in the PDF Extractor
|
||||
works out of the box — no separate user install. Source / pip
|
||||
developer setups still need system Tesseract on `PATH`.
|
||||
|
||||
**Layout inside the bundle**:
|
||||
|
||||
```
|
||||
DataTools/ (or DataTools.app/Contents/Resources/ on macOS)
|
||||
└── tesseract/
|
||||
├── tesseract (Linux/macOS binary; tesseract.exe on Windows)
|
||||
└── tessdata/
|
||||
└── eng.traineddata
|
||||
```
|
||||
|
||||
The runtime resolver (in `src/`, owned by the runtime team) walks:
|
||||
|
||||
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||
2. `Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"` — frozen
|
||||
bundles only.
|
||||
3. `tesseract` on `PATH`.
|
||||
4. Windows well-known paths.
|
||||
|
||||
**Where the bytes come from**:
|
||||
|
||||
- **Tessdata** — fetched at build time (gitignored, not committed) into `build/vendor/tessdata/eng.traineddata`
|
||||
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
||||
`datatools.spec` copies it into `tesseract/tessdata/`.
|
||||
- **Binary** — fetched per-platform at build time by
|
||||
`build/make_release.py` from pinned upstream URLs. Current pin:
|
||||
**Tesseract 5.5.0**.
|
||||
|
||||
**Updating Tesseract**:
|
||||
|
||||
1. Bump the version pin and the per-platform fetch URLs in
|
||||
`build/make_release.py`.
|
||||
2. If the model schema changed upstream, refresh
|
||||
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
||||
matching tag.
|
||||
3. Rebuild on each platform (`python build/make_release.py`) and
|
||||
smoke-test a scanned PDF through the PDF Extractor.
|
||||
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
||||
terms change (Apache-2.0 today).
|
||||
|
||||
License attribution for the bundled binary lives at
|
||||
`LICENSE_TESSERACT.txt` at the repo root — it must ship alongside any
|
||||
binary that contains Tesseract.
|
||||
|
||||
## Common pitfalls
|
||||
|
||||
| Symptom | Fix |
|
||||
|
||||
@@ -9,6 +9,11 @@
|
||||
# latest release from https://github.com/AppImage/AppImageKit/releases).
|
||||
#
|
||||
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
|
||||
#
|
||||
# Tesseract bundling: no-op here. The PyInstaller bundle in
|
||||
# dist/DataTools/ already contains tesseract/{tesseract, *.so,
|
||||
# tessdata/eng.traineddata} from the spec's datas; ``cp -R``
|
||||
# below carries it along into the AppDir.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
# -*- mode: python ; coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from PyInstaller.utils.hooks import (
|
||||
collect_all,
|
||||
@@ -103,6 +104,78 @@ datas += [
|
||||
(str(REPO / ".streamlit" / "config.toml"),".streamlit"),
|
||||
]
|
||||
|
||||
# ----- Tesseract OCR bundle ----------------------------------------
|
||||
# ``build/make_release.py`` stages the per-platform Tesseract binary
|
||||
# + its runtime libs (DLLs/dylibs/sos) into
|
||||
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
||||
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
||||
# drops them at the path the runtime expects:
|
||||
#
|
||||
# <bundle>/tesseract/tesseract[.exe]
|
||||
# <bundle>/tesseract/<all dll/dylib/so deps>
|
||||
# <bundle>/tesseract/tessdata/eng.traineddata
|
||||
#
|
||||
# The runtime discovery code in src/pdf_extract.py reads this layout
|
||||
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
||||
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
||||
#
|
||||
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
|
||||
# the right per-platform dir before invoking PyInstaller. For ad-hoc
|
||||
# `pyinstaller build/datatools.spec` runs without the orchestrator,
|
||||
# fall back to the canonical staging path.
|
||||
# Locate the staged Tesseract binary + runtime libs. An explicit
# DATATOOLS_TESS_STAGING wins; otherwise fall back to the per-host
# directory under build/_tesseract/.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if _tess_staging_env:
    _tess_staging = Path(_tess_staging_env)
    if not _tess_staging.is_absolute():
        # A relative value (CI passes ``build/_tesseract/<platform>``) is
        # meant relative to the repo root. Anchor it there: the is_dir()
        # probe below would otherwise resolve it against the current
        # working directory, while PyInstaller resolves relative ``datas``
        # sources against the spec file's own directory, so the two
        # lookups could disagree.
        # NOTE(review): assumes REPO is an absolute path — confirm.
        _tess_staging = REPO / _tess_staging
else:
    # No override: guess the staging dir from the host platform so
    # spec-only builds (without the orchestrator) still work in dev.
    import sys as _sys_for_target
    _target_guess = (
        "win" if _sys_for_target.platform.startswith("win")
        else "mac" if _sys_for_target.platform == "darwin"
        else "linux"
    )
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess

# Shared language-data directory (same for every platform).
_tessdata = REPO / "build" / "vendor" / "tessdata"

if _tess_staging.is_dir() and any(_tess_staging.iterdir()):
    # The whole staging dir (binary + DLL/dylib/so siblings) is added
    # with destination ``tesseract`` inside the bundle.
    datas += [(str(_tess_staging), "tesseract")]
else:
    # Deliberately a warning rather than a hard failure, so the spec
    # still parses before any binaries have been fetched. The bundle
    # is then built without the Tesseract binary.
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/make_release.py (which "
        "calls fetch_tesseract_for_platform) before pyinstaller, or "
        "pre-stage the binary manually."
    )

if (_tessdata / "eng.traineddata").exists():
    # The tessdata directory is added with destination
    # ``tesseract/tessdata`` inside the bundle.
    datas += [(str(_tessdata), "tesseract/tessdata")]
else:
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/make_release.py "
        "or fetch manually per build/vendor/README.md."
    )

# Ship the Apache-2.0 licence text at the bundle root (destination
# ``.``) whenever the file is present at the repo root.
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if _tess_license.exists():
    datas += [(str(_tess_license), ".")]
else:
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )
|
||||
|
||||
# ----- Analysis ------------------------------------------------------
|
||||
|
||||
a = Analysis(
|
||||
@@ -158,6 +231,13 @@ coll = COLLECT(
|
||||
|
||||
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
||||
# this block is a no-op on Win/Linux.
|
||||
#
|
||||
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
|
||||
# COLLECT output (binaries + datas) into the .app's
|
||||
# Contents/Resources tree, so the ``tesseract/`` subdir we built up
|
||||
# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/``
|
||||
# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing
|
||||
# needed.
|
||||
import sys as _sys
|
||||
if _sys.platform == "darwin":
|
||||
app = BUNDLE(
|
||||
|
||||
@@ -63,6 +63,14 @@ Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription
|
||||
Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1
|
||||
|
||||
[Files]
|
||||
; PyInstaller's dist/DataTools/ tree includes:
|
||||
; * DataTools.exe + frozen Python runtime
|
||||
; * tesseract/tesseract.exe + DLLs + tessdata/eng.traineddata
|
||||
; (bundled via build/datatools.spec datas; runtime discovery in
|
||||
; src/pdf_extract.py reads sys._MEIPASS / "tesseract" / ...).
|
||||
; * LICENSE_TESSERACT.txt at the bundle root (Apache-2.0).
|
||||
; The recursesubdirs flag below picks all of those up — no separate
|
||||
; Files: entry needed for tesseract/.
|
||||
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion
|
||||
|
||||
[Icons]
|
||||
|
||||
@@ -10,6 +10,11 @@
|
||||
#
|
||||
# Code signing + notarization happen separately (see build/README.md
|
||||
# "Signing"). This script only handles the packaging step.
|
||||
#
|
||||
# Tesseract bundling: no-op here. The .app already contains
|
||||
# Contents/Resources/tesseract/{tesseract, *.dylib, tessdata/} thanks
|
||||
# to PyInstaller's BUNDLE() carrying the spec's datas through. This
|
||||
# script just wraps the finished .app — no extra steps for OCR.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
@@ -14,6 +14,11 @@
|
||||
# Run after ``pyinstaller build/datatools.spec --clean --noconfirm``
|
||||
# has produced ``dist/DataTools.app``. Output goes to
|
||||
# ``dist/DataTools-<version>-mac-portable.zip``.
|
||||
#
|
||||
# Tesseract bundling: no-op here. The bundled Tesseract binary +
|
||||
# dylibs + tessdata are already inside DataTools.app/Contents/Resources/tesseract/
|
||||
# (placed by PyInstaller's BUNDLE/datas mechanism). ``ditto -c -k``
|
||||
# preserves the whole .app tree.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
@@ -32,17 +32,33 @@ Run from the repo root or from build/ — either works.
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
REPO = Path(__file__).resolve().parent.parent
|
||||
BUILD = REPO / "build"
|
||||
DIST = REPO / "dist"
|
||||
|
||||
# Tesseract bundling. The runtime discovery code in
|
||||
# ``src/pdf_extract.py`` looks for the binary at
|
||||
# ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and tessdata
|
||||
# at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. We stage
|
||||
# everything under ``build/_tesseract/<platform>/`` (gitignored) and
|
||||
# the PyInstaller spec adds that staging dir to ``datas=`` so it lands
|
||||
# at the right place inside the frozen bundle.
|
||||
TESSERACT_VERSION = "5.5.0"
|
||||
TESSDATA_DIR = BUILD / "vendor" / "tessdata"
|
||||
TESSDATA_URL = (
|
||||
"https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
|
||||
)
|
||||
TESSERACT_STAGING = BUILD / "_tesseract"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
|
||||
@@ -192,6 +208,382 @@ def preflight(target: str) -> None:
|
||||
_ok("all prerequisites present")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tesseract bundling — fetch the binary + tessdata at build time.
|
||||
#
|
||||
# We download (not vendor) because:
|
||||
# * Binaries are large (5-40 MB per platform) and license-encumbered
|
||||
# to keep current in git.
|
||||
# * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
|
||||
# bloats clones for contributors who don't touch OCR.
|
||||
#
|
||||
# Caching layout:
|
||||
# build/_tesseract/win/tesseract.exe + DLLs
|
||||
# build/_tesseract/mac/tesseract + dylibs
|
||||
# build/_tesseract/linux/tesseract + libs
|
||||
# build/vendor/tessdata/eng.traineddata (shared across platforms)
|
||||
#
|
||||
# The PyInstaller spec reads ``build/_tesseract/<platform>/`` and the
|
||||
# tessdata dir, then bundles them under ``<bundle>/tesseract/``.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
    """Fetch *url* into *dest* by way of a temporary ``.part`` sibling.

    The payload is streamed to ``<dest>.part`` and only renamed over
    *dest* once it passes the size check, so an interrupted or bogus
    download never leaves a plausible-looking file at *dest*.

    Args:
        url: Location to fetch.
        dest: Final path of the downloaded file; parent dirs are created.
        expected_min_bytes: Smallest size accepted as a real payload.

    Raises:
        RuntimeError: The payload is smaller than *expected_min_bytes*.
        Exception: Whatever the fetch raised, re-raised after the partial
            file has been removed and the failure logged.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(dest.suffix + ".part")
    print(f" GET {url}", flush=True)

    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            with open(partial, "wb") as sink:
                shutil.copyfileobj(response, sink)
    except Exception as exc:  # noqa: BLE001 — bubble any network error up
        if partial.exists():
            partial.unlink()
        _err(f"download failed: {url}\n {exc}")
        raise

    n_bytes = partial.stat().st_size
    if n_bytes >= expected_min_bytes:
        # Rename only after validation so *dest* is always a complete file.
        partial.replace(dest)
        _ok(f"downloaded {dest.name} ({n_bytes / (1024 * 1024):.1f} MB)")
        return

    partial.unlink()
    raise RuntimeError(
        f"downloaded file too small ({n_bytes} bytes < {expected_min_bytes}); "
        f"the URL probably 404'd into an HTML error page."
    )
|
||||
|
||||
|
||||
def fetch_tessdata() -> Path:
    """Ensure ``build/vendor/tessdata/eng.traineddata`` exists; return its path.

    Shared across platforms. Downloaded once and cached. The
    runtime expects this file at ``<bundle>/tesseract/tessdata/eng.traineddata``;
    the PyInstaller spec handles the placement.

    A cached file is reused only if it passes the same minimum-size check
    that a fresh download must pass; anything smaller is treated as a
    truncated fetch and downloaded again.

    Returns:
        Path to the cached ``eng.traineddata``.

    Raises:
        Whatever ``_download`` raises when the fetch fails or the payload
        is too small.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"

    # ~16 MB on disk for the "best" model. 3 MB leaves plenty of slack
    # while still catching HTML error pages and truncated files. The same
    # floor applies to the cache check so a bad file is never reused.
    min_bytes = 3 * 1024 * 1024

    if target.exists():
        cached_bytes = target.stat().st_size
        if cached_bytes >= min_bytes:
            _ok(f"already cached: {target.relative_to(REPO)} "
                f"({cached_bytes / (1024 * 1024):.1f} MB)")
            return target
        _warn(
            f"cached {target.name} is only {cached_bytes} bytes "
            f"(< {min_bytes}); re-downloading"
        )

    _download(TESSDATA_URL, target, expected_min_bytes=min_bytes)
    return target
|
||||
|
||||
|
||||
def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage tesseract.exe + DLLs into *staging*.

    Strategy (no easy stand-alone Windows tarball exists — UB-Mannheim
    ships the canonical Windows builds as self-extracting installers):

    1. Download the installer .exe from the UB-Mannheim mirror (cached
       under ``build/_tesseract/`` between runs).
    2. Extract it with 7-Zip. 7-Zip is preinstalled on
       ``windows-latest`` GitHub Actions runners (`C:\\Program Files\\7-Zip\\7z.exe`).
    3. Copy tesseract.exe + every DLL from the extraction into
       ``staging/``. Nothing else is copied — in particular no tessdata;
       language data is staged separately.
    4. Delete the extraction tree so it does not linger in the staging
       root.

    NOTE(review): this function was originally documented as unpacking an
    Inno Setup installer. 7-Zip's documented installer support is NSIS —
    confirm which format the mirror actually ships.

    The DLL set tesseract.exe needs at runtime (per UB-Mannheim's
    installer script):
      libtesseract-5.dll, libleptonica-6.dll, libgomp-1.dll,
      libstdc++-6.dll, libwinpthread-1.dll, libgcc_s_seh-1.dll,
      liblz4.dll, libjpeg-8.dll, libpng16-16.dll, libtiff-6.dll,
      libwebp-7.dll, libwebpmux-3.dll, libopenjp2-7.dll, zlib1.dll
    The whole tree from the installer is ~120 MB; we copy
    just the .exe + .dll files (~50 MB) since the runtime only
    needs the binary and its direct deps.

    Args:
        staging: Directory that receives ``tesseract.exe`` and the DLLs.

    Raises:
        FileNotFoundError: 7-Zip could not be located.
        RuntimeError: The extracted installer contains no ``tesseract.exe``.
    """
    # UB-Mannheim posts builds under a versioned filename; the exact
    # build revision changes (5.5.0.20241111 at time of writing).
    # We pin a specific rev so reproducible builds don't drift.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    fname = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    url = f"https://digi.bib.uni-mannheim.de/tesseract/{fname}"

    cache = TESSERACT_STAGING / fname
    if not cache.exists():
        _download(url, cache, expected_min_bytes=20 * 1024 * 1024)

    # 7-Zip is preinstalled on windows-latest runners; on a dev box
    # the user installs it (choco install 7zip) or substitutes
    # innoextract. ``shutil.which`` returns an existing path or None, so
    # one existence check on the final candidate covers every case.
    sevenz = (
        shutil.which("7z")
        or shutil.which("7z.exe")
        or r"C:\Program Files\7-Zip\7z.exe"
    )
    if not Path(sevenz).exists():
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{cache} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    extract = TESSERACT_STAGING / "win_extract"
    if extract.exists():
        shutil.rmtree(extract)
    extract.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{extract}", str(cache)])

    staging.mkdir(parents=True, exist_ok=True)
    # The payload may sit in a sub-directory of the extraction, so walk
    # the whole tree and grab tesseract.exe + DLLs wherever they are.
    found_exe = False
    for root, _dirs, files in os.walk(extract):
        for f in files:
            src = Path(root) / f
            lowered = f.lower()
            if lowered == "tesseract.exe":
                shutil.copy2(src, staging / "tesseract.exe")
                found_exe = True
            elif lowered.endswith(".dll"):
                shutil.copy2(src, staging / f)
    if not found_exe:
        # Keep the extraction tree in place: the message points at it.
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {extract}"
        )

    # Everything we need has been copied; the extraction is dead weight
    # (and would otherwise be swept up by anything that archives the
    # staging root, such as a CI cache).
    shutil.rmtree(extract, ignore_errors=True)
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")
|
||||
|
||||
|
||||
def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: use Homebrew. ``brew install tesseract`` provides the
    binary; we copy it plus every Homebrew-provided dylib it links
    against (transitively) into the staging dir, then run
    ``install_name_tool`` on the binary AND on every staged dylib so all
    references between them point at ``@loader_path/<name>``. Without the
    rewrite inside the dylibs, a staged library would still look for its
    own dependencies under the build machine's Homebrew prefix.

    Copies are made owner-writable first: ``shutil.copy2`` preserves the
    source file mode, and ``install_name_tool`` cannot edit a read-only
    file.

    NOTE(review): editing a Mach-O file invalidates its code signature.
    Confirm that a later build step re-signs the staged files (or add an
    ad-hoc ``codesign`` pass) before shipping on Apple Silicon.

    Caveat: ``brew`` must be on PATH (it is on ``macos-latest``
    runners). If it isn't, we surface a helpful error rather than
    fail mysteriously.

    Args:
        staging: Directory that receives ``tesseract`` and its dylibs.

    Raises:
        FileNotFoundError: Homebrew is not installed.
        RuntimeError: tesseract is not on PATH after install, or ``otool``
            fails on the staged binary.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # ``brew install`` is idempotent — fine to run on every build. The
    # version is whatever Homebrew currently ships; this function does
    # not compare it against TESSERACT_VERSION.
    _run(["brew", "install", "tesseract"])

    # Find the binary brew just installed.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    # Homebrew lives under /opt/homebrew/ (Apple Silicon) or /usr/local/
    # (Intel). Anything else (/usr/lib/*, /System/*) ships with macOS.
    brew_prefixes = ("/opt/homebrew/", "/usr/local/")

    def _copy_writable(src: str, dest: Path) -> None:
        """Copy *src* to *dest* and make the copy owner-writable."""
        if dest.exists():
            # A leftover read-only copy would make copy2 fail.
            dest.unlink()
        shutil.copy2(src, dest)
        dest.chmod(dest.stat().st_mode | 0o200)

    def _brew_refs(macho: str) -> list[str]:
        """Return the Homebrew paths listed by ``otool -L`` for *macho*."""
        try:
            listing = subprocess.run(
                ["otool", "-L", macho],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"otool failed: {e.stderr}") from e
        refs = []
        # First line is the file name itself, not a dependency.
        for line in listing.stdout.splitlines()[1:]:
            ref = line.strip().split(" ", 1)[0]
            if ref.startswith(brew_prefixes):
                refs.append(ref)
        return refs

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    _copy_writable(tess_path, bin_path)

    # Staged dylibs keyed by file name. Keying by name (not source path)
    # means a library reached through two different Homebrew paths is
    # copied once.
    staged: dict[str, Path] = {}

    def _stage_with_deps(libpath: str) -> None:
        """Copy *libpath* and, recursively, its Homebrew dependencies."""
        name = Path(libpath).name
        if name in staged or not Path(libpath).exists():
            return
        dest = staging / name
        _copy_writable(libpath, dest)
        staged[name] = dest
        try:
            sub_refs = _brew_refs(libpath)
        except RuntimeError as e:
            _warn(f"could not list dependencies of {libpath}: {e}")
            return
        for sub_ref in sub_refs:
            _stage_with_deps(sub_ref)

    # Phase 1: copy everything. No file is edited until all copies exist,
    # so ``otool`` always sees pristine load commands.
    for ref in _brew_refs(str(bin_path)):
        _stage_with_deps(ref)

    def _relocate(macho: Path, *, is_dylib: bool) -> None:
        """Point *macho*'s Homebrew references at the staged siblings."""
        cmd = ["install_name_tool"]
        if is_dylib:
            cmd += ["-id", f"@loader_path/{macho.name}"]
        try:
            refs = _brew_refs(str(macho))
        except RuntimeError as e:
            _warn(f"could not inspect {macho.name}: {e}")
            return
        for ref in refs:
            ref_name = Path(ref).name
            # Skip a dylib's own ID line and anything we did not stage.
            if ref_name == macho.name or ref_name not in staged:
                continue
            cmd += ["-change", ref, f"@loader_path/{ref_name}"]
        if len(cmd) == 1:
            return
        cmd.append(str(macho))
        try:
            subprocess.run(cmd, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            # Best-effort, but never silent: a failed rewrite means the
            # staged file still points at the build machine's Homebrew.
            _warn(f"install_name_tool failed for {macho.name}: {e.stderr}")

    # Phase 2: rewrite load paths in the binary and in every dylib.
    _relocate(bin_path, is_dylib=False)
    for dylib in staged.values():
        _relocate(dylib, is_dylib=True)

    _ok(f"staged macOS tesseract + {len(staged)} dylibs into {staging.relative_to(REPO)}")
|
||||
|
||||
|
||||
def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage tesseract + .so files into *staging* on Linux.

    Strategy: use the ``tesseract`` already on PATH; if there is none and
    ``apt-get`` is available, install ``tesseract-ocr libtesseract5``
    first. Then copy the binary + every non-skipped .so that ``ldd``
    resolves for it into staging. When ``patchelf`` is available, the
    binary and every staged .so get ``$ORIGIN`` as their run path so they
    find each other after relocation. The libraries are patched as well
    as the binary because patchelf writes ``DT_RUNPATH`` by default, and
    a run path on the executable is not consulted when one of its
    libraries loads its own dependencies.

    Args:
        staging: Directory that receives ``tesseract`` and its libraries.

    Raises:
        FileNotFoundError: Neither apt-get nor tesseract is available.
        RuntimeError: tesseract is not on PATH after install, or ``ldd``
            fails on the staged binary.
    """
    if not shutil.which("apt-get") and not shutil.which("tesseract"):
        _err(
            "Neither apt-get nor a pre-installed tesseract found. On "
            "ubuntu-latest runners both are present. On other distros "
            "install tesseract-ocr via your package manager and re-run "
            "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
        )
        raise FileNotFoundError("tesseract")

    if shutil.which("apt-get") and not shutil.which("tesseract"):
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)

    # Collect .so dependencies via ldd. Skip the vDSO, the dynamic linker
    # and the core glibc libraries listed below — shipping those can
    # cause GLIBC mismatch errors on other distros. Everything else ldd
    # resolves is bundled, including libstdc++ / libgcc_s, which are NOT
    # in this list. NOTE(review): an earlier comment said those two were
    # skipped as well — decide which behaviour is intended.
    SKIP_PREFIXES = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(bin_path)],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    staged_libs: list[Path] = []
    for line in ldd.stdout.splitlines():
        # Format: " libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(SKIP_PREFIXES):
            continue
        resolved = parts[1].strip()
        if resolved.startswith("not found"):
            # Surface it: the bundle will be missing this library.
            _warn(f"ldd could not resolve {soname}; it will not be bundled")
            continue
        path_part = resolved.split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            continue
        dest = staging / Path(path_part).name
        shutil.copy2(path_part, dest)
        staged_libs.append(dest)

    # patchelf is optional. If it is absent the staged files keep their
    # original search paths and the bundle depends on the runtime
    # environment (e.g. LD_LIBRARY_PATH) to locate the libraries.
    if shutil.which("patchelf"):
        try:
            for elf in (bin_path, *staged_libs):
                _run(["patchelf", "--set-rpath", "$ORIGIN", str(elf)])
        except SystemExit:
            _warn("patchelf rpath rewrite failed — relying on LD_LIBRARY_PATH at runtime")

    _ok(f"staged Linux tesseract + {len(staged_libs)} .so files into {staging.relative_to(REPO)}")
|
||||
|
||||
|
||||
def fetch_tesseract_for_platform(target: str) -> Path:
    """Stage the per-platform Tesseract binary + libs into ``build/_tesseract/<target>/``.

    Returns the staging dir path. The PyInstaller spec adds this dir
    (plus tessdata) to its ``datas=`` so the bundle ends up with
    everything under ``<bundle>/tesseract/`` where the runtime
    discovery code expects it.

    Honours ``TESSERACT_SKIP_FETCH=1`` — set this when you've
    pre-staged the binary by hand (offline build, behind a proxy,
    custom build of tesseract, etc.). The script still verifies the
    binary is present and surfaces a helpful error if not.

    A staging dir is considered complete when the executable exists in
    it. To keep that shortcut trustworthy, the staging dir is removed if
    the platform fetcher fails part-way, so the next run starts clean
    instead of reusing a binary without its libraries.

    Args:
        target: One of ``"win"``, ``"mac"`` or ``"linux"``.

    Returns:
        The staging directory for *target*.

    Raises:
        SystemExit: Exit code 1 when the binary is missing (either under
            ``TESSERACT_SKIP_FETCH=1`` or after a fetch); exit code 2 for
            an unknown *target*.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_name = "tesseract.exe" if target == "win" else "tesseract"
    exe_path = staging / exe_name

    if os.environ.get("TESSERACT_SKIP_FETCH") == "1":
        if not exe_path.exists():
            _err(
                f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
                "Pre-stage the binary + its libs into that dir, then re-run."
            )
            sys.exit(1)
        _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
        return staging

    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetcher = fetchers.get(target)
    if fetcher is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)

    try:
        fetcher(staging)
    except BaseException:
        # The macOS/Linux fetchers copy the executable before its
        # libraries. Leaving a half-populated dir behind would satisfy the
        # "already staged" check above on the next run. BaseException so
        # Ctrl-C and sys.exit() from helpers are covered too.
        shutil.rmtree(staging, ignore_errors=True)
        raise

    if not exe_path.exists():
        _err(
            f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
            "Inspect the logs above; you may need to pre-stage the binary manually."
        )
        sys.exit(1)
    return staging
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build steps
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -202,7 +594,7 @@ def step_generate_icons() -> None:
|
||||
_run([sys.executable, str(BUILD / "generate_icons.py")])
|
||||
|
||||
|
||||
def step_pyinstaller(clean: bool) -> None:
|
||||
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
|
||||
_step("pyinstaller bundle")
|
||||
# Use ``python -m PyInstaller`` so we don't depend on the binary
|
||||
# being on PATH (Windows users frequently see this — pip's
|
||||
@@ -212,7 +604,14 @@ def step_pyinstaller(clean: bool) -> None:
|
||||
"--noconfirm"]
|
||||
if clean:
|
||||
cmd.append("--clean")
|
||||
_run(cmd)
|
||||
# The spec reads ``DATATOOLS_TESS_STAGING`` to find the per-platform
|
||||
# tesseract staging dir. Passing it via env keeps the spec file
|
||||
# platform-agnostic — the spec doesn't need to detect win/mac/linux
|
||||
# itself; the orchestrator already did.
|
||||
env = os.environ.copy()
|
||||
if target:
|
||||
env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)
|
||||
_run(cmd, env=env)
|
||||
|
||||
|
||||
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
|
||||
@@ -331,7 +730,17 @@ def main() -> int:
|
||||
shutil.rmtree(DIST)
|
||||
|
||||
step_generate_icons()
|
||||
step_pyinstaller(clean=args.clean)
|
||||
|
||||
# Stage Tesseract OCR before PyInstaller runs. The spec reads
|
||||
# ``build/_tesseract/<target>/`` + ``build/vendor/tessdata/`` and
|
||||
# bundles them under ``<bundle>/tesseract/`` so the runtime
|
||||
# discovery in src/pdf_extract.py finds them at:
|
||||
# Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"
|
||||
# Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"
|
||||
fetch_tessdata()
|
||||
fetch_tesseract_for_platform(target)
|
||||
|
||||
step_pyinstaller(clean=args.clean, target=target)
|
||||
|
||||
if target == "win":
|
||||
outputs = step_package_win(version, do_installer, do_portable)
|
||||
|
||||
62
build/vendor/README.md
vendored
Normal file
62
build/vendor/README.md
vendored
Normal file
@@ -0,0 +1,62 @@
|
||||
# build/vendor/ — third-party bundle inputs (fetched at build time)
|
||||
|
||||
This tree holds the third-party assets that get bundled into the
|
||||
PyInstaller artifacts but that we deliberately do **not** keep in git
|
||||
(too large / license-encumbered / re-fetchable on demand).
|
||||
|
||||
The build pipeline (`build/make_release.py`) populates everything in
|
||||
here before the PyInstaller step. The contents are git-ignored except
|
||||
for this README.
|
||||
|
||||
## tessdata/
|
||||
|
||||
Holds the Tesseract language data file(s) used by the PDF Extractor
|
||||
OCR fallback. Only English is bundled today.
|
||||
|
||||
### Canonical source
|
||||
|
||||
We use the **"best" model** from `tesseract-ocr/tessdata_best` (LSTM,
|
||||
slower but higher accuracy than the legacy `tessdata` set, and only
|
||||
~12 MB compressed → ~16 MB uncompressed):
|
||||
|
||||
```
|
||||
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||
```
|
||||
|
||||
There is also `tessdata_fast/` (~4 MB, lower accuracy) if you ever
|
||||
want to optimise for bundle size over recognition quality. For bank
|
||||
statements (the only OCR use case so far), the extra accuracy of the
|
||||
`_best` model is worth the 10 MB.
|
||||
|
||||
### Why we don't vendor it in git
|
||||
|
||||
* ~16 MB binary file — bloats clone times for everyone, including
|
||||
contributors who never touch the OCR code path.
|
||||
* Apache-2.0-licensed and stable; the file rarely changes upstream
|
||||
(last touched 2021), so a build-time fetch is safe.
|
||||
* The Tesseract project explicitly distributes these via GitHub
|
||||
raw URLs — they're meant to be downloaded, not redistributed
|
||||
through other repos.
|
||||
|
||||
### How it gets populated
|
||||
|
||||
`build/make_release.py::fetch_tessdata()` checks for
|
||||
`build/vendor/tessdata/eng.traineddata` on every run. If it's
|
||||
missing, the script downloads it from the canonical URL above and
|
||||
caches it here. Subsequent builds reuse the cached file.
|
||||
|
||||
On CI, the directory is restored from the GitHub Actions cache so we
|
||||
don't pay the download cost on every run (`.github/workflows/build.yml`
|
||||
caches `build/vendor/tessdata/` keyed on the URL above).
|
||||
|
||||
## Manual one-time fetch (if you're offline or behind a proxy)
|
||||
|
||||
```bash
|
||||
mkdir -p build/vendor/tessdata
|
||||
curl -L -o build/vendor/tessdata/eng.traineddata \
|
||||
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||
```
|
||||
|
||||
Verify the file is non-empty and starts with the magic bytes
|
||||
`b"\x00\x00\x00\x00"` followed by a header that `pytesseract` can
|
||||
read; the script does a basic sanity check after download.
|
||||
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
@@ -296,6 +296,37 @@ GUI / CLI handlers: use `format_for_user(exc, context="...")` to render.
|
||||
|
||||
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
||||
|
||||
## PDF Extractor — bundled Tesseract
|
||||
|
||||
Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
||||
|
||||
**Runtime layout (frozen bundles)**:
|
||||
|
||||
| Resource | Path |
|
||||
|---|---|
|
||||
| Tesseract binary | `Path(sys._MEIPASS) / "tesseract" / "tesseract"` (Linux/macOS), `…/tesseract/tesseract.exe` (Windows) |
|
||||
| Tessdata directory | `Path(sys._MEIPASS) / "tesseract" / "tessdata"` |
|
||||
| English model | `Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"` |
|
||||
|
||||
**Discovery order** (PDF Extractor runtime):
|
||||
|
||||
1. `DATATOOLS_TESSERACT_PATH` env var (override — explicit path to a `tesseract` binary).
|
||||
2. Bundled path under `sys._MEIPASS` (frozen bundles only — falls through to step 3 otherwise).
|
||||
3. `tesseract` on `PATH` (developer setups, source checkouts).
|
||||
4. Windows well-known locations (`C:\Program Files\Tesseract-OCR\tesseract.exe`, etc.).
|
||||
|
||||
**Where the bytes come from**:
|
||||
|
||||
- **Tessdata** is fetched at build time by `build/make_release.py::fetch_tessdata()` into `build/vendor/tessdata/eng.traineddata` (git-ignored, not committed) — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
||||
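- **Tesseract binary** is fetched at build time by `build/make_release.py`. The Windows installer URL is pinned in that script (currently **Tesseract 5.5.0**); macOS and Linux builds stage whatever `brew install tesseract` / `apt-get install tesseract-ocr` provides on the build host.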
|
||||
|
||||
**To update Tesseract**:
|
||||
|
||||
1. Bump the version pin + the per-platform fetch URLs in `build/make_release.py`.
|
||||
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
||||
3. Rebuild on each platform (`python build/make_release.py`) and smoke-test a scanned-PDF run through the PDF Extractor before tagging the release.
|
||||
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
||||
|
||||
## Tests
|
||||
|
||||
```bash
|
||||
|
||||
@@ -122,6 +122,17 @@ Tag a release → 3 platform artifacts upload to GitHub Releases. Manual: copy t
|
||||
|
||||
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
||||
|
||||
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
||||
|
||||
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. The binary is fetched per platform by `build/make_release.py`; tessdata is downloaded at build time into `build/vendor/tessdata/eng.traineddata` (git-ignored). License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
||||
|
||||
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
||||
|
||||
1. `DATATOOLS_TESSERACT_PATH` env var override.
|
||||
2. Bundled path under `sys._MEIPASS / "tesseract" /` (frozen bundles only).
|
||||
3. `tesseract` on `PATH` (source / pip developer environments).
|
||||
4. Windows well-known locations.
|
||||
|
||||
## 4. Libraries
|
||||
|
||||
| Purpose | Library |
|
||||
|
||||
@@ -103,7 +103,9 @@ La ventana del lanzador queda abierta en segundo plano. Cerrarla detiene el serv
|
||||
|
||||
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
||||
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
||||
- ~400 MB de espacio libre en disco (el paquete ocupa ~200 MB; el resto es espacio de trabajo para CSV grandes).
|
||||
- ~500 MB de espacio libre en disco (el paquete ocupa ~300 MB; el resto es espacio de trabajo para CSV grandes).
|
||||
|
||||
**OCR para PDFs escaneados viene incluido** — Tesseract 5.5 y el modelo en inglés `eng.traineddata` vienen dentro de cada instalador / portable / AppImage. La ruta de extracción de PDFs escaneados del Extractor de PDF funciona sin configuración adicional; no hace falta instalar nada por separado. (Quien ejecute desde un checkout con `pip install -r requirements.txt` sigue necesitando Tesseract del sistema en el `PATH` — ver [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract) (solo en inglés).)
|
||||
|
||||
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
||||
|
||||
|
||||
@@ -103,7 +103,9 @@ The launcher window stays open in the background. Closing it stops the server
|
||||
|
||||
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
||||
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
||||
- ~400 MB free disk space (the bundle itself is ~200 MB; the rest is working scratch space for large CSVs).
|
||||
- ~500 MB free disk space (the bundle itself is ~300 MB; the rest is working scratch space for large CSVs).
|
||||
|
||||
**OCR for scanned PDFs is bundled** — Tesseract 5.5 + the English `eng.traineddata` model ship inside every installer / portable / AppImage. The PDF Extractor's scanned-statement path works out of the box; no separate install required. (Developers running from a `pip install -r requirements.txt` checkout still need system Tesseract on `PATH` — see [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract).)
|
||||
|
||||
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import io
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
|
||||
return len(page.words) >= min_words
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tesseract discovery
|
||||
#
|
||||
# Discovery order (shared with the PyInstaller build agent):
|
||||
#
|
||||
# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch)
|
||||
# 2. Bundled binary inside the PyInstaller frozen bundle
|
||||
# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only
|
||||
# present when running from a frozen DataTools installer/portable
|
||||
# build. No-op in a dev checkout.
|
||||
# 3. System PATH lookup (``pytesseract.get_tesseract_version()``)
|
||||
# 4. Windows well-known install dirs (legacy fallback for users who
|
||||
# installed UB Mannheim's Tesseract-OCR themselves)
|
||||
#
|
||||
# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set
|
||||
# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied
|
||||
# ``TESSDATA_PREFIX`` is never clobbered.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _bundled_tesseract_path() -> Path | None:
    """Compute where a frozen bundle keeps its Tesseract executable.

    A path is produced only inside a PyInstaller frozen bundle, i.e.
    when ``sys.frozen`` is truthy and ``sys._MEIPASS`` is set; otherwise
    the result is ``None``. The location is
    ``<_MEIPASS>/tesseract/tesseract``, with ``.exe`` appended on
    Windows.

    The returned path is not checked for existence. Callers test it
    themselves, so a bundle without the binary behaves like "not
    bundled" and discovery moves on to the next strategy.
    """
    is_frozen = getattr(sys, "frozen", False)
    bundle_root = getattr(sys, "_MEIPASS", None) if is_frozen else None
    if not bundle_root:
        return None

    exe_name = "tesseract"
    if platform.system() == "Windows":
        exe_name += ".exe"
    return Path(bundle_root, "tesseract", exe_name)
|
||||
|
||||
|
||||
def _bundled_tessdata_dir() -> Path | None:
    """Compute where a frozen bundle keeps its ``tessdata`` directory.

    Gated exactly like ``_bundled_tesseract_path``: outside a frozen
    bundle (``sys.frozen`` falsy or ``sys._MEIPASS`` unset) the result is
    ``None``; inside one it is ``<_MEIPASS>/tesseract/tessdata``. The
    directory is not checked for existence. Callers feed the value to
    ``TESSDATA_PREFIX`` so Tesseract reads the bundled language data.
    """
    is_frozen = getattr(sys, "frozen", False)
    bundle_root = getattr(sys, "_MEIPASS", None) if is_frozen else None
    if not bundle_root:
        return None
    return Path(bundle_root, "tesseract", "tessdata")
|
||||
|
||||
|
||||
def _apply_bundled_tessdata_prefix() -> None:
    """Export ``TESSDATA_PREFIX`` for the bundled language data.

    When the process runs from a frozen bundle whose ``tessdata``
    directory exists, ``TESSDATA_PREFIX`` is set to that directory so
    Tesseract loads the bundled ``eng.traineddata``.

    Nothing happens when ``TESSDATA_PREFIX`` already holds a non-empty
    value (an explicit user choice wins), when not running frozen, or
    when the bundled directory is absent.
    """
    # Respect a value the user exported themselves.
    if os.environ.get("TESSDATA_PREFIX"):
        return

    bundled_dir = _bundled_tessdata_dir()
    if bundled_dir is None:
        return
    if not bundled_dir.exists():
        return
    os.environ["TESSDATA_PREFIX"] = str(bundled_dir)
|
||||
|
||||
|
||||
def _autodetect_tesseract_path() -> str | None:
|
||||
"""Probe well-known install locations for ``tesseract.exe`` on
|
||||
Windows. No-op on macOS/Linux where Tesseract is on PATH via
|
||||
the system package manager."""
|
||||
"""Locate a Tesseract binary outside the user's ``PATH``.
|
||||
|
||||
Tries the bundled binary first (only present in PyInstaller
|
||||
frozen builds) so installer/portable users get a working OCR
|
||||
without touching their system. Falls back to the legacy Windows
|
||||
well-known install locations so users who installed UB
|
||||
Mannheim's Tesseract-OCR themselves keep working too.
|
||||
"""
|
||||
bundled = _bundled_tesseract_path()
|
||||
if bundled is not None and bundled.exists():
|
||||
return str(bundled)
|
||||
|
||||
if platform.system() != "Windows":
|
||||
return None
|
||||
candidates = [
|
||||
@@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]:
|
||||
"""Return ``(available, reason)`` — is OCR usable right now?
|
||||
|
||||
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
|
||||
then PATH-based lookup, then well-known Windows install
|
||||
locations.
|
||||
then the bundled binary (only present in a frozen build), then
|
||||
PATH-based lookup, then well-known Windows install locations.
|
||||
See the module-level discovery block for the full contract.
|
||||
"""
|
||||
try:
|
||||
import pytesseract # noqa: PLC0415
|
||||
except ImportError:
|
||||
return False, "pytesseract is not installed."
|
||||
|
||||
# Point Tesseract at the bundled tessdata (if any) BEFORE the
|
||||
# first ``get_tesseract_version`` call so the bundled language
|
||||
# data is loaded even when the user happens to also have a
|
||||
# system Tesseract that we'd otherwise fall through to.
|
||||
_apply_bundled_tessdata_prefix()
|
||||
|
||||
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
|
||||
if override:
|
||||
pytesseract.pytesseract.tesseract_cmd = override
|
||||
else:
|
||||
# Probe the bundled binary BEFORE PATH so frozen builds use
|
||||
# their own Tesseract instead of any incidental system one.
|
||||
bundled = _bundled_tesseract_path()
|
||||
if bundled is not None and bundled.exists():
|
||||
pytesseract.pytesseract.tesseract_cmd = str(bundled)
|
||||
|
||||
try:
|
||||
pytesseract.get_tesseract_version()
|
||||
|
||||
@@ -12,9 +12,16 @@ a fixture statement at test time.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from src import pdf_extract
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
_apply_bundled_tessdata_prefix,
|
||||
_bundled_tessdata_dir,
|
||||
_bundled_tesseract_path,
|
||||
_extract_account_number,
|
||||
_extract_statement_period,
|
||||
_find_amount_tokens,
|
||||
@@ -456,3 +463,131 @@ class TestYearFromFilename:
|
||||
def test_empty_filename(self):
|
||||
assert year_from_filename("") is None
|
||||
assert year_from_filename(None) is None
|
||||
|
||||
|
||||
class TestBundledTesseractPath:
    """Pin where the runtime looks for a Tesseract shipped inside a frozen build.

    The packaging side places the executable at
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and the language data
    at ``<sys._MEIPASS>/tesseract/tessdata``; the helpers under test
    must resolve exactly those locations."""

    @staticmethod
    def _simulate_frozen(monkeypatch, bundle_root, system_name=None):
        """Make the interpreter look like a PyInstaller bundle rooted at *bundle_root*.

        When *system_name* is given, ``platform.system`` is stubbed to
        report it so the executable-suffix branch can be exercised."""
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(bundle_root), raising=False)
        if system_name is not None:
            monkeypatch.setattr("platform.system", lambda: system_name)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # Plain dev interpreter: neither PyInstaller marker is present.
        for marker in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(marker, raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Defensive case: ``sys.frozen`` is truthy but ``_MEIPASS`` is
        # absent. Real PyInstaller bundles always set both, yet the
        # helpers must still answer ``None`` rather than blow up.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._simulate_frozen(monkeypatch, tmp_path, "Linux")
        wanted = tmp_path.joinpath("tesseract", "tesseract")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._simulate_frozen(monkeypatch, tmp_path, "Darwin")
        wanted = tmp_path.joinpath("tesseract", "tesseract")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._simulate_frozen(monkeypatch, tmp_path, "Windows")
        wanted = tmp_path.joinpath("tesseract", "tesseract.exe")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        # ``platform.system`` is deliberately left alone here, exactly
        # as the tessdata lookup is expected not to depend on it.
        self._simulate_frozen(monkeypatch, tmp_path)
        wanted = tmp_path.joinpath("tesseract", "tessdata")
        assert _bundled_tessdata_dir() == wanted
|
||||
|
||||
|
||||
class TestAutodetectFavoursBundled:
    """When a bundled binary exists, ``_autodetect_tesseract_path``
    should return it BEFORE falling through to Windows install
    locations — frozen builds shouldn't depend on the user's
    system tesseract even on Windows."""

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        # Simulate a frozen Windows build with a bundled binary on disk.
        # The file is created BEFORE ``Path.exists`` is stubbed so the
        # filesystem setup runs against the real implementation.
        bundle_root = tmp_path / "bundle"
        bundled_bin = bundle_root / "tesseract" / "tesseract.exe"
        bundled_bin.parent.mkdir(parents=True)
        bundled_bin.write_bytes(b"")
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")
        # Pretend the Program Files install also exists — bundled
        # should still win because we probe it first. The stub accepts
        # arbitrary extra arguments because the real ``Path.exists``
        # takes a keyword-only ``follow_symlinks`` on Python 3.12+; a
        # narrower stub would raise ``TypeError`` for such callers
        # instead of exercising the discovery order.
        monkeypatch.setattr(
            Path, "exists", lambda self, *args, **kwargs: True,
        )
        assert pdf_extract._autodetect_tesseract_path() == str(bundled_bin)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Dev: not frozen, not Windows → no candidate at all.
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None
|
||||
|
||||
|
||||
class TestApplyBundledTessdataPrefix:
    """``TESSDATA_PREFIX`` env var handling — bundled data should be
    pointed at without clobbering a user override."""

    @staticmethod
    def _unset_tessdata_prefix(monkeypatch):
        """Remove ``TESSDATA_PREFIX`` and guarantee it is restored on teardown.

        ``monkeypatch.delenv(..., raising=False)`` records nothing when
        the variable is already absent, so a value written afterwards by
        the code under test would survive the test and leak into the
        rest of the session. Setting a placeholder first forces an undo
        entry; teardown then puts the environment back exactly as it
        was, whether or not the variable existed beforehand."""
        monkeypatch.setenv("TESSDATA_PREFIX", "")
        monkeypatch.delenv("TESSDATA_PREFIX")

    def test_no_op_when_not_frozen(self, monkeypatch):
        self._unset_tessdata_prefix(monkeypatch)
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        # This is the one test where the code under test is expected to
        # write the variable, so the undo entry registered here is what
        # keeps the temporary path out of later tests.
        self._unset_tessdata_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert os.environ.get("TESSDATA_PREFIX") == str(tessdata)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        # A pre-existing value stands in for a user's explicit choice.
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")
        _apply_bundled_tessdata_prefix()
        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen, but the build didn't ship a tessdata dir.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_tessdata_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ
|
||||
|
||||
Reference in New Issue
Block a user