feat(pdf): probe bundled Tesseract first when running frozen

Adds runtime support for the bundled Tesseract that ships inside the
DataTools installer / portable / AppImage artifacts. When DataTools
is launched from a PyInstaller frozen bundle, the OCR engine now
resolves automatically — no end-user install required.

New helpers in src/pdf_extract.py:
- _bundled_tesseract_path() → Path | None — returns
  <sys._MEIPASS>/tesseract/tesseract[.exe] when getattr(sys,
  "frozen", False) is truthy AND sys._MEIPASS is set; None in dev.
- _bundled_tessdata_dir() → Path | None — same gating, returns
  <sys._MEIPASS>/tesseract/tessdata.
- _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the
  bundled tessdata dir before any pytesseract call; only if frozen,
  dir exists, and the user hasn't already overridden the env var.

Discovery order in ocr_available() / _autodetect_tesseract_path():
1. DATATOOLS_TESSERACT_PATH env override (existing)
2. Bundled binary (NEW — frozen-only)
3. System PATH (existing)
4. Windows well-known install dirs (existing legacy fallback)

In dev (not frozen) every new probe is a no-op so the developer
experience is unchanged.

12 new tests cover frozen vs. non-frozen detection on each platform,
the user-override respect for TESSDATA_PREFIX, autodetect priority
ordering, and the no-bundled-dir graceful path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 18:19:52 +00:00
parent 4d8513b1a3
commit 17faf84aed
2 changed files with 240 additions and 5 deletions

View File

@@ -24,6 +24,7 @@ import io
import os import os
import platform import platform
import re import re
import sys
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
return len(page.words) >= min_words return len(page.words) >= min_words
# ---------------------------------------------------------------------------
# Tesseract discovery
#
# Discovery order (shared with the PyInstaller build agent):
#
# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch)
# 2. Bundled binary inside the PyInstaller frozen bundle
# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only
# present when running from a frozen DataTools installer/portable
# build. No-op in a dev checkout.
# 3. System PATH lookup (``pytesseract.get_tesseract_version()``)
# 4. Windows well-known install dirs (legacy fallback for users who
# installed UB Mannheim's Tesseract-OCR themselves)
#
# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set
# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied
# ``TESSDATA_PREFIX`` is never clobbered.
# ---------------------------------------------------------------------------
def _bundled_tesseract_path() -> Path | None:
"""Return the path to the bundled Tesseract binary, or ``None``.
Only returns a non-None value when running from a PyInstaller
frozen bundle (``sys.frozen`` is truthy AND ``sys._MEIPASS`` is
set). The bundled binary lives at
``<_MEIPASS>/tesseract/tesseract`` (``.exe`` on Windows) per the
contract shared with the build agent.
The file is NOT required to exist for this helper to return a
path — callers ``stat`` / ``.exists()``-check it themselves so a
missing bundled binary is treated the same as "not bundled" and
discovery falls through to PATH lookup.
"""
if not getattr(sys, "frozen", False):
return None
meipass = getattr(sys, "_MEIPASS", None)
if not meipass:
return None
binary = "tesseract.exe" if platform.system() == "Windows" else "tesseract"
return Path(meipass) / "tesseract" / binary
def _bundled_tessdata_dir() -> Path | None:
"""Return the bundled ``tessdata`` directory or ``None``.
Same frozen-state gating as ``_bundled_tesseract_path``; the dir
lives at ``<_MEIPASS>/tesseract/tessdata``. Callers use this to
point Tesseract at the bundled language data via the
``TESSDATA_PREFIX`` env var.
"""
if not getattr(sys, "frozen", False):
return None
meipass = getattr(sys, "_MEIPASS", None)
if not meipass:
return None
return Path(meipass) / "tesseract" / "tessdata"
def _apply_bundled_tessdata_prefix() -> None:
    """Point Tesseract at the bundled ``tessdata`` directory.

    Exports ``TESSDATA_PREFIX`` with the path returned by
    ``_bundled_tessdata_dir()`` so the language data shipped with the
    frozen build is used.

    Nothing is changed when:

    * ``TESSDATA_PREFIX`` is already set to a non-empty value — an
      explicit user choice always wins;
    * ``_bundled_tessdata_dir()`` returns ``None`` (not running from a
      frozen bundle);
    * the bundled location is not an existing directory (e.g. tessdata
      was not packaged for the current platform).
    """
    if os.environ.get("TESSDATA_PREFIX"):
        return

    tessdata = _bundled_tessdata_dir()
    # ``is_dir`` rather than ``exists``: a stray regular file at this
    # location must not be exported as the language-data directory.
    if tessdata is not None and tessdata.is_dir():
        os.environ["TESSDATA_PREFIX"] = str(tessdata)
def _autodetect_tesseract_path() -> str | None: def _autodetect_tesseract_path() -> str | None:
"""Probe well-known install locations for ``tesseract.exe`` on """Locate a Tesseract binary outside the user's ``PATH``.
Windows. No-op on macOS/Linux where Tesseract is on PATH via
the system package manager.""" Tries the bundled binary first (only present in PyInstaller
frozen builds) so installer/portable users get a working OCR
without touching their system. Falls back to the legacy Windows
well-known install locations so users who installed UB
Mannheim's Tesseract-OCR themselves keep working too.
"""
bundled = _bundled_tesseract_path()
if bundled is not None and bundled.exists():
return str(bundled)
if platform.system() != "Windows": if platform.system() != "Windows":
return None return None
candidates = [ candidates = [
@@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]:
"""Return ``(available, reason)`` — is OCR usable right now? """Return ``(available, reason)`` — is OCR usable right now?
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override, Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
then PATH-based lookup, then well-known Windows install then the bundled binary (only present in a frozen build), then
locations. PATH-based lookup, then well-known Windows install locations.
See the module-level discovery block for the full contract.
""" """
try: try:
import pytesseract # noqa: PLC0415 import pytesseract # noqa: PLC0415
except ImportError: except ImportError:
return False, "pytesseract is not installed." return False, "pytesseract is not installed."
# Point Tesseract at the bundled tessdata (if any) BEFORE the
# first ``get_tesseract_version`` call so the bundled language
# data is loaded even when the user happens to also have a
# system Tesseract that we'd otherwise fall through to.
_apply_bundled_tessdata_prefix()
override = os.environ.get("DATATOOLS_TESSERACT_PATH") override = os.environ.get("DATATOOLS_TESSERACT_PATH")
if override: if override:
pytesseract.pytesseract.tesseract_cmd = override pytesseract.pytesseract.tesseract_cmd = override
else:
# Probe the bundled binary BEFORE PATH so frozen builds use
# their own Tesseract instead of any incidental system one.
bundled = _bundled_tesseract_path()
if bundled is not None and bundled.exists():
pytesseract.pytesseract.tesseract_cmd = str(bundled)
try: try:
pytesseract.get_tesseract_version() pytesseract.get_tesseract_version()

View File

@@ -12,9 +12,16 @@ a fixture statement at test time.
from __future__ import annotations from __future__ import annotations
import os
from pathlib import Path
from src import pdf_extract
from src.pdf_extract import ( from src.pdf_extract import (
Page, Page,
WordBox, WordBox,
_apply_bundled_tessdata_prefix,
_bundled_tessdata_dir,
_bundled_tesseract_path,
_extract_account_number, _extract_account_number,
_extract_statement_period, _extract_statement_period,
_find_amount_tokens, _find_amount_tokens,
@@ -456,3 +463,131 @@ class TestYearFromFilename:
def test_empty_filename(self): def test_empty_filename(self):
assert year_from_filename("") is None assert year_from_filename("") is None
assert year_from_filename(None) is None assert year_from_filename(None) is None
class TestBundledTesseractPath:
    """Pin the frozen-bundle layout the runtime helpers expect.

    The helpers are expected to resolve the binary to
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and the language data
    to ``<sys._MEIPASS>/tesseract/tessdata``, and to return ``None``
    whenever the process does not look like a frozen bundle.
    """

    @staticmethod
    def _simulate_frozen(monkeypatch, bundle_root, system=None):
        """Make ``sys`` look like a frozen bundle rooted at *bundle_root*.

        When *system* is given, ``platform.system`` is patched to
        report it.
        """
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(bundle_root), raising=False)
        if system is not None:
            monkeypatch.setattr("platform.system", lambda: system)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # Plain dev checkout: neither attribute exists on ``sys``.
        for attribute in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(attribute, raising=False)

        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Defensive case: the frozen flag is set but ``_MEIPASS`` is
        # absent; the helpers must return ``None`` rather than raise.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)

        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._simulate_frozen(monkeypatch, tmp_path, system="Linux")

        assert _bundled_tesseract_path() == tmp_path / "tesseract" / "tesseract"

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._simulate_frozen(monkeypatch, tmp_path, system="Darwin")

        assert _bundled_tesseract_path() == tmp_path / "tesseract" / "tesseract"

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._simulate_frozen(monkeypatch, tmp_path, system="Windows")

        assert (
            _bundled_tesseract_path()
            == tmp_path / "tesseract" / "tesseract.exe"
        )

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        self._simulate_frozen(monkeypatch, tmp_path)

        assert _bundled_tessdata_dir() == tmp_path / "tesseract" / "tessdata"
class TestAutodetectFavoursBundled:
    """``_autodetect_tesseract_path`` must prefer the bundled binary.

    A frozen build should resolve to its own Tesseract before any
    Windows install location is considered, so it never depends on
    what the user happens to have installed.
    """

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        # Lay out a fake frozen Windows bundle with the binary on disk.
        bundle_root = tmp_path / "bundle"
        tesseract_dir = bundle_root / "tesseract"
        tesseract_dir.mkdir(parents=True)
        bundled_exe = tesseract_dir / "tesseract.exe"
        bundled_exe.touch()

        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")
        # Every path now "exists", including the Program Files
        # candidates; the bundled binary must still be chosen because
        # it is probed first.
        monkeypatch.setattr(Path, "exists", lambda _path: True)

        detected = pdf_extract._autodetect_tesseract_path()

        assert detected == str(bundled_exe)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Dev checkout on a non-Windows platform: nothing to detect.
        for attribute in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(attribute, raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")

        assert pdf_extract._autodetect_tesseract_path() is None
class TestApplyBundledTessdataPrefix:
    """``TESSDATA_PREFIX`` handling of ``_apply_bundled_tessdata_prefix``.

    The bundled language data should be selected in a frozen build,
    but never at the expense of a value the user set themselves.
    """

    @staticmethod
    def _unset_prefix(monkeypatch):
        """Remove ``TESSDATA_PREFIX`` in a way teardown can fully undo.

        ``monkeypatch.delenv(..., raising=False)`` records nothing when
        the variable is already absent, so a value written by the code
        under test would survive the test and leak into later ones.
        Setting a placeholder first makes monkeypatch remember the
        original state (absent or not) and restore it on teardown.
        """
        monkeypatch.setenv("TESSDATA_PREFIX", "placeholder")
        monkeypatch.delenv("TESSDATA_PREFIX")

    def test_no_op_when_not_frozen(self, monkeypatch):
        self._unset_prefix(monkeypatch)
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        # The call below writes to ``os.environ`` directly; the helper
        # guarantees that write is rolled back after the test.
        self._unset_prefix(monkeypatch)

        _apply_bundled_tessdata_prefix()

        assert os.environ.get("TESSDATA_PREFIX") == str(tessdata)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")

        _apply_bundled_tessdata_prefix()

        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen, but the build didn't ship a tessdata dir.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_prefix(monkeypatch)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ