feat(pdf): probe bundled Tesseract first when running frozen

Adds runtime support for the bundled Tesseract that ships inside the
DataTools installer / portable / AppImage artifacts. When DataTools
is launched from a PyInstaller frozen bundle the OCR engine now
resolves automatically — no end-user install required.

New helpers in src/pdf_extract.py:
- _bundled_tesseract_path() → Path | None — returns
  <sys._MEIPASS>/tesseract/tesseract[.exe] when getattr(sys,
  "frozen", False) is truthy AND sys._MEIPASS is set; None in dev.
- _bundled_tessdata_dir() → Path | None — same gating, returns
  <sys._MEIPASS>/tesseract/tessdata.
- _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the
  bundled tessdata dir before any pytesseract call; only if frozen,
  dir exists, and the user hasn't already overridden the env var.

Discovery order in ocr_available() / _autodetect_tesseract_path():
1. DATATOOLS_TESSERACT_PATH env override (existing)
2. Bundled binary (NEW — frozen-only)
3. System PATH (existing)
4. Windows well-known install dirs (existing legacy fallback)

In dev (not frozen) every new probe is a no-op so the developer
experience is unchanged.

12 new tests cover frozen vs. non-frozen detection on each platform,
the user-override respect for TESSDATA_PREFIX, autodetect priority
ordering, and the no-bundled-dir graceful path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-02 18:19:52 +00:00
parent 4d8513b1a3
commit 17faf84aed
2 changed files with 240 additions and 5 deletions

View File

@@ -24,6 +24,7 @@ import io
import os
import platform
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
@@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
return len(page.words) >= min_words
# ---------------------------------------------------------------------------
# Tesseract discovery
#
# Discovery order (shared with the PyInstaller build agent):
#
# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch)
# 2. Bundled binary inside the PyInstaller frozen bundle
# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only
# present when running from a frozen DataTools installer/portable
# build. No-op in a dev checkout.
# 3. System PATH lookup (``pytesseract.get_tesseract_version()``)
# 4. Windows well-known install dirs (legacy fallback for users who
# installed UB Mannheim's Tesseract-OCR themselves)
#
# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set
# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied
# ``TESSDATA_PREFIX`` is never clobbered.
# ---------------------------------------------------------------------------
def _bundled_tesseract_path() -> Path | None:
    """Compute where the bundled Tesseract executable would live.

    A path is produced only when the interpreter reports a frozen
    bundle: ``sys.frozen`` must be truthy and ``sys._MEIPASS`` must be
    a non-empty value. In every other situation (e.g. a dev checkout)
    the result is ``None``.

    The location is ``<_MEIPASS>/tesseract/tesseract``, with a
    ``.exe`` suffix when ``platform.system()`` reports ``"Windows"``.

    This helper does not touch the filesystem: the returned path may
    not exist. Callers are expected to ``.exists()``-check it so that
    a missing bundled binary behaves like "not bundled".

    Returns:
        The expected binary path inside the bundle, or ``None`` when
        not running frozen.
    """
    # Only consult _MEIPASS once we know we are frozen.
    bundle_root = (
        getattr(sys, "_MEIPASS", None)
        if getattr(sys, "frozen", False)
        else None
    )
    if not bundle_root:
        return None

    exe_name = "tesseract"
    if platform.system() == "Windows":
        exe_name += ".exe"
    return Path(bundle_root, "tesseract", exe_name)
def _bundled_tessdata_dir() -> Path | None:
    """Compute where the bundled ``tessdata`` directory would live.

    Uses the same frozen-bundle gating as ``_bundled_tesseract_path``:
    ``sys.frozen`` must be truthy and ``sys._MEIPASS`` must be a
    non-empty value, otherwise ``None`` is returned.

    The directory is ``<_MEIPASS>/tesseract/tessdata``. Its existence
    is not checked here; callers decide what to do when it is missing.

    Returns:
        The expected ``tessdata`` directory inside the bundle, or
        ``None`` when not running frozen.
    """
    running_frozen = bool(getattr(sys, "frozen", False))
    bundle_root = getattr(sys, "_MEIPASS", None)
    if running_frozen and bundle_root:
        return Path(bundle_root, "tesseract", "tessdata")
    return None
def _apply_bundled_tessdata_prefix() -> None:
    """Point Tesseract at the bundled ``tessdata`` directory.

    Sets the ``TESSDATA_PREFIX`` environment variable to the bundled
    path so the Tesseract binary can find the bundled language data.

    A non-empty, user-supplied ``TESSDATA_PREFIX`` is preserved
    untouched: users who explicitly chose their own language data win.

    No-op when ``_bundled_tessdata_dir()`` returns ``None`` (not a
    frozen bundle) or when the bundled path is not an existing
    directory (e.g. tessdata wasn't packaged for this platform).
    """
    if os.environ.get("TESSDATA_PREFIX"):
        return
    tessdata = _bundled_tessdata_dir()
    # ``is_dir()`` rather than ``exists()``: a stray regular file at
    # this path must not be exported as the language-data directory.
    if tessdata is not None and tessdata.is_dir():
        os.environ["TESSDATA_PREFIX"] = str(tessdata)
def _autodetect_tesseract_path() -> str | None:
"""Probe well-known install locations for ``tesseract.exe`` on
Windows. No-op on macOS/Linux where Tesseract is on PATH via
the system package manager."""
"""Locate a Tesseract binary outside the user's ``PATH``.
Tries the bundled binary first (only present in PyInstaller
frozen builds) so installer/portable users get a working OCR
without touching their system. Falls back to the legacy Windows
well-known install locations so users who installed UB
Mannheim's Tesseract-OCR themselves keep working too.
"""
bundled = _bundled_tesseract_path()
if bundled is not None and bundled.exists():
return str(bundled)
if platform.system() != "Windows":
return None
candidates = [
@@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]:
"""Return ``(available, reason)`` — is OCR usable right now?
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
then PATH-based lookup, then well-known Windows install
locations.
then the bundled binary (only present in a frozen build), then
PATH-based lookup, then well-known Windows install locations.
See the module-level discovery block for the full contract.
"""
try:
import pytesseract # noqa: PLC0415
except ImportError:
return False, "pytesseract is not installed."
# Point Tesseract at the bundled tessdata (if any) BEFORE the
# first ``get_tesseract_version`` call so the bundled language
# data is loaded even when the user happens to also have a
# system Tesseract that we'd otherwise fall through to.
_apply_bundled_tessdata_prefix()
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
if override:
pytesseract.pytesseract.tesseract_cmd = override
else:
# Probe the bundled binary BEFORE PATH so frozen builds use
# their own Tesseract instead of any incidental system one.
bundled = _bundled_tesseract_path()
if bundled is not None and bundled.exists():
pytesseract.pytesseract.tesseract_cmd = str(bundled)
try:
pytesseract.get_tesseract_version()