feat(pdf): probe bundled Tesseract first when running frozen
Adds runtime support for the bundled Tesseract that ships inside the DataTools installer / portable / AppImage artifacts. When DataTools is launched from a PyInstaller frozen bundle, the OCR engine now resolves automatically — no end-user install required.

New helpers in src/pdf_extract.py:
- _bundled_tesseract_path() → Path | None — returns <sys._MEIPASS>/tesseract/tesseract[.exe] when getattr(sys, "frozen", False) is truthy AND sys._MEIPASS is set; None in dev.
- _bundled_tessdata_dir() → Path | None — same gating, returns <sys._MEIPASS>/tesseract/tessdata.
- _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the bundled tessdata dir before any pytesseract call; only if frozen, the dir exists, and the user hasn't already overridden the env var.

Discovery order in ocr_available() / _autodetect_tesseract_path():
1. DATATOOLS_TESSERACT_PATH env override (existing)
2. Bundled binary (NEW — frozen-only)
3. System PATH (existing)
4. Windows well-known install dirs (existing legacy fallback)

In dev (not frozen) every new probe is a no-op, so the developer experience is unchanged.

12 new tests cover frozen vs. non-frozen detection on each platform, respect for a user-supplied TESSDATA_PREFIX, autodetect priority ordering, and the graceful path when no bundled dir exists.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -24,6 +24,7 @@ import io
|
|||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
|
|||||||
return len(page.words) >= min_words
|
return len(page.words) >= min_words
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tesseract discovery
|
||||||
|
#
|
||||||
|
# Discovery order (shared with the PyInstaller build agent):
|
||||||
|
#
|
||||||
|
# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch)
|
||||||
|
# 2. Bundled binary inside the PyInstaller frozen bundle
|
||||||
|
# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only
|
||||||
|
# present when running from a frozen DataTools installer/portable
|
||||||
|
# build. No-op in a dev checkout.
|
||||||
|
# 3. System PATH lookup (``pytesseract.get_tesseract_version()``)
|
||||||
|
# 4. Windows well-known install dirs (legacy fallback for users who
|
||||||
|
# installed UB Mannheim's Tesseract-OCR themselves)
|
||||||
|
#
|
||||||
|
# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set
|
||||||
|
# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied
|
||||||
|
# ``TESSDATA_PREFIX`` is never clobbered.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _bundled_tesseract_path() -> Path | None:
|
||||||
|
"""Return the path to the bundled Tesseract binary, or ``None``.
|
||||||
|
|
||||||
|
Only returns a non-None value when running from a PyInstaller
|
||||||
|
frozen bundle (``sys.frozen`` is truthy AND ``sys._MEIPASS`` is
|
||||||
|
set). The bundled binary lives at
|
||||||
|
``<_MEIPASS>/tesseract/tesseract`` (``.exe`` on Windows) per the
|
||||||
|
contract shared with the build agent.
|
||||||
|
|
||||||
|
The file is NOT required to exist for this helper to return a
|
||||||
|
path — callers ``stat`` / ``.exists()``-check it themselves so a
|
||||||
|
missing bundled binary is treated the same as "not bundled" and
|
||||||
|
discovery falls through to PATH lookup.
|
||||||
|
"""
|
||||||
|
if not getattr(sys, "frozen", False):
|
||||||
|
return None
|
||||||
|
meipass = getattr(sys, "_MEIPASS", None)
|
||||||
|
if not meipass:
|
||||||
|
return None
|
||||||
|
binary = "tesseract.exe" if platform.system() == "Windows" else "tesseract"
|
||||||
|
return Path(meipass) / "tesseract" / binary
|
||||||
|
|
||||||
|
|
||||||
|
def _bundled_tessdata_dir() -> Path | None:
|
||||||
|
"""Return the bundled ``tessdata`` directory or ``None``.
|
||||||
|
|
||||||
|
Same frozen-state gating as ``_bundled_tesseract_path``; the dir
|
||||||
|
lives at ``<_MEIPASS>/tesseract/tessdata``. Callers use this to
|
||||||
|
point Tesseract at the bundled language data via the
|
||||||
|
``TESSDATA_PREFIX`` env var.
|
||||||
|
"""
|
||||||
|
if not getattr(sys, "frozen", False):
|
||||||
|
return None
|
||||||
|
meipass = getattr(sys, "_MEIPASS", None)
|
||||||
|
if not meipass:
|
||||||
|
return None
|
||||||
|
return Path(meipass) / "tesseract" / "tessdata"
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_bundled_tessdata_prefix() -> None:
    """Point Tesseract at the bundled ``tessdata`` directory.

    Sets ``TESSDATA_PREFIX`` to the bundled path so the frozen
    Tesseract binary picks up the bundled ``eng.traineddata``. A
    user-supplied (non-empty) ``TESSDATA_PREFIX`` is preserved
    untouched — power users who explicitly chose their own language
    data win. An empty value is treated the same as unset.

    No-op outside a frozen bundle, or if the bundled path is not an
    existing directory (e.g. tessdata wasn't packaged for the current
    platform).
    """
    if os.environ.get("TESSDATA_PREFIX"):
        return
    tessdata = _bundled_tessdata_dir()
    # ``is_dir`` rather than ``exists``: a stray regular file at this
    # path must not be exported as the language-data prefix.
    if tessdata is not None and tessdata.is_dir():
        os.environ["TESSDATA_PREFIX"] = str(tessdata)
|
||||||
|
|
||||||
|
|
||||||
def _autodetect_tesseract_path() -> str | None:
|
def _autodetect_tesseract_path() -> str | None:
|
||||||
"""Probe well-known install locations for ``tesseract.exe`` on
|
"""Locate a Tesseract binary outside the user's ``PATH``.
|
||||||
Windows. No-op on macOS/Linux where Tesseract is on PATH via
|
|
||||||
the system package manager."""
|
Tries the bundled binary first (only present in PyInstaller
|
||||||
|
frozen builds) so installer/portable users get a working OCR
|
||||||
|
without touching their system. Falls back to the legacy Windows
|
||||||
|
well-known install locations so users who installed UB
|
||||||
|
Mannheim's Tesseract-OCR themselves keep working too.
|
||||||
|
"""
|
||||||
|
bundled = _bundled_tesseract_path()
|
||||||
|
if bundled is not None and bundled.exists():
|
||||||
|
return str(bundled)
|
||||||
|
|
||||||
if platform.system() != "Windows":
|
if platform.system() != "Windows":
|
||||||
return None
|
return None
|
||||||
candidates = [
|
candidates = [
|
||||||
@@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]:
|
|||||||
"""Return ``(available, reason)`` — is OCR usable right now?
|
"""Return ``(available, reason)`` — is OCR usable right now?
|
||||||
|
|
||||||
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
|
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
|
||||||
then PATH-based lookup, then well-known Windows install
|
then the bundled binary (only present in a frozen build), then
|
||||||
locations.
|
PATH-based lookup, then well-known Windows install locations.
|
||||||
|
See the module-level discovery block for the full contract.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import pytesseract # noqa: PLC0415
|
import pytesseract # noqa: PLC0415
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return False, "pytesseract is not installed."
|
return False, "pytesseract is not installed."
|
||||||
|
|
||||||
|
# Point Tesseract at the bundled tessdata (if any) BEFORE the
|
||||||
|
# first ``get_tesseract_version`` call so the bundled language
|
||||||
|
# data is loaded even when the user happens to also have a
|
||||||
|
# system Tesseract that we'd otherwise fall through to.
|
||||||
|
_apply_bundled_tessdata_prefix()
|
||||||
|
|
||||||
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
|
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
|
||||||
if override:
|
if override:
|
||||||
pytesseract.pytesseract.tesseract_cmd = override
|
pytesseract.pytesseract.tesseract_cmd = override
|
||||||
|
else:
|
||||||
|
# Probe the bundled binary BEFORE PATH so frozen builds use
|
||||||
|
# their own Tesseract instead of any incidental system one.
|
||||||
|
bundled = _bundled_tesseract_path()
|
||||||
|
if bundled is not None and bundled.exists():
|
||||||
|
pytesseract.pytesseract.tesseract_cmd = str(bundled)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pytesseract.get_tesseract_version()
|
pytesseract.get_tesseract_version()
|
||||||
|
|||||||
@@ -12,9 +12,16 @@ a fixture statement at test time.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from src import pdf_extract
|
||||||
from src.pdf_extract import (
|
from src.pdf_extract import (
|
||||||
Page,
|
Page,
|
||||||
WordBox,
|
WordBox,
|
||||||
|
_apply_bundled_tessdata_prefix,
|
||||||
|
_bundled_tessdata_dir,
|
||||||
|
_bundled_tesseract_path,
|
||||||
_extract_account_number,
|
_extract_account_number,
|
||||||
_extract_statement_period,
|
_extract_statement_period,
|
||||||
_find_amount_tokens,
|
_find_amount_tokens,
|
||||||
@@ -456,3 +463,131 @@ class TestYearFromFilename:
|
|||||||
def test_empty_filename(self):
|
def test_empty_filename(self):
|
||||||
assert year_from_filename("") is None
|
assert year_from_filename("") is None
|
||||||
assert year_from_filename(None) is None
|
assert year_from_filename(None) is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestBundledTesseractPath:
    """Runtime side of the frozen-bundle layout contract.

    The build agent ships the Tesseract binary at
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and its language
    data at ``<sys._MEIPASS>/tesseract/tessdata``. These tests pin
    what the path helpers return for that layout."""

    @staticmethod
    def _pretend_frozen(monkeypatch, bundle_root, system_name=None):
        """Fake a PyInstaller bundle rooted at *bundle_root*.

        When *system_name* is given, ``platform.system()`` is pinned
        to it as well."""
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(bundle_root), raising=False)
        if system_name is not None:
            monkeypatch.setattr("platform.system", lambda: system_name)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # Plain dev checkout: neither PyInstaller attribute is present.
        for attr in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(attr, raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Frozen flag set but no unpack dir. Real PyInstaller bundles
        # shouldn't look like this; the helpers must still answer
        # ``None`` rather than blow up on a missing attribute.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path, "Linux")
        assert _bundled_tesseract_path() == tmp_path / "tesseract" / "tesseract"

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path, "Darwin")
        assert _bundled_tesseract_path() == tmp_path / "tesseract" / "tesseract"

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._pretend_frozen(monkeypatch, tmp_path, "Windows")
        assert (
            _bundled_tesseract_path()
            == tmp_path / "tesseract" / "tesseract.exe"
        )

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        self._pretend_frozen(monkeypatch, tmp_path)
        assert _bundled_tessdata_dir() == tmp_path / "tesseract" / "tessdata"
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutodetectFavoursBundled:
    """When a bundled binary exists, ``_autodetect_tesseract_path``
    should return it BEFORE falling through to Windows install
    locations — frozen builds shouldn't depend on the user's
    system tesseract even on Windows."""

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        # Simulate frozen Windows build with a bundled binary on disk.
        bundle_root = tmp_path / "bundle"
        bundled_bin = bundle_root / "tesseract" / "tesseract.exe"
        bundled_bin.parent.mkdir(parents=True)
        bundled_bin.write_bytes(b"")
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")
        # Pretend the Program Files install also exists — bundled
        # should still win because we probe it first. The stub
        # accepts arbitrary extra arguments so it stays call-compatible
        # with ``Path.exists(*, follow_symlinks=True)`` (Python 3.12+)
        # for anything that calls it while the patch is active.
        monkeypatch.setattr(
            Path, "exists", lambda self, *args, **kwargs: True,
        )
        assert pdf_extract._autodetect_tesseract_path() == str(bundled_bin)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Dev: not frozen, not Windows → no candidate at all.
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestApplyBundledTessdataPrefix:
    """How ``_apply_bundled_tessdata_prefix`` treats ``TESSDATA_PREFIX``.

    The bundled language data should be exported when available, and
    a value the user already set must survive untouched."""

    def test_no_op_when_not_frozen(self, monkeypatch):
        # Dev checkout with a clean environment: nothing to export.
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        # Frozen bundle whose tessdata dir really is on disk.
        bundled_data = tmp_path / "tesseract" / "tessdata"
        bundled_data.mkdir(parents=True)
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        monkeypatch.setattr("sys.frozen", True, raising=False)

        _apply_bundled_tessdata_prefix()

        assert os.environ.get("TESSDATA_PREFIX") == str(bundled_data)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        # Bundled data exists, but the user exported their own prefix
        # first — theirs has to win.
        bundled_data = tmp_path / "tesseract" / "tessdata"
        bundled_data.mkdir(parents=True)
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        monkeypatch.setattr("sys.frozen", True, raising=False)

        _apply_bundled_tessdata_prefix()

        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen, yet this build shipped without a tessdata dir.
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        monkeypatch.setattr("sys.frozen", True, raising=False)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ
|
||||||
|
|||||||
Reference in New Issue
Block a user