feat(pdf): probe bundled Tesseract first when running frozen
Adds runtime support for the bundled Tesseract that ships inside the DataTools installer / portable / AppImage artifacts. When DataTools is launched from a PyInstaller frozen bundle, the OCR engine now resolves automatically — no end-user install required.

New helpers in src/pdf_extract.py:
- _bundled_tesseract_path() → Path | None — returns <sys._MEIPASS>/tesseract/tesseract[.exe] when getattr(sys, "frozen", False) is true AND sys._MEIPASS is present; None in dev.
- _bundled_tessdata_dir() → Path | None — same gating, returns <sys._MEIPASS>/tesseract/tessdata.
- _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the bundled tessdata dir before any pytesseract call; only if frozen, the dir exists, and the user hasn't already overridden the env var.

Discovery order in ocr_available() / _autodetect_tesseract_path():
1. DATATOOLS_TESSERACT_PATH env override (existing)
2. Bundled binary (NEW — frozen-only)
3. System PATH (existing)
4. Windows well-known install dirs (existing legacy fallback)

In dev (not frozen) every new probe is a no-op, so the developer experience is unchanged.

12 new tests cover frozen vs. non-frozen detection on each platform, respect for a user-set TESSDATA_PREFIX, autodetect priority ordering, and the graceful path when no bundled dir exists.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,9 +12,16 @@ a fixture statement at test time.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from src import pdf_extract
|
||||
from src.pdf_extract import (
|
||||
Page,
|
||||
WordBox,
|
||||
_apply_bundled_tessdata_prefix,
|
||||
_bundled_tessdata_dir,
|
||||
_bundled_tesseract_path,
|
||||
_extract_account_number,
|
||||
_extract_statement_period,
|
||||
_find_amount_tokens,
|
||||
@@ -456,3 +463,131 @@ class TestYearFromFilename:
|
||||
def test_empty_filename(self):
    # Neither an empty string nor ``None`` carries a year to extract.
    for blank in ("", None):
        assert year_from_filename(blank) is None
|
||||
|
||||
|
||||
class TestBundledTesseractPath:
    """Pin the frozen-bundle layout that the runtime helpers resolve.

    The build agent is expected to ship the binary at
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and the language
    data at ``<sys._MEIPASS>/tesseract/tessdata``; these tests hold
    the runtime side to that contract.
    """

    @staticmethod
    def _pretend_frozen(monkeypatch, bundle_root, system=None):
        """Fake a frozen runtime rooted at *bundle_root*.

        ``platform.system`` is only patched when *system* is given.
        """
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        if system is not None:
            monkeypatch.setattr("platform.system", lambda: system)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # Plain dev interpreter: neither marker attribute exists.
        for marker in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(marker, raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Not expected from a real PyInstaller bundle, but the helpers
        # must still return None rather than fail on a missing root.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path, system="Linux")
        binary = tmp_path / "tesseract" / "tesseract"
        assert _bundled_tesseract_path() == binary

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path, system="Darwin")
        binary = tmp_path / "tesseract" / "tesseract"
        assert _bundled_tesseract_path() == binary

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._pretend_frozen(monkeypatch, tmp_path, system="Windows")
        binary = tmp_path / "tesseract" / "tesseract.exe"
        assert _bundled_tesseract_path() == binary

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        # The data dir is checked without pinning ``platform.system``.
        self._pretend_frozen(monkeypatch, tmp_path)
        assert _bundled_tessdata_dir() == tmp_path / "tesseract" / "tessdata"
|
||||
|
||||
|
||||
class TestAutodetectFavoursBundled:
    """``_autodetect_tesseract_path`` must prefer the bundled binary.

    When a bundled binary exists it should be returned BEFORE the
    Windows install locations are considered — frozen builds
    shouldn't depend on the user's system tesseract even on Windows.

    Both tests clear ``DATATOOLS_TESSERACT_PATH`` first: the commit
    description lists that override as the first discovery step, so
    an ambient value on the developer / CI machine must not be able
    to decide the outcome.
    """

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        monkeypatch.delenv("DATATOOLS_TESSERACT_PATH", raising=False)
        # Simulate frozen Windows build with a bundled binary on disk.
        bundle_root = tmp_path / "bundle"
        bundled_bin = bundle_root / "tesseract" / "tesseract.exe"
        bundled_bin.parent.mkdir(parents=True)
        bundled_bin.write_bytes(b"")
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")
        # Pretend the Program Files install also exists — bundled
        # should still win because we probe it first. The stub takes
        # arbitrary arguments because ``Path.exists`` accepts a
        # ``follow_symlinks`` keyword on Python 3.12+, and the patch
        # is process-wide for the duration of the test.
        monkeypatch.setattr(
            Path, "exists", lambda self, *args, **kwargs: True,
        )
        assert pdf_extract._autodetect_tesseract_path() == str(bundled_bin)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Dev: not frozen, not Windows → no candidate at all.
        monkeypatch.delenv("DATATOOLS_TESSERACT_PATH", raising=False)
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        # NOTE(review): if the helper consults the system PATH, a
        # host with tesseract installed would make this assertion
        # machine-dependent; hide any PATH hit to keep it hermetic.
        # Harmless if the helper never calls ``shutil.which``.
        monkeypatch.setattr("shutil.which", lambda *args, **kwargs: None)
        assert pdf_extract._autodetect_tesseract_path() is None
|
||||
|
||||
|
||||
class TestApplyBundledTessdataPrefix:
    """``TESSDATA_PREFIX`` env var handling — bundled data should be
    pointed at without clobbering a user override.

    The function under test writes to ``os.environ`` directly, so
    every test that starts from an unset variable goes through
    ``_unset_tessdata_prefix`` to make sure the write is rolled back
    at teardown instead of leaking into later tests.
    """

    @staticmethod
    def _unset_tessdata_prefix(monkeypatch):
        """Remove ``TESSDATA_PREFIX`` and guarantee teardown restores it.

        ``monkeypatch.delenv(name, raising=False)`` records nothing
        when the variable is already absent, so a value written later
        by the code under test would survive the test. Setting a
        placeholder first makes monkeypatch remember the original
        state (set or unset) and restore exactly that.
        """
        monkeypatch.setenv("TESSDATA_PREFIX", "placeholder")
        monkeypatch.delenv("TESSDATA_PREFIX")

    def test_no_op_when_not_frozen(self, monkeypatch):
        self._unset_tessdata_prefix(monkeypatch)
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_tessdata_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert os.environ.get("TESSDATA_PREFIX") == str(tessdata)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        # ``setenv`` records the prior state itself, so no extra
        # cleanup step is needed here.
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")
        _apply_bundled_tessdata_prefix()
        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen, but the build didn't ship a tessdata dir.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_tessdata_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ
|
||||
|
||||
Reference in New Issue
Block a user