From 17faf84aed9f3927b1175c9123102b390fe2e321 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Jun 2026 18:19:52 +0000 Subject: [PATCH] feat(pdf): probe bundled Tesseract first when running frozen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds runtime support for the bundled Tesseract that ships inside the DataTools installer / portable / AppImage artifacts. When DataTools is launched from a PyInstaller frozen bundle the OCR engine now resolves automatically — no end-user install required. New helpers in src/pdf_extract.py: - _bundled_tesseract_path() → Path | None — returns <_MEIPASS>/tesseract/tesseract[.exe] when getattr(sys, "frozen", False) is truthy AND sys._MEIPASS is set; None in dev. - _bundled_tessdata_dir() → Path | None — same gating, returns <_MEIPASS>/tesseract/tessdata. - _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the bundled tessdata dir before any pytesseract call; only if frozen, dir exists, and the user hasn't already overridden the env var. Discovery order in ocr_available() / _autodetect_tesseract_path(): 1. DATATOOLS_TESSERACT_PATH env override (existing) 2. Bundled binary (NEW — frozen-only) 3. System PATH (existing) 4. Windows well-known install dirs (existing legacy fallback) In dev (not frozen) every new probe is a no-op so the developer experience is unchanged. 12 new tests cover frozen vs. non-frozen detection on each platform, the user-override respect for TESSDATA_PREFIX, autodetect priority ordering, and the no-bundled-dir graceful path.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/pdf_extract.py | 110 +++++++++++++++++++++++++++++-- tests/test_pdf_extract.py | 135 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+), 5 deletions(-) diff --git a/src/pdf_extract.py b/src/pdf_extract.py index cb19ac1..8d0f171 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -24,6 +24,7 @@ import io import os import platform import re +import sys from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool: return len(page.words) >= min_words +# --------------------------------------------------------------------------- +# Tesseract discovery +# +# Discovery order (shared with the PyInstaller build agent): +# +# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch) +# 2. Bundled binary inside the PyInstaller frozen bundle +# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only +# present when running from a frozen DataTools installer/portable +# build. No-op in a dev checkout. +# 3. System PATH lookup (``pytesseract.get_tesseract_version()``) +# 4. Windows well-known install dirs (legacy fallback for users who +# installed UB Mannheim's Tesseract-OCR themselves) +# +# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set +# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied +# ``TESSDATA_PREFIX`` is never clobbered. +# --------------------------------------------------------------------------- + + +def _bundled_tesseract_path() -> Path | None: + """Return the path to the bundled Tesseract binary, or ``None``. + + Only returns a non-None value when running from a PyInstaller + frozen bundle (``sys.frozen`` is truthy AND ``sys._MEIPASS`` is + set). The bundled binary lives at + ``<_MEIPASS>/tesseract/tesseract`` (``.exe`` on Windows) per the + contract shared with the build agent. 
+ + The file is NOT required to exist for this helper to return a + path — callers ``stat`` / ``.exists()``-check it themselves so a + missing bundled binary is treated the same as "not bundled" and + discovery falls through to PATH lookup. + """ + if not getattr(sys, "frozen", False): + return None + meipass = getattr(sys, "_MEIPASS", None) + if not meipass: + return None + binary = "tesseract.exe" if platform.system() == "Windows" else "tesseract" + return Path(meipass) / "tesseract" / binary + + +def _bundled_tessdata_dir() -> Path | None: + """Return the bundled ``tessdata`` directory or ``None``. + + Same frozen-state gating as ``_bundled_tesseract_path``; the dir + lives at ``<_MEIPASS>/tesseract/tessdata``. Callers use this to + point Tesseract at the bundled language data via the + ``TESSDATA_PREFIX`` env var. + """ + if not getattr(sys, "frozen", False): + return None + meipass = getattr(sys, "_MEIPASS", None) + if not meipass: + return None + return Path(meipass) / "tesseract" / "tessdata" + + +def _apply_bundled_tessdata_prefix() -> None: + """Point Tesseract at the bundled ``tessdata`` directory. + + Sets ``TESSDATA_PREFIX`` to the bundled path so the frozen + Tesseract binary picks up the bundled ``eng.traineddata``. A + user-supplied ``TESSDATA_PREFIX`` is preserved untouched — power + users who explicitly chose their own language data win. + + No-op outside a frozen bundle, or if the bundled dir doesn't + exist (e.g. tessdata wasn't packaged for the current platform). + """ + if os.environ.get("TESSDATA_PREFIX"): + return + tessdata = _bundled_tessdata_dir() + if tessdata is not None and tessdata.exists(): + os.environ["TESSDATA_PREFIX"] = str(tessdata) + + def _autodetect_tesseract_path() -> str | None: - """Probe well-known install locations for ``tesseract.exe`` on - Windows. No-op on macOS/Linux where Tesseract is on PATH via - the system package manager.""" + """Locate a Tesseract binary outside the user's ``PATH``. 
+ + Tries the bundled binary first (only present in PyInstaller + frozen builds) so installer/portable users get a working OCR + without touching their system. Falls back to the legacy Windows + well-known install locations so users who installed UB + Mannheim's Tesseract-OCR themselves keep working too. + """ + bundled = _bundled_tesseract_path() + if bundled is not None and bundled.exists(): + return str(bundled) + if platform.system() != "Windows": return None candidates = [ @@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]: """Return ``(available, reason)`` — is OCR usable right now? Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override, - then PATH-based lookup, then well-known Windows install - locations. + then the bundled binary (only present in a frozen build), then + PATH-based lookup, then well-known Windows install locations. + See the module-level discovery block for the full contract. """ try: import pytesseract # noqa: PLC0415 except ImportError: return False, "pytesseract is not installed." + # Point Tesseract at the bundled tessdata (if any) BEFORE the + # first ``get_tesseract_version`` call so the bundled language + # data is loaded even when the user happens to also have a + # system Tesseract that we'd otherwise fall through to. + _apply_bundled_tessdata_prefix() + override = os.environ.get("DATATOOLS_TESSERACT_PATH") if override: pytesseract.pytesseract.tesseract_cmd = override + else: + # Probe the bundled binary BEFORE PATH so frozen builds use + # their own Tesseract instead of any incidental system one. + bundled = _bundled_tesseract_path() + if bundled is not None and bundled.exists(): + pytesseract.pytesseract.tesseract_cmd = str(bundled) try: pytesseract.get_tesseract_version() diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py index 21cc589..1eab35d 100644 --- a/tests/test_pdf_extract.py +++ b/tests/test_pdf_extract.py @@ -12,9 +12,16 @@ a fixture statement at test time. 
from __future__ import annotations +import os +from pathlib import Path + +from src import pdf_extract from src.pdf_extract import ( Page, WordBox, + _apply_bundled_tessdata_prefix, + _bundled_tessdata_dir, + _bundled_tesseract_path, _extract_account_number, _extract_statement_period, _find_amount_tokens, @@ -456,3 +463,131 @@ class TestYearFromFilename: def test_empty_filename(self): assert year_from_filename("") is None assert year_from_filename(None) is None + + +class TestBundledTesseractPath: + """Frozen-bundle Tesseract discovery for installer / portable builds. + + The build agent packages Tesseract at + ``<_MEIPASS>/tesseract/tesseract[.exe]`` with language data + at ``<_MEIPASS>/tesseract/tessdata``. These tests pin that + contract on the runtime side.""" + + def test_returns_none_when_not_frozen(self, monkeypatch): + # Default dev environment: ``sys.frozen`` is unset. + monkeypatch.delattr("sys.frozen", raising=False) + monkeypatch.delattr("sys._MEIPASS", raising=False) + assert _bundled_tesseract_path() is None + assert _bundled_tessdata_dir() is None + + def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch): + # Defensive: ``sys.frozen`` true but ``_MEIPASS`` missing + # (shouldn't happen in real PyInstaller bundles but guard + # the helper so it can't NoneType-explode).
+ monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.delattr("sys._MEIPASS", raising=False) + assert _bundled_tesseract_path() is None + assert _bundled_tessdata_dir() is None + + def test_frozen_linux_returns_unsuffixed_binary( + self, monkeypatch, tmp_path, + ): + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.setattr("platform.system", lambda: "Linux") + expected = tmp_path / "tesseract" / "tesseract" + assert _bundled_tesseract_path() == expected + + def test_frozen_macos_returns_unsuffixed_binary( + self, monkeypatch, tmp_path, + ): + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.setattr("platform.system", lambda: "Darwin") + expected = tmp_path / "tesseract" / "tesseract" + assert _bundled_tesseract_path() == expected + + def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path): + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.setattr("platform.system", lambda: "Windows") + expected = tmp_path / "tesseract" / "tesseract.exe" + assert _bundled_tesseract_path() == expected + + def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path): + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + expected = tmp_path / "tesseract" / "tessdata" + assert _bundled_tessdata_dir() == expected + + +class TestAutodetectFavoursBundled: + """When a bundled binary exists, ``_autodetect_tesseract_path`` + should return it BEFORE falling through to Windows install + locations — frozen builds shouldn't depend on the user's + system tesseract even on Windows.""" + + def test_bundled_wins_over_windows_program_files( + self, monkeypatch, tmp_path, + ): + # Simulate frozen Windows build with a bundled binary 
on disk. + bundle_root = tmp_path / "bundle" + bundled_bin = bundle_root / "tesseract" / "tesseract.exe" + bundled_bin.parent.mkdir(parents=True) + bundled_bin.write_bytes(b"") + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr( + "sys._MEIPASS", str(bundle_root), raising=False, + ) + monkeypatch.setattr("platform.system", lambda: "Windows") + # Pretend the Program Files install also exists — bundled + # should still win because we probe it first. + monkeypatch.setattr(Path, "exists", lambda self: True) + assert pdf_extract._autodetect_tesseract_path() == str(bundled_bin) + + def test_falls_through_when_not_frozen(self, monkeypatch): + # Dev: not frozen, not Windows → no candidate at all. + monkeypatch.delattr("sys.frozen", raising=False) + monkeypatch.delattr("sys._MEIPASS", raising=False) + monkeypatch.setattr("platform.system", lambda: "Linux") + assert pdf_extract._autodetect_tesseract_path() is None + + +class TestApplyBundledTessdataPrefix: + """``TESSDATA_PREFIX`` env var handling — bundled data should be + pointed at without clobbering a user override.""" + + def test_no_op_when_not_frozen(self, monkeypatch): + monkeypatch.delenv("TESSDATA_PREFIX", raising=False) + monkeypatch.delattr("sys.frozen", raising=False) + monkeypatch.delattr("sys._MEIPASS", raising=False) + _apply_bundled_tessdata_prefix() + assert "TESSDATA_PREFIX" not in os.environ + + def test_sets_when_frozen_and_bundled_exists( + self, monkeypatch, tmp_path, + ): + tessdata = tmp_path / "tesseract" / "tessdata" + tessdata.mkdir(parents=True) + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.delenv("TESSDATA_PREFIX", raising=False) + _apply_bundled_tessdata_prefix() + assert os.environ.get("TESSDATA_PREFIX") == str(tessdata) + + def test_does_not_clobber_user_override(self, monkeypatch, tmp_path): + tessdata = tmp_path / "tesseract" / "tessdata" + tessdata.mkdir(parents=True) + 
monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this") + _apply_bundled_tessdata_prefix() + assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this" + + def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path): + # Frozen, but the build didn't ship a tessdata dir. + monkeypatch.setattr("sys.frozen", True, raising=False) + monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False) + monkeypatch.delenv("TESSDATA_PREFIX", raising=False) + _apply_bundled_tessdata_prefix() + assert "TESSDATA_PREFIX" not in os.environ