feat(pdf): robust Tesseract discovery + OS-aware install copy

User tried ``brew install tesseract`` in PowerShell after seeing
all three OSes listed inline in the OCR banner — easy mistake
when the install commands are crammed on one line with ``·``
separators. Two changes pre-empt this:

**OS-aware OCR banner.** The expander now detects the user's
platform via ``platform.system()`` and shows only the relevant
install instructions:

- **Windows**: UB-Mannheim installer link, numbered steps,
  explicit "keep the Add to PATH checkbox on" callout, plus a
  fallback paragraph telling the user how to set
  ``DATATOOLS_TESSERACT_PATH`` if they already installed
  without PATH and don't want to reinstall.
- **macOS**: ``brew install tesseract`` with a Homebrew link.
- **Linux**: ``apt install tesseract-ocr`` with a "or your
  distro's equivalent" hedge.

**Robust binary discovery in ``ocr_available()``.** Three-stage:

1. Honor ``DATATOOLS_TESSERACT_PATH`` env var if set — explicit
   override for portable installs or non-default locations.
2. Try ``pytesseract``'s default PATH-based lookup.
3. If PATH lookup fails, probe known Windows install paths
   (``C:\Program Files\Tesseract-OCR\tesseract.exe``,
   the x86 variant, and ``%LOCALAPPDATA%\Programs\Tesseract-OCR\``)
   via the new ``_autodetect_tesseract_path``. On hit, set
   ``pytesseract.pytesseract.tesseract_cmd`` so all subsequent
   ``image_to_data`` calls use the same binary without
   re-discovering.

This means a user who runs the UB-Mannheim installer with
default options but forgets the PATH checkbox will still get
OCR working after a launcher restart, without env-var
gymnastics.

Tests (4 new, 85 total in the suite):

- Auto-detect returns None on non-Windows (no false positives
  on dev laptops).
- Auto-detect finds the binary at a mocked
  ``C:\Program Files\Tesseract-OCR\tesseract.exe``.
- Auto-detect returns None when no candidate exists.
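- ``DATATOOLS_TESSERACT_PATH`` env var beats both PATH lookup
  and auto-detect: the test points the override at a path that
  doesn't exist and checks that ``tesseract_cmd`` is still set
  to it, i.e. the override is applied before any lookup runs, so
  a real binary at a custom location is the one that gets used.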

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 23:15:00 +00:00
parent 538e23d219
commit e6ee2e3481
3 changed files with 156 additions and 16 deletions

View File

@@ -136,15 +136,43 @@ with c_ocr:
if _ocr_ok: if _ocr_ok:
st.caption("**OCR:** ready · scanned pages will be transcribed.") st.caption("**OCR:** ready · scanned pages will be transcribed.")
else: else:
import platform as _platform
_os_name = _platform.system()
with st.expander("**OCR:** unavailable", expanded=False): with st.expander("**OCR:** unavailable", expanded=False):
st.caption( st.markdown(
f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) " f"**Reason:** {_ocr_reason or 'unknown'}.\n\n"
"statements will fall through with warnings. " "Scanned (image-based) statements will fall through "
"To enable OCR, install Tesseract on this machine — " "with warnings. Most modern bank statements are text-"
"[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · " "based and don't need OCR — only install Tesseract if "
"macOS: ``brew install tesseract`` · " "your statements actually come through as images."
"Linux: ``apt install tesseract-ocr``. " )
"Modern text-based statements don't need OCR." if _os_name == "Windows":
st.markdown(
"**Install on Windows:**\n"
"1. Download the installer from "
"[UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) "
"(look for ``tesseract-ocr-w64-setup-…``).\n"
"2. Run it. Keep the **\"Add tesseract to system "
"PATH\"** checkbox on during setup.\n"
"3. Restart the DataTools launcher.\n\n"
"If you installed without PATH and don't want to "
"reinstall, point DataTools at the binary directly "
"by setting the ``DATATOOLS_TESSERACT_PATH`` env "
"var to ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` "
"before launching."
)
elif _os_name == "Darwin":
st.markdown(
"**Install on macOS:** ``brew install tesseract`` "
"(requires [Homebrew](https://brew.sh)). Restart "
"the DataTools launcher afterward."
)
else:
st.markdown(
"**Install on Linux:** ``sudo apt install "
"tesseract-ocr`` (Debian/Ubuntu) or your distro's "
"equivalent (``dnf``, ``pacman``, …). Restart the "
"DataTools launcher afterward."
) )
st.divider() st.divider()

View File

@@ -531,23 +531,82 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
return len(page.words) >= min_words return len(page.words) >= min_words
def _autodetect_tesseract_path() -> str | None:
"""Probe well-known install locations for ``tesseract.exe``.
UB-Mannheim's Windows installer drops Tesseract at one of two
paths by default. Auto-detecting them lets ``ocr_available``
succeed even when the user (or their installer) skipped the
"Add to PATH" step — the most common Windows install
snag based on real user reports.
No-op on non-Windows: macOS/Linux package managers
always put ``tesseract`` on PATH, so PATH-based discovery is
sufficient.
"""
import os as _os
import platform as _platform
from pathlib import Path as _Path
if _platform.system() != "Windows":
return None
candidates = [
r"C:\Program Files\Tesseract-OCR\tesseract.exe",
r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
_os.path.expandvars(
r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe"
),
]
for p in candidates:
if p and _Path(p).exists():
return p
return None
def ocr_available() -> tuple[bool, str]:
    """Return ``(available, reason)`` — is OCR usable right now?

    Checks both the Python binding (``pytesseract``) and the Tesseract
    binary. ``reason`` is the empty string on success; otherwise it is
    a sentence suitable for surfacing to the user.

    Discovery order for the Tesseract binary:

    1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override, wins
       over everything else. When it is set it is the *only* location
       tried: if that binary fails to run, the failure is reported
       against the override instead of silently switching to another
       binary, and ``tesseract_cmd`` is left pointing at the override.
    2. Whatever ``pytesseract`` is currently configured to run (by
       default, ``tesseract`` on PATH).
    3. ``_autodetect_tesseract_path`` — known Windows install
       locations. On a hit, ``pytesseract.pytesseract.tesseract_cmd``
       is set so subsequent ``image_to_data`` calls use the same
       binary.

    Side effect: may assign ``pytesseract.pytesseract.tesseract_cmd``
    (steps 1 and 3).
    """
    import os as _os

    try:
        import pytesseract  # noqa: PLC0415
    except ImportError:
        return False, "pytesseract is not installed."

    override = _os.environ.get("DATATOOLS_TESSERACT_PATH")
    if override:
        pytesseract.pytesseract.tesseract_cmd = override
        try:
            pytesseract.get_tesseract_version()
        except Exception as e_override:
            # Name the env var so the user knows which setting to fix;
            # blaming PATH here would send them in the wrong direction.
            return False, (
                f"Tesseract at DATATOOLS_TESSERACT_PATH={override} "
                f"failed to run: {e_override}"
            )
        return True, ""

    # Broad catch is deliberate: this is an availability probe and any
    # failure to launch the binary means "OCR is not usable".
    try:
        pytesseract.get_tesseract_version()
    except Exception as e_path:
        path_error = e_path
    else:
        return True, ""

    # Fallback: probe known install locations.
    candidate = _autodetect_tesseract_path()
    if not candidate:
        return False, f"Tesseract binary not found on PATH: {path_error}"

    pytesseract.pytesseract.tesseract_cmd = candidate
    try:
        pytesseract.get_tesseract_version()
    except Exception as e_candidate:
        return False, (
            f"Tesseract found at {candidate} but failed to "
            f"run: {e_candidate}"
        )
    return True, ""
def render_page_image( def render_page_image(

View File

@@ -313,3 +313,56 @@ class TestOcrAvailability:
assert len(pages) == 1 assert len(pages) == 1
# No OCR-disabled warning on a text PDF, since pages have text. # No OCR-disabled warning on a text PDF, since pages have text.
assert not any("OCR is disabled" in w for w in warnings) assert not any("OCR is disabled" in w for w in warnings)
class TestTesseractDiscovery:
    """Windows install paths + env-var override are how a real user
    (no PATH munging) gets OCR working. Cover the discovery logic
    even on Linux/macOS test runners by mocking out the OS check
    and ``Path.exists``."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """Auto-detect must not probe anything off Windows."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """A binary at the default 64-bit location is returned verbatim."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            # Only the one target path "exists" on the fake filesystem.
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(
        self, monkeypatch,
    ):
        """No candidate on disk means no path is reported."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
        portable install at a non-default path works without
        relying on PATH."""
        import pytest

        from src import pdf_extract

        # Skip (rather than error) where the optional binding is absent.
        pytesseract = pytest.importorskip("pytesseract")

        # Re-setting the attribute to its current value registers it
        # with monkeypatch, so the global ``tesseract_cmd`` is restored
        # at teardown instead of leaking into later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )
        # Keep the result independent of what is installed on the
        # machine running the tests: auto-detect never finds anything.
        monkeypatch.setattr(
            pdf_extract, "_autodetect_tesseract_path", lambda: None,
        )

        # Point the override at a path that doesn't exist —
        # ocr_available will try it and report the failure, but
        # importantly the cmd attribute is set BEFORE the call,
        # which is what we're verifying.
        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
        pdf_extract.ocr_available()
        assert pytesseract.pytesseract.tesseract_cmd == fake_bin