feat(pdf): robust Tesseract discovery + OS-aware install copy
A user tried ``brew install tesseract`` in PowerShell after seeing all three OSes listed inline in the OCR banner — an easy mistake when the install commands are crammed on one line with ``·`` separators. Two changes pre-empt this:

**OS-aware OCR banner.** The expander now detects the user's platform via ``platform.system()`` and shows only the relevant install instructions:

- **Windows**: UB-Mannheim installer link, numbered steps, explicit "keep the Add to PATH checkbox on" callout, plus a fallback paragraph telling the user how to set ``DATATOOLS_TESSERACT_PATH`` if they already installed without PATH and don't want to reinstall.
- **macOS**: ``brew install tesseract`` with a Homebrew link.
- **Linux**: ``apt install tesseract-ocr`` with a "or your distro's equivalent" hedge.

**Robust binary discovery in ``ocr_available()``.** Three-stage:

1. Honor the ``DATATOOLS_TESSERACT_PATH`` env var if set — explicit override for portable installs or non-default locations.
2. Try ``pytesseract``'s default PATH-based lookup.
3. If PATH lookup fails, probe known Windows install paths (``C:\Program Files\Tesseract-OCR\tesseract.exe``, the x86 variant, and ``%LOCALAPPDATA%\Programs\Tesseract-OCR\``) via the new ``_autodetect_tesseract_path``. On a hit, set ``pytesseract.pytesseract.tesseract_cmd`` so all subsequent ``image_to_data`` calls use the same binary without re-discovering.

This means a user who runs the UB-Mannheim installer with default options but forgets the PATH checkbox will still get OCR working after a launcher restart, without env-var gymnastics.

Tests (4 new, 85 total in the suite):

- Auto-detect returns None on non-Windows (no false positives on dev laptops).
- Auto-detect finds the binary at a mocked ``C:\Program Files\Tesseract-OCR\tesseract.exe``.
- Auto-detect returns None when no candidate exists.
- ``DATATOOLS_TESSERACT_PATH`` env var beats both PATH lookup and auto-detect (sets ``tesseract_cmd`` even when the path doesn't resolve, so a real binary at a custom location works).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -136,16 +136,44 @@ with c_ocr:
|
|||||||
if _ocr_ok:
    st.caption("**OCR:** ready · scanned pages will be transcribed.")
else:
    import platform as _platform

    # Show install instructions only for the OS the app is running on, so
    # nobody copies another platform's command into their shell.
    _os_name = _platform.system()
    with st.expander("**OCR:** unavailable", expanded=False):
        # Why OCR is off, and whether the user needs it at all.
        st.markdown(
            f"**Reason:** {_ocr_reason or 'unknown'}.\n\n"
            "Scanned (image-based) statements will fall through with "
            "warnings. Most modern bank statements are text-based and "
            "don't need OCR — only install Tesseract if your statements "
            "actually come through as images."
        )
        if _os_name == "Darwin":
            st.markdown(
                "**Install on macOS:** ``brew install tesseract`` (requires "
                "[Homebrew](https://brew.sh)). Restart the DataTools launcher "
                "afterward."
            )
        elif _os_name == "Windows":
            st.markdown(
                "**Install on Windows:**\n"
                "1. Download the installer from [UB-Mannheim/tesseract]"
                "(https://github.com/UB-Mannheim/tesseract/wiki) (look for "
                "``tesseract-ocr-w64-setup-…``).\n"
                "2. Run it. Keep the **\"Add tesseract to system PATH\"** "
                "checkbox on during setup.\n"
                "3. Restart the DataTools launcher.\n\n"
                "If you installed without PATH and don't want to reinstall, "
                "point DataTools at the binary directly by setting the "
                "``DATATOOLS_TESSERACT_PATH`` env var to "
                "``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` "
                "before launching."
            )
        else:
            # Every other platform gets the Linux package-manager hint.
            st.markdown(
                "**Install on Linux:** ``sudo apt install tesseract-ocr`` "
                "(Debian/Ubuntu) or your distro's equivalent (``dnf``, "
                "``pacman``, …). Restart the DataTools launcher afterward."
            )
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
|
|||||||
@@ -531,23 +531,82 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
|
|||||||
return len(page.words) >= min_words
|
return len(page.words) >= min_words
|
||||||
|
|
||||||
|
|
||||||
|
def _autodetect_tesseract_path() -> str | None:
|
||||||
|
"""Probe well-known install locations for ``tesseract.exe``.
|
||||||
|
|
||||||
|
UB-Mannheim's Windows installer drops Tesseract at one of two
|
||||||
|
paths by default. Auto-detecting them lets ``ocr_available``
|
||||||
|
succeed even when the user (or their installer) skipped the
|
||||||
|
"Add to PATH" step — the most common Windows install
|
||||||
|
snag based on real user reports.
|
||||||
|
|
||||||
|
No-op on non-Windows: macOS/Linux package managers
|
||||||
|
always put ``tesseract`` on PATH, so PATH-based discovery is
|
||||||
|
sufficient.
|
||||||
|
"""
|
||||||
|
import os as _os
|
||||||
|
import platform as _platform
|
||||||
|
from pathlib import Path as _Path
|
||||||
|
|
||||||
|
if _platform.system() != "Windows":
|
||||||
|
return None
|
||||||
|
candidates = [
|
||||||
|
r"C:\Program Files\Tesseract-OCR\tesseract.exe",
|
||||||
|
r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
|
||||||
|
_os.path.expandvars(
|
||||||
|
r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
for p in candidates:
|
||||||
|
if p and _Path(p).exists():
|
||||||
|
return p
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def ocr_available() -> tuple[bool, str]:
    """Return ``(available, reason)`` — is OCR usable right now?

    Checks both the Python binding (``pytesseract``) and the
    Tesseract binary. The reason string is suitable for surfacing
    to the user when OCR is unavailable; it is empty on success.

    Discovery order for the Tesseract binary:

    1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override,
       wins over everything else. When it is set it is the only
       location tried: a broken override is reported to the user
       instead of being silently replaced by another binary.
    2. Whatever's on PATH (``pytesseract``'s default).
    3. ``_autodetect_tesseract_path`` — known Windows install
       locations. On success ``pytesseract.pytesseract.tesseract_cmd``
       is left pointing at the detected binary so subsequent
       ``image_to_data`` calls use the same binary.

    Side effects:
        May assign ``pytesseract.pytesseract.tesseract_cmd``. If an
        auto-detected candidate fails to run, the previous value is
        restored so later calls don't keep using a broken binary.
    """
    import os as _os

    try:
        import pytesseract  # noqa: PLC0415
    except ImportError:
        return False, "pytesseract is not installed."

    override = _os.environ.get("DATATOOLS_TESSERACT_PATH")
    if override:
        pytesseract.pytesseract.tesseract_cmd = override
        try:
            pytesseract.get_tesseract_version()
        except Exception as e_override:
            # Deliberately no fallback: the user asked for this binary.
            return False, (
                f"Tesseract not usable at DATATOOLS_TESSERACT_PATH "
                f"({override}): {e_override}"
            )
        return True, ""

    # Broad ``except`` is intentional in this function: it is a
    # best-effort probe and every failure is surfaced via the reason.
    try:
        pytesseract.get_tesseract_version()
    except Exception as e_path:
        path_failure = f"Tesseract binary not found on PATH: {e_path}"
    else:
        return True, ""

    # Fallback: probe known install locations.
    candidate = _autodetect_tesseract_path()
    if not candidate:
        return False, path_failure

    previous_cmd = pytesseract.pytesseract.tesseract_cmd
    pytesseract.pytesseract.tesseract_cmd = candidate
    try:
        pytesseract.get_tesseract_version()
    except Exception as e_candidate:
        pytesseract.pytesseract.tesseract_cmd = previous_cmd
        return False, (
            f"Tesseract found at {candidate} but failed to "
            f"run: {e_candidate}"
        )
    return True, ""
|
||||||
|
|
||||||
|
|
||||||
def render_page_image(
|
def render_page_image(
|
||||||
|
|||||||
@@ -313,3 +313,56 @@ class TestOcrAvailability:
|
|||||||
assert len(pages) == 1
|
assert len(pages) == 1
|
||||||
# No OCR-disabled warning on a text PDF, since pages have text.
|
# No OCR-disabled warning on a text PDF, since pages have text.
|
||||||
assert not any("OCR is disabled" in w for w in warnings)
|
assert not any("OCR is disabled" in w for w in warnings)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTesseractDiscovery:
    """Windows install paths + env-var override are how a real user
    (no PATH munging) gets OCR working. Cover the discovery logic
    even on Linux/macOS test runners by mocking out the OS check
    and ``Path.exists``."""

    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
        """Auto-detect must not probe anything off Windows."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
        """The default 64-bit install location is discovered."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")

        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

        def fake_exists(self):
            # Only the default Program Files binary "exists".
            return str(self) == target

        monkeypatch.setattr("pathlib.Path.exists", fake_exists)
        assert pdf_extract._autodetect_tesseract_path() == target

    def test_autodetect_returns_none_when_nothing_installed(
        self, monkeypatch,
    ):
        """No candidate on disk means no auto-detected path."""
        from src import pdf_extract

        monkeypatch.setattr("platform.system", lambda: "Windows")
        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
        assert pdf_extract._autodetect_tesseract_path() is None

    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
        portable install at a non-default path works without
        relying on PATH."""
        import pytest

        from src import pdf_extract

        # Skip (rather than error) where the optional binding is missing.
        pytesseract = pytest.importorskip("pytesseract")

        # ``ocr_available`` assigns the module-global ``tesseract_cmd``.
        # Registering the current value with monkeypatch restores it at
        # teardown so this test cannot leak a bogus path into later tests.
        monkeypatch.setattr(
            pytesseract.pytesseract,
            "tesseract_cmd",
            pytesseract.pytesseract.tesseract_cmd,
        )
        # Keep the outcome independent of whether the host machine has a
        # default-location Tesseract install.
        monkeypatch.setattr(
            pdf_extract, "_autodetect_tesseract_path", lambda: None,
        )

        # Point the override at a path that doesn't exist —
        # ocr_available will try it and report the failure, but
        # importantly the cmd attribute is set BEFORE the call,
        # which is what we're verifying.
        fake_bin = str(tmp_path / "fake-tesseract.exe")
        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)

        available, reason = pdf_extract.ocr_available()

        assert pytesseract.pytesseract.tesseract_cmd == fake_bin
        assert available is False
        assert reason
|
||||||
|
|||||||
Reference in New Issue
Block a user