From e6ee2e34812935883e0e1a25485f976aad18da5e Mon Sep 17 00:00:00 2001
From: Michael
Date: Tue, 19 May 2026 23:15:00 +0000
Subject: [PATCH] feat(pdf): robust Tesseract discovery + OS-aware install copy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A user tried ``brew install tesseract`` in PowerShell after seeing all
three OSes listed inline in the OCR banner — an easy mistake when the
install commands are crammed on one line with ``·`` separators. Two
changes pre-empt this:

**OS-aware OCR banner.** The expander now detects the user's platform
via ``platform.system()`` and shows only the relevant install
instructions:

- **Windows**: UB-Mannheim installer link, numbered steps, explicit
  "keep the Add to PATH checkbox on" callout, plus a fallback paragraph
  telling the user how to set ``DATATOOLS_TESSERACT_PATH`` if they
  already installed without PATH and don't want to reinstall.
- **macOS**: ``brew install tesseract`` with a Homebrew link.
- **Linux**: ``apt install tesseract-ocr`` with a "or your distro's
  equivalent" hedge.

**Robust binary discovery in ``ocr_available()``.** Three-stage:

1. Honor ``DATATOOLS_TESSERACT_PATH`` env var if set — explicit
   override for portable installs or non-default locations. A failing
   override is reported as-is and is never replaced by auto-detect.
2. Try ``pytesseract``'s default PATH-based lookup.
3. If PATH lookup fails, probe known Windows install paths
   (``C:\Program Files\Tesseract-OCR\tesseract.exe``, the x86 variant,
   and ``%LOCALAPPDATA%\Programs\Tesseract-OCR\``) via the new
   ``_autodetect_tesseract_path``. On hit, set
   ``pytesseract.pytesseract.tesseract_cmd`` so all subsequent
   ``image_to_data`` calls use the same binary without re-discovering.

This means a user who runs the UB-Mannheim installer with default
options but forgets the PATH checkbox will still get OCR working after
a launcher restart, without env-var gymnastics.

Tests (4 new, 85 total in the suite):

- Auto-detect returns None on non-Windows (no false positives on dev
  laptops).
- Auto-detect finds the binary at a mocked
  ``C:\Program Files\Tesseract-OCR\tesseract.exe``.
- Auto-detect returns None when no candidate exists.
- ``DATATOOLS_TESSERACT_PATH`` env var beats both PATH lookup and
  auto-detect (sets ``tesseract_cmd`` even when the path doesn't
  resolve, so a real binary at a custom location works).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/gui/pages/10_PDF_Extractor.py | 44 ++++++++++++---
 src/pdf_extract.py                | 90 ++++++++++++++++++++++++++++---
 tests/test_pdf_extract_smoke.py   | 63 ++++++++++++++++++++++
 3 files changed, 181 insertions(+), 16 deletions(-)

diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index 49dd68f..fc6e944 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -136,16 +136,44 @@ with c_ocr:
     if _ocr_ok:
         st.caption("**OCR:** ready · scanned pages will be transcribed.")
     else:
+        import platform as _platform
+        _os_name = _platform.system()
         with st.expander("**OCR:** unavailable", expanded=False):
-            st.caption(
-                f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) "
-                "statements will fall through with warnings. "
-                "To enable OCR, install Tesseract on this machine — "
-                "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · "
-                "macOS: ``brew install tesseract`` · "
-                "Linux: ``apt install tesseract-ocr``. "
-                "Modern text-based statements don't need OCR."
+            st.markdown(
+                f"**Reason:** {_ocr_reason or 'unknown'}.\n\n"
+                "Scanned (image-based) statements will fall through "
+                "with warnings. Most modern bank statements are text-"
+                "based and don't need OCR — only install Tesseract if "
+                "your statements actually come through as images."
             )
+            if _os_name == "Windows":
+                st.markdown(
+                    "**Install on Windows:**\n"
+                    "1. Download the installer from "
+                    "[UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) "
+                    "(look for ``tesseract-ocr-w64-setup-…``).\n"
+                    "2. Run it. Keep the **\"Add tesseract to system "
+                    "PATH\"** checkbox on during setup.\n"
+                    "3. Restart the DataTools launcher.\n\n"
+                    "If you installed without PATH and don't want to "
+                    "reinstall, point DataTools at the binary directly "
+                    "by setting the ``DATATOOLS_TESSERACT_PATH`` env "
+                    "var to ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` "
+                    "before launching."
+                )
+            elif _os_name == "Darwin":
+                st.markdown(
+                    "**Install on macOS:** ``brew install tesseract`` "
+                    "(requires [Homebrew](https://brew.sh)). Restart "
+                    "the DataTools launcher afterward."
+                )
+            else:
+                st.markdown(
+                    "**Install on Linux:** ``sudo apt install "
+                    "tesseract-ocr`` (Debian/Ubuntu) or your distro's "
+                    "equivalent (``dnf``, ``pacman``, …). Restart the "
+                    "DataTools launcher afterward."
+                )
 
 st.divider()
 
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index 2d853eb..07d23a2 100644
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -531,23 +531,97 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
     return len(page.words) >= min_words
 
 
+def _autodetect_tesseract_path() -> str | None:
+    """Probe well-known install locations for ``tesseract.exe``.
+
+    UB-Mannheim's Windows installer drops Tesseract at one of a
+    few default paths (system-wide or per-user). Auto-detecting
+    them lets ``ocr_available`` succeed even when the user (or
+    their installer) skipped the "Add to PATH" step — the most
+    common Windows install snag based on real user reports.
+
+    No-op on non-Windows: macOS/Linux package managers normally
+    put ``tesseract`` on PATH, so PATH-based discovery is
+    sufficient there.
+
+    Returns the first candidate path that exists, or ``None``.
+    """
+    import os as _os
+    import platform as _platform
+    from pathlib import Path as _Path
+
+    if _platform.system() != "Windows":
+        return None
+    candidates = [
+        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
+        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
+        _os.path.expandvars(
+            r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe"
+        ),
+    ]
+    for p in candidates:
+        if p and _Path(p).exists():
+            return p
+    return None
+
+
 def ocr_available() -> tuple[bool, str]:
     """Return ``(available, reason)`` — is OCR usable right now?
 
     Checks both the Python binding (``pytesseract``) and the
-    Tesseract binary. The reason string is suitable for surfacing to
-    the user when OCR is unavailable.
+    Tesseract binary. The reason string is suitable for surfacing
+    to the user when OCR is unavailable.
+
+    Discovery order for the Tesseract binary:
+
+    1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override,
+       wins over everything else. Useful for portable installs.
+       If the override fails to run, that failure is reported
+       as-is; discovery never silently replaces it.
+    2. Whatever's on PATH (``pytesseract``'s default).
+    3. ``_autodetect_tesseract_path`` — known Windows install
+       locations. Sets ``pytesseract.pytesseract.tesseract_cmd``
+       so subsequent ``image_to_data`` calls use the same binary.
     """
+    import os as _os
+
     try:
-        import pytesseract  # noqa: F401
+        import pytesseract  # noqa: PLC0415
     except ImportError:
         return False, "pytesseract is not installed."
+
+    override = _os.environ.get("DATATOOLS_TESSERACT_PATH")
+    if override:
+        # An explicit override is authoritative: report its failure
+        # instead of falling through to discovery, which would
+        # silently replace the user's chosen binary.
+        pytesseract.pytesseract.tesseract_cmd = override
+        try:
+            pytesseract.get_tesseract_version()
+        except Exception as e_override:
+            return False, (
+                f"DATATOOLS_TESSERACT_PATH is set to {override} but "
+                f"Tesseract failed to run: {e_override}"
+            )
+        return True, ""
+
     try:
-        import pytesseract as pt
-        pt.get_tesseract_version()
-    except Exception as e:
-        return False, f"Tesseract binary not found: {e}"
-    return True, ""
+        pytesseract.get_tesseract_version()
+        return True, ""
+    except Exception as e_path:
+        # Fallback: probe known install locations.
+        candidate = _autodetect_tesseract_path()
+        if candidate:
+            pytesseract.pytesseract.tesseract_cmd = candidate
+            try:
+                pytesseract.get_tesseract_version()
+                return True, ""
+            except Exception as e_candidate:
+                return False, (
+                    f"Tesseract found at {candidate} but failed to "
+                    f"run: {e_candidate}"
+                )
+        return False, f"Tesseract binary not found on PATH: {e_path}"
 
 
 def render_page_image(
diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py
index 5b3fe15..f6c4004 100644
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -313,3 +313,66 @@ class TestOcrAvailability:
         assert len(pages) == 1
         # No OCR-disabled warning on a text PDF, since pages have text.
         assert not any("OCR is disabled" in w for w in warnings)
+
+
+class TestTesseractDiscovery:
+    """Windows install paths + env-var override are how a real user
+    (no PATH munging) gets OCR working. Cover the discovery logic
+    even on Linux/macOS test runners by mocking out the OS check
+    and ``Path.exists``."""
+
+    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
+        from src import pdf_extract
+        monkeypatch.setattr(
+            "platform.system",
+            lambda: "Linux",
+        )
+        assert pdf_extract._autodetect_tesseract_path() is None
+
+    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
+        from src import pdf_extract
+        monkeypatch.setattr("platform.system", lambda: "Windows")
+
+        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+        def fake_exists(self):
+            return str(self) == target
+
+        monkeypatch.setattr(
+            "pathlib.Path.exists",
+            fake_exists,
+        )
+        assert pdf_extract._autodetect_tesseract_path() == target
+
+    def test_autodetect_returns_none_when_nothing_installed(
+        self, monkeypatch,
+    ):
+        from src import pdf_extract
+        monkeypatch.setattr("platform.system", lambda: "Windows")
+        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
+        assert pdf_extract._autodetect_tesseract_path() is None
+
+    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
+        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
+        portable install at a non-default path works without
+        relying on PATH."""
+        import pytest
+        pytesseract = pytest.importorskip("pytesseract")
+        from src import pdf_extract
+        # Register the module-global with monkeypatch so it is restored
+        # at teardown and the fake path can't leak into other tests.
+        monkeypatch.setattr(
+            pytesseract.pytesseract,
+            "tesseract_cmd",
+            pytesseract.pytesseract.tesseract_cmd,
+        )
+        # Point the override at a path that doesn't exist —
+        # ocr_available must try it, report that failure, and leave
+        # the cmd attribute on the override instead of swapping in
+        # an auto-detected binary.
+        fake_bin = str(tmp_path / "fake-tesseract.exe")
+        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
+        ok, reason = pdf_extract.ocr_available()
+        assert not ok
+        assert "DATATOOLS_TESSERACT_PATH" in reason
+        assert pytesseract.pytesseract.tesseract_cmd == fake_bin