From e6ee2e34812935883e0e1a25485f976aad18da5e Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Tue, 19 May 2026 23:15:00 +0000
Subject: [PATCH] feat(pdf): robust Tesseract discovery + OS-aware install copy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A user tried ``brew install tesseract`` in PowerShell after seeing
the install commands for all three OSes listed inline in the OCR
banner — an easy mistake when they are crammed onto one line with
``·`` separators. Two changes pre-empt this:

**OS-aware OCR banner.** The expander now detects the user's
platform via ``platform.system()`` and shows only the relevant
install instructions:

- **Windows**: UB-Mannheim installer link, numbered steps,
  explicit "keep the Add to PATH checkbox on" callout, plus a
  fallback paragraph telling the user how to set
  ``DATATOOLS_TESSERACT_PATH`` if they already installed
  without PATH and don't want to reinstall.
- **macOS**: ``brew install tesseract`` with a Homebrew link.
- **Linux**: ``sudo apt install tesseract-ocr`` with an "or your
  distro's equivalent" hedge.

**Robust binary discovery in ``ocr_available()``.** Three-stage:

1. Honor ``DATATOOLS_TESSERACT_PATH`` env var if set — explicit
   override for portable installs or non-default locations.
2. Otherwise, try ``pytesseract``'s default PATH-based lookup.
3. If that fails (override or PATH), probe known Windows install paths
   (``C:\Program Files\Tesseract-OCR\tesseract.exe``,
   the x86 variant, and ``%LOCALAPPDATA%\Programs\Tesseract-OCR\``)
   via the new ``_autodetect_tesseract_path``. On hit, set
   ``pytesseract.pytesseract.tesseract_cmd`` so all subsequent
   ``image_to_data`` calls use the same binary without
   re-discovering.

This means a user who runs the UB-Mannheim installer with
default options but forgets the PATH checkbox will still get
OCR working after a launcher restart, without env-var
gymnastics.

Tests (4 new, 85 total in the suite):

- Auto-detect returns None on non-Windows (no false positives
  on dev laptops).
- Auto-detect finds the binary at a mocked
  ``C:\Program Files\Tesseract-OCR\tesseract.exe``.
- Auto-detect returns None when no candidate exists.
- ``DATATOOLS_TESSERACT_PATH`` env var is applied ahead of both PATH
  lookup and auto-detect (``tesseract_cmd`` is set even when the
  path doesn't resolve, so a real binary at a custom location works).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/10_PDF_Extractor.py | 44 ++++++++++++++----
 src/pdf_extract.py                | 75 +++++++++++++++++++++++++++----
 tests/test_pdf_extract_smoke.py   | 53 ++++++++++++++++++++++
 3 files changed, 156 insertions(+), 16 deletions(-)

diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py
index 49dd68f..fc6e944 100644
--- a/src/gui/pages/10_PDF_Extractor.py
+++ b/src/gui/pages/10_PDF_Extractor.py
@@ -136,16 +136,44 @@ with c_ocr:
     if _ocr_ok:
         st.caption("**OCR:** ready · scanned pages will be transcribed.")
     else:
+        import platform as _platform
+        _os_name = _platform.system()
         with st.expander("**OCR:** unavailable", expanded=False):
-            st.caption(
-                f"Reason: {_ocr_reason or 'unknown'}. Scanned (image-based) "
-                "statements will fall through with warnings. "
-                "To enable OCR, install Tesseract on this machine — "
-                "[Windows](https://github.com/UB-Mannheim/tesseract/wiki) · "
-                "macOS: ``brew install tesseract`` · "
-                "Linux: ``apt install tesseract-ocr``. "
-                "Modern text-based statements don't need OCR."
+            st.markdown(
+                f"**Reason:** {_ocr_reason or 'unknown'}.\n\n"
+                "Scanned (image-based) statements will fall through "
+                "with warnings. Most modern bank statements are text-"
+                "based and don't need OCR — only install Tesseract if "
+                "your statements actually come through as images."
             )
+            if _os_name == "Windows":
+                st.markdown(
+                    "**Install on Windows:**\n"
+                    "1. Download the installer from "
+                    "[UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) "
+                    "(look for ``tesseract-ocr-w64-setup-…``).\n"
+                    "2. Run it. Keep the **\"Add tesseract to system "
+                    "PATH\"** checkbox on during setup.\n"
+                    "3. Restart the DataTools launcher.\n\n"
+                    "If you installed without PATH and don't want to "
+                    "reinstall, point DataTools at the binary directly "
+                    "by setting the ``DATATOOLS_TESSERACT_PATH`` env "
+                    "var to ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` "
+                    "before launching."
+                )
+            elif _os_name == "Darwin":
+                st.markdown(
+                    "**Install on macOS:** ``brew install tesseract`` "
+                    "(requires [Homebrew](https://brew.sh)). Restart "
+                    "the DataTools launcher afterward."
+                )
+            else:
+                st.markdown(
+                    "**Install on Linux:** ``sudo apt install "
+                    "tesseract-ocr`` (Debian/Ubuntu) or your distro's "
+                    "equivalent (``dnf``, ``pacman``, …). Restart the "
+                    "DataTools launcher afterward."
+                )
 
 st.divider()
 
diff --git a/src/pdf_extract.py b/src/pdf_extract.py
index 2d853eb..07d23a2 100644
--- a/src/pdf_extract.py
+++ b/src/pdf_extract.py
@@ -531,23 +531,82 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
     return len(page.words) >= min_words
 
 
+def _autodetect_tesseract_path() -> str | None:
+    """Probe well-known install locations for ``tesseract.exe``.
+
+    UB-Mannheim's Windows installer drops Tesseract at one of a few
+    default paths. Auto-detecting them lets ``ocr_available``
+    succeed even when the user (or their installer) skipped the
+    "Add to PATH" step — the most common Windows install
+    snag based on real user reports.
+
+    No-op on non-Windows: macOS/Linux package managers
+    always put ``tesseract`` on PATH, so PATH-based discovery is
+    sufficient.
+    """
+    import os as _os
+    import platform as _platform
+    from pathlib import Path as _Path
+
+    if _platform.system() != "Windows":
+        return None
+    candidates = [
+        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
+        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
+        _os.path.expandvars(
+            r"%LOCALAPPDATA%\Programs\Tesseract-OCR\tesseract.exe"
+        ),
+    ]
+    for p in candidates:
+        if p and _Path(p).exists():
+            return p
+    return None
+
+
 def ocr_available() -> tuple[bool, str]:
     """Return ``(available, reason)`` — is OCR usable right now?
 
     Checks both the Python binding (``pytesseract``) and the
-    Tesseract binary. The reason string is suitable for surfacing to
-    the user when OCR is unavailable.
+    Tesseract binary. The reason string is suitable for surfacing
+    to the user when OCR is unavailable.
+
+    Discovery order for the Tesseract binary:
+
+    1. ``DATATOOLS_TESSERACT_PATH`` env var — explicit override,
+       tried first when set. Useful for portable installs.
+    2. Otherwise, whatever's on PATH (``pytesseract``'s default).
+    3. ``_autodetect_tesseract_path`` — known Windows install
+       locations. Sets ``pytesseract.pytesseract.tesseract_cmd``
+       so subsequent ``image_to_data`` calls use the same binary.
     """
+    import os as _os
+
     try:
-        import pytesseract  # noqa: F401
+        import pytesseract  # noqa: F401, PLC0415
     except ImportError:
         return False, "pytesseract is not installed."
+
+    override = _os.environ.get("DATATOOLS_TESSERACT_PATH")
+    if override:
+        pytesseract.pytesseract.tesseract_cmd = override
+
     try:
-        import pytesseract as pt
-        pt.get_tesseract_version()
-    except Exception as e:
-        return False, f"Tesseract binary not found: {e}"
-    return True, ""
+        pytesseract.get_tesseract_version()
+        return True, ""
+    except Exception as e_path:
+        # Fallback: probe known install locations.
+        candidate = _autodetect_tesseract_path()
+        if candidate:
+            pytesseract.pytesseract.tesseract_cmd = candidate
+            try:
+                pytesseract.get_tesseract_version()
+                return True, ""
+            except Exception as e_candidate:
+                return False, (
+                    f"Tesseract found at {candidate} but failed to "
+                    f"run: {e_candidate}"
+                )
+        return False, f"Tesseract binary not found on PATH: {e_path}"
 
 
 def render_page_image(
diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py
index 5b3fe15..f6c4004 100644
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -313,3 +313,56 @@ class TestOcrAvailability:
         assert len(pages) == 1
         # No OCR-disabled warning on a text PDF, since pages have text.
         assert not any("OCR is disabled" in w for w in warnings)
+
+
+class TestTesseractDiscovery:
+    """Windows install paths + env-var override are how a real user
+    (no PATH munging) gets OCR working. Cover the discovery logic
+    even on Linux/macOS test runners by mocking out the OS check
+    and ``Path.exists``."""
+
+    def test_autodetect_returns_none_on_non_windows(self, monkeypatch):
+        from src import pdf_extract
+        monkeypatch.setattr(
+            "platform.system",
+            lambda: "Linux",
+        )
+        assert pdf_extract._autodetect_tesseract_path() is None
+
+    def test_autodetect_finds_program_files_on_windows(self, monkeypatch):
+        from src import pdf_extract
+        monkeypatch.setattr("platform.system", lambda: "Windows")
+
+        target = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+        def fake_exists(self):
+            return str(self) == target
+
+        monkeypatch.setattr(
+            "pathlib.Path.exists",
+            fake_exists,
+        )
+        assert pdf_extract._autodetect_tesseract_path() == target
+
+    def test_autodetect_returns_none_when_nothing_installed(
+        self, monkeypatch,
+    ):
+        from src import pdf_extract
+        monkeypatch.setattr("platform.system", lambda: "Windows")
+        monkeypatch.setattr("pathlib.Path.exists", lambda self: False)
+        assert pdf_extract._autodetect_tesseract_path() is None
+
+    def test_env_var_override_takes_precedence(self, monkeypatch, tmp_path):
+        """``DATATOOLS_TESSERACT_PATH`` wins over discovery so a
+        portable install at a non-default path works without
+        relying on PATH."""
+        from src import pdf_extract
+        # Point the override at a path that doesn't exist —
+        # ocr_available will try it and report the failure, but
+        # importantly the cmd attribute is set BEFORE the call,
+        # which is what we're verifying.
+        fake_bin = str(tmp_path / "fake-tesseract.exe")
+        monkeypatch.setenv("DATATOOLS_TESSERACT_PATH", fake_bin)
+        pdf_extract.ocr_available()
+        import pytesseract
+        assert pytesseract.pytesseract.tesseract_cmd == fake_bin