Adds runtime support for the bundled Tesseract that ships inside the DataTools installer / portable / AppImage artifacts. When DataTools is launched from a PyInstaller frozen bundle the OCR engine now resolves automatically — no end-user install required. New helpers in src/pdf_extract.py: - _bundled_tesseract_path() → Path | None — returns <sys._MEIPASS>/tesseract/tesseract[.exe] when getattr(sys, "frozen", False) AND sys._MEIPASS are present; None in dev. - _bundled_tessdata_dir() → Path | None — same gating, returns <sys._MEIPASS>/tesseract/tessdata. - _apply_bundled_tessdata_prefix() — sets TESSDATA_PREFIX to the bundled tessdata dir before any pytesseract call; only if frozen, dir exists, and the user hasn't already overridden the env var. Discovery order in ocr_available() / _autodetect_tesseract_path(): 1. DATATOOLS_TESSERACT_PATH env override (existing) 2. Bundled binary (NEW — frozen-only) 3. System PATH (existing) 4. Windows well-known install dirs (existing legacy fallback) In dev (not frozen) every new probe is a no-op so the developer experience is unchanged. 12 new tests cover frozen vs. non-frozen detection on each platform, the user-override respect for TESSDATA_PREFIX, autodetect priority ordering, and the no-bundled-dir graceful path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
594 lines
22 KiB
Python
594 lines
22 KiB
Python
"""Tests for the minimal PDF transaction scanner.
|
|
|
|
The public API is one function: ``scan_pdf_for_transactions``.
|
|
These tests cover the value-parsing helpers, the row clusterer,
|
|
the date/amount token finders, and the end-to-end scanner
|
|
against synthetic ``Page`` objects with no real PDF involved.
|
|
|
|
End-to-end-on-a-real-PDF coverage lives in
|
|
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
|
|
a fixture statement at test time.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from src import pdf_extract
|
|
from src.pdf_extract import (
|
|
Page,
|
|
WordBox,
|
|
_apply_bundled_tessdata_prefix,
|
|
_bundled_tessdata_dir,
|
|
_bundled_tesseract_path,
|
|
_extract_account_number,
|
|
_extract_statement_period,
|
|
_find_amount_tokens,
|
|
_find_dates_in_words,
|
|
_infer_year_for_short_date,
|
|
cluster_rows,
|
|
extract_statement_metadata,
|
|
format_amount,
|
|
format_date,
|
|
parse_amount,
|
|
parse_date,
|
|
year_from_filename,
|
|
)
|
|
|
|
|
|
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for the tests in this module.

    When ``x1`` is not supplied, the right edge is placed 8 units per
    character to the right of ``x0``. The bottom edge is always 10
    units below ``top``.
    """
    right_edge = x0 + 8 * len(text) if x1 is None else x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)
|
|
|
|
|
|
class TestParseAmount:
    """Pins how ``parse_amount`` turns amount strings into floats."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty string, whitespace-only string and None are all "blank".
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        euro_options = {
            "decimal": ",",
            "thousands": ".",
            "currency_strip": "€",
        }
        assert parse_amount("€1.234,56", **euro_options) == 1234.56

    def test_parens_off_disables_paren_negative(self):
        # With the parens rule switched off, "(4.50)" is not read as a
        # negative number — and because it is not a plain number
        # either, it does not parse at all. This pins that the
        # off-path never flips the sign.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None
|
|
|
|
|
|
class TestParseDate:
    """Pins ``parse_date`` normalising date strings to ISO text."""

    def test_us_slash(self):
        formats = ["%m/%d/%Y"]
        assert parse_date("01/15/2026", formats) == "2026-01-15"

    def test_iso(self):
        formats = ["%Y-%m-%d"]
        assert parse_date("2026-01-15", formats) == "2026-01-15"

    def test_fallback_format(self):
        # No format list is passed — the built-in fallback formats
        # must still recognise a two-digit-year US date.
        parsed = parse_date("01/15/26")
        assert parsed == "2026-01-15"

    def test_invalid(self):
        parsed = parse_date("not-a-date")
        assert parsed is None
|
|
|
|
|
|
class TestClusterRows:
    """Pins how ``cluster_rows`` groups word boxes into visual rows."""

    def test_groups_close_y(self):
        # Tops of 100 / 101 / 102 are close enough to form one row.
        placements = (("A", 0, 100), ("B", 20, 101), ("C", 40, 102))
        words = [_w(text, x0, top) for text, x0, top in placements]
        rows = cluster_rows(words)
        assert len(rows) == 1
        first_row_texts = [w.text for w in rows[0]]
        assert first_row_texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        words = [_w("A", 0, 100), _w("B", 0, 120)]
        row_texts = [[w.text for w in r] for r in cluster_rows(words)]
        assert row_texts == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Input is deliberately out of x-order.
        words = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(words)[0]
        assert [w.text for w in first_row] == ["A", "B", "C"]

    def test_empty(self):
        rows = cluster_rows([])
        assert rows == []
|
|
|
|
|
|
class TestFindDatesInWords:
    """``_find_dates_in_words`` yields ``[(start, end, text)]`` where
    ``end`` is the exclusive index of the words the date consumed."""

    def test_us_slash(self):
        row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(row)
        assert found == [(0, 1, "01/15/2026")]

    def test_two_digit_year(self):
        row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        found = _find_dates_in_words(row)
        assert found
        assert found[0][2] == "01/15/26"

    def test_iso(self):
        row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        found = _find_dates_in_words(row)
        assert found == [(0, 1, "2026-01-15")]

    def test_month_name_with_year_consumes_three_words(self):
        row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        found = _find_dates_in_words(row)
        assert found
        first = found[0]
        assert "Jan 15" in first[2]
        # All three words belong to the date, so none of them can
        # leak into the description.
        assert first[1] == 3

    def test_short_us_date_no_year(self):
        """A Chase-style ``01/13`` with no year is still detected."""
        row = [
            _w("01/13", 0, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 200, 0),
        ]
        found = _find_dates_in_words(row)
        assert found
        first = found[0]
        assert first[2] == "01/13"
        # Exactly one word was consumed.
        assert first[1] == 1

    def test_short_month_name_no_year_consumes_two_words(self):
        row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
        found = _find_dates_in_words(row)
        assert found
        first = found[0]
        assert "Jan 13" in first[2]
        # Both "Jan" and "13" were consumed.
        assert first[1] == 2

    def test_short_pattern_does_not_shadow_full_year(self):
        """Short patterns must not steal from a full-year date: for
        ``Page 1/2 of 3 ... 01/13/2026 Coffee`` the real
        ``01/13/2026`` has to come back first."""
        row = [
            _w("Page", 0, 0),
            _w("1/2", 40, 0),
            _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0),
            _w("Coffee", 300, 0),
        ]
        found = _find_dates_in_words(row)
        # The full-year match occupies position 0 of the result.
        assert found[0][2] == "01/13/2026"

    def test_multiple_dates_returned_in_position_order(self):
        """Chase-style row carrying both posting and txn dates."""
        row = [
            _w("01/13", 0, 0),
            _w("01/14", 50, 0),
            _w("Coffee", 100, 0),
            _w("$4.50", 200, 0),
        ]
        found = _find_dates_in_words(row)
        assert len(found) == 2
        first, second = found[0], found[1]
        assert first[2] == "01/13"
        assert second[2] == "01/14"
        # The first date claims word 0, the second claims word 1.
        assert first[:2] == (0, 1)
        assert second[:2] == (1, 2)

    def test_no_date(self):
        row = [_w("Just", 0, 0), _w("text", 50, 0)]
        found = _find_dates_in_words(row)
        assert found == []
|
|
|
|
|
|
class TestFindAmountTokens:
    """Pins which words ``_find_amount_tokens`` accepts as amounts."""

    def test_currency_format(self):
        row = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(row)
        assert len(tokens) == 1
        assert tokens[0][2] == "$4.50"

    def test_parens_negative(self):
        row = [_w("(123.45)", 0, 0)]
        tokens = _find_amount_tokens(row)
        assert tokens
        assert tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        row = [_w("Hello", 0, 0), _w("World", 50, 0)]
        tokens = _find_amount_tokens(row)
        assert tokens == []

    def test_rejects_bare_year(self):
        # A lone 4-digit year fits the digit pattern but carries no
        # money marker, so it has to be filtered out.
        row = [_w("2026", 0, 0)]
        tokens = _find_amount_tokens(row)
        assert tokens == []
|
|
|
|
|
|
# End-to-end tests against synthetic Page objects are in the smoke
|
|
# test module — they need ``scan_pdf_for_transactions`` which in
|
|
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
|
# the building blocks; smoke tests pin the wiring.
|
|
|
|
|
|
class TestFormatAmount:
    """Consistent two-decimal rendering at the display/export layer."""

    def test_drops_no_trailing_zeros(self):
        # The original bug: 4.5 must NOT come out as "4.5" —
        # accountants need the same precision on every row.
        cases = ((4.5, "4.50"), (12.0, "12.00"), (1000, "1000.00"))
        for value, rendered in cases:
            assert format_amount(value) == rendered

    def test_negatives(self):
        cases = ((-40.0, "-40.00"), (-4.5, "-4.50"))
        for value, rendered in cases:
            assert format_amount(value) == rendered

    def test_none_and_empty(self):
        for missing in (None, ""):
            assert format_amount(missing) == ""

    def test_string_passthrough(self):
        # The scanner kept ``(4.50)`` as text because parsing failed;
        # the user sees the raw value and can fix it in the editor.
        raw = "(4.50)"
        assert format_amount(raw) == "(4.50)"

    def test_bool_doesnt_render_as_number(self):
        # bool subclasses int — the guard stops True → "1.00".
        cases = ((True, "True"), (False, "False"))
        for flag, rendered in cases:
            assert format_amount(flag) == rendered

    def test_nan_inf_become_empty(self):
        for non_finite in (float("nan"), float("inf")):
            assert format_amount(non_finite) == ""

    def test_custom_places(self):
        four_places = format_amount(4.5, places=4)
        zero_places = format_amount(4.567, places=0)
        assert four_places == "4.5000"
        assert zero_places == "5"
|
|
|
|
|
|
class TestFormatDate:
    """Pins ``format_date`` re-rendering ISO dates in other formats."""

    def test_yyyymmdd(self):
        rendered = format_date("2026-01-13", "%Y%m%d")
        assert rendered == "20260113"

    def test_iso_passthrough(self):
        rendered = format_date("2026-01-13", "%Y-%m-%d")
        assert rendered == "2026-01-13"

    def test_us(self):
        rendered = format_date("2026-01-13", "%m/%d/%Y")
        assert rendered == "01/13/2026"

    def test_invalid_input_passes_through(self):
        # Input that is not ISO comes back unchanged, so the user
        # sees what was really there instead of a silent blank.
        rendered = format_date("01/13", "%Y%m%d")
        assert rendered == "01/13"

    def test_none_or_empty(self):
        for missing in (None, ""):
            assert format_date(missing) == ""

    def test_default_is_iso(self):
        # The default format is ISO ``YYYY-MM-DD`` — it sorts
        # naturally and parses in every spreadsheet tool.
        rendered = format_date("2026-01-13")
        assert rendered == "2026-01-13"
|
|
|
|
|
|
class TestExtractAccountNumber:
    """Pins label-anchored account number extraction from page text."""

    def test_masked(self):
        header = "Customer Name\nAccount Number: ****1234\nBalance"
        found = _extract_account_number(header)
        assert found == "****1234"

    def test_with_hyphens(self):
        found = _extract_account_number("Account #: 1234-5678-9012")
        assert found == "1234-5678-9012"

    def test_with_spaces(self):
        found = _extract_account_number("Account: 1234 5678 9012")
        assert found == "1234 5678 9012"

    def test_no_label_no_match(self):
        unlabeled = "Just some text with 1234567890 in it"
        found = _extract_account_number(unlabeled)
        assert found is None

    def test_requires_at_least_four_digits(self):
        # An "account" label followed by nothing but XX is not a
        # usable account number.
        found = _extract_account_number("Account: XX")
        assert found is None
|
|
|
|
|
|
class TestExtractStatementPeriod:
    """Pins extraction of the ``(start, end)`` statement period."""

    def test_standard_period(self):
        header = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
        period = _extract_statement_period(header)
        period_start, period_end = period
        assert period_start == "2025-01-01"
        assert period_end == "2025-01-31"

    def test_from_to(self):
        period = _extract_statement_period("From 01/01/2025 to 01/31/2025")
        period_start, period_end = period
        assert period_start == "2025-01-01"
        assert period_end == "2025-01-31"

    def test_single_date_both_fields(self):
        # A single date next to the label fills both start and end.
        period = _extract_statement_period("Statement Date: 01/31/2025")
        period_start, period_end = period
        assert period_start == "2025-01-31"
        assert period_end == "2025-01-31"

    def test_no_label_no_match(self):
        unlabeled = "Some random text with 01/01/2025 in it"
        period_start, period_end = _extract_statement_period(unlabeled)
        # There is no "Period" / "From" / "Statement Date" label.
        assert (period_start, period_end) == (None, None)
|
|
|
|
|
|
class TestExtractStatementMetadata:
    """Pins ``extract_statement_metadata`` over synthetic pages."""

    def test_full_header(self):
        header_text = (
            "ACME BANK\n"
            "Customer: John Doe\n"
            "Account Number: ****5678\n"
            "Statement Period: 01/01/2025 - 01/31/2025\n"
            "Beginning balance: $1,000.00\n"
        )
        first_page = Page(
            page_no=1,
            width=600,
            height=800,
            text=header_text,
            words=[],
        )
        meta = extract_statement_metadata([first_page])
        assert meta["account_number"] == "****5678"
        assert meta["period_start"] == "2025-01-01"
        assert meta["period_end"] == "2025-01-31"

    def test_no_pages(self):
        expected = {
            "account_number": None,
            "period_start": None,
            "period_end": None,
        }
        assert extract_statement_metadata([]) == expected

    def test_fallback_to_page_two(self):
        # The account number is on page 1; the period only shows up
        # on page 2.
        account_page = Page(
            page_no=1,
            width=600,
            height=800,
            text="Account Number: ****1234\nBalance summary",
            words=[],
        )
        period_page = Page(
            page_no=2,
            width=600,
            height=800,
            text="Statement Period: 02/01/2025 - 02/28/2025",
            words=[],
        )
        meta = extract_statement_metadata([account_page, period_page])
        assert meta["account_number"] == "****1234"
        assert meta["period_start"] == "2025-02-01"
        assert meta["period_end"] == "2025-02-28"
|
|
|
|
|
|
class TestInferYearForShortDate:
    """Year inference that is aware of the Dec/Jan boundary: the year
    chosen is the one whose candidate date falls inside (or nearest
    to) the statement period."""

    def test_within_period_uses_period_year(self):
        resolved = _infer_year_for_short_date(
            "01/13", "2025-01-01", "2025-01-31",
        )
        assert resolved == "2025-01-13"

    def test_dec_jan_boundary_dec_resolves_to_start_year(self):
        # Period runs 2024-12-16 → 2025-01-15. A "12/30" row belongs
        # to 2024-12-30 (inside the period), not to 2025.
        resolved = _infer_year_for_short_date(
            "12/30", "2024-12-16", "2025-01-15",
        )
        assert resolved == "2024-12-30"

    def test_dec_jan_boundary_jan_resolves_to_end_year(self):
        # Same period; "01/05" is 2025-01-05 (inside), not 2024.
        resolved = _infer_year_for_short_date(
            "01/05", "2024-12-16", "2025-01-15",
        )
        assert resolved == "2025-01-05"

    def test_just_before_period_picks_closer_year(self):
        # "12/15" falls one day before the period start (2024-12-16):
        # 2024-12-15 is 1 day away, 2025-12-15 is 11 months away, so
        # the nearer candidate wins.
        resolved = _infer_year_for_short_date(
            "12/15", "2024-12-16", "2025-01-15",
        )
        assert resolved == "2024-12-15"

    def test_override_beats_period(self):
        resolved = _infer_year_for_short_date(
            "01/13", "2025-01-01", "2025-01-31",
            override_year=2030,
        )
        assert resolved == "2030-01-13"

    def test_filename_hint_when_no_period(self):
        resolved = _infer_year_for_short_date(
            "01/13", None, None, filename_year_hint=2025,
        )
        assert resolved == "2025-01-13"

    def test_no_signal_returns_none(self):
        resolved = _infer_year_for_short_date("01/13", None, None)
        assert resolved is None

    def test_short_dash(self):
        resolved = _infer_year_for_short_date(
            "01-13", "2025-01-01", "2025-01-31",
        )
        assert resolved == "2025-01-13"

    def test_month_name(self):
        resolved = _infer_year_for_short_date(
            "Jan 13", "2025-01-01", "2025-01-31",
        )
        assert resolved == "2025-01-13"

    def test_unparseable(self):
        resolved = _infer_year_for_short_date(
            "xx/yy", "2025-01-01", "2025-01-31",
        )
        assert resolved is None
|
|
|
|
|
|
class TestYearFromFilename:
    """Pins ``year_from_filename`` pulling a year hint from a name."""

    def test_estmt_pattern(self):
        year = year_from_filename("eStmt_2025-01-13.pdf")
        assert year == 2025

    def test_year_embedded(self):
        year = year_from_filename("chase-2024-statement.pdf")
        assert year == 2024

    def test_no_year(self):
        year = year_from_filename("statement.pdf")
        assert year is None

    def test_rejects_non_20XX(self):
        # A long digit run is present, but nothing 20XX-shaped.
        year = year_from_filename("doc-1234567890.pdf")
        assert year is None

    def test_first_match_wins(self):
        # Some filenames carry both the period start and end years.
        name = "statement-2024-12-16-to-2025-01-15.pdf"
        assert year_from_filename(name) == 2024

    def test_empty_filename(self):
        for missing in ("", None):
            assert year_from_filename(missing) is None
|
|
|
|
|
|
class TestBundledTesseractPath:
    """Tesseract discovery inside a frozen installer / portable build.

    The build agent ships the binary at
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and the language data
    at ``<sys._MEIPASS>/tesseract/tessdata``. These tests pin the
    runtime half of that contract."""

    @staticmethod
    def _pretend_frozen(monkeypatch, meipass):
        """Make ``sys`` look like a frozen bundle rooted at *meipass*."""
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(meipass), raising=False)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # Plain dev environment: neither frozen marker is set.
        for marker in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(marker, raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Defensive case: ``sys.frozen`` is true yet ``_MEIPASS`` is
        # absent. Real PyInstaller bundles should never look like
        # this, but the helpers must not NoneType-explode if they do.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        wanted = tmp_path.joinpath("tesseract", "tesseract")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._pretend_frozen(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Darwin")
        wanted = tmp_path.joinpath("tesseract", "tesseract")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._pretend_frozen(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Windows")
        wanted = tmp_path.joinpath("tesseract", "tesseract.exe")
        assert _bundled_tesseract_path() == wanted

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        self._pretend_frozen(monkeypatch, tmp_path)
        wanted = tmp_path.joinpath("tesseract", "tessdata")
        assert _bundled_tessdata_dir() == wanted
|
|
|
|
|
|
class TestAutodetectFavoursBundled:
    """When a bundled binary exists, ``_autodetect_tesseract_path``
    should return it BEFORE falling through to Windows install
    locations — frozen builds shouldn't depend on the user's
    system tesseract even on Windows.

    Both tests clear ``DATATOOLS_TESSERACT_PATH`` first so a
    developer's personal override cannot change the outcome."""

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        # Presumably the env override is consulted ahead of every
        # other probe; remove it so only bundled-vs-system is tested.
        monkeypatch.delenv("DATATOOLS_TESSERACT_PATH", raising=False)
        # Simulate frozen Windows build with a bundled binary on disk.
        bundle_root = tmp_path / "bundle"
        bundled_bin = bundle_root / "tesseract" / "tesseract.exe"
        bundled_bin.parent.mkdir(parents=True)
        bundled_bin.write_bytes(b"")
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(bundle_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")
        # Pretend the Program Files install also exists — bundled
        # should still win because we probe it first. The stub takes
        # arbitrary extra arguments because ``Path.exists`` accepts a
        # ``follow_symlinks`` keyword on Python 3.12+.
        monkeypatch.setattr(
            Path, "exists", lambda self, *args, **kwargs: True,
        )
        assert pdf_extract._autodetect_tesseract_path() == str(bundled_bin)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Same isolation as above: no user override in play.
        monkeypatch.delenv("DATATOOLS_TESSERACT_PATH", raising=False)
        # Dev: not frozen, not Windows → no candidate at all.
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None
|
|
|
|
|
|
class TestApplyBundledTessdataPrefix:
    """``TESSDATA_PREFIX`` env var handling — bundled data should be
    pointed at without clobbering a user override.

    The function under test writes to ``os.environ`` directly, so
    every test that starts from "variable unset" goes through
    ``_unset_prefix`` to guarantee the environment is restored
    afterwards."""

    @staticmethod
    def _unset_prefix(monkeypatch):
        """Remove ``TESSDATA_PREFIX`` in a way monkeypatch can undo.

        ``monkeypatch.delenv(..., raising=False)`` records nothing
        when the variable is already absent, so a value written later
        by the code under test would leak into subsequent tests.
        Setting it first forces monkeypatch to remember the original
        (possibly absent) state and restore it at teardown.
        """
        monkeypatch.setenv("TESSDATA_PREFIX", "")
        monkeypatch.delenv("TESSDATA_PREFIX")

    def test_no_op_when_not_frozen(self, monkeypatch):
        self._unset_prefix(monkeypatch)
        monkeypatch.delattr("sys.frozen", raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert os.environ.get("TESSDATA_PREFIX") == str(tessdata)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        tessdata = tmp_path / "tesseract" / "tessdata"
        tessdata.mkdir(parents=True)
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        # setenv records the previous state itself, so no helper here.
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")
        _apply_bundled_tessdata_prefix()
        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen, but the build didn't ship a tessdata dir.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(tmp_path), raising=False)
        self._unset_prefix(monkeypatch)
        _apply_bundled_tessdata_prefix()
        assert "TESSDATA_PREFIX" not in os.environ
|