Files
datatools-dev/tests/test_pdf_extract.py
Michael ad7c22d7fb fix(pdf): consistent 2-decimal amount precision in display and CSV
User reported amounts losing trailing zeros — 4.50 rendering as
4.5, 1000.00 as 1000 — on the same statement. Classic float
display issue: Python's native float ``repr`` keeps only the
shortest round-tripping digits (``repr(4.50)`` is ``'4.5'``), and
pandas / Streamlit happily show that inconsistency cell-by-cell.

Two layers of fix, internal type stays ``float`` for arithmetic:

**Display.** ``st.column_config.NumberColumn(format="%.2f")``
applied programmatically to every ``amount_*`` column on the
data_editor. Every numeric amount now shows with exactly two
decimal places regardless of trailing zeros.

**CSV export.** Pandas' default float-to-CSV writer also drops
trailing zeros (the same issue an accountant would see when
opening the file in Excel). Before serialising, each amount
column is mapped through the new ``format_amount`` helper —
returns ``f"{v:.2f}"`` for numerics, empty string for
None/NaN/inf, ``str(value)`` for booleans (guards the
``True → "1.00"`` foot-gun since ``bool`` is an ``int``
subclass), and passes through any string the scanner kept
because parsing failed (e.g. ``(4.50)`` when parens-negative is
off — user can correct in the editor before re-exporting).

``format_amount`` lives in ``src/pdf_extract.py`` so it's
testable in isolation (the page module can't easily be unit
tested because of its Streamlit import chain). 7 new tests
cover the trailing-zeros case, negatives, None/empty,
string-passthrough, bool guard, NaN/inf, and the ``places``
parameter.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 01:27:16 +00:00

384 lines
13 KiB
Python

"""Tests for the minimal PDF transaction scanner.
The public API is one function: ``scan_pdf_for_transactions``.
These unit tests pin its building blocks: the value-parsing and
formatting helpers, the row clusterer, the date/amount token
finders, and the statement-metadata extractors. They run against
synthetic ``WordBox`` / ``Page`` objects with no real PDF involved.
End-to-end coverage of the scanner itself lives in
``test_pdf_extract_smoke.py``, which uses ``fpdf2`` to generate
a fixture statement at test time.
"""
from __future__ import annotations
from src.pdf_extract import (
Page,
WordBox,
_extract_account_number,
_extract_statement_period,
_find_amount_tokens,
_find_dates_in_words,
_infer_year_for_short_date,
cluster_rows,
extract_statement_metadata,
format_amount,
format_date,
parse_amount,
parse_date,
)
def _w(text: str, x0: float, top: float, x1: float | None = None) -> WordBox:
    """Build a synthetic ``WordBox`` for use in the tests below.

    When ``x1`` is omitted the right edge is derived from the text
    length (8 units per character past ``x0``). The bottom edge is
    always ``top + 10``.
    """
    if x1 is None:
        right_edge = x0 + 8 * len(text)
    else:
        right_edge = x1
    return WordBox(x0=x0, top=top, x1=right_edge, bottom=top + 10, text=text)
class TestParseAmount:
    """Exercises ``parse_amount`` over sign, currency and locale variants."""

    def test_plain_positive(self):
        parsed = parse_amount("1234.56")
        assert parsed == 1234.56

    def test_currency_and_thousands(self):
        parsed = parse_amount("$1,234.56")
        assert parsed == 1234.56

    def test_parens_negative(self):
        parsed = parse_amount("(1,234.56)")
        assert parsed == -1234.56

    def test_leading_minus(self):
        parsed = parse_amount("-100.00")
        assert parsed == -100.0

    def test_trailing_minus(self):
        parsed = parse_amount("100.00-")
        assert parsed == -100.0

    def test_blank_returns_none(self):
        # Empty, whitespace-only and missing input are all expected
        # to come back as None.
        for blank in ("", " ", None):
            assert parse_amount(blank) is None

    def test_garbage_returns_none(self):
        parsed = parse_amount("not a number")
        assert parsed is None

    def test_european_decimal(self):
        european = {"decimal": ",", "thousands": ".", "currency_strip": ""}
        parsed = parse_amount("€1.234,56", **european)
        assert parsed == 1234.56

    def test_parens_off_disables_paren_negative(self):
        # With paren handling switched off, "(4.50)" must not be
        # flipped to a negative. It is not a plain number either, so
        # the expected outcome pinned here is None.
        parsed = parse_amount("(4.50)", negative_in_parens=False)
        assert parsed is None
class TestParseDate:
    """Exercises ``parse_date`` with explicit formats and without any."""

    def test_us_slash(self):
        formats = ["%m/%d/%Y"]
        assert parse_date("01/15/2026", formats) == "2026-01-15"

    def test_iso(self):
        formats = ["%Y-%m-%d"]
        assert parse_date("2026-01-15", formats) == "2026-01-15"

    def test_fallback_format(self):
        # No format list is supplied, so parsing has to succeed via
        # the fallback path.
        parsed = parse_date("01/15/26")
        assert parsed == "2026-01-15"

    def test_invalid(self):
        parsed = parse_date("not-a-date")
        assert parsed is None
class TestClusterRows:
    """Exercises ``cluster_rows`` grouping and within-row ordering."""

    def test_groups_close_y(self):
        # Tops of 100, 101 and 102 are expected to land in one row.
        boxes = [_w("A", 0, 100), _w("B", 20, 101), _w("C", 40, 102)]
        clustered = cluster_rows(boxes)
        assert len(clustered) == 1
        texts = [box.text for box in clustered[0]]
        assert texts == ["A", "B", "C"]

    def test_separates_far_y(self):
        # Tops of 100 and 120 are expected to be split into two rows.
        boxes = [_w("A", 0, 100), _w("B", 0, 120)]
        texts_per_row = [
            [box.text for box in row] for row in cluster_rows(boxes)
        ]
        assert texts_per_row == [["A"], ["B"]]

    def test_sorts_left_to_right_within_row(self):
        # Input is deliberately out of x order.
        boxes = [_w("C", 40, 100), _w("A", 0, 100), _w("B", 20, 100)]
        first_row = cluster_rows(boxes)[0]
        assert [box.text for box in first_row] == ["A", "B", "C"]

    def test_empty(self):
        clustered = cluster_rows([])
        assert clustered == []
class TestFindDatesInWords:
    """Pins ``_find_dates_in_words``.

    The finder returns ``[(start, end, text)]`` where ``end`` is the
    exclusive index of the words the date consumed.
    """

    def test_us_slash(self):
        row = [_w("01/15/2026", 0, 0), _w("Coffee", 100, 0)]
        assert _find_dates_in_words(row) == [(0, 1, "01/15/2026")]

    def test_two_digit_year(self):
        row = [_w("01/15/26", 0, 0), _w("Foo", 100, 0)]
        result = _find_dates_in_words(row)
        assert result and result[0][2] == "01/15/26"

    def test_iso(self):
        row = [_w("2026-01-15", 0, 0), _w("Tx", 100, 0)]
        assert _find_dates_in_words(row) == [(0, 1, "2026-01-15")]

    def test_month_name_with_year_consumes_three_words(self):
        row = [_w("Jan", 0, 0), _w("15,", 25, 0), _w("2026", 50, 0)]
        result = _find_dates_in_words(row)
        assert result and "Jan 15" in result[0][2]
        # Date consumes all 3 words so they don't leak to description.
        assert result[0][1] == 3

    def test_short_us_date_no_year(self):
        """Chase-style ``01/13`` without a year still detects."""
        row = [_w("01/13", 0, 0), _w("Coffee", 100, 0), _w("$4.50", 200, 0)]
        result = _find_dates_in_words(row)
        assert result and result[0][2] == "01/13"
        assert result[0][1] == 1  # one word consumed

    def test_short_month_name_no_year_consumes_two_words(self):
        row = [_w("Jan", 0, 0), _w("13", 30, 0), _w("Coffee", 100, 0)]
        result = _find_dates_in_words(row)
        assert result
        assert "Jan 13" in result[0][2]
        assert result[0][1] == 2  # "Jan" + "13" both consumed

    def test_short_pattern_does_not_shadow_full_year(self):
        """If a full-year date is present, short patterns shouldn't
        steal — e.g. ``Page 1/2 of 3 ... 01/13/2026 Coffee`` should
        return the real ``01/13/2026`` first."""
        row = [
            _w("Page", 0, 0), _w("1/2", 40, 0), _w("of", 80, 0),
            _w("3", 100, 0),
            _w("01/13/2026", 200, 0), _w("Coffee", 300, 0),
        ]
        result = _find_dates_in_words(row)
        # Guard first so a regression that finds nothing fails as a
        # readable assertion rather than an IndexError, matching the
        # other tests in this class.
        assert result
        # Full-year match wins position 0 in the returned list.
        assert result[0][2] == "01/13/2026"

    def test_multiple_dates_returned_in_position_order(self):
        """Chase-style transaction with both posting and txn dates."""
        row = [
            _w("01/13", 0, 0), _w("01/14", 50, 0),
            _w("Coffee", 100, 0), _w("$4.50", 200, 0),
        ]
        result = _find_dates_in_words(row)
        assert len(result) == 2
        assert result[0][2] == "01/13"
        assert result[1][2] == "01/14"
        # First date claims word 0, second claims word 1
        assert result[0][:2] == (0, 1)
        assert result[1][:2] == (1, 2)

    def test_no_date(self):
        row = [_w("Just", 0, 0), _w("text", 50, 0)]
        assert _find_dates_in_words(row) == []
class TestFindAmountTokens:
    """Exercises ``_find_amount_tokens`` on single synthetic rows."""

    def test_currency_format(self):
        words = [_w("Coffee", 0, 0), _w("$4.50", 100, 0)]
        tokens = _find_amount_tokens(words)
        assert len(tokens) == 1
        # The matched text sits at index 2 of each returned entry.
        assert tokens[0][2] == "$4.50"

    def test_parens_negative(self):
        words = [_w("(123.45)", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens
        assert tokens[0][2] == "(123.45)"

    def test_no_amount_on_pure_text(self):
        words = [_w("Hello", 0, 0), _w("World", 50, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == []

    def test_rejects_bare_year(self):
        # A lone 4-digit year looks numeric but carries no money
        # marker, so it is expected to be filtered out.
        words = [_w("2026", 0, 0)]
        tokens = _find_amount_tokens(words)
        assert tokens == []
# End-to-end tests against synthetic Page objects are in the smoke
# test module — they need ``scan_pdf_for_transactions`` which in
# turn uses ``extract_pages_auto``. The unit-test layer here pins
# the building blocks; smoke tests pin the wiring.
class TestFormatAmount:
    """Pins ``format_amount``: fixed decimal places for display and export."""

    def test_drops_no_trailing_zeros(self):
        # The reported bug: 4.5 must not render as "4.5", because
        # accountants need the same precision on every row.
        expected_by_value = [
            (4.5, "4.50"),
            (12.0, "12.00"),
            (1000, "1000.00"),
        ]
        for value, expected in expected_by_value:
            assert format_amount(value) == expected

    def test_negatives(self):
        for value, expected in [(-40.0, "-40.00"), (-4.5, "-4.50")]:
            assert format_amount(value) == expected

    def test_none_and_empty(self):
        for missing in (None, ""):
            assert format_amount(missing) == ""

    def test_string_passthrough(self):
        # The scanner kept ``(4.50)`` as raw text because parsing
        # failed; it should reach the user unchanged so they can
        # correct it in the editor.
        raw = "(4.50)"
        assert format_amount(raw) == "(4.50)"

    def test_bool_doesnt_render_as_number(self):
        # bool subclasses int; the guard stops True becoming "1.00".
        rendered_true = format_amount(True)
        rendered_false = format_amount(False)
        assert rendered_true == "True"
        assert rendered_false == "False"

    def test_nan_inf_become_empty(self):
        for non_finite in (float("nan"), float("inf")):
            assert format_amount(non_finite) == ""

    def test_custom_places(self):
        four_places = format_amount(4.5, places=4)
        assert four_places == "4.5000"
        zero_places = format_amount(4.567, places=0)
        assert zero_places == "5"
class TestFormatDate:
    """Exercises ``format_date`` on ISO input, non-ISO input and blanks."""

    def test_yyyymmdd(self):
        rendered = format_date("2026-01-13", "%Y%m%d")
        assert rendered == "20260113"

    def test_iso_passthrough(self):
        rendered = format_date("2026-01-13", "%Y-%m-%d")
        assert rendered == "2026-01-13"

    def test_us(self):
        rendered = format_date("2026-01-13", "%m/%d/%Y")
        assert rendered == "01/13/2026"

    def test_invalid_input_passes_through(self):
        # Non-ISO input is handed back untouched, so the user sees
        # what was actually there instead of a silent empty string.
        rendered = format_date("01/13", "%Y%m%d")
        assert rendered == "01/13"

    def test_none_or_empty(self):
        for missing in (None, ""):
            assert format_date(missing) == ""
class TestExtractAccountNumber:
    """Exercises ``_extract_account_number`` on labelled header text."""

    def test_masked(self):
        header = "Customer Name\nAccount Number: ****1234\nBalance"
        found = _extract_account_number(header)
        assert found == "****1234"

    def test_with_hyphens(self):
        header = "Account #: 1234-5678-9012"
        found = _extract_account_number(header)
        assert found == "1234-5678-9012"

    def test_with_spaces(self):
        header = "Account: 1234 5678 9012"
        found = _extract_account_number(header)
        assert found == "1234 5678 9012"

    def test_no_label_no_match(self):
        # A digit run with no "Account" label must not match.
        header = "Just some text with 1234567890 in it"
        found = _extract_account_number(header)
        assert found is None

    def test_requires_at_least_four_digits(self):
        # An "Account" label followed only by XX shouldn't count.
        header = "Account: XX"
        found = _extract_account_number(header)
        assert found is None
class TestExtractStatementPeriod:
    """Exercises ``_extract_statement_period``, which yields a start/end pair."""

    def test_standard_period(self):
        header = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
        first, last = _extract_statement_period(header)
        assert (first, last) == ("2025-01-01", "2025-01-31")

    def test_from_to(self):
        header = "From 01/01/2025 to 01/31/2025"
        first, last = _extract_statement_period(header)
        assert (first, last) == ("2025-01-01", "2025-01-31")

    def test_single_date_both_fields(self):
        # A lone date next to the label is reported as both the
        # start and the end of the period.
        header = "Statement Date: 01/31/2025"
        first, last = _extract_statement_period(header)
        assert (first, last) == ("2025-01-31", "2025-01-31")

    def test_no_label_no_match(self):
        # No "Period" / "From" / "Statement Date" label is present.
        header = "Some random text with 01/01/2025 in it"
        first, last = _extract_statement_period(header)
        assert first is None
        assert last is None
class TestExtractStatementMetadata:
    """Exercises ``extract_statement_metadata`` on synthetic ``Page`` lists."""

    def test_full_header(self):
        header_text = (
            "ACME BANK\n"
            "Customer: John Doe\n"
            "Account Number: ****5678\n"
            "Statement Period: 01/01/2025 - 01/31/2025\n"
            "Beginning balance: $1,000.00\n"
        )
        only_page = Page(
            page_no=1, width=600, height=800, text=header_text, words=[],
        )
        found = extract_statement_metadata([only_page])
        assert found["account_number"] == "****5678"
        assert found["period_start"] == "2025-01-01"
        assert found["period_end"] == "2025-01-31"

    def test_no_pages(self):
        # With nothing to scan, every field is expected to be None.
        found = extract_statement_metadata([])
        expected = dict.fromkeys(
            ("account_number", "period_start", "period_end"),
        )
        assert found == expected

    def test_fallback_to_page_two(self):
        # Page 1 carries only the account number; the statement
        # period appears on page 2.
        first_page = Page(
            page_no=1, width=600, height=800,
            text="Account Number: ****1234\nBalance summary",
            words=[],
        )
        second_page = Page(
            page_no=2, width=600, height=800,
            text="Statement Period: 02/01/2025 - 02/28/2025",
            words=[],
        )
        found = extract_statement_metadata([first_page, second_page])
        assert found["account_number"] == "****1234"
        assert found["period_start"] == "2025-02-01"
        assert found["period_end"] == "2025-02-28"
class TestInferYearForShortDate:
    """Exercises ``_infer_year_for_short_date`` against a period-end date."""

    def test_us_short_with_period_end(self):
        inferred = _infer_year_for_short_date("01/13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_short_dash(self):
        inferred = _infer_year_for_short_date("01-13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_month_name(self):
        inferred = _infer_year_for_short_date("Jan 13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_no_period_end(self):
        # With no period end supplied, the expected result is None.
        inferred = _infer_year_for_short_date("01/13", None)
        assert inferred is None

    def test_unparseable(self):
        inferred = _infer_year_for_short_date("xx/yy", "2025-01-31")
        assert inferred is None