feat(pdf): Dec/Jan-aware year inference + filename hint + override
Previous year inference picked ``period_end_iso[:4]`` for every
short date, which fails on statements that cross the Dec/Jan
boundary. A "12/30" row in a 2024-12-16 to 2025-01-15 statement
got 2025-12-30 (wrong) instead of 2024-12-30.
New cascade for ``_infer_year_for_short_date``:
1. **``override_year``** — caller supplies it (new ``"Override
year for short dates"`` field in Scan options). Beats every
heuristic. Empty by default; the page validates that the value
is a four-digit integer in the range 1900-2100 and falls back to
automatic inference on invalid input.
2. **Statement period start + end** — the function now takes
BOTH dates and generates candidates with every distinct year
in the period (one year for same-year statements, two for
Dec/Jan boundaries). The picker scores each candidate by
distance from the period: candidates inside the period
score 0, candidates outside score ``min(|days from start|,
|days from end|)``. Lowest-distance candidate wins. So:
- ``12/30`` + period 2024-12-16 to 2025-01-15 → 2024-12-30
(inside period, score 0)
- ``01/05`` + same period → 2025-01-05 (inside, score 0)
- ``12/15`` + same period → 2024-12-15 (1 day before the period
start; closer than 2025-12-15, which is 11 months after its end)
3. **``filename_year_hint``** — fallback when the statement
period regex misses the bank's specific layout. The page
passes ``year_from_filename(upload.name)`` automatically so
files like ``eStmt_2025-01-13.pdf`` get year 2025 even if
the PDF's text doesn't yield a parseable period. The regex
matches the first ``20XX`` token bounded by non-digits.
Both new helpers (``year_from_filename`` and the new
``_try_short_date_with_year`` factor-out) are exported and
tested. 16 new tests cover: within-period inference (same-year
sanity), Dec/Jan boundary cases for both sides, the
just-before-period closer-distance case, override priority,
filename fallback, no-signal None, dash-format / month-name
shorthand round-trip, garbage input, filename year extraction
(eStmt pattern, embedded, no-match, non-20XX rejection, first-match-wins, empty).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,7 @@ from src.pdf_extract import (
|
||||
format_date,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
year_from_filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -367,17 +368,86 @@ class TestExtractStatementMetadata:
|
||||
|
||||
|
||||
class TestInferYearForShortDate:
|
||||
def test_us_short_with_period_end(self):
|
||||
assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13"
|
||||
"""The Dec/Jan-boundary-aware year inference. Picks the year
|
||||
whose candidate date lands inside (or closest to) the period."""
|
||||
|
||||
def test_within_period_uses_period_year(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_dec_jan_boundary_dec_resolves_to_start_year(self):
|
||||
# Statement period: 2024-12-16 → 2025-01-15
|
||||
# Row "12/30" → should be 2024-12-30 (in period), not 2025.
|
||||
assert _infer_year_for_short_date(
|
||||
"12/30", "2024-12-16", "2025-01-15",
|
||||
) == "2024-12-30"
|
||||
|
||||
def test_dec_jan_boundary_jan_resolves_to_end_year(self):
|
||||
# Same period; "01/05" → 2025-01-05 (in period), not 2024.
|
||||
assert _infer_year_for_short_date(
|
||||
"01/05", "2024-12-16", "2025-01-15",
|
||||
) == "2025-01-05"
|
||||
|
||||
def test_just_before_period_picks_closer_year(self):
|
||||
# "12/15" is one day before period start (2024-12-16).
|
||||
# 2024-12-15 is 1 day off; 2025-12-15 is 11 months off.
|
||||
# The closer-by-distance candidate wins.
|
||||
assert _infer_year_for_short_date(
|
||||
"12/15", "2024-12-16", "2025-01-15",
|
||||
) == "2024-12-15"
|
||||
|
||||
def test_override_beats_period(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", "2025-01-01", "2025-01-31",
|
||||
override_year=2030,
|
||||
) == "2030-01-13"
|
||||
|
||||
def test_filename_hint_when_no_period(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", None, None, filename_year_hint=2025,
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_no_signal_returns_none(self):
|
||||
assert _infer_year_for_short_date("01/13", None, None) is None
|
||||
|
||||
def test_short_dash(self):
|
||||
assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13"
|
||||
assert _infer_year_for_short_date(
|
||||
"01-13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_month_name(self):
|
||||
assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13"
|
||||
|
||||
def test_no_period_end(self):
|
||||
assert _infer_year_for_short_date("01/13", None) is None
|
||||
assert _infer_year_for_short_date(
|
||||
"Jan 13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_unparseable(self):
|
||||
assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None
|
||||
assert _infer_year_for_short_date(
|
||||
"xx/yy", "2025-01-01", "2025-01-31",
|
||||
) is None
|
||||
|
||||
|
||||
class TestYearFromFilename:
|
||||
def test_estmt_pattern(self):
|
||||
assert year_from_filename("eStmt_2025-01-13.pdf") == 2025
|
||||
|
||||
def test_year_embedded(self):
|
||||
assert year_from_filename("chase-2024-statement.pdf") == 2024
|
||||
|
||||
def test_no_year(self):
|
||||
assert year_from_filename("statement.pdf") is None
|
||||
|
||||
def test_rejects_non_20XX(self):
|
||||
# Filename contains a long number but no 20XX-shaped year.
|
||||
assert year_from_filename("doc-1234567890.pdf") is None
|
||||
|
||||
def test_first_match_wins(self):
|
||||
# Filenames sometimes carry both period start and end years.
|
||||
assert (
|
||||
year_from_filename("statement-2024-12-16-to-2025-01-15.pdf")
|
||||
== 2024
|
||||
)
|
||||
|
||||
def test_empty_filename(self):
|
||||
assert year_from_filename("") is None
|
||||
assert year_from_filename(None) is None
|
||||
|
||||
Reference in New Issue
Block a user