feat(pdf): Dec/Jan-aware year inference + filename hint + override
Previous year inference picked ``period_end_iso[:4]`` for every
short date, which fails on statements that cross the Dec/Jan
boundary. A "12/30" row in a 2024-12-16 to 2025-01-15 statement
got 2025-12-30 (wrong) instead of 2024-12-30.
New cascade for ``_infer_year_for_short_date``:
1. **``override_year``** — caller supplies it (new ``"Override
year for short dates"`` field in Scan options). Beats every
heuristic. Empty by default; the page validates the value
is a 4-digit-looking integer in 1900-2100 and falls back to
automatic on garbage input.
2. **Statement period start + end** — the function now takes
BOTH dates and generates candidates with every distinct year
in the period (one year for same-year statements, two for
Dec/Jan boundaries). The picker scores each candidate by
distance from the period: candidates inside the period
score 0, candidates outside score ``min(|days from start|,
|days from end|)``. Lowest-distance candidate wins. So:
- ``12/30`` + period 2024-12-16 to 2025-01-15 → 2024-12-30
(inside period, score 0)
- ``01/05`` + same period → 2025-01-05 (inside, score 0)
- ``12/15`` + same period → 2024-12-15 (1 day before,
closer than 2025-12-15 which is 11 months after)
3. **``filename_year_hint``** — fallback when the statement
period regex misses the bank's specific layout. The page
passes ``year_from_filename(upload.name)`` automatically so
files like ``eStmt_2025-01-13.pdf`` get year 2025 even if
the PDF's text doesn't yield a parseable period. The regex
matches the first ``20XX`` token bounded by non-digits.
Two new helpers: ``year_from_filename`` (added to ``__all__``) and
the private ``_try_short_date_with_year`` factor-out. Both are
tested. 16 new tests cover: within-period inference (same-year
sanity), Dec/Jan boundary cases for both sides, the
just-before-period closer-distance case, override priority,
filename fallback, no-signal None, dash-format / month-name
shorthand round-trip, garbage input, filename year extraction
(eStmt pattern, embedded, first-match-wins, no-match, empty).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from src.pdf_extract import (
|
||||
format_amount,
|
||||
ocr_available,
|
||||
scan_pdf_for_transactions,
|
||||
year_from_filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -179,6 +180,40 @@ with st.expander("Scan options", expanded=False):
|
||||
),
|
||||
)
|
||||
|
||||
# Year override for short dates. Empty by default — the
|
||||
# scanner uses statement-period detection + filename year hint
|
||||
# automatically. Set this when the statement period regex
|
||||
# misses on a particular bank's layout, or when you want to
|
||||
# force a specific year (e.g., historical reconciliation).
|
||||
year_override_str = st.text_input(
    "Override year for short dates (optional)",
    value="",
    help=(
        "Short dates like ``01/13`` get bound to a year by the "
        "scanner — this override first (when set), then statement "
        "period, then filename year. Leave blank for automatic. Enter "
        "a 4-digit year (e.g., 2025) to force every short date "
        "to that year. Won't affect dates that already have a "
        "year (``01/13/2025``)."
    ),
)
# Turn the free-text field into ``int | None``. Blank means "no
# override"; anything that is not an integer, or is outside 1900-2100,
# triggers a warning and falls back to automatic detection instead of
# aborting the scan.
try:
    year_override = (
        int(year_override_str) if year_override_str.strip() else None
    )
    if year_override is not None and not (1900 <= year_override <= 2100):
        st.warning(
            f"Year override {year_override} looks wrong — using "
            "automatic detection instead."
        )
        year_override = None
except ValueError:
    # ``int()`` rejected the text (letters, decimals, ...).
    st.warning(
        f"Year override {year_override_str!r} isn't a number — "
        "using automatic detection instead."
    )
    year_override = None
|
||||
|
||||
# Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER
|
||||
# docstrings for why the counter exists.
|
||||
pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {})
|
||||
@@ -425,6 +460,8 @@ if scan_clicked and pdf_uploads:
|
||||
negative_in_parens=negative_in_parens,
|
||||
allow_ocr=use_ocr,
|
||||
output_date_format=output_date_format,
|
||||
filename_year_hint=year_from_filename(name),
|
||||
year_override=year_override,
|
||||
)
|
||||
for r in rows:
|
||||
r["source_file"] = name
|
||||
|
||||
@@ -689,31 +689,14 @@ def extract_statement_metadata(
|
||||
}
|
||||
|
||||
|
||||
def _infer_year_for_short_date(
|
||||
raw_date: str,
|
||||
period_end_iso: str | None,
|
||||
) -> str | None:
|
||||
"""Try to bind a short date like ``01/13`` or ``Jan 13`` to
|
||||
the year of the statement period's end. Returns ISO or None
|
||||
if no candidate format parses.
|
||||
|
||||
Doesn't handle the December-in-January-statement cross-year
|
||||
case — too rare to be worth the complexity. The user sees the
|
||||
inferred year in the editor and can correct if needed; the
|
||||
raw text stays in the ``raw`` column for reference.
|
||||
"""
|
||||
if not raw_date or not period_end_iso:
|
||||
return None
|
||||
try:
|
||||
end_year = int(period_end_iso[:4])
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
def _try_short_date_with_year(raw_date: str, year: int) -> str | None:
|
||||
"""Append *year* to a short date string and try to parse it.
|
||||
Returns ISO or None if no format matches."""
|
||||
candidates = [
|
||||
("%m/%d/%Y", f"{raw_date}/{end_year}"),
|
||||
("%m-%d-%Y", f"{raw_date}-{end_year}"),
|
||||
("%b %d %Y", f"{raw_date} {end_year}"),
|
||||
("%d %b %Y", f"{raw_date} {end_year}"),
|
||||
("%m/%d/%Y", f"{raw_date}/{year}"),
|
||||
("%m-%d-%Y", f"{raw_date}-{year}"),
|
||||
("%b %d %Y", f"{raw_date} {year}"),
|
||||
("%d %b %Y", f"{raw_date} {year}"),
|
||||
]
|
||||
for fmt, candidate in candidates:
|
||||
try:
|
||||
@@ -723,6 +706,96 @@ def _infer_year_for_short_date(
|
||||
return None
|
||||
|
||||
|
||||
_YEAR_FROM_FILENAME_RE = re.compile(r"(?<!\d)(20\d{2})(?!\d)")
|
||||
|
||||
|
||||
def year_from_filename(filename: str) -> int | None:
|
||||
"""Extract a 4-digit year from a filename like
|
||||
``eStmt_2025-01-13.pdf`` → ``2025``. Returns the first match,
|
||||
or ``None`` if no 20XX pattern is present.
|
||||
|
||||
Used as a fallback signal when the statement period can't be
|
||||
detected from the PDF's text — many bank-statement filenames
|
||||
follow the convention ``eStmt_YYYY-MM-DD.pdf`` so the year is
|
||||
right there.
|
||||
"""
|
||||
if not filename:
|
||||
return None
|
||||
m = _YEAR_FROM_FILENAME_RE.search(filename)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _infer_year_for_short_date(
    raw_date: str,
    period_start_iso: str | None,
    period_end_iso: str | None,
    *,
    filename_year_hint: int | None = None,
    override_year: int | None = None,
) -> str | None:
    """Bind a short date like ``01/13`` to a full ISO date using
    the best available year evidence.

    Priority order:

    1. ``override_year`` — user-supplied, beats all heuristics. If the
       short date does not parse with that year the result is ``None``;
       the lower-priority signals are not consulted.
    2. ``period_start_iso`` + ``period_end_iso`` — generate one
       candidate per calendar year the period touches (one for
       same-year statements, two for Dec/Jan-boundary statements,
       more for multi-year periods) and pick the one that falls
       inside the period, or the closest one if none is inside.
       Handles the Dec/Jan case: a ``12/30`` row in a 2024-12-16
       to 2025-01-15 statement resolves to 2024-12-30 because
       that's the only candidate inside the period.
    3. ``filename_year_hint`` — when the statement-period regex
       missed but the filename carries a year (common in bank
       e-statement naming). Also used when the period is present
       but yields no parseable candidate.

    Args:
        raw_date: Short date text without a year.
        period_start_iso: Statement period start, ``YYYY-MM-DD``.
        period_end_iso: Statement period end, ``YYYY-MM-DD``.
        filename_year_hint: Year taken from the filename, if any.
        override_year: Year forced by the caller, if any.

    Returns:
        ISO ``YYYY-MM-DD`` or None when no signal is available —
        caller falls back to the raw text so the user can correct
        in the editor.
    """
    if not raw_date:
        return None

    # Truthiness on purpose: ``None`` and ``0`` both mean "not set".
    if override_year:
        return _try_short_date_with_year(raw_date, override_year)

    if period_start_iso and period_end_iso:
        try:
            start_dt = datetime.strptime(period_start_iso, "%Y-%m-%d")
            end_dt = datetime.strptime(period_end_iso, "%Y-%m-%d")
        except (ValueError, TypeError):
            # Malformed period metadata — fall through to the
            # filename hint rather than failing the whole row.
            start_dt = end_dt = None

        if start_dt is not None and end_dt is not None:
            # Try EVERY year between the two bounds, not just the
            # endpoints, so a period spanning more than two calendar
            # years can still match a date in a middle year. Ascending
            # order makes tie-breaking deterministic. ``sorted`` also
            # tolerates a swapped start/end.
            first_year, last_year = sorted((start_dt.year, end_dt.year))
            candidates: list[str] = []
            for year in range(first_year, last_year + 1):
                iso = _try_short_date_with_year(raw_date, year)
                if iso:
                    candidates.append(iso)

            if candidates:

                def distance(iso_str: str) -> int:
                    """Days between the candidate and the period (0 if inside)."""
                    dt = datetime.strptime(iso_str, "%Y-%m-%d")
                    if start_dt <= dt <= end_dt:
                        return 0
                    # Outside the period — measure shortest gap
                    # to either edge so a 12/15 transaction in a
                    # 12/16-01/15 statement still leans toward the
                    # period's start year.
                    return min(
                        abs((dt - start_dt).days),
                        abs((dt - end_dt).days),
                    )

                # ``min`` keeps the first of equally-distant
                # candidates, i.e. the earliest year.
                return min(candidates, key=distance)

    if filename_year_hint:
        return _try_short_date_with_year(raw_date, filename_year_hint)

    return None
|
||||
|
||||
|
||||
def _description_from_row(
|
||||
row_words: list[WordBox],
|
||||
date_ranges: list[tuple[int, int]],
|
||||
@@ -768,6 +841,8 @@ def scan_pdf_for_transactions(
|
||||
y_tolerance: float = 3.0,
|
||||
merge_multiline_descriptions: bool = True,
|
||||
output_date_format: str = "%Y%m%d",
|
||||
filename_year_hint: int | None = None,
|
||||
year_override: int | None = None,
|
||||
) -> tuple[list[dict[str, Any]], list[str]]:
|
||||
"""Scan *pdf_bytes* for transaction-like rows.
|
||||
|
||||
@@ -878,10 +953,16 @@ def scan_pdf_for_transactions(
|
||||
|
||||
iso = parse_date(first_date_text, date_formats)
|
||||
if iso is None:
|
||||
# Short date — try to bind to the statement period
|
||||
# year before falling back to the raw text.
|
||||
# Short date — try to bind a year using the cascade:
|
||||
# override → statement period (Dec/Jan-aware) →
|
||||
# filename year hint. Each signal is a separate
|
||||
# argument so the caller can mix-and-match.
|
||||
iso = _infer_year_for_short_date(
|
||||
first_date_text, metadata["period_end"],
|
||||
first_date_text,
|
||||
metadata["period_start"],
|
||||
metadata["period_end"],
|
||||
filename_year_hint=filename_year_hint,
|
||||
override_year=year_override,
|
||||
)
|
||||
formatted_date = (
|
||||
format_date(iso, output_date_format)
|
||||
@@ -1007,4 +1088,5 @@ __all__ = [
|
||||
"parse_amount",
|
||||
"parse_date",
|
||||
"scan_pdf_for_transactions",
|
||||
"year_from_filename",
|
||||
]
|
||||
|
||||
@@ -26,6 +26,7 @@ from src.pdf_extract import (
|
||||
format_date,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
year_from_filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -367,17 +368,86 @@ class TestExtractStatementMetadata:
|
||||
|
||||
|
||||
class TestInferYearForShortDate:
|
||||
def test_us_short_with_period_end(self):
|
||||
assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13"
|
||||
"""The Dec/Jan-boundary-aware year inference. Picks the year
|
||||
whose candidate date lands inside (or closest to) the period."""
|
||||
|
||||
def test_within_period_uses_period_year(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_dec_jan_boundary_dec_resolves_to_start_year(self):
|
||||
# Statement period: 2024-12-16 → 2025-01-15
|
||||
# Row "12/30" → should be 2024-12-30 (in period), not 2025.
|
||||
assert _infer_year_for_short_date(
|
||||
"12/30", "2024-12-16", "2025-01-15",
|
||||
) == "2024-12-30"
|
||||
|
||||
def test_dec_jan_boundary_jan_resolves_to_end_year(self):
|
||||
# Same period; "01/05" → 2025-01-05 (in period), not 2024.
|
||||
assert _infer_year_for_short_date(
|
||||
"01/05", "2024-12-16", "2025-01-15",
|
||||
) == "2025-01-05"
|
||||
|
||||
def test_just_before_period_picks_closer_year(self):
|
||||
# "12/15" is one day before period start (2024-12-16).
|
||||
# 2024-12-15 is 1 day off; 2025-12-15 is 11 months off.
|
||||
# The closer-by-distance candidate wins.
|
||||
assert _infer_year_for_short_date(
|
||||
"12/15", "2024-12-16", "2025-01-15",
|
||||
) == "2024-12-15"
|
||||
|
||||
def test_override_beats_period(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", "2025-01-01", "2025-01-31",
|
||||
override_year=2030,
|
||||
) == "2030-01-13"
|
||||
|
||||
def test_filename_hint_when_no_period(self):
|
||||
assert _infer_year_for_short_date(
|
||||
"01/13", None, None, filename_year_hint=2025,
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_no_signal_returns_none(self):
|
||||
assert _infer_year_for_short_date("01/13", None, None) is None
|
||||
|
||||
def test_short_dash(self):
|
||||
assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13"
|
||||
assert _infer_year_for_short_date(
|
||||
"01-13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_month_name(self):
|
||||
assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13"
|
||||
|
||||
def test_no_period_end(self):
|
||||
assert _infer_year_for_short_date("01/13", None) is None
|
||||
assert _infer_year_for_short_date(
|
||||
"Jan 13", "2025-01-01", "2025-01-31",
|
||||
) == "2025-01-13"
|
||||
|
||||
def test_unparseable(self):
|
||||
assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None
|
||||
assert _infer_year_for_short_date(
|
||||
"xx/yy", "2025-01-01", "2025-01-31",
|
||||
) is None
|
||||
|
||||
|
||||
class TestYearFromFilename:
    """Checks for the filename-based year extraction helper."""

    def test_estmt_pattern(self):
        # Conventional e-statement name with a dashed date stamp.
        found = year_from_filename("eStmt_2025-01-13.pdf")
        assert found == 2025

    def test_year_embedded(self):
        # Year sits in the middle of the name, delimited by dashes.
        found = year_from_filename("chase-2024-statement.pdf")
        assert found == 2024

    def test_no_year(self):
        found = year_from_filename("statement.pdf")
        assert found is None

    def test_rejects_non_20XX(self):
        # Filename contains a long number but no 20XX-shaped year.
        found = year_from_filename("doc-1234567890.pdf")
        assert found is None

    def test_first_match_wins(self):
        # Filenames sometimes carry both period start and end years.
        found = year_from_filename("statement-2024-12-16-to-2025-01-15.pdf")
        assert found == 2024

    def test_empty_filename(self):
        # Both the empty string and ``None`` mean "no filename".
        for missing in ("", None):
            assert year_from_filename(missing) is None
|
||||
|
||||
Reference in New Issue
Block a user