feat(pdf): Dec/Jan-aware year inference + filename hint + override
Previous year inference picked ``period_end_iso[:4]`` for every
short date, which fails on statements that cross the Dec/Jan
boundary. A "12/30" row in a 2024-12-16 to 2025-01-15 statement
got 2025-12-30 (wrong) instead of 2024-12-30.
New cascade for ``_infer_year_for_short_date``:
1. **``override_year``** — caller supplies it (new ``"Override
year for short dates"`` field in Scan options). Beats every
heuristic. Empty by default; the page validates the value
is a 4-digit-looking integer in 1900-2100 and falls back to
automatic on garbage input.
2. **Statement period start + end** — the function now takes
BOTH dates and generates candidates with every distinct year
in the period (one year for same-year statements, two for
Dec/Jan boundaries). The picker scores each candidate by
distance from the period: candidates inside the period
score 0, candidates outside score ``min(|days from start|,
|days from end|)``. Lowest-distance candidate wins. So:
- ``12/30`` + period 2024-12-16 to 2025-01-15 → 2024-12-30
(inside period, score 0)
- ``01/05`` + same period → 2025-01-05 (inside, score 0)
- ``12/15`` + same period → 2024-12-15 (1 day before,
closer than 2025-12-15 which is 11 months after)
3. **``filename_year_hint``** — fallback when the statement
period regex misses the bank's specific layout. The page
passes ``year_from_filename(upload.name)`` automatically so
files like ``eStmt_2025-01-13.pdf`` get year 2025 even if
the PDF's text doesn't yield a parseable period. The regex
matches the first ``20XX`` token bounded by non-digits.
Both new helpers (``year_from_filename`` and the new
``_try_short_date_with_year`` factor-out) are tested; only
``year_from_filename`` is added to ``__all__``
(``_try_short_date_with_year`` stays private).
16 new tests cover: within-period inference (same-year
sanity), Dec/Jan boundary cases for both sides, the
just-before-period closer-distance case, override priority,
filename fallback, no-signal None, dash-format / month-name
shorthand round-trip, garbage input, filename year extraction
(eStmt pattern, embedded, first-match-wins, no-match, empty).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -689,31 +689,14 @@ def extract_statement_metadata(
|
||||
}
|
||||
|
||||
|
||||
def _infer_year_for_short_date(
|
||||
raw_date: str,
|
||||
period_end_iso: str | None,
|
||||
) -> str | None:
|
||||
"""Try to bind a short date like ``01/13`` or ``Jan 13`` to
|
||||
the year of the statement period's end. Returns ISO or None
|
||||
if no candidate format parses.
|
||||
|
||||
Doesn't handle the December-in-January-statement cross-year
|
||||
case — too rare to be worth the complexity. The user sees the
|
||||
inferred year in the editor and can correct if needed; the
|
||||
raw text stays in the ``raw`` column for reference.
|
||||
"""
|
||||
if not raw_date or not period_end_iso:
|
||||
return None
|
||||
try:
|
||||
end_year = int(period_end_iso[:4])
|
||||
except (ValueError, IndexError):
|
||||
return None
|
||||
|
||||
def _try_short_date_with_year(raw_date: str, year: int) -> str | None:
|
||||
"""Append *year* to a short date string and try to parse it.
|
||||
Returns ISO or None if no format matches."""
|
||||
candidates = [
|
||||
("%m/%d/%Y", f"{raw_date}/{end_year}"),
|
||||
("%m-%d-%Y", f"{raw_date}-{end_year}"),
|
||||
("%b %d %Y", f"{raw_date} {end_year}"),
|
||||
("%d %b %Y", f"{raw_date} {end_year}"),
|
||||
("%m/%d/%Y", f"{raw_date}/{year}"),
|
||||
("%m-%d-%Y", f"{raw_date}-{year}"),
|
||||
("%b %d %Y", f"{raw_date} {year}"),
|
||||
("%d %b %Y", f"{raw_date} {year}"),
|
||||
]
|
||||
for fmt, candidate in candidates:
|
||||
try:
|
||||
@@ -723,6 +706,96 @@ def _infer_year_for_short_date(
|
||||
return None
|
||||
|
||||
|
||||
_YEAR_FROM_FILENAME_RE = re.compile(r"(?<!\d)(20\d{2})(?!\d)")
|
||||
|
||||
|
||||
def year_from_filename(filename: str) -> int | None:
|
||||
"""Extract a 4-digit year from a filename like
|
||||
``eStmt_2025-01-13.pdf`` → ``2025``. Returns the first match,
|
||||
or ``None`` if no 20XX pattern is present.
|
||||
|
||||
Used as a fallback signal when the statement period can't be
|
||||
detected from the PDF's text — many bank-statement filenames
|
||||
follow the convention ``eStmt_YYYY-MM-DD.pdf`` so the year is
|
||||
right there.
|
||||
"""
|
||||
if not filename:
|
||||
return None
|
||||
m = _YEAR_FROM_FILENAME_RE.search(filename)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _infer_year_for_short_date(
    raw_date: str,
    period_start_iso: str | None,
    period_end_iso: str | None,
    *,
    filename_year_hint: int | None = None,
    override_year: int | None = None,
) -> str | None:
    """Bind a short date like ``01/13`` to a full ISO date using
    the best available year evidence.

    Priority order:

    1. ``override_year`` — user-supplied, beats all heuristics.
    2. ``period_start_iso`` + ``period_end_iso`` — generate a
       candidate for every year from one before the period's
       earliest year to one after its latest year, then pick the
       candidate that falls inside the period, or the one closest
       to either edge if none is inside. Handles the Dec/Jan case:
       a ``12/30`` row in a 2024-12-16 to 2025-01-15 statement
       resolves to 2024-12-30 because that's the only candidate
       inside the period. The adjacent years matter for rows just
       outside a same-year period: ``12/31`` in a 2025-01-01 to
       2025-01-31 statement resolves to 2024-12-31 (1 day before)
       rather than 2025-12-31 (11 months after). Ties go to the
       earlier year.
    3. ``filename_year_hint`` — when the statement-period regex
       missed but the filename carries a year (common in bank
       e-statement naming). Also used when the period is present
       but unparseable, or when no period-based candidate parses.

    Args:
        raw_date: Short date text without a year.
        period_start_iso: Statement period start as ``YYYY-MM-DD``.
        period_end_iso: Statement period end as ``YYYY-MM-DD``.
        filename_year_hint: Year extracted from the upload filename.
        override_year: Explicit year; a falsy value (``None``/``0``)
            means "no override".

    Returns ISO ``YYYY-MM-DD`` or None when no signal is
    available — caller falls back to the raw text so the user
    can correct in the editor.
    """
    if not raw_date:
        return None

    if override_year:
        return _try_short_date_with_year(raw_date, override_year)

    if period_start_iso and period_end_iso:
        try:
            start_dt = datetime.strptime(period_start_iso, "%Y-%m-%d")
            end_dt = datetime.strptime(period_end_iso, "%Y-%m-%d")
        except (ValueError, TypeError):
            start_dt = end_dt = None

        if start_dt is not None and end_dt is not None:
            # Widen by one year on each side so a row dated just
            # before/after the period can bind to the neighbouring
            # year when that is genuinely the nearest date.
            first_year = min(start_dt.year, end_dt.year) - 1
            last_year = max(start_dt.year, end_dt.year) + 1

            # Ascending year order makes tie-breaking deterministic
            # (``min`` keeps the first of equally-distant candidates).
            candidates: list[str] = []
            for year in range(first_year, last_year + 1):
                iso = _try_short_date_with_year(raw_date, year)
                if iso:
                    candidates.append(iso)

            if candidates:
                def distance(iso_str: str) -> int:
                    dt = datetime.strptime(iso_str, "%Y-%m-%d")
                    if start_dt <= dt <= end_dt:
                        return 0
                    # Outside the period — measure shortest gap
                    # to either edge so a 12/15 transaction in a
                    # 12/16-01/15 statement still leans toward the
                    # period's start year.
                    return min(
                        abs((dt - start_dt).days),
                        abs((dt - end_dt).days),
                    )

                return min(candidates, key=distance)

    if filename_year_hint:
        return _try_short_date_with_year(raw_date, filename_year_hint)

    return None
|
||||
|
||||
|
||||
def _description_from_row(
|
||||
row_words: list[WordBox],
|
||||
date_ranges: list[tuple[int, int]],
|
||||
@@ -768,6 +841,8 @@ def scan_pdf_for_transactions(
|
||||
y_tolerance: float = 3.0,
|
||||
merge_multiline_descriptions: bool = True,
|
||||
output_date_format: str = "%Y%m%d",
|
||||
filename_year_hint: int | None = None,
|
||||
year_override: int | None = None,
|
||||
) -> tuple[list[dict[str, Any]], list[str]]:
|
||||
"""Scan *pdf_bytes* for transaction-like rows.
|
||||
|
||||
@@ -878,10 +953,16 @@ def scan_pdf_for_transactions(
|
||||
|
||||
iso = parse_date(first_date_text, date_formats)
|
||||
if iso is None:
|
||||
# Short date — try to bind to the statement period
|
||||
# year before falling back to the raw text.
|
||||
# Short date — try to bind a year using the cascade:
|
||||
# override → statement period (Dec/Jan-aware) →
|
||||
# filename year hint. Each signal is a separate
|
||||
# argument so the caller can mix-and-match.
|
||||
iso = _infer_year_for_short_date(
|
||||
first_date_text, metadata["period_end"],
|
||||
first_date_text,
|
||||
metadata["period_start"],
|
||||
metadata["period_end"],
|
||||
filename_year_hint=filename_year_hint,
|
||||
override_year=year_override,
|
||||
)
|
||||
formatted_date = (
|
||||
format_date(iso, output_date_format)
|
||||
@@ -1007,4 +1088,5 @@ __all__ = [
|
||||
"parse_amount",
|
||||
"parse_date",
|
||||
"scan_pdf_for_transactions",
|
||||
"year_from_filename",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user