diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index bea18cc..e3f36ff 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -32,6 +32,7 @@ from src.pdf_extract import ( format_amount, ocr_available, scan_pdf_for_transactions, + year_from_filename, ) @@ -179,6 +180,40 @@ with st.expander("Scan options", expanded=False): ), ) + # Year override for short dates. Empty by default — the + # scanner uses statement-period detection + filename year hint + # automatically. Set this when the statement period regex + # misses on a particular bank's layout, or when you want to + # force a specific year (e.g., historical reconciliation). + year_override_str = st.text_input( + "Override year for short dates (optional)", + value="", + help=( + "Short dates like ``01/13`` get bound to a year by the " + "scanner — statement period first, then filename year, " + "then this override. Leave blank for automatic. Enter " + "a 4-digit year (e.g., 2025) to force every short date " + "to that year. Won't affect dates that already have a " + "year (``01/13/2025``)." + ), + ) + try: + year_override = ( + int(year_override_str) if year_override_str.strip() else None + ) + if year_override is not None and not (1900 <= year_override <= 2100): + st.warning( + f"Year override {year_override} looks wrong — using " + "automatic detection instead." + ) + year_override = None + except ValueError: + st.warning( + f"Year override {year_override_str!r} isn't a number — " + "using automatic detection instead." + ) + year_override = None + # Persistent stash + rotating widget key. See K_UPLOADS / K_UPLOAD_COUNTER # docstrings for why the counter exists. pdf_uploads: dict = st.session_state.setdefault(K_UPLOADS, {}) @@ -425,6 +460,8 @@ if scan_clicked and pdf_uploads: negative_in_parens=negative_in_parens, allow_ocr=use_ocr, output_date_format=output_date_format, + filename_year_hint=year_from_filename(name), + year_override=year_override, ) for r in rows: r["source_file"] = name diff --git a/src/pdf_extract.py b/src/pdf_extract.py index dd4e6f3..0c40415 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -689,31 +689,14 @@ def extract_statement_metadata( } -def _infer_year_for_short_date( - raw_date: str, - period_end_iso: str | None, -) -> str | None: - """Try to bind a short date like ``01/13`` or ``Jan 13`` to - the year of the statement period's end. Returns ISO or None - if no candidate format parses. - - Doesn't handle the December-in-January-statement cross-year - case — too rare to be worth the complexity. The user sees the - inferred year in the editor and can correct if needed; the - raw text stays in the ``raw`` column for reference. - """ - if not raw_date or not period_end_iso: - return None - try: - end_year = int(period_end_iso[:4]) - except (ValueError, IndexError): - return None - +def _try_short_date_with_year(raw_date: str, year: int) -> str | None: + """Append *year* to a short date string and try to parse it. + Returns ISO or None if no format matches.""" candidates = [ - ("%m/%d/%Y", f"{raw_date}/{end_year}"), - ("%m-%d-%Y", f"{raw_date}-{end_year}"), - ("%b %d %Y", f"{raw_date} {end_year}"), - ("%d %b %Y", f"{raw_date} {end_year}"), + ("%m/%d/%Y", f"{raw_date}/{year}"), + ("%m-%d-%Y", f"{raw_date}-{year}"), + ("%b %d %Y", f"{raw_date} {year}"), + ("%d %b %Y", f"{raw_date} {year}"), ] for fmt, candidate in candidates: try: @@ -723,6 +706,96 @@ def _infer_year_for_short_date( return None +_YEAR_FROM_FILENAME_RE = re.compile(r"(? int | None: + """Extract a 4-digit year from a filename like + ``eStmt_2025-01-13.pdf`` → ``2025``. Returns the first match, + or ``None`` if no 20XX pattern is present. + + Used as a fallback signal when the statement period can't be + detected from the PDF's text — many bank-statement filenames + follow the convention ``eStmt_YYYY-MM-DD.pdf`` so the year is + right there. + """ + if not filename: + return None + m = _YEAR_FROM_FILENAME_RE.search(filename) + return int(m.group(1)) if m else None + + +def _infer_year_for_short_date( + raw_date: str, + period_start_iso: str | None, + period_end_iso: str | None, + *, + filename_year_hint: int | None = None, + override_year: int | None = None, +) -> str | None: + """Bind a short date like ``01/13`` to a full ISO date using + the best available year evidence. + + Priority order: + + 1. ``override_year`` — user-supplied, beats all heuristics. + 2. ``period_start_iso`` + ``period_end_iso`` — generate + candidates for BOTH years (they differ only on + Dec/Jan-boundary statements) and pick the one that falls + inside the period, or closest if neither is inside. + Handles the Dec/Jan case: a ``12/30`` row in a 2024-12-16 + to 2025-01-15 statement resolves to 2024-12-30 because + that's the only candidate inside the period. + 3. ``filename_year_hint`` — when the statement-period regex + missed but the filename carries a year (common in bank + e-statement naming). + + Returns ISO ``YYYY-MM-DD`` or None when no signal is + available — caller falls back to the raw text so the user + can correct in the editor. + """ + if not raw_date: + return None + + if override_year: + return _try_short_date_with_year(raw_date, override_year) + + if period_start_iso and period_end_iso: + try: + start_dt = datetime.strptime(period_start_iso, "%Y-%m-%d") + end_dt = datetime.strptime(period_end_iso, "%Y-%m-%d") + except (ValueError, TypeError): + start_dt = end_dt = None + + if start_dt and end_dt: + years_to_try = {start_dt.year, end_dt.year} + candidates: list[str] = [] + for year in years_to_try: + iso = _try_short_date_with_year(raw_date, year) + if iso: + candidates.append(iso) + if candidates: + def distance(iso_str: str) -> int: + dt = datetime.strptime(iso_str, "%Y-%m-%d") + if start_dt <= dt <= end_dt: + return 0 + # Outside the period — measure shortest gap + # to either edge so a 12/15 transaction in a + # 12/16-01/15 statement still leans toward the + # period's start year. + return min( + abs((dt - start_dt).days), + abs((dt - end_dt).days), + ) + candidates.sort(key=distance) + return candidates[0] + + if filename_year_hint: + return _try_short_date_with_year(raw_date, filename_year_hint) + + return None + + def _description_from_row( row_words: list[WordBox], date_ranges: list[tuple[int, int]], @@ -768,6 +841,8 @@ def scan_pdf_for_transactions( y_tolerance: float = 3.0, merge_multiline_descriptions: bool = True, output_date_format: str = "%Y%m%d", + filename_year_hint: int | None = None, + year_override: int | None = None, ) -> tuple[list[dict[str, Any]], list[str]]: """Scan *pdf_bytes* for transaction-like rows. @@ -878,10 +953,16 @@ def scan_pdf_for_transactions( iso = parse_date(first_date_text, date_formats) if iso is None: - # Short date — try to bind to the statement period - # year before falling back to the raw text. + # Short date — try to bind a year using the cascade: + # override → statement period (Dec/Jan-aware) → + # filename year hint. Each signal is a separate + # argument so the caller can mix-and-match. iso = _infer_year_for_short_date( - first_date_text, metadata["period_end"], + first_date_text, + metadata["period_start"], + metadata["period_end"], + filename_year_hint=filename_year_hint, + override_year=year_override, ) formatted_date = ( format_date(iso, output_date_format) @@ -1007,4 +1088,5 @@ __all__ = [ "parse_amount", "parse_date", "scan_pdf_for_transactions", + "year_from_filename", ] diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py index 3f9cff1..3bbd39d 100644 --- a/tests/test_pdf_extract.py +++ b/tests/test_pdf_extract.py @@ -26,6 +26,7 @@ from src.pdf_extract import ( format_date, parse_amount, parse_date, + year_from_filename, ) @@ -367,17 +368,86 @@ class TestExtractStatementMetadata: class TestInferYearForShortDate: - def test_us_short_with_period_end(self): - assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13" + """The Dec/Jan-boundary-aware year inference. Picks the year + whose candidate date lands inside (or closest to) the period.""" + + def test_within_period_uses_period_year(self): + assert _infer_year_for_short_date( + "01/13", "2025-01-01", "2025-01-31", + ) == "2025-01-13" + + def test_dec_jan_boundary_dec_resolves_to_start_year(self): + # Statement period: 2024-12-16 → 2025-01-15 + # Row "12/30" → should be 2024-12-30 (in period), not 2025. + assert _infer_year_for_short_date( + "12/30", "2024-12-16", "2025-01-15", + ) == "2024-12-30" + + def test_dec_jan_boundary_jan_resolves_to_end_year(self): + # Same period; "01/05" → 2025-01-05 (in period), not 2024. + assert _infer_year_for_short_date( + "01/05", "2024-12-16", "2025-01-15", + ) == "2025-01-05" + + def test_just_before_period_picks_closer_year(self): + # "12/15" is one day before period start (2024-12-16). + # 2024-12-15 is 1 day off; 2025-12-15 is 11 months off. + # The closer-by-distance candidate wins. + assert _infer_year_for_short_date( + "12/15", "2024-12-16", "2025-01-15", + ) == "2024-12-15" + + def test_override_beats_period(self): + assert _infer_year_for_short_date( + "01/13", "2025-01-01", "2025-01-31", + override_year=2030, + ) == "2030-01-13" + + def test_filename_hint_when_no_period(self): + assert _infer_year_for_short_date( + "01/13", None, None, filename_year_hint=2025, + ) == "2025-01-13" + + def test_no_signal_returns_none(self): + assert _infer_year_for_short_date("01/13", None, None) is None def test_short_dash(self): - assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13" + assert _infer_year_for_short_date( + "01-13", "2025-01-01", "2025-01-31", + ) == "2025-01-13" def test_month_name(self): - assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13" - - def test_no_period_end(self): - assert _infer_year_for_short_date("01/13", None) is None + assert _infer_year_for_short_date( + "Jan 13", "2025-01-01", "2025-01-31", + ) == "2025-01-13" def test_unparseable(self): - assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None + assert _infer_year_for_short_date( + "xx/yy", "2025-01-01", "2025-01-31", + ) is None + + +class TestYearFromFilename: + def test_estmt_pattern(self): + assert year_from_filename("eStmt_2025-01-13.pdf") == 2025 + + def test_year_embedded(self): + assert year_from_filename("chase-2024-statement.pdf") == 2024 + + def test_no_year(self): + assert year_from_filename("statement.pdf") is None + + def test_rejects_non_20XX(self): + # Filename contains a long number but no 20XX-shaped year. + assert year_from_filename("doc-1234567890.pdf") is None + + def test_first_match_wins(self): + # Filenames sometimes carry both period start and end years. + assert ( + year_from_filename("statement-2024-12-16-to-2025-01-15.pdf") + == 2024 + ) + + def test_empty_filename(self): + assert year_from_filename("") is None + assert year_from_filename(None) is None