From 155dd30746d4e374c7d3f7a955b15d53cf6062e0 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 20 May 2026 00:20:46 +0000 Subject: [PATCH] feat(pdf): extract statement header (account + period) + date format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related additions for the accountant workflow: **1. Statement header extraction.** New ``extract_statement_metadata(pages)`` pulls the account number and statement period out of the first page (falls back to pages 1+2 if either is missing on page 1 — Wells Fargo business accounts put header info on page 2). Detected fields are stamped onto EVERY transaction row so a multi-statement CSV is self-attributing per row:: { "date": "20250113", "description": "Coffee Shop", "amount_1": -4.50, "account_number": "****5678", "statement_period_start": "20250101", "statement_period_end": "20250131", ... } Account-number regex is tolerant of masks (``****1234``), hyphens (``1234-5678-9012``), and spaces. Period regex looks for "Statement Period" / "From" / "Period Covered" labels plus the first 1-2 full-year dates that follow. If only one date is present near the label, it's used for both start and end (some statements show only the closing date). **2. Year inference for short dates.** When the row date is a short ``01/13`` or ``Jan 13`` without a year, the scanner now binds the year from the statement period's end date BEFORE formatting. Doesn't handle the December-in-January-statement cross-year case (rare; user can edit in the table). **3. Configurable output date format.** New ``output_date_format`` parameter on ``scan_pdf_for_transactions`` defaults to ``%Y%m%d``. Applied to: the transaction date column AND the statement period start/end fields. The page surfaces a dropdown in Scan options with common presets (YYYYMMDD, YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a Custom option that accepts a raw strftime string. 
New helper: ``format_date(iso_str, fmt)`` converts ISO ``YYYY-MM-DD`` to any strftime; passes invalid input through unchanged so the user can see what was actually there rather than getting silent empties. 20 new tests cover: format_date, account-number extraction (masked / hyphenated / spaced / no-label / short), period extraction (standard / from-to / single-date / no-label), metadata orchestrator (full header / no pages / page-2 fallback), year inference (US / dash / month-name / no-period / unparseable), plus an end-to-end class that builds a header'd PDF with short-date transactions and confirms metadata attribution + year inference + format round-trip. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 51 ++++++- src/pdf_extract.py | 227 ++++++++++++++++++++++++++++-- tests/test_pdf_extract.py | 136 ++++++++++++++++++ tests/test_pdf_extract_smoke.py | 98 ++++++++++++- 4 files changed, 499 insertions(+), 13 deletions(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 7033a60..ffd3956 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -90,6 +90,15 @@ if not _pdf_ok: # Options + upload # --------------------------------------------------------------------------- +_DATE_FORMAT_CHOICES = { + "YYYYMMDD (20260113)": "%Y%m%d", + "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d", + "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y", + "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y", + "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y", + "Custom strftime…": "__custom__", +} + with st.expander("Scan options", expanded=False): c1, c2 = st.columns(2) negative_in_parens = c1.checkbox( @@ -112,6 +121,28 @@ with st.expander("Scan options", expanded=False): ), ) + c3, c4 = st.columns(2) + date_label = c3.selectbox( + "Output date format", + list(_DATE_FORMAT_CHOICES.keys()), + index=0, + help=( + "Applied to the transaction date AND the statement " + "period dates pulled from the header. 
Pick Custom to " + "enter your own ``strftime`` string." + ), + ) + output_date_format = _DATE_FORMAT_CHOICES[date_label] + if output_date_format == "__custom__": + output_date_format = c4.text_input( + "Custom strftime format", + value="%Y%m%d", + help=( + "Python ``strftime`` codes — e.g., ``%Y%m%d`` for " + "20260113, ``%Y-%m-%d`` for 2026-01-13." + ), + ) + uploads = st.file_uploader( "PDF file(s)", type=["pdf"], @@ -148,6 +179,7 @@ if scan_clicked and uploads: raw, negative_in_parens=negative_in_parens, allow_ocr=use_ocr, + output_date_format=output_date_format, ) for r in rows: r["source_file"] = up.name @@ -258,11 +290,24 @@ else: # Order columns so the user-facing fields are leftmost; raw + # internals are last and easy to scroll past or unselect at - # download time. - front = ["date", "description"] + # download time. Statement metadata sits with the transaction + # detail since it's per-row context an accountant typically + # wants alongside the amounts. + front = [ + "date", + "description", + ] amount_cols = sorted(c for c in df.columns if c.startswith("amount_")) + metadata_cols = [ + "account_number", + "statement_period_start", + "statement_period_end", + ] tail = ["source_file", "page", "raw"] - ordered = [c for c in front + amount_cols + tail if c in df.columns] + ordered = [ + c for c in front + amount_cols + metadata_cols + tail + if c in df.columns + ] extras = [c for c in df.columns if c not in ordered] df = df[ordered + extras] diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 5f9ef91..31cac7b 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -520,6 +520,180 @@ def _find_amount_tokens( return out +def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str: + """Convert an ISO ``YYYY-MM-DD`` date string to *fmt*. + + Returns the input unchanged if it's not parseable as ISO, + empty string if input is None/empty. 
The scanner uses this + on every date column (transaction date + statement period + start/end) so the output CSV is consistent. + """ + if not iso_str: + return "" + try: + return datetime.strptime(iso_str, "%Y-%m-%d").strftime(fmt) + except (ValueError, TypeError): + return iso_str + + +# --------------------------------------------------------------------------- +# Statement-level metadata (account number + period) +# --------------------------------------------------------------------------- + +# Account number regexes. Bank statements label these in a small +# handful of conventional ways. The capture group is a permissive +# run of digits / X / * / dashes / spaces — accounts are often +# masked like ``****1234`` or printed with grouping like +# ``1234-5678-9012``. +_ACCOUNT_RES = [ + re.compile( + r"Account\s*(?:Number|No\.?|#)\s*[:.]?\s*" + r"([X\*\d][X\*\d\-\s]{3,30}[X\*\d])", + re.IGNORECASE, + ), + re.compile( + r"Account\s*[:.]\s*([X\*\d][X\*\d\-\s]{3,30}[X\*\d])", + re.IGNORECASE, + ), + re.compile( + r"A/?[Cc]\s*(?:#|No\.?)?\s*[:.]?\s*" + r"([X\*\d][X\*\d\-\s]{3,30}[X\*\d])", + re.IGNORECASE, + ), +] + + +def _extract_account_number(text: str) -> str | None: + """Find the first plausible account number in *text*. + + Plausible = at least 4 digit characters and matched near an + 'Account' label. Whitespace is collapsed; the literal mask + characters (``X``, ``*``) and dashes are preserved so the + user sees ``****1234`` rather than ``1234`` (which would lose + information). 
+ """ + for rx in _ACCOUNT_RES: + for m in rx.finditer(text): + value = re.sub(r"\s+", " ", m.group(1).strip()) + digit_count = sum(1 for c in value if c.isdigit()) + if digit_count >= 4: + return value + return None + + +_PERIOD_LABEL_RE = re.compile( + r"(?:Statement\s*(?:Period|Date)|" + r"For\s+the\s+(?:period|statement\s+period)|" + r"Period\s+(?:Covered|Beginning|of\s+Statement)|" + r"From)", + re.IGNORECASE, +) + + +def _extract_statement_period( + text: str, +) -> tuple[str | None, str | None]: + """Locate the statement period dates and return them as ISO + ``(start, end)`` or ``(None, None)``. + + Strategy: find every "Statement Period" / "From" / etc. label, + then look for full-year dates in the ~150 chars following the + label. The first two dates become start/end. If only one date + appears, both fields get the same value (single-statement-date + case — common on monthly cycles where only the closing date + is shown). + """ + for label_m in _PERIOD_LABEL_RE.finditer(text): + snippet = text[label_m.end() : label_m.end() + 150] + dates: list[tuple[int, str]] = [] + for rx in _DATE_RES_FULL: + for m in rx.finditer(snippet): + iso = parse_date(m.group(1)) + if iso: + dates.append((m.start(), iso)) + if dates: + dates.sort(key=lambda x: x[0]) + if len(dates) >= 2: + return dates[0][1], dates[1][1] + return dates[0][1], dates[0][1] + return None, None + + +def extract_statement_metadata( + pages: list[Page], +) -> dict[str, str | None]: + """Pull account number + statement period out of the header + region of *pages*. + + Searches page 1's text, falling back to page 1 + 2 combined + if page 1's account/period detection comes up empty (some + statements put header info on page 2 — Wells Fargo business + accounts do this). + + Returns ``{"account_number", "period_start", "period_end"}`` + with ``None`` for any field that couldn't be detected. ISO + format for the dates. 
+ """ + if not pages: + return { + "account_number": None, + "period_start": None, + "period_end": None, + } + + text = pages[0].text + account = _extract_account_number(text) + start, end = _extract_statement_period(text) + + # Fallback to pages 1+2 if anything was missed. + if (account is None or start is None) and len(pages) > 1: + extended = pages[0].text + "\n" + pages[1].text + if account is None: + account = _extract_account_number(extended) + if start is None: + start, end = _extract_statement_period(extended) + + return { + "account_number": account, + "period_start": start, + "period_end": end, + } + + +def _infer_year_for_short_date( + raw_date: str, + period_end_iso: str | None, +) -> str | None: + """Try to bind a short date like ``01/13`` or ``Jan 13`` to + the year of the statement period's end. Returns ISO or None + if no candidate format parses. + + Doesn't handle the December-in-January-statement cross-year + case — too rare to be worth the complexity. The user sees the + inferred year in the editor and can correct if needed; the + raw text stays in the ``raw`` column for reference. + """ + if not raw_date or not period_end_iso: + return None + try: + end_year = int(period_end_iso[:4]) + except (ValueError, IndexError): + return None + + candidates = [ + ("%m/%d/%Y", f"{raw_date}/{end_year}"), + ("%m-%d-%Y", f"{raw_date}-{end_year}"), + ("%b %d %Y", f"{raw_date} {end_year}"), + ("%d %b %Y", f"{raw_date} {end_year}"), + ] + for fmt, candidate in candidates: + try: + return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d") + except ValueError: + continue + return None + + def _description_from_row( row_words: list[WordBox], date_ranges: list[tuple[int, int]], @@ -564,6 +738,7 @@ def scan_pdf_for_transactions( date_formats: list[str] | None = None, y_tolerance: float = 3.0, merge_multiline_descriptions: bool = True, + output_date_format: str = "%Y%m%d", ) -> tuple[list[dict[str, Any]], list[str]]: """Scan *pdf_bytes* for transaction-like rows. 
@@ -571,23 +746,36 @@ def scan_pdf_for_transactions( amount pattern. Each returned record looks like:: { - "date": "2026-01-15", # ISO, or raw text if unparsable + "date": "20260115", # output_date_format applied "description": "...", - "amount_1": 4.50, # always present + "amount_1": 4.50, "amount_2": 1000.00, # if a second amount was found - "amount_3": ..., # if a third was found "page": 1, "raw": "01/15/2026 Coffee $4.50", + "account_number": "****1234", # from header + "statement_period_start": "20260101", + "statement_period_end": "20260131", } - Multi-line descriptions (rows with no date and no amount) attach - to the most recent transaction row when + Header metadata (``account_number`` / + ``statement_period_start`` / ``statement_period_end``) is + extracted once per PDF and stamped onto every detected row. + That way a multi-statement CSV remains attributable per row + when it's reshaped or imported elsewhere. + + Short dates without a year (``01/13``, ``Jan 13``) are bound + to the year of the statement period's end before formatting. + If period detection fails, the raw short text is preserved. + + Multi-line descriptions (rows with no date and no amount) + attach to the most recent transaction row when ``merge_multiline_descriptions=True`` (default). Returns ``(rows, warnings)``. Warnings are human-readable strings the GUI surfaces in an expander. """ pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr) + metadata = extract_statement_metadata(pages) out_rows: list[dict[str, Any]] = [] prev: dict[str, Any] | None = None @@ -628,11 +816,20 @@ def scan_pdf_for_transactions( row_words, date_ranges, amount_idxs, ) + iso = parse_date(first_date_text, date_formats) + if iso is None: + # Short date — try to bind to the statement period + # year before falling back to the raw text. 
+ iso = _infer_year_for_short_date( + first_date_text, metadata["period_end"], + ) + formatted_date = ( + format_date(iso, output_date_format) + if iso else first_date_text + ) + record: dict[str, Any] = { - "date": ( - parse_date(first_date_text, date_formats) - or first_date_text - ), + "date": formatted_date, "description": desc, "page": page.page_no, "raw": line, @@ -658,6 +855,16 @@ def scan_pdf_for_transactions( if not _has_real_transaction_amount(record): continue + # Stamp the header metadata onto every kept row so the + # CSV is self-attributing. + record["account_number"] = metadata["account_number"] or "" + record["statement_period_start"] = format_date( + metadata["period_start"], output_date_format, + ) + record["statement_period_end"] = format_date( + metadata["period_end"], output_date_format, + ) + out_rows.append(record) prev = record @@ -731,6 +938,8 @@ __all__ = [ "diagnose_pdf_lines", "extract_pages", "extract_pages_auto", + "extract_statement_metadata", + "format_date", "ocr_available", "parse_amount", "parse_date", diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py index ebaba7c..517d9a2 100644 --- a/tests/test_pdf_extract.py +++ b/tests/test_pdf_extract.py @@ -15,9 +15,14 @@ from __future__ import annotations from src.pdf_extract import ( Page, WordBox, + _extract_account_number, + _extract_statement_period, _find_amount_tokens, _find_dates_in_words, + _infer_year_for_short_date, cluster_rows, + extract_statement_metadata, + format_date, parse_amount, parse_date, ) @@ -207,3 +212,134 @@ class TestFindAmountTokens: # test module — they need ``scan_pdf_for_transactions`` which in # turn uses ``extract_pages_auto``. The unit-test layer here pins # the building blocks; smoke tests pin the wiring. 
+ + +class TestFormatDate: + def test_yyyymmdd(self): + assert format_date("2026-01-13", "%Y%m%d") == "20260113" + + def test_iso_passthrough(self): + assert format_date("2026-01-13", "%Y-%m-%d") == "2026-01-13" + + def test_us(self): + assert format_date("2026-01-13", "%m/%d/%Y") == "01/13/2026" + + def test_invalid_input_passes_through(self): + # Non-ISO input — return as-is so the user sees what was + # actually there rather than a silent empty string. + assert format_date("01/13", "%Y%m%d") == "01/13" + + def test_none_or_empty(self): + assert format_date(None) == "" + assert format_date("") == "" + + +class TestExtractAccountNumber: + def test_masked(self): + text = "Customer Name\nAccount Number: ****1234\nBalance" + assert _extract_account_number(text) == "****1234" + + def test_with_hyphens(self): + text = "Account #: 1234-5678-9012" + assert _extract_account_number(text) == "1234-5678-9012" + + def test_with_spaces(self): + text = "Account: 1234 5678 9012" + assert _extract_account_number(text) == "1234 5678 9012" + + def test_no_label_no_match(self): + text = "Just some text with 1234567890 in it" + assert _extract_account_number(text) is None + + def test_requires_at_least_four_digits(self): + # An "account" label followed by only XX shouldn't count. + text = "Account: XX" + assert _extract_account_number(text) is None + + +class TestExtractStatementPeriod: + def test_standard_period(self): + text = "Statement Period: 01/01/2025 - 01/31/2025\nBalance" + start, end = _extract_statement_period(text) + assert start == "2025-01-01" + assert end == "2025-01-31" + + def test_from_to(self): + text = "From 01/01/2025 to 01/31/2025" + start, end = _extract_statement_period(text) + assert start == "2025-01-01" + assert end == "2025-01-31" + + def test_single_date_both_fields(self): + # When only one date appears near the label, return it for both. 
+ text = "Statement Date: 01/31/2025" + start, end = _extract_statement_period(text) + assert start == "2025-01-31" + assert end == "2025-01-31" + + def test_no_label_no_match(self): + text = "Some random text with 01/01/2025 in it" + start, end = _extract_statement_period(text) + # No "Period" / "From" / "Statement Date" label + assert (start, end) == (None, None) + + +class TestExtractStatementMetadata: + def test_full_header(self): + pages = [Page( + page_no=1, width=600, height=800, + text=( + "ACME BANK\n" + "Customer: John Doe\n" + "Account Number: ****5678\n" + "Statement Period: 01/01/2025 - 01/31/2025\n" + "Beginning balance: $1,000.00\n" + ), + words=[], + )] + meta = extract_statement_metadata(pages) + assert meta["account_number"] == "****5678" + assert meta["period_start"] == "2025-01-01" + assert meta["period_end"] == "2025-01-31" + + def test_no_pages(self): + meta = extract_statement_metadata([]) + assert meta == { + "account_number": None, + "period_start": None, + "period_end": None, + } + + def test_fallback_to_page_two(self): + # Page 1 has only account; period is on page 2. 
+ p1 = Page( + page_no=1, width=600, height=800, + text="Account Number: ****1234\nBalance summary", + words=[], + ) + p2 = Page( + page_no=2, width=600, height=800, + text="Statement Period: 02/01/2025 - 02/28/2025", + words=[], + ) + meta = extract_statement_metadata([p1, p2]) + assert meta["account_number"] == "****1234" + assert meta["period_start"] == "2025-02-01" + assert meta["period_end"] == "2025-02-28" + + +class TestInferYearForShortDate: + def test_us_short_with_period_end(self): + assert _infer_year_for_short_date("01/13", "2025-01-31") == "2025-01-13" + + def test_short_dash(self): + assert _infer_year_for_short_date("01-13", "2025-01-31") == "2025-01-13" + + def test_month_name(self): + assert _infer_year_for_short_date("Jan 13", "2025-01-31") == "2025-01-13" + + def test_no_period_end(self): + assert _infer_year_for_short_date("01/13", None) is None + + def test_unparseable(self): + assert _infer_year_for_short_date("xx/yy", "2025-01-31") is None diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index eae937f..79a0d13 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -15,6 +15,45 @@ from __future__ import annotations import pytest +def _build_statement_pdf_with_header() -> bytes: + """Statement with realistic header (account + period) plus + transactions. 
Exercises the metadata-extraction path end-to-end.""" + from fpdf import FPDF + + pdf = FPDF(orientation="P", unit="pt", format="letter") + pdf.add_page() + pdf.set_font("Helvetica", size=12) + pdf.set_xy(40, 50) + pdf.cell(0, 14, "ACME BANK STATEMENT", new_x="LMARGIN", new_y="NEXT") + pdf.set_xy(40, 70) + pdf.cell(0, 14, "Account Number: ****5678", new_x="LMARGIN", new_y="NEXT") + pdf.set_xy(40, 85) + pdf.cell(0, 14, "Statement Period: 01/01/2025 - 01/31/2025", + new_x="LMARGIN", new_y="NEXT") + # Header row + pdf.set_xy(40, 130) + pdf.cell(120, 14, "Date") + pdf.set_xy(160, 130) + pdf.cell(200, 14, "Description") + pdf.set_xy(360, 130) + pdf.cell(80, 14, "Amount") + # Transactions with SHORT dates — year is implied by period. + rows = [ + ("01/13", "Coffee Shop", "(4.50)"), + ("01/16", "Refund Vendor", "$12.00"), + ] + y = 160 + for date, desc, amt in rows: + pdf.set_xy(40, y) + pdf.cell(120, 14, date) + pdf.set_xy(160, y) + pdf.cell(200, 14, desc) + pdf.set_xy(360, y) + pdf.cell(80, 14, amt) + y += 20 + return bytes(pdf.output()) + + def _build_tiny_statement_pdf() -> bytes: """One-page PDF: header line + three transaction rows + a closing-balance footer. 
The scanner should pick up exactly the @@ -97,13 +136,34 @@ class TestScanPdfForTransactions: f"{[r.get('raw') for r in rows]}" ) - def test_parses_dates_to_iso(self, pdf_bytes): + def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) + # Default output format is %Y%m%d + assert [r["date"] for r in rows] == [ + "20260115", "20260116", "20260117", + ] + + def test_output_date_format_override(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions( + pdf_bytes, output_date_format="%Y-%m-%d", + ) assert [r["date"] for r in rows] == [ "2026-01-15", "2026-01-16", "2026-01-17", ] + def test_metadata_fields_present_on_every_row(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + # The fixture PDF has no statement-period or account + # header, so the metadata fields exist but are empty + # strings — the contract is: ALWAYS present on every row. + for r in rows: + assert "account_number" in r + assert "statement_period_start" in r + assert "statement_period_end" in r + def test_parses_amounts_with_signs(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) @@ -144,6 +204,42 @@ class TestScanPdfForTransactions: # --------------------------------------------------------------------------- +class TestStatementHeaderEndToEnd: + """A real PDF with a real header — exercise the full pipeline: + metadata extraction + year inference for short dates + format + application. 
This is the failure mode most likely to break on + the user's actual Chase statements.""" + + @pytest.fixture + def pdf_bytes(self) -> bytes: + return _build_statement_pdf_with_header() + + def test_metadata_extracted_and_stamped(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + assert rows, "expected at least one transaction" + for r in rows: + assert r["account_number"] == "****5678" + assert r["statement_period_start"] == "20250101" + assert r["statement_period_end"] == "20250131" + + def test_short_dates_get_year_from_period(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions(pdf_bytes) + # Short ``01/13`` + period ending in 2025 → 20250113 + assert rows[0]["date"] == "20250113" + assert rows[1]["date"] == "20250116" + + def test_iso_format_round_trip(self, pdf_bytes): + from src.pdf_extract import scan_pdf_for_transactions + rows, _ = scan_pdf_for_transactions( + pdf_bytes, output_date_format="%Y-%m-%d", + ) + assert rows[0]["date"] == "2025-01-13" + assert rows[0]["statement_period_start"] == "2025-01-01" + assert rows[0]["statement_period_end"] == "2025-01-31" + + class TestMultiDateRow: """Some statements (Chase, BofA) show both a transaction date and a posting date per row. The scanner uses the first date