feat(pdf): extract statement header (account + period) + date format
Three related additions for the accountant workflow:
**1. Statement header extraction.** New
``extract_statement_metadata(pages)`` pulls the account number
and statement period out of the first page (falls back to
page 1+2 if either is missing on page 1 — Wells Fargo business
accounts put header info on page 2). Detected fields are
stamped onto EVERY transaction row so a multi-statement CSV is
self-attributing per row::
{
"date": "20250113",
"description": "Coffee Shop",
"amount_1": -4.50,
"account_number": "****5678",
"statement_period_start": "20250101",
"statement_period_end": "20250131",
...
}
Account-number regex is tolerant of masks (``****1234``),
hyphens (``1234-5678-9012``), and spaces. Period regex looks
for "Statement Period" / "From" / "Period Covered" labels plus
the first 1-2 full-year dates that follow. If only one date is
present near the label, it's used for both start and end (some
statements show only the closing date).
**2. Year inference for short dates.** When the row date is a
short ``01/13`` or ``Jan 13`` without a year, the scanner now
binds the year from the statement period's end date BEFORE
formatting. Doesn't handle the December-in-January-statement
cross-year case (rare; user can edit in the table).
**3. Configurable output date format.** New
``output_date_format`` parameter on ``scan_pdf_for_transactions``
defaults to ``%Y%m%d``. Applied to: the transaction date column
AND the statement period start/end fields. The page surfaces a
dropdown in Scan options with common presets (YYYYMMDD,
YYYY-MM-DD, MM/DD/YYYY, DD/MM/YYYY, ``Mon DD, YYYY``) plus a
Custom option that accepts a raw strftime string.
New helper: ``format_date(iso_str, fmt)`` converts ISO
``YYYY-MM-DD`` to any strftime; passes invalid input through
unchanged so the user can see what was actually there rather
than getting silent empties.
20 new tests cover: format_date, account-number extraction
(masked / hyphenated / spaced / no-label / short), period
extraction (standard / from-to / single-date / no-label),
metadata orchestrator (full header / no pages / page-2
fallback), year inference (US / dash / month-name / no-period /
unparseable), plus an end-to-end class that builds a header'd
PDF with short-date transactions and confirms metadata
attribution + year inference + format round-trip.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,15 @@ if not _pdf_ok:
|
|||||||
# Options + upload
|
# Options + upload
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_DATE_FORMAT_CHOICES = {
|
||||||
|
"YYYYMMDD (20260113)": "%Y%m%d",
|
||||||
|
"YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
|
||||||
|
"MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
|
||||||
|
"DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
|
||||||
|
"MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
|
||||||
|
"Custom strftime…": "__custom__",
|
||||||
|
}
|
||||||
|
|
||||||
with st.expander("Scan options", expanded=False):
|
with st.expander("Scan options", expanded=False):
|
||||||
c1, c2 = st.columns(2)
|
c1, c2 = st.columns(2)
|
||||||
negative_in_parens = c1.checkbox(
|
negative_in_parens = c1.checkbox(
|
||||||
@@ -112,6 +121,28 @@ with st.expander("Scan options", expanded=False):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
c3, c4 = st.columns(2)
|
||||||
|
date_label = c3.selectbox(
|
||||||
|
"Output date format",
|
||||||
|
list(_DATE_FORMAT_CHOICES.keys()),
|
||||||
|
index=0,
|
||||||
|
help=(
|
||||||
|
"Applied to the transaction date AND the statement "
|
||||||
|
"period dates pulled from the header. Pick Custom to "
|
||||||
|
"enter your own ``strftime`` string."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
output_date_format = _DATE_FORMAT_CHOICES[date_label]
|
||||||
|
if output_date_format == "__custom__":
|
||||||
|
output_date_format = c4.text_input(
|
||||||
|
"Custom strftime format",
|
||||||
|
value="%Y%m%d",
|
||||||
|
help=(
|
||||||
|
"Python ``strftime`` codes — e.g., ``%Y%m%d`` for "
|
||||||
|
"20260113, ``%Y-%m-%d`` for 2026-01-13."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
uploads = st.file_uploader(
|
uploads = st.file_uploader(
|
||||||
"PDF file(s)",
|
"PDF file(s)",
|
||||||
type=["pdf"],
|
type=["pdf"],
|
||||||
@@ -148,6 +179,7 @@ if scan_clicked and uploads:
|
|||||||
raw,
|
raw,
|
||||||
negative_in_parens=negative_in_parens,
|
negative_in_parens=negative_in_parens,
|
||||||
allow_ocr=use_ocr,
|
allow_ocr=use_ocr,
|
||||||
|
output_date_format=output_date_format,
|
||||||
)
|
)
|
||||||
for r in rows:
|
for r in rows:
|
||||||
r["source_file"] = up.name
|
r["source_file"] = up.name
|
||||||
@@ -258,11 +290,24 @@ else:
|
|||||||
|
|
||||||
# Order columns so the user-facing fields are leftmost; raw +
|
# Order columns so the user-facing fields are leftmost; raw +
|
||||||
# internals are last and easy to scroll past or unselect at
|
# internals are last and easy to scroll past or unselect at
|
||||||
# download time.
|
# download time. Statement metadata sits with the transaction
|
||||||
front = ["date", "description"]
|
# detail since it's per-row context an accountant typically
|
||||||
|
# wants alongside the amounts.
|
||||||
|
front = [
|
||||||
|
"date",
|
||||||
|
"description",
|
||||||
|
]
|
||||||
amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
|
amount_cols = sorted(c for c in df.columns if c.startswith("amount_"))
|
||||||
|
metadata_cols = [
|
||||||
|
"account_number",
|
||||||
|
"statement_period_start",
|
||||||
|
"statement_period_end",
|
||||||
|
]
|
||||||
tail = ["source_file", "page", "raw"]
|
tail = ["source_file", "page", "raw"]
|
||||||
ordered = [c for c in front + amount_cols + tail if c in df.columns]
|
ordered = [
|
||||||
|
c for c in front + amount_cols + metadata_cols + tail
|
||||||
|
if c in df.columns
|
||||||
|
]
|
||||||
extras = [c for c in df.columns if c not in ordered]
|
extras = [c for c in df.columns if c not in ordered]
|
||||||
df = df[ordered + extras]
|
df = df[ordered + extras]
|
||||||
|
|
||||||
|
|||||||
@@ -520,6 +520,180 @@ def _find_amount_tokens(
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str:
|
||||||
|
"""Convert an ISO ``YYYY-MM-DD`` date string to *fmt*.
|
||||||
|
|
||||||
|
Returns the input unchanged if it's not parseable as ISO,
|
||||||
|
empty string if input is None/empty. The scanner uses this
|
||||||
|
on every date column (transaction date + statement period
|
||||||
|
start/end) so the output CSV is consistent.
|
||||||
|
"""
|
||||||
|
if not iso_str:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
return datetime.strptime(iso_str, "%Y-%m-%d").strftime(fmt)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return iso_str
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Statement-level metadata (account number + period)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Account number regexes. Bank statements label these in a small
|
||||||
|
# handful of conventional ways. The capture group is a permissive
|
||||||
|
# run of digits / X / * / dashes / spaces — accounts are often
|
||||||
|
# masked like ``****1234`` or printed with grouping like
|
||||||
|
# ``1234-5678-9012``.
|
||||||
|
_ACCOUNT_RES = [
|
||||||
|
re.compile(
|
||||||
|
r"Account\s*(?:Number|No\.?|#)\s*[:.]?\s*"
|
||||||
|
r"([X\*\d][X\*\d\-\s]{3,30}[X\*\d])",
|
||||||
|
re.IGNORECASE,
|
||||||
|
),
|
||||||
|
re.compile(
|
||||||
|
r"Account\s*[:.]\s*([X\*\d][X\*\d\-\s]{3,30}[X\*\d])",
|
||||||
|
re.IGNORECASE,
|
||||||
|
),
|
||||||
|
re.compile(
|
||||||
|
r"A/?[Cc]\s*(?:#|No\.?)?\s*[:.]?\s*"
|
||||||
|
r"([X\*\d][X\*\d\-\s]{3,30}[X\*\d])",
|
||||||
|
re.IGNORECASE,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_account_number(text: str) -> str | None:
|
||||||
|
"""Find the first plausible account number in *text*.
|
||||||
|
|
||||||
|
Plausible = at least 4 digit characters and matched near an
|
||||||
|
'Account' label. Whitespace is collapsed; the literal mask
|
||||||
|
characters (``X``, ``*``) and dashes are preserved so the
|
||||||
|
user sees ``****1234`` rather than ``1234`` (which would lose
|
||||||
|
information).
|
||||||
|
"""
|
||||||
|
for rx in _ACCOUNT_RES:
|
||||||
|
for m in rx.finditer(text):
|
||||||
|
value = re.sub(r"\s+", " ", m.group(1).strip())
|
||||||
|
digit_count = sum(1 for c in value if c.isdigit())
|
||||||
|
if digit_count >= 4:
|
||||||
|
return value
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# Labels that introduce a statement period (or a single statement
# date) in a header. Matched case-insensitively.
_PERIOD_LABEL_RE = re.compile(
    r"(?:Statement\s*(?:Period|Date)|"
    r"For\s+the\s+(?:period|statement\s+period)|"
    r"Period\s+(?:Covered|Beginning|of\s+Statement)|"
    r"From)",
    re.IGNORECASE,
)


def _extract_statement_period(
    text: str,
) -> tuple[str | None, str | None]:
    """Return the statement period found in *text* as ``(start, end)``.

    Each period label is visited in text order. For a label, the 150
    characters that follow it are scanned with every pattern in
    ``_DATE_RES_FULL`` and each hit that ``parse_date`` accepts is
    kept together with its offset. The first label that yields at
    least one date decides the result: the two earliest dates (by
    offset) are start and end, or the single date is used for both.

    Returns ``(None, None)`` when no label is followed by a
    parseable date.
    """
    for label in _PERIOD_LABEL_RE.finditer(text):
        window_from = label.end()
        window = text[window_from : window_from + 150]

        found: list[tuple[int, str]] = []
        for pattern in _DATE_RES_FULL:
            for hit in pattern.finditer(window):
                parsed = parse_date(hit.group(1))
                if parsed:
                    found.append((hit.start(), parsed))

        if not found:
            # Nothing usable after this label; try the next one.
            continue

        by_offset = sorted(found, key=lambda entry: entry[0])
        first = by_offset[0][1]
        second = by_offset[1][1] if len(by_offset) >= 2 else first
        return first, second

    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_statement_metadata(
    pages: list[Page],
) -> dict[str, str | None]:
    """Detect the account number and statement period for *pages*.

    Page 1's text is searched first. When the account number or the
    period is still missing and a second page exists, the missing
    piece(s) are searched for again in page 1 + page 2 joined by a
    newline (some statements put header info on page 2 — Wells
    Fargo business accounts do this).

    Returns a dict with the keys ``account_number``,
    ``period_start`` and ``period_end``. Undetected fields are
    ``None`` (all three for an empty *pages*); the period values
    are whatever ``_extract_statement_period`` returns.
    """
    if not pages:
        return dict.fromkeys(
            ("account_number", "period_start", "period_end")
        )

    first_page_text = pages[0].text
    account = _extract_account_number(first_page_text)
    start, end = _extract_statement_period(first_page_text)

    needs_fallback = account is None or start is None
    if needs_fallback and len(pages) > 1:
        both_pages = pages[0].text + "\n" + pages[1].text
        if account is None:
            account = _extract_account_number(both_pages)
        if start is None:
            start, end = _extract_statement_period(both_pages)

    return {
        "account_number": account,
        "period_start": start,
        "period_end": end,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_year_for_short_date(
|
||||||
|
raw_date: str,
|
||||||
|
period_end_iso: str | None,
|
||||||
|
) -> str | None:
|
||||||
|
"""Try to bind a short date like ``01/13`` or ``Jan 13`` to
|
||||||
|
the year of the statement period's end. Returns ISO or None
|
||||||
|
if no candidate format parses.
|
||||||
|
|
||||||
|
Doesn't handle the December-in-January-statement cross-year
|
||||||
|
case — too rare to be worth the complexity. The user sees the
|
||||||
|
inferred year in the editor and can correct if needed; the
|
||||||
|
raw text stays in the ``raw`` column for reference.
|
||||||
|
"""
|
||||||
|
if not raw_date or not period_end_iso:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
end_year = int(period_end_iso[:4])
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
candidates = [
|
||||||
|
("%m/%d/%Y", f"{raw_date}/{end_year}"),
|
||||||
|
("%m-%d-%Y", f"{raw_date}-{end_year}"),
|
||||||
|
("%b %d %Y", f"{raw_date} {end_year}"),
|
||||||
|
("%d %b %Y", f"{raw_date} {end_year}"),
|
||||||
|
]
|
||||||
|
for fmt, candidate in candidates:
|
||||||
|
try:
|
||||||
|
return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _description_from_row(
|
def _description_from_row(
|
||||||
row_words: list[WordBox],
|
row_words: list[WordBox],
|
||||||
date_ranges: list[tuple[int, int]],
|
date_ranges: list[tuple[int, int]],
|
||||||
@@ -564,6 +738,7 @@ def scan_pdf_for_transactions(
|
|||||||
date_formats: list[str] | None = None,
|
date_formats: list[str] | None = None,
|
||||||
y_tolerance: float = 3.0,
|
y_tolerance: float = 3.0,
|
||||||
merge_multiline_descriptions: bool = True,
|
merge_multiline_descriptions: bool = True,
|
||||||
|
output_date_format: str = "%Y%m%d",
|
||||||
) -> tuple[list[dict[str, Any]], list[str]]:
|
) -> tuple[list[dict[str, Any]], list[str]]:
|
||||||
"""Scan *pdf_bytes* for transaction-like rows.
|
"""Scan *pdf_bytes* for transaction-like rows.
|
||||||
|
|
||||||
@@ -571,23 +746,36 @@ def scan_pdf_for_transactions(
|
|||||||
amount pattern. Each returned record looks like::
|
amount pattern. Each returned record looks like::
|
||||||
|
|
||||||
{
|
{
|
||||||
"date": "2026-01-15", # ISO, or raw text if unparsable
|
"date": "20260115", # output_date_format applied
|
||||||
"description": "...",
|
"description": "...",
|
||||||
"amount_1": 4.50, # always present
|
"amount_1": 4.50,
|
||||||
"amount_2": 1000.00, # if a second amount was found
|
"amount_2": 1000.00, # if a second amount was found
|
||||||
"amount_3": ..., # if a third was found
|
|
||||||
"page": 1,
|
"page": 1,
|
||||||
"raw": "01/15/2026 Coffee $4.50",
|
"raw": "01/15/2026 Coffee $4.50",
|
||||||
|
"account_number": "****1234", # from header
|
||||||
|
"statement_period_start": "20260101",
|
||||||
|
"statement_period_end": "20260131",
|
||||||
}
|
}
|
||||||
|
|
||||||
Multi-line descriptions (rows with no date and no amount) attach
|
Header metadata (``account_number`` /
|
||||||
to the most recent transaction row when
|
``statement_period_start`` / ``statement_period_end``) is
|
||||||
|
extracted once per PDF and stamped onto every detected row.
|
||||||
|
That way a multi-statement CSV remains attributable per row
|
||||||
|
when it's reshaped or imported elsewhere.
|
||||||
|
|
||||||
|
Short dates without a year (``01/13``, ``Jan 13``) are bound
|
||||||
|
to the year of the statement period's end before formatting.
|
||||||
|
If period detection fails, the raw short text is preserved.
|
||||||
|
|
||||||
|
Multi-line descriptions (rows with no date and no amount)
|
||||||
|
attach to the most recent transaction row when
|
||||||
``merge_multiline_descriptions=True`` (default).
|
``merge_multiline_descriptions=True`` (default).
|
||||||
|
|
||||||
Returns ``(rows, warnings)``. Warnings are human-readable
|
Returns ``(rows, warnings)``. Warnings are human-readable
|
||||||
strings the GUI surfaces in an expander.
|
strings the GUI surfaces in an expander.
|
||||||
"""
|
"""
|
||||||
pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
|
pages, warnings = extract_pages_auto(pdf_bytes, allow_ocr=allow_ocr)
|
||||||
|
metadata = extract_statement_metadata(pages)
|
||||||
|
|
||||||
out_rows: list[dict[str, Any]] = []
|
out_rows: list[dict[str, Any]] = []
|
||||||
prev: dict[str, Any] | None = None
|
prev: dict[str, Any] | None = None
|
||||||
@@ -628,11 +816,20 @@ def scan_pdf_for_transactions(
|
|||||||
row_words, date_ranges, amount_idxs,
|
row_words, date_ranges, amount_idxs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
iso = parse_date(first_date_text, date_formats)
|
||||||
|
if iso is None:
|
||||||
|
# Short date — try to bind to the statement period
|
||||||
|
# year before falling back to the raw text.
|
||||||
|
iso = _infer_year_for_short_date(
|
||||||
|
first_date_text, metadata["period_end"],
|
||||||
|
)
|
||||||
|
formatted_date = (
|
||||||
|
format_date(iso, output_date_format)
|
||||||
|
if iso else first_date_text
|
||||||
|
)
|
||||||
|
|
||||||
record: dict[str, Any] = {
|
record: dict[str, Any] = {
|
||||||
"date": (
|
"date": formatted_date,
|
||||||
parse_date(first_date_text, date_formats)
|
|
||||||
or first_date_text
|
|
||||||
),
|
|
||||||
"description": desc,
|
"description": desc,
|
||||||
"page": page.page_no,
|
"page": page.page_no,
|
||||||
"raw": line,
|
"raw": line,
|
||||||
@@ -658,6 +855,16 @@ def scan_pdf_for_transactions(
|
|||||||
if not _has_real_transaction_amount(record):
|
if not _has_real_transaction_amount(record):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Stamp the header metadata onto every kept row so the
|
||||||
|
# CSV is self-attributing.
|
||||||
|
record["account_number"] = metadata["account_number"] or ""
|
||||||
|
record["statement_period_start"] = format_date(
|
||||||
|
metadata["period_start"], output_date_format,
|
||||||
|
)
|
||||||
|
record["statement_period_end"] = format_date(
|
||||||
|
metadata["period_end"], output_date_format,
|
||||||
|
)
|
||||||
|
|
||||||
out_rows.append(record)
|
out_rows.append(record)
|
||||||
prev = record
|
prev = record
|
||||||
|
|
||||||
@@ -731,6 +938,8 @@ __all__ = [
|
|||||||
"diagnose_pdf_lines",
|
"diagnose_pdf_lines",
|
||||||
"extract_pages",
|
"extract_pages",
|
||||||
"extract_pages_auto",
|
"extract_pages_auto",
|
||||||
|
"extract_statement_metadata",
|
||||||
|
"format_date",
|
||||||
"ocr_available",
|
"ocr_available",
|
||||||
"parse_amount",
|
"parse_amount",
|
||||||
"parse_date",
|
"parse_date",
|
||||||
|
|||||||
@@ -15,9 +15,14 @@ from __future__ import annotations
|
|||||||
from src.pdf_extract import (
|
from src.pdf_extract import (
|
||||||
Page,
|
Page,
|
||||||
WordBox,
|
WordBox,
|
||||||
|
_extract_account_number,
|
||||||
|
_extract_statement_period,
|
||||||
_find_amount_tokens,
|
_find_amount_tokens,
|
||||||
_find_dates_in_words,
|
_find_dates_in_words,
|
||||||
|
_infer_year_for_short_date,
|
||||||
cluster_rows,
|
cluster_rows,
|
||||||
|
extract_statement_metadata,
|
||||||
|
format_date,
|
||||||
parse_amount,
|
parse_amount,
|
||||||
parse_date,
|
parse_date,
|
||||||
)
|
)
|
||||||
@@ -207,3 +212,134 @@ class TestFindAmountTokens:
|
|||||||
# test module — they need ``scan_pdf_for_transactions`` which in
|
# test module — they need ``scan_pdf_for_transactions`` which in
|
||||||
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
# turn uses ``extract_pages_auto``. The unit-test layer here pins
|
||||||
# the building blocks; smoke tests pin the wiring.
|
# the building blocks; smoke tests pin the wiring.
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatDate:
    """``format_date``: preset formats plus the pass-through and
    empty-input fallbacks."""

    def test_yyyymmdd(self):
        compact = format_date("2026-01-13", "%Y%m%d")
        assert compact == "20260113"

    def test_iso_passthrough(self):
        same = format_date("2026-01-13", "%Y-%m-%d")
        assert same == "2026-01-13"

    def test_us(self):
        us_style = format_date("2026-01-13", "%m/%d/%Y")
        assert us_style == "01/13/2026"

    def test_invalid_input_passes_through(self):
        # Input that isn't ISO comes back untouched, so the user
        # sees the original text instead of a silent empty string.
        not_iso = "01/13"
        assert format_date(not_iso, "%Y%m%d") == "01/13"

    def test_none_or_empty(self):
        for blank in (None, ""):
            assert format_date(blank) == ""
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractAccountNumber:
    """``_extract_account_number``: accepted value shapes and the
    cases that must not produce a match."""

    def test_masked(self):
        header = "Customer Name\nAccount Number: ****1234\nBalance"
        found = _extract_account_number(header)
        assert found == "****1234"

    def test_with_hyphens(self):
        found = _extract_account_number("Account #: 1234-5678-9012")
        assert found == "1234-5678-9012"

    def test_with_spaces(self):
        found = _extract_account_number("Account: 1234 5678 9012")
        assert found == "1234 5678 9012"

    def test_no_label_no_match(self):
        # Digits without an account label are not an account number.
        unlabeled = "Just some text with 1234567890 in it"
        assert _extract_account_number(unlabeled) is None

    def test_requires_at_least_four_digits(self):
        # A label followed by nothing but a short mask doesn't count.
        too_short = "Account: XX"
        assert _extract_account_number(too_short) is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractStatementPeriod:
    """``_extract_statement_period``: labelled ranges, a single
    labelled date, and unlabelled dates."""

    def test_standard_period(self):
        header = "Statement Period: 01/01/2025 - 01/31/2025\nBalance"
        start, end = _extract_statement_period(header)
        assert (start, end) == ("2025-01-01", "2025-01-31")

    def test_from_to(self):
        start, end = _extract_statement_period(
            "From 01/01/2025 to 01/31/2025"
        )
        assert (start, end) == ("2025-01-01", "2025-01-31")

    def test_single_date_both_fields(self):
        # A lone date after the label fills both start and end.
        start, end = _extract_statement_period("Statement Date: 01/31/2025")
        assert (start, end) == ("2025-01-31", "2025-01-31")

    def test_no_label_no_match(self):
        # No "Period" / "From" / "Statement Date" label precedes
        # the date, so nothing is reported.
        unlabeled = "Some random text with 01/01/2025 in it"
        start, end = _extract_statement_period(unlabeled)
        assert (start, end) == (None, None)
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractStatementMetadata:
    """``extract_statement_metadata``: complete header on page 1,
    empty input, and a period that only appears on page 2."""

    def test_full_header(self):
        header_text = (
            "ACME BANK\n"
            "Customer: John Doe\n"
            "Account Number: ****5678\n"
            "Statement Period: 01/01/2025 - 01/31/2025\n"
            "Beginning balance: $1,000.00\n"
        )
        only_page = Page(
            page_no=1, width=600, height=800,
            text=header_text,
            words=[],
        )
        meta = extract_statement_metadata([only_page])
        assert meta["account_number"] == "****5678"
        assert meta["period_start"] == "2025-01-01"
        assert meta["period_end"] == "2025-01-31"

    def test_no_pages(self):
        nothing_found = {
            "account_number": None,
            "period_start": None,
            "period_end": None,
        }
        assert extract_statement_metadata([]) == nothing_found

    def test_fallback_to_page_two(self):
        # The account is on page 1; the period only shows up on
        # page 2 and must be picked up by the fallback.
        first = Page(
            page_no=1, width=600, height=800,
            text="Account Number: ****1234\nBalance summary",
            words=[],
        )
        second = Page(
            page_no=2, width=600, height=800,
            text="Statement Period: 02/01/2025 - 02/28/2025",
            words=[],
        )
        meta = extract_statement_metadata([first, second])
        assert meta["account_number"] == "****1234"
        assert meta["period_start"] == "2025-02-01"
        assert meta["period_end"] == "2025-02-28"
|
||||||
|
|
||||||
|
|
||||||
|
class TestInferYearForShortDate:
    """``_infer_year_for_short_date``: supported short-date shapes
    and the inputs that must yield ``None``."""

    def test_us_short_with_period_end(self):
        inferred = _infer_year_for_short_date("01/13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_short_dash(self):
        inferred = _infer_year_for_short_date("01-13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_month_name(self):
        inferred = _infer_year_for_short_date("Jan 13", "2025-01-31")
        assert inferred == "2025-01-13"

    def test_no_period_end(self):
        # Without a period end there is no year to borrow.
        inferred = _infer_year_for_short_date("01/13", None)
        assert inferred is None

    def test_unparseable(self):
        inferred = _infer_year_for_short_date("xx/yy", "2025-01-31")
        assert inferred is None
|
||||||
|
|||||||
@@ -15,6 +15,45 @@ from __future__ import annotations
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _build_statement_pdf_with_header() -> bytes:
    """Build a one-page statement PDF with a header block (bank
    name, account number, statement period), a column-title row and
    two transactions whose dates carry no year.

    Used to exercise the metadata-extraction path end-to-end.
    """
    from fpdf import FPDF

    pdf = FPDF(orientation="P", unit="pt", format="letter")
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)

    # Header block: one full-width line per entry.
    header_lines = (
        (50, "ACME BANK STATEMENT"),
        (70, "Account Number: ****5678"),
        (85, "Statement Period: 01/01/2025 - 01/31/2025"),
    )
    for top, line in header_lines:
        pdf.set_xy(40, top)
        pdf.cell(0, 14, line, new_x="LMARGIN", new_y="NEXT")

    # (x offset, cell width) for the date / description / amount
    # columns, shared by the title row and the transaction rows.
    columns = ((40, 120), (160, 200), (360, 80))

    def write_row(top, values):
        for (left, width), value in zip(columns, values):
            pdf.set_xy(left, top)
            pdf.cell(width, 14, value)

    write_row(130, ("Date", "Description", "Amount"))

    # Transactions with SHORT dates — the year is implied by the
    # statement period.
    transactions = [
        ("01/13", "Coffee Shop", "(4.50)"),
        ("01/16", "Refund Vendor", "$12.00"),
    ]
    for index, transaction in enumerate(transactions):
        write_row(160 + 20 * index, transaction)

    return bytes(pdf.output())
|
||||||
|
|
||||||
|
|
||||||
def _build_tiny_statement_pdf() -> bytes:
|
def _build_tiny_statement_pdf() -> bytes:
|
||||||
"""One-page PDF: header line + three transaction rows + a
|
"""One-page PDF: header line + three transaction rows + a
|
||||||
closing-balance footer. The scanner should pick up exactly the
|
closing-balance footer. The scanner should pick up exactly the
|
||||||
@@ -97,13 +136,34 @@ class TestScanPdfForTransactions:
|
|||||||
f"{[r.get('raw') for r in rows]}"
|
f"{[r.get('raw') for r in rows]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_parses_dates_to_iso(self, pdf_bytes):
|
def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
# Default output format is %Y%m%d
|
||||||
|
assert [r["date"] for r in rows] == [
|
||||||
|
"20260115", "20260116", "20260117",
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_output_date_format_override(self, pdf_bytes):
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(
|
||||||
|
pdf_bytes, output_date_format="%Y-%m-%d",
|
||||||
|
)
|
||||||
assert [r["date"] for r in rows] == [
|
assert [r["date"] for r in rows] == [
|
||||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def test_metadata_fields_present_on_every_row(self, pdf_bytes):
|
||||||
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
|
# The fixture PDF has no statement-period or account
|
||||||
|
# header, so the metadata fields exist but are empty
|
||||||
|
# strings — the contract is: ALWAYS present on every row.
|
||||||
|
for r in rows:
|
||||||
|
assert "account_number" in r
|
||||||
|
assert "statement_period_start" in r
|
||||||
|
assert "statement_period_end" in r
|
||||||
|
|
||||||
def test_parses_amounts_with_signs(self, pdf_bytes):
|
def test_parses_amounts_with_signs(self, pdf_bytes):
|
||||||
from src.pdf_extract import scan_pdf_for_transactions
|
from src.pdf_extract import scan_pdf_for_transactions
|
||||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||||
@@ -144,6 +204,42 @@ class TestScanPdfForTransactions:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestStatementHeaderEndToEnd:
    """Run the whole pipeline against a PDF that has a real header:
    metadata extraction, year inference for short dates, and output
    formatting. This is the failure mode most likely to break on
    the user's actual Chase statements."""

    @pytest.fixture
    def pdf_bytes(self) -> bytes:
        return _build_statement_pdf_with_header()

    def test_metadata_extracted_and_stamped(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _warnings = scan_pdf_for_transactions(pdf_bytes)
        assert rows, "expected at least one transaction"

        expected = {
            "account_number": "****5678",
            "statement_period_start": "20250101",
            "statement_period_end": "20250131",
        }
        for row in rows:
            for field, value in expected.items():
                assert row[field] == value

    def test_short_dates_get_year_from_period(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _warnings = scan_pdf_for_transactions(pdf_bytes)
        # Short ``01/13`` + period ending in 2025 → 20250113
        first, second = rows[0], rows[1]
        assert first["date"] == "20250113"
        assert second["date"] == "20250116"

    def test_iso_format_round_trip(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions

        rows, _warnings = scan_pdf_for_transactions(
            pdf_bytes, output_date_format="%Y-%m-%d",
        )
        first = rows[0]
        assert first["date"] == "2025-01-13"
        assert first["statement_period_start"] == "2025-01-01"
        assert first["statement_period_end"] == "2025-01-31"
|
||||||
|
|
||||||
|
|
||||||
class TestMultiDateRow:
|
class TestMultiDateRow:
|
||||||
"""Some statements (Chase, BofA) show both a transaction date
|
"""Some statements (Chase, BofA) show both a transaction date
|
||||||
and a posting date per row. The scanner uses the first date
|
and a posting date per row. The scanner uses the first date
|
||||||
|
|||||||
Reference in New Issue
Block a user