feat(pdf): default output date format to YYYY-MM-DD
User asked to flip the default from YYYYMMDD to YYYY-MM-DD. ISO is the better default for an accountant CSV workflow:

- Lexicographic sort = chronological sort (no parsing needed).
- Every spreadsheet tool the user might import into recognises it as a real date with no ambiguity (US vs EU readers can't disagree on the order).
- Hyphens make the year/month/day boundaries scannable by eye.

Concrete changes:

- New module constant ``DEFAULT_DATE_FORMAT = "%Y-%m-%d"``, used as the default for ``format_date()`` and the ``output_date_format`` keyword on ``scan_pdf_for_transactions``.
- Page's ``_DATE_FORMAT_CHOICES`` reordered so the ISO entry is first (index 0 = default Streamlit selection); YYYYMMDD drops to second.
- Custom-strftime input default also flips to ``%Y-%m-%d``.

Tests updated to reflect the new default (``test_dates_formatted_iso_by_default``, ``test_short_dates_get_year_from_period``, ``test_compact_format_round_trip``, plus a new ``test_default_is_iso`` for the format_date helper).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -128,8 +128,8 @@ if not _pdf_ok:
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DATE_FORMAT_CHOICES = {
|
||||
"YYYYMMDD (20260113)": "%Y%m%d",
|
||||
"YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
|
||||
"YYYYMMDD (20260113)": "%Y%m%d",
|
||||
"MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
|
||||
"DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
|
||||
"MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
|
||||
@@ -173,10 +173,10 @@ with st.expander("Scan options", expanded=False):
|
||||
if output_date_format == "__custom__":
|
||||
output_date_format = c4.text_input(
|
||||
"Custom strftime format",
|
||||
value="%Y%m%d",
|
||||
value="%Y-%m-%d",
|
||||
help=(
|
||||
"Python ``strftime`` codes — e.g., ``%Y%m%d`` for "
|
||||
"20260113, ``%Y-%m-%d`` for 2026-01-13."
|
||||
"Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
|
||||
"2026-01-13, ``%Y%m%d`` for 20260113."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -520,6 +520,13 @@ def _find_amount_tokens(
|
||||
return out
|
||||
|
||||
|
||||
DEFAULT_DATE_FORMAT = "%Y-%m-%d"
|
||||
"""ISO-8601-style ``YYYY-MM-DD``. Default for output date columns
|
||||
because it sorts lexicographically, parses in every spreadsheet
|
||||
tool the user might import the CSV into, and is unambiguous
|
||||
across US/EU readers."""
|
||||
|
||||
|
||||
def format_amount(value, places: int = 2) -> str:
|
||||
"""Render an amount value as a fixed-precision string.
|
||||
|
||||
@@ -549,7 +556,7 @@ def format_amount(value, places: int = 2) -> str:
|
||||
return str(value)
|
||||
|
||||
|
||||
def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str:
|
||||
def format_date(iso_str: str | None, fmt: str = DEFAULT_DATE_FORMAT) -> str:
|
||||
"""Convert an ISO ``YYYY-MM-DD`` date string to *fmt*.
|
||||
|
||||
Returns the input unchanged if it's not parseable as ISO,
|
||||
@@ -840,7 +847,7 @@ def scan_pdf_for_transactions(
|
||||
date_formats: list[str] | None = None,
|
||||
y_tolerance: float = 3.0,
|
||||
merge_multiline_descriptions: bool = True,
|
||||
output_date_format: str = "%Y%m%d",
|
||||
output_date_format: str = DEFAULT_DATE_FORMAT,
|
||||
filename_year_hint: int | None = None,
|
||||
year_override: int | None = None,
|
||||
) -> tuple[list[dict[str, Any]], list[str]]:
|
||||
@@ -850,7 +857,7 @@ def scan_pdf_for_transactions(
|
||||
amount pattern. Each returned record looks like::
|
||||
|
||||
{
|
||||
"date": "20260115", # output_date_format applied
|
||||
"date": "2026-01-15", # output_date_format applied
|
||||
"description": "...",
|
||||
"amount_1": 4.50,
|
||||
"amount_2": 1000.00, # if a second amount was found
|
||||
|
||||
@@ -272,6 +272,11 @@ class TestFormatDate:
|
||||
assert format_date(None) == ""
|
||||
assert format_date("") == ""
|
||||
|
||||
def test_default_is_iso(self):
|
||||
# Default format changed to ISO ``YYYY-MM-DD`` — sorts
|
||||
# naturally and parses across every spreadsheet tool.
|
||||
assert format_date("2026-01-13") == "2026-01-13"
|
||||
|
||||
|
||||
class TestExtractAccountNumber:
|
||||
def test_masked(self):
|
||||
|
||||
@@ -136,21 +136,23 @@ class TestScanPdfForTransactions:
|
||||
f"{[r.get('raw') for r in rows]}"
|
||||
)
|
||||
|
||||
def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
|
||||
def test_dates_formatted_iso_by_default(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
# Default output format is %Y%m%d
|
||||
# Default output format is %Y-%m-%d — ISO ordering, parses
|
||||
# cleanly in every spreadsheet tool the user might import
|
||||
# this CSV into.
|
||||
assert [r["date"] for r in rows] == [
|
||||
"20260115", "20260116", "20260117",
|
||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||
]
|
||||
|
||||
def test_output_date_format_override(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(
|
||||
pdf_bytes, output_date_format="%Y-%m-%d",
|
||||
pdf_bytes, output_date_format="%Y%m%d",
|
||||
)
|
||||
assert [r["date"] for r in rows] == [
|
||||
"2026-01-15", "2026-01-16", "2026-01-17",
|
||||
"20260115", "20260116", "20260117",
|
||||
]
|
||||
|
||||
def test_account_number_field_present_on_every_row(self, pdf_bytes):
|
||||
@@ -224,12 +226,13 @@ class TestStatementHeaderEndToEnd:
|
||||
def test_short_dates_get_year_from_period(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(pdf_bytes)
|
||||
# Short ``01/13`` + period ending in 2025 → 20250113.
|
||||
# Short ``01/13`` + period ending in 2025 → 2025-01-13.
|
||||
# The period itself isn't surfaced as a column anymore, but
|
||||
# the year inference that depends on it still works because
|
||||
# extraction happens internally before the per-row stamp.
|
||||
assert rows[0]["date"] == "20250113"
|
||||
assert rows[1]["date"] == "20250116"
|
||||
# Output is in the default ISO format now.
|
||||
assert rows[0]["date"] == "2025-01-13"
|
||||
assert rows[1]["date"] == "2025-01-16"
|
||||
|
||||
def test_period_fields_not_in_output(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
@@ -238,12 +241,12 @@ class TestStatementHeaderEndToEnd:
|
||||
assert "statement_period_start" not in r
|
||||
assert "statement_period_end" not in r
|
||||
|
||||
def test_iso_format_round_trip(self, pdf_bytes):
|
||||
def test_compact_format_round_trip(self, pdf_bytes):
|
||||
from src.pdf_extract import scan_pdf_for_transactions
|
||||
rows, _ = scan_pdf_for_transactions(
|
||||
pdf_bytes, output_date_format="%Y-%m-%d",
|
||||
pdf_bytes, output_date_format="%Y%m%d",
|
||||
)
|
||||
assert rows[0]["date"] == "2025-01-13"
|
||||
assert rows[0]["date"] == "20250113"
|
||||
|
||||
|
||||
class TestMultiDateRow:
|
||||
|
||||
Reference in New Issue
Block a user