feat(pdf): default output date format to YYYY-MM-DD

User asked to flip the default from YYYYMMDD to YYYY-MM-DD.
ISO is the better default for an accountant CSV workflow:

- Lexicographic sort = chronological sort (no parsing needed);
  the old YYYYMMDD default had this too, so nothing is lost.
- Every spreadsheet tool the user might import into recognises
  it as a real date with no ambiguity (US vs EU readers can't
  disagree on the order).
- Hyphens make the year/month/day boundaries scan-able by eye.

Concrete changes:

- New module constant ``DEFAULT_DATE_FORMAT = "%Y-%m-%d"``,
  used as the default for ``format_date()`` and the
  ``output_date_format`` keyword on
  ``scan_pdf_for_transactions``.
- The page's ``_DATE_FORMAT_CHOICES`` is reordered so the ISO
  entry comes first (index 0 = default Streamlit selection);
  YYYYMMDD drops to second.
- Custom-strftime input default also flips to ``%Y-%m-%d``.

Tests updated to reflect the new default (``test_dates_formatted_iso_by_default``,
``test_short_dates_get_year_from_period``,
``test_compact_format_round_trip``, plus a new
``test_default_is_iso`` for the format_date helper).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 02:04:34 +00:00
parent a0042d4aba
commit 450d4fc9a8
4 changed files with 33 additions and 18 deletions

View File

@@ -128,8 +128,8 @@ if not _pdf_ok:
# ---------------------------------------------------------------------------
_DATE_FORMAT_CHOICES = {
"YYYYMMDD (20260113)": "%Y%m%d",
"YYYY-MM-DD (2026-01-13)": "%Y-%m-%d",
"YYYYMMDD (20260113)": "%Y%m%d",
"MM/DD/YYYY (01/13/2026)": "%m/%d/%Y",
"DD/MM/YYYY (13/01/2026)": "%d/%m/%Y",
"MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y",
@@ -173,10 +173,10 @@ with st.expander("Scan options", expanded=False):
if output_date_format == "__custom__":
output_date_format = c4.text_input(
"Custom strftime format",
value="%Y%m%d",
value="%Y-%m-%d",
help=(
"Python ``strftime`` codes — e.g., ``%Y%m%d`` for "
"20260113, ``%Y-%m-%d`` for 2026-01-13."
"Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for "
"2026-01-13, ``%Y%m%d`` for 20260113."
),
)

View File

@@ -520,6 +520,13 @@ def _find_amount_tokens(
return out
# Shared strftime pattern: the default for ``format_date``'s ``fmt``
# argument and for ``scan_pdf_for_transactions``'s
# ``output_date_format`` keyword, so both stay in sync.
DEFAULT_DATE_FORMAT = "%Y-%m-%d"
"""ISO-8601-style ``YYYY-MM-DD``. Default for output date columns
because it sorts lexicographically, parses in every spreadsheet
tool the user might import the CSV into, and is unambiguous
across US/EU readers."""
def format_amount(value, places: int = 2) -> str:
"""Render an amount value as a fixed-precision string.
@@ -549,7 +556,7 @@ def format_amount(value, places: int = 2) -> str:
return str(value)
def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str:
def format_date(iso_str: str | None, fmt: str = DEFAULT_DATE_FORMAT) -> str:
"""Convert an ISO ``YYYY-MM-DD`` date string to *fmt*.
Returns the input unchanged if it's not parseable as ISO,
@@ -840,7 +847,7 @@ def scan_pdf_for_transactions(
date_formats: list[str] | None = None,
y_tolerance: float = 3.0,
merge_multiline_descriptions: bool = True,
output_date_format: str = "%Y%m%d",
output_date_format: str = DEFAULT_DATE_FORMAT,
filename_year_hint: int | None = None,
year_override: int | None = None,
) -> tuple[list[dict[str, Any]], list[str]]:
@@ -850,7 +857,7 @@ def scan_pdf_for_transactions(
amount pattern. Each returned record looks like::
{
"date": "20260115", # output_date_format applied
"date": "2026-01-15", # output_date_format applied
"description": "...",
"amount_1": 4.50,
"amount_2": 1000.00, # if a second amount was found