feat(pdf): default output date format to YYYY-MM-DD

User asked to flip the default from YYYYMMDD to YYYY-MM-DD. ISO is the better default for an accountant CSV workflow:

- Lexicographic sort = chronological sort (no parsing needed).
- Every spreadsheet tool the user might import into recognises it as a real date with no ambiguity (US vs EU readers can't disagree on the order).
- Hyphens make the year/month/day boundaries scannable by eye.

Concrete changes:

- New module constant ``DEFAULT_DATE_FORMAT = "%Y-%m-%d"``, used as the default for ``format_date()`` and the ``output_date_format`` keyword on ``scan_pdf_for_transactions``.
- Page's ``_DATE_FORMAT_CHOICES`` reordered so the ISO entry is first (index 0 = default Streamlit selection); YYYYMMDD drops to second.
- Custom-strftime input default also flips to ``%Y-%m-%d``.

Tests updated to reflect the new default (``test_dates_formatted_iso_by_default``, ``test_short_dates_get_year_from_period``, ``test_compact_format_round_trip``, plus a new ``test_default_is_iso`` for the ``format_date`` helper).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 02:04:34 +00:00
parent a0042d4aba
commit 450d4fc9a8
4 changed files with 33 additions and 18 deletions
--- a/tests/test_pdf_extract.py
+++ b/tests/test_pdf_extract.py
@@ -272,6 +272,11 @@ class TestFormatDate:
        assert format_date(None) == ""
        assert format_date("") == ""

+    def test_default_is_iso(self):
+        # Default format changed to ISO ``YYYY-MM-DD`` — sorts
+        # naturally and parses across every spreadsheet tool.
+        assert format_date("2026-01-13") == "2026-01-13"
+

 class TestExtractAccountNumber:
    def test_masked(self):
--- a/tests/test_pdf_extract_smoke.py
+++ b/tests/test_pdf_extract_smoke.py
@@ -136,21 +136,23 @@ class TestScanPdfForTransactions:
            f"{[r.get('raw') for r in rows]}"
        )

-    def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes):
+    def test_dates_formatted_iso_by_default(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
-        # Default output format is %Y%m%d
+        # Default output format is %Y-%m-%d — ISO ordering, parses
+        # cleanly in every spreadsheet tool the user might import
+        # this CSV into.
        assert [r["date"] for r in rows] == [
-            "20260115", "20260116", "20260117",
+            "2026-01-15", "2026-01-16", "2026-01-17",
        ]

    def test_output_date_format_override(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(
-            pdf_bytes, output_date_format="%Y-%m-%d",
+            pdf_bytes, output_date_format="%Y%m%d",
        )
        assert [r["date"] for r in rows] == [
-            "2026-01-15", "2026-01-16", "2026-01-17",
+            "20260115", "20260116", "20260117",
        ]

    def test_account_number_field_present_on_every_row(self, pdf_bytes):
@@ -224,12 +226,13 @@ class TestStatementHeaderEndToEnd:
    def test_short_dates_get_year_from_period(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(pdf_bytes)
-        # Short ``01/13`` + period ending in 2025 → 20250113.
+        # Short ``01/13`` + period ending in 2025 → 2025-01-13.
        # The period itself isn't surfaced as a column anymore, but
        # the year inference that depends on it still works because
        # extraction happens internally before the per-row stamp.
-        assert rows[0]["date"] == "20250113"
-        assert rows[1]["date"] == "20250116"
+        # Output is in the default ISO format now.
+        assert rows[0]["date"] == "2025-01-13"
+        assert rows[1]["date"] == "2025-01-16"

    def test_period_fields_not_in_output(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
@@ -238,12 +241,12 @@ class TestStatementHeaderEndToEnd:
            assert "statement_period_start" not in r
            assert "statement_period_end" not in r

-    def test_iso_format_round_trip(self, pdf_bytes):
+    def test_compact_format_round_trip(self, pdf_bytes):
        from src.pdf_extract import scan_pdf_for_transactions
        rows, _ = scan_pdf_for_transactions(
-            pdf_bytes, output_date_format="%Y-%m-%d",
+            pdf_bytes, output_date_format="%Y%m%d",
        )
-        assert rows[0]["date"] == "2025-01-13"
+        assert rows[0]["date"] == "20250113"


 class TestMultiDateRow: