diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index e3f36ff..5f6d5d6 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -128,8 +128,8 @@ if not _pdf_ok: # --------------------------------------------------------------------------- _DATE_FORMAT_CHOICES = { - "YYYYMMDD (20260113)": "%Y%m%d", "YYYY-MM-DD (2026-01-13)": "%Y-%m-%d", + "YYYYMMDD (20260113)": "%Y%m%d", "MM/DD/YYYY (01/13/2026)": "%m/%d/%Y", "DD/MM/YYYY (13/01/2026)": "%d/%m/%Y", "MMM DD, YYYY (Jan 13, 2026)": "%b %d, %Y", @@ -173,10 +173,10 @@ with st.expander("Scan options", expanded=False): if output_date_format == "__custom__": output_date_format = c4.text_input( "Custom strftime format", - value="%Y%m%d", + value="%Y-%m-%d", help=( - "Python ``strftime`` codes — e.g., ``%Y%m%d`` for " - "20260113, ``%Y-%m-%d`` for 2026-01-13." + "Python ``strftime`` codes — e.g., ``%Y-%m-%d`` for " + "2026-01-13, ``%Y%m%d`` for 20260113." ), ) diff --git a/src/pdf_extract.py b/src/pdf_extract.py index 0c40415..cb19ac1 100644 --- a/src/pdf_extract.py +++ b/src/pdf_extract.py @@ -520,6 +520,13 @@ def _find_amount_tokens( return out +DEFAULT_DATE_FORMAT = "%Y-%m-%d" +"""ISO-8601-style ``YYYY-MM-DD``. Default for output date columns +because it sorts lexicographically, parses in every spreadsheet +tool the user might import the CSV into, and is unambiguous +across US/EU readers.""" + + def format_amount(value, places: int = 2) -> str: """Render an amount value as a fixed-precision string. @@ -549,7 +556,7 @@ def format_amount(value, places: int = 2) -> str: return str(value) -def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str: +def format_date(iso_str: str | None, fmt: str = DEFAULT_DATE_FORMAT) -> str: """Convert an ISO ``YYYY-MM-DD`` date string to *fmt*. Returns the input unchanged if it's not parseable as ISO, @@ -840,7 +847,7 @@ def scan_pdf_for_transactions( date_formats: list[str] | None = None, y_tolerance: float = 3.0, merge_multiline_descriptions: bool = True, - output_date_format: str = "%Y%m%d", + output_date_format: str = DEFAULT_DATE_FORMAT, filename_year_hint: int | None = None, year_override: int | None = None, ) -> tuple[list[dict[str, Any]], list[str]]: @@ -850,7 +857,7 @@ def scan_pdf_for_transactions( amount pattern. Each returned record looks like:: { - "date": "20260115", # output_date_format applied + "date": "2026-01-15", # output_date_format applied "description": "...", "amount_1": 4.50, "amount_2": 1000.00, # if a second amount was found diff --git a/tests/test_pdf_extract.py b/tests/test_pdf_extract.py index 3bbd39d..21cc589 100644 --- a/tests/test_pdf_extract.py +++ b/tests/test_pdf_extract.py @@ -272,6 +272,11 @@ class TestFormatDate: assert format_date(None) == "" assert format_date("") == "" + def test_default_is_iso(self): + # Default format changed to ISO ``YYYY-MM-DD`` — sorts + # naturally and parses across every spreadsheet tool. + assert format_date("2026-01-13") == "2026-01-13" + class TestExtractAccountNumber: def test_masked(self): diff --git a/tests/test_pdf_extract_smoke.py b/tests/test_pdf_extract_smoke.py index a7c0fd2..bc5133c 100644 --- a/tests/test_pdf_extract_smoke.py +++ b/tests/test_pdf_extract_smoke.py @@ -136,21 +136,23 @@ class TestScanPdfForTransactions: f"{[r.get('raw') for r in rows]}" ) - def test_dates_formatted_yyyymmdd_by_default(self, pdf_bytes): + def test_dates_formatted_iso_by_default(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) - # Default output format is %Y%m%d + # Default output format is %Y-%m-%d — ISO ordering, parses + # cleanly in every spreadsheet tool the user might import + # this CSV into. assert [r["date"] for r in rows] == [ - "20260115", "20260116", "20260117", + "2026-01-15", "2026-01-16", "2026-01-17", ] def test_output_date_format_override(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions( - pdf_bytes, output_date_format="%Y-%m-%d", + pdf_bytes, output_date_format="%Y%m%d", ) assert [r["date"] for r in rows] == [ - "2026-01-15", "2026-01-16", "2026-01-17", + "20260115", "20260116", "20260117", ] def test_account_number_field_present_on_every_row(self, pdf_bytes): @@ -224,12 +226,13 @@ class TestStatementHeaderEndToEnd: def test_short_dates_get_year_from_period(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions(pdf_bytes) - # Short ``01/13`` + period ending in 2025 → 20250113. + # Short ``01/13`` + period ending in 2025 → 2025-01-13. # The period itself isn't surfaced as a column anymore, but # the year inference that depends on it still works because # extraction happens internally before the per-row stamp. - assert rows[0]["date"] == "20250113" - assert rows[1]["date"] == "20250116" + # Output is in the default ISO format now. + assert rows[0]["date"] == "2025-01-13" + assert rows[1]["date"] == "2025-01-16" def test_period_fields_not_in_output(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions @@ -238,12 +241,12 @@ class TestStatementHeaderEndToEnd: assert "statement_period_start" not in r assert "statement_period_end" not in r - def test_iso_format_round_trip(self, pdf_bytes): + def test_compact_format_round_trip(self, pdf_bytes): from src.pdf_extract import scan_pdf_for_transactions rows, _ = scan_pdf_for_transactions( - pdf_bytes, output_date_format="%Y-%m-%d", + pdf_bytes, output_date_format="%Y%m%d", ) - assert rows[0]["date"] == "2025-01-13" + assert rows[0]["date"] == "20250113" class TestMultiDateRow: