From ad7c22d7fbba3ea608648b6b94f4195272806c9e Mon Sep 17 00:00:00 2001
From: Michael
Date: Wed, 20 May 2026 01:27:16 +0000
Subject: [PATCH] fix(pdf): consistent 2-decimal amount precision in display and CSV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User reported amounts losing trailing zeros — 4.50 rendering as 4.5,
1000.00 as 1000 — on the same statement. Classic float display issue:
Python's native ``repr(4.5)`` is ``'4.5'`` (a float carries no notion of
trailing zeros), and pandas / Streamlit happily show that inconsistency
cell-by-cell.

Two layers of fix, internal type stays ``float`` for arithmetic:

**Display.** ``st.column_config.NumberColumn(format="%.2f")`` applied
programmatically to every ``amount_*`` column on the data_editor. Every
numeric amount now shows with exactly two decimal places regardless of
trailing zeros.

**CSV export.** Pandas' default float-to-CSV writer also drops trailing
zeros (the same issue an accountant would see when opening the file in
Excel). Before serialising, each amount column is mapped through the new
``format_amount`` helper — returns ``f"{v:.2f}"`` for numerics, empty
string for None/NaN/inf, ``str(value)`` for booleans (guards the
``True → "1.00"`` foot-gun since ``bool`` is an ``int`` subclass), and
passes through any string the scanner kept because parsing failed (e.g.
``(4.50)`` when parens-negative is off — user can correct in the editor
before re-exporting).

``format_amount`` lives in ``src/pdf_extract.py`` so it's testable in
isolation (the page module can't easily be unit tested because of its
Streamlit import chain). 7 new tests cover the trailing-zeros case,
negatives, None/empty, string-passthrough, bool guard, NaN/inf, and the
``places`` parameter.
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/10_PDF_Extractor.py | 24 ++++++++++++++++++- src/pdf_extract.py | 30 ++++++++++++++++++++++++ tests/test_pdf_extract.py | 38 +++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/gui/pages/10_PDF_Extractor.py b/src/gui/pages/10_PDF_Extractor.py index 70ac27e..e6a1698 100644 --- a/src/gui/pages/10_PDF_Extractor.py +++ b/src/gui/pages/10_PDF_Extractor.py @@ -25,6 +25,7 @@ from src.gui.components import hide_streamlit_chrome, render_sticky_footer from src.pdf_extract import ( PdfDependencyMissing, diagnose_pdf_lines, + format_amount, ocr_available, scan_pdf_for_transactions, ) @@ -480,6 +481,18 @@ else: column_config["source_file"] = st.column_config.TextColumn( "source_file", disabled=True, ) + # Force 2-decimal display on every amount column. Without this, + # Streamlit / Pandas show floats with their raw repr ("4.5", + # "12.0", "1000") and the precision looks inconsistent across + # rows that all came from the same statement. Internal dtype + # stays float for arithmetic accuracy; only the rendering and + # CSV-export formatting force two-place precision. + for amt_col in (c for c in df.columns if c.startswith("amount_")): + column_config[amt_col] = st.column_config.NumberColumn( + amt_col, + format="%.2f", + help="Two-decimal currency amount.", + ) edited = st.data_editor( df, @@ -511,7 +524,16 @@ else: help="``page`` and ``raw`` are kept off by default; " "tick them if you want them in the file.", ) - export = selected[keep] if keep else selected + export = (selected[keep] if keep else selected).copy() + # Coerce every amount column to a fixed 2-decimal string + # before serialising. Pandas' default float-to-CSV + # writer drops trailing zeros (4.50 → 4.5) which an + # accountant immediately notices in Excel; preserving + # the precision is the whole point of this commit. 
def format_amount(value, places: int = 2) -> str:
    """Render an amount value as a fixed-precision string.

    Floats lose trailing zeros in their native repr (``4.5`` is
    not ``4.50``), which looks inconsistent on a statement where
    every number is currency. This formatter forces *places*
    decimals so 4.5, 12.0 and 1000 all render with the same
    precision.

    Numeric → ``{value:.{places}f}``. "Numeric" means ``int``,
    ``float``, ``decimal.Decimal`` and any other type registered with
    the ``numbers.Integral`` / ``numbers.Real`` ABCs. A result whose
    digits are all zero never carries a minus sign, so ``-0.0`` and
    ``-0.001`` render as ``"0.00"`` rather than ``"-0.00"``.

    ``None`` / non-finite (NaN, ±inf) → empty string. Strings
    (typically the raw token preserved when the amount could not be
    parsed) pass through untouched, the empty string included.
    Booleans pass through as ``str(value)`` — guards against ``True``
    rendering as ``"1.00"`` because Python treats ``bool`` as ``int``.
    Any other object is returned as ``str(value)``.
    """
    # Function-scope imports keep the helper self-contained; after the
    # first call they are a plain ``sys.modules`` lookup.
    import math
    from decimal import Decimal
    from numbers import Integral, Real

    if value is None:
        return ""
    # Must precede the numeric checks: bool is an int subclass.
    if isinstance(value, bool):
        return str(value)

    if isinstance(value, Decimal):
        # Decimal is not a numbers.Real, so it needs its own branch;
        # its __format__ keeps exact decimal rounding.
        if not value.is_finite():
            return ""
        number = value
    elif isinstance(value, Integral):
        # Normalise to a builtin int so formatting never depends on a
        # third-party __format__ implementation.
        number = int(value)
    elif isinstance(value, Real):
        number = float(value)
        if not math.isfinite(number):
            return ""
    else:
        # Strings (including "") and unknown objects: show the source
        # text rather than guessing at a number. No ``== ""`` test is
        # needed, and skipping it avoids comparing arbitrary objects.
        return str(value)

    text = f"{number:.{places}f}"
    # Drop the sign from negative zero / values that rounded to zero:
    # once "-", "0" and "." are stripped nothing is left.
    if text.startswith("-") and not text.strip("-0."):
        text = text[1:]
    return text
class TestFormatAmount:
    """Pin the fixed-precision rendering used for display and CSV export."""

    def test_drops_no_trailing_zeros(self):
        # Regression guard for the reported bug: every amount must keep
        # two decimals, whether or not the trailing digits are zero.
        expected = {4.5: "4.50", 12.0: "12.00", 1000: "1000.00"}
        for raw, rendered in expected.items():
            assert format_amount(raw) == rendered

    def test_negatives(self):
        for raw, rendered in ((-40.0, "-40.00"), (-4.5, "-4.50")):
            assert format_amount(raw) == rendered

    def test_none_and_empty(self):
        for missing in (None, ""):
            assert format_amount(missing) == ""

    def test_string_passthrough(self):
        # A raw token such as ``(4.50)`` that reaches the formatter as
        # text must come back unchanged so the user can fix it by hand.
        raw_token = "(4.50)"
        assert format_amount(raw_token) == raw_token

    def test_bool_doesnt_render_as_number(self):
        # bool subclasses int; without a guard True would be "1.00".
        for flag, rendered in ((True, "True"), (False, "False")):
            assert format_amount(flag) == rendered

    def test_nan_inf_become_empty(self):
        for literal in ("nan", "inf"):
            assert format_amount(float(literal)) == ""

    def test_custom_places(self):
        cases = ((4.5, 4, "4.5000"), (4.567, 0, "5"))
        for raw, places, rendered in cases:
            assert format_amount(raw, places=places) == rendered