fix(pdf): consistent 2-decimal amount precision in display and CSV
User reported amounts losing trailing zeros — 4.50 rendering as
4.5, 1000.00 as 1000 — on the same statement. Classic float
display issue: Python's native ``repr(4.5)`` is ``'4.5'`` — a float
carries no trailing zeros, so ``4.50`` cannot survive — and pandas /
Streamlit happily show that inconsistency cell-by-cell.
Two layers of fix, internal type stays ``float`` for arithmetic:
**Display.** ``st.column_config.NumberColumn(format="%.2f")``
applied programmatically to every ``amount_*`` column on the
data_editor. Every numeric amount now shows with exactly two
decimal places regardless of trailing zeros.
**CSV export.** Pandas' default float-to-CSV writer also drops
trailing zeros (the same issue an accountant would see when
opening the file in Excel). Before serialising, each amount
column is mapped through the new ``format_amount`` helper —
returns ``f"{v:.2f}"`` for numerics, empty string for
None/NaN/inf, ``str(value)`` for booleans (guards the
``True → "1.00"`` foot-gun since ``bool`` is an ``int``
subclass), and passes through any string the scanner kept
because parsing failed (e.g. ``(4.50)`` when parens-negative is
off — user can correct in the editor before re-exporting).
``format_amount`` lives in ``src/pdf_extract.py`` so it's
testable in isolation (the page module can't easily be unit
tested because of its Streamlit import chain). 7 new tests
cover the trailing-zeros case, negatives, None/empty,
string-passthrough, bool guard, NaN/inf, and the ``places``
parameter.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ from src.gui.components import hide_streamlit_chrome, render_sticky_footer
|
||||
from src.pdf_extract import (
|
||||
PdfDependencyMissing,
|
||||
diagnose_pdf_lines,
|
||||
format_amount,
|
||||
ocr_available,
|
||||
scan_pdf_for_transactions,
|
||||
)
|
||||
@@ -480,6 +481,18 @@ else:
|
||||
column_config["source_file"] = st.column_config.TextColumn(
|
||||
"source_file", disabled=True,
|
||||
)
|
||||
# Force 2-decimal display on every amount column. Without this,
|
||||
# Streamlit / Pandas show floats with their raw repr ("4.5",
|
||||
# "12.0", "1000") and the precision looks inconsistent across
|
||||
# rows that all came from the same statement. Internal dtype
|
||||
# stays float for arithmetic accuracy; only the rendering and
|
||||
# CSV-export formatting force two-place precision.
|
||||
for amt_col in (c for c in df.columns if c.startswith("amount_")):
|
||||
column_config[amt_col] = st.column_config.NumberColumn(
|
||||
amt_col,
|
||||
format="%.2f",
|
||||
help="Two-decimal currency amount.",
|
||||
)
|
||||
|
||||
edited = st.data_editor(
|
||||
df,
|
||||
@@ -511,7 +524,16 @@ else:
|
||||
help="``page`` and ``raw`` are kept off by default; "
|
||||
"tick them if you want them in the file.",
|
||||
)
|
||||
export = selected[keep] if keep else selected
|
||||
export = (selected[keep] if keep else selected).copy()
|
||||
# Coerce every amount column to a fixed 2-decimal string
|
||||
# before serialising. Pandas' default float-to-CSV
|
||||
# writer drops trailing zeros (4.50 → 4.5) which an
|
||||
# accountant immediately notices in Excel; preserving
|
||||
# the precision is the whole point of this commit.
|
||||
for amt_col in (
|
||||
c for c in export.columns if c.startswith("amount_")
|
||||
):
|
||||
export[amt_col] = export[amt_col].map(format_amount)
|
||||
csv_bytes = export.to_csv(index=False).encode("utf-8")
|
||||
st.download_button(
|
||||
f"Download {len(export):,} rows as CSV",
|
||||
|
||||
@@ -520,6 +520,35 @@ def _find_amount_tokens(
|
||||
return out
|
||||
|
||||
|
||||
def format_amount(value, places: int = 2) -> str:
    """Render an amount value as a fixed-precision string.

    Floats lose trailing zeros in their native repr (``4.5`` is
    not ``4.50``), so a column of currency values can show a
    different number of decimals on every row. This formatter forces
    *places* decimals so 4.5, 12.0 and 1000 all render with the same
    precision.

    Args:
        value: The cell value to render. May be a number, a string,
            a boolean or ``None``.
        places: Number of digits after the decimal point.

    Returns:
        * ``None``, the empty string and non-finite numbers (NaN,
          ±inf) → ``""``.
        * Strings → returned unchanged, so a raw token that was never
          parsed into a number (e.g. ``"(4.50)"``) stays visible.
        * Booleans → ``str(value)``; guards against ``True`` rendering
          as ``"1.00"`` because ``bool`` is an ``int`` subclass.
        * ``decimal.Decimal`` and any ``numbers.Real`` (``int``,
          ``float``, ``fractions.Fraction``, and scalar types that
          register with the ``numbers`` ABCs) →
          ``{value:.{places}f}``.
        * Anything else → ``str(value)``.
    """
    # Function-scope imports, like the ``import math`` this helper has
    # always carried, so the block does not depend on module imports.
    import math
    import numbers
    from decimal import Decimal

    if value is None:
        return ""
    # Check the type before anything else: strings pass through as-is
    # (the empty string included), and no ``==`` comparison is ever run
    # against objects whose equality does not yield a plain bool.
    if isinstance(value, str):
        return str(value)
    # Must precede the numeric branches: bool is a subclass of int.
    if isinstance(value, bool):
        return str(value)
    # Decimal is not registered as numbers.Real, so handle it on its
    # own and format it natively to keep exact decimal rounding.
    if isinstance(value, Decimal):
        return f"{value:.{places}f}" if value.is_finite() else ""
    if isinstance(value, numbers.Real):
        # Going through float() gives every Real type the same
        # formatting path; for built-in int/float this matches what
        # the ``f`` format code does anyway.
        number = float(value)
        return f"{number:.{places}f}" if math.isfinite(number) else ""
    return str(value)
|
||||
|
||||
|
||||
def format_date(iso_str: str | None, fmt: str = "%Y%m%d") -> str:
|
||||
"""Convert an ISO ``YYYY-MM-DD`` date string to *fmt*.
|
||||
|
||||
@@ -973,6 +1002,7 @@ __all__ = [
|
||||
"extract_pages",
|
||||
"extract_pages_auto",
|
||||
"extract_statement_metadata",
|
||||
"format_amount",
|
||||
"format_date",
|
||||
"ocr_available",
|
||||
"parse_amount",
|
||||
|
||||
@@ -22,6 +22,7 @@ from src.pdf_extract import (
|
||||
_infer_year_for_short_date,
|
||||
cluster_rows,
|
||||
extract_statement_metadata,
|
||||
format_amount,
|
||||
format_date,
|
||||
parse_amount,
|
||||
parse_date,
|
||||
@@ -214,6 +215,43 @@ class TestFindAmountTokens:
|
||||
# the building blocks; smoke tests pin the wiring.
|
||||
|
||||
|
||||
class TestFormatAmount:
    """Pin fixed-precision rendering of amounts for display and export."""

    def test_drops_no_trailing_zeros(self):
        # Regression for the reported bug: every amount must carry the
        # same number of decimals, whatever its float repr looks like.
        cases = [(4.5, "4.50"), (12.0, "12.00"), (1000, "1000.00")]
        for raw, rendered in cases:
            assert format_amount(raw) == rendered

    def test_negatives(self):
        cases = [(-40.0, "-40.00"), (-4.5, "-4.50")]
        for raw, rendered in cases:
            assert format_amount(raw) == rendered

    def test_none_and_empty(self):
        for missing in (None, ""):
            assert format_amount(missing) == ""

    def test_string_passthrough(self):
        # A token the scanner could not parse stays as raw text so the
        # user can see it and correct it in the editor.
        unparsed = "(4.50)"
        assert format_amount(unparsed) == "(4.50)"

    def test_bool_doesnt_render_as_number(self):
        # bool subclasses int; without the guard True would be "1.00".
        cases = [(True, "True"), (False, "False")]
        for flag, rendered in cases:
            assert format_amount(flag) == rendered

    def test_nan_inf_become_empty(self):
        for non_finite in (float("nan"), float("inf")):
            assert format_amount(non_finite) == ""

    def test_custom_places(self):
        cases = [(4.5, 4, "4.5000"), (4.567, 0, "5")]
        for raw, digits, rendered in cases:
            assert format_amount(raw, places=digits) == rendered
|
||||
|
||||
|
||||
class TestFormatDate:
|
||||
def test_yyyymmdd(self):
|
||||
assert format_date("2026-01-13", "%Y%m%d") == "20260113"
|
||||
|
||||
Reference in New Issue
Block a user