feat(reconcile): two-source reconciliation tool
Bank-feed-vs-ledger style matcher: 4-pass greedy assignment (key → exact → tolerance → fuzzy) with ambiguous candidates routed to a review bucket instead of arbitrary picks. CLI mirrors the cli_text_clean preview/--apply pattern; Streamlit page registered in the automations section. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
198
src/cli_reconcile.py
Normal file
198
src/cli_reconcile.py
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
"""CLI for the DataTools reconciliation tool.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m src.cli_reconcile bank.csv ledger.csv \\
|
||||||
|
--left-amount amount --right-amount amt \\
|
||||||
|
--left-date date --right-date posted # dry-run preview
|
||||||
|
python -m src.cli_reconcile bank.csv ledger.csv \\
|
||||||
|
--left-amount amount --right-amount amt \\
|
||||||
|
--left-date date --right-date posted --apply # write matched/unmatched CSVs
|
||||||
|
python -m src.cli_reconcile --help # full help
|
||||||
|
|
||||||
|
Outputs (with --apply) sit beside the LEFT input file:
|
||||||
|
{stem}_matched.csv one row per accepted pair
|
||||||
|
{stem}_unmatched_left.csv left rows with no counterpart
|
||||||
|
{stem}_unmatched_right.csv right rows with no counterpart
|
||||||
|
{stem}_review.csv ambiguous pairs flagged for review
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
# Typer application object for the reconcile CLI.  The ``help`` text below is
# runtime output (it is what ``--help`` prints), so it is kept verbatim.
# ``no_args_is_help=True`` and ``add_completion=False`` are passed straight
# through to Typer.
app = typer.Typer(
    name="reconcile",
    help=(
        "Reconcile two data sources (e.g. bank feed vs. ledger export).\n\n"
        "By default, runs in preview mode — shows the match stats without "
        "writing anything. Add --apply to write the four output CSVs.\n\n"
        "Examples:\n\n"
        " # Bank feed vs ledger, exact match\n"
        " python -m src.cli_reconcile bank.csv ledger.csv \\\n"
        " --left-amount amount --right-amount amt \\\n"
        " --left-date date --right-date posted\n\n"
        " # Allow 2-day posting drift and a cent of rounding tolerance\n"
        " python -m src.cli_reconcile bank.csv ledger.csv \\\n"
        " --left-amount amount --right-amount amt \\\n"
        " --left-date date --right-date posted \\\n"
        " --date-tolerance 2 --amount-tolerance 0.01 --apply\n\n"
        " # Bank shows debits as positive; ledger as negative\n"
        " python -m src.cli_reconcile bank.csv ledger.csv \\\n"
        " --left-amount amount --right-amount amt --invert-right-sign --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru sinks for one CLI run and return the log file path.

    Creates *log_dir* if needed, removes previously configured loguru
    handlers, then installs two sinks: stderr at WARNING (message only) and
    a timestamped ``reconcile_<YYYYmmdd_HHMMSS>.log`` file at DEBUG.
    """
    log_dir.mkdir(parents=True, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir / ("reconcile_" + stamp + ".log")

    # Start from a clean slate so handlers added earlier do not duplicate output.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(
        str(destination),
        level="DEBUG",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
    )
    return destination
|
||||||
|
|
||||||
|
|
||||||
|
def _split_csv_arg(raw: Optional[str]) -> list[str]:
    """Split a comma-separated option value into trimmed, non-empty names.

    ``None`` (option not supplied) yields an empty list; blank entries such
    as those produced by ``"a,,b"`` or trailing commas are discarded.
    """
    if raw is None:
        return []
    trimmed = (piece.strip() for piece in raw.split(","))
    return [piece for piece in trimmed if piece]
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
def run(
    left_file: str = typer.Argument(..., help="Path to the LEFT input (e.g. bank feed)."),
    right_file: str = typer.Argument(..., help="Path to the RIGHT input (e.g. ledger)."),
    left_amount: str = typer.Option(..., "--left-amount", help="Amount column on the LEFT."),
    right_amount: str = typer.Option(..., "--right-amount", help="Amount column on the RIGHT."),
    left_date: Optional[str] = typer.Option(None, "--left-date", help="Date column on the LEFT."),
    right_date: Optional[str] = typer.Option(None, "--right-date", help="Date column on the RIGHT."),
    left_keys: Optional[str] = typer.Option(
        None, "--left-keys",
        help="Comma-separated reference/key columns on the LEFT (paired with --right-keys).",
    ),
    right_keys: Optional[str] = typer.Option(
        None, "--right-keys",
        help="Comma-separated reference/key columns on the RIGHT (paired with --left-keys).",
    ),
    left_desc: Optional[str] = typer.Option(None, "--left-desc", help="Description column on the LEFT (fuzzy)."),
    right_desc: Optional[str] = typer.Option(None, "--right-desc", help="Description column on the RIGHT (fuzzy)."),
    desc_min_score: int = typer.Option(
        0, "--desc-min-score",
        help="Min description similarity (0-100) to accept a fuzzy match. 0 disables.",
    ),
    amount_tolerance: float = typer.Option(
        0.0, "--amount-tolerance",
        help="Absolute amount tolerance (e.g. 0.01 to absorb cent-rounding).",
    ),
    date_tolerance: int = typer.Option(
        0, "--date-tolerance",
        help="Date tolerance in calendar days (± N).",
    ),
    invert_right_sign: bool = typer.Option(
        False, "--invert-right-sign",
        help="Negate the RIGHT amount before matching (use when sign conventions differ).",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the four output CSV files. Without this flag, only stats are shown.",
    ),
):
    """Reconcile two CSV/Excel files."""
    # NOTE: the docstring above is shown by Typer as the command help, so it is
    # kept short; implementation notes live in comments.
    #
    # Flow: check both inputs exist -> set up logging -> read both files ->
    # build ReconcileOptions -> reconcile -> print stats -> with --apply, write
    # the four CSVs beside the LEFT input.  Every failure exits with status 1
    # and a one-line message on stderr; the traceback goes to the debug log.

    # Imported lazily, as in the original command body.
    from src.core.io import read_file, write_file
    from src.core.reconcile import ReconcileOptions, reconcile

    left_path = Path(left_file)
    right_path = Path(right_file)
    for p in (left_path, right_path):
        if not p.exists():
            typer.echo(f"Error: File not found: {p}", err=True)
            raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Read both inputs with identical reporting and error handling.
    frames = []
    for path in (left_path, right_path):
        typer.echo(f"Reading {path.name}...")
        try:
            frame = read_file(path)
        except Exception as e:  # CLI boundary: report any reader failure cleanly
            logger.opt(exception=True).debug("failed to read {}", path)
            typer.echo(f"Error reading {path.name}: {e}", err=True)
            raise typer.Exit(1) from e
        typer.echo(f" {len(frame)} rows, {len(frame.columns)} columns")
        frames.append(frame)
    left_df, right_df = frames

    options = ReconcileOptions(
        left_amount=left_amount,
        right_amount=right_amount,
        left_date=left_date,
        right_date=right_date,
        left_keys=_split_csv_arg(left_keys),
        right_keys=_split_csv_arg(right_keys),
        left_desc=left_desc,
        right_desc=right_desc,
        desc_min_score=desc_min_score,
        amount_tolerance=amount_tolerance,
        date_tolerance_days=date_tolerance,
        invert_right_sign=invert_right_sign,
    )

    typer.echo("Reconciling...")
    try:
        result = reconcile(left_df, right_df, options)
    except ValueError as e:
        # Option/column validation problems are user errors, not crashes.
        logger.opt(exception=True).debug("reconcile rejected the inputs")
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e

    _print_stats(result.stats)

    if apply:
        stem = left_path.stem
        out_dir = left_path.parent
        outputs = {
            "matched": result.matched,
            "unmatched_left": result.unmatched_left,
            "unmatched_right": result.unmatched_right,
            "review": result.review,
        }
        try:
            for suffix, frame in outputs.items():
                write_file(frame, out_dir / f"{stem}_{suffix}.csv")
        except Exception as e:  # CLI boundary: e.g. read-only dir, disk full
            logger.opt(exception=True).debug("failed to write outputs to {}", out_dir)
            typer.echo(f"Error writing output files to {out_dir}: {e}", err=True)
            raise typer.Exit(1) from e
        typer.echo(f"\nWrote 4 files to {out_dir}:")
        for suffix in outputs:
            typer.echo(f" {stem}_{suffix}.csv")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output files.")

    typer.echo(f"Log: {log_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def _print_stats(stats: dict) -> None:
    """Echo the reconcile counters between two horizontal rules.

    Reads the keys ``left_rows``, ``right_rows``, ``matched``, ``review``,
    ``unmatched_left`` and ``unmatched_right`` from *stats*, in that order.
    """
    rule = "─" * 50
    labelled_keys = (
        (" Left rows: ", "left_rows"),
        (" Right rows: ", "right_rows"),
        (" Matched: ", "matched"),
        (" Review (ambiguous): ", "review"),
        (" Unmatched left: ", "unmatched_left"),
        (" Unmatched right: ", "unmatched_right"),
    )

    typer.echo(f"\n{rule}")
    # Look each key up right before printing its line, one line per counter.
    for label, key in labelled_keys:
        typer.echo(f"{label}{stats[key]}")
    typer.echo(rule)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Invoke the Typer application object defined in this module."""
    app()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
598
src/core/reconcile.py
Normal file
598
src/core/reconcile.py
Normal file
@@ -0,0 +1,598 @@
|
|||||||
|
"""Two-source data reconciliation.
|
||||||
|
|
||||||
|
Given two DataFrames (typically a bank/credit-card feed and a ledger
|
||||||
|
export), find which rows on the left correspond to rows on the right
|
||||||
|
based on amount, date, and optional reference/description fields.
|
||||||
|
|
||||||
|
Output buckets:
|
||||||
|
matched — one row per accepted pair, with both originals.
|
||||||
|
unmatched_left — left rows with no acceptable right counterpart.
|
||||||
|
unmatched_right — right rows with no acceptable left counterpart.
|
||||||
|
review — ambiguous cases (a left row had >1 equally good
|
||||||
|
right candidates, or vice versa) surfaced for the
|
||||||
|
user to disambiguate manually.
|
||||||
|
|
||||||
|
Matching strategy is a multi-pass greedy one-to-one assignment:
|
||||||
|
Pass 1: exact key match (when ``left_keys`` and ``right_keys`` are both set)
|
||||||
|
Pass 2: exact (amount, date) match
|
||||||
|
Pass 3: amount within tolerance AND date within window
|
||||||
|
Pass 4: + optional description fuzzy similarity boost
|
||||||
|
|
||||||
|
Within each pass, candidate pairs are scored and assigned greedily by
|
||||||
|
descending score; ties for the same left row that span multiple right
|
||||||
|
rows (or vice versa) are sent to ``review`` instead of being matched
|
||||||
|
arbitrarily.
|
||||||
|
|
||||||
|
The module is pure: no I/O, no Streamlit, no logging side effects beyond
|
||||||
|
loguru. Caller drives file reading and result rendering.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
try:
|
||||||
|
from rapidfuzz import fuzz as _rf_fuzz
|
||||||
|
_HAS_RAPIDFUZZ = True
|
||||||
|
except ImportError: # pragma: no cover — rapidfuzz is in requirements.txt
|
||||||
|
_HAS_RAPIDFUZZ = False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Options & result
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReconcileOptions:
    """Settings consumed by :func:`reconcile`.

    Only the two amount columns are mandatory; every other setting has a
    default that switches the corresponding feature off.

    Attributes:
        left_amount, right_amount: Amount column on each side (required).
            Values are coerced to numbers; rows that fail coercion take no
            part in matching and end up in the unmatched buckets.
        left_date, right_date: Date column on each side.  When both are
            set, candidates must fall within ``date_tolerance_days``; when
            unset, dates are ignored entirely.
        left_keys, right_keys: Reference/key columns for the exact-key
            pass.  Both lists must have the same length: the i-th left key
            pairs with the i-th right key (e.g. ``["check_no"]`` with
            ``["ref"]``).
        left_desc, right_desc: Description columns for the fuzzy pass.
            Only used when ``desc_min_score`` > 0 and rapidfuzz is
            installed.
        desc_min_score: Minimum description similarity, 0-100; 0 disables
            fuzzy matching.
        amount_tolerance: Absolute amount tolerance (e.g. 0.01 for cent
            rounding).  Default 0.0 means exact.
        date_tolerance_days: Date window of plus/minus N calendar days.
            Default 0 means exact.
        invert_right_sign: Multiply the right side's amount by -1 before
            matching, for feeds whose debit/credit sign convention is the
            opposite of the ledger's.
    """

    # --- amounts (required) -------------------------------------------------
    left_amount: str = ""
    right_amount: str = ""

    # --- dates (optional, set both or neither) ------------------------------
    left_date: Optional[str] = None
    right_date: Optional[str] = None

    # --- exact-key pass (optional, equal-length lists) ----------------------
    left_keys: list[str] = field(default_factory=list)
    right_keys: list[str] = field(default_factory=list)

    # --- fuzzy description pass (optional) ----------------------------------
    left_desc: Optional[str] = None
    right_desc: Optional[str] = None
    desc_min_score: int = 0

    # --- tolerances (defaults mean exact match) -----------------------------
    amount_tolerance: float = 0.0
    date_tolerance_days: int = 0

    # --- sign convention ----------------------------------------------------
    invert_right_sign: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReconcileResult:
    """The four output buckets of a reconcile run plus summary counters.

    Attributes:
        matched: One row per accepted pair.
        unmatched_left: Left rows that found no counterpart.
        unmatched_right: Right rows that found no counterpart.
        review: Ambiguous pairs set aside for manual disambiguation.
        stats: Integer counters keyed by name; empty by default.

    The frames preserve the original columns from each side, prefixed with
    ``left_`` and ``right_`` where applicable, plus a small set of
    bookkeeping columns (``match_pass``, ``amount_diff``,
    ``date_diff_days``, ``desc_score``).
    """

    matched: pd.DataFrame
    unmatched_left: pd.DataFrame
    unmatched_right: pd.DataFrame
    review: pd.DataFrame
    stats: dict[str, int] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def reconcile(
    left: pd.DataFrame,
    right: pd.DataFrame,
    options: ReconcileOptions,
) -> ReconcileResult:
    """Reconcile *left* against *right* using *options*.

    Runs up to four matching passes in order (key, exact, tolerance, fuzzy),
    each over the rows the previous passes left open, then assembles the
    matched / review / unmatched frames and a dict of counters.

    Neither input is mutated.  The result's frames hold copies of the
    relevant rows from the originals, joined via the bookkeeping columns
    described on :class:`ReconcileResult`.

    Raises:
        ValueError: if *options* is inconsistent with the two frames (raised
            by the option validation step).
    """
    _validate_options(left, right, options)

    # Normalize amounts and dates to typed work columns once, on copies, so
    # the passes do not re-parse and callers' inputs stay untouched.
    L = _prep_side(left, options, side="left")
    R = _prep_side(right, options, side="right")

    # Open sets are seeded from the FULL input frames, not the prepped ones:
    # rows dropped during prep (unparseable amount/date) must still surface
    # in the unmatched buckets.  Candidate generators only see prepped rows,
    # so prep-dropped rows simply never get claimed.
    left_open: set = set(left.index)
    right_open: set = set(right.index)

    matched_pairs: list[dict] = []
    review_pairs: list[dict] = []

    fuzzy_requested = bool(
        options.desc_min_score > 0 and options.left_desc and options.right_desc
    )
    if fuzzy_requested and not _HAS_RAPIDFUZZ:
        # Previously skipped silently; tell the user why fuzzy found nothing.
        logger.warning(
            "Description matching was requested (desc_min_score={}) but "
            "rapidfuzz is not installed; skipping the fuzzy pass.",
            options.desc_min_score,
        )

    # (pass name, candidate generator, enabled?) in execution order.
    passes = (
        ("key", _candidates_by_key,
         bool(options.left_keys and options.right_keys)),
        ("exact", _candidates_exact, True),
        ("tolerance", _candidates_tolerance,
         options.amount_tolerance > 0 or options.date_tolerance_days > 0),
        ("fuzzy", _candidates_fuzzy, fuzzy_requested and _HAS_RAPIDFUZZ),
    )
    for pass_name, candidate_fn, enabled in passes:
        if not enabled:
            continue
        _run_pass(
            L, R, left_open, right_open, matched_pairs, review_pairs,
            options=options, pass_name=pass_name,
            candidate_fn=candidate_fn,
        )

    # Build the four output frames from what remains.
    matched_df = _build_matched(left, right, matched_pairs, options)
    review_df = _build_matched(left, right, review_pairs, options, review=True)
    unmatched_left_df = left.loc[sorted(left_open)].copy()
    unmatched_right_df = right.loc[sorted(right_open)].copy()

    stats = {
        "left_rows": len(left),
        "right_rows": len(right),
        "matched": len(matched_pairs),
        "review": len(review_pairs),  # counts candidate pairs, not rows
        "unmatched_left": len(unmatched_left_df),
        "unmatched_right": len(unmatched_right_df),
    }
    logger.debug("reconcile stats: {}", stats)

    return ReconcileResult(
        matched=matched_df,
        unmatched_left=unmatched_left_df,
        unmatched_right=unmatched_right_df,
        review=review_df,
        stats=stats,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Input validation & prep
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_options(
    left: pd.DataFrame, right: pd.DataFrame, options: ReconcileOptions
) -> None:
    """Check *options* against the two frames before any matching happens.

    Verifies that the amount columns are named and present, date columns are
    set on both sides or neither and exist, key lists are the same length
    and exist, tolerances are non-negative, and ``desc_min_score`` is in
    0-100.  It also checks that description columns exist when fuzzy
    matching is enabled, and that both frames have a unique index (the
    matching passes address rows by index label).

    Raises:
        ValueError: on the first problem found, with a message naming it.
    """
    if not options.left_amount or not options.right_amount:
        raise ValueError(
            "Reconcile requires both left_amount and right_amount columns."
        )
    if options.left_amount not in left.columns:
        raise ValueError(
            f"left_amount column {options.left_amount!r} not in left DataFrame."
        )
    if options.right_amount not in right.columns:
        raise ValueError(
            f"right_amount column {options.right_amount!r} not in right DataFrame."
        )
    if bool(options.left_date) != bool(options.right_date):
        raise ValueError(
            "left_date and right_date must both be set or both be None."
        )
    if options.left_date and options.left_date not in left.columns:
        raise ValueError(f"left_date column {options.left_date!r} not in left.")
    if options.right_date and options.right_date not in right.columns:
        raise ValueError(f"right_date column {options.right_date!r} not in right.")
    if len(options.left_keys) != len(options.right_keys):
        raise ValueError(
            "left_keys and right_keys must be the same length "
            f"(got {len(options.left_keys)} vs {len(options.right_keys)})."
        )
    for c in options.left_keys:
        if c not in left.columns:
            raise ValueError(f"left key column {c!r} not in left DataFrame.")
    for c in options.right_keys:
        if c not in right.columns:
            raise ValueError(f"right key column {c!r} not in right DataFrame.")
    if options.amount_tolerance < 0:
        raise ValueError("amount_tolerance must be >= 0.")
    if options.date_tolerance_days < 0:
        raise ValueError("date_tolerance_days must be >= 0.")
    if not (0 <= options.desc_min_score <= 100):
        raise ValueError("desc_min_score must be between 0 and 100.")

    # Description columns are only read when the fuzzy pass is enabled, so
    # only validate them then; a bad name would otherwise be a KeyError.
    if options.desc_min_score > 0 and options.left_desc and options.right_desc:
        if options.left_desc not in left.columns:
            raise ValueError(
                f"left_desc column {options.left_desc!r} not in left DataFrame."
            )
        if options.right_desc not in right.columns:
            raise ValueError(
                f"right_desc column {options.right_desc!r} not in right DataFrame."
            )

    # Rows are tracked by index label; duplicate labels make scalar lookups
    # return Series and would conflate distinct rows.
    for side, frame in (("left", left), ("right", right)):
        if not frame.index.is_unique:
            raise ValueError(
                f"{side} DataFrame index must be unique "
                "(call reset_index(drop=True) before reconciling)."
            )
|
||||||
|
|
||||||
|
|
||||||
|
def _prep_side(
    df: pd.DataFrame, options: ReconcileOptions, side: str
) -> pd.DataFrame:
    """Copy *df*, add ``_amt`` / ``_date`` work columns, drop unusable rows.

    ``_amt`` is the side's amount column coerced to a number (negated for
    the right side when ``invert_right_sign`` is set).  ``_date`` is the
    side's date column coerced to datetime, or all-NaT when no date column
    is configured.

    Rows whose amount does not parse are removed from the returned frame;
    so are rows whose date does not parse when a date column is configured.
    The input frame is not modified and surviving rows keep their index
    labels.
    """
    use_left = side == "left"
    amt_col = options.left_amount if use_left else options.right_amount
    date_col = options.left_date if use_left else options.right_date

    work = df.copy()

    amounts = pd.to_numeric(work[amt_col], errors="coerce")
    flip = side == "right" and options.invert_right_sign
    work["_amt"] = -amounts if flip else amounts

    work["_date"] = (
        pd.to_datetime(work[date_col], errors="coerce") if date_col else pd.NaT
    )

    # A row is unusable if its amount failed to parse, or (only when dates
    # are configured) its date failed to parse.
    unusable = work["_amt"].isna()
    if date_col:
        unusable = unusable | work["_date"].isna()

    if unusable.any():
        logger.debug(
            "{} side: dropping {} row(s) with unparseable amount/date",
            side, unusable.sum(),
        )
    return work.loc[~unusable].copy()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-pass orchestration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _run_pass(
    L: pd.DataFrame,
    R: pd.DataFrame,
    left_open: set,
    right_open: set,
    matched_pairs: list[dict],
    review_pairs: list[dict],
    *,
    options: ReconcileOptions,
    pass_name: str,
    candidate_fn,
) -> None:
    """Run one matching pass over the still-open indices.

    Calls ``candidate_fn(L_open, R_open, options)`` to obtain candidate
    dicts (this function reads their ``left_idx``, ``right_idx`` and
    ``score`` keys), then assigns greedily, best-scoring left row first.

    Outcomes per left row:
      * exactly one top-scoring open right candidate that no other open
        left row claims at the same score -> appended to ``matched_pairs``;
      * several open right candidates tied at the top score -> all of them
        appended to ``review_pairs``;
      * one candidate, but the right row is contested by another open left
        row at the same score -> all contenders appended to
        ``review_pairs``.

    ``left_open``, ``right_open``, ``matched_pairs`` and ``review_pairs``
    are mutated in place; every index placed in matched or review is
    removed from the open sets.  Each appended dict is the candidate plus a
    ``"pass"`` key holding *pass_name*.  Returns ``None``.
    """
    # Restrict both sides to rows that are prepped AND still unclaimed.
    L_open = L.loc[L.index.intersection(left_open)]
    R_open = R.loc[R.index.intersection(right_open)]
    if L_open.empty or R_open.empty:
        return

    candidates = candidate_fn(L_open, R_open, options)
    if not candidates:
        return

    # Group candidates by left index. For each left row, partition into
    # confident-best (single top score) vs. ambiguous (top score tied).
    by_left: dict = {}
    for cand in candidates:
        by_left.setdefault(cand["left_idx"], []).append(cand)

    # Index each left row's top-scoring candidates by right index.  This
    # powers the reverse-direction ambiguity check below: a right row
    # claimed by two equally-good lefts also routes to review.
    by_right_top: dict = {}
    for li, cands in by_left.items():
        # In-place sort; the loops below rely on cands[0] being the best.
        cands.sort(key=lambda c: c["score"], reverse=True)
        top = cands[0]["score"]
        leaders = [c for c in cands if c["score"] == top]
        for c in leaders:
            by_right_top.setdefault(c["right_idx"], []).append(c)

    # Sort left rows by their leader's score so high-confidence matches
    # claim their right counterpart first; low-confidence rows lose
    # contention if the right row was already taken.
    left_order = sorted(
        by_left.keys(),
        key=lambda li: -by_left[li][0]["score"],
    )

    for li in left_order:
        # May have been sent to review already as a contender of an
        # earlier left row.
        if li not in left_open:
            continue
        cands = by_left[li]
        top_score = cands[0]["score"]
        leaders = [c for c in cands if c["score"] == top_score]

        # Filter to still-open right indices.
        leaders = [c for c in leaders if c["right_idx"] in right_open]
        if not leaders:
            # NOTE(review): only top-score candidates are considered; when
            # all of them are taken the row stays open for later passes
            # rather than falling back to its next-best candidate here.
            continue

        if len(leaders) > 1:
            # Left row is ambiguous on its own side — multiple equally
            # good right candidates remain. Park them all in review.
            for c in leaders:
                review_pairs.append({**c, "pass": pass_name})
            left_open.discard(li)
            for c in leaders:
                right_open.discard(c["right_idx"])
            continue

        pick = leaders[0]
        ri = pick["right_idx"]

        # Mirror check: is the right row contested by another left at
        # the same top score? If so, both lefts go to review and the
        # right row is consumed.
        contenders = [
            c for c in by_right_top.get(ri, [])
            if c["left_idx"] in left_open and c["score"] == pick["score"]
        ]
        if len(contenders) > 1:
            for c in contenders:
                review_pairs.append({**c, "pass": pass_name})
                left_open.discard(c["left_idx"])
            right_open.discard(ri)
            continue

        # Unambiguous in both directions: accept the pair.
        matched_pairs.append({**pick, "pass": pass_name})
        left_open.discard(li)
        right_open.discard(ri)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Candidate generators (one per pass)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _candidates_by_key(
    L: pd.DataFrame, R: pd.DataFrame, options: ReconcileOptions
) -> list[dict]:
    """Candidates whose key columns agree and whose amounts tie out.

    The i-th left key column is compared with the i-th right key column
    after converting values to strings.  Amounts must also agree within
    ``options.amount_tolerance``; otherwise a shared reference number (e.g.
    a check number reused across years) would over-match.  Date is NOT
    required in this pass: a confirmed reference such as an invoice number
    is treated as authoritative even when the posting date drifts.

    Missing key values (NaN/None/NA) are treated as blank, and rows whose
    key is entirely blank never match, so two rows that both lack a
    reference are not paired by this pass.

    Returns a list of candidate dicts built by ``_score_pair`` with
    ``base_score=1000``; empty when no key columns are configured.
    """
    if not options.left_keys:
        return []

    def _key_tuples(frame: pd.DataFrame, cols: list[str]):
        # Stringify, but keep missing values as "" instead of "nan"/"None".
        part = frame[cols]
        text = part.astype(str).where(part.notna(), "")
        # Tuples (not a delimiter-joined string) so ("a|b", "c") and
        # ("a", "b|c") remain distinct keys.
        return zip(frame.index, text.itertuples(index=False, name=None))

    blank = ("",) * len(options.left_keys)
    # Absorb binary-float noise so values that are equal in decimal terms
    # still pass the tolerance check.
    tol = options.amount_tolerance + 1e-9

    R_by_key: dict = {}
    for ri, key in _key_tuples(R, options.right_keys):
        if key != blank:
            R_by_key.setdefault(key, []).append(ri)

    out: list[dict] = []
    for li, key in _key_tuples(L, options.left_keys):
        if key == blank:
            continue
        for ri in R_by_key.get(key, ()):
            if abs(L.at[li, "_amt"] - R.at[ri, "_amt"]) <= tol:
                out.append(_score_pair(L, R, li, ri, base_score=1000))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _candidates_exact(
    L: pd.DataFrame, R: pd.DataFrame, options: ReconcileOptions
) -> list[dict]:
    """Candidates with identical ``_amt`` (and identical ``_date`` if used).

    Dates take part only when both date columns are configured.  This uses
    the same truthiness test as option validation and prep, so an
    empty-string column name means "no dates" rather than comparing NaT
    against NaT and rejecting every pair.

    Returns a list of candidate dicts built by ``_score_pair`` with
    ``base_score=900``.
    """
    has_date = bool(options.left_date and options.right_date)

    # Bucket right side by amount for cheap lookup.
    R_by_amt: dict = {}
    for ri, amt in R["_amt"].items():
        R_by_amt.setdefault(amt, []).append(ri)

    out: list[dict] = []
    for li, amt in L["_amt"].items():
        for ri in R_by_amt.get(amt, ()):
            if has_date and L.at[li, "_date"] != R.at[ri, "_date"]:
                continue
            out.append(_score_pair(L, R, li, ri, base_score=900))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _candidates_tolerance(
    L: pd.DataFrame, R: pd.DataFrame, options: ReconcileOptions
) -> list[dict]:
    """Amount within tolerance and (if configured) date within window.

    For each left row, builds a boolean mask over all right rows, so the
    work is proportional to ``len(L) * len(R)``.  That is fine for typical
    reconciliation sizes (a month of statements: low thousands of rows); a
    100k x 100k input would need a smarter blocking strategy.

    Dates take part only when both date columns are configured (same
    truthiness test as validation and prep).

    Returns a list of candidate dicts built by ``_score_pair`` with
    ``base_score=500``.
    """
    out: list[dict] = []
    has_date = bool(options.left_date and options.right_date)

    # A difference of exactly one cent is often slightly MORE than 0.01 in
    # binary floating point (e.g. 1.13 - 1.12), so widen the window by a
    # negligible epsilon to honour the tolerance the user asked for.
    tol = options.amount_tolerance + 1e-9

    R_amts = R["_amt"].to_numpy()
    R_index = R.index.to_numpy()
    if has_date:
        R_dates = R["_date"].to_numpy()
        win = pd.Timedelta(days=options.date_tolerance_days).to_timedelta64()

    for li in L.index:
        l_amt = L.at[li, "_amt"]
        mask = (R_amts >= l_amt - tol) & (R_amts <= l_amt + tol)
        if has_date:
            date_diff = R_dates - L.at[li, "_date"].to_datetime64()
            mask = mask & (date_diff >= -win) & (date_diff <= win)
        for ri in R_index[mask]:
            out.append(_score_pair(L, R, li, ri, base_score=500))
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _candidates_fuzzy(
    L: pd.DataFrame, R: pd.DataFrame, options: ReconcileOptions
) -> list[dict]:
    """Tolerance-pass candidates re-scored by description similarity.

    A pair is kept only when the amount is within tolerance, the date (if
    configured on both sides) is within the window, and the
    ``token_set_ratio`` of the two descriptions is at least
    ``options.desc_min_score``.  The candidate's base score is
    ``300 + similarity`` so a stronger description match outranks a weaker
    one within this pass.

    Rows with a missing or blank description never match in this pass;
    without that guard two missing values would both stringify to "nan"
    and look identical.

    Returns ``[]`` when rapidfuzz is unavailable or either description
    column is not configured.
    """
    if not (_HAS_RAPIDFUZZ and options.left_desc and options.right_desc):
        return []

    out: list[dict] = []
    has_date = bool(options.left_date and options.right_date)
    # Epsilon absorbs binary-float noise (e.g. 1.13 - 1.12 > 0.01).
    tol = options.amount_tolerance + 1e-9
    win = pd.Timedelta(days=options.date_tolerance_days) if has_date else None
    min_score = options.desc_min_score

    def _clean_text(col: pd.Series) -> pd.Series:
        # Missing -> "", everything else -> trimmed string.
        return col.astype(str).where(col.notna(), "").str.strip()

    L_desc = _clean_text(L[options.left_desc])
    R_desc = _clean_text(R[options.right_desc])

    for li in L.index:
        l_text = L_desc.at[li]
        if not l_text:
            continue
        l_amt = L.at[li, "_amt"]
        l_date = L.at[li, "_date"] if has_date else None
        for ri in R.index:
            if abs(R.at[ri, "_amt"] - l_amt) > tol:
                continue
            if has_date and abs(R.at[ri, "_date"] - l_date) > win:
                continue
            r_text = R_desc.at[ri]
            if not r_text:
                continue
            score = int(_rf_fuzz.token_set_ratio(l_text, r_text))
            if score < min_score:
                continue
            # Base 300 keeps fuzzy below exact/tolerance passes; the
            # 0–100 description score breaks ties within the pass.
            out.append(
                _score_pair(L, R, li, ri, base_score=300 + score, desc_score=score)
            )
    return out
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Scoring & output assembly
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _score_pair(
|
||||||
|
L: pd.DataFrame,
|
||||||
|
R: pd.DataFrame,
|
||||||
|
li,
|
||||||
|
ri,
|
||||||
|
*,
|
||||||
|
base_score: int,
|
||||||
|
desc_score: int = 0,
|
||||||
|
) -> dict:
|
||||||
|
"""Build the candidate record used by the assignment phase."""
|
||||||
|
amt_diff = float(L.at[li, "_amt"] - R.at[ri, "_amt"])
|
||||||
|
l_date = L.at[li, "_date"]
|
||||||
|
r_date = R.at[ri, "_date"]
|
||||||
|
if pd.notna(l_date) and pd.notna(r_date):
|
||||||
|
date_diff_days = int((l_date - r_date).days)
|
||||||
|
else:
|
||||||
|
date_diff_days = None
|
||||||
|
# Penalize larger differences within the same pass so closer matches
|
||||||
|
# win ties. Cap penalty so it can't flip pass ordering.
|
||||||
|
penalty = min(abs(amt_diff) * 10, 50)
|
||||||
|
if date_diff_days is not None:
|
||||||
|
penalty += min(abs(date_diff_days), 50)
|
||||||
|
return {
|
||||||
|
"left_idx": li,
|
||||||
|
"right_idx": ri,
|
||||||
|
"score": base_score - penalty,
|
||||||
|
"amount_diff": amt_diff,
|
||||||
|
"date_diff_days": date_diff_days,
|
||||||
|
"desc_score": desc_score,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_matched(
|
||||||
|
left: pd.DataFrame,
|
||||||
|
right: pd.DataFrame,
|
||||||
|
pairs: list[dict],
|
||||||
|
options: ReconcileOptions,
|
||||||
|
*,
|
||||||
|
review: bool = False,
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
"""Assemble a matched/review frame: bookkeeping cols + originals."""
|
||||||
|
if not pairs:
|
||||||
|
cols = ["match_pass", "score", "amount_diff", "date_diff_days", "desc_score"]
|
||||||
|
cols += [f"left_{c}" for c in left.columns]
|
||||||
|
cols += [f"right_{c}" for c in right.columns]
|
||||||
|
return pd.DataFrame(columns=cols)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for p in pairs:
|
||||||
|
li, ri = p["left_idx"], p["right_idx"]
|
||||||
|
row = {
|
||||||
|
"match_pass": p["pass"],
|
||||||
|
"score": p["score"],
|
||||||
|
"amount_diff": p["amount_diff"],
|
||||||
|
"date_diff_days": p["date_diff_days"],
|
||||||
|
"desc_score": p["desc_score"],
|
||||||
|
}
|
||||||
|
for c in left.columns:
|
||||||
|
row[f"left_{c}"] = left.at[li, c]
|
||||||
|
for c in right.columns:
|
||||||
|
row[f"right_{c}"] = right.at[ri, c]
|
||||||
|
rows.append(row)
|
||||||
|
out = pd.DataFrame(rows)
|
||||||
|
# Stable ordering: review by left_idx so paired rows stay adjacent;
|
||||||
|
# matched by score descending so the user sees the strongest pairs
|
||||||
|
# first.
|
||||||
|
if review:
|
||||||
|
out = out.sort_values("score", ascending=False, kind="stable")
|
||||||
|
else:
|
||||||
|
out = out.sort_values("score", ascending=False, kind="stable")
|
||||||
|
return out.reset_index(drop=True)
|
||||||
324
src/gui/pages/11_Reconciler.py
Normal file
324
src/gui/pages/11_Reconciler.py
Normal file
@@ -0,0 +1,324 @@
|
|||||||
|
"""DataTools Reconcile — Streamlit page.
|
||||||
|
|
||||||
|
Two-source reconciliation (e.g. bank feed vs. ledger): upload both
|
||||||
|
files, pick the amount/date columns on each side, choose tolerance
|
||||||
|
settings, then download four output CSVs (matched, unmatched-left,
|
||||||
|
unmatched-right, review).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
from src.audit import log_event, log_page_open
|
||||||
|
from src.gui.components import (
|
||||||
|
back_to_home_link,
|
||||||
|
hide_streamlit_chrome,
|
||||||
|
html_download_button,
|
||||||
|
render_sticky_footer,
|
||||||
|
)
|
||||||
|
from src.core.reconcile import ReconcileOptions, reconcile
|
||||||
|
|
||||||
|
# Shared page chrome and audit logging, in the same order as the other pages.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()
log_page_open("11_Reconciler")


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

header_caption = (
    "Match transactions between two sources (e.g. bank feed vs. ledger). "
    "Outputs four buckets: matched, unmatched-left, unmatched-right, and "
    "ambiguous-for-review."
)
st.title("Reconcile Two Files")
st.caption(header_caption)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File readers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse uploaded bytes into an all-string DataFrame.

    Every cell is kept as ``str`` and empty cells stay empty strings
    (``keep_default_na=False``), so coercion is left to the caller.

    Args:
        name: Original file name; only its suffix is used to pick the parser
            (``.xlsx``/``.xls`` -> Excel, ``.tsv`` -> tab-separated,
            anything else -> comma-separated).
        data: Raw file content.

    Returns:
        The parsed frame.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio, dtype=str, keep_default_na=False)

    # The separator depends only on the suffix, so compute it once.
    sep = "\t" if suffix == ".tsv" else ","

    def _parse(encoding: str) -> pd.DataFrame:
        bio.seek(0)
        return pd.read_csv(
            bio, dtype=str, keep_default_na=False,
            encoding=encoding, sep=sep, on_bad_lines="warn",
        )

    # Strict encodings first.
    for enc in ("utf-8", "utf-8-sig"):
        try:
            return _parse(enc)
        except UnicodeDecodeError:
            continue
    # latin-1 maps every byte value, so it cannot raise UnicodeDecodeError.
    # That made the old post-loop fallback unreachable, and that fallback
    # dropped the TSV separator and bad-line handling. Use it as the single
    # last resort with the same arguments as the strict attempts.
    return _parse("latin-1")
|
||||||
|
|
||||||
|
|
||||||
|
def _side_panel(side_label: str, key_prefix: str):
    """Render the upload widget and preview for one side of the comparison.

    Returns a ``(DataFrame, file_name)`` tuple, or ``(None, None)`` when no
    file has been uploaded yet or the upload could not be read (an error is
    shown on the page in that case).
    """
    side_lower = side_label.lower()
    st.markdown(f"**{side_label}**")
    uploaded = st.file_uploader(
        f"Upload {side_lower} file (CSV / Excel)",
        type=["csv", "tsv", "xlsx", "xls"],
        key=f"{key_prefix}_upload",
        label_visibility="collapsed",
    )
    if uploaded is None:
        st.caption(f"_No {side_lower} file yet._")
        return None, None

    file_name = uploaded.name
    try:
        frame = _read_uploaded(file_name, uploaded.getvalue())
    except Exception as exc:
        # UI boundary: report any parse failure to the user instead of
        # crashing the page.
        st.error(f"Could not read `{file_name}`: {exc}")
        return None, None

    row_count = len(frame)
    column_count = len(frame.columns)
    st.caption(f"`{file_name}` — {row_count} rows, {column_count} columns")
    with st.expander(f"Preview {side_lower}", expanded=False):
        st.dataframe(frame.head(10), width="stretch")
    return frame, file_name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Side-by-side upload
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
upload_slots = st.columns(2)
col_left, col_right = upload_slots
with col_left:
    left_df, left_name = _side_panel("Left (e.g. bank feed)", "left")
with col_right:
    right_df, right_name = _side_panel("Right (e.g. ledger)", "right")

# Nothing below makes sense until both sides are present.
both_uploaded = left_df is not None and right_df is not None
if not both_uploaded:
    st.info("Upload both files to continue.")
    st.stop()


# ---------------------------------------------------------------------------
# Column mapping
# ---------------------------------------------------------------------------

st.divider()
st.subheader("Match settings")

mapping_slots = st.columns(2)
map_left, map_right = mapping_slots
|
||||||
|
|
||||||
|
|
||||||
|
def _col_pick(label: str, df: pd.DataFrame, key: str, *, allow_none: bool):
    """Show a selectbox over ``df``'s columns and return the chosen name.

    With ``allow_none`` a leading ``"(none)"`` entry is offered. Picking
    ``"(none)"`` returns ``None``.
    """
    choices = (["(none)"] if allow_none else []) + list(df.columns)
    selected = st.selectbox(label, choices, key=key)
    if selected == "(none)":
        return None
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
# Left-hand column choices.
with map_left:
    st.markdown("**Left columns**")
    left_amount = _col_pick(
        label="Amount column", df=left_df, key="left_amount_col", allow_none=False
    )
    left_date = _col_pick(
        label="Date column (optional)", df=left_df, key="left_date_col",
        allow_none=True,
    )
    left_desc = _col_pick(
        label="Description column (optional)", df=left_df, key="left_desc_col",
        allow_none=True,
    )
    left_keys = st.multiselect(
        "Reference columns (optional, e.g. check / invoice no.)",
        options=list(left_df.columns),
        key="left_keys_col",
    )

# Right-hand column choices, same widgets in the same order.
with map_right:
    st.markdown("**Right columns**")
    right_amount = _col_pick(
        label="Amount column", df=right_df, key="right_amount_col", allow_none=False
    )
    right_date = _col_pick(
        label="Date column (optional)", df=right_df, key="right_date_col",
        allow_none=True,
    )
    right_desc = _col_pick(
        label="Description column (optional)", df=right_df, key="right_desc_col",
        allow_none=True,
    )
    right_keys = st.multiselect(
        "Reference columns (must match left count)",
        options=list(right_df.columns),
        key="right_keys_col",
    )
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tolerances & options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
with st.expander("Tolerances & options", expanded=True):
    # Three side-by-side inputs; widgets are created directly on each column.
    tol_a, tol_b, tol_c = st.columns(3)
    amount_tolerance = tol_a.number_input(
        "Amount tolerance",
        min_value=0.0,
        value=0.0,
        step=0.01,
        format="%.4f",
        help="Absolute tolerance on amount (e.g. 0.01 to absorb cent rounding).",
    )
    date_tolerance = tol_b.number_input(
        "Date tolerance (days)",
        min_value=0,
        value=0,
        step=1,
        help="Allow N calendar days of drift between posting dates.",
    )
    invert_right_sign = tol_c.checkbox(
        "Invert right amount sign",
        value=False,
        help="Use when one side records debits as positive and the other as negative.",
    )

    # Full-width slider below the three columns.
    desc_help = (
        "When both sides have a description column set, accept matches with "
        "this minimum fuzzy similarity even if amount/date are merely within "
        "tolerance. Lower = more permissive."
    )
    desc_min_score = st.slider(
        "Description similarity boost (0 disables)",
        min_value=0,
        max_value=100,
        value=0,
        step=5,
        help=desc_help,
    )
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Run
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()

run_clicked = st.button("Reconcile", type="primary", width="stretch")
if run_clicked:
    # Key columns are paired positionally, so both sides need the same count.
    n_left_keys = len(left_keys)
    n_right_keys = len(right_keys)
    if n_left_keys != n_right_keys:
        st.error(
            f"Reference columns must match in count: left has {n_left_keys}, "
            f"right has {n_right_keys}."
        )
        st.stop()

    option_values = {
        "left_amount": left_amount,
        "right_amount": right_amount,
        "left_date": left_date,
        "right_date": right_date,
        "left_keys": list(left_keys),
        "right_keys": list(right_keys),
        "left_desc": left_desc,
        "right_desc": right_desc,
        "desc_min_score": int(desc_min_score),
        "amount_tolerance": float(amount_tolerance),
        "date_tolerance_days": int(date_tolerance),
        "invert_right_sign": bool(invert_right_sign),
    }
    options = ReconcileOptions(**option_values)

    with st.spinner("Reconciling..."):
        try:
            result = reconcile(left_df, right_df, options)
        except ValueError as exc:
            # A ValueError from the engine is shown to the user as-is.
            st.error(str(exc))
            st.stop()

    # Keep the outcome across reruns so the results below keep rendering.
    st.session_state["reconcile_result"] = result
    st.session_state["reconcile_left_name"] = left_name
    log_event("tool_run", "Reconcile run", page="11_Reconciler")

result = st.session_state.get("reconcile_result")
if result is None:
    st.stop()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Results
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Results")

stats = result.stats

# Headline counters, one per bucket.
metric_specs = (
    ("Matched", "matched"),
    ("Review", "review"),
    ("Unmatched left", "unmatched_left"),
    ("Unmatched right", "unmatched_right"),
)
for slot, (metric_label, stat_key) in zip(st.columns(4), metric_specs):
    slot.metric(metric_label, stats[stat_key])

# Coverage: matched pairs relative to the larger input (1 avoids dividing by 0).
larger_side = max(stats["left_rows"], stats["right_rows"]) or 1
pct = stats["matched"] / larger_side * 100
st.caption(f"Coverage: {pct:.1f}% of the larger side")

tab_titles = [
    f"Matched ({stats['matched']})",
    f"Review ({stats['review']})",
    f"Unmatched left ({stats['unmatched_left']})",
    f"Unmatched right ({stats['unmatched_right']})",
]
tab_matched, tab_review, tab_left, tab_right = st.tabs(tab_titles)

with tab_matched:
    if not result.matched.empty:
        st.dataframe(result.matched, width="stretch", hide_index=True)
    else:
        st.info("No matches.")

with tab_review:
    if not result.review.empty:
        st.caption(
            "Pairs flagged because the algorithm couldn't pick a single "
            "best match (e.g. multiple equally-good candidates). Use the "
            "left/right indices to disambiguate manually."
        )
        st.dataframe(result.review, width="stretch", hide_index=True)
    else:
        st.info("Nothing to review — no ambiguous candidates.")

# The two unmatched tabs differ only in frame and empty-state message.
unmatched_tabs = (
    (tab_left, "unmatched_left", "Every left row was matched."),
    (tab_right, "unmatched_right", "Every right row was matched."),
)
for tab, frame_attr, all_matched_message in unmatched_tabs:
    with tab:
        frame = getattr(result, frame_attr)
        if frame.empty:
            st.info(all_matched_message)
        else:
            st.dataframe(frame, width="stretch", hide_index=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Downloads
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
stem = Path(st.session_state.get("reconcile_left_name", "reconcile")).stem

# One download per bucket: (button label, result attribute, file-name suffix).
download_specs = (
    ("Matched CSV", "matched", "matched"),
    ("Review CSV", "review", "review"),
    ("Unmatched left", "unmatched_left", "unmatched_left"),
    ("Unmatched right", "unmatched_right", "unmatched_right"),
)
for slot, (button_label, frame_attr, suffix) in zip(st.columns(4), download_specs):
    with slot:
        frame = getattr(result, frame_attr)
        html_download_button(
            button_label,
            frame.to_csv(index=False).encode("utf-8-sig"),
            file_name=f"{stem}_{suffix}.csv",
            mime="text/csv",
            disabled=frame.empty,
        )
|
||||||
@@ -157,6 +157,18 @@ TOOLS: list[Tool] = [
|
|||||||
status="Ready",
|
status="Ready",
|
||||||
section="transformations",
|
section="transformations",
|
||||||
),
|
),
|
||||||
|
Tool(
|
||||||
|
tool_id="11_reconciler",
|
||||||
|
icon=":material/compare_arrows:",
|
||||||
|
name="Reconcile Two Files",
|
||||||
|
description=(
|
||||||
|
"Match transactions between two sources (e.g. bank feed vs. "
|
||||||
|
"ledger) with amount and date tolerance."
|
||||||
|
),
|
||||||
|
page_slug="11_Reconciler",
|
||||||
|
status="Ready",
|
||||||
|
section="automations",
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
317
tests/test_reconcile.py
Normal file
317
tests/test_reconcile.py
Normal file
@@ -0,0 +1,317 @@
|
|||||||
|
"""Tests for src.core.reconcile — two-source matching engine."""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.reconcile import (
|
||||||
|
ReconcileOptions,
|
||||||
|
ReconcileResult,
|
||||||
|
reconcile,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _bank(rows):
|
||||||
|
return pd.DataFrame(rows, columns=["date", "amount", "desc"])
|
||||||
|
|
||||||
|
|
||||||
|
def _ledger(rows):
|
||||||
|
return pd.DataFrame(rows, columns=["posted", "amt", "memo"])
|
||||||
|
|
||||||
|
|
||||||
|
class TestExactMatch:
    """Exact-pass behaviour, with and without date columns configured."""

    def test_one_to_one_exact(self):
        bank = _bank([
            ("2026-01-05", 100.00, "ACME"),
            ("2026-01-06", 250.00, "WIDGET CO"),
        ])
        ledger = _ledger([
            ("2026-01-05", 100.00, "Acme Inc"),
            ("2026-01-06", 250.00, "Widget"),
        ])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        result = reconcile(bank, ledger, opts)
        stats = result.stats
        assert stats["matched"] == 2
        assert stats["unmatched_left"] == 0
        assert stats["unmatched_right"] == 0
        assert (result.matched["match_pass"] == "exact").all()

    def test_unmatched_left_and_right(self):
        bank = _bank([
            ("2026-01-05", 100.00, "ACME"),
            ("2026-01-07", 99.99, "ONLY ON LEFT"),
        ])
        ledger = _ledger([
            ("2026-01-05", 100.00, "Acme"),
            ("2026-01-08", 500.00, "Only on right"),
        ])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        result = reconcile(bank, ledger, opts)
        stats = result.stats
        assert stats["matched"] == 1
        assert stats["unmatched_left"] == 1
        assert stats["unmatched_right"] == 1
        # Leftover rows keep their original column names and values.
        assert "ONLY ON LEFT" in result.unmatched_left["desc"].tolist()
        assert "Only on right" in result.unmatched_right["memo"].tolist()

    def test_amount_only_no_date(self):
        # With no date columns configured, matching is on amount alone; the
        # wildly different dates must not matter and distinct amounts pair
        # off one-to-one.
        bank = _bank([
            ("2026-01-01", 42.50, "A"),
            ("2026-02-15", 99.00, "B"),
        ])
        ledger = _ledger([
            ("2099-12-31", 42.50, "X"),
            ("1970-01-01", 99.00, "Y"),
        ])
        opts = ReconcileOptions(left_amount="amount", right_amount="amt")
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 2

    def test_identical_amounts_with_no_date_are_ambiguous(self):
        # Two left and two right rows share one amount and there is no date
        # to tell them apart, so nothing may be auto-matched; the candidates
        # go to review instead.
        bank = _bank([
            ("2026-01-01", 42.50, "A"),
            ("2026-02-15", 42.50, "B"),
        ])
        ledger = _ledger([
            ("2099-12-31", 42.50, "X"),
            ("1970-01-01", 42.50, "Y"),
        ])
        opts = ReconcileOptions(left_amount="amount", right_amount="amt")
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 0
        assert result.stats["review"] >= 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestAmountTolerance:
    """Amount differences inside and outside ``amount_tolerance``."""

    def test_amount_within_tolerance(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.02, "X")])
        # 100.00 != 100.02, so the exact pass skips it; the 0.05 tolerance
        # pass should pick it up.
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            amount_tolerance=0.05,
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 1
        first = result.matched.iloc[0]
        assert first["match_pass"] == "tolerance"
        assert abs(first["amount_diff"] - -0.02) < 1e-9

    def test_outside_tolerance_unmatched(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.50, "X")])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            amount_tolerance=0.05,
        )
        stats = reconcile(bank, ledger, opts).stats
        assert stats["matched"] == 0
        assert stats["unmatched_left"] == 1
        assert stats["unmatched_right"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestDateWindow:
    """Posting-date drift inside and outside ``date_tolerance_days``."""

    def test_date_within_window(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-07", 100.00, "X")])  # posted 2 days later
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            date_tolerance_days=3,
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 1
        # Difference is left minus right, hence negative.
        assert result.matched.iloc[0]["date_diff_days"] == -2

    def test_date_outside_window(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-20", 100.00, "X")])  # posted 15 days later
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            date_tolerance_days=5,
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestSignInversion:
    """``invert_right_sign`` reconciles opposite sign conventions."""

    def test_invert_right_sign(self):
        # The bank shows the deposit as +100; the ledger books it as -100.
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", -100.00, "X")])

        # Default sign handling: the amounts disagree, so nothing matches.
        plain = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        r1 = reconcile(bank, ledger, plain)
        assert r1.stats["matched"] == 0

        # Flipping the right-hand sign lines the two rows up.
        inverted = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            invert_right_sign=True,
        )
        r2 = reconcile(bank, ledger, inverted)
        assert r2.stats["matched"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestAmbiguity:
    """Tied candidates go to review; a uniquely best candidate is matched."""

    def test_two_equal_candidates_go_to_review(self):
        # A single left row faces two indistinguishable right rows.
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([
            ("2026-01-05", 100.00, "X"),
            ("2026-01-05", 100.00, "Y"),
        ])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        stats = reconcile(bank, ledger, opts).stats
        assert stats["matched"] == 0
        assert stats["review"] == 2  # one review row per candidate pair
        # The ambiguity consumes the left row and both right rows.
        assert stats["unmatched_left"] == 0
        assert stats["unmatched_right"] == 0

    def test_uniquely_better_match_wins(self):
        # Two rows per side; the tolerance would allow the cross pairs too,
        # but each row has one strictly closer partner.
        bank = _bank([
            ("2026-01-05", 100.00, "A"),
            ("2026-01-05", 100.05, "B"),
        ])
        ledger = _ledger([
            ("2026-01-05", 100.00, "X"),  # A's closest partner
            ("2026-01-05", 100.05, "Y"),  # B's closest partner
        ])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            amount_tolerance=0.10,
        )
        result = reconcile(bank, ledger, opts)
        # Both rows should pair off uniquely in the exact pass, where the
        # in-pass penalty breaks the symmetric near-ties.
        assert result.stats["matched"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestKeyMatch:
    """Reference-key pass: keys outrank dates but still need equal amounts."""

    def test_reference_number_authoritative(self):
        # Check number and amount agree; the posting dates are a week apart.
        # The key pass should pair the rows regardless of the date gap.
        bank = pd.DataFrame(
            [{"date": "2026-01-05", "amount": 100.00, "check_no": "1042"}]
        )
        ledger = pd.DataFrame(
            [{"posted": "2026-01-12", "amt": 100.00, "ref": "1042"}]
        )
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            left_keys=["check_no"], right_keys=["ref"],
            date_tolerance_days=0,  # the exact pass alone would miss this
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 1
        assert result.matched.iloc[0]["match_pass"] == "key"

    def test_key_requires_amount_to_tie(self):
        # Matching reference but different amounts: must not be paired.
        bank = pd.DataFrame(
            [{"date": "2026-01-05", "amount": 100.00, "check_no": "1042"}]
        )
        ledger = pd.DataFrame(
            [{"posted": "2026-01-05", "amt": 200.00, "ref": "1042"}]
        )
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
            left_keys=["check_no"], right_keys=["ref"],
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestInputValidation:
    """Bad option combinations are rejected with ``ValueError``.

    The options object is built inside each ``pytest.raises`` block so the
    test passes whether validation happens at construction or in
    ``reconcile``.
    """

    def test_missing_amount_columns(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.00, "X")])
        with pytest.raises(ValueError, match="left_amount"):
            reconcile(bank, ledger, ReconcileOptions(right_amount="amt"))

    def test_left_date_without_right_date(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.00, "X")])
        with pytest.raises(ValueError, match="both be set or both be None"):
            reconcile(
                bank,
                ledger,
                ReconcileOptions(
                    left_amount="amount",
                    right_amount="amt",
                    left_date="date",  # right_date deliberately omitted
                ),
            )

    def test_mismatched_key_lengths(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.00, "X")])
        with pytest.raises(ValueError, match="same length"):
            reconcile(
                bank,
                ledger,
                ReconcileOptions(
                    left_amount="amount",
                    right_amount="amt",
                    left_keys=["a", "b"],
                    right_keys=["x"],
                ),
            )

    def test_negative_tolerance_rejected(self):
        bank = _bank([("2026-01-05", 100.00, "A")])
        ledger = _ledger([("2026-01-05", 100.00, "X")])
        with pytest.raises(ValueError, match="amount_tolerance"):
            reconcile(
                bank,
                ledger,
                ReconcileOptions(
                    left_amount="amount",
                    right_amount="amt",
                    amount_tolerance=-0.01,
                ),
            )
|
||||||
|
|
||||||
|
|
||||||
|
class TestUnparseableInputs:
    """Rows whose amount cannot be parsed stay visible as unmatched."""

    def test_non_numeric_amount_falls_through(self):
        # The garbage-amount row cannot take part in matching, but it must
        # still show up in unmatched_left rather than disappear.
        bank = pd.DataFrame([
            {"date": "2026-01-05", "amount": "not a number", "desc": "BAD"},
            {"date": "2026-01-05", "amount": 100.00, "desc": "OK"},
        ])
        ledger = _ledger([("2026-01-05", 100.00, "X")])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 1
        # The unparseable row is reported on the left-over side.
        assert "BAD" in result.unmatched_left["desc"].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
class TestResultShape:
    """Shape of the result frames: prefixed columns and empty inputs."""

    def test_matched_carries_both_sides(self):
        bank = _bank([("2026-01-05", 100.00, "ACME")])
        ledger = _ledger([("2026-01-05", 100.00, "Acme Inc")])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        result = reconcile(bank, ledger, opts)
        # Original columns come back prefixed with their side.
        row = result.matched.iloc[0]
        assert row["left_desc"] == "ACME"
        assert row["right_memo"] == "Acme Inc"
        assert row["left_amount"] == 100.00
        assert row["right_amt"] == 100.00

    def test_empty_inputs_return_empty_result(self):
        bank = _bank([])
        ledger = _ledger([])
        opts = ReconcileOptions(
            left_amount="amount", right_amount="amt",
            left_date="date", right_date="posted",
        )
        result = reconcile(bank, ledger, opts)
        assert result.stats["matched"] == 0
        assert result.matched.empty
        assert result.unmatched_left.empty
        assert result.unmatched_right.empty
|
||||||
Reference in New Issue
Block a user