feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary - Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections - Add docs/DEVELOPER.md with architecture, data flow, and extension guides - Rewrite src/core/__init__.py with public API exports and module docstring - Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons - Add .gitignore, requirements.txt, all source code, tests, and sample data - Add streamlit to requirements.txt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
4
src/__main__.py
Normal file
4
src/__main__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""Allow running as ``python -m src``."""
|
||||
from src.cli import main
|
||||
|
||||
main()
|
||||
502
src/cli.py
Normal file
502
src/cli.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""CLI for the DataTools deduplicator.
|
||||
|
||||
Usage:
|
||||
python -m src.cli input.csv # dry-run preview
|
||||
python -m src.cli input.csv --apply # write deduplicated output
|
||||
python -m src.cli input.csv --fuzzy name --merge # fuzzy match + merge
|
||||
python -m src.cli --help # full help
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from loguru import logger
|
||||
from rapidfuzz import process as rf_process
|
||||
|
||||
# Typer application object. The ``dedup`` command defined further down is
# registered on it via ``@app.command()``, and ``main()`` simply calls it.
# The ``help`` text is user-facing runtime output, so it is kept verbatim.
app = typer.Typer(
    name="dedup",
    help=(
        "Find and remove duplicate rows in CSV and Excel files.\n\n"
        "By default, runs in preview mode — shows what would change without "
        "modifying anything. Add --apply to write the output.\n\n"
        "Examples:\n\n"
        " # Preview duplicates in a CSV file\n"
        " python -m src.cli customers.csv\n\n"
        " # Remove duplicates and save the result\n"
        " python -m src.cli customers.csv --apply\n\n"
        " # Fuzzy-match on the 'name' column with 80% threshold\n"
        " python -m src.cli customers.csv --fuzzy name --threshold 80 --apply\n\n"
        " # Match on specific columns only\n"
        " python -m src.cli customers.csv --subset email,phone --apply\n\n"
        " # Keep the most complete row and merge missing fields\n"
        " python -m src.cli customers.csv --survivor most-complete --merge --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr and at a fresh timestamped log file.

    Creates *log_dir* when it is missing, removes the handlers loguru had
    registered so far, then installs a WARNING-level stderr sink and a
    DEBUG-level file sink. Returns the path of the file sink.
    """
    log_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = log_dir / f"dedup_{stamp}.log"
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"

    # Start from a clean slate so the default stderr handler does not
    # duplicate every message.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(str(log_path), level="DEBUG", format=file_format)
    return log_path
|
||||
|
||||
|
||||
def _suggest_column(name: str, available: list[str]) -> str:
    """Build the "column not found" message for *name*.

    The message always lists the available columns. When rapidfuzz finds a
    candidate scoring at least 50 against *name*, a "Did you mean" hint for
    the best candidate is appended.
    """
    message = f"Column '{name}' not found. Available columns: {', '.join(available)}."

    candidates = rf_process.extract(name, available, limit=1, score_cutoff=50)
    if not candidates:
        return message

    # Each result is a tuple whose first element is the matched choice.
    best = candidates[0][0]
    return f"{message} Did you mean '{best}'?"
|
||||
|
||||
|
||||
def _validate_columns(requested: list[str], available: list[str]) -> None:
    """Check that every requested column exists.

    Raises:
        typer.BadParameter: for the first entry of *requested* that is not in
            *available*; the message comes from ``_suggest_column``.
    """
    unknown = [col for col in requested if col not in available]
    if unknown:
        # Only the first offender is reported, in the order the user typed it.
        raise typer.BadParameter(_suggest_column(unknown[0], available))
|
||||
|
||||
|
||||
def _parse_normalize_map(raw: Optional[str]) -> dict[str, str]:
|
||||
"""Parse 'col:type,col:type' into a dict."""
|
||||
if not raw:
|
||||
return {}
|
||||
result = {}
|
||||
for pair in raw.split(","):
|
||||
pair = pair.strip()
|
||||
if ":" not in pair:
|
||||
raise typer.BadParameter(
|
||||
f"Invalid normalize format: '{pair}'. "
|
||||
f"Expected 'column:type' (e.g., 'email:email,phone:phone')."
|
||||
)
|
||||
col, ntype = pair.split(":", 1)
|
||||
result[col.strip()] = ntype.strip()
|
||||
return result
|
||||
|
||||
|
||||
def _interactive_review(group, df) -> Optional[bool]:
    """Show one match group on the terminal and ask what to do with it.

    Prints the group header and, for every row in the group, each non-blank
    value from the columns whose name does not start with ``_norm_``. Then
    prompts until the user answers ``y``, ``n`` or ``s``.

    Returns:
        ``True`` for "y" (merge), ``False`` for "n" (keep both), ``None`` for
        "s" (skip remaining).
    """
    from src.core.dedup import MatchResult

    group: MatchResult

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Match Group {group.group_id + 1} — Confidence: {group.confidence:.1f}%")
    print(f"Matched on: {', '.join(group.matched_on)}")
    print(banner)

    # Shadow columns created by normalisation are noise for a human reviewer.
    visible = [c for c in df.columns if not str(c).startswith("_norm_")]
    for idx in group.row_indices:
        print(f"\n Row {idx + 1}:")
        for col in visible:
            val = df.iloc[idx].get(col, "")
            if str(val).strip():
                print(f" {col}: {val}")

    answers = {"y": True, "n": False, "s": None}
    while True:
        reply = input("\n [y] Merge [n] Keep both [s] Skip remaining: ").strip().lower()
        if reply in answers:
            return answers[reply]
        print(" Please enter y, n, or s.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main command
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.command()
def dedup(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to deduplicate.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_deduplicated.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output file. Without this flag, only a preview is shown.",
    ),
    key: Optional[str] = typer.Option(
        None, "--key", "-k",
        help="Comma-separated strong-key columns (e.g., 'fb_id,ein'). Each is an independent exact-match dedup key.",
    ),
    subset: Optional[str] = typer.Option(
        None, "--subset", "-s",
        help="Comma-separated columns to match on (default: auto-detect).",
    ),
    fuzzy: Optional[str] = typer.Option(
        None, "--fuzzy",
        help="Comma-separated columns to fuzzy-match (others use exact match).",
    ),
    algorithm: str = typer.Option(
        "jaro_winkler", "--algorithm", "-a",
        help="Fuzzy algorithm: levenshtein, jaro_winkler, or token_set_ratio.",
    ),
    threshold: int = typer.Option(
        85, "--threshold", "-t",
        help="Similarity threshold 0-100 for fuzzy matching.",
    ),
    normalize: Optional[str] = typer.Option(
        None, "--normalize",
        help="Column normalizers as 'col:type' pairs (e.g., 'email:email,phone:phone').",
    ),
    survivor: str = typer.Option(
        "first", "--survivor",
        help="Survivor rule: first, last, most-complete, or most-recent.",
    ),
    date_column: Optional[str] = typer.Option(
        None, "--date-column",
        help="Date column for most-recent survivor rule.",
    ),
    merge: bool = typer.Option(
        False, "--merge",
        help="Fill missing fields in the surviving row from removed duplicates.",
    ),
    review: bool = typer.Option(
        False, "--review",
        help="Interactively review each match group before merging.",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load settings from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current settings to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Find and remove duplicate rows in CSV and Excel files."""
    # NOTE: the docstring above is kept short and unchanged on purpose — Typer
    # shows a command's docstring as help text, so it is runtime output.
    #
    # Flow: validate the input path -> set up logging -> optionally load a
    # JSON config -> read the file -> build match strategies -> resolve the
    # survivor rule -> optionally save the config -> run ``deduplicate`` ->
    # print a summary -> write output files when --apply is given.
    #
    # Exits with status 1 (typer.Exit) for a missing input/config file, a read
    # failure or an unknown survivor rule; raises typer.BadParameter for an
    # unknown column, algorithm or normalizer type.

    # Heavy imports are deferred so that ``--help`` stays fast.
    import pandas as pd

    from src.core.io import read_file, write_file
    from src.core.dedup import (
        Algorithm, ColumnMatchStrategy, MatchStrategy, SurvivorRule,
        build_default_strategies, deduplicate,
    )
    from src.core.normalizers import NormalizerType
    from src.core.config import (
        ColumnStrategyConfig, DeduplicationConfig, StrategyConfig,
    )

    def _coerce(enum_cls, raw: str, label: str):
        """Return ``enum_cls(raw)``, reporting a bad value as a CLI usage error."""
        try:
            return enum_cls(raw)
        except ValueError:
            raise typer.BadParameter(f"Unknown {label}: '{raw}'.") from None

    # Setup
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Load config if provided
    cfg: Optional[DeduplicationConfig] = None
    if config:
        config_path = Path(config)
        if not config_path.exists():
            typer.echo(f"Error: Config file not found: {config_path}", err=True)
            raise typer.Exit(1)
        cfg = DeduplicationConfig.from_file(config_path)
        logger.info("Loaded config from {}", config_path)

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A purely numeric --sheet value is treated as a sheet index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet

        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
        )
        if not isinstance(df, pd.DataFrame):
            # chunked reading returns generator — materialise for v1
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1) from e

    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")
    available_columns = list(df.columns)

    # Build strategies
    strategies: Optional[list[MatchStrategy]] = None

    if cfg and cfg.strategies:
        # Strategies stored in the config take precedence over --subset/--fuzzy.
        strategies = cfg.to_strategies()
    elif subset or fuzzy:
        # Build from CLI flags
        normalize_map = _parse_normalize_map(normalize)

        # De-duplicate while keeping the order the user typed: a plain set
        # would make column order (and therefore the saved config and the
        # first reported validation error) vary from run to run.
        fuzzy_list = (
            list(dict.fromkeys(c.strip() for c in fuzzy.split(","))) if fuzzy else []
        )
        fuzzy_cols = set(fuzzy_list)

        # When only --fuzzy is given, match on just those columns.
        subset_cols = [c.strip() for c in subset.split(",")] if subset else fuzzy_list

        _validate_columns(subset_cols, available_columns)
        if fuzzy_list:
            _validate_columns(fuzzy_list, available_columns)

        col_strats: list[ColumnMatchStrategy] = []
        for col in subset_cols:
            norm = None
            if col in normalize_map:
                norm = _coerce(NormalizerType, normalize_map[col], "normalizer type")

            if col in fuzzy_cols:
                algo = _coerce(Algorithm, algorithm, "algorithm")
                thresh = float(threshold)
            else:
                algo = Algorithm.EXACT
                thresh = 100.0

            col_strats.append(ColumnMatchStrategy(
                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
            ))

        strategies = [MatchStrategy(column_strategies=col_strats)]

    # Apply normalizer overrides even with auto-detect
    if normalize and strategies is None:
        normalize_map = _parse_normalize_map(normalize)
        auto_strats = build_default_strategies(df)
        # Inject normalize_map into auto strategies
        for strat in auto_strats:
            for cs in strat.column_strategies:
                if cs.column in normalize_map:
                    cs.normalizer = _coerce(
                        NormalizerType, normalize_map[cs.column], "normalizer type"
                    )
        strategies = auto_strats

    # --key: add user-declared strong keys as standalone exact-match strategies
    if key:
        key_cols = [c.strip() for c in key.split(",")]
        _validate_columns(key_cols, available_columns)
        key_strats = [
            MatchStrategy(column_strategies=[
                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
            ])
            for col in key_cols
        ]
        if strategies is None:
            # Combine with auto-detect so user gets both
            strategies = build_default_strategies(df) + key_strats
        else:
            strategies.extend(key_strats)

    # Survivor rule. When a config file is loaded, its survivor rule, merge
    # flag and date column are used instead of the corresponding CLI flags.
    if cfg:
        surv_rule = cfg.to_survivor_rule()
        do_merge = cfg.merge
        dc = cfg.date_column
    else:
        # Accept both "most-complete" and "most_complete" spellings.
        surv_key = survivor.lower().replace("-", "_")
        try:
            surv_rule = SurvivorRule(surv_key)
        except ValueError:
            typer.echo(
                f"Error: Unknown survivor rule '{survivor}'. "
                f"Choose from: first, last, most-complete, most-recent.",
                err=True,
            )
            raise typer.Exit(1) from None
        do_merge = merge
        dc = date_column

    # Save config if requested
    if save_config:
        save_cfg = DeduplicationConfig(
            survivor_rule=surv_rule.value,
            date_column=dc,
            merge=do_merge,
            subset_columns=[c.strip() for c in subset.split(",")] if subset else None,
            fuzzy_columns=[c.strip() for c in fuzzy.split(",")] if fuzzy else None,
            default_algorithm=algorithm,
            default_threshold=float(threshold),
            normalize_map=_parse_normalize_map(normalize),
        )
        if strategies:
            save_cfg.strategies = [
                StrategyConfig(columns=[
                    ColumnStrategyConfig(
                        column=cs.column,
                        algorithm=cs.algorithm.value,
                        threshold=cs.threshold,
                        normalizer=cs.normalizer.value if cs.normalizer else None,
                    )
                    for cs in s.column_strategies
                ])
                for s in strategies
            ]
        saved = save_cfg.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Progress bar (only worth the overhead for larger inputs)
    progress_cb = None
    if len(df) > 10_000:
        from tqdm import tqdm
        pbar = tqdm(total=len(df) * (len(df) - 1) // 2, desc="Comparing rows",
                    unit="pairs", leave=False)

        def _progress(current: int, total: int):
            # The callback receives an absolute count; tqdm wants a delta.
            pbar.update(current - pbar.n)
            if current >= total:
                pbar.close()

        progress_cb = _progress

    # Review callback
    review_cb = _interactive_review if review else None

    # Run dedup
    typer.echo("Finding duplicates...")
    result = deduplicate(
        df,
        strategies=strategies,
        survivor_rule=surv_rule,
        date_column=dc,
        merge=do_merge,
        preview=not apply,
        review_callback=review_cb,
        progress_callback=progress_cb,
    )

    # Print results
    _print_results(result, input_path)

    # Write output files
    if apply:
        stem = input_path.stem

        out_path = Path(output) if output else input_path.parent / f"{stem}_deduplicated.csv"
        write_file(result.deduplicated_df, out_path)
        typer.echo(f"\nDeduplicated file: {out_path}")

        # The audit files always go next to the input file, even with --output.
        if not result.removed_df.empty:
            removed_path = input_path.parent / f"{stem}_removed.csv"
            write_file(result.removed_df, removed_path)
            typer.echo(f"Removed rows: {removed_path}")

        if result.match_groups:
            groups_path = input_path.parent / f"{stem}_match_groups.csv"
            _write_match_groups(result, df, groups_path)
            typer.echo(f"Match groups: {groups_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output files.")

    typer.echo(f"Log: {log_path}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Output formatting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _print_results(result, input_path: Path) -> None:
    """Echo a human-readable summary of a deduplication result.

    Prints the row counts and the number of match groups, followed by one
    line per match group. Only the first 20 groups are listed; the rest are
    summarised in a single trailing line.
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)
    groups = result.match_groups
    rule = "─" * 50

    summary = (
        f"\n{rule}",
        f" File: {input_path.name}",
        f" Rows in: {rows_in}",
        f" Rows out: {rows_out}",
        f" Removed: {rows_in - rows_out}",
        f" Groups: {len(groups)}",
        rule,
    )
    for line in summary:
        typer.echo(line)

    if not groups:
        return

    typer.echo("\nMatch groups:")
    shown = 20  # cap display
    for g in groups[:shown]:
        # Row numbers are presented 1-based.
        members = ", ".join(str(i + 1) for i in g.row_indices)
        typer.echo(
            f" Group {g.group_id + 1}: rows [{members}] "
            f"→ keep row {g.survivor_index + 1} "
            f"(confidence: {g.confidence:.1f}%, "
            f"matched on: {', '.join(g.matched_on)})"
        )

    hidden = len(groups) - shown
    if hidden > 0:
        typer.echo(f" ... and {hidden} more groups")
|
||||
|
||||
|
||||
def _write_match_groups(result, original_df, path: Path) -> None:
    """Write an audit file with one line per row of every match group.

    Each record carries the 1-based group id, whether the row is the group's
    survivor, the group confidence, the matched columns, the 1-based original
    row number, and then every column of *original_df* for that row. The
    records are handed to ``write_file`` as a DataFrame.
    """
    import pandas as pd
    from src.core.io import write_file

    records = []
    for g in result.match_groups:
        matched = ", ".join(g.matched_on)
        for idx in g.row_indices:
            record = {
                "_group_id": g.group_id + 1,
                "_is_survivor": idx == g.survivor_index,
                "_confidence": g.confidence,
                "_matched_on": matched,
                "_original_row": idx + 1,
            }
            # Include original data
            for col in original_df.columns:
                record[col] = original_df.iloc[idx].get(col, "")
            records.append(record)

    write_file(pd.DataFrame(records), path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# __main__ support
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """Run the Typer application (entry point used by ``python -m src``)."""
    app()


# Allow ``python -m src.cli`` / direct execution of this module.
if __name__ == "__main__":
    main()
|
||||
93
src/core/__init__.py
Normal file
93
src/core/__init__.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""DataTools deduplication engine.
|
||||
|
||||
Public API
|
||||
----------
|
||||
Core:
|
||||
deduplicate(df, ...) -> DeduplicationResult
|
||||
build_default_strategies(df) -> list[MatchStrategy]
|
||||
|
||||
Types:
|
||||
Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
|
||||
MatchResult, DeduplicationResult
|
||||
|
||||
Normalizers:
|
||||
get_normalizer(type) -> Callable
|
||||
NormalizerType
|
||||
normalize_email, normalize_phone, normalize_name,
|
||||
normalize_address, normalize_string
|
||||
|
||||
I/O:
|
||||
read_file(path, ...) -> DataFrame
|
||||
write_file(df, path, ...)
|
||||
list_sheets(path) -> list[str]
|
||||
detect_encoding, detect_delimiter, detect_header_row
|
||||
|
||||
Configuration:
|
||||
DeduplicationConfig.from_file(path) -> DeduplicationConfig
|
||||
DeduplicationConfig.to_file(path)
|
||||
"""
|
||||
|
||||
from .dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
DeduplicationResult,
|
||||
MatchResult,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
build_default_strategies,
|
||||
deduplicate,
|
||||
)
|
||||
from .normalizers import (
|
||||
NormalizerType,
|
||||
get_normalizer,
|
||||
normalize_address,
|
||||
normalize_email,
|
||||
normalize_name,
|
||||
normalize_phone,
|
||||
normalize_string,
|
||||
)
|
||||
from .io import (
|
||||
detect_delimiter,
|
||||
detect_encoding,
|
||||
detect_header_row,
|
||||
list_sheets,
|
||||
read_file,
|
||||
write_file,
|
||||
)
|
||||
from .config import (
|
||||
ColumnStrategyConfig,
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
|
||||
# Names re-exported by ``from src.core import *``; every entry is imported
# from one of the submodules above, grouped the same way as the module
# docstring.
__all__ = [
    # Core
    "deduplicate",
    "build_default_strategies",
    # Types
    "Algorithm",
    "SurvivorRule",
    "ColumnMatchStrategy",
    "MatchStrategy",
    "MatchResult",
    "DeduplicationResult",
    # Normalizers
    "NormalizerType",
    "get_normalizer",
    "normalize_email",
    "normalize_phone",
    "normalize_name",
    "normalize_address",
    "normalize_string",
    # I/O
    "read_file",
    "write_file",
    "list_sheets",
    "detect_encoding",
    "detect_delimiter",
    "detect_header_row",
    # Config
    "DeduplicationConfig",
    "StrategyConfig",
    "ColumnStrategyConfig",
]
|
||||
117
src/core/config.py
Normal file
117
src/core/config.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""Configuration profiles: save/load deduplication settings as JSON."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from .dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
MatchStrategy,
|
||||
NormalizerType,
|
||||
SurvivorRule,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class ColumnStrategyConfig:
    """JSON-serializable mirror of ColumnMatchStrategy.

    All fields are plain strings/numbers so the object can go through
    ``dataclasses.asdict`` and ``json.dumps``. ``DeduplicationConfig.to_strategies``
    converts it back into a ``ColumnMatchStrategy``.
    """
    # Name of the column to compare.
    column: str
    # Algorithm value; turned into ``Algorithm(algorithm)`` on conversion.
    algorithm: str = "exact"
    # Similarity threshold copied as-is into the engine object.
    threshold: float = 100.0
    # Normalizer value; turned into ``NormalizerType(normalizer)`` when set.
    normalizer: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class StrategyConfig:
    """JSON-serializable mirror of MatchStrategy.

    Holds the per-column configs that ``DeduplicationConfig.to_strategies``
    turns into one ``MatchStrategy``.
    """
    # Per-column settings; a fresh list per instance (default_factory).
    columns: list[ColumnStrategyConfig] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class DeduplicationConfig:
    """All deduplication settings as a flat JSON-serializable structure.

    The object round-trips through ``to_dict``/``from_dict`` and
    ``to_file``/``from_file``. ``to_strategies`` and ``to_survivor_rule``
    convert the stored strings back into engine objects.
    """

    # Explicit match strategies; empty means "none configured".
    strategies: list[StrategyConfig] = field(default_factory=list)
    # Survivor rule value, e.g. "first" (see ``to_survivor_rule``).
    survivor_rule: str = "first"
    date_column: Optional[str] = None
    merge: bool = False
    subset_columns: Optional[list[str]] = None
    fuzzy_columns: Optional[list[str]] = None
    default_algorithm: str = "jaro_winkler"
    default_threshold: float = 85.0
    normalize_map: Optional[dict[str, str]] = None  # column -> normalizer type

    # -----------------------------------------------------------------------
    # Serialisation
    # -----------------------------------------------------------------------

    def to_dict(self) -> dict:
        """Return the configuration as nested plain dicts/lists."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Save configuration to a JSON file.

        Missing parent directories are created. The file is written as UTF-8
        regardless of the platform default. Returns the path written to.
        """
        out = Path(path)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return out

    @classmethod
    def from_dict(cls, data: dict) -> DeduplicationConfig:
        """Build a config from a dict such as the one ``to_dict`` produces.

        Missing keys fall back to the field defaults. A missing or ``null``
        ``strategies`` entry is treated as an empty list. Unknown keys inside
        a column entry raise ``TypeError`` from the dataclass constructor.
        """
        strategies = [
            StrategyConfig(
                columns=[ColumnStrategyConfig(**c) for c in s.get("columns", [])]
            )
            for s in data.get("strategies") or []
        ]
        return cls(
            strategies=strategies,
            survivor_rule=data.get("survivor_rule", "first"),
            date_column=data.get("date_column"),
            merge=data.get("merge", False),
            subset_columns=data.get("subset_columns"),
            fuzzy_columns=data.get("fuzzy_columns"),
            default_algorithm=data.get("default_algorithm", "jaro_winkler"),
            default_threshold=data.get("default_threshold", 85.0),
            normalize_map=data.get("normalize_map"),
        )

    @classmethod
    def from_file(cls, path: str | Path) -> DeduplicationConfig:
        """Load configuration from a JSON file (read as UTF-8)."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls.from_dict(data)

    @classmethod
    def default(cls) -> DeduplicationConfig:
        """Return sensible defaults (auto-detect strategies at runtime)."""
        return cls()

    # -----------------------------------------------------------------------
    # Convert to engine objects
    # -----------------------------------------------------------------------

    def to_strategies(self) -> Optional[list[MatchStrategy]]:
        """Convert the config back to MatchStrategy objects.

        Returns None if no explicit strategies are configured
        (the engine will auto-detect).

        Raises:
            ValueError: when a stored algorithm or normalizer string is not a
                valid ``Algorithm`` / ``NormalizerType`` value.
        """
        if not self.strategies:
            return None

        return [
            MatchStrategy(column_strategies=[
                ColumnMatchStrategy(
                    column=cc.column,
                    algorithm=Algorithm(cc.algorithm),
                    threshold=cc.threshold,
                    normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None,
                )
                for cc in sc.columns
            ])
            for sc in self.strategies
        ]

    def to_survivor_rule(self) -> SurvivorRule:
        """Return the stored survivor rule as a ``SurvivorRule`` member.

        Hyphenated and mixed-case spellings (e.g. ``"most-complete"``, the
        form the CLI documents) are accepted as well as the enum values.

        Raises:
            ValueError: when the stored string matches no survivor rule.
        """
        return SurvivorRule(self.survivor_rule.strip().lower().replace("-", "_"))
|
||||
568
src/core/dedup.py
Normal file
568
src/core/dedup.py
Normal file
@@ -0,0 +1,568 @@
|
||||
"""Deduplication engine: matching, survivor selection, and merge.
|
||||
|
||||
Core algorithm:
|
||||
1. Normalise columns → shadow ``_norm_*`` columns (computed once).
|
||||
2. Pairwise comparison within each strategy → candidate pairs.
|
||||
3. Union-find for transitive closure (A~B, B~C ⇒ one group).
|
||||
4. Multi-strategy OR: feed all pairs from all strategies into the same union-find.
|
||||
5. Survivor selection per group + optional field merge.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
from rapidfuzz import fuzz as rf_fuzz
|
||||
from rapidfuzz import distance as rf_distance
|
||||
|
||||
from .normalizers import NormalizerType, get_normalizer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Enums & data structures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class Algorithm(str, Enum):
    """Similarity algorithms dispatched on by ``_compute_similarity``.

    The values are the strings accepted on the CLI and stored in config files.
    """
    # 100 when the two strings are equal, otherwise 0.
    EXACT = "exact"
    # Scored with ``rapidfuzz.fuzz.ratio``.
    LEVENSHTEIN = "levenshtein"
    # Scored with ``rapidfuzz.distance.JaroWinkler.similarity`` times 100.
    JARO_WINKLER = "jaro_winkler"
    # Scored with ``rapidfuzz.fuzz.token_set_ratio``.
    TOKEN_SET_RATIO = "token_set_ratio"
|
||||
|
||||
|
||||
class SurvivorRule(str, Enum):
    """Rule for choosing which row of a duplicate group is kept.

    The values are the strings stored in config files. The selection logic
    itself lives elsewhere in this module.
    """
    KEEP_FIRST = "first"
    KEEP_LAST = "last"
    KEEP_MOST_COMPLETE = "most_complete"
    # NOTE(review): presumably relies on the ``date_column`` setting — confirm
    # against the survivor-selection code.
    KEEP_MOST_RECENT = "most_recent"
|
||||
|
||||
|
||||
@dataclass
class ColumnMatchStrategy:
    """How to match on a single column."""
    # Column name in the DataFrame.
    column: str
    # Similarity algorithm used to score a pair of values.
    algorithm: Algorithm = Algorithm.EXACT
    # A pair whose score falls below this value does not match on the column.
    threshold: float = 100.0  # 0-100 scale
    # When set, ``_compare_pair`` reads the ``_norm_<column>`` shadow column
    # instead of the raw column.
    normalizer: Optional[NormalizerType] = None
|
||||
|
||||
|
||||
@dataclass
class MatchStrategy:
    """A set of column strategies combined with AND.

    Multiple ``MatchStrategy`` instances are combined with OR at the top level.
    """
    # Every entry must match (columns empty in both rows are skipped) for the
    # strategy as a whole to match; see ``_compare_pair``.
    column_strategies: list[ColumnMatchStrategy]
|
||||
|
||||
|
||||
@dataclass
class MatchResult:
    """One group of duplicate rows."""
    # 0-based group number (the CLI prints it as ``group_id + 1``).
    group_id: int
    # 0-based positional row indices of the group's members.
    row_indices: list[int]
    confidence: float  # min confidence across pairs in the group
    matched_on: list[str]  # column names that contributed to the match
    survivor_index: int  # index of the row to keep
|
||||
|
||||
|
||||
@dataclass
class DeduplicationResult:
    """Full result of a deduplication run."""
    # Number of rows in the input; the CLI reports
    # ``original_row_count - len(deduplicated_df)`` as the removed count.
    original_row_count: int
    # Rows kept after deduplication.
    deduplicated_df: pd.DataFrame
    # Rows dropped as duplicates; written to ``*_removed.csv`` by the CLI.
    removed_df: pd.DataFrame
    # One entry per group of duplicate rows.
    match_groups: list[MatchResult]
    # NOTE(review): presumably human-readable log lines collected during the
    # run — confirm against ``deduplicate``.
    log_entries: list[str] = field(default_factory=list)
    # NOTE(review): presumably mirrors the ``preview`` argument of
    # ``deduplicate`` — confirm.
    is_preview: bool = True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Union-Find
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _UnionFind:
|
||||
"""Disjoint-set / union-find for transitive closure of match pairs."""
|
||||
|
||||
def __init__(self, n: int):
|
||||
self._parent = list(range(n))
|
||||
self._rank = [0] * n
|
||||
|
||||
def find(self, x: int) -> int:
|
||||
while self._parent[x] != x:
|
||||
self._parent[x] = self._parent[self._parent[x]] # path halving
|
||||
x = self._parent[x]
|
||||
return x
|
||||
|
||||
def union(self, a: int, b: int) -> None:
|
||||
ra, rb = self.find(a), self.find(b)
|
||||
if ra == rb:
|
||||
return
|
||||
if self._rank[ra] < self._rank[rb]:
|
||||
ra, rb = rb, ra
|
||||
self._parent[rb] = ra
|
||||
if self._rank[ra] == self._rank[rb]:
|
||||
self._rank[ra] += 1
|
||||
|
||||
def groups(self) -> dict[int, list[int]]:
|
||||
"""Return {root: [members]} for all non-singleton groups."""
|
||||
from collections import defaultdict
|
||||
g: dict[int, list[int]] = defaultdict(list)
|
||||
for i in range(len(self._parent)):
|
||||
g[self.find(i)].append(i)
|
||||
return {root: members for root, members in g.items() if len(members) > 1}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Similarity computation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _compute_similarity(val_a: str, val_b: str, algorithm: Algorithm) -> float:
    """Score how similar two strings are, on a 0-100 scale.

    Raises:
        ValueError: when *algorithm* is none of the ``Algorithm`` members.
    """
    if algorithm == Algorithm.EXACT:
        score = 100.0 if val_a == val_b else 0.0
    elif algorithm == Algorithm.LEVENSHTEIN:
        score = rf_fuzz.ratio(val_a, val_b)
    elif algorithm == Algorithm.JARO_WINKLER:
        # The JaroWinkler similarity is scaled by 100 so that it lands on the
        # same 0-100 scale as the other scorers.
        score = rf_distance.JaroWinkler.similarity(val_a, val_b) * 100
    elif algorithm == Algorithm.TOKEN_SET_RATIO:
        score = rf_fuzz.token_set_ratio(val_a, val_b)
    else:
        raise ValueError(f"Unknown algorithm: {algorithm}")
    return score
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pair comparison
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _compare_pair(
    row_a: pd.Series,
    row_b: pd.Series,
    strategy: MatchStrategy,
    norm_prefix: str = "_norm_",
) -> tuple[bool, float, list[str]]:
    """Decide whether two rows match under one MatchStrategy.

    Every column strategy has to pass (AND). A column that is empty in both
    rows is ignored; a column that is empty in only one row, or whose score
    is below its threshold, rejects the pair. A pair with no compared column
    at all is also rejected.

    Returns ``(is_match, confidence, matched_columns)`` where *confidence* is
    the lowest column score and rejected pairs yield ``(False, 0.0, [])``.
    """
    lowest = 100.0
    contributing: list[str] = []

    for cs in strategy.column_strategies:
        # Normalised values live in a shadow column named with the prefix.
        source = cs.column if not cs.normalizer else f"{norm_prefix}{cs.column}"
        # NOTE(review): values are stringified as-is, so a missing value that
        # arrives as NaN/None becomes "nan"/"None" rather than empty — confirm
        # that the reader delivers empty strings for blanks.
        left = str(row_a.get(source, ""))
        right = str(row_b.get(source, ""))

        if not (left or right):
            # Nothing to compare on this column.
            continue
        if not (left and right):
            # Present on one side only.
            return False, 0.0, []

        score = _compute_similarity(left, right, cs.algorithm)
        if score < cs.threshold:
            return False, 0.0, []

        if score < lowest:
            lowest = score
        contributing.append(cs.column)

    if contributing:
        return True, lowest, contributing
    return False, 0.0, []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Match-group finding
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _find_match_groups(
    df: pd.DataFrame,
    strategies: list[MatchStrategy],
    *,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> tuple[list[MatchResult], dict[tuple[int, int], tuple[float, list[str]]]]:
    """Pairwise comparison + union-find for transitive closure.

    Every row pair ``(i, j)`` with ``i < j`` is tested against *strategies*
    in order; the first strategy that matches links the two rows (OR logic)
    and its confidence/columns are recorded for that pair.

    *progress_callback*, when given, is called as ``(checked, total_pairs)``
    every 1000 pairs and once more at the end.

    Returns ``(match_groups, pair_info)`` where *pair_info* maps
    ``(i, j)`` → ``(confidence, matched_columns)`` for logging.  The
    ``survivor_index`` of each returned group is only a placeholder (the
    first member).
    """
    n = len(df)
    uf = _UnionFind(n)
    pair_info: dict[tuple[int, int], tuple[float, list[str]]] = {}
    total_pairs = n * (n - 1) // 2
    checked = 0

    # ``df.iloc[k]`` builds a fresh Series on every call.  Materialise each
    # row once instead of once per pair and strategy inside the hot loop.
    rows = [df.iloc[k] for k in range(n)]

    for i in range(n):
        row_i = rows[i]
        for j in range(i + 1, n):
            row_j = rows[j]
            for strategy in strategies:
                is_match, confidence, cols = _compare_pair(row_i, row_j, strategy)
                if is_match:
                    uf.union(i, j)
                    # Each pair is visited exactly once and we stop at the
                    # first matching strategy, so this never overwrites.
                    pair_info[(i, j)] = (confidence, cols)
                    break  # OR logic: one strategy match is enough

            checked += 1
            if progress_callback and checked % 1000 == 0:
                progress_callback(checked, total_pairs)

    if progress_callback:
        progress_callback(total_pairs, total_pairs)

    # Build MatchResult objects (survivor not yet selected).
    # NOTE(review): uf.groups() presumably maps root -> member indices for
    # multi-row groups only; confirm against _UnionFind.
    raw_groups = uf.groups()
    match_groups: list[MatchResult] = []
    for gid, (_root, members) in enumerate(sorted(raw_groups.items())):
        # Confidence = min across all directly matched pairs in the group.
        # Pairs linked only transitively have no pair_info entry.
        group_confidence = 100.0
        group_cols: set[str] = set()
        for pos, first in enumerate(members):
            for second in members[pos + 1:]:
                key = (min(first, second), max(first, second))
                if key in pair_info:
                    conf, cols = pair_info[key]
                    group_confidence = min(group_confidence, conf)
                    group_cols.update(cols)

        match_groups.append(MatchResult(
            group_id=gid,
            row_indices=members,
            confidence=round(group_confidence, 2),
            matched_on=sorted(group_cols),
            survivor_index=members[0],  # placeholder
        ))

    return match_groups, pair_info
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Survivor selection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _select_survivor(
    group: MatchResult,
    df: pd.DataFrame,
    rule: SurvivorRule,
    date_column: Optional[str] = None,
) -> int:
    """Choose the survivor row index within a match group.

    * ``KEEP_FIRST`` / ``KEEP_LAST``: first / last entry of
      ``group.row_indices``.
    * ``KEEP_MOST_COMPLETE``: the row with the fewest empty cells; the
      earliest row wins ties.
    * ``KEEP_MOST_RECENT``: the row with the latest parseable value in
      *date_column*; the earliest row wins ties.  Rows whose date is empty
      or unparseable are never preferred.  Falls back to the first row (with
      a warning) when *date_column* is unset or not in *df*.

    Any other rule falls back to the first row.
    """
    indices = group.row_indices

    if rule == SurvivorRule.KEEP_FIRST:
        return indices[0]

    if rule == SurvivorRule.KEEP_LAST:
        return indices[-1]

    if rule == SurvivorRule.KEEP_MOST_COMPLETE:
        # min() returns the first minimum, so the earliest row wins ties.
        return min(indices, key=lambda idx: _count_empty(df.iloc[idx]))

    if rule == SurvivorRule.KEEP_MOST_RECENT:
        if not date_column or date_column not in df.columns:
            logger.warning("date_column '{}' not found; falling back to keep_first", date_column)
            return indices[0]

        best_idx = indices[0]
        best_date = None
        for idx in indices:
            parsed = _parse_date(df.iloc[idx].get(date_column, ""))
            # An empty cell parses to NaT rather than None.  NaT compares
            # False against everything, so it must never become the
            # baseline, otherwise later valid dates could not win.
            if parsed is None or pd.isna(parsed):
                continue
            if best_date is None or parsed > best_date:
                best_date = parsed
                best_idx = idx
        return best_idx

    return indices[0]
|
||||
|
||||
|
||||
def _count_empty(row: pd.Series) -> int:
|
||||
"""Count empty/blank cells in a row, ignoring internal shadow columns."""
|
||||
count = 0
|
||||
for col, val in row.items():
|
||||
if isinstance(col, str) and col.startswith("_norm_"):
|
||||
continue
|
||||
if pd.isna(val) or str(val).strip() == "":
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def _parse_date(value) -> Optional[pd.Timestamp]:
|
||||
try:
|
||||
return pd.to_datetime(value)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Merge mode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _merge_group(df: pd.DataFrame, survivor_idx: int, loser_indices: list[int]) -> pd.Series:
|
||||
"""Fill missing fields in survivor from losers (ordered by position)."""
|
||||
survivor = df.iloc[survivor_idx].copy()
|
||||
for col in survivor.index:
|
||||
if isinstance(col, str) and col.startswith("_norm_"):
|
||||
continue
|
||||
val = survivor[col]
|
||||
if pd.isna(val) or str(val).strip() == "":
|
||||
for loser_idx in loser_indices:
|
||||
candidate = df.iloc[loser_idx][col]
|
||||
if not pd.isna(candidate) and str(candidate).strip() != "":
|
||||
survivor[col] = candidate
|
||||
break
|
||||
return survivor
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auto-detect strategies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Column-name heuristics, one tuple per recognised column type:
# (pattern, normalizer, algorithm, threshold, is_strong_key)
# Strong keys (email, phone) can be standalone strategies.
# Weak keys (name, address) must be combined with a strong key via AND.
# Thresholds are on the 0-100 similarity scale.  The name pattern is anchored
# (whole-name match); the other patterns match anywhere in the column name.
_COLUMN_TYPE_PATTERNS: list[tuple[re.Pattern, NormalizerType, Algorithm, float, bool]] = [
    (re.compile(r"e[-_]?mail", re.I), NormalizerType.EMAIL, Algorithm.EXACT, 100.0, True),
    (re.compile(r"phone|telephone|mobile|cell", re.I), NormalizerType.PHONE, Algorithm.EXACT, 100.0, True),
    (re.compile(r"^(name|full_name|customer_name|first_name|last_name|contact_name|respondent_name)$", re.I),
     NormalizerType.NAME, Algorithm.JARO_WINKLER, 85.0, False),
    (re.compile(r"address|street|addr", re.I), NormalizerType.ADDRESS, Algorithm.TOKEN_SET_RATIO, 80.0, False),
]
|
||||
|
||||
|
||||
def build_default_strategies(df: pd.DataFrame) -> list[MatchStrategy]:
    """Auto-detect column types and build match strategies.

    Each string column name is tested against ``_COLUMN_TYPE_PATTERNS`` in
    order; the first pattern that matches decides the column's type.
    ``_norm_*`` shadow columns and non-string column labels (e.g. the integer
    labels of a header-less frame) are skipped during detection.  They used
    to raise ``AttributeError``/``TypeError``.

    Strategy logic:
    - Strong keys (email, phone): each gets its own standalone OR strategy.
    - Weak keys (name, address): combined with each strong key via AND to
      form additional strategies.  Weak keys never stand alone (too many
      false positives — "John" ≈ "Jon" at 93 % Jaro-Winkler).
    - If only weak keys are found (no strong keys), they're promoted to
      standalone strategies as a fallback.
    - If no columns match, exact match on all columns (drop_duplicates
      equivalent).

    Returns the list of strategies; it is never empty.
    """
    strong_cols: list[ColumnMatchStrategy] = []
    weak_cols: list[ColumnMatchStrategy] = []

    for col in df.columns:
        # Name-based detection only makes sense for string labels; this also
        # mirrors the isinstance guard used for shadow columns elsewhere.
        if not isinstance(col, str) or col.startswith("_norm_"):
            continue
        for pattern, norm_type, algo, threshold, is_strong in _COLUMN_TYPE_PATTERNS:
            if not pattern.search(col):
                continue
            cs = ColumnMatchStrategy(
                column=col, algorithm=algo,
                threshold=threshold, normalizer=norm_type,
            )
            (strong_cols if is_strong else weak_cols).append(cs)
            break  # first matching pattern wins

    strategies: list[MatchStrategy] = []

    if strong_cols:
        # Each strong key is a standalone strategy (OR)
        strategies.extend(MatchStrategy(column_strategies=[sc]) for sc in strong_cols)
        # Each weak key is paired with each strong key (AND) for extra recall
        strategies.extend(
            MatchStrategy(column_strategies=[wc, sc])
            for wc in weak_cols
            for sc in strong_cols
        )
    elif weak_cols:
        # No strong keys — promote weak to standalone (best effort)
        strategies.extend(MatchStrategy(column_strategies=[wc]) for wc in weak_cols)

    if strategies:
        return strategies

    # Fallback: exact match on all columns (equivalent to drop_duplicates)
    logger.info("No column patterns matched; using exact match on all columns")
    all_cols = [
        ColumnMatchStrategy(column=c, algorithm=Algorithm.EXACT, threshold=100.0)
        for c in df.columns
    ]
    return [MatchStrategy(column_strategies=all_cols)]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation pass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _apply_normalizations(df: pd.DataFrame, strategies: list[MatchStrategy]) -> pd.DataFrame:
|
||||
"""Add ``_norm_*`` shadow columns for every column that has a normalizer."""
|
||||
df = df.copy()
|
||||
seen: set[str] = set()
|
||||
for strategy in strategies:
|
||||
for cs in strategy.column_strategies:
|
||||
if cs.normalizer and cs.column not in seen and cs.column in df.columns:
|
||||
seen.add(cs.column)
|
||||
norm_fn = get_normalizer(cs.normalizer)
|
||||
norm_col = f"_norm_{cs.column}"
|
||||
df[norm_col] = df[cs.column].apply(
|
||||
lambda v, fn=norm_fn: fn(str(v)) if pd.notna(v) and str(v).strip() else ""
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def deduplicate(
    df: pd.DataFrame,
    *,
    strategies: Optional[list[MatchStrategy]] = None,
    survivor_rule: SurvivorRule = SurvivorRule.KEEP_FIRST,
    date_column: Optional[str] = None,
    merge: bool = False,
    preview: bool = True,
    review_callback: Optional[Callable] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> DeduplicationResult:
    """Run the full deduplication pipeline.

    Steps: build (or accept) strategies → add ``_norm_*`` shadow columns →
    find match groups → optional interactive review → pick a survivor per
    group → optionally merge loser values into the survivor → assemble the
    deduplicated and removed frames (shadow columns dropped, index reset).

    Parameters
    ----------
    df : input DataFrame
    strategies : matching strategies (auto-detected if None)
    survivor_rule : which row to keep per group
    date_column : used with ``KEEP_MOST_RECENT``
    merge : fill missing fields in survivor from losers
    preview : if True, result is informational only (no writes)
    review_callback : ``(group: MatchResult, df: DataFrame) -> bool|None``
        Called for each match group.  Only groups for which it returns
        ``True`` are deduplicated; ``False`` (rejected) and anything else
        (skipped) leave all rows of the group in place.
    progress_callback : ``(current: int, total: int) -> None``
        Called periodically during pairwise comparison.

    Returns a ``DeduplicationResult``.
    """

    def _is_blank(cell) -> bool:
        """Same blank test ``_merge_group`` uses to decide what to fill."""
        return pd.isna(cell) or str(cell).strip() == ""

    log_entries: list[str] = []
    original_count = len(df)

    if strategies is None:
        strategies = build_default_strategies(df)
        log_entries.append(f"Auto-detected {len(strategies)} match strategies")

    # Log strategies
    for i, s in enumerate(strategies):
        cols_desc = ", ".join(
            f"{cs.column}({cs.algorithm.value}@{cs.threshold})"
            for cs in s.column_strategies
        )
        log_entries.append(f"Strategy {i}: {cols_desc}")
        logger.info("Strategy {}: {}", i, cols_desc)

    # Normalise
    df_work = _apply_normalizations(df, strategies)

    # Find matches (the per-pair details are not needed here)
    match_groups, _pair_info = _find_match_groups(
        df_work, strategies, progress_callback=progress_callback
    )
    log_entries.append(f"Found {len(match_groups)} duplicate groups")
    logger.info("Found {} duplicate groups from {} rows", len(match_groups), original_count)

    # Interactive review
    if review_callback and match_groups:
        reviewed_groups: list[MatchResult] = []
        for group in match_groups:
            decision = review_callback(group, df_work)
            if decision is True:
                reviewed_groups.append(group)
                log_entries.append(f"Group {group.group_id}: accepted by reviewer")
            elif decision is False:
                log_entries.append(f"Group {group.group_id}: rejected by reviewer")
            else:
                log_entries.append(f"Group {group.group_id}: skipped by reviewer")
        match_groups = reviewed_groups

    # Survivor selection
    for group in match_groups:
        group.survivor_index = _select_survivor(group, df_work, survivor_rule, date_column)
        log_entries.append(
            f"Group {group.group_id}: survivor=row {group.survivor_index} "
            f"(rule={survivor_rule.value}, confidence={group.confidence}%)"
        )

    # Decide which rows go and which survivors get merged values
    remove_indices: set[int] = set()
    merged_rows: dict[int, pd.Series] = {}

    for group in match_groups:
        survivor_idx = group.survivor_index
        losers = [i for i in group.row_indices if i != survivor_idx]
        remove_indices.update(losers)

        if not (merge and losers):
            continue

        merged = _merge_group(df_work, survivor_idx, losers)
        merged_rows[survivor_idx] = merged

        # Log merged fields.  A field counts as merged when the survivor's
        # original value was blank (including NaN/None, which the previous
        # string-based test missed) and the merged value is not.
        original = df_work.iloc[survivor_idx]
        for col in original.index:
            if isinstance(col, str) and col.startswith("_norm_"):
                continue
            if not _is_blank(original[col]) or _is_blank(merged[col]):
                continue
            new_val = str(merged[col]).strip()
            log_entries.append(
                f"Group {group.group_id}: merged '{col}' "
                f"into survivor from losers: '{new_val}'"
            )

    # Build output DataFrames
    keep_indices = [i for i in range(len(df_work)) if i not in remove_indices]

    if merged_rows:
        rows = [
            merged_rows[i] if i in merged_rows else df_work.iloc[i]
            for i in keep_indices
        ]
        deduplicated_df = pd.DataFrame(rows)
    else:
        deduplicated_df = df_work.iloc[keep_indices].copy()

    removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame()

    # Drop shadow columns from output
    norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
    deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
    if not removed_df.empty:
        removed_df = removed_df.drop(columns=norm_cols, errors="ignore")

    # Reset index
    deduplicated_df = deduplicated_df.reset_index(drop=True)
    if not removed_df.empty:
        removed_df = removed_df.reset_index(drop=True)

    removed_count = original_count - len(deduplicated_df)
    log_entries.append(f"Result: {original_count} → {len(deduplicated_df)} rows ({removed_count} removed)")

    return DeduplicationResult(
        original_row_count=original_count,
        deduplicated_df=deduplicated_df,
        removed_df=removed_df,
        match_groups=match_groups,
        log_entries=log_entries,
        is_preview=preview,
    )
|
||||
247
src/core/io.py
Normal file
247
src/core/io.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""File I/O: encoding/delimiter detection, CSV/Excel reading, output writing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
from pathlib import Path
|
||||
from typing import Generator, Optional
|
||||
|
||||
import pandas as pd
|
||||
from charset_normalizer import from_bytes
|
||||
from loguru import logger
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Encoding detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    """Detect file encoding by reading the first *sample_bytes*.

    Only the sample is read from disk (the whole file used to be loaded
    before slicing).  A byte-order mark, if present, decides the result:
    ``utf-8-sig``, ``utf-32`` or ``utf-16``.  UTF-32 is checked before UTF-16
    because the UTF-32-LE BOM starts with the UTF-16-LE BOM.  Otherwise
    ``charset_normalizer`` makes the guess.

    Returns the best-guess encoding name (e.g. ``utf-8``, ``windows-1252``).
    Falls back to ``utf-8`` when the file is empty or detection is
    inconclusive; plain ASCII is reported as ``utf-8``.
    """
    with Path(path).open("rb") as fh:
        raw = fh.read(sample_bytes)
    if not raw:
        return "utf-8"

    # Check BOM first
    if raw[:3] == b"\xef\xbb\xbf":
        return "utf-8-sig"
    if raw[:4] in (b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff"):
        return "utf-32"
    if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
        return "utf-16"

    result = from_bytes(raw).best()
    if result is None:
        return "utf-8"
    enc = result.encoding.lower()
    # Normalise common aliases: ASCII is a strict subset of UTF-8.
    if enc in ("ascii", "us-ascii"):
        enc = "utf-8"
    return enc
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Delimiter detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_COMMON_DELIMITERS = [",", "\t", ";", "|"]


def detect_delimiter(path: Path, encoding: str = "utf-8") -> str:
    """Guess the field delimiter from the first 20 lines of a text file.

    ``csv.Sniffer`` is restricted to the candidates in
    ``_COMMON_DELIMITERS``.  A comma is returned when the file is empty or
    the sniffer cannot decide.
    """
    head: list[str] = []
    with Path(path).open("r", encoding=encoding, errors="replace") as fh:
        while len(head) < 20:
            text = fh.readline()
            if text == "":
                break
            head.append(text)

    if head:
        candidates = "".join(_COMMON_DELIMITERS)
        try:
            return csv.Sniffer().sniff("".join(head), delimiters=candidates).delimiter
        except csv.Error:
            pass
    return ","
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header-row detection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
|
||||
max_scan: int = 20) -> int:
|
||||
"""Return the 0-based index of the likely header row.
|
||||
|
||||
Heuristic: the first row where *every* cell looks like a column name
|
||||
(non-numeric, non-empty string). Falls back to 0.
|
||||
"""
|
||||
raw_path = Path(path)
|
||||
with raw_path.open("r", encoding=encoding, errors="replace") as fh:
|
||||
reader = csv.reader(fh, delimiter=delimiter)
|
||||
for idx, row in enumerate(reader):
|
||||
if idx >= max_scan:
|
||||
break
|
||||
if not row:
|
||||
continue
|
||||
# All cells must be non-empty, non-numeric strings
|
||||
if all(_looks_like_header(cell) for cell in row if cell.strip()):
|
||||
return idx
|
||||
return 0
|
||||
|
||||
|
||||
def _looks_like_header(value: str) -> bool:
|
||||
"""True if *value* looks like a column header, not a data value."""
|
||||
v = value.strip()
|
||||
if not v:
|
||||
return False
|
||||
# Pure numbers are not headers
|
||||
try:
|
||||
float(v.replace(",", ""))
|
||||
return False
|
||||
except ValueError:
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Excel helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def list_sheets(path: Path) -> list[str]:
    """Return sheet names from an Excel workbook.

    The workbook is opened with the openpyxl engine and closed again before
    returning, so no file handle is leaked.
    """
    with pd.ExcelFile(path, engine="openpyxl") as xl:
        return xl.sheet_names
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def read_file(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
    chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Load a CSV, TSV, or Excel file.

    Files ending in ``.xlsx``/``.xls`` (case-insensitive) go to the Excel
    reader; everything else is read as delimited text.

    Parameters
    ----------
    path : file path
    encoding : override detected encoding (CSV only)
    delimiter : override detected delimiter (CSV only)
    header_row : 0-based row index for the header; auto-detected if *None*
    sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
    chunk_size : if set, return a generator of DataFrames (CSV only).

    Returns a DataFrame (or generator when *chunk_size* is set).

    Raises ``FileNotFoundError`` when *path* does not exist.
    """
    filepath = Path(path)
    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")

    is_excel = filepath.suffix.lower() in (".xlsx", ".xls")
    if is_excel:
        return _read_excel(filepath, header_row=header_row, sheet_name=sheet_name)

    return _read_csv(
        filepath,
        encoding=encoding,
        delimiter=delimiter,
        header_row=header_row,
        chunk_size=chunk_size,
    )
|
||||
|
||||
|
||||
def _read_csv(
    path: Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Read delimited text, auto-detecting whatever the caller left unset.

    All columns are read as strings and empty fields stay empty strings
    (``keep_default_na=False``).  Malformed lines produce a warning instead
    of an error.  With a truthy *chunk_size* the chunked reader from
    ``pd.read_csv`` is returned instead of a DataFrame.
    """
    enc = encoding if encoding else detect_encoding(path)
    delim = delimiter if delimiter else detect_delimiter(path, enc)
    if header_row is not None:
        hdr = header_row
    else:
        hdr = detect_header_row(path, enc, delim)

    logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
                 path.name, enc, delim, hdr)

    options: dict = {
        "encoding": enc,
        "delimiter": delim,
        "header": hdr,
        "dtype": str,
        "keep_default_na": False,
        "on_bad_lines": "warn",
    }
    if chunk_size:
        options["chunksize"] = chunk_size

    return pd.read_csv(path, **options)
|
||||
|
||||
|
||||
def _read_excel(
    path: Path,
    *,
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
) -> pd.DataFrame:
    """Read one sheet of an Excel workbook with every column as ``str``.

    *header_row* defaults to 0 (no auto-detection for Excel).  Empty cells
    stay empty strings (``keep_default_na=False``).

    ``.xlsx`` workbooks are read with openpyxl.  Legacy ``.xls`` workbooks
    cannot be opened by openpyxl, so for them the engine is left to pandas
    instead of being forced.
    """
    hdr = header_row if header_row is not None else 0
    # engine=None lets pandas pick the reader that supports the old format.
    engine = None if path.suffix.lower() == ".xls" else "openpyxl"
    logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
    return pd.read_excel(
        path,
        sheet_name=sheet_name,
        header=hdr,
        dtype=str,
        keep_default_na=False,
        engine=engine,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Writing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def write_file(
    df: pd.DataFrame,
    path: str | Path,
    *,
    file_format: Optional[str] = None,
    encoding: str = "utf-8-sig",
) -> Path:
    """Save a DataFrame as CSV or Excel, without the index.

    Parameters
    ----------
    df : DataFrame to write
    path : output file path
    file_format : ``"csv"`` or ``"xlsx"``; taken from the lower-cased *path*
        suffix if *None*.  Anything other than ``"xlsx"``/``"xls"`` is
        written as CSV.
    encoding : CSV output encoding (default ``utf-8-sig`` for Windows Excel
        compat); not used for Excel output.

    Returns the output path as a ``Path``.
    """
    out = Path(path)
    fmt = file_format or out.suffix.lstrip(".").lower()

    wants_excel = fmt in {"xlsx", "xls"}
    if not wants_excel:
        df.to_csv(out, index=False, encoding=encoding)
    else:
        df.to_excel(out, index=False, engine="openpyxl")

    logger.info("Wrote {} rows to {}", len(df), out)
    return out
|
||||
224
src/core/normalizers.py
Normal file
224
src/core/normalizers.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""Per-column normalization functions for deduplication matching.
|
||||
|
||||
Every normalizer is ``str -> str``, handles None/empty gracefully, and is
|
||||
idempotent (applying it twice yields the same result as once).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from enum import Enum
|
||||
from typing import Callable, Optional
|
||||
|
||||
import phonenumbers
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class NormalizerType(str, Enum):
    """Names of the normalizers that can be attached to a column.

    The ``str`` mix-in makes every member compare equal to its lowercase
    string value (e.g. ``NormalizerType.EMAIL == "email"``).
    """

    EMAIL = "email"
    PHONE = "phone"
    NAME = "name"
    ADDRESS = "address"
    STRING = "string"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# String normalizer (base)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def normalize_string(value: Optional[str]) -> str:
    """Return *value* trimmed, with whitespace runs collapsed to one space, case-folded.

    Falsy and non-string input yields ``""``.
    """
    if not value:
        return ""
    if not isinstance(value, str):
        return ""
    trimmed = value.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    return collapsed.casefold()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Email normalizer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}


def normalize_email(value: Optional[str]) -> str:
    """Canonicalise an e-mail address for matching.

    The address is trimmed and lower-cased, any ``+tag`` suffix is removed
    from the local part (for every domain), and dots are removed from the
    local part of Gmail addresses.  Text without an ``@`` is returned
    trimmed and lower-cased; falsy or non-string input yields ``""``.
    """
    if not value or not isinstance(value, str):
        return ""

    cleaned = value.strip().lower()
    # Split on the LAST "@"; an empty separator means there was none.
    local, at_sign, domain = cleaned.rpartition("@")
    if not at_sign:
        return cleaned

    local = local.partition("+")[0]
    if domain in _GMAIL_DOMAINS:
        local = "".join(local.split("."))

    return local + "@" + domain
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Phone normalizer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
    """Return the phone number in E.164 form, or just its digits as a fallback.

    The number is parsed with ``phonenumbers`` using *default_region*.  If
    parsing fails or the number is not a possible number, every non-digit
    character is dropped instead.  Falsy, blank or non-string input yields
    ``""``.
    """
    if not value or not isinstance(value, str):
        return ""

    candidate = value.strip()
    if candidate == "":
        return ""

    try:
        number = phonenumbers.parse(candidate, default_region)
        possible = phonenumbers.is_possible_number(number)
        if possible:
            return phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.E164)
    except phonenumbers.NumberParseException:
        pass

    # Fallback: digits only
    return "".join(re.findall(r"\d", candidate))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Name normalizer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_TITLE_PREFIXES = {
    "mr", "mrs", "ms", "miss", "dr", "prof", "professor",
    "sir", "madam", "rev", "reverend", "hon", "honorable",
}
_NAME_SUFFIXES = {
    "jr", "sr", "ii", "iii", "iv", "v",
    "phd", "md", "esq", "dds", "rn",
}


def normalize_name(value: Optional[str]) -> str:
    """Reduce a personal name to its case-folded core tokens.

    Periods and commas become spaces, leading tokens found in
    ``_TITLE_PREFIXES`` and trailing tokens found in ``_NAME_SUFFIXES`` are
    dropped, and the remaining tokens are joined with single spaces.  The
    result is ``""`` for falsy, blank or non-string input, and also when
    every token is a title or suffix.
    """
    if not value or not isinstance(value, str):
        return ""

    stripped = value.strip()
    if not stripped:
        return ""

    # Punctuation is gone after this step, so tokens never carry a "."
    tokens = stripped.casefold().replace(".", " ").replace(",", " ").split()

    # Advance past leading titles ...
    start = 0
    while start < len(tokens) and tokens[start] in _TITLE_PREFIXES:
        start += 1

    # ... and back up over trailing suffixes.
    end = len(tokens)
    while end > start and tokens[end - 1] in _NAME_SUFFIXES:
        end -= 1

    return " ".join(tokens[start:end])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Address normalizer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_USPS_ABBREVIATIONS: dict[str, str] = {
    "street": "st",
    "avenue": "ave",
    "boulevard": "blvd",
    "drive": "dr",
    "lane": "ln",
    "road": "rd",
    "court": "ct",
    "place": "pl",
    "circle": "cir",
    "trail": "trl",
    "way": "way",
    "terrace": "ter",
    "parkway": "pkwy",
    "highway": "hwy",
    "expressway": "expy",
    "freeway": "fwy",
    "square": "sq",
    "loop": "loop",
    "alley": "aly",
    "crossing": "xing",
    "point": "pt",
    "north": "n",
    "south": "s",
    "east": "e",
    "west": "w",
    "northeast": "ne",
    "northwest": "nw",
    "southeast": "se",
    "southwest": "sw",
    "apartment": "apt",
    "suite": "ste",
    "building": "bldg",
    "floor": "fl",
    "room": "rm",
    "unit": "unit",
    "number": "#",
    "saint": "st",
    "fort": "ft",
    "mount": "mt",
    "heights": "hts",
    "springs": "spgs",
}


def normalize_address(value: Optional[str]) -> str:
    """Case-fold an address and shorten its words per ``_USPS_ABBREVIATIONS``.

    Periods and commas are treated as spaces (``#`` is kept), each
    whitespace-separated word found in the table is replaced by its
    abbreviation, and the words are re-joined with single spaces.  Falsy,
    blank or non-string input yields ``""``.
    """
    if not value or not isinstance(value, str):
        return ""

    trimmed = value.strip()
    if not trimmed:
        return ""

    words = trimmed.casefold().replace(".", " ").replace(",", " ").split()
    return " ".join(_USPS_ABBREVIATIONS.get(word, word) for word in words)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_NORMALIZER_MAP: dict[NormalizerType, Callable[[str], str]] = {
    NormalizerType.EMAIL: normalize_email,
    NormalizerType.PHONE: normalize_phone,
    NormalizerType.NAME: normalize_name,
    NormalizerType.ADDRESS: normalize_address,
    NormalizerType.STRING: normalize_string,
}


def get_normalizer(normalizer_type: NormalizerType | str) -> Callable[[str], str]:
    """Look up the normalizer function registered for *normalizer_type*.

    Plain strings are lower-cased and converted to ``NormalizerType`` first;
    the enum raises ``ValueError`` for values it does not define.
    ``ValueError`` is also raised for a type that is not in the registry.
    """
    if isinstance(normalizer_type, str):
        key = NormalizerType(normalizer_type.lower())
    else:
        key = normalizer_type

    func = _NORMALIZER_MAP.get(key)
    if func is not None:
        return func
    raise ValueError(f"Unknown normalizer type: {key}")
|
||||
1
src/gui/__init__.py
Normal file
1
src/gui/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Streamlit GUI for the DataTools Deduplicator."""
|
||||
8
src/gui/__main__.py
Normal file
8
src/gui/__main__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Allow running as ``python -m src.gui``."""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Launch the Streamlit app that sits next to this file, using the same
# interpreter that is running this module.
app_path = Path(__file__).parent / "app.py"
completed = subprocess.run(
    [sys.executable, "-m", "streamlit", "run", str(app_path)],
    check=False,
)
# Propagate Streamlit's exit status; it used to be discarded, so
# ``python -m src.gui`` reported success even when the launch failed.
sys.exit(completed.returncode)
|
||||
287
src/gui/app.py
Normal file
287
src/gui/app.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""DataTools Deduplicator — Streamlit GUI.
|
||||
|
||||
Launch:
|
||||
streamlit run src/gui/app.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
# Ensure project root is on sys.path so `src.core` imports work
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
||||
from src.core.io import read_file, list_sheets
|
||||
from src.core.config import DeduplicationConfig
|
||||
from src.gui.components import config_panel, match_group_card, results_summary
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------

st.set_page_config(
    page_title="DataTools Deduplicator",
    page_icon="🔍",
    layout="wide",
)

# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------

# Keys the rest of the script reads from st.session_state, with the value
# each one starts at when it is not already present.
_DEFAULTS = {
    "df": None,
    "result": None,
    "review_decisions": {},
    "config": None,
    "file_name": "",
    "sheet_names": [],
}
for state_key, initial_value in _DEFAULTS.items():
    if state_key in st.session_state:
        continue
    st.session_state[state_key] = initial_value


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

st.title("DataTools Deduplicator")
st.caption("Find and remove duplicate rows in CSV and Excel files.")


# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

uploaded = st.file_uploader(
    label="Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
)
|
||||
|
||||
def _load_uploaded_table(upload, sheet_name=None):
    """Parse an uploaded file into a DataFrame via a temporary file.

    ``read_file()`` and ``list_sheets()`` take a filesystem path, so the
    upload's bytes are written to a temp file first. The temp file is removed
    in a ``finally`` block so it is not leaked when parsing raises.

    Args:
        upload: The object returned by ``st.file_uploader``.
        sheet_name: Sheet to read; ``None`` calls ``read_file`` without a
            ``sheet_name`` argument.

    Returns:
        ``(df, sheet_names)``. ``sheet_names`` is empty for non-Excel files.
    """
    import tempfile

    suffix = Path(upload.name).suffix
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(upload.getvalue())

        if suffix.lower() in (".xlsx", ".xls"):
            sheets = list_sheets(tmp_path)
        else:
            sheets = []

        if sheet_name is None:
            table = read_file(tmp_path)
        else:
            table = read_file(tmp_path, sheet_name=sheet_name)
        # read_file may return something other than a DataFrame (presumably
        # an iterable of chunks — confirm against src.core.io); flatten it.
        if not isinstance(table, pd.DataFrame):
            table = pd.concat(list(table), ignore_index=True)
        return table, sheets
    finally:
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)


if uploaded is not None:
    # Detect if file changed. The name alone misses a re-upload of a different
    # file with the same name, so also compare the upload id and size when
    # the UploadedFile object exposes them.
    file_token = (
        uploaded.name,
        getattr(uploaded, "file_id", None),
        getattr(uploaded, "size", None),
    )
    if file_token != st.session_state.get("_file_token"):
        st.session_state["_file_token"] = file_token
        st.session_state["file_name"] = uploaded.name
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}
        # A sheet chosen for the previous file must not carry over.
        st.session_state["_current_sheet"] = None
        st.session_state["df"] = None

    # Read the file only when nothing is loaded for this upload. Re-parsing on
    # every rerun would overwrite the DataFrame of a user-selected sheet with
    # the default sheet.
    if st.session_state["df"] is None:
        try:
            loaded_df, loaded_sheets = _load_uploaded_table(
                uploaded, st.session_state.get("_current_sheet")
            )
        except Exception as e:
            st.error(f"Failed to read file: {e}")
            st.session_state["df"] = None
        else:
            st.session_state["df"] = loaded_df
            st.session_state["sheet_names"] = loaded_sheets

    df = st.session_state["df"]

    if df is not None:
        # Sheet selector for Excel files
        if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
            sheet = st.selectbox(
                "Select sheet",
                st.session_state["sheet_names"],
            )
            if sheet != st.session_state.get("_current_sheet"):
                df, _ = _load_uploaded_table(uploaded, sheet_name=sheet)
                # Record the sheet only after a successful read so a failed
                # read is retried on the next rerun.
                st.session_state["_current_sheet"] = sheet
                st.session_state["df"] = df
                st.session_state["result"] = None
                st.session_state["review_decisions"] = {}

        # Preview
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)

        # Advanced options
        settings = config_panel(df)

        # Apply loaded config if present
        loaded_cfg = st.session_state.get("loaded_config")
        if loaded_cfg is not None:
            settings["strategies"] = loaded_cfg.to_strategies()
            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
            settings["date_column"] = loaded_cfg.date_column
            settings["merge"] = loaded_cfg.merge
            # Clear so it doesn't override on every rerun
            del st.session_state["loaded_config"]

        # -------------------------------------------------------------------
        # Find Duplicates button
        # -------------------------------------------------------------------

        st.divider()

        if st.button("Find Duplicates", type="primary", use_container_width=True):
            progress_bar = st.progress(0, text="Comparing rows...")

            def _gui_progress(current: int, total: int) -> None:
                """Forward progress from deduplicate() to the progress bar."""
                if total > 0:
                    pct = min(current / total, 1.0)
                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")

            with st.spinner("Running deduplication..."):
                result = deduplicate(
                    df,
                    strategies=settings["strategies"],
                    survivor_rule=settings["survivor_rule"],
                    date_column=settings["date_column"],
                    merge=settings["merge"],
                    preview=False,
                    progress_callback=_gui_progress,
                )

            progress_bar.empty()
            st.session_state["result"] = result
            st.session_state["review_decisions"] = {}

        # -------------------------------------------------------------------
        # Results
        # -------------------------------------------------------------------

        result: DeduplicationResult | None = st.session_state["result"]

        if result is not None:
            st.divider()
            st.subheader("Results")

            # Summary + download buttons
            results_summary(result, df)

            # Match group review
            if result.match_groups:
                st.divider()
                st.subheader("Match Groups")

                # Batch actions
                action_left, action_mid, action_right = st.columns(3)
                with action_left:
                    if st.button("Accept All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = True
                        st.rerun()
                with action_mid:
                    if st.button("Reject All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = False
                        st.rerun()
                with action_right:
                    if st.button("Clear Decisions"):
                        st.session_state["review_decisions"] = {}
                        st.rerun()

                # Individual group cards
                decisions = st.session_state["review_decisions"]
                for i, group in enumerate(result.match_groups):
                    decision = match_group_card(group, df, group_num=i + 1)
                    if decision is not None:
                        decisions[group.group_id] = decision
                        st.session_state["review_decisions"] = decisions
                        # Rerun so the card shows the recorded decision.
                        st.rerun()

                # Show decision summary
                if decisions:
                    st.divider()
                    accepted = sum(1 for v in decisions.values() if v is True)
                    rejected = sum(1 for v in decisions.values() if v is False)
                    # Count undecided groups directly; subtracting
                    # len(decisions) goes negative if stale ids are present.
                    pending = sum(
                        1 for g in result.match_groups if g.group_id not in decisions
                    )
                    st.caption(
                        f"Decisions: {accepted} merged, {rejected} kept both, "
                        f"{pending} pending"
                    )

                    # Re-run dedup with review decisions applied
                    if st.button(
                        "Apply Review Decisions & Download",
                        type="primary",
                        use_container_width=True,
                    ):
                        def _review_callback(group, _df):
                            """Return the stored decision for a group."""
                            gid = group.group_id
                            if gid in decisions:
                                return decisions[gid]
                            return True  # default: accept

                        reviewed_result = deduplicate(
                            df,
                            strategies=settings["strategies"],
                            survivor_rule=settings["survivor_rule"],
                            date_column=settings["date_column"],
                            merge=settings["merge"],
                            preview=False,
                            review_callback=_review_callback,
                        )

                        # Update result and show downloads
                        st.session_state["result"] = reviewed_result

                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
                            index=False
                        ).encode("utf-8-sig")
                        st.download_button(
                            "Download Reviewed & Deduplicated CSV",
                            data=csv_bytes,
                            file_name="deduplicated_reviewed.csv",
                            mime="text/csv",
                            key="reviewed_download",
                        )

            # Log entries
            if result.log_entries:
                with st.expander("Processing Log"):
                    st.code("\n".join(result.log_entries))

else:
    # No file uploaded — show placeholder
    st.info("Upload a CSV or Excel file to get started.")


# ---------------------------------------------------------------------------
# Footer
# ---------------------------------------------------------------------------

st.divider()
st.caption(
    "Runs locally. Your data never leaves this computer. "
    "| DataTools Deduplicator v1.0"
)
|
||||
413
src/gui/components.py
Normal file
413
src/gui/components.py
Normal file
@@ -0,0 +1,413 @@
|
||||
"""Reusable Streamlit widgets for the deduplicator GUI."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from src.core.dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
DeduplicationResult,
|
||||
MatchResult,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
)
|
||||
from src.core.config import (
|
||||
ColumnStrategyConfig,
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config panel (advanced options)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander. Returns a settings dict.

    Args:
        df: The loaded table; its column names populate the column pickers.

    Keys returned:
        strategies: list[MatchStrategy] | None
        survivor_rule: SurvivorRule
        date_column: str | None
        merge: bool

    Side effects:
        Stores a parsed ``DeduplicationConfig`` under
        ``st.session_state["loaded_config"]`` when a config JSON is uploaded.
    """
    import json

    columns = list(df.columns)

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)

        with col_a:
            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]

            normalize_map: dict[str, str] = {}
            # Offer a normalizer for exactly the columns _build_strategies()
            # will build from: the explicit subset when given, otherwise the
            # fuzzy columns.
            target_cols = subset_cols if subset_cols else fuzzy_cols
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    # "auto" and "none" are both left out of the map.
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)

        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    # getvalue() returns the full contents regardless of the
                    # read position, so this also works on later reruns.
                    data = json.loads(config_file.getvalue())
                    loaded = DeduplicationConfig.from_dict(data)
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:
                    st.error(f"Failed to load config: {e}")

        with cfg_right:
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                cfg_json = cfg.to_dict()
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg_json, indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }
|
||||
|
||||
|
||||
def _build_strategies(
    subset_cols: list[str],
    key_cols: list[str],
    fuzzy_cols: list[str],
    algorithm: str,
    threshold: int,
    normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
    """Translate the GUI selections into match strategies.

    One combined strategy is built from ``subset_cols`` (or from
    ``fuzzy_cols`` when no subset is given); columns listed in ``fuzzy_cols``
    use the chosen algorithm and threshold, all others match exactly. Each
    entry of ``key_cols`` then adds its own single-column exact strategy.

    Returns:
        The list of strategies, or ``None`` when nothing was selected
        (the caller treats ``None`` as auto-detect).
    """
    built: list[MatchStrategy] = []

    # Explicit column selection: the subset wins, fuzzy columns are the fallback.
    chosen = subset_cols or fuzzy_cols
    if chosen:
        fuzzy_lookup = set(fuzzy_cols)
        column_rules: list[ColumnMatchStrategy] = []
        for name in chosen:
            normalizer = (
                NormalizerType(normalize_map[name]) if name in normalize_map else None
            )
            if name in fuzzy_lookup:
                rule_algorithm, rule_threshold = Algorithm(algorithm), float(threshold)
            else:
                rule_algorithm, rule_threshold = Algorithm.EXACT, 100.0
            column_rules.append(
                ColumnMatchStrategy(
                    column=name,
                    algorithm=rule_algorithm,
                    threshold=rule_threshold,
                    normalizer=normalizer,
                )
            )
        built.append(MatchStrategy(column_strategies=column_rules))

    # Strong keys: one independent exact-match strategy per column.
    for name in key_cols or []:
        exact_rule = ColumnMatchStrategy(
            column=name, algorithm=Algorithm.EXACT, threshold=100.0
        )
        built.append(MatchStrategy(column_strategies=[exact_rule]))

    return built or None
|
||||
|
||||
|
||||
def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a ``DeduplicationConfig`` from the current GUI selections.

    The strategies produced by ``_build_strategies`` for the same selections
    are converted to their config counterparts and attached when there are any.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    built = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not built:
        return config

    # Convert runtime strategy objects into their config dataclasses.
    strategy_configs = []
    for strategy in built:
        column_configs = []
        for rule in strategy.column_strategies:
            normalizer_value = rule.normalizer.value if rule.normalizer else None
            column_configs.append(
                ColumnStrategyConfig(
                    column=rule.column,
                    algorithm=rule.algorithm.value,
                    threshold=rule.threshold,
                    normalizer=normalizer_value,
                )
            )
        strategy_configs.append(StrategyConfig(columns=column_configs))
    config.strategies = strategy_configs
    return config
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Match group review card
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DIFF_STYLE = "background-color: rgba(245, 166, 35, 0.2)"
_MISSING_STYLE = "background-color: rgba(240, 82, 82, 0.1)"


def _cell_text(value) -> str:
    """Return a cell's stripped text, treating None/NaN/NA as blank."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def _highlight_diffs(s: pd.Series) -> list[str]:
    """Return one CSS style per cell, highlighting cells that differ from the first row.

    A blank cell under a non-blank first row gets the "missing" style; any
    other cell whose text differs from the first row gets the "diff" style.
    """
    first_val = _cell_text(s.iloc[0]) if len(s) > 0 else ""
    styles = []
    for val in s:
        val_str = _cell_text(val)
        if val_str == first_val:
            styles.append("")
        elif not val_str:
            styles.append(_MISSING_STYLE)
        else:
            styles.append(_DIFF_STYLE)
    return styles


def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> Optional[bool]:
    """Render an expandable match group card with side-by-side diff.

    Args:
        group: The match group to display; its ``row_indices`` are used as
            positional indices into ``df``.
        df: The table the group was computed from.
        group_num: 1-based number shown in the card label.

    Returns:
        True  — user clicked Merge (accept match)
        False — user clicked Keep Both (reject match)
        None  — no decision yet
    """
    confidence = group.confidence
    # Groups below 95% confidence start expanded.
    auto_expand = confidence < 95.0
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)

    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )

    with st.expander(label, expanded=auto_expand):
        # Build comparison DataFrame (columns prefixed "_norm_" are hidden)
        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
        rows_data = []
        for idx in group.row_indices:
            # Look the row up once instead of once per column.
            source_row = df.iloc[idx]
            row = {"_row": idx + 1}
            for col in display_cols:
                row[col] = source_row.get(col, "")
            rows_data.append(row)

        compare_df = pd.DataFrame(rows_data)
        compare_df = compare_df.set_index("_row")

        # Highlight differences
        styled = compare_df.style.apply(_highlight_diffs, axis=0)
        st.dataframe(styled, use_container_width=True)

        # Action buttons
        btn_left, btn_mid, _ = st.columns(3)
        merge_key = f"merge_{group.group_id}"
        keep_key = f"keep_{group.group_id}"

        with btn_left:
            if st.button("Merge", key=merge_key, type="primary"):
                return True
        with btn_mid:
            if st.button("Keep Both", key=keep_key):
                return False

        # Check session state for previous decisions
        decisions = st.session_state.get("review_decisions", {})
        if group.group_id in decisions:
            decision = decisions[group.group_id]
            if decision is True:
                st.success("Decision: Merge")
            elif decision is False:
                st.info("Decision: Keep Both")

    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results summary + downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show the headline row counts and the CSV download buttons.

    Renders four metrics (rows in, rows out, removed, groups) followed by up
    to three download buttons: the deduplicated table, the removed rows (only
    when there are any) and the match-groups report (only when groups exist).
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)

    # Summary metrics
    metrics = (
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    )
    for slot, (label, value) in zip(st.columns(4), metrics):
        slot.metric(label, value)

    st.divider()

    # Download buttons
    dedup_slot, removed_slot, groups_slot = st.columns(3)

    with dedup_slot:
        st.download_button(
            "Download Deduplicated CSV",
            data=result.deduplicated_df.to_csv(index=False).encode("utf-8-sig"),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with removed_slot:
        if not result.removed_df.empty:
            st.download_button(
                "Download Removed Rows",
                data=result.removed_df.to_csv(index=False).encode("utf-8-sig"),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with groups_slot:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )
|
||||
|
||||
|
||||
def _build_match_groups_csv(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> bytes:
    """Build the match groups audit CSV as bytes.

    Emits one CSV row per member of every match group: five audit columns
    (``_group_id``, ``_is_survivor``, ``_confidence``, ``_matched_on``,
    ``_original_row``; group id and row number are 1-based) followed by the
    row's values from ``original_df``. Columns whose name starts with
    ``_norm_`` are omitted.

    Returns:
        The CSV encoded as UTF-8 with a BOM (``utf-8-sig``).
    """
    data_cols = [
        col for col in original_df.columns if not str(col).startswith("_norm_")
    ]
    n_rows = len(original_df)

    rows = []
    for g in result.match_groups:
        matched_on = ", ".join(g.matched_on)
        for idx in g.row_indices:
            row_data = {
                "_group_id": g.group_id + 1,
                "_is_survivor": idx == g.survivor_index,
                "_confidence": g.confidence,
                "_matched_on": matched_on,
                "_original_row": idx + 1,
            }
            # Fetch the source row once per member rather than once per
            # column; indices past the end of the table yield blank cells.
            source_row = original_df.iloc[idx] if idx < n_rows else None
            for col in data_cols:
                row_data[col] = source_row.get(col, "") if source_row is not None else ""
            rows.append(row_data)

    groups_df = pd.DataFrame(rows)
    return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
Reference in New Issue
Block a user