feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

0
src/__init__.py Normal file
View File

4
src/__main__.py Normal file
View File

@@ -0,0 +1,4 @@
"""Allow running as ``python -m src``."""
from src.cli import main
# Executing the package with ``-m`` imports this module, so the CLI entry
# point is invoked unconditionally here.
main()

502
src/cli.py Normal file
View File

@@ -0,0 +1,502 @@
"""CLI for the DataTools deduplicator.
Usage:
python -m src.cli input.csv # dry-run preview
python -m src.cli input.csv --apply # write deduplicated output
python -m src.cli input.csv --fuzzy name --merge # fuzzy match + merge
python -m src.cli --help # full help
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
from rapidfuzz import process as rf_process
# The Typer application object. The ``dedup`` command below is registered on
# it via ``@app.command()`` and ``main()`` simply calls it.
# ``no_args_is_help=True`` shows the help text when the program is run with no
# arguments; ``add_completion=False`` disables the shell-completion options.
app = typer.Typer(
    name="dedup",
    help=(
        "Find and remove duplicate rows in CSV and Excel files.\n\n"
        "By default, runs in preview mode — shows what would change without "
        "modifying anything. Add --apply to write the output.\n\n"
        "Examples:\n\n"
        " # Preview duplicates in a CSV file\n"
        " python -m src.cli customers.csv\n\n"
        " # Remove duplicates and save the result\n"
        " python -m src.cli customers.csv --apply\n\n"
        " # Fuzzy-match on the 'name' column with 80% threshold\n"
        " python -m src.cli customers.csv --fuzzy name --threshold 80 --apply\n\n"
        " # Match on specific columns only\n"
        " python -m src.cli customers.csv --subset email,phone --apply\n\n"
        " # Keep the most complete row and merge missing fields\n"
        " python -m src.cli customers.csv --survivor most-complete --merge --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr (warnings and above) and a new timestamped file.

    ``log_dir`` is created when missing. The returned path is the log file
    that receives DEBUG-level output for this run.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"dedup_{stamp}.log"

    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
    logger.remove()  # drop the default stderr sink before installing ours
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(str(log_file), level="DEBUG", format=file_format)
    return log_file
def _suggest_column(name: str, available: list[str]) -> str:
    """Build the "column not found" message, with a closest-match hint if any.

    The hint is the single best fuzzy match among ``available`` that scores
    at least 50; when nothing clears that cutoff only the list of available
    columns is reported.
    """
    listing = ", ".join(available)
    closest = rf_process.extract(name, available, limit=1, score_cutoff=50)
    if not closest:
        return f"Column '{name}' not found. Available columns: {listing}."
    best_name = closest[0][0]
    return (
        f"Column '{name}' not found. "
        f"Available columns: {listing}. "
        f"Did you mean '{best_name}'?"
    )
def _validate_columns(requested: list[str], available: list[str]) -> None:
"""Raise typer.BadParameter if any requested column doesn't exist."""
for col in requested:
if col not in available:
raise typer.BadParameter(_suggest_column(col, available))
def _parse_normalize_map(raw: Optional[str]) -> dict[str, str]:
"""Parse 'col:type,col:type' into a dict."""
if not raw:
return {}
result = {}
for pair in raw.split(","):
pair = pair.strip()
if ":" not in pair:
raise typer.BadParameter(
f"Invalid normalize format: '{pair}'. "
f"Expected 'column:type' (e.g., 'email:email,phone:phone')."
)
col, ntype = pair.split(":", 1)
result[col.strip()] = ntype.strip()
return result
def _interactive_review(group, df) -> Optional[bool]:
"""Side-by-side CLI review for a match group. Returns True/False/None."""
from src.core.dedup import MatchResult
group: MatchResult
print(f"\n{'='*60}")
print(f"Match Group {group.group_id + 1} — Confidence: {group.confidence:.1f}%")
print(f"Matched on: {', '.join(group.matched_on)}")
print(f"{'='*60}")
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
for idx in group.row_indices:
print(f"\n Row {idx + 1}:")
for col in display_cols:
val = df.iloc[idx].get(col, "")
if str(val).strip():
print(f" {col}: {val}")
while True:
choice = input("\n [y] Merge [n] Keep both [s] Skip remaining: ").strip().lower()
if choice == "y":
return True
if choice == "n":
return False
if choice == "s":
return None
print(" Please enter y, n, or s.")
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
def _split_columns(raw: Optional[str]) -> list[str]:
    """Split a comma-separated CLI value into stripped, unique names (order kept)."""
    if not raw:
        return []
    stripped = (part.strip() for part in raw.split(","))
    return list(dict.fromkeys(name for name in stripped if name))


def _coerce_choice(enum_cls, value: str, label: str):
    """Convert ``value`` to ``enum_cls`` or exit with a readable error message."""
    try:
        return enum_cls(value)
    except ValueError:
        choices = ", ".join(str(member.value) for member in enum_cls)
        typer.echo(
            f"Error: Unknown {label} '{value}'. Choose from: {choices}.",
            err=True,
        )
        raise typer.Exit(1) from None


@app.command()
def dedup(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to deduplicate.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_deduplicated.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output file. Without this flag, only a preview is shown.",
    ),
    key: Optional[str] = typer.Option(
        None, "--key", "-k",
        help="Comma-separated strong-key columns (e.g., 'fb_id,ein'). Each is an independent exact-match dedup key.",
    ),
    subset: Optional[str] = typer.Option(
        None, "--subset", "-s",
        help="Comma-separated columns to match on (default: auto-detect).",
    ),
    fuzzy: Optional[str] = typer.Option(
        None, "--fuzzy",
        help="Comma-separated columns to fuzzy-match (others use exact match).",
    ),
    algorithm: str = typer.Option(
        "jaro_winkler", "--algorithm", "-a",
        help="Fuzzy algorithm: levenshtein, jaro_winkler, or token_set_ratio.",
    ),
    threshold: int = typer.Option(
        85, "--threshold", "-t",
        help="Similarity threshold 0-100 for fuzzy matching.",
    ),
    normalize: Optional[str] = typer.Option(
        None, "--normalize",
        help="Column normalizers as 'col:type' pairs (e.g., 'email:email,phone:phone').",
    ),
    survivor: str = typer.Option(
        "first", "--survivor",
        help="Survivor rule: first, last, most-complete, or most-recent.",
    ),
    date_column: Optional[str] = typer.Option(
        None, "--date-column",
        help="Date column for most-recent survivor rule.",
    ),
    merge: bool = typer.Option(
        False, "--merge",
        help="Fill missing fields in the surviving row from removed duplicates.",
    ),
    review: bool = typer.Option(
        False, "--review",
        help="Interactively review each match group before merging.",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load settings from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current settings to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Find and remove duplicate rows in CSV and Excel files."""
    # The docstring above is left untouched because Typer can surface it as
    # help text; implementation notes therefore live in comments.
    #
    # Flow: validate inputs -> optional config load -> read the file ->
    # build match strategies (config > --subset/--fuzzy > auto-detect, plus
    # --key) -> pick survivor rule -> optional config save -> deduplicate ->
    # print summary -> write outputs when --apply is set.
    # Errors are reported on stderr and end the run with exit code 1.
    from src.core.io import read_file, write_file
    from src.core.dedup import (
        Algorithm, ColumnMatchStrategy, MatchStrategy, SurvivorRule,
        build_default_strategies, deduplicate,
    )
    from src.core.normalizers import NormalizerType
    from src.core.config import (
        ColumnStrategyConfig, DeduplicationConfig, StrategyConfig,
    )

    # ---- Setup ------------------------------------------------------------
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    if not 0 <= threshold <= 100:
        typer.echo(
            f"Error: --threshold must be between 0 and 100 (got {threshold}).",
            err=True,
        )
        raise typer.Exit(1)
    log_path = _setup_logging(Path("logs"))

    # ---- Load config if provided -------------------------------------------
    cfg: Optional[DeduplicationConfig] = None
    if config:
        config_path = Path(config)
        if not config_path.exists():
            typer.echo(f"Error: Config file not found: {config_path}", err=True)
            raise typer.Exit(1)
        cfg = DeduplicationConfig.from_file(config_path)
        logger.info("Loaded config from {}", config_path)

    # ---- Read input --------------------------------------------------------
    typer.echo(f"Reading {input_path.name}...")
    try:
        import pandas as pd

        # A purely numeric --sheet value is treated as a sheet index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
        )
        if not isinstance(df, pd.DataFrame):
            # chunked reading returns generator — materialise for v1
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")
    available_columns = list(df.columns)

    # ---- Build strategies --------------------------------------------------
    subset_list = _split_columns(subset)
    fuzzy_list = _split_columns(fuzzy)  # ordered, so results are reproducible
    fuzzy_set = set(fuzzy_list)
    normalize_map = _parse_normalize_map(normalize)

    strategies: Optional[list[MatchStrategy]] = None
    if cfg and cfg.strategies:
        strategies = cfg.to_strategies()
    elif subset or fuzzy:
        # Build from CLI flags. When only --fuzzy is given, match on just
        # those columns.
        subset_cols = subset_list or fuzzy_list or available_columns
        _validate_columns(subset_cols, available_columns)
        if fuzzy_list:
            _validate_columns(fuzzy_list, available_columns)
        fuzzy_algo = (
            _coerce_choice(Algorithm, algorithm, "algorithm") if fuzzy_list else None
        )
        col_strats: list[ColumnMatchStrategy] = []
        for col in subset_cols:
            norm = None
            if col in normalize_map:
                norm = _coerce_choice(NormalizerType, normalize_map[col], "normalizer")
            if col in fuzzy_set:
                algo = fuzzy_algo
                thresh = float(threshold)
            else:
                algo = Algorithm.EXACT
                thresh = 100.0
            col_strats.append(ColumnMatchStrategy(
                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
            ))
        strategies = [MatchStrategy(column_strategies=col_strats)]

    # Apply normalizer overrides even with auto-detect
    if normalize and strategies is None:
        auto_strats = build_default_strategies(df)
        for strat in auto_strats:
            for cs in strat.column_strategies:
                if cs.column in normalize_map:
                    cs.normalizer = _coerce_choice(
                        NormalizerType, normalize_map[cs.column], "normalizer"
                    )
        strategies = auto_strats

    # --key: add user-declared strong keys as standalone exact-match strategies
    if key:
        key_cols = _split_columns(key)
        _validate_columns(key_cols, available_columns)
        key_strats = [
            MatchStrategy(column_strategies=[
                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
            ])
            for col in key_cols
        ]
        if strategies is None:
            # Combine with auto-detect so user gets both
            strategies = build_default_strategies(df) + key_strats
        else:
            strategies.extend(key_strats)

    # ---- Survivor rule -----------------------------------------------------
    if cfg:
        # A loaded config takes precedence over --survivor/--merge/--date-column.
        surv_rule = cfg.to_survivor_rule()
        do_merge = cfg.merge
        dc = cfg.date_column
    else:
        # Accept both "most-complete" and "most_complete" spellings.
        surv_key = survivor.lower().replace("-", "_")
        try:
            surv_rule = SurvivorRule(surv_key)
        except ValueError:
            typer.echo(
                f"Error: Unknown survivor rule '{survivor}'. "
                f"Choose from: first, last, most-complete, most-recent.",
                err=True,
            )
            raise typer.Exit(1) from None
        do_merge = merge
        dc = date_column

    # ---- Save config if requested -----------------------------------------
    if save_config:
        save_cfg = DeduplicationConfig(
            survivor_rule=surv_rule.value,
            date_column=dc,
            merge=do_merge,
            subset_columns=subset_list or None,
            fuzzy_columns=fuzzy_list or None,
            default_algorithm=algorithm,
            default_threshold=float(threshold),
            normalize_map=normalize_map,
        )
        if strategies:
            save_cfg.strategies = [
                StrategyConfig(columns=[
                    ColumnStrategyConfig(
                        column=cs.column,
                        algorithm=cs.algorithm.value,
                        threshold=cs.threshold,
                        normalizer=cs.normalizer.value if cs.normalizer else None,
                    )
                    for cs in s.column_strategies
                ])
                for s in strategies
            ]
        saved = save_cfg.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # ---- Progress bar (large inputs only) ---------------------------------
    progress_cb = None
    pbar = None
    if len(df) > 10_000:
        from tqdm import tqdm
        pbar = tqdm(total=len(df) * (len(df) - 1) // 2, desc="Comparing rows",
                    unit="pairs", leave=False)

        def _progress(current: int, total: int):
            # The engine reports an absolute count; tqdm wants a delta.
            pbar.update(current - pbar.n)
            if current >= total:
                pbar.close()

        progress_cb = _progress

    # Review callback
    review_cb = _interactive_review if review else None

    # ---- Run dedup ---------------------------------------------------------
    typer.echo("Finding duplicates...")
    try:
        result = deduplicate(
            df,
            strategies=strategies,
            survivor_rule=surv_rule,
            date_column=dc,
            merge=do_merge,
            preview=not apply,
            review_callback=review_cb,
            progress_callback=progress_cb,
        )
    finally:
        # Make sure the bar is released even when the engine raises.
        if pbar is not None:
            pbar.close()

    # ---- Print results -----------------------------------------------------
    _print_results(result, input_path)

    # ---- Write output files ------------------------------------------------
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_deduplicated.csv"
        write_file(result.deduplicated_df, out_path)
        typer.echo(f"\nDeduplicated file: {out_path}")
        if not result.removed_df.empty:
            removed_path = input_path.parent / f"{stem}_removed.csv"
            write_file(result.removed_df, removed_path)
            typer.echo(f"Removed rows: {removed_path}")
        if result.match_groups:
            groups_path = input_path.parent / f"{stem}_match_groups.csv"
            _write_match_groups(result, df, groups_path)
            typer.echo(f"Match groups: {groups_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output files.")
    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path) -> None:
    """Print a human-readable summary of a deduplication run.

    Args:
        result: the run result; ``original_row_count``, ``deduplicated_df``
            and ``match_groups`` are read from it.
        input_path: the input file, shown by name only.

    At most the first 20 match groups are listed; the rest are summarised
    in a single "... and N more groups" line.
    """
    max_groups_shown = 20
    rows_out = len(result.deduplicated_df)
    removed = result.original_row_count - rows_out
    # The separator used to be built from an empty string and printed nothing.
    rule = "─" * 50

    typer.echo(f"\n{rule}")
    typer.echo(f" File: {input_path.name}")
    typer.echo(f" Rows in: {result.original_row_count}")
    typer.echo(f" Rows out: {rows_out}")
    typer.echo(f" Removed: {removed}")
    typer.echo(f" Groups: {len(result.match_groups)}")
    typer.echo(rule)

    if not result.match_groups:
        return

    typer.echo("\nMatch groups:")
    for g in result.match_groups[:max_groups_shown]:  # cap display
        # Row numbers are shown 1-based for the user.
        rows_str = ", ".join(str(i + 1) for i in g.row_indices)
        surv = g.survivor_index + 1
        typer.echo(
            f" Group {g.group_id + 1}: rows [{rows_str}] "
            f"→ keep row {surv} "
            f"(confidence: {g.confidence:.1f}%, "
            f"matched on: {', '.join(g.matched_on)})"
        )
    hidden = len(result.match_groups) - max_groups_shown
    if hidden > 0:
        typer.echo(f" ... and {hidden} more groups")
def _write_match_groups(result, original_df, path: Path) -> None:
    """Write every row of every match group to ``path`` for auditing.

    Each output row carries five bookkeeping columns (``_group_id``,
    ``_is_survivor``, ``_confidence``, ``_matched_on``, ``_original_row``,
    with 1-based ids and row numbers) followed by the row's values from
    ``original_df``.
    """
    import pandas as pd
    from src.core.io import write_file

    columns = list(original_df.columns)
    rows = []
    for g in result.match_groups:
        matched_on = ", ".join(g.matched_on)
        for idx in g.row_indices:
            source_row = original_df.iloc[idx]  # fetch once, not once per column
            row_data = {
                "_group_id": g.group_id + 1,
                "_is_survivor": idx == g.survivor_index,
                "_confidence": g.confidence,
                "_matched_on": matched_on,
                "_original_row": idx + 1,
            }
            # Include original data
            for col in columns:
                row_data[col] = source_row.get(col, "")
            rows.append(row_data)
    groups_df = pd.DataFrame(rows)
    write_file(groups_df, path)
# ---------------------------------------------------------------------------
# __main__ support
# ---------------------------------------------------------------------------
def main():
    """Invoke the Typer ``app``; used by ``src/__main__.py`` and the guard below."""
    app()


# Allow ``python -m src.cli`` / direct execution of this module.
if __name__ == "__main__":
    main()

93
src/core/__init__.py Normal file
View File

@@ -0,0 +1,93 @@
"""DataTools deduplication engine.
Public API
----------
Core:
deduplicate(df, ...) -> DeduplicationResult
build_default_strategies(df) -> list[MatchStrategy]
Types:
Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
MatchResult, DeduplicationResult
Normalizers:
get_normalizer(type) -> Callable
NormalizerType
normalize_email, normalize_phone, normalize_name,
normalize_address, normalize_string
I/O:
read_file(path, ...) -> DataFrame (or an iterator of DataFrame chunks when reading in chunks)
write_file(df, path, ...)
list_sheets(path) -> list[str]
detect_encoding, detect_delimiter, detect_header_row
Configuration:
DeduplicationConfig.from_file(path) -> DeduplicationConfig
DeduplicationConfig.to_file(path)
"""
from .dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
build_default_strategies,
deduplicate,
)
from .normalizers import (
NormalizerType,
get_normalizer,
normalize_address,
normalize_email,
normalize_name,
normalize_phone,
normalize_string,
)
from .io import (
detect_delimiter,
detect_encoding,
detect_header_row,
list_sheets,
read_file,
write_file,
)
from .config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
# Names re-exported by ``from src.core import *``; every entry corresponds to
# one of the imports above, grouped the same way as the module docstring.
__all__ = [
    # Core
    "deduplicate",
    "build_default_strategies",
    # Types
    "Algorithm",
    "SurvivorRule",
    "ColumnMatchStrategy",
    "MatchStrategy",
    "MatchResult",
    "DeduplicationResult",
    # Normalizers
    "NormalizerType",
    "get_normalizer",
    "normalize_email",
    "normalize_phone",
    "normalize_name",
    "normalize_address",
    "normalize_string",
    # I/O
    "read_file",
    "write_file",
    "list_sheets",
    "detect_encoding",
    "detect_delimiter",
    "detect_header_row",
    # Config
    "DeduplicationConfig",
    "StrategyConfig",
    "ColumnStrategyConfig",
]

117
src/core/config.py Normal file
View File

@@ -0,0 +1,117 @@
"""Configuration profiles: save/load deduplication settings as JSON."""
from __future__ import annotations
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
from .dedup import (
Algorithm,
ColumnMatchStrategy,
MatchStrategy,
NormalizerType,
SurvivorRule,
)
@dataclass
class ColumnStrategyConfig:
    """JSON-serializable mirror of ColumnMatchStrategy."""
    # Column name to compare.
    column: str
    # String value of an ``Algorithm`` member (converted in ``to_strategies``).
    algorithm: str = "exact"
    # Similarity threshold passed through to ``ColumnMatchStrategy.threshold``.
    threshold: float = 100.0
    # String value of a ``NormalizerType`` member, or None for no normalizer.
    normalizer: Optional[str] = None
@dataclass
class StrategyConfig:
    """JSON-serializable mirror of MatchStrategy."""
    # One entry per column strategy; becomes ``MatchStrategy.column_strategies``.
    columns: list[ColumnStrategyConfig] = field(default_factory=list)
@dataclass
class DeduplicationConfig:
    """All deduplication settings as a flat JSON-serializable structure.

    Instances round-trip through ``to_dict``/``from_dict`` and
    ``to_file``/``from_file``. ``to_strategies`` and ``to_survivor_rule``
    convert the stored strings back into engine objects.
    """
    # Explicit match strategies; an empty list means "auto-detect at runtime".
    strategies: list[StrategyConfig] = field(default_factory=list)
    # String value of a ``SurvivorRule`` member.
    survivor_rule: str = "first"
    date_column: Optional[str] = None
    merge: bool = False
    subset_columns: Optional[list[str]] = None
    fuzzy_columns: Optional[list[str]] = None
    default_algorithm: str = "jaro_winkler"
    default_threshold: float = 85.0
    normalize_map: Optional[dict[str, str]] = None  # column -> normalizer type

    # -----------------------------------------------------------------------
    # Serialisation
    # -----------------------------------------------------------------------
    def to_dict(self) -> dict:
        """Return the configuration as plain dicts/lists (via ``asdict``)."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Save configuration to a JSON file and return its path.

        Missing parent directories are created, and the file is written as
        UTF-8 so it reads back identically on every platform.
        """
        out = Path(path)
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
        return out

    @classmethod
    def from_dict(cls, data: dict) -> DeduplicationConfig:
        """Build a config from a dict, using defaults for absent keys."""
        strategies = []
        # ``or []`` also covers an explicit ``"strategies": null``.
        for s in data.get("strategies") or []:
            cols = [ColumnStrategyConfig(**c) for c in s.get("columns", [])]
            strategies.append(StrategyConfig(columns=cols))
        return cls(
            strategies=strategies,
            survivor_rule=data.get("survivor_rule", "first"),
            date_column=data.get("date_column"),
            merge=data.get("merge", False),
            subset_columns=data.get("subset_columns"),
            fuzzy_columns=data.get("fuzzy_columns"),
            default_algorithm=data.get("default_algorithm", "jaro_winkler"),
            default_threshold=data.get("default_threshold", 85.0),
            normalize_map=data.get("normalize_map"),
        )

    @classmethod
    def from_file(cls, path: str | Path) -> DeduplicationConfig:
        """Load configuration from a UTF-8 JSON file."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls.from_dict(data)

    @classmethod
    def default(cls) -> DeduplicationConfig:
        """Return sensible defaults (auto-detect strategies at runtime)."""
        return cls()

    # -----------------------------------------------------------------------
    # Convert to engine objects
    # -----------------------------------------------------------------------
    def to_strategies(self) -> Optional[list[MatchStrategy]]:
        """Convert the config back to MatchStrategy objects.

        Returns None if no explicit strategies are configured
        (the engine will auto-detect).

        Raises:
            ValueError: if a stored algorithm or normalizer string is not a
                valid ``Algorithm`` / ``NormalizerType`` value.
        """
        if not self.strategies:
            return None
        result: list[MatchStrategy] = []
        for sc in self.strategies:
            col_strats = [
                ColumnMatchStrategy(
                    column=cc.column,
                    algorithm=Algorithm(cc.algorithm),
                    threshold=cc.threshold,
                    normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None,
                )
                for cc in sc.columns
            ]
            result.append(MatchStrategy(column_strategies=col_strats))
        return result

    def to_survivor_rule(self) -> SurvivorRule:
        """Return ``survivor_rule`` as a ``SurvivorRule`` enum member."""
        return SurvivorRule(self.survivor_rule)

568
src/core/dedup.py Normal file
View File

@@ -0,0 +1,568 @@
"""Deduplication engine: matching, survivor selection, and merge.
Core algorithm:
1. Normalise columns → shadow ``_norm_*`` columns (computed once).
2. Pairwise comparison within each strategy → candidate pairs.
3. Union-find for transitive closure (A~B, B~C ⇒ one group).
4. Multi-strategy OR: feed all pairs from all strategies into the same union-find.
5. Survivor selection per group + optional field merge.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Callable, Optional
import pandas as pd
from loguru import logger
from rapidfuzz import fuzz as rf_fuzz
from rapidfuzz import distance as rf_distance
from .normalizers import NormalizerType, get_normalizer
# ---------------------------------------------------------------------------
# Enums & data structures
# ---------------------------------------------------------------------------
class Algorithm(str, Enum):
    """Comparison algorithms understood by ``_compute_similarity``.

    Members are also ``str`` instances, so their values can be stored in and
    restored from configuration as plain strings.
    """
    EXACT = "exact"
    LEVENSHTEIN = "levenshtein"
    JARO_WINKLER = "jaro_winkler"
    TOKEN_SET_RATIO = "token_set_ratio"
class SurvivorRule(str, Enum):
    """Rules for choosing which row of a match group is kept.

    See ``_select_survivor`` for how each rule is applied.
    """
    KEEP_FIRST = "first"
    KEEP_LAST = "last"
    KEEP_MOST_COMPLETE = "most_complete"
    KEEP_MOST_RECENT = "most_recent"
@dataclass
class ColumnMatchStrategy:
    """How to match on a single column."""
    # Name of the DataFrame column to compare.
    column: str
    # Comparison algorithm; plain equality by default.
    algorithm: Algorithm = Algorithm.EXACT
    # Minimum score for the column to count as matching.
    threshold: float = 100.0  # 0-100 scale
    # When set, ``_compare_pair`` reads the ``_norm_<column>`` shadow column
    # instead of the raw column.
    normalizer: Optional[NormalizerType] = None
@dataclass
class MatchStrategy:
    """A set of column strategies combined with AND.

    Multiple ``MatchStrategy`` instances are combined with OR at the top level.
    """
    # Every entry must match for the strategy to match (see ``_compare_pair``).
    column_strategies: list[ColumnMatchStrategy]
@dataclass
class MatchResult:
    """One group of duplicate rows."""
    # Sequential id assigned by ``_find_match_groups`` (0-based).
    group_id: int
    # Positional row indices (used with ``df.iloc``) of the group's members.
    row_indices: list[int]
    confidence: float  # min confidence across pairs in the group
    matched_on: list[str]  # column names that contributed to the match
    survivor_index: int  # index of the row to keep
@dataclass
class DeduplicationResult:
    """Full result of a deduplication run."""
    # Number of rows in the input before deduplication.
    original_row_count: int
    # Rows that remain after deduplication.
    deduplicated_df: pd.DataFrame
    # Rows that were dropped as duplicates.
    removed_df: pd.DataFrame
    # One entry per group of duplicates that was found.
    match_groups: list[MatchResult]
    log_entries: list[str] = field(default_factory=list)
    # NOTE(review): presumably mirrors the ``preview`` argument of
    # ``deduplicate`` — confirm against that function.
    is_preview: bool = True
# ---------------------------------------------------------------------------
# Union-Find
# ---------------------------------------------------------------------------
class _UnionFind:
"""Disjoint-set / union-find for transitive closure of match pairs."""
def __init__(self, n: int):
self._parent = list(range(n))
self._rank = [0] * n
def find(self, x: int) -> int:
while self._parent[x] != x:
self._parent[x] = self._parent[self._parent[x]] # path halving
x = self._parent[x]
return x
def union(self, a: int, b: int) -> None:
ra, rb = self.find(a), self.find(b)
if ra == rb:
return
if self._rank[ra] < self._rank[rb]:
ra, rb = rb, ra
self._parent[rb] = ra
if self._rank[ra] == self._rank[rb]:
self._rank[ra] += 1
def groups(self) -> dict[int, list[int]]:
"""Return {root: [members]} for all non-singleton groups."""
from collections import defaultdict
g: dict[int, list[int]] = defaultdict(list)
for i in range(len(self._parent)):
g[self.find(i)].append(i)
return {root: members for root, members in g.items() if len(members) > 1}
# ---------------------------------------------------------------------------
# Similarity computation
# ---------------------------------------------------------------------------
def _compute_similarity(val_a: str, val_b: str, algorithm: Algorithm) -> float:
    """Score how alike two strings are, on a 0-100 scale.

    Raises:
        ValueError: if ``algorithm`` is not one of the handled members.
    """
    if algorithm == Algorithm.EXACT:
        score = 100.0 if val_a == val_b else 0.0
    elif algorithm == Algorithm.LEVENSHTEIN:
        score = rf_fuzz.ratio(val_a, val_b)
    elif algorithm == Algorithm.JARO_WINKLER:
        # The raw result is multiplied by 100 to land on the 0-100 scale.
        # NOTE(review): this presumes JaroWinkler.similarity yields 0-1 in
        # the installed rapidfuzz version — confirm.
        score = rf_distance.JaroWinkler.similarity(val_a, val_b) * 100
    elif algorithm == Algorithm.TOKEN_SET_RATIO:
        score = rf_fuzz.token_set_ratio(val_a, val_b)
    else:
        raise ValueError(f"Unknown algorithm: {algorithm}")
    return score
# ---------------------------------------------------------------------------
# Pair comparison
# ---------------------------------------------------------------------------
def _compare_pair(
row_a: pd.Series,
row_b: pd.Series,
strategy: MatchStrategy,
norm_prefix: str = "_norm_",
) -> tuple[bool, float, list[str]]:
"""Compare two rows using a single MatchStrategy (AND of column strategies).
Returns ``(is_match, confidence, matched_columns)``.
"""
min_score = 100.0
matched_cols: list[str] = []
for cs in strategy.column_strategies:
col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
va = str(row_a.get(col, ""))
vb = str(row_b.get(col, ""))
# Skip if both empty
if not va and not vb:
continue
# If one empty and one not — no match for this column
if not va or not vb:
return False, 0.0, []
score = _compute_similarity(va, vb, cs.algorithm)
if score < cs.threshold:
return False, 0.0, []
min_score = min(min_score, score)
matched_cols.append(cs.column)
if not matched_cols:
return False, 0.0, []
return True, min_score, matched_cols
# ---------------------------------------------------------------------------
# Match-group finding
# ---------------------------------------------------------------------------
def _find_match_groups(
    df: pd.DataFrame,
    strategies: list[MatchStrategy],
    *,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> tuple[list[MatchResult], dict[tuple[int, int], tuple[float, list[str]]]]:
    """Pairwise comparison + union-find for transitive closure.

    Every pair of rows ``(i, j)`` with ``i < j`` is tested against the
    strategies in order; the first strategy that matches links the two rows
    (OR logic) and the remaining strategies are skipped for that pair.

    Args:
        df: rows to compare, addressed by position.
        strategies: match strategies, tried in the given order.
        progress_callback: optional ``callback(checked, total_pairs)``,
            called every 1000 pairs and once more at the end.

    Returns ``(match_groups, pair_info)`` where *pair_info* maps
    ``(i, j)`` → ``(confidence, matched_columns)`` for logging. The
    ``survivor_index`` of each returned group is only a placeholder (the
    first member).
    """
    n = len(df)
    uf = _UnionFind(n)
    pair_info: dict[tuple[int, int], tuple[float, list[str]]] = {}
    total_pairs = n * (n - 1) // 2
    checked = 0

    # Materialise each row once; calling ``df.iloc`` inside the O(n^2) loop
    # would rebuild the same Series objects over and over.
    rows = [df.iloc[pos] for pos in range(n)]
    for i in range(n):
        row_i = rows[i]
        for j in range(i + 1, n):
            row_j = rows[j]
            for strategy in strategies:
                is_match, confidence, cols = _compare_pair(row_i, row_j, strategy)
                if is_match:
                    uf.union(i, j)
                    # Each (i, j) is visited exactly once, so this is the
                    # only entry ever written for the pair.
                    pair_info[(i, j)] = (confidence, cols)
                    break  # OR logic: one strategy match is enough
            checked += 1
            if progress_callback and checked % 1000 == 0:
                progress_callback(checked, total_pairs)
    if progress_callback:
        progress_callback(total_pairs, total_pairs)

    # Build MatchResult objects (survivor not yet selected)
    raw_groups = uf.groups()
    match_groups: list[MatchResult] = []
    for gid, (_root, members) in enumerate(sorted(raw_groups.items())):
        # Confidence = min across all directly matched pairs in the group.
        # Pairs linked only transitively have no entry and are ignored.
        group_confidence = 100.0
        group_cols: set[str] = set()
        for pos_a, member_a in enumerate(members):
            for member_b in members[pos_a + 1:]:
                key = (min(member_a, member_b), max(member_a, member_b))
                if key in pair_info:
                    conf, cols = pair_info[key]
                    group_confidence = min(group_confidence, conf)
                    group_cols.update(cols)
        match_groups.append(MatchResult(
            group_id=gid,
            row_indices=members,
            confidence=round(group_confidence, 2),
            matched_on=sorted(group_cols),
            survivor_index=members[0],  # placeholder
        ))
    return match_groups, pair_info
# ---------------------------------------------------------------------------
# Survivor selection
# ---------------------------------------------------------------------------
def _select_survivor(
    group: MatchResult,
    df: pd.DataFrame,
    rule: SurvivorRule,
    date_column: Optional[str] = None,
) -> int:
    """Choose the survivor row index within a match group.

    Args:
        group: the match group; only ``row_indices`` is read.
        df: DataFrame the positional indices refer to.
        rule: which survivor rule to apply.
        date_column: column holding dates, used by ``KEEP_MOST_RECENT`` only.

    Returns:
        The positional index of the row to keep. Ties go to the earliest
        row. ``KEEP_MOST_RECENT`` falls back to the first row when
        ``date_column`` is missing or no row has a parseable date.
    """
    indices = group.row_indices
    if rule == SurvivorRule.KEEP_FIRST:
        return indices[0]
    if rule == SurvivorRule.KEEP_LAST:
        return indices[-1]
    if rule == SurvivorRule.KEEP_MOST_COMPLETE:
        # Fewest empty/blank cells wins; ``min`` keeps the first of equals.
        return min(indices, key=lambda idx: _count_empty(df.iloc[idx]))
    if rule == SurvivorRule.KEEP_MOST_RECENT:
        if not date_column or date_column not in df.columns:
            logger.warning("date_column '{}' not found; falling back to keep_first", date_column)
            return indices[0]
        best_idx = indices[0]
        best_date = None
        for idx in indices:
            parsed = _parse_date(df.iloc[idx].get(date_column, ""))
            # Blank or NaN dates parse to NaT. Comparisons against NaT are
            # always False, so a NaT "best" would block every later valid
            # date — treat it as missing instead.
            if parsed is None or parsed is pd.NaT:
                continue
            if best_date is None or parsed > best_date:
                best_date = parsed
                best_idx = idx
        return best_idx
    return indices[0]
def _count_empty(row: pd.Series) -> int:
"""Count empty/blank cells in a row, ignoring internal shadow columns."""
count = 0
for col, val in row.items():
if isinstance(col, str) and col.startswith("_norm_"):
continue
if pd.isna(val) or str(val).strip() == "":
count += 1
return count
def _parse_date(value) -> Optional[pd.Timestamp]:
try:
return pd.to_datetime(value)
except Exception:
return None
# ---------------------------------------------------------------------------
# Merge mode
# ---------------------------------------------------------------------------
def _merge_group(df: pd.DataFrame, survivor_idx: int, loser_indices: list[int]) -> pd.Series:
"""Fill missing fields in survivor from losers (ordered by position)."""
survivor = df.iloc[survivor_idx].copy()
for col in survivor.index:
if isinstance(col, str) and col.startswith("_norm_"):
continue
val = survivor[col]
if pd.isna(val) or str(val).strip() == "":
for loser_idx in loser_indices:
candidate = df.iloc[loser_idx][col]
if not pd.isna(candidate) and str(candidate).strip() != "":
survivor[col] = candidate
break
return survivor
# ---------------------------------------------------------------------------
# Auto-detect strategies
# ---------------------------------------------------------------------------
# Each entry: (pattern, normalizer, algorithm, threshold, is_strong_key)
# Strong keys (email, phone) can be standalone strategies.
# Weak keys (name, address) must be combined with a strong key via AND.
# ``build_default_strategies`` tests the patterns in this order with
# ``pattern.search`` and uses the first that hits, so earlier entries win.
# The email, phone and address patterns match anywhere in the column name;
# the name pattern is anchored and only matches the listed names exactly.
# All patterns are case-insensitive.
_COLUMN_TYPE_PATTERNS: list[tuple[re.Pattern, NormalizerType, Algorithm, float, bool]] = [
    (re.compile(r"e[-_]?mail", re.I), NormalizerType.EMAIL, Algorithm.EXACT, 100.0, True),
    (re.compile(r"phone|telephone|mobile|cell", re.I), NormalizerType.PHONE, Algorithm.EXACT, 100.0, True),
    (re.compile(r"^(name|full_name|customer_name|first_name|last_name|contact_name|respondent_name)$", re.I),
     NormalizerType.NAME, Algorithm.JARO_WINKLER, 85.0, False),
    (re.compile(r"address|street|addr", re.I), NormalizerType.ADDRESS, Algorithm.TOKEN_SET_RATIO, 80.0, False),
]
def build_default_strategies(df: pd.DataFrame) -> list[MatchStrategy]:
    """Auto-detect column types and build match strategies.

    Strategy logic:

    - Strong keys (email, phone): each gets its own standalone strategy.
    - Weak keys (name, address): each is paired with every strong key in an
      additional strategy.  Weak keys never stand alone when a strong key
      exists, because fuzzy matches on them alone are too permissive
      (e.g. "John" vs "Jon" scores about 93 % with Jaro-Winkler).
    - If only weak keys are found, they are promoted to standalone
      strategies as a fallback.
    - If no column matches any pattern, a single exact-match strategy over
      all columns is returned (``drop_duplicates`` equivalent).

    Column labels that are not strings (for example integer labels on a
    header-less frame) cannot match a name pattern and are skipped during
    detection; they still take part in the all-columns fallback.  ``_norm_``
    shadow columns are never used.
    """
    def _is_shadow(label) -> bool:
        return isinstance(label, str) and label.startswith("_norm_")

    strong_cols: list[ColumnMatchStrategy] = []
    weak_cols: list[ColumnMatchStrategy] = []

    for col in df.columns:
        # Guard against non-string labels: str methods and regex search
        # would raise on them.
        if not isinstance(col, str) or _is_shadow(col):
            continue
        for pattern, norm_type, algo, threshold, is_strong in _COLUMN_TYPE_PATTERNS:
            if pattern.search(col):
                cs = ColumnMatchStrategy(
                    column=col, algorithm=algo,
                    threshold=threshold, normalizer=norm_type,
                )
                (strong_cols if is_strong else weak_cols).append(cs)
                break  # first matching pattern wins

    strategies: list[MatchStrategy] = []
    if strong_cols:
        # Each strong key is a standalone strategy (OR)
        strategies.extend(MatchStrategy(column_strategies=[sc]) for sc in strong_cols)
        # Each weak key is paired with each strong key (AND) for extra recall
        strategies.extend(
            MatchStrategy(column_strategies=[wc, sc])
            for wc in weak_cols
            for sc in strong_cols
        )
    elif weak_cols:
        # No strong keys -- promote weak to standalone (best effort)
        strategies.extend(MatchStrategy(column_strategies=[wc]) for wc in weak_cols)

    if strategies:
        return strategies

    # Fallback: exact match on all columns (equivalent to drop_duplicates)
    logger.info("No column patterns matched; using exact match on all columns")
    all_cols = [
        ColumnMatchStrategy(column=c, algorithm=Algorithm.EXACT, threshold=100.0)
        for c in df.columns
        if not _is_shadow(c)
    ]
    return [MatchStrategy(column_strategies=all_cols)]
# ---------------------------------------------------------------------------
# Normalisation pass
# ---------------------------------------------------------------------------
def _apply_normalizations(df: pd.DataFrame, strategies: list[MatchStrategy]) -> pd.DataFrame:
    """Return a copy of *df* with a ``_norm_<column>`` column per normalized column.

    A column is processed once, the first time a strategy references it with
    a normalizer.  NA or whitespace-only cells become ``""``; every other
    cell is stringified and passed through the normalizer.
    """
    result = df.copy()
    handled: set[str] = set()
    column_strategies = (
        cs for strategy in strategies for cs in strategy.column_strategies
    )
    for cs in column_strategies:
        if not cs.normalizer or cs.column in handled or cs.column not in result.columns:
            continue
        handled.add(cs.column)
        normalize = get_normalizer(cs.normalizer)

        def _normalize_cell(value, fn=normalize):
            # ``fn`` is bound as a default so each column keeps its own normalizer.
            if pd.notna(value) and str(value).strip():
                return fn(str(value))
            return ""

        result[f"_norm_{cs.column}"] = result[cs.column].apply(_normalize_cell)
    return result
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def deduplicate(
    df: pd.DataFrame,
    *,
    strategies: Optional[list[MatchStrategy]] = None,
    survivor_rule: SurvivorRule = SurvivorRule.KEEP_FIRST,
    date_column: Optional[str] = None,
    merge: bool = False,
    preview: bool = True,
    review_callback: Optional[Callable] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> DeduplicationResult:
    """Run the full deduplication pipeline.

    Parameters
    ----------
    df : input DataFrame (not modified; work happens on a copy)
    strategies : matching strategies (auto-detected if None)
    survivor_rule : which row to keep per group
    date_column : used with ``KEEP_MOST_RECENT``
    merge : fill missing fields in survivor from losers
    preview : stored on the result as ``is_preview``; this function itself
        never writes anything
    review_callback : ``(group: MatchResult, df: DataFrame) -> bool|None``
        Called for each match group.  Return True to accept, False to reject,
        None to skip (keep both rows).  Used for interactive review.
    progress_callback : ``(current: int, total: int) -> None``
        Forwarded to the pairwise comparison step.

    Returns
    -------
    DeduplicationResult
        Deduplicated rows, removed rows, the accepted match groups and a
        human-readable processing log.  Row indices are positional.
    """
    def _is_blank(value) -> bool:
        # Same definition of "blank" as the merge step: NA or whitespace-only.
        return bool(pd.isna(value)) or str(value).strip() == ""

    def _is_shadow(label) -> bool:
        return isinstance(label, str) and label.startswith("_norm_")

    log_entries: list[str] = []
    original_count = len(df)

    if strategies is None:
        strategies = build_default_strategies(df)
        log_entries.append(f"Auto-detected {len(strategies)} match strategies")

    # Log strategies
    for i, s in enumerate(strategies):
        cols_desc = ", ".join(
            f"{cs.column}({cs.algorithm.value}@{cs.threshold})"
            for cs in s.column_strategies
        )
        log_entries.append(f"Strategy {i}: {cols_desc}")
        logger.info("Strategy {}: {}", i, cols_desc)

    # Normalise
    df_work = _apply_normalizations(df, strategies)

    # Find matches (the second return value is not needed here)
    match_groups, _pair_info = _find_match_groups(
        df_work, strategies, progress_callback=progress_callback
    )
    log_entries.append(f"Found {len(match_groups)} duplicate groups")
    logger.info("Found {} duplicate groups from {} rows", len(match_groups), original_count)

    # Interactive review: only explicitly accepted groups are deduplicated
    if review_callback and match_groups:
        reviewed_groups: list[MatchResult] = []
        for group in match_groups:
            decision = review_callback(group, df_work)
            if decision is True:
                reviewed_groups.append(group)
                log_entries.append(f"Group {group.group_id}: accepted by reviewer")
            elif decision is False:
                log_entries.append(f"Group {group.group_id}: rejected by reviewer")
            else:
                log_entries.append(f"Group {group.group_id}: skipped by reviewer")
        match_groups = reviewed_groups

    # Survivor selection
    for group in match_groups:
        group.survivor_index = _select_survivor(group, df_work, survivor_rule, date_column)
        log_entries.append(
            f"Group {group.group_id}: survivor=row {group.survivor_index} "
            f"(rule={survivor_rule.value}, confidence={group.confidence}%)"
        )

    # Decide which rows go and (optionally) build merged survivors
    remove_indices: set[int] = set()
    merged_rows: dict[int, pd.Series] = {}

    for group in match_groups:
        survivor_idx = group.survivor_index
        losers = [i for i in group.row_indices if i != survivor_idx]
        remove_indices.update(losers)

        if merge and losers:
            merged = _merge_group(df_work, survivor_idx, losers)
            merged_rows[survivor_idx] = merged

            # Log merged fields.  Blankness is tested with pd.isna as well,
            # because str(NaN) is "nan" and would hide fields that were
            # filled from a missing value.
            original = df_work.iloc[survivor_idx]
            for col in original.index:
                if _is_shadow(col):
                    continue
                if _is_blank(original[col]) and not _is_blank(merged[col]):
                    new_val = str(merged[col]).strip()
                    log_entries.append(
                        f"Group {group.group_id}: merged '{col}' "
                        f"into survivor from losers: '{new_val}'"
                    )

    # Build output DataFrames
    keep_indices = [i for i in range(len(df_work)) if i not in remove_indices]

    if merged_rows:
        rows = [
            merged_rows[i] if i in merged_rows else df_work.iloc[i]
            for i in keep_indices
        ]
        deduplicated_df = pd.DataFrame(rows)
    else:
        deduplicated_df = df_work.iloc[keep_indices].copy()

    removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame()

    # Drop shadow columns from output
    norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
    deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
    if not removed_df.empty:
        removed_df = removed_df.drop(columns=norm_cols, errors="ignore")

    # Reset index
    deduplicated_df = deduplicated_df.reset_index(drop=True)
    if not removed_df.empty:
        removed_df = removed_df.reset_index(drop=True)

    removed_count = original_count - len(deduplicated_df)
    # Separator added: the two counts were previously concatenated.
    log_entries.append(
        f"Result: {original_count} → {len(deduplicated_df)} rows ({removed_count} removed)"
    )

    return DeduplicationResult(
        original_row_count=original_count,
        deduplicated_df=deduplicated_df,
        removed_df=removed_df,
        match_groups=match_groups,
        log_entries=log_entries,
        is_preview=preview,
    )

247
src/core/io.py Normal file
View File

@@ -0,0 +1,247 @@
"""File I/O: encoding/delimiter detection, CSV/Excel reading, output writing."""
from __future__ import annotations
import csv
import io
from pathlib import Path
from typing import Generator, Optional
import pandas as pd
from charset_normalizer import from_bytes
from loguru import logger
# ---------------------------------------------------------------------------
# Encoding detection
# ---------------------------------------------------------------------------
def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    """Detect file encoding from the first *sample_bytes* bytes of *path*.

    Byte-order marks are checked first; otherwise ``charset_normalizer``
    guesses from the sample.  Returns a lowercase encoding name
    (e.g. ``utf-8``, ``utf-8-sig``, ``utf-16``) and falls back to ``utf-8``
    for empty files or inconclusive detection.
    """
    # Read only the sample; the file may be far larger than what is needed.
    with Path(path).open("rb") as fh:
        raw = fh.read(sample_bytes)
    if not raw:
        return "utf-8"

    # Check BOM first
    if raw.startswith(b"\xef\xbb\xbf"):
        return "utf-8-sig"
    # UTF-32 must be tested before UTF-16: the UTF-32-LE BOM starts with the
    # UTF-16-LE BOM bytes.
    if raw.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")):
        return "utf-32"
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        return "utf-16"

    result = from_bytes(raw).best()
    if result is None:
        return "utf-8"

    enc = result.encoding.lower()
    # Normalise common aliases: ASCII is a strict subset of UTF-8
    if enc in ("ascii", "us-ascii"):
        enc = "utf-8"
    return enc
# ---------------------------------------------------------------------------
# Delimiter detection
# ---------------------------------------------------------------------------
_COMMON_DELIMITERS = [",", "\t", ";", "|"]


def detect_delimiter(path: Path, encoding: str = "utf-8") -> str:
    """Guess the field delimiter from at most the first 20 lines of *path*.

    Only the candidates in ``_COMMON_DELIMITERS`` are considered.  A comma is
    returned for an empty file or when ``csv.Sniffer`` cannot decide.
    """
    with Path(path).open("r", encoding=encoding, errors="replace") as handle:
        # ``range`` comes first so zip stops without reading a 21st line.
        head = [text for _, text in zip(range(20), handle)]
    if not head:
        return ","

    candidates = "".join(_COMMON_DELIMITERS)
    try:
        return csv.Sniffer().sniff("".join(head), delimiters=candidates).delimiter
    except csv.Error:
        return ","
# ---------------------------------------------------------------------------
# Header-row detection
# ---------------------------------------------------------------------------
def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
                      max_scan: int = 20) -> int:
    """Return the 0-based index of the likely header row.

    Heuristic: the first of the first *max_scan* rows that has at least one
    non-empty cell and whose non-empty cells all look like column names
    (see ``_looks_like_header``).  Falls back to 0.
    """
    raw_path = Path(path)
    # newline="" is what the csv module expects, so quoted fields containing
    # line breaks are parsed as a single row.
    with raw_path.open("r", encoding=encoding, errors="replace", newline="") as fh:
        reader = csv.reader(fh, delimiter=delimiter)
        for idx, row in enumerate(reader):
            if idx >= max_scan:
                break
            cells = [cell for cell in row if cell.strip()]
            # Skip blank lines and rows made only of empty cells (",,,"):
            # all() over nothing is vacuously True and would wrongly select
            # such a row as the header.
            if not cells:
                continue
            if all(_looks_like_header(cell) for cell in cells):
                return idx
    return 0
def _looks_like_header(value: str) -> bool:
    """Return True when *value* could be a column name rather than data.

    Blank text is rejected, and so is anything ``float`` can parse once
    commas are removed (so ``"1,234.5"`` counts as a number).
    """
    text = value.strip()
    if text == "":
        return False
    try:
        float(text.replace(",", ""))
    except ValueError:
        # Not numeric, so it is treated as a header candidate.
        return True
    return False
# ---------------------------------------------------------------------------
# Excel helpers
# ---------------------------------------------------------------------------
def list_sheets(path: Path) -> list[str]:
    """Return the sheet names of the Excel workbook at *path*.

    The workbook is closed before returning, so the caller can delete or
    overwrite the file immediately (relevant for temp files on Windows).
    """
    # openpyxl cannot read legacy .xls workbooks; engine=None lets pandas
    # choose a reader for that format.
    engine = None if Path(path).suffix.lower() == ".xls" else "openpyxl"
    with pd.ExcelFile(path, engine=engine) as xl:
        return list(xl.sheet_names)
# ---------------------------------------------------------------------------
# Reading
# ---------------------------------------------------------------------------
def read_file(
path: str | Path,
*,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
header_row: Optional[int] = None,
sheet_name: Optional[str | int] = 0,
chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
"""Read a CSV, TSV, or Excel file into a DataFrame.
Parameters
----------
path : file path
encoding : override detected encoding (CSV only)
delimiter : override detected delimiter (CSV only)
header_row : 0-based row index for the header; auto-detected if *None*
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
chunk_size : if set, return a generator of DataFrames (CSV only).
Returns a DataFrame (or generator when *chunk_size* is set).
"""
filepath = Path(path)
if not filepath.exists():
raise FileNotFoundError(f"File not found: {filepath}")
suffix = filepath.suffix.lower()
if suffix in (".xlsx", ".xls"):
return _read_excel(filepath, header_row=header_row, sheet_name=sheet_name)
else:
return _read_csv(
filepath,
encoding=encoding,
delimiter=delimiter,
header_row=header_row,
chunk_size=chunk_size,
)
def _read_csv(
    path: Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
    """Read delimited text with ``pd.read_csv``, auto-detecting unset options.

    Every column is read as ``str`` and empty cells stay ``""`` rather than
    becoming NaN.  Malformed lines are reported as warnings.  With a truthy
    *chunk_size* the ``chunksize`` option is passed through to pandas.
    """
    resolved_encoding = encoding if encoding else detect_encoding(path)
    resolved_delimiter = delimiter if delimiter else detect_delimiter(path, resolved_encoding)
    if header_row is None:
        resolved_header = detect_header_row(path, resolved_encoding, resolved_delimiter)
    else:
        resolved_header = header_row

    logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
                 path.name, resolved_encoding, resolved_delimiter, resolved_header)

    read_options: dict = {
        "filepath_or_buffer": path,
        "encoding": resolved_encoding,
        "delimiter": resolved_delimiter,
        "header": resolved_header,
        "dtype": str,
        "keep_default_na": False,
        "on_bad_lines": "warn",
    }
    if chunk_size:
        read_options["chunksize"] = chunk_size
    return pd.read_csv(**read_options)
def _read_excel(
    path: Path,
    *,
    header_row: Optional[int] = None,
    sheet_name: Optional[str | int] = 0,
) -> pd.DataFrame:
    """Read one sheet of an Excel workbook as an all-string DataFrame.

    *header_row* defaults to 0 (no auto-detection for Excel).  Empty cells
    are kept as ``""`` rather than NaN.
    NOTE(review): ``sheet_name=None`` makes ``pd.read_excel`` return a dict
    of frames, not a DataFrame -- confirm no caller passes None.
    """
    hdr = 0 if header_row is None else header_row
    # openpyxl only reads the .xlsx family.  For legacy .xls, engine=None
    # lets pandas pick a suitable reader instead of failing inside openpyxl.
    engine = None if path.suffix.lower() == ".xls" else "openpyxl"
    logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
    return pd.read_excel(
        path,
        sheet_name=sheet_name,
        header=hdr,
        dtype=str,
        keep_default_na=False,
        engine=engine,
    )
# ---------------------------------------------------------------------------
# Writing
# ---------------------------------------------------------------------------
def write_file(
    df: pd.DataFrame,
    path: str | Path,
    *,
    file_format: Optional[str] = None,
    encoding: str = "utf-8-sig",
) -> Path:
    """Write a DataFrame to CSV or Excel.

    Parameters
    ----------
    df : DataFrame to write
    path : output file path
    file_format : ``"csv"`` or ``"xlsx"``; taken from the *path* suffix if
        *None*.  Matching is case-insensitive and a leading dot is ignored.
    encoding : output encoding for CSV (default ``utf-8-sig`` for Windows
        Excel compat)

    Returns the output Path.  The index is never written.
    """
    out = Path(path)
    # Normalize the explicit format the same way as the suffix; otherwise
    # "XLSX" or ".xlsx" would silently fall through to the CSV branch.
    fmt = (file_format or out.suffix).lstrip(".").lower()

    if fmt in ("xlsx", "xls"):
        df.to_excel(out, index=False, engine="openpyxl")
    else:
        df.to_csv(out, index=False, encoding=encoding)

    logger.info("Wrote {} rows to {}", len(df), out)
    return out

224
src/core/normalizers.py Normal file
View File

@@ -0,0 +1,224 @@
"""Per-column normalization functions for deduplication matching.
Every normalizer is ``str -> str``, handles None/empty gracefully, and is
idempotent (applying it twice yields the same result as once).
"""
from __future__ import annotations
import re
from enum import Enum
from typing import Callable, Optional
import phonenumbers
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
class NormalizerType(str, Enum):
    """Identifiers of the available per-column normalizers.

    Members are also ``str`` instances (``str`` mixin), so a member compares
    equal to its lowercase value, e.g. ``NormalizerType.EMAIL == "email"``.
    """

    # Each member maps to the normalizer function of the same name below.
    EMAIL = "email"
    PHONE = "phone"
    NAME = "name"
    ADDRESS = "address"
    # Generic trim / collapse-whitespace / case-fold normalizer.
    STRING = "string"
# ---------------------------------------------------------------------------
# String normalizer (base)
# ---------------------------------------------------------------------------
def normalize_string(value: Optional[str]) -> str:
    """Return *value* trimmed, with whitespace runs collapsed, case-folded.

    ``None``, the empty string and non-string inputs all yield ``""``.
    """
    if isinstance(value, str) and value:
        collapsed = re.sub(r"\s+", " ", value.strip())
        return collapsed.casefold()
    return ""
# ---------------------------------------------------------------------------
# Email normalizer
# ---------------------------------------------------------------------------
_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}


def normalize_email(value: Optional[str]) -> str:
    """Canonicalize an email address for comparison.

    The address is trimmed and lowercased, any ``+tag`` suffix is removed
    from the local part, and dots are removed from the local part of Gmail
    addresses.  Text without ``@`` is returned trimmed and lowercased.
    ``None``, empty and non-string inputs yield ``""``.
    """
    if not isinstance(value, str) or not value:
        return ""

    cleaned = value.strip().lower()
    # Split on the last "@"; an empty separator means there was none.
    local, at, domain = cleaned.rpartition("@")
    if not at:
        return cleaned

    local = local.partition("+")[0]
    if domain in _GMAIL_DOMAINS:
        local = local.replace(".", "")
    return f"{local}@{domain}"
# ---------------------------------------------------------------------------
# Phone normalizer
# ---------------------------------------------------------------------------
def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
    """Return *value* in E.164 form, or its bare digits as a fallback.

    The number is parsed with ``phonenumbers`` using *default_region*.  If
    parsing fails or the result is not a possible number, every non-digit
    character is stripped instead.  ``None``, blank and non-string inputs
    yield ``""``.
    """
    if not isinstance(value, str) or not value:
        return ""
    text = value.strip()
    if text == "":
        return ""

    try:
        number = phonenumbers.parse(text, default_region)
    except phonenumbers.NumberParseException:
        number = None

    if number is not None and phonenumbers.is_possible_number(number):
        return phonenumbers.format_number(number, phonenumbers.PhoneNumberFormat.E164)

    # Fallback: digits only
    return re.sub(r"\D", "", text)
# ---------------------------------------------------------------------------
# Name normalizer
# ---------------------------------------------------------------------------
_TITLE_PREFIXES = {
    "mr", "mrs", "ms", "miss", "dr", "prof", "professor",
    "sir", "madam", "rev", "reverend", "hon", "honorable",
}
_NAME_SUFFIXES = {
    "jr", "sr", "ii", "iii", "iv", "v",
    "phd", "md", "esq", "dds", "rn",
}


def normalize_name(value: Optional[str]) -> str:
    """Return a case-folded personal name without titles or suffixes.

    Periods and commas are treated as spaces, leading words found in
    ``_TITLE_PREFIXES`` and trailing words found in ``_NAME_SUFFIXES`` are
    dropped, and the remaining words are joined with single spaces.
    ``None``, blank and non-string inputs yield ``""``.
    """
    if not isinstance(value, str) or not value:
        return ""

    words = value.casefold().replace(".", " ").replace(",", " ").split()

    # Narrow a [start, end) window instead of popping from the list.
    start = 0
    while start < len(words) and words[start] in _TITLE_PREFIXES:
        start += 1
    end = len(words)
    while end > start and words[end - 1] in _NAME_SUFFIXES:
        end -= 1

    return " ".join(words[start:end])
# ---------------------------------------------------------------------------
# Address normalizer
# ---------------------------------------------------------------------------
# Lowercase address word -> abbreviation, applied to whole whitespace-separated
# tokens by normalize_address().  A few entries map to themselves ("way",
# "loop", "unit").  Several words share an abbreviation: "street" and "saint"
# both become "st", and "drive" becomes "dr".
_USPS_ABBREVIATIONS: dict[str, str] = {
    # Street types
    "street": "st",
    "avenue": "ave",
    "boulevard": "blvd",
    "drive": "dr",
    "lane": "ln",
    "road": "rd",
    "court": "ct",
    "place": "pl",
    "circle": "cir",
    "trail": "trl",
    "way": "way",
    "terrace": "ter",
    "parkway": "pkwy",
    "highway": "hwy",
    "expressway": "expy",
    "freeway": "fwy",
    "square": "sq",
    "loop": "loop",
    "alley": "aly",
    "crossing": "xing",
    "point": "pt",
    # Directionals
    "north": "n",
    "south": "s",
    "east": "e",
    "west": "w",
    "northeast": "ne",
    "northwest": "nw",
    "southeast": "se",
    "southwest": "sw",
    # Secondary unit designators
    "apartment": "apt",
    "suite": "ste",
    "building": "bldg",
    "floor": "fl",
    "room": "rm",
    "unit": "unit",
    "number": "#",
    # Place-name words
    "saint": "st",
    "fort": "ft",
    "mount": "mt",
    "heights": "hts",
    "springs": "spgs",
}
def normalize_address(value: Optional[str]) -> str:
    """Return a case-folded address with common words abbreviated.

    Periods and commas are treated as spaces (``#`` is kept), each word
    found in ``_USPS_ABBREVIATIONS`` is replaced by its abbreviation, and
    the words are joined with single spaces.  ``None``, blank and
    non-string inputs yield ``""``.
    """
    if not isinstance(value, str) or not value:
        return ""

    tokens = value.casefold().replace(".", " ").replace(",", " ").split()
    return " ".join(_USPS_ABBREVIATIONS.get(token, token) for token in tokens)
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
# Lookup table used by get_normalizer(): one normalizer function per
# NormalizerType member.
_NORMALIZER_MAP: dict[NormalizerType, Callable[[str], str]] = {
    NormalizerType.EMAIL: normalize_email,
    NormalizerType.PHONE: normalize_phone,
    NormalizerType.NAME: normalize_name,
    NormalizerType.ADDRESS: normalize_address,
    NormalizerType.STRING: normalize_string,
}
def get_normalizer(normalizer_type: NormalizerType | str) -> Callable[[str], str]:
    """Look up the normalizer function registered for *normalizer_type*.

    Strings are lowercased and converted to ``NormalizerType`` first, so
    ``"Email"`` and ``NormalizerType.EMAIL`` are interchangeable.  Raises
    ``ValueError`` for a type with no registered function (the enum
    conversion itself also raises ``ValueError`` for unknown strings).
    """
    key = normalizer_type
    if isinstance(key, str):
        key = NormalizerType(key.lower())

    normalizer = _NORMALIZER_MAP.get(key)
    if normalizer is not None:
        return normalizer
    raise ValueError(f"Unknown normalizer type: {key}")

1
src/gui/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Streamlit GUI for the DataTools Deduplicator."""

8
src/gui/__main__.py Normal file
View File

@@ -0,0 +1,8 @@
"""Allow running as ``python -m src.gui``."""
import subprocess
import sys
from pathlib import Path
# Streamlit apps must be started through the ``streamlit run`` CLI, so the
# app is launched in a child process using the current interpreter.
app_path = Path(__file__).parent / "app.py"
# Propagate the child's exit status; previously this launcher always exited
# with 0, even when Streamlit failed to start.
sys.exit(
    subprocess.run([sys.executable, "-m", "streamlit", "run", str(app_path)]).returncode
)

287
src/gui/app.py Normal file
View File

@@ -0,0 +1,287 @@
"""DataTools Deduplicator — Streamlit GUI.
Launch:
streamlit run src/gui/app.py
"""
from __future__ import annotations
import io
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
# Ensure project root is on sys.path so `src.core` imports work
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
from src.core.io import read_file, list_sheets
from src.core.config import DeduplicationConfig
from src.gui.components import config_panel, match_group_card, results_summary
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
# NOTE(review): presumably placed before any other st.* call on purpose --
# confirm before reordering.
st.set_page_config(
    page_title="DataTools Deduplicator",
    page_icon="🔍",
    layout="wide",
)
# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------
# Keys used across reruns of this script:
#   df               - the currently loaded DataFrame (or None)
#   result           - the last DeduplicationResult (or None)
#   review_decisions - group_id -> True/False decisions made by the reviewer
#   config           - initialised to None; not read elsewhere in this section
#   file_name        - name of the last uploaded file, used to detect changes
#   sheet_names      - sheet names of the uploaded workbook ([] for CSV)
_DEFAULTS = {
    "df": None,
    "result": None,
    "review_decisions": {},
    "config": None,
    "file_name": "",
    "sheet_names": [],
}
# Only missing keys are initialised, so existing state survives reruns.
for key, default in _DEFAULTS.items():
    if key not in st.session_state:
        st.session_state[key] = default
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("DataTools Deduplicator")
st.caption("Find and remove duplicate rows in CSV and Excel files.")
# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------
# ``uploaded`` is None until the user has provided a file.
uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
)
import tempfile


def _discard_temp_file(tmp_path) -> None:
    """Best-effort removal of the temporary copy of an upload."""
    if tmp_path is None:
        return
    try:
        tmp_path.unlink(missing_ok=True)
    except OSError:
        # A still-locked file must not turn a successful read into a failure.
        pass


if uploaded is not None:
    # Detect if file changed
    if uploaded.name != st.session_state["file_name"]:
        st.session_state["file_name"] = uploaded.name
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}

        # Read the file
        tmp_path = None
        try:
            # Write to a temp file for read_file() which needs a path
            suffix = Path(uploaded.name).suffix
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = Path(tmp.name)
                tmp.write(uploaded.getvalue())

            # Check for Excel sheets
            if suffix.lower() in (".xlsx", ".xls"):
                st.session_state["sheet_names"] = list_sheets(tmp_path)
            else:
                st.session_state["sheet_names"] = []

            df = read_file(tmp_path)
            if not isinstance(df, pd.DataFrame):
                df = pd.concat(list(df), ignore_index=True)
            st.session_state["df"] = df
        except Exception as e:
            st.error(f"Failed to read file: {e}")
            st.session_state["df"] = None
        finally:
            # Clean up the temp file even when reading failed.
            _discard_temp_file(tmp_path)

    df = st.session_state["df"]

    if df is not None:
        # Sheet selector for Excel files
        if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
            sheet = st.selectbox(
                "Select sheet",
                st.session_state["sheet_names"],
            )
            # Re-read only when the selection actually changed.
            if sheet != st.session_state.get("_current_sheet"):
                st.session_state["_current_sheet"] = sheet
                suffix = Path(uploaded.name).suffix
                tmp_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                        tmp_path = Path(tmp.name)
                        tmp.write(uploaded.getvalue())
                    df = read_file(tmp_path, sheet_name=sheet)
                    if not isinstance(df, pd.DataFrame):
                        df = pd.concat(list(df), ignore_index=True)
                finally:
                    _discard_temp_file(tmp_path)
                st.session_state["df"] = df
                st.session_state["result"] = None
                st.session_state["review_decisions"] = {}

        # Preview
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)

        # Advanced options
        settings = config_panel(df)

        # Apply loaded config if present: it overrides the widget settings
        loaded_cfg = st.session_state.get("loaded_config")
        if loaded_cfg is not None:
            settings["strategies"] = loaded_cfg.to_strategies()
            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
            settings["date_column"] = loaded_cfg.date_column
            settings["merge"] = loaded_cfg.merge
            # Clear so it doesn't override on every rerun
            del st.session_state["loaded_config"]

        # -------------------------------------------------------------------
        # Find Duplicates button
        # -------------------------------------------------------------------
        st.divider()
        if st.button("Find Duplicates", type="primary", use_container_width=True):
            progress_bar = st.progress(0, text="Comparing rows...")

            def _gui_progress(current: int, total: int) -> None:
                if total > 0:
                    pct = min(current / total, 1.0)
                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")

            with st.spinner("Running deduplication..."):
                result = deduplicate(
                    df,
                    strategies=settings["strategies"],
                    survivor_rule=settings["survivor_rule"],
                    date_column=settings["date_column"],
                    merge=settings["merge"],
                    preview=False,
                    progress_callback=_gui_progress,
                )
            progress_bar.empty()
            st.session_state["result"] = result
            st.session_state["review_decisions"] = {}

        # -------------------------------------------------------------------
        # Results
        # -------------------------------------------------------------------
        result: DeduplicationResult | None = st.session_state["result"]
        if result is not None:
            st.divider()
            st.subheader("Results")

            # Summary + download buttons
            results_summary(result, df)

            # Match group review
            if result.match_groups:
                st.divider()
                st.subheader("Match Groups")

                # Batch actions
                action_left, action_mid, action_right = st.columns(3)
                with action_left:
                    if st.button("Accept All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = True
                        st.rerun()
                with action_mid:
                    if st.button("Reject All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = False
                        st.rerun()
                with action_right:
                    if st.button("Clear Decisions"):
                        st.session_state["review_decisions"] = {}
                        st.rerun()

                # Individual group cards; a new decision triggers a rerun
                decisions = st.session_state["review_decisions"]
                for i, group in enumerate(result.match_groups):
                    decision = match_group_card(group, df, group_num=i + 1)
                    if decision is not None:
                        decisions[group.group_id] = decision
                        st.session_state["review_decisions"] = decisions
                        st.rerun()

                # Show decision summary
                if decisions:
                    st.divider()
                    accepted = sum(1 for v in decisions.values() if v is True)
                    rejected = sum(1 for v in decisions.values() if v is False)
                    pending = len(result.match_groups) - len(decisions)
                    st.caption(
                        f"Decisions: {accepted} merged, {rejected} kept both, "
                        f"{pending} pending"
                    )

                    # Re-run dedup with review decisions applied
                    if st.button(
                        "Apply Review Decisions & Download",
                        type="primary",
                        use_container_width=True,
                    ):
                        def _review_callback(group, _df):
                            gid = group.group_id
                            if gid in decisions:
                                return decisions[gid]
                            return True  # default: accept

                        reviewed_result = deduplicate(
                            df,
                            strategies=settings["strategies"],
                            survivor_rule=settings["survivor_rule"],
                            date_column=settings["date_column"],
                            merge=settings["merge"],
                            preview=False,
                            review_callback=_review_callback,
                        )
                        # Update result and show downloads
                        st.session_state["result"] = reviewed_result
                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
                            index=False
                        ).encode("utf-8-sig")
                        st.download_button(
                            "Download Reviewed & Deduplicated CSV",
                            data=csv_bytes,
                            file_name="deduplicated_reviewed.csv",
                            mime="text/csv",
                            key="reviewed_download",
                        )

            # Log entries
            if result.log_entries:
                with st.expander("Processing Log"):
                    st.code("\n".join(result.log_entries))
else:
    # No file uploaded — show placeholder
    st.info("Upload a CSV or Excel file to get started.")
# ---------------------------------------------------------------------------
# Footer
# ---------------------------------------------------------------------------
# Footer: rendered on every run, with or without an uploaded file.
st.divider()
st.caption(
    "Runs locally. Your data never leaves this computer. "
    "| DataTools Deduplicator v1.0"
)

413
src/gui/components.py Normal file
View File

@@ -0,0 +1,413 @@
"""Reusable Streamlit widgets for the deduplicator GUI."""
from __future__ import annotations
import io
from typing import Optional
import pandas as pd
import streamlit as st
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
)
from src.core.config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
from src.core.normalizers import NormalizerType
# ---------------------------------------------------------------------------
# Config panel (advanced options)
# ---------------------------------------------------------------------------
def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander and return the chosen settings.

    Keys returned:
        strategies: list[MatchStrategy] | None  (None means auto-detect)
        survivor_rule: SurvivorRule
        date_column: str | None
        merge: bool

    Side effects: a successfully parsed config upload is stored in
    ``st.session_state["loaded_config"]``; "Save current settings" offers
    the current selections as a JSON download.
    """
    import json

    columns = list(df.columns)

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)
        with col_a:
            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
            normalize_map: dict[str, str] = {}
            # Offer a normalizer for every explicitly selected column, not
            # only the fuzzy ones (ordered union without duplicates).
            target_cols = list(dict.fromkeys([*subset_cols, *fuzzy_cols]))
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    # "auto" and "none" both mean: no explicit normalizer.
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)
        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    # getvalue() returns the whole payload regardless of the
                    # stream position, unlike read().
                    data = json.loads(config_file.getvalue())
                    loaded = DeduplicationConfig.from_dict(data)
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:
                    st.error(f"Failed to load config: {e}")

        with cfg_right:
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                cfg_json = cfg.to_dict()
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg_json, indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }
def _build_strategies(
subset_cols: list[str],
key_cols: list[str],
fuzzy_cols: list[str],
algorithm: str,
threshold: int,
normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
"""Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
strategies: list[MatchStrategy] = []
# If user selected columns explicitly, build from those
if subset_cols or fuzzy_cols:
target_cols = subset_cols if subset_cols else fuzzy_cols
fuzzy_set = set(fuzzy_cols)
col_strats: list[ColumnMatchStrategy] = []
for col in target_cols:
norm = None
if col in normalize_map:
norm = NormalizerType(normalize_map[col])
if col in fuzzy_set:
algo = Algorithm(algorithm)
thresh = float(threshold)
else:
algo = Algorithm.EXACT
thresh = 100.0
col_strats.append(ColumnMatchStrategy(
column=col, algorithm=algo, threshold=thresh, normalizer=norm,
))
strategies.append(MatchStrategy(column_strategies=col_strats))
# Add strong key strategies
if key_cols:
for col in key_cols:
strategies.append(MatchStrategy(column_strategies=[
ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
]))
return strategies if strategies else None
def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a ``DeduplicationConfig`` from the current GUI selections.

    The flat settings are stored directly on the config (empty column lists
    and an empty normalizer map are stored as None, and dashes in
    ``survivor`` are replaced with underscores). The same selections are
    then passed through ``_build_strategies``; if that yields any
    strategies, they are mirrored into ``cfg.strategies`` as
    ``StrategyConfig`` / ``ColumnStrategyConfig`` objects using the enum
    ``.value`` of each algorithm and normalizer.

    Returns:
        The populated config object.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    match_strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not match_strategies:
        # Nothing selected: leave config.strategies at whatever the
        # constructor set.
        return config

    serialised = []
    for strategy in match_strategies:
        column_configs = []
        for col_strategy in strategy.column_strategies:
            normalizer = col_strategy.normalizer
            column_configs.append(ColumnStrategyConfig(
                column=col_strategy.column,
                algorithm=col_strategy.algorithm.value,
                threshold=col_strategy.threshold,
                normalizer=normalizer.value if normalizer else None,
            ))
        serialised.append(StrategyConfig(columns=column_configs))

    config.strategies = serialised
    return config
# ---------------------------------------------------------------------------
# Match group review card
# ---------------------------------------------------------------------------
def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> Optional[bool]:
    """Render an expandable review card for one match group.

    The card shows the group's rows side by side (columns whose name starts
    with ``_norm_`` are hidden), tints cells that differ from the first row,
    and offers "Merge" / "Keep Both" buttons. If
    ``st.session_state["review_decisions"]`` already holds a decision for
    this group, it is displayed below the buttons.

    Args:
        group: Match group to display. ``row_indices`` are used as
            positional (``iloc``) indices into ``df``.
        df: DataFrame the group's row indices refer to.
        group_num: Number shown in the card title.

    Returns:
        True  - "Merge" was clicked on this run (accept match)
        False - "Keep Both" was clicked on this run (reject match)
        None  - no button was clicked on this run
    """
    confidence = group.confidence
    # Groups below 95% confidence start expanded.
    auto_expand = confidence < 95.0
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)

    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )

    with st.expander(label, expanded=auto_expand):
        # Build comparison DataFrame
        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]

        rows_data = []
        for idx in group.row_indices:
            # Fetch the row once; previously it was re-materialised for
            # every single cell.
            source_row = df.iloc[idx]
            row = {"_row": idx + 1}  # 1-based row number, used as the table index
            row.update((col, source_row.get(col, "")) for col in display_cols)
            rows_data.append(row)

        compare_df = pd.DataFrame(rows_data).set_index("_row")

        # Highlight differences
        def _highlight_diffs(s: pd.Series) -> list[str]:
            """Return one CSS string per cell, comparing against the first row.

            Orange: non-empty value that differs from a non-empty first value.
            Red: empty value where the first row has one.
            No style otherwise (including whenever the first value is empty).
            """
            # NOTE(review): cells are compared via str(), so NaN/None show up
            # as "nan"/"None" and are never treated as empty -- confirm the
            # loader delivers missing values as "".
            first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
            styles = []
            for val in s:
                val_str = str(val).strip()
                if val_str != first_val and val_str and first_val:
                    styles.append("background-color: rgba(245, 166, 35, 0.2)")
                elif not val_str and first_val:
                    styles.append("background-color: rgba(240, 82, 82, 0.1)")
                else:
                    styles.append("")
            return styles

        # axis=0 -> the styler calls the function once per column.
        styled = compare_df.style.apply(_highlight_diffs, axis=0)
        st.dataframe(styled, use_container_width=True)

        # Action buttons (third column is intentionally left empty as a spacer)
        btn_left, btn_mid, _ = st.columns(3)
        merge_key = f"merge_{group.group_id}"
        keep_key = f"keep_{group.group_id}"

        with btn_left:
            if st.button("Merge", key=merge_key, type="primary"):
                return True
        with btn_mid:
            if st.button("Keep Both", key=keep_key):
                return False

        # Check session state for previous decisions
        decisions = st.session_state.get("review_decisions", {})
        if group.group_id in decisions:
            decision = decisions[group.group_id]
            if decision is True:
                st.success("Decision: Merge")
            elif decision is False:
                st.info("Decision: Keep Both")

    return None
# ---------------------------------------------------------------------------
# Results summary + downloads
# ---------------------------------------------------------------------------
def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show headline counts for a finished run and offer CSV downloads.

    Renders four metrics (rows in, rows out, removed, number of match
    groups), a divider, and up to three download buttons: the deduplicated
    data (always), the removed rows (only if any), and the match-groups
    report (only if there are match groups). All CSVs are encoded as
    ``utf-8-sig``.

    Args:
        result: Outcome of the deduplication run.
        original_df: Input DataFrame, forwarded to
            ``_build_match_groups_csv`` for the report.
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)

    def _to_csv_bytes(frame: pd.DataFrame) -> bytes:
        """Serialise a frame without its index as BOM-prefixed UTF-8."""
        return frame.to_csv(index=False).encode("utf-8-sig")

    # Summary metrics
    in_slot, out_slot, removed_slot, groups_slot = st.columns(4)
    in_slot.metric("Rows In", rows_in)
    out_slot.metric("Rows Out", rows_out)
    removed_slot.metric("Removed", rows_in - rows_out)
    groups_slot.metric("Groups", len(result.match_groups))

    st.divider()

    # Download buttons
    kept_slot, dropped_slot, report_slot = st.columns(3)

    with kept_slot:
        st.download_button(
            "Download Deduplicated CSV",
            data=_to_csv_bytes(result.deduplicated_df),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with dropped_slot:
        if not result.removed_df.empty:
            st.download_button(
                "Download Removed Rows",
                data=_to_csv_bytes(result.removed_df),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with report_slot:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )
def _build_match_groups_csv(
result: DeduplicationResult,
original_df: pd.DataFrame,
) -> bytes:
"""Build the match groups audit CSV as bytes."""
rows = []
for g in result.match_groups:
for idx in g.row_indices:
row_data = {
"_group_id": g.group_id + 1,
"_is_survivor": idx == g.survivor_index,
"_confidence": g.confidence,
"_matched_on": ", ".join(g.matched_on),
"_original_row": idx + 1,
}
for col in original_df.columns:
if not str(col).startswith("_norm_"):
row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
rows.append(row_data)
groups_df = pd.DataFrame(rows)
return groups_df.to_csv(index=False).encode("utf-8-sig")