feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

93
src/core/__init__.py Normal file
View File

@@ -0,0 +1,93 @@
"""DataTools deduplication engine.
Public API
----------
Core:
deduplicate(df, ...) -> DeduplicationResult
build_default_strategies(df) -> list[MatchStrategy]
Types:
Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
MatchResult, DeduplicationResult
Normalizers:
get_normalizer(type) -> Callable
NormalizerType
normalize_email, normalize_phone, normalize_name,
normalize_address, normalize_string
I/O:
read_file(path, ...) -> DataFrame
write_file(df, path, ...)
list_sheets(path) -> list[str]
detect_encoding, detect_delimiter, detect_header_row
Configuration:
DeduplicationConfig.from_file(path) -> DeduplicationConfig
DeduplicationConfig.to_file(path)
"""
from .dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
build_default_strategies,
deduplicate,
)
from .normalizers import (
NormalizerType,
get_normalizer,
normalize_address,
normalize_email,
normalize_name,
normalize_phone,
normalize_string,
)
from .io import (
detect_delimiter,
detect_encoding,
detect_header_row,
list_sheets,
read_file,
write_file,
)
from .config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
__all__ = [
# Core
"deduplicate",
"build_default_strategies",
# Types
"Algorithm",
"SurvivorRule",
"ColumnMatchStrategy",
"MatchStrategy",
"MatchResult",
"DeduplicationResult",
# Normalizers
"NormalizerType",
"get_normalizer",
"normalize_email",
"normalize_phone",
"normalize_name",
"normalize_address",
"normalize_string",
# I/O
"read_file",
"write_file",
"list_sheets",
"detect_encoding",
"detect_delimiter",
"detect_header_row",
# Config
"DeduplicationConfig",
"StrategyConfig",
"ColumnStrategyConfig",
]

117
src/core/config.py Normal file
View File

@@ -0,0 +1,117 @@
"""Configuration profiles: save/load deduplication settings as JSON."""
from __future__ import annotations
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
from .dedup import (
Algorithm,
ColumnMatchStrategy,
MatchStrategy,
NormalizerType,
SurvivorRule,
)
@dataclass
class ColumnStrategyConfig:
"""JSON-serializable mirror of ColumnMatchStrategy."""
column: str
algorithm: str = "exact"
threshold: float = 100.0
normalizer: Optional[str] = None
@dataclass
class StrategyConfig:
"""JSON-serializable mirror of MatchStrategy."""
columns: list[ColumnStrategyConfig] = field(default_factory=list)
@dataclass
class DeduplicationConfig:
"""All deduplication settings as a flat JSON-serializable structure."""
strategies: list[StrategyConfig] = field(default_factory=list)
survivor_rule: str = "first"
date_column: Optional[str] = None
merge: bool = False
subset_columns: Optional[list[str]] = None
fuzzy_columns: Optional[list[str]] = None
default_algorithm: str = "jaro_winkler"
default_threshold: float = 85.0
normalize_map: Optional[dict[str, str]] = None # column -> normalizer type
# -----------------------------------------------------------------------
# Serialisation
# -----------------------------------------------------------------------
def to_dict(self) -> dict:
return asdict(self)
def to_file(self, path: str | Path) -> Path:
"""Save configuration to a JSON file."""
out = Path(path)
out.write_text(json.dumps(self.to_dict(), indent=2))
return out
@classmethod
def from_dict(cls, data: dict) -> DeduplicationConfig:
strategies = []
for s in data.get("strategies", []):
cols = [ColumnStrategyConfig(**c) for c in s.get("columns", [])]
strategies.append(StrategyConfig(columns=cols))
return cls(
strategies=strategies,
survivor_rule=data.get("survivor_rule", "first"),
date_column=data.get("date_column"),
merge=data.get("merge", False),
subset_columns=data.get("subset_columns"),
fuzzy_columns=data.get("fuzzy_columns"),
default_algorithm=data.get("default_algorithm", "jaro_winkler"),
default_threshold=data.get("default_threshold", 85.0),
normalize_map=data.get("normalize_map"),
)
@classmethod
def from_file(cls, path: str | Path) -> DeduplicationConfig:
"""Load configuration from a JSON file."""
data = json.loads(Path(path).read_text())
return cls.from_dict(data)
@classmethod
def default(cls) -> DeduplicationConfig:
"""Return sensible defaults (auto-detect strategies at runtime)."""
return cls()
# -----------------------------------------------------------------------
# Convert to engine objects
# -----------------------------------------------------------------------
def to_strategies(self) -> Optional[list[MatchStrategy]]:
"""Convert the config back to MatchStrategy objects.
Returns None if no explicit strategies are configured
(the engine will auto-detect).
"""
if not self.strategies:
return None
result: list[MatchStrategy] = []
for sc in self.strategies:
col_strats = []
for cc in sc.columns:
col_strats.append(ColumnMatchStrategy(
column=cc.column,
algorithm=Algorithm(cc.algorithm),
threshold=cc.threshold,
normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None,
))
result.append(MatchStrategy(column_strategies=col_strats))
return result
def to_survivor_rule(self) -> SurvivorRule:
return SurvivorRule(self.survivor_rule)

568
src/core/dedup.py Normal file
View File

@@ -0,0 +1,568 @@
"""Deduplication engine: matching, survivor selection, and merge.
Core algorithm:
1. Normalise columns → shadow ``_norm_*`` columns (computed once).
2. Pairwise comparison within each strategy → candidate pairs.
3. Union-find for transitive closure (A~B, B~C ⇒ one group).
4. Multi-strategy OR: feed all pairs from all strategies into the same union-find.
5. Survivor selection per group + optional field merge.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Callable, Optional
import pandas as pd
from loguru import logger
from rapidfuzz import fuzz as rf_fuzz
from rapidfuzz import distance as rf_distance
from .normalizers import NormalizerType, get_normalizer
# ---------------------------------------------------------------------------
# Enums & data structures
# ---------------------------------------------------------------------------
class Algorithm(str, Enum):
EXACT = "exact"
LEVENSHTEIN = "levenshtein"
JARO_WINKLER = "jaro_winkler"
TOKEN_SET_RATIO = "token_set_ratio"
class SurvivorRule(str, Enum):
KEEP_FIRST = "first"
KEEP_LAST = "last"
KEEP_MOST_COMPLETE = "most_complete"
KEEP_MOST_RECENT = "most_recent"
@dataclass
class ColumnMatchStrategy:
"""How to match on a single column."""
column: str
algorithm: Algorithm = Algorithm.EXACT
threshold: float = 100.0 # 0-100 scale
normalizer: Optional[NormalizerType] = None
@dataclass
class MatchStrategy:
"""A set of column strategies combined with AND.
Multiple ``MatchStrategy`` instances are combined with OR at the top level.
"""
column_strategies: list[ColumnMatchStrategy]
@dataclass
class MatchResult:
"""One group of duplicate rows."""
group_id: int
row_indices: list[int]
confidence: float # min confidence across pairs in the group
matched_on: list[str] # column names that contributed to the match
survivor_index: int # index of the row to keep
@dataclass
class DeduplicationResult:
"""Full result of a deduplication run."""
original_row_count: int
deduplicated_df: pd.DataFrame
removed_df: pd.DataFrame
match_groups: list[MatchResult]
log_entries: list[str] = field(default_factory=list)
is_preview: bool = True
# ---------------------------------------------------------------------------
# Union-Find
# ---------------------------------------------------------------------------
class _UnionFind:
"""Disjoint-set / union-find for transitive closure of match pairs."""
def __init__(self, n: int):
self._parent = list(range(n))
self._rank = [0] * n
def find(self, x: int) -> int:
while self._parent[x] != x:
self._parent[x] = self._parent[self._parent[x]] # path halving
x = self._parent[x]
return x
def union(self, a: int, b: int) -> None:
ra, rb = self.find(a), self.find(b)
if ra == rb:
return
if self._rank[ra] < self._rank[rb]:
ra, rb = rb, ra
self._parent[rb] = ra
if self._rank[ra] == self._rank[rb]:
self._rank[ra] += 1
def groups(self) -> dict[int, list[int]]:
"""Return {root: [members]} for all non-singleton groups."""
from collections import defaultdict
g: dict[int, list[int]] = defaultdict(list)
for i in range(len(self._parent)):
g[self.find(i)].append(i)
return {root: members for root, members in g.items() if len(members) > 1}
# ---------------------------------------------------------------------------
# Similarity computation
# ---------------------------------------------------------------------------
def _compute_similarity(val_a: str, val_b: str, algorithm: Algorithm) -> float:
"""Return similarity score on a 0-100 scale."""
if algorithm == Algorithm.EXACT:
return 100.0 if val_a == val_b else 0.0
if algorithm == Algorithm.LEVENSHTEIN:
return rf_fuzz.ratio(val_a, val_b)
if algorithm == Algorithm.JARO_WINKLER:
# rapidfuzz jaro_winkler_similarity returns 0-100
return rf_distance.JaroWinkler.similarity(val_a, val_b) * 100
if algorithm == Algorithm.TOKEN_SET_RATIO:
return rf_fuzz.token_set_ratio(val_a, val_b)
raise ValueError(f"Unknown algorithm: {algorithm}")
# ---------------------------------------------------------------------------
# Pair comparison
# ---------------------------------------------------------------------------
def _compare_pair(
row_a: pd.Series,
row_b: pd.Series,
strategy: MatchStrategy,
norm_prefix: str = "_norm_",
) -> tuple[bool, float, list[str]]:
"""Compare two rows using a single MatchStrategy (AND of column strategies).
Returns ``(is_match, confidence, matched_columns)``.
"""
min_score = 100.0
matched_cols: list[str] = []
for cs in strategy.column_strategies:
col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
va = str(row_a.get(col, ""))
vb = str(row_b.get(col, ""))
# Skip if both empty
if not va and not vb:
continue
# If one empty and one not — no match for this column
if not va or not vb:
return False, 0.0, []
score = _compute_similarity(va, vb, cs.algorithm)
if score < cs.threshold:
return False, 0.0, []
min_score = min(min_score, score)
matched_cols.append(cs.column)
if not matched_cols:
return False, 0.0, []
return True, min_score, matched_cols
# ---------------------------------------------------------------------------
# Match-group finding
# ---------------------------------------------------------------------------
def _find_match_groups(
df: pd.DataFrame,
strategies: list[MatchStrategy],
*,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> tuple[list[MatchResult], dict[tuple[int, int], tuple[float, list[str]]]]:
"""Pairwise comparison + union-find for transitive closure.
Returns ``(match_groups, pair_info)`` where *pair_info* maps
``(i, j)`` → ``(confidence, matched_columns)`` for logging.
"""
n = len(df)
uf = _UnionFind(n)
pair_info: dict[tuple[int, int], tuple[float, list[str]]] = {}
total_pairs = n * (n - 1) // 2
checked = 0
for i in range(n):
for j in range(i + 1, n):
for strategy in strategies:
is_match, confidence, cols = _compare_pair(
df.iloc[i], df.iloc[j], strategy
)
if is_match:
uf.union(i, j)
key = (i, j)
# Keep the highest-confidence match for this pair
if key not in pair_info or confidence > pair_info[key][0]:
pair_info[key] = (confidence, cols)
break # OR logic: one strategy match is enough
checked += 1
if progress_callback and checked % 1000 == 0:
progress_callback(checked, total_pairs)
if progress_callback:
progress_callback(total_pairs, total_pairs)
# Build MatchResult objects (survivor not yet selected)
raw_groups = uf.groups()
match_groups: list[MatchResult] = []
for gid, (root, members) in enumerate(sorted(raw_groups.items())):
# Confidence = min across all pairs in the group
group_confidence = 100.0
group_cols: set[str] = set()
for idx_a, m in enumerate(members):
for idx_b in range(idx_a + 1, len(members)):
key = (min(m, members[idx_b]), max(m, members[idx_b]))
if key in pair_info:
conf, cols = pair_info[key]
group_confidence = min(group_confidence, conf)
group_cols.update(cols)
match_groups.append(MatchResult(
group_id=gid,
row_indices=members,
confidence=round(group_confidence, 2),
matched_on=sorted(group_cols),
survivor_index=members[0], # placeholder
))
return match_groups, pair_info
# ---------------------------------------------------------------------------
# Survivor selection
# ---------------------------------------------------------------------------
def _select_survivor(
group: MatchResult,
df: pd.DataFrame,
rule: SurvivorRule,
date_column: Optional[str] = None,
) -> int:
"""Choose the survivor row index within a match group."""
indices = group.row_indices
if rule == SurvivorRule.KEEP_FIRST:
return indices[0]
if rule == SurvivorRule.KEEP_LAST:
return indices[-1]
if rule == SurvivorRule.KEEP_MOST_COMPLETE:
# Fewest empty/blank cells wins
best_idx = indices[0]
best_empty = _count_empty(df.iloc[indices[0]])
for idx in indices[1:]:
empty = _count_empty(df.iloc[idx])
if empty < best_empty:
best_empty = empty
best_idx = idx
return best_idx
if rule == SurvivorRule.KEEP_MOST_RECENT:
if not date_column or date_column not in df.columns:
logger.warning("date_column '{}' not found; falling back to keep_first", date_column)
return indices[0]
best_idx = indices[0]
best_date = _parse_date(df.iloc[indices[0]].get(date_column, ""))
for idx in indices[1:]:
d = _parse_date(df.iloc[idx].get(date_column, ""))
if d is not None and (best_date is None or d > best_date):
best_date = d
best_idx = idx
return best_idx
return indices[0]
def _count_empty(row: pd.Series) -> int:
"""Count empty/blank cells in a row, ignoring internal shadow columns."""
count = 0
for col, val in row.items():
if isinstance(col, str) and col.startswith("_norm_"):
continue
if pd.isna(val) or str(val).strip() == "":
count += 1
return count
def _parse_date(value) -> Optional[pd.Timestamp]:
try:
return pd.to_datetime(value)
except Exception:
return None
# ---------------------------------------------------------------------------
# Merge mode
# ---------------------------------------------------------------------------
def _merge_group(df: pd.DataFrame, survivor_idx: int, loser_indices: list[int]) -> pd.Series:
"""Fill missing fields in survivor from losers (ordered by position)."""
survivor = df.iloc[survivor_idx].copy()
for col in survivor.index:
if isinstance(col, str) and col.startswith("_norm_"):
continue
val = survivor[col]
if pd.isna(val) or str(val).strip() == "":
for loser_idx in loser_indices:
candidate = df.iloc[loser_idx][col]
if not pd.isna(candidate) and str(candidate).strip() != "":
survivor[col] = candidate
break
return survivor
# ---------------------------------------------------------------------------
# Auto-detect strategies
# ---------------------------------------------------------------------------
# (pattern, normalizer, algorithm, threshold, is_strong_key)
# Strong keys (email, phone) can be standalone strategies.
# Weak keys (name, address) must be combined with a strong key via AND.
_COLUMN_TYPE_PATTERNS: list[tuple[re.Pattern, NormalizerType, Algorithm, float, bool]] = [
(re.compile(r"e[-_]?mail", re.I), NormalizerType.EMAIL, Algorithm.EXACT, 100.0, True),
(re.compile(r"phone|telephone|mobile|cell", re.I), NormalizerType.PHONE, Algorithm.EXACT, 100.0, True),
(re.compile(r"^(name|full_name|customer_name|first_name|last_name|contact_name|respondent_name)$", re.I),
NormalizerType.NAME, Algorithm.JARO_WINKLER, 85.0, False),
(re.compile(r"address|street|addr", re.I), NormalizerType.ADDRESS, Algorithm.TOKEN_SET_RATIO, 80.0, False),
]
def build_default_strategies(df: pd.DataFrame) -> list[MatchStrategy]:
"""Auto-detect column types and build match strategies.
Strategy logic:
- Strong keys (email, phone): each gets its own standalone OR strategy.
- Weak keys (name, address): combined with each strong key via AND to
form additional strategies. Weak keys never stand alone (too many
false positives — "John""Jon" at 93 % Jaro-Winkler).
- If only weak keys are found (no strong keys), they're promoted to
standalone strategies as a fallback.
- If no columns match, exact match on all columns (drop_duplicates
equivalent).
"""
strong_cols: list[ColumnMatchStrategy] = []
weak_cols: list[ColumnMatchStrategy] = []
for col in df.columns:
if col.startswith("_norm_"):
continue
for pattern, norm_type, algo, threshold, is_strong in _COLUMN_TYPE_PATTERNS:
if pattern.search(col):
cs = ColumnMatchStrategy(
column=col, algorithm=algo,
threshold=threshold, normalizer=norm_type,
)
if is_strong:
strong_cols.append(cs)
else:
weak_cols.append(cs)
break
strategies: list[MatchStrategy] = []
if strong_cols:
# Each strong key is a standalone strategy (OR)
for sc in strong_cols:
strategies.append(MatchStrategy(column_strategies=[sc]))
# Each weak key is paired with each strong key (AND) for extra recall
for wc in weak_cols:
for sc in strong_cols:
strategies.append(MatchStrategy(column_strategies=[wc, sc]))
elif weak_cols:
# No strong keys — promote weak to standalone (best effort)
for wc in weak_cols:
strategies.append(MatchStrategy(column_strategies=[wc]))
if strategies:
return strategies
# Fallback: exact match on all columns (equivalent to drop_duplicates)
logger.info("No column patterns matched; using exact match on all columns")
all_cols = [
ColumnMatchStrategy(column=c, algorithm=Algorithm.EXACT, threshold=100.0)
for c in df.columns
]
return [MatchStrategy(column_strategies=all_cols)]
# ---------------------------------------------------------------------------
# Normalisation pass
# ---------------------------------------------------------------------------
def _apply_normalizations(df: pd.DataFrame, strategies: list[MatchStrategy]) -> pd.DataFrame:
"""Add ``_norm_*`` shadow columns for every column that has a normalizer."""
df = df.copy()
seen: set[str] = set()
for strategy in strategies:
for cs in strategy.column_strategies:
if cs.normalizer and cs.column not in seen and cs.column in df.columns:
seen.add(cs.column)
norm_fn = get_normalizer(cs.normalizer)
norm_col = f"_norm_{cs.column}"
df[norm_col] = df[cs.column].apply(
lambda v, fn=norm_fn: fn(str(v)) if pd.notna(v) and str(v).strip() else ""
)
return df
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def deduplicate(
df: pd.DataFrame,
*,
strategies: Optional[list[MatchStrategy]] = None,
survivor_rule: SurvivorRule = SurvivorRule.KEEP_FIRST,
date_column: Optional[str] = None,
merge: bool = False,
preview: bool = True,
review_callback: Optional[Callable] = None,
progress_callback: Optional[Callable[[int, int], None]] = None,
) -> DeduplicationResult:
"""Run the full deduplication pipeline.
Parameters
----------
df : input DataFrame
strategies : matching strategies (auto-detected if None)
survivor_rule : which row to keep per group
date_column : used with ``KEEP_MOST_RECENT``
merge : fill missing fields in survivor from losers
preview : if True, result is informational only (no writes)
review_callback : ``(group: MatchResult, df: DataFrame) -> bool|None``
Called for each match group. Return True to accept, False to reject,
None to skip (keep both rows). Used for interactive review.
progress_callback : ``(current: int, total: int) -> None``
Called periodically during pairwise comparison.
Returns a ``DeduplicationResult``.
"""
log_entries: list[str] = []
original_count = len(df)
if strategies is None:
strategies = build_default_strategies(df)
log_entries.append(f"Auto-detected {len(strategies)} match strategies")
# Log strategies
for i, s in enumerate(strategies):
cols_desc = ", ".join(
f"{cs.column}({cs.algorithm.value}@{cs.threshold})"
for cs in s.column_strategies
)
log_entries.append(f"Strategy {i}: {cols_desc}")
logger.info("Strategy {}: {}", i, cols_desc)
# Normalise
df_work = _apply_normalizations(df, strategies)
# Find matches
match_groups, pair_info = _find_match_groups(
df_work, strategies, progress_callback=progress_callback
)
log_entries.append(f"Found {len(match_groups)} duplicate groups")
logger.info("Found {} duplicate groups from {} rows", len(match_groups), original_count)
# Interactive review
if review_callback and match_groups:
reviewed_groups: list[MatchResult] = []
for group in match_groups:
decision = review_callback(group, df_work)
if decision is True:
reviewed_groups.append(group)
log_entries.append(f"Group {group.group_id}: accepted by reviewer")
elif decision is False:
log_entries.append(f"Group {group.group_id}: rejected by reviewer")
else:
log_entries.append(f"Group {group.group_id}: skipped by reviewer")
match_groups = reviewed_groups
# Survivor selection
for group in match_groups:
group.survivor_index = _select_survivor(group, df_work, survivor_rule, date_column)
log_entries.append(
f"Group {group.group_id}: survivor=row {group.survivor_index} "
f"(rule={survivor_rule.value}, confidence={group.confidence}%)"
)
# Build result dataframes
remove_indices: set[int] = set()
merged_rows: dict[int, pd.Series] = {}
for group in match_groups:
survivor_idx = group.survivor_index
losers = [i for i in group.row_indices if i != survivor_idx]
remove_indices.update(losers)
if merge and losers:
merged = _merge_group(df_work, survivor_idx, losers)
merged_rows[survivor_idx] = merged
# Log merged fields
original = df_work.iloc[survivor_idx]
for col in original.index:
if isinstance(col, str) and col.startswith("_norm_"):
continue
orig_val = str(original[col]).strip()
new_val = str(merged[col]).strip()
if orig_val != new_val and not orig_val:
log_entries.append(
f"Group {group.group_id}: merged '{col}' "
f"into survivor from losers: '{new_val}'"
)
# Build output DataFrames
keep_indices = [i for i in range(len(df_work)) if i not in remove_indices]
if merged_rows:
rows = []
for i in keep_indices:
if i in merged_rows:
rows.append(merged_rows[i])
else:
rows.append(df_work.iloc[i])
deduplicated_df = pd.DataFrame(rows)
else:
deduplicated_df = df_work.iloc[keep_indices].copy()
removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame()
# Drop shadow columns from output
norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
if not removed_df.empty:
removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
# Reset index
deduplicated_df = deduplicated_df.reset_index(drop=True)
if not removed_df.empty:
removed_df = removed_df.reset_index(drop=True)
removed_count = original_count - len(deduplicated_df)
log_entries.append(f"Result: {original_count}{len(deduplicated_df)} rows ({removed_count} removed)")
return DeduplicationResult(
original_row_count=original_count,
deduplicated_df=deduplicated_df,
removed_df=removed_df,
match_groups=match_groups,
log_entries=log_entries,
is_preview=preview,
)

247
src/core/io.py Normal file
View File

@@ -0,0 +1,247 @@
"""File I/O: encoding/delimiter detection, CSV/Excel reading, output writing."""
from __future__ import annotations
import csv
import io
from pathlib import Path
from typing import Generator, Optional
import pandas as pd
from charset_normalizer import from_bytes
from loguru import logger
# ---------------------------------------------------------------------------
# Encoding detection
# ---------------------------------------------------------------------------
def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
"""Detect file encoding by reading the first *sample_bytes*.
Returns the best-guess encoding name (e.g. ``utf-8``, ``windows-1252``).
Falls back to ``utf-8`` when detection is inconclusive.
"""
raw = Path(path).read_bytes()[:sample_bytes]
if not raw:
return "utf-8"
# Check BOM first
if raw[:3] == b"\xef\xbb\xbf":
return "utf-8-sig"
if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
return "utf-16"
result = from_bytes(raw).best()
if result is None:
return "utf-8"
enc = result.encoding.lower()
# Normalise common aliases
if enc in ("ascii", "us-ascii"):
enc = "utf-8"
return enc
# ---------------------------------------------------------------------------
# Delimiter detection
# ---------------------------------------------------------------------------
_COMMON_DELIMITERS = [",", "\t", ";", "|"]
def detect_delimiter(path: Path, encoding: str = "utf-8") -> str:
"""Sniff the delimiter from the first 20 lines of a text file.
Falls back to comma if csv.Sniffer cannot decide.
"""
raw_path = Path(path)
lines: list[str] = []
with raw_path.open("r", encoding=encoding, errors="replace") as fh:
for _ in range(20):
line = fh.readline()
if not line:
break
lines.append(line)
if not lines:
return ","
sample = "".join(lines)
try:
dialect = csv.Sniffer().sniff(sample, delimiters="".join(_COMMON_DELIMITERS))
return dialect.delimiter
except csv.Error:
return ","
# ---------------------------------------------------------------------------
# Header-row detection
# ---------------------------------------------------------------------------
def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
max_scan: int = 20) -> int:
"""Return the 0-based index of the likely header row.
Heuristic: the first row where *every* cell looks like a column name
(non-numeric, non-empty string). Falls back to 0.
"""
raw_path = Path(path)
with raw_path.open("r", encoding=encoding, errors="replace") as fh:
reader = csv.reader(fh, delimiter=delimiter)
for idx, row in enumerate(reader):
if idx >= max_scan:
break
if not row:
continue
# All cells must be non-empty, non-numeric strings
if all(_looks_like_header(cell) for cell in row if cell.strip()):
return idx
return 0
def _looks_like_header(value: str) -> bool:
"""True if *value* looks like a column header, not a data value."""
v = value.strip()
if not v:
return False
# Pure numbers are not headers
try:
float(v.replace(",", ""))
return False
except ValueError:
pass
return True
# ---------------------------------------------------------------------------
# Excel helpers
# ---------------------------------------------------------------------------
def list_sheets(path: Path) -> list[str]:
"""Return sheet names from an Excel workbook."""
xl = pd.ExcelFile(path, engine="openpyxl")
return xl.sheet_names
# ---------------------------------------------------------------------------
# Reading
# ---------------------------------------------------------------------------
def read_file(
path: str | Path,
*,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
header_row: Optional[int] = None,
sheet_name: Optional[str | int] = 0,
chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
"""Read a CSV, TSV, or Excel file into a DataFrame.
Parameters
----------
path : file path
encoding : override detected encoding (CSV only)
delimiter : override detected delimiter (CSV only)
header_row : 0-based row index for the header; auto-detected if *None*
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
chunk_size : if set, return a generator of DataFrames (CSV only).
Returns a DataFrame (or generator when *chunk_size* is set).
"""
filepath = Path(path)
if not filepath.exists():
raise FileNotFoundError(f"File not found: {filepath}")
suffix = filepath.suffix.lower()
if suffix in (".xlsx", ".xls"):
return _read_excel(filepath, header_row=header_row, sheet_name=sheet_name)
else:
return _read_csv(
filepath,
encoding=encoding,
delimiter=delimiter,
header_row=header_row,
chunk_size=chunk_size,
)
def _read_csv(
path: Path,
*,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
header_row: Optional[int] = None,
chunk_size: Optional[int] = None,
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
enc = encoding or detect_encoding(path)
delim = delimiter or detect_delimiter(path, enc)
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
path.name, enc, delim, hdr)
kwargs: dict = dict(
filepath_or_buffer=path,
encoding=enc,
delimiter=delim,
header=hdr,
dtype=str,
keep_default_na=False,
on_bad_lines="warn",
)
if chunk_size:
return pd.read_csv(**kwargs, chunksize=chunk_size)
return pd.read_csv(**kwargs)
def _read_excel(
path: Path,
*,
header_row: Optional[int] = None,
sheet_name: Optional[str | int] = 0,
) -> pd.DataFrame:
hdr = header_row if header_row is not None else 0
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
return pd.read_excel(
path,
sheet_name=sheet_name,
header=hdr,
dtype=str,
keep_default_na=False,
engine="openpyxl",
)
# ---------------------------------------------------------------------------
# Writing
# ---------------------------------------------------------------------------
def write_file(
df: pd.DataFrame,
path: str | Path,
*,
file_format: Optional[str] = None,
encoding: str = "utf-8-sig",
) -> Path:
"""Write a DataFrame to CSV or Excel.
Parameters
----------
df : DataFrame to write
path : output file path
file_format : ``"csv"`` or ``"xlsx"``; auto-detected from *path* suffix if *None*
encoding : output encoding (default ``utf-8-sig`` for Windows Excel compat)
Returns the resolved output Path.
"""
out = Path(path)
fmt = file_format or out.suffix.lstrip(".").lower()
if fmt in ("xlsx", "xls"):
df.to_excel(out, index=False, engine="openpyxl")
else:
df.to_csv(out, index=False, encoding=encoding)
logger.info("Wrote {} rows to {}", len(df), out)
return out

224
src/core/normalizers.py Normal file
View File

@@ -0,0 +1,224 @@
"""Per-column normalization functions for deduplication matching.
Every normalizer is ``str -> str``, handles None/empty gracefully, and is
idempotent (applying it twice yields the same result as once).
"""
from __future__ import annotations
import re
from enum import Enum
from typing import Callable, Optional
import phonenumbers
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
class NormalizerType(str, Enum):
EMAIL = "email"
PHONE = "phone"
NAME = "name"
ADDRESS = "address"
STRING = "string"
# ---------------------------------------------------------------------------
# String normalizer (base)
# ---------------------------------------------------------------------------
def normalize_string(value: Optional[str]) -> str:
"""Trim, collapse internal whitespace, case-fold."""
if not value or not isinstance(value, str):
return ""
return re.sub(r"\s+", " ", value.strip()).casefold()
# ---------------------------------------------------------------------------
# Email normalizer
# ---------------------------------------------------------------------------
_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}
def normalize_email(value: Optional[str]) -> str:
"""Lowercase, strip whitespace, strip Gmail dots, strip +tag suffixes."""
if not value or not isinstance(value, str):
return ""
email = value.strip().lower()
if "@" not in email:
return email
local, domain = email.rsplit("@", 1)
# Strip +tag suffix
if "+" in local:
local = local.split("+", 1)[0]
# Strip dots for Gmail addresses
if domain in _GMAIL_DOMAINS:
local = local.replace(".", "")
return f"{local}@{domain}"
# ---------------------------------------------------------------------------
# Phone normalizer
# ---------------------------------------------------------------------------
def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
"""Parse with phonenumbers lib, return E.164. Fallback: digits-only."""
if not value or not isinstance(value, str):
return ""
stripped = value.strip()
if not stripped:
return ""
try:
parsed = phonenumbers.parse(stripped, default_region)
if phonenumbers.is_possible_number(parsed):
return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
except phonenumbers.NumberParseException:
pass
# Fallback: digits only
digits = re.sub(r"\D", "", stripped)
return digits
# ---------------------------------------------------------------------------
# Name normalizer
# ---------------------------------------------------------------------------
_TITLE_PREFIXES = {
"mr", "mrs", "ms", "miss", "dr", "prof", "professor",
"sir", "madam", "rev", "reverend", "hon", "honorable",
}
_NAME_SUFFIXES = {
"jr", "sr", "ii", "iii", "iv", "v",
"phd", "md", "esq", "dds", "rn",
}
def normalize_name(value: Optional[str]) -> str:
"""Strip titles/suffixes, collapse whitespace, case-fold."""
if not value or not isinstance(value, str):
return ""
name = value.strip()
if not name:
return ""
# Case-fold first for matching
name = name.casefold()
# Remove periods and commas that are part of titles/suffixes
name = name.replace(".", " ").replace(",", " ")
parts = name.split()
# Strip leading titles
while parts and parts[0].rstrip(".") in _TITLE_PREFIXES:
parts.pop(0)
# Strip trailing suffixes
while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES:
parts.pop()
return " ".join(parts)
# ---------------------------------------------------------------------------
# Address normalizer
# ---------------------------------------------------------------------------
_USPS_ABBREVIATIONS: dict[str, str] = {
"street": "st",
"avenue": "ave",
"boulevard": "blvd",
"drive": "dr",
"lane": "ln",
"road": "rd",
"court": "ct",
"place": "pl",
"circle": "cir",
"trail": "trl",
"way": "way",
"terrace": "ter",
"parkway": "pkwy",
"highway": "hwy",
"expressway": "expy",
"freeway": "fwy",
"square": "sq",
"loop": "loop",
"alley": "aly",
"crossing": "xing",
"point": "pt",
"north": "n",
"south": "s",
"east": "e",
"west": "w",
"northeast": "ne",
"northwest": "nw",
"southeast": "se",
"southwest": "sw",
"apartment": "apt",
"suite": "ste",
"building": "bldg",
"floor": "fl",
"room": "rm",
"unit": "unit",
"number": "#",
"saint": "st",
"fort": "ft",
"mount": "mt",
"heights": "hts",
"springs": "spgs",
}
def normalize_address(value: Optional[str]) -> str:
"""USPS abbreviation normalization, collapse whitespace, case-fold."""
if not value or not isinstance(value, str):
return ""
addr = value.strip()
if not addr:
return ""
# Case-fold and clean punctuation (keep #)
addr = addr.casefold()
addr = addr.replace(".", " ").replace(",", " ")
parts = addr.split()
normalized_parts = []
for part in parts:
normalized_parts.append(_USPS_ABBREVIATIONS.get(part, part))
return " ".join(normalized_parts)
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
_NORMALIZER_MAP: dict[NormalizerType, Callable[[str], str]] = {
NormalizerType.EMAIL: normalize_email,
NormalizerType.PHONE: normalize_phone,
NormalizerType.NAME: normalize_name,
NormalizerType.ADDRESS: normalize_address,
NormalizerType.STRING: normalize_string,
}
def get_normalizer(normalizer_type: NormalizerType | str) -> Callable[[str], str]:
"""Return the normalizer function for the given type.
Accepts both ``NormalizerType`` enum values and plain strings.
"""
if isinstance(normalizer_type, str):
normalizer_type = NormalizerType(normalizer_type.lower())
func = _NORMALIZER_MAP.get(normalizer_type)
if func is None:
raise ValueError(f"Unknown normalizer type: {normalizer_type}")
return func