feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/src/init.py
+++ b/src/init.py
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,4 @@
+"""Allow running as ``python -m src``."""
+from src.cli import main
+
+# Called unconditionally: this module exists only to be executed, so it hands
+# control straight to the CLI entry point defined in ``src.cli``.
+main()
--- a/src/cli.py
+++ b/src/cli.py
@@ -0,0 +1,502 @@
+"""CLI for the DataTools deduplicator.
+
+Usage:
+    python -m src.cli input.csv                         # dry-run preview
+    python -m src.cli input.csv --apply                 # write deduplicated output
+    python -m src.cli input.csv --fuzzy name --merge    # fuzzy match + merge
+    python -m src.cli --help                            # full help
+"""
+
+from __future__ import annotations
+
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import typer
+from loguru import logger
+from rapidfuzz import process as rf_process
+
+# Typer application object that the ``dedup`` command below registers on.
+# ``no_args_is_help=True`` makes a bare invocation print the help text, and
+# ``add_completion=False`` leaves out the shell-completion install options.
+app = typer.Typer(
+    name="dedup",
+    help=(
+        "Find and remove duplicate rows in CSV and Excel files.\n\n"
+        "By default, runs in preview mode — shows what would change without "
+        "modifying anything. Add --apply to write the output.\n\n"
+        "Examples:\n\n"
+        "  # Preview duplicates in a CSV file\n"
+        "  python -m src.cli customers.csv\n\n"
+        "  # Remove duplicates and save the result\n"
+        "  python -m src.cli customers.csv --apply\n\n"
+        "  # Fuzzy-match on the 'name' column with 80% threshold\n"
+        "  python -m src.cli customers.csv --fuzzy name --threshold 80 --apply\n\n"
+        "  # Match on specific columns only\n"
+        "  python -m src.cli customers.csv --subset email,phone --apply\n\n"
+        "  # Keep the most complete row and merge missing fields\n"
+        "  python -m src.cli customers.csv --survivor most-complete --merge --apply\n"
+    ),
+    add_completion=False,
+    no_args_is_help=True,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _setup_logging(log_dir: Path) -> Path:
+    """Route loguru output to stderr and to a fresh timestamped log file.
+
+    Creates ``log_dir`` when it does not exist yet, removes every loguru sink
+    registered so far, then installs a WARNING-level stderr sink and a
+    DEBUG-level file sink named ``dedup_<YYYYmmdd_HHMMSS>.log``.
+
+    Returns:
+        Path of the log file created for this run.
+    """
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
+    target = log_dir / f"dedup_{stamp}.log"
+
+    # Drop loguru's pre-installed stderr handler so only the two sinks below
+    # receive messages.
+    logger.remove()
+    logger.add(sys.stderr, level="WARNING", format="{message}")
+
+    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
+    logger.add(str(target), level="DEBUG", format=file_format)
+    return target
+
+
+def _suggest_column(name: str, available: list[str]) -> str:
+    """Build the "column not found" message, with a fuzzy hint when one exists.
+
+    The message always lists the available columns. If rapidfuzz finds a
+    candidate scoring at least 50 against ``name``, the best one is appended
+    as a "Did you mean ...?" suggestion.
+    """
+    message = f"Column '{name}' not found. Available columns: {', '.join(available)}."
+
+    best = rf_process.extract(name, available, limit=1, score_cutoff=50)
+    if not best:
+        return message
+
+    # Each hit is a (choice, score, index) tuple; only the choice is shown.
+    return f"{message} Did you mean '{best[0][0]}'?"
+
+
+def _validate_columns(requested: list[str], available: list[str]) -> None:
+    """Check that every requested column exists among the available ones.
+
+    Raises:
+        typer.BadParameter: for the first requested column that is missing;
+            the message comes from ``_suggest_column``.
+    """
+    for candidate in requested:
+        if candidate in available:
+            continue
+        raise typer.BadParameter(_suggest_column(candidate, available))
+
+
+def _parse_normalize_map(raw: Optional[str]) -> dict[str, str]:
+    """Parse a ``--normalize`` value of the form ``col:type,col:type``.
+
+    Whitespace around columns, types and separators is ignored, and empty
+    segments (for example from a trailing comma) are skipped. Only the first
+    ``:`` in a segment separates the column from the type.
+
+    Args:
+        raw: The raw option value, or ``None``/empty when the flag is absent.
+
+    Returns:
+        Mapping of column name to normalizer type name; empty when ``raw``
+        is empty.
+
+    Raises:
+        typer.BadParameter: if a segment has no ``:`` or has an empty column
+            or type. An empty type would otherwise only fail later, when it
+            is converted to a normalizer.
+    """
+    if not raw:
+        return {}
+
+    result: dict[str, str] = {}
+    for segment in raw.split(","):
+        pair = segment.strip()
+        if not pair:
+            # Tolerate stray commas such as "email:email,".
+            continue
+        col, sep, ntype = pair.partition(":")
+        col, ntype = col.strip(), ntype.strip()
+        if not sep or not col or not ntype:
+            raise typer.BadParameter(
+                f"Invalid normalize format: '{pair}'. "
+                f"Expected 'column:type' (e.g., 'email:email,phone:phone')."
+            )
+        result[col] = ntype
+    return result
+
+
+def _interactive_review(group, df) -> Optional[bool]:
+    """Show one match group on the terminal and ask the user what to do.
+
+    Prints the group header, then every non-empty, non-``_norm_*`` field of
+    each row in the group, and prompts until a valid answer is given.
+
+    Args:
+        group: The match group to review (a ``MatchResult``).
+        df: DataFrame the group's ``row_indices`` refer to positionally.
+
+    Returns:
+        ``True`` for "merge", ``False`` for "keep both", ``None`` for "skip
+        remaining". A closed stdin (EOF) is treated like "skip remaining",
+        because no further question can be answered.
+    """
+    from src.core.dedup import MatchResult
+    group: MatchResult  # annotation only, for editors and type checkers
+
+    print(f"\n{'='*60}")
+    print(f"Match Group {group.group_id + 1} — Confidence: {group.confidence:.1f}%")
+    print(f"Matched on: {', '.join(group.matched_on)}")
+    print(f"{'='*60}")
+
+    # Shadow normalisation columns are an implementation detail; hide them.
+    display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+    for idx in group.row_indices:
+        print(f"\n  Row {idx + 1}:")
+        row = df.iloc[idx]  # fetch the row once instead of once per column
+        for col in display_cols:
+            val = row.get(col, "")
+            if str(val).strip():
+                print(f"    {col}: {val}")
+
+    answers = {"y": True, "n": False, "s": None}
+    while True:
+        try:
+            choice = input("\n  [y] Merge  [n] Keep both  [s] Skip remaining: ").strip().lower()
+        except EOFError:
+            # Non-interactive or closed stdin: stop asking instead of crashing.
+            print()
+            return None
+        if choice in answers:
+            return answers[choice]
+        print("  Please enter y, n, or s.")
+
+
+# ---------------------------------------------------------------------------
+# Main command
+# ---------------------------------------------------------------------------
+
+def _split_columns(raw: Optional[str]) -> list[str]:
+    """Split a comma-separated CLI value into unique, stripped column names.
+
+    Empty segments (e.g. from a trailing comma) are dropped and the first
+    occurrence of each name wins, so the result has a stable order.
+    """
+    if not raw:
+        return []
+    stripped = (part.strip() for part in raw.split(","))
+    return list(dict.fromkeys(name for name in stripped if name))
+
+
+@app.command()
+def dedup(
+    input_file: str = typer.Argument(
+        ...,
+        help="Path to the CSV or Excel file to deduplicate.",
+    ),
+    output: Optional[str] = typer.Option(
+        None, "--output", "-o",
+        help="Output file path. Default: {input}_deduplicated.csv",
+    ),
+    apply: bool = typer.Option(
+        False, "--apply",
+        help="Write the output file. Without this flag, only a preview is shown.",
+    ),
+    key: Optional[str] = typer.Option(
+        None, "--key", "-k",
+        help="Comma-separated strong-key columns (e.g., 'fb_id,ein'). Each is an independent exact-match dedup key.",
+    ),
+    subset: Optional[str] = typer.Option(
+        None, "--subset", "-s",
+        help="Comma-separated columns to match on (default: auto-detect).",
+    ),
+    fuzzy: Optional[str] = typer.Option(
+        None, "--fuzzy",
+        help="Comma-separated columns to fuzzy-match (others use exact match).",
+    ),
+    algorithm: str = typer.Option(
+        "jaro_winkler", "--algorithm", "-a",
+        help="Fuzzy algorithm: levenshtein, jaro_winkler, or token_set_ratio.",
+    ),
+    threshold: int = typer.Option(
+        85, "--threshold", "-t",
+        min=0, max=100,
+        help="Similarity threshold 0-100 for fuzzy matching.",
+    ),
+    normalize: Optional[str] = typer.Option(
+        None, "--normalize",
+        help="Column normalizers as 'col:type' pairs (e.g., 'email:email,phone:phone').",
+    ),
+    survivor: str = typer.Option(
+        "first", "--survivor",
+        help="Survivor rule: first, last, most-complete, or most-recent.",
+    ),
+    date_column: Optional[str] = typer.Option(
+        None, "--date-column",
+        help="Date column for most-recent survivor rule.",
+    ),
+    merge: bool = typer.Option(
+        False, "--merge",
+        help="Fill missing fields in the surviving row from removed duplicates.",
+    ),
+    review: bool = typer.Option(
+        False, "--review",
+        help="Interactively review each match group before merging.",
+    ),
+    config: Optional[str] = typer.Option(
+        None, "--config",
+        help="Load settings from a saved JSON config file.",
+    ),
+    save_config: Optional[str] = typer.Option(
+        None, "--save-config",
+        help="Save current settings to a JSON config file.",
+    ),
+    sheet: Optional[str] = typer.Option(
+        None, "--sheet",
+        help="Excel sheet name or index (default: first sheet).",
+    ),
+    encoding_override: Optional[str] = typer.Option(
+        None, "--encoding",
+        help="Override auto-detected file encoding.",
+    ),
+    header_row: Optional[int] = typer.Option(
+        None, "--header-row",
+        help="0-based row index for the header (default: auto-detect).",
+    ),
+):
+    """Find and remove duplicate rows in CSV and Excel files."""
+    # NOTE: the docstring above is shown by Typer as help text, so the
+    # maintainer documentation lives in comments instead.
+    #
+    # Flow: load optional config -> read the input -> build match strategies
+    # (config > --subset/--fuzzy > auto-detect, plus --normalize and --key)
+    # -> resolve survivor rule -> optionally save config -> run the engine
+    # -> print a summary -> with --apply, write the result files.
+    #
+    # When --config is given, its survivor rule, merge flag and date column
+    # take precedence over --survivor/--merge/--date-column.
+    #
+    # Exits with status 1 (typer.Exit) on a missing/unreadable input or
+    # config file and on an unknown survivor rule; raises typer.BadParameter
+    # for unknown columns, algorithms or normalizer types.
+    import pandas as pd
+
+    from src.core.io import read_file, write_file
+    from src.core.dedup import (
+        Algorithm, ColumnMatchStrategy, MatchStrategy, SurvivorRule,
+        build_default_strategies, deduplicate,
+    )
+    from src.core.normalizers import NormalizerType
+    from src.core.config import (
+        ColumnStrategyConfig, DeduplicationConfig, StrategyConfig,
+    )
+
+    def _normalizer_for(col: str, type_name: str) -> NormalizerType:
+        # Turn an unknown normalizer name into a usage error, not a traceback.
+        try:
+            return NormalizerType(type_name)
+        except ValueError:
+            raise typer.BadParameter(
+                f"Unknown normalizer type '{type_name}' for column '{col}'."
+            ) from None
+
+    # Setup
+    input_path = Path(input_file)
+    if not input_path.exists():
+        typer.echo(f"Error: File not found: {input_path}", err=True)
+        raise typer.Exit(1)
+
+    log_path = _setup_logging(Path("logs"))
+
+    # Load config if provided
+    cfg: Optional[DeduplicationConfig] = None
+    if config:
+        config_path = Path(config)
+        if not config_path.exists():
+            typer.echo(f"Error: Config file not found: {config_path}", err=True)
+            raise typer.Exit(1)
+        try:
+            cfg = DeduplicationConfig.from_file(config_path)
+        except (OSError, ValueError, TypeError, AttributeError) as e:
+            # Unreadable file, malformed JSON, or JSON of the wrong shape.
+            typer.echo(f"Error: Could not load config {config_path}: {e}", err=True)
+            raise typer.Exit(1) from e
+        logger.info("Loaded config from {}", config_path)
+
+    # Read input
+    typer.echo(f"Reading {input_path.name}...")
+    # A purely numeric --sheet value is passed on as an index, anything else
+    # as a sheet name; without --sheet the first sheet (index 0) is used.
+    sheet_arg: str | int = 0
+    if sheet is not None:
+        try:
+            sheet_arg = int(sheet)
+        except ValueError:
+            sheet_arg = sheet
+
+    try:
+        df = read_file(
+            input_path,
+            encoding=encoding_override,
+            header_row=header_row,
+            sheet_name=sheet_arg,
+        )
+        if not isinstance(df, pd.DataFrame):
+            # chunked reading returns generator — materialise for v1
+            df = pd.concat(list(df), ignore_index=True)
+    except Exception as e:
+        # Deliberately broad: this is the CLI boundary, and any reader failure
+        # should become a one-line error rather than a traceback.
+        typer.echo(f"Error reading file: {e}", err=True)
+        raise typer.Exit(1) from e
+
+    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")
+    available_columns = list(df.columns)
+
+    # Build strategies
+    strategies: Optional[list[MatchStrategy]] = None
+
+    if cfg and cfg.strategies:
+        try:
+            strategies = cfg.to_strategies()
+        except ValueError as e:
+            typer.echo(f"Error: Invalid strategy in config: {e}", err=True)
+            raise typer.Exit(1) from e
+    elif subset or fuzzy:
+        # Build from CLI flags
+        normalize_map = _parse_normalize_map(normalize)
+
+        fuzzy_cols = _split_columns(fuzzy)
+        # When only --fuzzy is given, match on just those columns. Lists (not
+        # sets) keep the column order identical from run to run.
+        subset_cols = _split_columns(subset) or list(fuzzy_cols)
+        if not subset_cols:
+            raise typer.BadParameter("--subset/--fuzzy did not name any columns.")
+
+        _validate_columns(subset_cols, available_columns)
+        _validate_columns(fuzzy_cols, available_columns)
+
+        ignored = [c for c in fuzzy_cols if c not in subset_cols]
+        if ignored:
+            logger.warning(
+                "Ignoring --fuzzy column(s) not listed in --subset: {}",
+                ", ".join(ignored),
+            )
+
+        fuzzy_algo = None
+        if fuzzy_cols:
+            try:
+                fuzzy_algo = Algorithm(algorithm)
+            except ValueError:
+                choices = ", ".join(a.value for a in Algorithm)
+                raise typer.BadParameter(
+                    f"Unknown algorithm '{algorithm}'. Choose from: {choices}."
+                ) from None
+
+        fuzzy_lookup = set(fuzzy_cols)
+        col_strats: list[ColumnMatchStrategy] = []
+        for col in subset_cols:
+            norm = None
+            if col in normalize_map:
+                norm = _normalizer_for(col, normalize_map[col])
+
+            if col in fuzzy_lookup:
+                algo, thresh = fuzzy_algo, float(threshold)
+            else:
+                algo, thresh = Algorithm.EXACT, 100.0
+
+            col_strats.append(ColumnMatchStrategy(
+                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
+            ))
+
+        strategies = [MatchStrategy(column_strategies=col_strats)]
+
+    # Apply normalizer overrides even with auto-detect
+    if normalize and strategies is None:
+        normalize_map = _parse_normalize_map(normalize)
+        strategies = build_default_strategies(df)
+        for strat in strategies:
+            for cs in strat.column_strategies:
+                if cs.column in normalize_map:
+                    cs.normalizer = _normalizer_for(cs.column, normalize_map[cs.column])
+
+    # --key: add user-declared strong keys as standalone exact-match strategies
+    if key:
+        key_cols = _split_columns(key)
+        _validate_columns(key_cols, available_columns)
+        key_strats = [
+            MatchStrategy(column_strategies=[
+                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
+            ])
+            for col in key_cols
+        ]
+        if strategies is None:
+            # Combine with auto-detect so user gets both
+            strategies = build_default_strategies(df) + key_strats
+        else:
+            strategies.extend(key_strats)
+
+    # Survivor rule
+    if cfg:
+        try:
+            surv_rule = cfg.to_survivor_rule()
+        except ValueError as e:
+            typer.echo(f"Error: Invalid survivor rule in config: {e}", err=True)
+            raise typer.Exit(1) from e
+        do_merge = cfg.merge
+        dc = cfg.date_column
+    else:
+        # Accept both "most-complete" and "most_complete" spellings; the enum
+        # values use underscores.
+        try:
+            surv_rule = SurvivorRule(survivor.lower().replace("-", "_"))
+        except ValueError:
+            typer.echo(
+                f"Error: Unknown survivor rule '{survivor}'. "
+                f"Choose from: first, last, most-complete, most-recent.",
+                err=True,
+            )
+            raise typer.Exit(1) from None
+        do_merge = merge
+        dc = date_column
+
+    # Save config if requested
+    if save_config:
+        save_cfg = DeduplicationConfig(
+            survivor_rule=surv_rule.value,
+            date_column=dc,
+            merge=do_merge,
+            subset_columns=_split_columns(subset) if subset else None,
+            fuzzy_columns=_split_columns(fuzzy) if fuzzy else None,
+            default_algorithm=algorithm,
+            default_threshold=float(threshold),
+            normalize_map=_parse_normalize_map(normalize),
+        )
+        if strategies:
+            save_cfg.strategies = [
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(
+                        column=cs.column,
+                        algorithm=cs.algorithm.value,
+                        threshold=cs.threshold,
+                        normalizer=cs.normalizer.value if cs.normalizer else None,
+                    )
+                    for cs in s.column_strategies
+                ])
+                for s in strategies
+            ]
+        saved = save_cfg.to_file(save_config)
+        typer.echo(f"Config saved to {saved}")
+
+    # Progress bar (only worth the overhead for larger inputs)
+    progress_cb = None
+    pbar = None
+    if len(df) > 10_000:
+        from tqdm import tqdm
+        pbar = tqdm(total=len(df) * (len(df) - 1) // 2, desc="Comparing rows",
+                    unit="pairs", leave=False)
+
+        def _progress(current: int, total: int):
+            # The engine reports an absolute count; tqdm wants a delta.
+            pbar.update(current - pbar.n)
+            if current >= total:
+                pbar.close()
+
+        progress_cb = _progress
+
+    # Review callback
+    review_cb = _interactive_review if review else None
+
+    # Run dedup
+    typer.echo("Finding duplicates...")
+    try:
+        result = deduplicate(
+            df,
+            strategies=strategies,
+            survivor_rule=surv_rule,
+            date_column=dc,
+            merge=do_merge,
+            preview=not apply,
+            review_callback=review_cb,
+            progress_callback=progress_cb,
+        )
+    finally:
+        # Close the bar even if the engine raises or never reports the final
+        # total; closing an already-closed bar is harmless.
+        if pbar is not None:
+            pbar.close()
+
+    # Print results
+    _print_results(result, input_path)
+
+    # Write output files
+    if apply:
+        stem = input_path.stem
+
+        out_path = Path(output) if output else input_path.parent / f"{stem}_deduplicated.csv"
+        write_file(result.deduplicated_df, out_path)
+        typer.echo(f"\nDeduplicated file: {out_path}")
+
+        if not result.removed_df.empty:
+            removed_path = input_path.parent / f"{stem}_removed.csv"
+            write_file(result.removed_df, removed_path)
+            typer.echo(f"Removed rows:     {removed_path}")
+
+        if result.match_groups:
+            groups_path = input_path.parent / f"{stem}_match_groups.csv"
+            _write_match_groups(result, df, groups_path)
+            typer.echo(f"Match groups:     {groups_path}")
+    else:
+        typer.echo("\nThis was a preview. Add --apply to write the output files.")
+
+    typer.echo(f"Log: {log_path}")
+
+
+# ---------------------------------------------------------------------------
+# Output formatting
+# ---------------------------------------------------------------------------
+
+def _print_results(result, input_path: Path) -> None:
+    """Echo a short summary of a deduplication run.
+
+    Prints the file name, row counts before and after, the number of removed
+    rows and of match groups, followed by one line per match group. Only the
+    first 20 groups are listed; the rest are summarised in a final line.
+    """
+    rows_in = result.original_row_count
+    rows_out = len(result.deduplicated_df)
+    divider = "─" * 50
+
+    typer.echo(f"\n{divider}")
+    typer.echo(f"  File:      {input_path.name}")
+    typer.echo(f"  Rows in:   {rows_in}")
+    typer.echo(f"  Rows out:  {rows_out}")
+    typer.echo(f"  Removed:   {rows_in - rows_out}")
+    typer.echo(f"  Groups:    {len(result.match_groups)}")
+    typer.echo(divider)
+
+    groups = result.match_groups
+    if not groups:
+        return
+
+    shown_cap = 20  # keep the terminal output readable
+    typer.echo("\nMatch groups:")
+    for grp in groups[:shown_cap]:
+        # Row numbers are shown 1-based.
+        members = ", ".join(str(i + 1) for i in grp.row_indices)
+        typer.echo(
+            f"  Group {grp.group_id + 1}: rows [{members}] "
+            f"→ keep row {grp.survivor_index + 1} "
+            f"(confidence: {grp.confidence:.1f}%, "
+            f"matched on: {', '.join(grp.matched_on)})"
+        )
+
+    hidden = len(groups) - shown_cap
+    if hidden > 0:
+        typer.echo(f"  ... and {hidden} more groups")
+
+
+def _write_match_groups(result, original_df, path: Path) -> None:
+    """Write every row of every match group to a CSV for audit.
+
+    Each output row carries five bookkeeping columns (``_group_id``,
+    ``_is_survivor``, ``_confidence``, ``_matched_on``, ``_original_row``)
+    followed by the row's values from ``original_df``. Group ids and row
+    numbers are written 1-based.
+
+    Args:
+        result: Deduplication result whose ``match_groups`` are exported.
+        original_df: DataFrame the groups' row indices refer to positionally.
+        path: Destination handed to ``write_file``.
+    """
+    import pandas as pd
+    from src.core.io import write_file
+
+    columns = list(original_df.columns)
+    rows = []
+    for g in result.match_groups:
+        matched_on = ", ".join(g.matched_on)  # identical for the whole group
+        for idx in g.row_indices:
+            source_row = original_df.iloc[idx]  # one lookup per row, not per column
+            row_data = {
+                "_group_id": g.group_id + 1,
+                "_is_survivor": idx == g.survivor_index,
+                "_confidence": g.confidence,
+                "_matched_on": matched_on,
+                "_original_row": idx + 1,
+            }
+            # Include original data
+            for col in columns:
+                row_data[col] = source_row.get(col, "")
+            rows.append(row_data)
+
+    write_file(pd.DataFrame(rows), path)
+
+
+# ---------------------------------------------------------------------------
+# __main__ support
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Run the Typer application; used by ``python -m src`` and the guard below."""
+    app()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/core/init.py
+++ b/src/core/init.py
@@ -0,0 +1,93 @@
+"""DataTools deduplication engine.
+
+Public API
+----------
+Core:
+    deduplicate(df, ...) -> DeduplicationResult
+    build_default_strategies(df) -> list[MatchStrategy]
+
+Types:
+    Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
+    MatchResult, DeduplicationResult
+
+Normalizers:
+    get_normalizer(type) -> Callable
+    NormalizerType
+    normalize_email, normalize_phone, normalize_name,
+    normalize_address, normalize_string
+
+I/O:
+    read_file(path, ...) -> DataFrame
+    write_file(df, path, ...)
+    list_sheets(path) -> list[str]
+    detect_encoding, detect_delimiter, detect_header_row
+
+Configuration:
+    DeduplicationConfig.from_file(path) -> DeduplicationConfig
+    DeduplicationConfig.to_file(path) -> Path
+    StrategyConfig, ColumnStrategyConfig
+"""
+
+from .dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    DeduplicationResult,
+    MatchResult,
+    MatchStrategy,
+    SurvivorRule,
+    build_default_strategies,
+    deduplicate,
+)
+from .normalizers import (
+    NormalizerType,
+    get_normalizer,
+    normalize_address,
+    normalize_email,
+    normalize_name,
+    normalize_phone,
+    normalize_string,
+)
+from .io import (
+    detect_delimiter,
+    detect_encoding,
+    detect_header_row,
+    list_sheets,
+    read_file,
+    write_file,
+)
+from .config import (
+    ColumnStrategyConfig,
+    DeduplicationConfig,
+    StrategyConfig,
+)
+
+__all__ = [
+    # Core
+    "deduplicate",
+    "build_default_strategies",
+    # Types
+    "Algorithm",
+    "SurvivorRule",
+    "ColumnMatchStrategy",
+    "MatchStrategy",
+    "MatchResult",
+    "DeduplicationResult",
+    # Normalizers
+    "NormalizerType",
+    "get_normalizer",
+    "normalize_email",
+    "normalize_phone",
+    "normalize_name",
+    "normalize_address",
+    "normalize_string",
+    # I/O
+    "read_file",
+    "write_file",
+    "list_sheets",
+    "detect_encoding",
+    "detect_delimiter",
+    "detect_header_row",
+    # Config
+    "DeduplicationConfig",
+    "StrategyConfig",
+    "ColumnStrategyConfig",
+]
--- a/src/core/config.py
+++ b/src/core/config.py
@@ -0,0 +1,117 @@
+"""Configuration profiles: save/load deduplication settings as JSON."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Optional
+
+from .dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    MatchStrategy,
+    NormalizerType,
+    SurvivorRule,
+)
+
+
+@dataclass
+class ColumnStrategyConfig:
+    """JSON-serializable mirror of ColumnMatchStrategy.
+
+    Stores the settings as plain strings/floats so they can be dumped to and
+    rebuilt from JSON; ``DeduplicationConfig.to_strategies`` converts them
+    back into ``Algorithm`` / ``NormalizerType`` values.
+    """
+    # Name of the column to compare.
+    column: str
+    # An ``Algorithm`` value, stored as its string form.
+    algorithm: str = "exact"
+    # Similarity threshold, passed through unchanged to ColumnMatchStrategy.
+    threshold: float = 100.0
+    # A ``NormalizerType`` value as a string, or None for no normalizer.
+    normalizer: Optional[str] = None
+
+
+@dataclass
+class StrategyConfig:
+    """JSON-serializable mirror of MatchStrategy."""
+    # One entry per column taking part in this strategy; becomes the
+    # ``column_strategies`` list of the rebuilt MatchStrategy.
+    columns: list[ColumnStrategyConfig] = field(default_factory=list)
+
+
+@dataclass
+class DeduplicationConfig:
+    """All deduplication settings as a flat JSON-serializable structure.
+
+    Instances round-trip through ``to_dict``/``from_dict`` and
+    ``to_file``/``from_file``. ``to_strategies`` and ``to_survivor_rule``
+    convert the stored strings back into the engine's own types.
+    """
+
+    # Explicit match strategies; empty means "let the engine auto-detect".
+    strategies: list[StrategyConfig] = field(default_factory=list)
+    # A ``SurvivorRule`` value, stored as its string form.
+    survivor_rule: str = "first"
+    date_column: Optional[str] = None
+    merge: bool = False
+    subset_columns: Optional[list[str]] = None
+    fuzzy_columns: Optional[list[str]] = None
+    default_algorithm: str = "jaro_winkler"
+    default_threshold: float = 85.0
+    normalize_map: Optional[dict[str, str]] = None  # column -> normalizer type
+
+    # -----------------------------------------------------------------------
+    # Serialisation
+    # -----------------------------------------------------------------------
+
+    def to_dict(self) -> dict:
+        """Return the configuration as nested plain dicts and lists."""
+        return asdict(self)
+
+    def to_file(self, path: str | Path) -> Path:
+        """Save configuration to a JSON file.
+
+        Missing parent directories are created, and the file is written as
+        UTF-8 regardless of the platform's default encoding.
+
+        Returns:
+            The path that was written.
+        """
+        out = Path(path)
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
+        return out
+
+    @classmethod
+    def from_dict(cls, data: dict) -> DeduplicationConfig:
+        """Build a configuration from a dict as produced by ``to_dict``.
+
+        Missing keys fall back to the dataclass defaults. A ``strategies`` or
+        ``columns`` entry that is absent or ``null`` is treated as empty.
+
+        Raises:
+            TypeError: if a column entry carries keys that
+                ``ColumnStrategyConfig`` does not accept.
+        """
+        strategies = [
+            StrategyConfig(columns=[
+                ColumnStrategyConfig(**col) for col in (strat.get("columns") or [])
+            ])
+            for strat in (data.get("strategies") or [])
+        ]
+        return cls(
+            strategies=strategies,
+            survivor_rule=data.get("survivor_rule", "first"),
+            date_column=data.get("date_column"),
+            merge=data.get("merge", False),
+            subset_columns=data.get("subset_columns"),
+            fuzzy_columns=data.get("fuzzy_columns"),
+            default_algorithm=data.get("default_algorithm", "jaro_winkler"),
+            default_threshold=data.get("default_threshold", 85.0),
+            normalize_map=data.get("normalize_map"),
+        )
+
+    @classmethod
+    def from_file(cls, path: str | Path) -> DeduplicationConfig:
+        """Load configuration from a JSON file.
+
+        The file is decoded as UTF-8; a leading byte-order mark, as some
+        editors add, is accepted and ignored.
+        """
+        data = json.loads(Path(path).read_text(encoding="utf-8-sig"))
+        return cls.from_dict(data)
+
+    @classmethod
+    def default(cls) -> DeduplicationConfig:
+        """Return sensible defaults (auto-detect strategies at runtime)."""
+        return cls()
+
+    # -----------------------------------------------------------------------
+    # Convert to engine objects
+    # -----------------------------------------------------------------------
+
+    def to_strategies(self) -> Optional[list[MatchStrategy]]:
+        """Convert the config back to MatchStrategy objects.
+
+        Returns None if no explicit strategies are configured
+        (the engine will auto-detect).
+
+        Raises:
+            ValueError: if a stored algorithm or normalizer name is not a
+                valid ``Algorithm`` / ``NormalizerType`` value.
+        """
+        if not self.strategies:
+            return None
+
+        return [
+            MatchStrategy(column_strategies=[
+                ColumnMatchStrategy(
+                    column=cc.column,
+                    algorithm=Algorithm(cc.algorithm),
+                    threshold=cc.threshold,
+                    normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None,
+                )
+                for cc in sc.columns
+            ])
+            for sc in self.strategies
+        ]
+
+    def to_survivor_rule(self) -> SurvivorRule:
+        """Return ``survivor_rule`` as a ``SurvivorRule`` member.
+
+        Raises:
+            ValueError: if the stored string is not a valid rule value.
+        """
+        return SurvivorRule(self.survivor_rule)
--- a/src/core/dedup.py
+++ b/src/core/dedup.py
@@ -0,0 +1,568 @@
+"""Deduplication engine: matching, survivor selection, and merge.
+
+Core algorithm:
+1. Normalise columns → shadow ``_norm_*`` columns (computed once).
+2. Pairwise comparison within each strategy → candidate pairs.
+3. Union-find for transitive closure (A~B, B~C ⇒ one group).
+4. Multi-strategy OR: feed all pairs from all strategies into the same union-find.
+5. Survivor selection per group + optional field merge.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Callable, Optional
+
+import pandas as pd
+from loguru import logger
+from rapidfuzz import fuzz as rf_fuzz
+from rapidfuzz import distance as rf_distance
+
+from .normalizers import NormalizerType, get_normalizer
+
+
+# ---------------------------------------------------------------------------
+# Enums & data structures
+# ---------------------------------------------------------------------------
+
+class Algorithm(str, Enum):
+    """Similarity algorithms a column strategy can select.
+
+    Members are also ``str`` instances, so each one compares equal to its
+    plain string value.
+    """
+    # String equality: scores 100 on equal values, 0 otherwise.
+    EXACT = "exact"
+    # Scored with rapidfuzz ``fuzz.ratio``.
+    LEVENSHTEIN = "levenshtein"
+    # rapidfuzz Jaro-Winkler similarity, multiplied by 100.
+    JARO_WINKLER = "jaro_winkler"
+    # Scored with rapidfuzz ``fuzz.token_set_ratio``.
+    TOKEN_SET_RATIO = "token_set_ratio"
+
+
+class SurvivorRule(str, Enum):
+    """Which row of a duplicate group is kept."""
+    # First row of the group (lowest position).
+    KEEP_FIRST = "first"
+    # Last row of the group (highest position).
+    KEEP_LAST = "last"
+    # Row with the fewest empty/blank cells; ties go to the earliest row.
+    KEEP_MOST_COMPLETE = "most_complete"
+    # Row with the latest parseable value in the configured date column.
+    KEEP_MOST_RECENT = "most_recent"
+
+
+@dataclass
+class ColumnMatchStrategy:
+    """How to match on a single column."""
+    # Name of the DataFrame column to compare.
+    column: str
+    # Similarity function used to score the two cell values.
+    algorithm: Algorithm = Algorithm.EXACT
+    # Minimum score for the column to count as matching.
+    threshold: float = 100.0  # 0-100 scale
+    # When set, comparison reads the ``_norm_<column>`` shadow column
+    # instead of the raw column.
+    normalizer: Optional[NormalizerType] = None
+
+
+@dataclass
+class MatchStrategy:
+    """A set of column strategies combined with AND.
+
+    Multiple ``MatchStrategy`` instances are combined with OR at the top level.
+    """
+    # Every listed column must reach its threshold for two rows to match.
+    # Columns that are empty in both rows are skipped rather than failed.
+    column_strategies: list[ColumnMatchStrategy]
+
+
+@dataclass
+class MatchResult:
+    """One group of duplicate rows."""
+    # Sequential id assigned when groups are built (0, 1, 2, ...).
+    group_id: int
+    # Positional (``iloc``) row numbers of the group members, ascending.
+    row_indices: list[int]
+    # Lowest confidence among the directly matched pairs of the group.
+    confidence: float            # min confidence across pairs in the group
+    # Sorted union of the columns reported by those pairs.
+    matched_on: list[str]        # column names that contributed to the match
+    # Initialised to the first member; overwritten once a survivor rule runs.
+    survivor_index: int          # index of the row to keep
+
+
+@dataclass
+class DeduplicationResult:
+    """Full result of a deduplication run."""
+    # Number of rows in the input DataFrame.
+    original_row_count: int
+    # Rows that were kept, without ``_norm_*`` columns, index reset.
+    deduplicated_df: pd.DataFrame
+    # Rows that were dropped; a column-less empty frame when none were.
+    removed_df: pd.DataFrame
+    # Duplicate groups that were acted on (after any reviewer filtering).
+    match_groups: list[MatchResult]
+    # Human-readable trace of strategies, decisions and merges.
+    log_entries: list[str] = field(default_factory=list)
+    # Copy of the ``preview`` flag the run was started with.
+    is_preview: bool = True
+
+
+# ---------------------------------------------------------------------------
+# Union-Find
+# ---------------------------------------------------------------------------
+
+class _UnionFind:
+    """Disjoint-set forest used to turn matched pairs into connected groups."""
+
+    def __init__(self, n: int):
+        # Every element starts as the root of its own one-element tree.
+        self._parent = [element for element in range(n)]
+        self._rank = [0 for _ in range(n)]
+
+    def find(self, x: int) -> int:
+        """Return the root of the tree containing *x*, shortening the path walked."""
+        parent = self._parent
+        node = x
+        while parent[node] != node:
+            # Path halving: re-point the node at its grandparent, then hop there.
+            grandparent = parent[parent[node]]
+            parent[node] = grandparent
+            node = grandparent
+        return node
+
+    def union(self, a: int, b: int) -> None:
+        """Merge the trees containing *a* and *b* (no-op if already joined)."""
+        root_a = self.find(a)
+        root_b = self.find(b)
+        if root_a == root_b:
+            return
+        # The higher-ranked root stays on top; on a tie the root of *a* wins.
+        if self._rank[root_a] >= self._rank[root_b]:
+            winner, loser = root_a, root_b
+        else:
+            winner, loser = root_b, root_a
+        self._parent[loser] = winner
+        if self._rank[winner] == self._rank[loser]:
+            self._rank[winner] += 1
+
+    def groups(self) -> dict[int, list[int]]:
+        """Map each root to its members, leaving out one-element groups."""
+        by_root: dict[int, list[int]] = {}
+        for element in range(len(self._parent)):
+            by_root.setdefault(self.find(element), []).append(element)
+        return {root: members for root, members in by_root.items() if len(members) > 1}
+
+
+# ---------------------------------------------------------------------------
+# Similarity computation
+# ---------------------------------------------------------------------------
+
+def _compute_similarity(val_a: str, val_b: str, algorithm: Algorithm) -> float:
+    """Score how alike two strings are, on a 0-100 scale.
+
+    Raises ``ValueError`` when *algorithm* is not one of the supported members.
+    """
+    if algorithm == Algorithm.EXACT:
+        score = 100.0 if val_a == val_b else 0.0
+    elif algorithm == Algorithm.LEVENSHTEIN:
+        score = rf_fuzz.ratio(val_a, val_b)
+    elif algorithm == Algorithm.JARO_WINKLER:
+        # The ``* 100`` puts this branch on the same 0-100 scale as the others.
+        # NOTE(review): rapidfuzz documents JaroWinkler.similarity as 0-1 --
+        # confirm against the installed version.
+        score = rf_distance.JaroWinkler.similarity(val_a, val_b) * 100
+    elif algorithm == Algorithm.TOKEN_SET_RATIO:
+        score = rf_fuzz.token_set_ratio(val_a, val_b)
+    else:
+        raise ValueError(f"Unknown algorithm: {algorithm}")
+    return score
+
+
+# ---------------------------------------------------------------------------
+# Pair comparison
+# ---------------------------------------------------------------------------
+
+def _compare_pair(
+    row_a: pd.Series,
+    row_b: pd.Series,
+    strategy: MatchStrategy,
+    norm_prefix: str = "_norm_",
+) -> tuple[bool, float, list[str]]:
+    """Compare two rows using a single MatchStrategy (AND of column strategies).
+
+    Parameters
+    ----------
+    row_a, row_b : the two rows to compare
+    strategy : column strategies that must all pass
+    norm_prefix : prefix of the shadow column read for normalised columns
+
+    Returns ``(is_match, confidence, matched_columns)``.  *confidence* is the
+    lowest score among the compared columns.  A column that is empty in both
+    rows is skipped; a column that is empty in only one row fails the whole
+    comparison.  If every column was skipped the rows do not match.
+    """
+    min_score = 100.0
+    matched_cols: list[str] = []
+
+    for cs in strategy.column_strategies:
+        col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
+        # Missing cells (None/NaN) must read as empty.  A bare ``str()`` would
+        # turn them into the texts "None"/"nan" and two missing cells would
+        # then look like a perfect match.
+        va = _cell_as_text(row_a.get(col, ""))
+        vb = _cell_as_text(row_b.get(col, ""))
+
+        # Skip if both empty
+        if not va and not vb:
+            continue
+        # If one empty and one not — no match for this column
+        if not va or not vb:
+            return False, 0.0, []
+
+        score = _compute_similarity(va, vb, cs.algorithm)
+        if score < cs.threshold:
+            return False, 0.0, []
+        min_score = min(min_score, score)
+        matched_cols.append(cs.column)
+
+    if not matched_cols:
+        return False, 0.0, []
+
+    return True, min_score, matched_cols
+
+
+def _cell_as_text(value) -> str:
+    """Return *value* as text, mapping missing scalars (None/NaN/NaT) to ``""``."""
+    if isinstance(value, str):
+        return value
+    if pd.api.types.is_scalar(value) and pd.isna(value):
+        return ""
+    return str(value)
+
+
+# ---------------------------------------------------------------------------
+# Match-group finding
+# ---------------------------------------------------------------------------
+
+def _find_match_groups(
+    df: pd.DataFrame,
+    strategies: list[MatchStrategy],
+    *,
+    progress_callback: Optional[Callable[[int, int], None]] = None,
+) -> tuple[list[MatchResult], dict[tuple[int, int], tuple[float, list[str]]]]:
+    """Pairwise comparison + union-find for transitive closure.
+
+    Every pair of rows is compared (n*(n-1)/2 comparisons).  Strategies are
+    tried in order and the first one that matches decides the pair, so the
+    recorded confidence is that strategy's confidence.
+
+    Parameters
+    ----------
+    df : frame to scan; rows are addressed by position
+    strategies : alternatives combined with OR
+    progress_callback : called with ``(checked, total)`` every 1000 pairs
+        and once more at the end
+
+    Returns ``(match_groups, pair_info)`` where *pair_info* maps
+    ``(i, j)`` → ``(confidence, matched_columns)`` for logging.
+    """
+    n = len(df)
+    uf = _UnionFind(n)
+    pair_info: dict[tuple[int, int], tuple[float, list[str]]] = {}
+    total_pairs = n * (n - 1) // 2
+    checked = 0
+
+    # Build each row Series once.  ``df.iloc[k]`` constructs a new object on
+    # every call, and the loops below would otherwise repeat that for every
+    # pair and every strategy.
+    rows = [df.iloc[k] for k in range(n)]
+
+    for i in range(n):
+        row_i = rows[i]
+        for j in range(i + 1, n):
+            row_j = rows[j]
+            for strategy in strategies:
+                is_match, confidence, cols = _compare_pair(row_i, row_j, strategy)
+                if is_match:
+                    uf.union(i, j)
+                    # Each (i, j) is visited exactly once, so this never
+                    # overwrites an earlier entry.
+                    pair_info[(i, j)] = (confidence, cols)
+                    break  # OR logic: one strategy match is enough
+
+            checked += 1
+            if progress_callback and checked % 1000 == 0:
+                progress_callback(checked, total_pairs)
+
+    if progress_callback:
+        progress_callback(total_pairs, total_pairs)
+
+    # Build MatchResult objects (survivor not yet selected)
+    raw_groups = uf.groups()
+    match_groups: list[MatchResult] = []
+    for gid, (_root, members) in enumerate(sorted(raw_groups.items())):
+        # Confidence = min across the directly matched pairs of the group.
+        # Members joined only through a third row have no entry of their own.
+        group_confidence = 100.0
+        group_cols: set[str] = set()
+        for pos, first in enumerate(members):
+            for second in members[pos + 1:]:
+                key = (min(first, second), max(first, second))
+                if key in pair_info:
+                    conf, cols = pair_info[key]
+                    group_confidence = min(group_confidence, conf)
+                    group_cols.update(cols)
+
+        match_groups.append(MatchResult(
+            group_id=gid,
+            row_indices=members,
+            confidence=round(group_confidence, 2),
+            matched_on=sorted(group_cols),
+            survivor_index=members[0],  # placeholder
+        ))
+
+    return match_groups, pair_info
+
+
+# ---------------------------------------------------------------------------
+# Survivor selection
+# ---------------------------------------------------------------------------
+
+def _select_survivor(
+    group: MatchResult,
+    df: pd.DataFrame,
+    rule: SurvivorRule,
+    date_column: Optional[str] = None,
+) -> int:
+    """Pick which member of *group* is kept, according to *rule*.
+
+    Returns one of ``group.row_indices``.  Ties always go to the earliest
+    member, and an unrecognised rule behaves like keep-first.
+    """
+    candidates = group.row_indices
+    first = candidates[0]
+
+    if rule == SurvivorRule.KEEP_LAST:
+        return candidates[-1]
+
+    if rule == SurvivorRule.KEEP_MOST_COMPLETE:
+        # ``min`` keeps the first of equally good candidates.
+        return min(candidates, key=lambda pos: _count_empty(df.iloc[pos]))
+
+    if rule == SurvivorRule.KEEP_MOST_RECENT:
+        if not date_column or date_column not in df.columns:
+            logger.warning("date_column '{}' not found; falling back to keep_first", date_column)
+            return first
+        winner = first
+        latest = _parse_date(df.iloc[first].get(date_column, ""))
+        for pos in candidates[1:]:
+            parsed = _parse_date(df.iloc[pos].get(date_column, ""))
+            if parsed is None:
+                continue
+            if latest is None or parsed > latest:
+                winner, latest = pos, parsed
+        return winner
+
+    # KEEP_FIRST, and the fallback for anything else.
+    return first
+
+
+def _count_empty(row: pd.Series) -> int:
+    """Return how many cells of *row* are missing or whitespace-only.
+
+    Cells under ``_norm_*`` shadow columns are not counted.
+    """
+    def _is_counted(label, cell) -> bool:
+        if isinstance(label, str) and label.startswith("_norm_"):
+            return False
+        return pd.isna(cell) or str(cell).strip() == ""
+
+    return sum(1 for label, cell in row.items() if _is_counted(label, cell))
+
+
+def _parse_date(value) -> Optional[pd.Timestamp]:
+    """Parse *value* into a timestamp, or return ``None`` when there is none.
+
+    ``None`` is returned when parsing raises and also when pandas yields
+    something other than a real timestamp.  The latter matters for empty
+    strings and NaN, which ``pd.to_datetime`` turns into ``NaT``: ``NaT``
+    compares false against every date, so a caller holding it as the
+    current best could never replace it.
+    """
+    try:
+        parsed = pd.to_datetime(value)
+    except Exception:
+        # Best effort by design: an unparseable cell is treated as "no date".
+        return None
+    # ``NaT`` (and ``None``) are not ``pd.Timestamp`` instances.
+    return parsed if isinstance(parsed, pd.Timestamp) else None
+
+
+# ---------------------------------------------------------------------------
+# Merge mode
+# ---------------------------------------------------------------------------
+
+def _merge_group(df: pd.DataFrame, survivor_idx: int, loser_indices: list[int]) -> pd.Series:
+    """Return a copy of the survivor row with its blank cells filled from the losers.
+
+    For each blank cell the losers are tried in the order given and the first
+    non-blank value is taken.  ``_norm_*`` shadow columns are left untouched,
+    and *df* itself is not modified.
+    """
+    def _is_blank(cell) -> bool:
+        return pd.isna(cell) or str(cell).strip() == ""
+
+    merged = df.iloc[survivor_idx].copy()
+    for label in merged.index:
+        if isinstance(label, str) and label.startswith("_norm_"):
+            continue
+        if not _is_blank(merged[label]):
+            continue
+        for pos in loser_indices:
+            donor = df.iloc[pos][label]
+            if _is_blank(donor):
+                continue
+            merged[label] = donor
+            break
+    return merged
+
+
+# ---------------------------------------------------------------------------
+# Auto-detect strategies
+# ---------------------------------------------------------------------------
+
+# (pattern, normalizer, algorithm, threshold, is_strong_key)
+# Strong keys (email, phone) can be standalone strategies.
+# Weak keys (name, address) must be combined with a strong key via AND.
+#
+# Order matters: build_default_strategies stops at the first pattern that
+# matches a column name.  The email, phone and address patterns are
+# unanchored substring searches; only the name pattern has to match the
+# whole column name.
+# NOTE(review): the bare "cell" alternative also matches names such as
+# "cancelled" -- confirm that is acceptable.
+_COLUMN_TYPE_PATTERNS: list[tuple[re.Pattern, NormalizerType, Algorithm, float, bool]] = [
+    (re.compile(r"e[-_]?mail", re.I), NormalizerType.EMAIL, Algorithm.EXACT, 100.0, True),
+    (re.compile(r"phone|telephone|mobile|cell", re.I), NormalizerType.PHONE, Algorithm.EXACT, 100.0, True),
+    (re.compile(r"^(name|full_name|customer_name|first_name|last_name|contact_name|respondent_name)$", re.I),
+     NormalizerType.NAME, Algorithm.JARO_WINKLER, 85.0, False),
+    (re.compile(r"address|street|addr", re.I), NormalizerType.ADDRESS, Algorithm.TOKEN_SET_RATIO, 80.0, False),
+]
+
+
+def build_default_strategies(df: pd.DataFrame) -> list[MatchStrategy]:
+    """Auto-detect column types and build match strategies.
+
+    Strategy logic:
+    - Strong keys (email, phone): each gets its own standalone OR strategy.
+    - Weak keys (name, address): combined with each strong key via AND to
+      form additional strategies.  Weak keys never stand alone (too many
+      false positives — "John" ≈ "Jon" at 93 % Jaro-Winkler).
+    - If only weak keys are found (no strong keys), they're promoted to
+      standalone strategies as a fallback.
+    - If no columns match, exact match on all columns (drop_duplicates
+      equivalent).
+
+    Column labels that are not strings are never pattern-matched; they only
+    take part in the all-columns fallback.
+
+    Returns a non-empty list of strategies.
+    """
+    strong_cols: list[ColumnMatchStrategy] = []
+    weak_cols: list[ColumnMatchStrategy] = []
+
+    for col in df.columns:
+        # Labels are not guaranteed to be strings (a numeric header cell
+        # yields a numeric label); ``startswith`` and ``pattern.search``
+        # would raise on those.
+        if not isinstance(col, str) or col.startswith("_norm_"):
+            continue
+        for pattern, norm_type, algo, threshold, is_strong in _COLUMN_TYPE_PATTERNS:
+            if pattern.search(col):
+                cs = ColumnMatchStrategy(
+                    column=col, algorithm=algo,
+                    threshold=threshold, normalizer=norm_type,
+                )
+                if is_strong:
+                    strong_cols.append(cs)
+                else:
+                    weak_cols.append(cs)
+                break  # first matching pattern decides the column type
+
+    strategies: list[MatchStrategy] = []
+
+    if strong_cols:
+        # Each strong key is a standalone strategy (OR)
+        strategies.extend(MatchStrategy(column_strategies=[sc]) for sc in strong_cols)
+
+        # Each weak key is paired with each strong key (AND) for extra recall
+        strategies.extend(
+            MatchStrategy(column_strategies=[wc, sc])
+            for wc in weak_cols
+            for sc in strong_cols
+        )
+    elif weak_cols:
+        # No strong keys — promote weak to standalone (best effort)
+        strategies.extend(MatchStrategy(column_strategies=[wc]) for wc in weak_cols)
+
+    if strategies:
+        return strategies
+
+    # Fallback: exact match on all columns (equivalent to drop_duplicates)
+    logger.info("No column patterns matched; using exact match on all columns")
+    all_cols = [
+        ColumnMatchStrategy(column=c, algorithm=Algorithm.EXACT, threshold=100.0)
+        for c in df.columns
+    ]
+    return [MatchStrategy(column_strategies=all_cols)]
+
+
+# ---------------------------------------------------------------------------
+# Normalisation pass
+# ---------------------------------------------------------------------------
+
+def _apply_normalizations(df: pd.DataFrame, strategies: list[MatchStrategy]) -> pd.DataFrame:
+    """Return a copy of *df* with a ``_norm_<column>`` column per normalised column.
+
+    Each source column is processed once: if several strategies name the same
+    column, the normalizer of the first one encountered is used.  Missing or
+    whitespace-only cells become ``""``.
+    """
+    result = df.copy()
+    done: set[str] = set()
+    column_strategies = (
+        cs for strategy in strategies for cs in strategy.column_strategies
+    )
+    for cs in column_strategies:
+        if not cs.normalizer or cs.column in done or cs.column not in result.columns:
+            continue
+        done.add(cs.column)
+        normalize = get_normalizer(cs.normalizer)
+
+        def _normalize_cell(cell, fn=normalize):
+            if pd.notna(cell) and str(cell).strip():
+                return fn(str(cell))
+            return ""
+
+        result[f"_norm_{cs.column}"] = result[cs.column].apply(_normalize_cell)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+
+def deduplicate(
+    df: pd.DataFrame,
+    *,
+    strategies: Optional[list[MatchStrategy]] = None,
+    survivor_rule: SurvivorRule = SurvivorRule.KEEP_FIRST,
+    date_column: Optional[str] = None,
+    merge: bool = False,
+    preview: bool = True,
+    review_callback: Optional[Callable] = None,
+    progress_callback: Optional[Callable[[int, int], None]] = None,
+) -> DeduplicationResult:
+    """Run the full deduplication pipeline.
+
+    Parameters
+    ----------
+    df : input DataFrame
+    strategies : matching strategies (auto-detected if None)
+    survivor_rule : which row to keep per group
+    date_column : used with ``KEEP_MOST_RECENT``
+    merge : fill missing fields in survivor from losers
+    preview : if True, result is informational only (no writes)
+    review_callback : ``(group: MatchResult, df: DataFrame) -> bool|None``
+        Called for each match group. Return True to accept, False to reject,
+        None to skip (keep both rows). Used for interactive review.
+    progress_callback : ``(current: int, total: int) -> None``
+        Called periodically during pairwise comparison.
+
+    Rows are addressed by position throughout, so the row numbers in the
+    returned groups and log entries are positions in *df*, not index labels.
+    The input frame is not modified.
+
+    Returns a ``DeduplicationResult``.
+    """
+    log_entries: list[str] = []
+    original_count = len(df)
+
+    if strategies is None:
+        strategies = build_default_strategies(df)
+        log_entries.append(f"Auto-detected {len(strategies)} match strategies")
+
+    # Log strategies
+    for i, s in enumerate(strategies):
+        cols_desc = ", ".join(
+            f"{cs.column}({cs.algorithm.value}@{cs.threshold})"
+            for cs in s.column_strategies
+        )
+        log_entries.append(f"Strategy {i}: {cols_desc}")
+        logger.info("Strategy {}: {}", i, cols_desc)
+
+    # Normalise
+    df_work = _apply_normalizations(df, strategies)
+
+    # Find matches
+    match_groups, pair_info = _find_match_groups(
+        df_work, strategies, progress_callback=progress_callback
+    )
+    log_entries.append(f"Found {len(match_groups)} duplicate groups")
+    logger.info("Found {} duplicate groups from {} rows", len(match_groups), original_count)
+
+    # Interactive review: only explicitly accepted groups are acted on
+    if review_callback and match_groups:
+        reviewed_groups: list[MatchResult] = []
+        for group in match_groups:
+            decision = review_callback(group, df_work)
+            if decision is True:
+                reviewed_groups.append(group)
+                log_entries.append(f"Group {group.group_id}: accepted by reviewer")
+            elif decision is False:
+                log_entries.append(f"Group {group.group_id}: rejected by reviewer")
+            else:
+                log_entries.append(f"Group {group.group_id}: skipped by reviewer")
+        match_groups = reviewed_groups
+
+    # Survivor selection
+    for group in match_groups:
+        group.survivor_index = _select_survivor(group, df_work, survivor_rule, date_column)
+        log_entries.append(
+            f"Group {group.group_id}: survivor=row {group.survivor_index} "
+            f"(rule={survivor_rule.value}, confidence={group.confidence}%)"
+        )
+
+    # Build result dataframes
+    remove_indices: set[int] = set()
+    merged_rows: dict[int, pd.Series] = {}
+
+    for group in match_groups:
+        survivor_idx = group.survivor_index
+        losers = [i for i in group.row_indices if i != survivor_idx]
+        remove_indices.update(losers)
+
+        if merge and losers:
+            merged = _merge_group(df_work, survivor_idx, losers)
+            merged_rows[survivor_idx] = merged
+            # Log merged fields
+            original = df_work.iloc[survivor_idx]
+            for col in original.index:
+                if isinstance(col, str) and col.startswith("_norm_"):
+                    continue
+                # A field was filled when it was blank before and is not blank
+                # now.  "Blank" must include missing values (None/NaN), the
+                # same definition the merge itself uses; comparing ``str()``
+                # output would see the text "nan" and never log those fills.
+                orig_cell = original[col]
+                if not (pd.isna(orig_cell) or str(orig_cell).strip() == ""):
+                    continue
+                new_cell = merged[col]
+                if pd.isna(new_cell) or str(new_cell).strip() == "":
+                    continue
+                new_val = str(new_cell).strip()
+                log_entries.append(
+                    f"Group {group.group_id}: merged '{col}' "
+                    f"into survivor from losers: '{new_val}'"
+                )
+
+    # Build output DataFrames
+    keep_indices = [i for i in range(len(df_work)) if i not in remove_indices]
+
+    if merged_rows:
+        # Swap in the merged version of each survivor that was filled
+        rows = [merged_rows.get(i, df_work.iloc[i]) if i in merged_rows else df_work.iloc[i]
+                for i in keep_indices]
+        deduplicated_df = pd.DataFrame(rows)
+    else:
+        deduplicated_df = df_work.iloc[keep_indices].copy()
+
+    removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame()
+
+    # Drop shadow columns from output
+    norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
+    deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
+    if not removed_df.empty:
+        removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
+
+    # Reset index
+    deduplicated_df = deduplicated_df.reset_index(drop=True)
+    if not removed_df.empty:
+        removed_df = removed_df.reset_index(drop=True)
+
+    removed_count = original_count - len(deduplicated_df)
+    log_entries.append(f"Result: {original_count} → {len(deduplicated_df)} rows ({removed_count} removed)")
+
+    return DeduplicationResult(
+        original_row_count=original_count,
+        deduplicated_df=deduplicated_df,
+        removed_df=removed_df,
+        match_groups=match_groups,
+        log_entries=log_entries,
+        is_preview=preview,
+    )
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -0,0 +1,247 @@
+"""File I/O: encoding/delimiter detection, CSV/Excel reading, output writing."""
+
+from __future__ import annotations
+
+import csv
+import io
+from pathlib import Path
+from typing import Generator, Optional
+
+import pandas as pd
+from charset_normalizer import from_bytes
+from loguru import logger
+
+
+# ---------------------------------------------------------------------------
+# Encoding detection
+# ---------------------------------------------------------------------------
+
+def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
+    """Detect file encoding by reading the first *sample_bytes*.
+
+    A byte-order mark decides the answer when one is present; otherwise the
+    sample is handed to ``charset_normalizer``.
+
+    Returns the best-guess encoding name, lower-cased, as reported by the
+    detector (ASCII is reported as ``utf-8``).
+    Falls back to ``utf-8`` when the file is empty or detection is
+    inconclusive.
+    """
+    # Read only the sample, not the whole file: inputs can be far larger
+    # than the 64 KiB needed for detection.
+    with Path(path).open("rb") as fh:
+        raw = fh.read(sample_bytes)
+    if not raw:
+        return "utf-8"
+
+    # Check BOM first
+    if raw.startswith(b"\xef\xbb\xbf"):
+        return "utf-8-sig"
+    # UTF-32 must be tested before UTF-16: the UTF-32-LE mark (FF FE 00 00)
+    # begins with the UTF-16-LE mark (FF FE).
+    if raw.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")):
+        return "utf-32"
+    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
+        return "utf-16"
+
+    result = from_bytes(raw).best()
+    if result is None:
+        return "utf-8"
+    enc = result.encoding.lower()
+    # ASCII is a strict subset of UTF-8, so report the more general codec
+    if enc in ("ascii", "us-ascii"):
+        enc = "utf-8"
+    return enc
+
+
+# ---------------------------------------------------------------------------
+# Delimiter detection
+# ---------------------------------------------------------------------------
+
+# Candidate separators offered to csv.Sniffer: comma, tab, semicolon, pipe.
+_COMMON_DELIMITERS = [",", "\t", ";", "|"]
+
+
+def detect_delimiter(path: Path, encoding: str = "utf-8") -> str:
+    """Guess the field delimiter from the first 20 lines of a text file.
+
+    Undecodable bytes are replaced rather than raising.  Returns a comma when
+    the file is empty or ``csv.Sniffer`` cannot decide.
+    """
+    with Path(path).open("r", encoding=encoding, errors="replace") as fh:
+        # ``zip`` stops after 20 pairs or at end of file, whichever is first.
+        head = [line for _, line in zip(range(20), fh)]
+
+    if not head:
+        return ","
+
+    candidates = "".join(_COMMON_DELIMITERS)
+    try:
+        return csv.Sniffer().sniff("".join(head), delimiters=candidates).delimiter
+    except csv.Error:
+        return ","
+
+
+# ---------------------------------------------------------------------------
+# Header-row detection
+# ---------------------------------------------------------------------------
+
+def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
+                      max_scan: int = 20) -> int:
+    """Return the 0-based index of the likely header row.
+
+    Heuristic: the first row, among the first *max_scan*, that has at least
+    one non-empty cell and whose non-empty cells all look like column names
+    (see ``_looks_like_header``).  Blank lines and rows made only of empty
+    cells are skipped.  Falls back to 0.
+
+    The index counts every row the csv reader yields, blank lines included.
+    NOTE(review): pandas ``read_csv(header=N)`` ignores blank lines by
+    default, so the two counts can differ when blank lines precede the
+    header — confirm how callers use the value.
+    """
+    raw_path = Path(path)
+    # ``newline=""`` is what the csv module asks for, so that line breaks
+    # inside quoted fields are handled by the reader itself.
+    with raw_path.open("r", encoding=encoding, errors="replace", newline="") as fh:
+        reader = csv.reader(fh, delimiter=delimiter)
+        for idx, row in enumerate(reader):
+            if idx >= max_scan:
+                break
+            filled = [cell for cell in row if cell.strip()]
+            # Without this guard a row such as ",,," would pass the ``all``
+            # below vacuously and be reported as the header.
+            if not filled:
+                continue
+            if all(_looks_like_header(cell) for cell in filled):
+                return idx
+    return 0
+
+
+def _looks_like_header(value: str) -> bool:
+    """Tell whether *value* reads as a column name rather than a data value.
+
+    Blank text is rejected, and so is anything ``float`` can parse once
+    commas are removed.
+    """
+    text = value.strip()
+    if text == "":
+        return False
+    try:
+        float(text.replace(",", ""))
+    except ValueError:
+        # Not a number, so it can serve as a header.
+        return True
+    return False
+
+
+# ---------------------------------------------------------------------------
+# Excel helpers
+# ---------------------------------------------------------------------------
+
+def list_sheets(path: Path) -> list[str]:
+    """Return sheet names from an Excel workbook.
+
+    The workbook is closed again before returning.
+    """
+    # openpyxl cannot open legacy binary ``.xls`` workbooks; for those leave
+    # the engine unset so pandas selects a suitable reader itself.
+    engine = None if Path(path).suffix.lower() == ".xls" else "openpyxl"
+    # The context manager releases the file handle, which would otherwise
+    # stay open for as long as the ExcelFile object lives.
+    with pd.ExcelFile(path, engine=engine) as workbook:
+        return list(workbook.sheet_names)
+
+
+# ---------------------------------------------------------------------------
+# Reading
+# ---------------------------------------------------------------------------
+
+def read_file(
+    path: str | Path,
+    *,
+    encoding: Optional[str] = None,
+    delimiter: Optional[str] = None,
+    header_row: Optional[int] = None,
+    sheet_name: Optional[str | int] = 0,
+    chunk_size: Optional[int] = None,
+) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
+    """Load a tabular file, choosing the reader from the file suffix.
+
+    ``.xlsx`` and ``.xls`` go to the Excel reader; every other suffix is
+    read as delimited text.
+
+    Parameters
+    ----------
+    path : file path
+    encoding : override detected encoding (delimited text only)
+    delimiter : override detected delimiter (delimited text only)
+    header_row : 0-based row index for the header; auto-detected if *None*
+    sheet_name : Excel sheet (name or 0-based index); unused for text files
+    chunk_size : if set, delimited text is returned in chunks; unused for
+        Excel files
+
+    Returns a DataFrame, or an iterator of DataFrames when *chunk_size* is
+    set for a text file.
+
+    Raises ``FileNotFoundError`` when *path* does not exist.
+    """
+    filepath = Path(path)
+    if not filepath.exists():
+        raise FileNotFoundError(f"File not found: {filepath}")
+
+    is_excel = filepath.suffix.lower() in (".xlsx", ".xls")
+    if not is_excel:
+        return _read_csv(
+            filepath,
+            encoding=encoding,
+            delimiter=delimiter,
+            header_row=header_row,
+            chunk_size=chunk_size,
+        )
+    return _read_excel(filepath, header_row=header_row, sheet_name=sheet_name)
+
+
+def _read_csv(
+    path: Path,
+    *,
+    encoding: Optional[str] = None,
+    delimiter: Optional[str] = None,
+    header_row: Optional[int] = None,
+    chunk_size: Optional[int] = None,
+) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
+    """Read delimited text, detecting whatever the caller did not specify.
+
+    All values are read as strings and empty fields stay empty strings
+    instead of becoming NaN.  Malformed lines produce a warning rather than
+    an error.  With *chunk_size* set, the chunked reader from pandas is
+    returned instead of a single DataFrame.
+    """
+    resolved_encoding = encoding or detect_encoding(path)
+    resolved_delimiter = delimiter or detect_delimiter(path, resolved_encoding)
+    if header_row is None:
+        resolved_header = detect_header_row(path, resolved_encoding, resolved_delimiter)
+    else:
+        resolved_header = header_row
+
+    logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
+                 path.name, resolved_encoding, resolved_delimiter, resolved_header)
+
+    read_options: dict = {
+        "filepath_or_buffer": path,
+        "encoding": resolved_encoding,
+        "delimiter": resolved_delimiter,
+        "header": resolved_header,
+        "dtype": str,
+        "keep_default_na": False,
+        "on_bad_lines": "warn",
+    }
+    if chunk_size:
+        read_options["chunksize"] = chunk_size
+
+    return pd.read_csv(**read_options)
+
+
+def _read_excel(
+    path: Path,
+    *,
+    header_row: Optional[int] = None,
+    sheet_name: Optional[str | int] = 0,
+) -> pd.DataFrame:
+    """Read one sheet of an Excel workbook with every value as a string.
+
+    Parameters
+    ----------
+    path : workbook path
+    header_row : 0-based header row; row 0 when *None* (no auto-detection)
+    sheet_name : sheet name or 0-based index
+
+    Empty cells stay empty strings instead of becoming NaN.
+    NOTE(review): pandas returns a dict of frames for ``sheet_name=None``,
+    which the annotation permits — confirm no caller passes it.
+    """
+    hdr = header_row if header_row is not None else 0
+    # openpyxl cannot open legacy binary ``.xls`` workbooks, yet ``read_file``
+    # routes that suffix here.  Leaving the engine unset for ``.xls`` lets
+    # pandas select a suitable reader itself.
+    engine = None if Path(path).suffix.lower() == ".xls" else "openpyxl"
+    logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
+    return pd.read_excel(
+        path,
+        sheet_name=sheet_name,
+        header=hdr,
+        dtype=str,
+        keep_default_na=False,
+        engine=engine,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Writing
+# ---------------------------------------------------------------------------
+
+def write_file(
+    df: pd.DataFrame,
+    path: str | Path,
+    *,
+    file_format: Optional[str] = None,
+    encoding: str = "utf-8-sig",
+) -> Path:
+    """Save a DataFrame as CSV or Excel, without the index column.
+
+    Parameters
+    ----------
+    df : DataFrame to write
+    path : output file path
+    file_format : ``"csv"`` or ``"xlsx"``; taken from the *path* suffix if
+        not given.  Anything other than ``xlsx``/``xls`` is written as CSV.
+    encoding : CSV output encoding (default ``utf-8-sig`` for Windows Excel
+        compat); not used for Excel output
+
+    Returns *path* as a ``Path`` object.
+    """
+    destination = Path(path)
+    fmt = file_format if file_format else destination.suffix.lstrip(".").lower()
+    wants_excel = fmt in ("xlsx", "xls")
+    if not wants_excel:
+        df.to_csv(destination, index=False, encoding=encoding)
+    else:
+        df.to_excel(destination, index=False, engine="openpyxl")
+    logger.info("Wrote {} rows to {}", len(df), destination)
+    return destination
--- a/src/core/normalizers.py
+++ b/src/core/normalizers.py
@@ -0,0 +1,224 @@
+"""Per-column normalization functions for deduplication matching.
+
+Every normalizer is ``str -> str``, handles None/empty gracefully, and is
+idempotent (applying it twice yields the same result as once).
+"""
+
+from __future__ import annotations
+
+import re
+from enum import Enum
+from typing import Callable, Optional
+
+import phonenumbers
+
+
+# ---------------------------------------------------------------------------
+# Types
+# ---------------------------------------------------------------------------
+
+class NormalizerType(str, Enum):
+    """Names of the built-in column normalizers.
+
+    The ``str`` mix-in makes every member compare equal to its plain string
+    value (``NormalizerType.EMAIL == "email"``), and ``NormalizerType("email")``
+    looks a member up from that string.
+    """
+
+    EMAIL = "email"
+    PHONE = "phone"
+    NAME = "name"
+    ADDRESS = "address"
+    STRING = "string"
+
+
+# ---------------------------------------------------------------------------
+# String normalizer (base)
+# ---------------------------------------------------------------------------
+
+def normalize_string(value: Optional[str]) -> str:
+    """Trim, collapse internal whitespace, case-fold."""
+    if not value or not isinstance(value, str):
+        return ""
+    return re.sub(r"\s+", " ", value.strip()).casefold()
+
+
+# ---------------------------------------------------------------------------
+# Email normalizer
+# ---------------------------------------------------------------------------
+
+_GMAIL_DOMAINS = {"gmail.com", "googlemail.com"}
+
+
+def normalize_email(value: Optional[str]) -> str:
+    """Lowercase, strip whitespace, strip Gmail dots, strip +tag suffixes."""
+    if not value or not isinstance(value, str):
+        return ""
+    email = value.strip().lower()
+    if "@" not in email:
+        return email
+
+    local, domain = email.rsplit("@", 1)
+
+    # Strip +tag suffix
+    if "+" in local:
+        local = local.split("+", 1)[0]
+
+    # Strip dots for Gmail addresses
+    if domain in _GMAIL_DOMAINS:
+        local = local.replace(".", "")
+
+    return f"{local}@{domain}"
+
+
+# ---------------------------------------------------------------------------
+# Phone normalizer
+# ---------------------------------------------------------------------------
+
+def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
+    """Parse with phonenumbers lib, return E.164. Fallback: digits-only."""
+    if not value or not isinstance(value, str):
+        return ""
+    stripped = value.strip()
+    if not stripped:
+        return ""
+
+    try:
+        parsed = phonenumbers.parse(stripped, default_region)
+        if phonenumbers.is_possible_number(parsed):
+            return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
+    except phonenumbers.NumberParseException:
+        pass
+
+    # Fallback: digits only
+    digits = re.sub(r"\D", "", stripped)
+    return digits
+
+
+# ---------------------------------------------------------------------------
+# Name normalizer
+# ---------------------------------------------------------------------------
+
+_TITLE_PREFIXES = {
+    "mr", "mrs", "ms", "miss", "dr", "prof", "professor",
+    "sir", "madam", "rev", "reverend", "hon", "honorable",
+}
+_NAME_SUFFIXES = {
+    "jr", "sr", "ii", "iii", "iv", "v",
+    "phd", "md", "esq", "dds", "rn",
+}
+
+
+def normalize_name(value: Optional[str]) -> str:
+    """Strip titles/suffixes, collapse whitespace, case-fold."""
+    if not value or not isinstance(value, str):
+        return ""
+    name = value.strip()
+    if not name:
+        return ""
+
+    # Case-fold first for matching
+    name = name.casefold()
+
+    # Remove periods and commas that are part of titles/suffixes
+    name = name.replace(".", " ").replace(",", " ")
+
+    parts = name.split()
+
+    # Strip leading titles
+    while parts and parts[0].rstrip(".") in _TITLE_PREFIXES:
+        parts.pop(0)
+
+    # Strip trailing suffixes
+    while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES:
+        parts.pop()
+
+    return " ".join(parts)
+
+
+# ---------------------------------------------------------------------------
+# Address normalizer
+# ---------------------------------------------------------------------------
+
+_USPS_ABBREVIATIONS: dict[str, str] = {
+    "street": "st",
+    "avenue": "ave",
+    "boulevard": "blvd",
+    "drive": "dr",
+    "lane": "ln",
+    "road": "rd",
+    "court": "ct",
+    "place": "pl",
+    "circle": "cir",
+    "trail": "trl",
+    "way": "way",
+    "terrace": "ter",
+    "parkway": "pkwy",
+    "highway": "hwy",
+    "expressway": "expy",
+    "freeway": "fwy",
+    "square": "sq",
+    "loop": "loop",
+    "alley": "aly",
+    "crossing": "xing",
+    "point": "pt",
+    "north": "n",
+    "south": "s",
+    "east": "e",
+    "west": "w",
+    "northeast": "ne",
+    "northwest": "nw",
+    "southeast": "se",
+    "southwest": "sw",
+    "apartment": "apt",
+    "suite": "ste",
+    "building": "bldg",
+    "floor": "fl",
+    "room": "rm",
+    "unit": "unit",
+    "number": "#",
+    "saint": "st",
+    "fort": "ft",
+    "mount": "mt",
+    "heights": "hts",
+    "springs": "spgs",
+}
+
+
+def normalize_address(value: Optional[str]) -> str:
+    """USPS abbreviation normalization, collapse whitespace, case-fold."""
+    if not value or not isinstance(value, str):
+        return ""
+    addr = value.strip()
+    if not addr:
+        return ""
+
+    # Case-fold and clean punctuation (keep #)
+    addr = addr.casefold()
+    addr = addr.replace(".", " ").replace(",", " ")
+
+    parts = addr.split()
+    normalized_parts = []
+    for part in parts:
+        normalized_parts.append(_USPS_ABBREVIATIONS.get(part, part))
+
+    return " ".join(normalized_parts)
+
+
+# ---------------------------------------------------------------------------
+# Registry
+# ---------------------------------------------------------------------------
+
+_NORMALIZER_MAP: dict[NormalizerType, Callable[[str], str]] = {
+    NormalizerType.EMAIL: normalize_email,
+    NormalizerType.PHONE: normalize_phone,
+    NormalizerType.NAME: normalize_name,
+    NormalizerType.ADDRESS: normalize_address,
+    NormalizerType.STRING: normalize_string,
+}
+
+
+def get_normalizer(normalizer_type: NormalizerType | str) -> Callable[[str], str]:
+    """Return the normalizer function for the given type.
+
+    Accepts both ``NormalizerType`` enum values and plain strings.
+    """
+    if isinstance(normalizer_type, str):
+        normalizer_type = NormalizerType(normalizer_type.lower())
+    func = _NORMALIZER_MAP.get(normalizer_type)
+    if func is None:
+        raise ValueError(f"Unknown normalizer type: {normalizer_type}")
+    return func
--- a/src/gui/__init__.py
+++ b/src/gui/__init__.py
@@ -0,0 +1 @@
+"""Streamlit GUI for the DataTools Deduplicator."""
--- a/src/gui/__main__.py
+++ b/src/gui/__main__.py
@@ -0,0 +1,8 @@
+"""Allow running as ``python -m src.gui``."""
+
+import subprocess
+import sys
+from pathlib import Path
+
+app_path = Path(__file__).parent / "app.py"
+subprocess.run([sys.executable, "-m", "streamlit", "run", str(app_path)])
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -0,0 +1,287 @@
+"""DataTools Deduplicator — Streamlit GUI.
+
+Launch:
+    streamlit run src/gui/app.py
+"""
+
+from __future__ import annotations
+
+import io
+import sys
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+
+# Ensure project root is on sys.path so `src.core` imports work.
+# This file is <root>/src/gui/app.py, so three `.parent` hops from the resolved
+# path reach <root>. The entry is inserted at the front of sys.path, and only
+# when it is not already present.
+_project_root = Path(__file__).resolve().parent.parent.parent
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
+
+from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
+from src.core.io import read_file, list_sheets
+from src.core.config import DeduplicationConfig
+from src.gui.components import config_panel, match_group_card, results_summary
+
+
+# ---------------------------------------------------------------------------
+# Page config
+# ---------------------------------------------------------------------------
+
+st.set_page_config(
+    page_title="DataTools Deduplicator",
+    page_icon="🔍",
+    layout="wide",
+)
+
+# ---------------------------------------------------------------------------
+# Session state defaults
+# ---------------------------------------------------------------------------
+
+# Keys are seeded only when missing, so values already stored in
+# st.session_state are left untouched.
+_DEFAULTS = {
+    "df": None,  # DataFrame parsed from the current upload
+    "result": None,  # DeduplicationResult of the last run
+    "review_decisions": {},  # group_id -> True (merge) / False (keep both)
+    "config": None,  # not read anywhere else in this script
+    "file_name": "",  # name of the upload that "df" was read from
+    "sheet_names": [],  # sheet names when the upload is an Excel workbook
+}
+for key, default in _DEFAULTS.items():
+    if key not in st.session_state:
+        st.session_state[key] = default
+
+
+# ---------------------------------------------------------------------------
+# Header
+# ---------------------------------------------------------------------------
+
+st.title("DataTools Deduplicator")
+st.caption("Find and remove duplicate rows in CSV and Excel files.")
+
+
+# ---------------------------------------------------------------------------
+# File upload
+# ---------------------------------------------------------------------------
+
+# `uploaded` is None until the user picks a file; the rest of the script
+# branches on that.
+uploaded = st.file_uploader(
+    "Upload CSV or Excel file",
+    type=["csv", "tsv", "xlsx", "xls"],
+    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
+)
+
+if uploaded is not None:
+    # Detect if file changed
+    if uploaded.name != st.session_state["file_name"]:
+        st.session_state["file_name"] = uploaded.name
+        st.session_state["result"] = None
+        st.session_state["review_decisions"] = {}
+
+        # Read the file
+        try:
+            # Write to a temp file for read_file() which needs a path
+            import tempfile
+            suffix = Path(uploaded.name).suffix
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp.write(uploaded.getvalue())
+                tmp_path = Path(tmp.name)
+
+            # Check for Excel sheets
+            if suffix.lower() in (".xlsx", ".xls"):
+                st.session_state["sheet_names"] = list_sheets(tmp_path)
+            else:
+                st.session_state["sheet_names"] = []
+
+            df = read_file(tmp_path)
+            if not isinstance(df, pd.DataFrame):
+                df = pd.concat(list(df), ignore_index=True)
+
+            st.session_state["df"] = df
+
+            # Clean up temp file
+            tmp_path.unlink(missing_ok=True)
+
+        except Exception as e:
+            st.error(f"Failed to read file: {e}")
+            st.session_state["df"] = None
+
+    df = st.session_state["df"]
+
+    if df is not None:
+        # Sheet selector for Excel files
+        if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
+            sheet = st.selectbox(
+                "Select sheet",
+                st.session_state["sheet_names"],
+            )
+            if sheet != st.session_state.get("_current_sheet"):
+                st.session_state["_current_sheet"] = sheet
+                suffix = Path(uploaded.name).suffix
+                import tempfile
+                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                    tmp.write(uploaded.getvalue())
+                    tmp_path = Path(tmp.name)
+                df = read_file(tmp_path, sheet_name=sheet)
+                if not isinstance(df, pd.DataFrame):
+                    df = pd.concat(list(df), ignore_index=True)
+                st.session_state["df"] = df
+                st.session_state["result"] = None
+                st.session_state["review_decisions"] = {}
+                tmp_path.unlink(missing_ok=True)
+
+        # Preview
+        st.subheader(f"Preview: {uploaded.name}")
+        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+        st.dataframe(df.head(10), use_container_width=True)
+
+        # Advanced options
+        settings = config_panel(df)
+
+        # Apply loaded config if present
+        loaded_cfg = st.session_state.get("loaded_config")
+        if loaded_cfg is not None:
+            settings["strategies"] = loaded_cfg.to_strategies()
+            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
+            settings["date_column"] = loaded_cfg.date_column
+            settings["merge"] = loaded_cfg.merge
+            # Clear so it doesn't override on every rerun
+            del st.session_state["loaded_config"]
+
+        # ---------------------------------------------------------------------------
+        # Find Duplicates button
+        # ---------------------------------------------------------------------------
+
+        st.divider()
+
+        if st.button("Find Duplicates", type="primary", use_container_width=True):
+            progress_bar = st.progress(0, text="Comparing rows...")
+
+            def _gui_progress(current: int, total: int) -> None:
+                if total > 0:
+                    pct = min(current / total, 1.0)
+                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
+
+            with st.spinner("Running deduplication..."):
+                result = deduplicate(
+                    df,
+                    strategies=settings["strategies"],
+                    survivor_rule=settings["survivor_rule"],
+                    date_column=settings["date_column"],
+                    merge=settings["merge"],
+                    preview=False,
+                    progress_callback=_gui_progress,
+                )
+
+            progress_bar.empty()
+            st.session_state["result"] = result
+            st.session_state["review_decisions"] = {}
+
+        # ---------------------------------------------------------------------------
+        # Results
+        # ---------------------------------------------------------------------------
+
+        result: DeduplicationResult | None = st.session_state["result"]
+
+        if result is not None:
+            st.divider()
+            st.subheader("Results")
+
+            # Summary + download buttons
+            results_summary(result, df)
+
+            # Match group review
+            if result.match_groups:
+                st.divider()
+                st.subheader("Match Groups")
+
+                # Batch actions
+                action_left, action_mid, action_right = st.columns(3)
+                with action_left:
+                    if st.button("Accept All"):
+                        for g in result.match_groups:
+                            st.session_state["review_decisions"][g.group_id] = True
+                        st.rerun()
+                with action_mid:
+                    if st.button("Reject All"):
+                        for g in result.match_groups:
+                            st.session_state["review_decisions"][g.group_id] = False
+                        st.rerun()
+                with action_right:
+                    if st.button("Clear Decisions"):
+                        st.session_state["review_decisions"] = {}
+                        st.rerun()
+
+                # Individual group cards
+                decisions = st.session_state["review_decisions"]
+                for i, group in enumerate(result.match_groups):
+                    decision = match_group_card(group, df, group_num=i + 1)
+                    if decision is not None:
+                        decisions[group.group_id] = decision
+                        st.session_state["review_decisions"] = decisions
+                        st.rerun()
+
+                # Show decision summary
+                if decisions:
+                    st.divider()
+                    accepted = sum(1 for v in decisions.values() if v is True)
+                    rejected = sum(1 for v in decisions.values() if v is False)
+                    pending = len(result.match_groups) - len(decisions)
+                    st.caption(
+                        f"Decisions: {accepted} merged, {rejected} kept both, "
+                        f"{pending} pending"
+                    )
+
+                    # Re-run dedup with review decisions applied
+                    if st.button(
+                        "Apply Review Decisions & Download",
+                        type="primary",
+                        use_container_width=True,
+                    ):
+                        def _review_callback(group, _df):
+                            gid = group.group_id
+                            if gid in decisions:
+                                return decisions[gid]
+                            return True  # default: accept
+
+                        reviewed_result = deduplicate(
+                            df,
+                            strategies=settings["strategies"],
+                            survivor_rule=settings["survivor_rule"],
+                            date_column=settings["date_column"],
+                            merge=settings["merge"],
+                            preview=False,
+                            review_callback=_review_callback,
+                        )
+
+                        # Update result and show downloads
+                        st.session_state["result"] = reviewed_result
+
+                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
+                            index=False
+                        ).encode("utf-8-sig")
+                        st.download_button(
+                            "Download Reviewed & Deduplicated CSV",
+                            data=csv_bytes,
+                            file_name="deduplicated_reviewed.csv",
+                            mime="text/csv",
+                            key="reviewed_download",
+                        )
+
+            # Log entries
+            if result.log_entries:
+                with st.expander("Processing Log"):
+                    st.code("\n".join(result.log_entries))
+
+else:
+    # No file uploaded — show placeholder
+    st.info("Upload a CSV or Excel file to get started.")
+
+
+# ---------------------------------------------------------------------------
+# Footer
+# ---------------------------------------------------------------------------
+
+st.divider()
+st.caption(
+    "Runs locally. Your data never leaves this computer. "
+    "| DataTools Deduplicator v1.0"
+)
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -0,0 +1,413 @@
+"""Reusable Streamlit widgets for the deduplicator GUI."""
+
+from __future__ import annotations
+
+import io
+from typing import Optional
+
+import pandas as pd
+import streamlit as st
+
+from src.core.dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    DeduplicationResult,
+    MatchResult,
+    MatchStrategy,
+    SurvivorRule,
+)
+from src.core.config import (
+    ColumnStrategyConfig,
+    DeduplicationConfig,
+    StrategyConfig,
+)
+from src.core.normalizers import NormalizerType
+
+
+# ---------------------------------------------------------------------------
+# Config panel (advanced options)
+# ---------------------------------------------------------------------------
+
+def config_panel(df: pd.DataFrame) -> dict:
+    """Render the Advanced Options expander. Returns a settings dict.
+
+    Keys returned:
+        strategies: list[MatchStrategy] | None
+        survivor_rule: SurvivorRule
+        date_column: str | None
+        merge: bool
+    """
+    columns = list(df.columns)
+
+    with st.expander("Advanced Options"):
+        col_left, col_right = st.columns(2)
+
+        with col_left:
+            subset_cols = st.multiselect(
+                "Match on columns",
+                columns,
+                default=[],
+                help="Leave empty to auto-detect based on column names.",
+            )
+            key_cols = st.multiselect(
+                "Strong keys",
+                columns,
+                default=[],
+                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
+            )
+            fuzzy_cols = st.multiselect(
+                "Fuzzy columns",
+                columns,
+                default=[],
+                help="Columns to fuzzy-match. Others use exact matching.",
+            )
+
+        with col_right:
+            algorithm = st.selectbox(
+                "Fuzzy algorithm",
+                ["jaro_winkler", "levenshtein", "token_set_ratio"],
+                index=0,
+                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
+            )
+            threshold = st.slider(
+                "Similarity threshold",
+                min_value=50,
+                max_value=100,
+                value=85,
+                help="Lower = more matches but more false positives.",
+            )
+            survivor = st.selectbox(
+                "Survivor rule",
+                ["first", "last", "most-complete", "most-recent"],
+                index=0,
+                help="Which row to keep when duplicates are found.",
+            )
+
+        # Second row of options
+        col_a, col_b = st.columns(2)
+
+        with col_a:
+            normalize_options = {c: "auto" for c in columns}
+            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
+
+            normalize_map: dict[str, str] = {}
+            if fuzzy_cols or subset_cols:
+                target_cols = fuzzy_cols or subset_cols
+                st.markdown("**Per-column normalizers**")
+                for col_name in target_cols:
+                    norm = st.selectbox(
+                        f"Normalizer for '{col_name}'",
+                        normalizer_types,
+                        index=0,
+                        key=f"norm_{col_name}",
+                    )
+                    if norm not in ("auto", "none"):
+                        normalize_map[col_name] = norm
+
+        with col_b:
+            merge = st.checkbox(
+                "Merge mode",
+                value=False,
+                help="Fill missing fields in the surviving row from removed duplicates.",
+            )
+            date_column: Optional[str] = None
+            if survivor == "most-recent":
+                date_column = st.selectbox(
+                    "Date column",
+                    columns,
+                    help="Required for most-recent survivor rule.",
+                )
+
+        # Config save/load
+        st.divider()
+        cfg_left, cfg_right = st.columns(2)
+
+        with cfg_left:
+            config_file = st.file_uploader(
+                "Load config profile",
+                type=["json"],
+                help="Load previously saved settings.",
+                key="config_upload",
+            )
+            if config_file is not None:
+                import json
+                try:
+                    data = json.loads(config_file.read())
+                    loaded = DeduplicationConfig.from_dict(data)
+                    st.session_state["loaded_config"] = loaded
+                    st.success("Config loaded.")
+                except Exception as e:
+                    st.error(f"Failed to load config: {e}")
+
+        with cfg_right:
+            if st.button("Save current settings"):
+                cfg = _build_config(
+                    subset_cols, key_cols, fuzzy_cols,
+                    algorithm, threshold, normalize_map,
+                    survivor, date_column, merge,
+                )
+                cfg_json = cfg.to_dict()
+                import json
+                st.download_button(
+                    "Download config JSON",
+                    data=json.dumps(cfg_json, indent=2),
+                    file_name="dedup_config.json",
+                    mime="application/json",
+                )
+
+    # Build strategies from selections
+    strategies = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+
+    # Survivor rule mapping
+    survivor_map = {
+        "first": SurvivorRule.KEEP_FIRST,
+        "last": SurvivorRule.KEEP_LAST,
+        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
+        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
+    }
+
+    return {
+        "strategies": strategies,
+        "survivor_rule": survivor_map[survivor],
+        "date_column": date_column,
+        "merge": merge,
+    }
+
+
+def _build_strategies(
+    subset_cols: list[str],
+    key_cols: list[str],
+    fuzzy_cols: list[str],
+    algorithm: str,
+    threshold: int,
+    normalize_map: dict[str, str],
+) -> Optional[list[MatchStrategy]]:
+    """Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
+    strategies: list[MatchStrategy] = []
+
+    # If user selected columns explicitly, build from those
+    if subset_cols or fuzzy_cols:
+        target_cols = subset_cols if subset_cols else fuzzy_cols
+        fuzzy_set = set(fuzzy_cols)
+        col_strats: list[ColumnMatchStrategy] = []
+        for col in target_cols:
+            norm = None
+            if col in normalize_map:
+                norm = NormalizerType(normalize_map[col])
+            if col in fuzzy_set:
+                algo = Algorithm(algorithm)
+                thresh = float(threshold)
+            else:
+                algo = Algorithm.EXACT
+                thresh = 100.0
+            col_strats.append(ColumnMatchStrategy(
+                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
+            ))
+        strategies.append(MatchStrategy(column_strategies=col_strats))
+
+    # Add strong key strategies
+    if key_cols:
+        for col in key_cols:
+            strategies.append(MatchStrategy(column_strategies=[
+                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
+            ]))
+
+    return strategies if strategies else None
+
+
+def _build_config(
+    subset_cols, key_cols, fuzzy_cols,
+    algorithm, threshold, normalize_map,
+    survivor, date_column, merge,
+) -> DeduplicationConfig:
+    """Build a DeduplicationConfig from GUI state."""
+    cfg = DeduplicationConfig(
+        survivor_rule=survivor.replace("-", "_"),
+        date_column=date_column,
+        merge=merge,
+        subset_columns=subset_cols or None,
+        fuzzy_columns=fuzzy_cols or None,
+        default_algorithm=algorithm,
+        default_threshold=float(threshold),
+        normalize_map=normalize_map or None,
+    )
+    strategies = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+    if strategies:
+        cfg.strategies = [
+            StrategyConfig(columns=[
+                ColumnStrategyConfig(
+                    column=cs.column,
+                    algorithm=cs.algorithm.value,
+                    threshold=cs.threshold,
+                    normalizer=cs.normalizer.value if cs.normalizer else None,
+                )
+                for cs in s.column_strategies
+            ])
+            for s in strategies
+        ]
+    return cfg
+
+
+# ---------------------------------------------------------------------------
+# Match group review card
+# ---------------------------------------------------------------------------
+
+def match_group_card(
+    group: MatchResult,
+    df: pd.DataFrame,
+    group_num: int,
+) -> Optional[bool]:
+    """Render an expandable match group card with side-by-side diff.
+
+    Returns:
+        True  — user clicked Merge (accept match)
+        False — user clicked Keep Both (reject match)
+        None  — no decision yet
+    """
+    confidence = group.confidence
+    auto_expand = confidence < 95.0
+    matched_on = ", ".join(group.matched_on)
+    n_rows = len(group.row_indices)
+
+    label = (
+        f"Group {group_num}: {n_rows} rows "
+        f"(confidence: {confidence:.0f}%) "
+        f"[{matched_on}]"
+    )
+
+    with st.expander(label, expanded=auto_expand):
+        # Build comparison DataFrame
+        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+        rows_data = []
+        for idx in group.row_indices:
+            row = {"_row": idx + 1}
+            for col in display_cols:
+                row[col] = df.iloc[idx].get(col, "")
+            rows_data.append(row)
+
+        compare_df = pd.DataFrame(rows_data)
+        compare_df = compare_df.set_index("_row")
+
+        # Highlight differences
+        def _highlight_diffs(s: pd.Series) -> list[str]:
+            """Highlight cells that differ from the first row."""
+            styles = []
+            first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
+            for val in s:
+                val_str = str(val).strip()
+                if val_str != first_val and val_str and first_val:
+                    styles.append("background-color: rgba(245, 166, 35, 0.2)")
+                elif not val_str and first_val:
+                    styles.append("background-color: rgba(240, 82, 82, 0.1)")
+                else:
+                    styles.append("")
+            return styles
+
+        styled = compare_df.style.apply(_highlight_diffs, axis=0)
+        st.dataframe(styled, use_container_width=True)
+
+        # Action buttons
+        btn_left, btn_mid, btn_right = st.columns(3)
+        merge_key = f"merge_{group.group_id}"
+        keep_key = f"keep_{group.group_id}"
+
+        with btn_left:
+            if st.button("Merge", key=merge_key, type="primary"):
+                return True
+        with btn_mid:
+            if st.button("Keep Both", key=keep_key):
+                return False
+
+        # Check session state for previous decisions
+        decisions = st.session_state.get("review_decisions", {})
+        if group.group_id in decisions:
+            decision = decisions[group.group_id]
+            if decision is True:
+                st.success("Decision: Merge")
+            elif decision is False:
+                st.info("Decision: Keep Both")
+
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Results summary + downloads
+# ---------------------------------------------------------------------------
+
+def results_summary(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> None:
+    """Render summary stats and download buttons."""
+    removed = result.original_row_count - len(result.deduplicated_df)
+
+    # Summary metrics
+    col1, col2, col3, col4 = st.columns(4)
+    col1.metric("Rows In", result.original_row_count)
+    col2.metric("Rows Out", len(result.deduplicated_df))
+    col3.metric("Removed", removed)
+    col4.metric("Groups", len(result.match_groups))
+
+    st.divider()
+
+    # Download buttons
+    dl_left, dl_mid, dl_right = st.columns(3)
+
+    with dl_left:
+        csv_bytes = result.deduplicated_df.to_csv(index=False).encode("utf-8-sig")
+        st.download_button(
+            "Download Deduplicated CSV",
+            data=csv_bytes,
+            file_name="deduplicated.csv",
+            mime="text/csv",
+        )
+
+    with dl_mid:
+        if not result.removed_df.empty:
+            removed_bytes = result.removed_df.to_csv(index=False).encode("utf-8-sig")
+            st.download_button(
+                "Download Removed Rows",
+                data=removed_bytes,
+                file_name="removed_rows.csv",
+                mime="text/csv",
+            )
+
+    with dl_right:
+        if result.match_groups:
+            groups_data = _build_match_groups_csv(result, original_df)
+            st.download_button(
+                "Download Match Groups Report",
+                data=groups_data,
+                file_name="match_groups.csv",
+                mime="text/csv",
+            )
+
+
+def _build_match_groups_csv(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> bytes:
+    """Build the match groups audit CSV as bytes."""
+    rows = []
+    for g in result.match_groups:
+        for idx in g.row_indices:
+            row_data = {
+                "_group_id": g.group_id + 1,
+                "_is_survivor": idx == g.survivor_index,
+                "_confidence": g.confidence,
+                "_matched_on": ", ".join(g.matched_on),
+                "_original_row": idx + 1,
+            }
+            for col in original_df.columns:
+                if not str(col).startswith("_norm_"):
+                    row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
+            rows.append(row_data)
+
+    groups_df = pd.DataFrame(rows)
+    return groups_df.to_csv(index=False).encode("utf-8-sig")
				`@@ -0,0 +1 @@`
				`"""Streamlit GUI for the DataTools Deduplicator."""`