feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary - Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections - Add docs/DEVELOPER.md with architecture, data flow, and extension guides - Rewrite src/core/__init__.py with public API exports and module docstring - Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons - Add .gitignore, requirements.txt, all source code, tests, and sample data - Add streamlit to requirements.txt Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/src/cli.py
+++ b/src/cli.py
@@ -0,0 +1,502 @@
+"""CLI for the DataTools deduplicator.
+
+Usage:
+    python -m src.cli input.csv                         # dry-run preview
+    python -m src.cli input.csv --apply                 # write deduplicated output
+    python -m src.cli input.csv --fuzzy name --merge    # fuzzy match + merge
+    python -m src.cli --help                            # full help
+"""
+
+from __future__ import annotations
+
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import typer
+from loguru import logger
+from rapidfuzz import process as rf_process
+
+app = typer.Typer(
+    name="dedup",
+    help=(
+        "Find and remove duplicate rows in CSV and Excel files.\n\n"
+        "By default, runs in preview mode — shows what would change without "
+        "modifying anything. Add --apply to write the output.\n\n"
+        "Examples:\n\n"
+        "  # Preview duplicates in a CSV file\n"
+        "  python -m src.cli customers.csv\n\n"
+        "  # Remove duplicates and save the result\n"
+        "  python -m src.cli customers.csv --apply\n\n"
+        "  # Fuzzy-match on the 'name' column with 80% threshold\n"
+        "  python -m src.cli customers.csv --fuzzy name --threshold 80 --apply\n\n"
+        "  # Match on specific columns only\n"
+        "  python -m src.cli customers.csv --subset email,phone --apply\n\n"
+        "  # Keep the most complete row and merge missing fields\n"
+        "  python -m src.cli customers.csv --survivor most-complete --merge --apply\n"
+    ),
+    add_completion=False,
+    no_args_is_help=True,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _setup_logging(log_dir: Path) -> Path:
+    """Configure loguru to write a timestamped log file. Returns the log path."""
+    log_dir.mkdir(parents=True, exist_ok=True)
+    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_path = log_dir / f"dedup_{ts}.log"
+    logger.remove()  # remove default stderr handler
+    logger.add(sys.stderr, level="WARNING", format="{message}")
+    logger.add(str(log_path), level="DEBUG",
+               format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}")
+    return log_path
+
+
+def _suggest_column(name: str, available: list[str]) -> str:
+    """Return a helpful error message when a column is not found."""
+    cols_str = ", ".join(available)
+    matches = rf_process.extract(name, available, limit=1, score_cutoff=50)
+    if matches:
+        suggestion = matches[0][0]
+        return (
+            f"Column '{name}' not found. "
+            f"Available columns: {cols_str}. "
+            f"Did you mean '{suggestion}'?"
+        )
+    return f"Column '{name}' not found. Available columns: {cols_str}."
+
+
+def _validate_columns(requested: list[str], available: list[str]) -> None:
+    """Raise typer.BadParameter if any requested column doesn't exist."""
+    for col in requested:
+        if col not in available:
+            raise typer.BadParameter(_suggest_column(col, available))
+
+
+def _parse_normalize_map(raw: Optional[str]) -> dict[str, str]:
+    """Parse 'col:type,col:type' into a dict."""
+    if not raw:
+        return {}
+    result = {}
+    for pair in raw.split(","):
+        pair = pair.strip()
+        if ":" not in pair:
+            raise typer.BadParameter(
+                f"Invalid normalize format: '{pair}'. "
+                f"Expected 'column:type' (e.g., 'email:email,phone:phone')."
+            )
+        col, ntype = pair.split(":", 1)
+        result[col.strip()] = ntype.strip()
+    return result
+
+
+def _interactive_review(group, df) -> Optional[bool]:
+    """Side-by-side CLI review for a match group. Returns True/False/None."""
+    from src.core.dedup import MatchResult
+    group: MatchResult
+
+    print(f"\n{'='*60}")
+    print(f"Match Group {group.group_id + 1} — Confidence: {group.confidence:.1f}%")
+    print(f"Matched on: {', '.join(group.matched_on)}")
+    print(f"{'='*60}")
+
+    display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+    for idx in group.row_indices:
+        print(f"\n  Row {idx + 1}:")
+        for col in display_cols:
+            val = df.iloc[idx].get(col, "")
+            if str(val).strip():
+                print(f"    {col}: {val}")
+
+    while True:
+        choice = input("\n  [y] Merge  [n] Keep both  [s] Skip remaining: ").strip().lower()
+        if choice == "y":
+            return True
+        if choice == "n":
+            return False
+        if choice == "s":
+            return None
+        print("  Please enter y, n, or s.")
+
+
+# ---------------------------------------------------------------------------
+# Main command
+# ---------------------------------------------------------------------------
+
+@app.command()
+def dedup(
+    input_file: str = typer.Argument(
+        ...,
+        help="Path to the CSV or Excel file to deduplicate.",
+    ),
+    output: Optional[str] = typer.Option(
+        None, "--output", "-o",
+        help="Output file path. Default: {input}_deduplicated.csv",
+    ),
+    apply: bool = typer.Option(
+        False, "--apply",
+        help="Write the output file. Without this flag, only a preview is shown.",
+    ),
+    key: Optional[str] = typer.Option(
+        None, "--key", "-k",
+        help="Comma-separated strong-key columns (e.g., 'fb_id,ein'). Each is an independent exact-match dedup key.",
+    ),
+    subset: Optional[str] = typer.Option(
+        None, "--subset", "-s",
+        help="Comma-separated columns to match on (default: auto-detect).",
+    ),
+    fuzzy: Optional[str] = typer.Option(
+        None, "--fuzzy",
+        help="Comma-separated columns to fuzzy-match (others use exact match).",
+    ),
+    algorithm: str = typer.Option(
+        "jaro_winkler", "--algorithm", "-a",
+        help="Fuzzy algorithm: levenshtein, jaro_winkler, or token_set_ratio.",
+    ),
+    threshold: int = typer.Option(
+        85, "--threshold", "-t",
+        help="Similarity threshold 0-100 for fuzzy matching.",
+    ),
+    normalize: Optional[str] = typer.Option(
+        None, "--normalize",
+        help="Column normalizers as 'col:type' pairs (e.g., 'email:email,phone:phone').",
+    ),
+    survivor: str = typer.Option(
+        "first", "--survivor",
+        help="Survivor rule: first, last, most-complete, or most-recent.",
+    ),
+    date_column: Optional[str] = typer.Option(
+        None, "--date-column",
+        help="Date column for most-recent survivor rule.",
+    ),
+    merge: bool = typer.Option(
+        False, "--merge",
+        help="Fill missing fields in the surviving row from removed duplicates.",
+    ),
+    review: bool = typer.Option(
+        False, "--review",
+        help="Interactively review each match group before merging.",
+    ),
+    config: Optional[str] = typer.Option(
+        None, "--config",
+        help="Load settings from a saved JSON config file.",
+    ),
+    save_config: Optional[str] = typer.Option(
+        None, "--save-config",
+        help="Save current settings to a JSON config file.",
+    ),
+    sheet: Optional[str] = typer.Option(
+        None, "--sheet",
+        help="Excel sheet name or index (default: first sheet).",
+    ),
+    encoding_override: Optional[str] = typer.Option(
+        None, "--encoding",
+        help="Override auto-detected file encoding.",
+    ),
+    header_row: Optional[int] = typer.Option(
+        None, "--header-row",
+        help="0-based row index for the header (default: auto-detect).",
+    ),
+):
+    """Find and remove duplicate rows in CSV and Excel files."""
+    from src.core.io import read_file, write_file, list_sheets
+    from src.core.dedup import (
+        Algorithm, ColumnMatchStrategy, MatchStrategy, SurvivorRule,
+        build_default_strategies, deduplicate,
+    )
+    from src.core.normalizers import NormalizerType
+    from src.core.config import DeduplicationConfig
+
+    # Setup
+    input_path = Path(input_file)
+    if not input_path.exists():
+        typer.echo(f"Error: File not found: {input_path}", err=True)
+        raise typer.Exit(1)
+
+    log_path = _setup_logging(Path("logs"))
+
+    # Load config if provided
+    cfg: Optional[DeduplicationConfig] = None
+    if config:
+        config_path = Path(config)
+        if not config_path.exists():
+            typer.echo(f"Error: Config file not found: {config_path}", err=True)
+            raise typer.Exit(1)
+        cfg = DeduplicationConfig.from_file(config_path)
+        logger.info("Loaded config from {}", config_path)
+
+    # Read input
+    typer.echo(f"Reading {input_path.name}...")
+    try:
+        sheet_arg: str | int | None = None
+        if sheet is not None:
+            try:
+                sheet_arg = int(sheet)
+            except ValueError:
+                sheet_arg = sheet
+
+        df = read_file(
+            input_path,
+            encoding=encoding_override,
+            header_row=header_row,
+            sheet_name=sheet_arg if sheet_arg is not None else 0,
+        )
+        if not isinstance(df, __import__("pandas").DataFrame):
+            # chunked reading returns generator — materialise for v1
+            import pandas as pd
+            df = pd.concat(list(df), ignore_index=True)
+    except Exception as e:
+        typer.echo(f"Error reading file: {e}", err=True)
+        raise typer.Exit(1)
+
+    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")
+    available_columns = list(df.columns)
+
+    # Build strategies
+    strategies: Optional[list[MatchStrategy]] = None
+
+    if cfg and cfg.strategies:
+        strategies = cfg.to_strategies()
+    elif subset or fuzzy:
+        # Build from CLI flags
+        normalize_map = _parse_normalize_map(normalize)
+        strategies = []
+
+        fuzzy_cols = set(c.strip() for c in fuzzy.split(",")) if fuzzy else set()
+        if subset:
+            subset_cols = [c.strip() for c in subset.split(",")]
+        elif fuzzy_cols:
+            # When only --fuzzy is given, match on just those columns
+            subset_cols = list(fuzzy_cols)
+        else:
+            subset_cols = available_columns
+
+        _validate_columns(subset_cols, available_columns)
+        if fuzzy_cols:
+            _validate_columns(list(fuzzy_cols), available_columns)
+
+        col_strats: list[ColumnMatchStrategy] = []
+        for col in subset_cols:
+            norm = None
+            if col in normalize_map:
+                norm = NormalizerType(normalize_map[col])
+
+            if col in fuzzy_cols:
+                algo = Algorithm(algorithm)
+                thresh = float(threshold)
+            else:
+                algo = Algorithm.EXACT
+                thresh = 100.0
+
+            col_strats.append(ColumnMatchStrategy(
+                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
+            ))
+
+        strategies = [MatchStrategy(column_strategies=col_strats)]
+
+    # Apply normalizer overrides even with auto-detect
+    if normalize and strategies is None:
+        normalize_map = _parse_normalize_map(normalize)
+        auto_strats = build_default_strategies(df)
+        # Inject normalize_map into auto strategies
+        for strat in auto_strats:
+            for cs in strat.column_strategies:
+                if cs.column in normalize_map:
+                    cs.normalizer = NormalizerType(normalize_map[cs.column])
+        strategies = auto_strats
+
+    # --key: add user-declared strong keys as standalone exact-match strategies
+    if key:
+        key_cols = [c.strip() for c in key.split(",")]
+        _validate_columns(key_cols, available_columns)
+        key_strats = [
+            MatchStrategy(column_strategies=[
+                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
+            ])
+            for col in key_cols
+        ]
+        if strategies is None:
+            # Combine with auto-detect so user gets both
+            strategies = build_default_strategies(df) + key_strats
+        else:
+            strategies.extend(key_strats)
+
+    # Survivor rule
+    survivor_map = {
+        "first": SurvivorRule.KEEP_FIRST,
+        "last": SurvivorRule.KEEP_LAST,
+        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
+        "most_complete": SurvivorRule.KEEP_MOST_COMPLETE,
+        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
+        "most_recent": SurvivorRule.KEEP_MOST_RECENT,
+    }
+    if cfg:
+        surv_rule = cfg.to_survivor_rule()
+        do_merge = cfg.merge
+        dc = cfg.date_column
+    else:
+        surv_key = survivor.lower().replace("-", "_")
+        if surv_key not in {r.value for r in SurvivorRule} and surv_key not in survivor_map:
+            typer.echo(
+                f"Error: Unknown survivor rule '{survivor}'. "
+                f"Choose from: first, last, most-complete, most-recent.",
+                err=True,
+            )
+            raise typer.Exit(1)
+        surv_rule = survivor_map.get(survivor.lower(), SurvivorRule(surv_key))
+        do_merge = merge
+        dc = date_column
+
+    # Save config if requested
+    if save_config:
+        from src.core.config import DeduplicationConfig, StrategyConfig, ColumnStrategyConfig
+        save_cfg = DeduplicationConfig(
+            survivor_rule=surv_rule.value,
+            date_column=dc,
+            merge=do_merge,
+            subset_columns=[c.strip() for c in subset.split(",")] if subset else None,
+            fuzzy_columns=[c.strip() for c in fuzzy.split(",")] if fuzzy else None,
+            default_algorithm=algorithm,
+            default_threshold=float(threshold),
+            normalize_map=_parse_normalize_map(normalize),
+        )
+        if strategies:
+            save_cfg.strategies = [
+                StrategyConfig(columns=[
+                    ColumnStrategyConfig(
+                        column=cs.column,
+                        algorithm=cs.algorithm.value,
+                        threshold=cs.threshold,
+                        normalizer=cs.normalizer.value if cs.normalizer else None,
+                    )
+                    for cs in s.column_strategies
+                ])
+                for s in strategies
+            ]
+        saved = save_cfg.to_file(save_config)
+        typer.echo(f"Config saved to {saved}")
+
+    # Progress bar
+    progress_cb = None
+    if len(df) > 10_000:
+        from tqdm import tqdm
+        pbar = tqdm(total=len(df) * (len(df) - 1) // 2, desc="Comparing rows",
+                    unit="pairs", leave=False)
+
+        def _progress(current: int, total: int):
+            pbar.update(current - pbar.n)
+            if current >= total:
+                pbar.close()
+
+        progress_cb = _progress
+
+    # Review callback
+    review_cb = _interactive_review if review else None
+
+    # Run dedup
+    typer.echo("Finding duplicates...")
+    result = deduplicate(
+        df,
+        strategies=strategies,
+        survivor_rule=surv_rule,
+        date_column=dc,
+        merge=do_merge,
+        preview=not apply,
+        review_callback=review_cb,
+        progress_callback=progress_cb,
+    )
+
+    # Print results
+    _print_results(result, input_path)
+
+    # Write output files
+    if apply:
+        stem = input_path.stem
+        suffix = input_path.suffix
+
+        out_path = Path(output) if output else input_path.parent / f"{stem}_deduplicated.csv"
+        write_file(result.deduplicated_df, out_path)
+        typer.echo(f"\nDeduplicated file: {out_path}")
+
+        if not result.removed_df.empty:
+            removed_path = input_path.parent / f"{stem}_removed.csv"
+            write_file(result.removed_df, removed_path)
+            typer.echo(f"Removed rows:     {removed_path}")
+
+        if result.match_groups:
+            groups_path = input_path.parent / f"{stem}_match_groups.csv"
+            _write_match_groups(result, df, groups_path)
+            typer.echo(f"Match groups:     {groups_path}")
+    else:
+        typer.echo("\nThis was a preview. Add --apply to write the output files.")
+
+    typer.echo(f"Log: {log_path}")
+
+
+# ---------------------------------------------------------------------------
+# Output formatting
+# ---------------------------------------------------------------------------
+
+def _print_results(result, input_path: Path) -> None:
+    """Print a human-readable summary."""
+    removed = result.original_row_count - len(result.deduplicated_df)
+    typer.echo(f"\n{'─'*50}")
+    typer.echo(f"  File:      {input_path.name}")
+    typer.echo(f"  Rows in:   {result.original_row_count}")
+    typer.echo(f"  Rows out:  {len(result.deduplicated_df)}")
+    typer.echo(f"  Removed:   {removed}")
+    typer.echo(f"  Groups:    {len(result.match_groups)}")
+    typer.echo(f"{'─'*50}")
+
+    if result.match_groups:
+        typer.echo("\nMatch groups:")
+        for g in result.match_groups[:20]:  # cap display
+            rows_str = ", ".join(str(i + 1) for i in g.row_indices)
+            surv = g.survivor_index + 1
+            typer.echo(
+                f"  Group {g.group_id + 1}: rows [{rows_str}] "
+                f"→ keep row {surv} "
+                f"(confidence: {g.confidence:.1f}%, "
+                f"matched on: {', '.join(g.matched_on)})"
+            )
+        if len(result.match_groups) > 20:
+            typer.echo(f"  ... and {len(result.match_groups) - 20} more groups")
+
+
+def _write_match_groups(result, original_df, path: Path) -> None:
+    """Write match groups to a CSV for audit."""
+    import pandas as pd
+    from src.core.io import write_file
+
+    rows = []
+    for g in result.match_groups:
+        for idx in g.row_indices:
+            row_data = {"_group_id": g.group_id + 1}
+            row_data["_is_survivor"] = idx == g.survivor_index
+            row_data["_confidence"] = g.confidence
+            row_data["_matched_on"] = ", ".join(g.matched_on)
+            row_data["_original_row"] = idx + 1
+            # Include original data
+            for col in original_df.columns:
+                row_data[col] = original_df.iloc[idx].get(col, "")
+            rows.append(row_data)
+
+    groups_df = pd.DataFrame(rows)
+    write_file(groups_df, path)
+
+
+# ---------------------------------------------------------------------------
+# __main__ support
+# ---------------------------------------------------------------------------
+
+def main():
+    app()
+
+
+if __name__ == "__main__":
+    main()