feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -0,0 +1,413 @@
+"""Reusable Streamlit widgets for the deduplicator GUI."""
+
+from __future__ import annotations
+
+import io
+from typing import Optional
+
+import pandas as pd
+import streamlit as st
+
+from src.core.dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    DeduplicationResult,
+    MatchResult,
+    MatchStrategy,
+    SurvivorRule,
+)
+from src.core.config import (
+    ColumnStrategyConfig,
+    DeduplicationConfig,
+    StrategyConfig,
+)
+from src.core.normalizers import NormalizerType
+
+
+# ---------------------------------------------------------------------------
+# Config panel (advanced options)
+# ---------------------------------------------------------------------------
+
+def config_panel(df: pd.DataFrame) -> dict:
+    """Render the Advanced Options expander. Returns a settings dict.
+
+    The expander holds three rows of widgets: column pickers next to the
+    fuzzy/survivor settings, per-column normalizers next to the merge/date
+    options, and a load/save row for config profiles. The selections are
+    turned into match strategies by ``_build_strategies``.
+
+    Args:
+        df: Only its column labels are read, to populate the pickers.
+
+    Keys returned:
+        strategies: list[MatchStrategy] | None
+        survivor_rule: SurvivorRule
+        date_column: str | None
+        merge: bool
+    """
+    import json  # used by both the config load and the config save path
+
+    columns = list(df.columns)
+
+    with st.expander("Advanced Options"):
+        col_left, col_right = st.columns(2)
+
+        with col_left:
+            subset_cols = st.multiselect(
+                "Match on columns",
+                columns,
+                default=[],
+                help="Leave empty to auto-detect based on column names.",
+            )
+            key_cols = st.multiselect(
+                "Strong keys",
+                columns,
+                default=[],
+                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
+            )
+            fuzzy_cols = st.multiselect(
+                "Fuzzy columns",
+                columns,
+                default=[],
+                help="Columns to fuzzy-match. Others use exact matching.",
+            )
+
+        with col_right:
+            algorithm = st.selectbox(
+                "Fuzzy algorithm",
+                ["jaro_winkler", "levenshtein", "token_set_ratio"],
+                index=0,
+                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
+            )
+            threshold = st.slider(
+                "Similarity threshold",
+                min_value=50,
+                max_value=100,
+                value=85,
+                help="Lower = more matches but more false positives.",
+            )
+            survivor = st.selectbox(
+                "Survivor rule",
+                ["first", "last", "most-complete", "most-recent"],
+                index=0,
+                help="Which row to keep when duplicates are found.",
+            )
+
+        # Second row of options
+        col_a, col_b = st.columns(2)
+
+        with col_a:
+            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
+
+            normalize_map: dict[str, str] = {}
+            # Offer a normalizer for exactly the columns _build_strategies
+            # matches on (explicit subset first, fuzzy columns otherwise), so
+            # every choice made here is actually applied to a strategy.
+            target_cols = subset_cols or fuzzy_cols
+            if target_cols:
+                st.markdown("**Per-column normalizers**")
+                for col_name in target_cols:
+                    norm = st.selectbox(
+                        f"Normalizer for '{col_name}'",
+                        normalizer_types,
+                        index=0,
+                        key=f"norm_{col_name}",
+                    )
+                    # "auto" and "none" both leave the column without an
+                    # explicit normalizer entry.
+                    if norm not in ("auto", "none"):
+                        normalize_map[col_name] = norm
+
+        with col_b:
+            merge = st.checkbox(
+                "Merge mode",
+                value=False,
+                help="Fill missing fields in the surviving row from removed duplicates.",
+            )
+            date_column: Optional[str] = None
+            if survivor == "most-recent":
+                date_column = st.selectbox(
+                    "Date column",
+                    columns,
+                    help="Required for most-recent survivor rule.",
+                )
+
+        # Config save/load
+        st.divider()
+        cfg_left, cfg_right = st.columns(2)
+
+        with cfg_left:
+            config_file = st.file_uploader(
+                "Load config profile",
+                type=["json"],
+                help="Load previously saved settings.",
+                key="config_upload",
+            )
+            if config_file is not None:
+                # Any parse/validation failure is reported in the UI rather
+                # than aborting the page.
+                try:
+                    data = json.loads(config_file.read())
+                    loaded = DeduplicationConfig.from_dict(data)
+                    # Stored for other code to pick up; this function does
+                    # not read it back into the widgets above.
+                    st.session_state["loaded_config"] = loaded
+                    st.success("Config loaded.")
+                except Exception as e:
+                    st.error(f"Failed to load config: {e}")
+
+        with cfg_right:
+            # NOTE: st.button is only truthy on the rerun triggered by the
+            # click, so the download button below is rendered for that run only.
+            if st.button("Save current settings"):
+                cfg = _build_config(
+                    subset_cols, key_cols, fuzzy_cols,
+                    algorithm, threshold, normalize_map,
+                    survivor, date_column, merge,
+                )
+                cfg_json = cfg.to_dict()
+                st.download_button(
+                    "Download config JSON",
+                    data=json.dumps(cfg_json, indent=2),
+                    file_name="dedup_config.json",
+                    mime="application/json",
+                )
+
+    # Build strategies from selections
+    strategies = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+
+    # Survivor rule mapping
+    survivor_map = {
+        "first": SurvivorRule.KEEP_FIRST,
+        "last": SurvivorRule.KEEP_LAST,
+        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
+        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
+    }
+
+    return {
+        "strategies": strategies,
+        "survivor_rule": survivor_map[survivor],
+        "date_column": date_column,
+        "merge": merge,
+    }
+
+
+def _build_strategies(
+    subset_cols: list[str],
+    key_cols: list[str],
+    fuzzy_cols: list[str],
+    algorithm: str,
+    threshold: int,
+    normalize_map: dict[str, str],
+) -> Optional[list[MatchStrategy]]:
+    """Translate the GUI selections into a list of MatchStrategy objects.
+
+    One combined strategy is produced for the explicitly chosen columns
+    (``subset_cols`` when non-empty, otherwise ``fuzzy_cols``), followed by one
+    single-column exact strategy per strong key.
+
+    Returns:
+        The strategies in that order, or None when nothing was selected
+        (auto-detect).
+    """
+    built: list[MatchStrategy] = []
+
+    chosen_cols = subset_cols or fuzzy_cols
+    if chosen_cols:
+        fuzzy_lookup = set(fuzzy_cols)
+
+        def _column_strategy(name: str) -> ColumnMatchStrategy:
+            """Fuzzy columns get the chosen algorithm; the rest match exactly."""
+            normalizer = (
+                NormalizerType(normalize_map[name]) if name in normalize_map else None
+            )
+            if name not in fuzzy_lookup:
+                return ColumnMatchStrategy(
+                    column=name,
+                    algorithm=Algorithm.EXACT,
+                    threshold=100.0,
+                    normalizer=normalizer,
+                )
+            return ColumnMatchStrategy(
+                column=name,
+                algorithm=Algorithm(algorithm),
+                threshold=float(threshold),
+                normalizer=normalizer,
+            )
+
+        built.append(
+            MatchStrategy(column_strategies=[_column_strategy(c) for c in chosen_cols])
+        )
+
+    # Every strong key stands alone as its own exact-match strategy.
+    for key_name in key_cols or ():
+        exact_key = ColumnMatchStrategy(
+            column=key_name, algorithm=Algorithm.EXACT, threshold=100.0,
+        )
+        built.append(MatchStrategy(column_strategies=[exact_key]))
+
+    return built or None
+
+
+def _build_config(
+    subset_cols, key_cols, fuzzy_cols,
+    algorithm, threshold, normalize_map,
+    survivor, date_column, merge,
+) -> DeduplicationConfig:
+    """Assemble a DeduplicationConfig that mirrors the current GUI state.
+
+    The scalar settings are copied onto the config (the survivor label has
+    its hyphens turned into underscores, empty selections become None). If
+    the selections produce any match strategies, they are attached as
+    StrategyConfig / ColumnStrategyConfig entries.
+    """
+    config = DeduplicationConfig(
+        survivor_rule=survivor.replace("-", "_"),
+        date_column=date_column,
+        merge=merge,
+        subset_columns=subset_cols or None,
+        fuzzy_columns=fuzzy_cols or None,
+        default_algorithm=algorithm,
+        default_threshold=float(threshold),
+        normalize_map=normalize_map or None,
+    )
+
+    built = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+    if not built:
+        return config
+
+    def _column_config(part) -> ColumnStrategyConfig:
+        """Flatten one column strategy into its plain-value config form."""
+        return ColumnStrategyConfig(
+            column=part.column,
+            algorithm=part.algorithm.value,
+            threshold=part.threshold,
+            normalizer=part.normalizer.value if part.normalizer else None,
+        )
+
+    strategy_configs = []
+    for strategy in built:
+        column_configs = [_column_config(part) for part in strategy.column_strategies]
+        strategy_configs.append(StrategyConfig(columns=column_configs))
+    config.strategies = strategy_configs
+    return config
+
+
+# ---------------------------------------------------------------------------
+# Match group review card
+# ---------------------------------------------------------------------------
+
+def match_group_card(
+    group: MatchResult,
+    df: pd.DataFrame,
+    group_num: int,
+) -> Optional[bool]:
+    """Render an expandable match group card with side-by-side diff.
+
+    The card lists every row of the group (columns whose name starts with
+    ``_norm_`` are hidden), tints cells that disagree with the group's first
+    row, and offers Merge / Keep Both buttons. Groups with a confidence
+    below 95 start expanded.
+
+    Args:
+        group: Match group to show; ``confidence``, ``matched_on``,
+            ``row_indices`` and ``group_id`` are read.
+        df: Frame that the positional ``row_indices`` refer to.
+        group_num: Number shown in the card title.
+
+    Returns:
+        True  — user clicked Merge (accept match)
+        False — user clicked Keep Both (reject match)
+        None  — neither button was clicked on this run; a decision already
+                stored in ``st.session_state["review_decisions"]`` is only
+                displayed, not returned
+    """
+    confidence = group.confidence
+    auto_expand = confidence < 95.0
+    matched_on = ", ".join(group.matched_on)
+    n_rows = len(group.row_indices)
+
+    label = (
+        f"Group {group_num}: {n_rows} rows "
+        f"(confidence: {confidence:.0f}%) "
+        f"[{matched_on}]"
+    )
+
+    def _cell_text(value: object) -> str:
+        """Return a cell's stripped text, or "" for a missing scalar.
+
+        Without this, None/NaN would stringify to "None"/"nan" and be
+        treated as real content by the highlighter.
+        """
+        if pd.api.types.is_scalar(value) and pd.isna(value):
+            return ""
+        return str(value).strip()
+
+    def _highlight_diffs(s: pd.Series) -> list[str]:
+        """Highlight cells that differ from the first row."""
+        first_val = _cell_text(s.iloc[0]) if len(s) > 0 else ""
+        styles = []
+        for val in s:
+            val_str = _cell_text(val)
+            if val_str != first_val and val_str and first_val:
+                # Both rows have a value and the values disagree.
+                styles.append("background-color: rgba(245, 166, 35, 0.2)")
+            elif not val_str and first_val:
+                # The first row has a value that this row is missing.
+                styles.append("background-color: rgba(240, 82, 82, 0.1)")
+            else:
+                styles.append("")
+        return styles
+
+    with st.expander(label, expanded=auto_expand):
+        # Build comparison DataFrame
+        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+        rows_data = []
+        for idx in group.row_indices:
+            source_row = df.iloc[idx]  # fetch the row once, not once per column
+            row = {"_row": idx + 1}  # 1-based row number for display
+            for col in display_cols:
+                row[col] = source_row.get(col, "")
+            rows_data.append(row)
+
+        compare_df = pd.DataFrame(rows_data)
+        compare_df = compare_df.set_index("_row")
+
+        # Highlight differences column by column
+        styled = compare_df.style.apply(_highlight_diffs, axis=0)
+        st.dataframe(styled, use_container_width=True)
+
+        # Action buttons (third column is left empty as a spacer)
+        btn_left, btn_mid, _ = st.columns(3)
+        merge_key = f"merge_{group.group_id}"
+        keep_key = f"keep_{group.group_id}"
+
+        with btn_left:
+            if st.button("Merge", key=merge_key, type="primary"):
+                return True
+        with btn_mid:
+            if st.button("Keep Both", key=keep_key):
+                return False
+
+        # Check session state for previous decisions
+        decisions = st.session_state.get("review_decisions", {})
+        if group.group_id in decisions:
+            decision = decisions[group.group_id]
+            if decision is True:
+                st.success("Decision: Merge")
+            elif decision is False:
+                st.info("Decision: Keep Both")
+
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Results summary + downloads
+# ---------------------------------------------------------------------------
+
+def results_summary(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> None:
+    """Show the headline counts for a run and offer its CSV downloads.
+
+    Four metrics are rendered (rows in, rows out, removed, groups), followed
+    by up to three download buttons: the deduplicated data, the removed rows
+    (only when there are any) and the match groups report (only when groups
+    exist).
+    """
+    rows_in = result.original_row_count
+    kept = result.deduplicated_df
+    dropped_count = rows_in - len(kept)
+
+    def _csv_payload(frame: pd.DataFrame) -> bytes:
+        """Serialise a frame to CSV bytes, without index, as UTF-8 with BOM."""
+        return frame.to_csv(index=False).encode("utf-8-sig")
+
+    # Summary metrics
+    in_cell, out_cell, removed_cell, groups_cell = st.columns(4)
+    in_cell.metric("Rows In", rows_in)
+    out_cell.metric("Rows Out", len(kept))
+    removed_cell.metric("Removed", dropped_count)
+    groups_cell.metric("Groups", len(result.match_groups))
+
+    st.divider()
+
+    # Download buttons
+    left_slot, middle_slot, right_slot = st.columns(3)
+
+    with left_slot:
+        st.download_button(
+            "Download Deduplicated CSV",
+            data=_csv_payload(kept),
+            file_name="deduplicated.csv",
+            mime="text/csv",
+        )
+
+    with middle_slot:
+        dropped_rows = result.removed_df
+        if not dropped_rows.empty:
+            st.download_button(
+                "Download Removed Rows",
+                data=_csv_payload(dropped_rows),
+                file_name="removed_rows.csv",
+                mime="text/csv",
+            )
+
+    with right_slot:
+        if result.match_groups:
+            st.download_button(
+                "Download Match Groups Report",
+                data=_build_match_groups_csv(result, original_df),
+                file_name="match_groups.csv",
+                mime="text/csv",
+            )
+
+
+def _build_match_groups_csv(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> bytes:
+    """Build the match groups audit CSV as bytes.
+
+    One CSV line is written per member row of every match group: five audit
+    fields (``group_id + 1``, survivor flag, confidence, matched columns and
+    the 1-based row position) followed by the row's own values. Columns
+    whose name starts with ``_norm_`` are left out.
+
+    Args:
+        result: Only ``match_groups`` is read; each group supplies
+            ``group_id``, ``row_indices``, ``survivor_index``, ``confidence``
+            and ``matched_on``.
+        original_df: Frame that the positional ``row_indices`` refer to.
+
+    Returns:
+        The CSV text encoded as UTF-8 with a BOM ("utf-8-sig").
+    """
+    # Resolve the exported columns once instead of re-testing every cell.
+    export_cols = [
+        col for col in original_df.columns if not str(col).startswith("_norm_")
+    ]
+    total_rows = len(original_df)
+
+    rows = []
+    for g in result.match_groups:
+        matched_on = ", ".join(g.matched_on)
+        for idx in g.row_indices:
+            # Fetch the source row a single time; an index past the end of
+            # the frame yields blank cells instead of raising.
+            source_row = original_df.iloc[idx] if idx < total_rows else None
+            row_data = {
+                "_group_id": g.group_id + 1,
+                "_is_survivor": idx == g.survivor_index,
+                "_confidence": g.confidence,
+                "_matched_on": matched_on,
+                "_original_row": idx + 1,
+            }
+            for col in export_cols:
+                row_data[col] = source_row.get(col, "") if source_row is not None else ""
+            rows.append(row_data)
+
+    groups_df = pd.DataFrame(rows)
+    return groups_df.to_csv(index=False).encode("utf-8-sig")