feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions
--- a/src/gui/__init__.py
+++ b/src/gui/__init__.py
@@ -0,0 +1 @@
+"""Streamlit GUI for the DataTools Deduplicator."""
--- a/src/gui/__main__.py
+++ b/src/gui/__main__.py
@@ -0,0 +1,8 @@
+"""Allow running as ``python -m src.gui``."""
+
+import subprocess
+import sys
+from pathlib import Path
+
+# app.py lives next to this file; Streamlit is started with the current interpreter.
+app_path = Path(__file__).parent / "app.py"
+completed = subprocess.run([sys.executable, "-m", "streamlit", "run", str(app_path)])
+# Hand Streamlit's exit status back to the caller instead of always exiting 0.
+sys.exit(completed.returncode)
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -0,0 +1,287 @@
+"""DataTools Deduplicator — Streamlit GUI.
+
+Launch:
+    streamlit run src/gui/app.py
+"""
+
+from __future__ import annotations
+
+import io
+import sys
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+
+# Ensure project root is on sys.path so `src.core` imports work.
+# This file is src/gui/app.py, so the root is three `.parent` hops up; the
+# `src.*` imports that follow rely on this insert having already run.
+_project_root = Path(__file__).resolve().parent.parent.parent
+if str(_project_root) not in sys.path:
+    sys.path.insert(0, str(_project_root))
+
+from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
+from src.core.io import read_file, list_sheets
+from src.core.config import DeduplicationConfig
+from src.gui.components import config_panel, match_group_card, results_summary
+
+
+# ---------------------------------------------------------------------------
+# Page config
+# ---------------------------------------------------------------------------
+
+# Browser-tab title/icon and a full-width layout.
+st.set_page_config(page_title="DataTools Deduplicator", page_icon="🔍", layout="wide")
+
+# ---------------------------------------------------------------------------
+# Session state defaults
+# ---------------------------------------------------------------------------
+
+# Keys the rest of the script reads from st.session_state, with their
+# initial values.
+_DEFAULTS = dict(
+    df=None,
+    result=None,
+    review_decisions={},
+    config=None,
+    file_name="",
+    sheet_names=[],
+)
+for key, default in _DEFAULTS.items():
+    # Only seed keys that are missing, so values from earlier reruns survive.
+    if key in st.session_state:
+        continue
+    st.session_state[key] = default
+
+
+# ---------------------------------------------------------------------------
+# Header
+# ---------------------------------------------------------------------------
+
+st.title("DataTools Deduplicator")
+st.caption("Find and remove duplicate rows in CSV and Excel files.")
+
+
+# ---------------------------------------------------------------------------
+# File upload
+# ---------------------------------------------------------------------------
+
+_ACCEPTED_TYPES = ["csv", "tsv", "xlsx", "xls"]
+uploaded = st.file_uploader(
+    "Upload CSV or Excel file",
+    type=_ACCEPTED_TYPES,
+    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
+)
+
+def _read_upload(uploaded_file, sheet_name: str | None = None):
+    """Parse an uploaded file into one DataFrame via a temporary file.
+
+    ``read_file()`` needs a filesystem path, so the upload is first written
+    to a temp file. That file is removed in a ``finally`` clause, so a
+    failed write or parse cannot leave it behind.
+
+    Args:
+        uploaded_file: The object returned by ``st.file_uploader``.
+        sheet_name: Sheet to read. ``None`` calls ``read_file()`` without a
+            ``sheet_name`` argument.
+
+    Returns:
+        ``(df, sheet_names)``. ``sheet_names`` is looked up only when
+        ``sheet_name`` is ``None`` and the upload has an ``.xlsx``/``.xls``
+        suffix; otherwise it is an empty list.
+    """
+    import tempfile
+
+    suffix = Path(uploaded_file.name).suffix
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    tmp_path = Path(tmp.name)
+    try:
+        with tmp:
+            tmp.write(uploaded_file.getvalue())
+
+        sheet_names: list[str] = []
+        if sheet_name is None:
+            if suffix.lower() in (".xlsx", ".xls"):
+                sheet_names = list_sheets(tmp_path)
+            loaded = read_file(tmp_path)
+        else:
+            loaded = read_file(tmp_path, sheet_name=sheet_name)
+
+        # read_file() may return an iterable of frames rather than one DataFrame
+        if not isinstance(loaded, pd.DataFrame):
+            loaded = pd.concat(list(loaded), ignore_index=True)
+        return loaded, sheet_names
+    finally:
+        tmp_path.unlink(missing_ok=True)
+
+
+if uploaded is not None:
+    # A different file name is treated as a new upload
+    if uploaded.name != st.session_state["file_name"]:
+        st.session_state["file_name"] = uploaded.name
+        st.session_state["result"] = None
+        st.session_state["review_decisions"] = {}
+        # Forget the previous file's sheet so the selector below re-reads
+        # whichever sheet it shows for the new file.
+        st.session_state["_current_sheet"] = None
+
+        try:
+            df, sheet_names = _read_upload(uploaded)
+        except Exception as e:
+            st.error(f"Failed to read file: {e}")
+            df, sheet_names = None, []
+
+        st.session_state["df"] = df
+        st.session_state["sheet_names"] = sheet_names
+
+    df = st.session_state["df"]
+
+    if df is not None:
+        # Sheet selector for Excel files
+        if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
+            sheet = st.selectbox(
+                "Select sheet",
+                st.session_state["sheet_names"],
+            )
+            if sheet != st.session_state.get("_current_sheet"):
+                try:
+                    df, _ = _read_upload(uploaded, sheet_name=sheet)
+                except Exception as e:
+                    # Keep the previously loaded data and leave the sheet
+                    # unrecorded so the read is retried on the next rerun.
+                    st.error(f"Failed to read sheet '{sheet}': {e}")
+                else:
+                    st.session_state["_current_sheet"] = sheet
+                    st.session_state["df"] = df
+                    # Results and decisions belong to the previous sheet's rows
+                    st.session_state["result"] = None
+                    st.session_state["review_decisions"] = {}
+
+        # Preview
+        st.subheader(f"Preview: {uploaded.name}")
+        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
+        st.dataframe(df.head(10), use_container_width=True)
+
+        # Advanced options
+        settings = config_panel(df)
+
+        # Apply loaded config if present: it overrides the widget-derived settings
+        loaded_cfg = st.session_state.get("loaded_config")
+        if loaded_cfg is not None:
+            settings["strategies"] = loaded_cfg.to_strategies()
+            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
+            settings["date_column"] = loaded_cfg.date_column
+            settings["merge"] = loaded_cfg.merge
+            # Consume the stored value; config_panel() stores it again whenever
+            # it successfully parses a file from its config uploader.
+            del st.session_state["loaded_config"]
+
+        # ---------------------------------------------------------------------------
+        # Find Duplicates button
+        # ---------------------------------------------------------------------------
+
+        st.divider()
+
+        if st.button("Find Duplicates", type="primary", use_container_width=True):
+            progress_bar = st.progress(0, text="Comparing rows...")
+
+            def _gui_progress(current: int, total: int) -> None:
+                """Mirror deduplicate()'s progress callback onto the progress bar."""
+                if total > 0:
+                    # Clamp so an overshooting `current` cannot exceed 100%
+                    pct = min(current / total, 1.0)
+                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
+
+            with st.spinner("Running deduplication..."):
+                result = deduplicate(
+                    df,
+                    strategies=settings["strategies"],
+                    survivor_rule=settings["survivor_rule"],
+                    date_column=settings["date_column"],
+                    merge=settings["merge"],
+                    preview=False,
+                    progress_callback=_gui_progress,
+                )
+
+            progress_bar.empty()
+            st.session_state["result"] = result
+            # A fresh run invalidates decisions made against the old groups
+            st.session_state["review_decisions"] = {}
+
+        # ---------------------------------------------------------------------------
+        # Results
+        # ---------------------------------------------------------------------------
+
+        result: DeduplicationResult | None = st.session_state["result"]
+
+        if result is not None:
+            st.divider()
+            st.subheader("Results")
+
+            # Summary + download buttons
+            results_summary(result, df)
+
+            # Match group review
+            if result.match_groups:
+                st.divider()
+                st.subheader("Match Groups")
+
+                # Batch actions; each one reruns so the cards show the new state
+                action_left, action_mid, action_right = st.columns(3)
+                with action_left:
+                    if st.button("Accept All"):
+                        for g in result.match_groups:
+                            st.session_state["review_decisions"][g.group_id] = True
+                        st.rerun()
+                with action_mid:
+                    if st.button("Reject All"):
+                        for g in result.match_groups:
+                            st.session_state["review_decisions"][g.group_id] = False
+                        st.rerun()
+                with action_right:
+                    if st.button("Clear Decisions"):
+                        st.session_state["review_decisions"] = {}
+                        st.rerun()
+
+                # Individual group cards
+                decisions = st.session_state["review_decisions"]
+                for i, group in enumerate(result.match_groups):
+                    decision = match_group_card(group, df, group_num=i + 1)
+                    if decision is not None:
+                        decisions[group.group_id] = decision
+                        st.session_state["review_decisions"] = decisions
+                        st.rerun()
+
+                # Show decision summary
+                if decisions:
+                    st.divider()
+                    accepted = sum(1 for v in decisions.values() if v is True)
+                    rejected = sum(1 for v in decisions.values() if v is False)
+                    pending = len(result.match_groups) - len(decisions)
+                    st.caption(
+                        f"Decisions: {accepted} merged, {rejected} kept both, "
+                        f"{pending} pending"
+                    )
+
+                    # Re-run dedup with review decisions applied
+                    if st.button(
+                        "Apply Review Decisions & Download",
+                        type="primary",
+                        use_container_width=True,
+                    ):
+                        def _review_callback(group, _df):
+                            """Return the stored decision for a group; undecided groups are accepted."""
+                            gid = group.group_id
+                            if gid in decisions:
+                                return decisions[gid]
+                            return True  # default: accept
+
+                        reviewed_result = deduplicate(
+                            df,
+                            strategies=settings["strategies"],
+                            survivor_rule=settings["survivor_rule"],
+                            date_column=settings["date_column"],
+                            merge=settings["merge"],
+                            preview=False,
+                            review_callback=_review_callback,
+                        )
+
+                        # Update result and show downloads
+                        st.session_state["result"] = reviewed_result
+
+                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
+                            index=False
+                        ).encode("utf-8-sig")
+                        st.download_button(
+                            "Download Reviewed & Deduplicated CSV",
+                            data=csv_bytes,
+                            file_name="deduplicated_reviewed.csv",
+                            mime="text/csv",
+                            key="reviewed_download",
+                        )
+
+            # Log entries
+            if result.log_entries:
+                with st.expander("Processing Log"):
+                    st.code("\n".join(result.log_entries))
+
+else:
+    # No file uploaded — show placeholder
+    st.info("Upload a CSV or Excel file to get started.")
+
+
+# ---------------------------------------------------------------------------
+# Footer
+# ---------------------------------------------------------------------------
+
+# Rendered on every run, with or without an uploaded file.
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools Deduplicator v1.0")
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -0,0 +1,413 @@
+"""Reusable Streamlit widgets for the deduplicator GUI."""
+
+from __future__ import annotations
+
+import io
+from typing import Optional
+
+import pandas as pd
+import streamlit as st
+
+from src.core.dedup import (
+    Algorithm,
+    ColumnMatchStrategy,
+    DeduplicationResult,
+    MatchResult,
+    MatchStrategy,
+    SurvivorRule,
+)
+from src.core.config import (
+    ColumnStrategyConfig,
+    DeduplicationConfig,
+    StrategyConfig,
+)
+from src.core.normalizers import NormalizerType
+
+
+# ---------------------------------------------------------------------------
+# Config panel (advanced options)
+# ---------------------------------------------------------------------------
+
+def config_panel(df: pd.DataFrame) -> dict:
+    """Render the Advanced Options expander and return the chosen settings.
+
+    Besides the matching widgets, the expander offers a JSON profile upload
+    (a successfully parsed profile is stored in
+    ``st.session_state["loaded_config"]``) and a button that exports the
+    current selections as JSON.
+
+    Args:
+        df: The loaded data. Only its column names are used, to populate the
+            column pickers.
+
+    Returns:
+        A dict with the keys:
+            strategies: list[MatchStrategy] | None (``None`` = auto-detect)
+            survivor_rule: SurvivorRule
+            date_column: str | None
+            merge: bool
+    """
+    import json
+
+    columns = list(df.columns)
+
+    with st.expander("Advanced Options"):
+        col_left, col_right = st.columns(2)
+
+        with col_left:
+            subset_cols = st.multiselect(
+                "Match on columns",
+                columns,
+                default=[],
+                help="Leave empty to auto-detect based on column names.",
+            )
+            key_cols = st.multiselect(
+                "Strong keys",
+                columns,
+                default=[],
+                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
+            )
+            fuzzy_cols = st.multiselect(
+                "Fuzzy columns",
+                columns,
+                default=[],
+                help="Columns to fuzzy-match. Others use exact matching.",
+            )
+
+        with col_right:
+            algorithm = st.selectbox(
+                "Fuzzy algorithm",
+                ["jaro_winkler", "levenshtein", "token_set_ratio"],
+                index=0,
+                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
+            )
+            threshold = st.slider(
+                "Similarity threshold",
+                min_value=50,
+                max_value=100,
+                value=85,
+                help="Lower = more matches but more false positives.",
+            )
+            survivor = st.selectbox(
+                "Survivor rule",
+                ["first", "last", "most-complete", "most-recent"],
+                index=0,
+                help="Which row to keep when duplicates are found.",
+            )
+
+        # Second row of options
+        col_a, col_b = st.columns(2)
+
+        with col_a:
+            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
+
+            # Only explicit choices are recorded; "auto" and "none" both leave
+            # the column without an entry in the map.
+            normalize_map: dict[str, str] = {}
+            if fuzzy_cols or subset_cols:
+                target_cols = fuzzy_cols or subset_cols
+                st.markdown("**Per-column normalizers**")
+                for col_name in target_cols:
+                    norm = st.selectbox(
+                        f"Normalizer for '{col_name}'",
+                        normalizer_types,
+                        index=0,
+                        key=f"norm_{col_name}",
+                    )
+                    if norm not in ("auto", "none"):
+                        normalize_map[col_name] = norm
+
+        with col_b:
+            merge = st.checkbox(
+                "Merge mode",
+                value=False,
+                help="Fill missing fields in the surviving row from removed duplicates.",
+            )
+            # The date picker is only offered for the rule that needs it
+            date_column: Optional[str] = None
+            if survivor == "most-recent":
+                date_column = st.selectbox(
+                    "Date column",
+                    columns,
+                    help="Required for most-recent survivor rule.",
+                )
+
+        # Config save/load
+        st.divider()
+        cfg_left, cfg_right = st.columns(2)
+
+        with cfg_left:
+            config_file = st.file_uploader(
+                "Load config profile",
+                type=["json"],
+                help="Load previously saved settings.",
+                key="config_upload",
+            )
+            if config_file is not None:
+                try:
+                    # getvalue() returns the whole payload regardless of the
+                    # stream position; read() returns only what an earlier
+                    # read of the same object left unconsumed.
+                    data = json.loads(config_file.getvalue())
+                    loaded = DeduplicationConfig.from_dict(data)
+                    st.session_state["loaded_config"] = loaded
+                    st.success("Config loaded.")
+                except Exception as e:
+                    st.error(f"Failed to load config: {e}")
+
+        with cfg_right:
+            if st.button("Save current settings"):
+                cfg = _build_config(
+                    subset_cols, key_cols, fuzzy_cols,
+                    algorithm, threshold, normalize_map,
+                    survivor, date_column, merge,
+                )
+                cfg_json = cfg.to_dict()
+                st.download_button(
+                    "Download config JSON",
+                    data=json.dumps(cfg_json, indent=2),
+                    file_name="dedup_config.json",
+                    mime="application/json",
+                )
+
+    # Build strategies from selections
+    strategies = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+
+    # Survivor rule mapping (widget label -> enum member)
+    survivor_map = {
+        "first": SurvivorRule.KEEP_FIRST,
+        "last": SurvivorRule.KEEP_LAST,
+        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
+        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
+    }
+
+    return {
+        "strategies": strategies,
+        "survivor_rule": survivor_map[survivor],
+        "date_column": date_column,
+        "merge": merge,
+    }
+
+
+def _build_strategies(
+    subset_cols: list[str],
+    key_cols: list[str],
+    fuzzy_cols: list[str],
+    algorithm: str,
+    threshold: int,
+    normalize_map: dict[str, str],
+) -> Optional[list[MatchStrategy]]:
+    """Translate the GUI selections into MatchStrategy objects.
+
+    One combined strategy covers the matched columns (``subset_cols`` when
+    given, otherwise ``fuzzy_cols``); every strong key then adds its own
+    single-column exact strategy. Returns None when nothing was selected,
+    which means auto-detect.
+    """
+    fuzzy_lookup = set(fuzzy_cols)
+
+    def _column_strategy(name: str) -> ColumnMatchStrategy:
+        # Fuzzy columns use the chosen algorithm/threshold, the rest match exactly.
+        normalizer = NormalizerType(normalize_map[name]) if name in normalize_map else None
+        if name in fuzzy_lookup:
+            algo, cutoff = Algorithm(algorithm), float(threshold)
+        else:
+            algo, cutoff = Algorithm.EXACT, 100.0
+        return ColumnMatchStrategy(
+            column=name, algorithm=algo, threshold=cutoff, normalizer=normalizer,
+        )
+
+    built: list[MatchStrategy] = []
+
+    # Explicit column selection: subset columns win over fuzzy columns
+    matched_cols = subset_cols or fuzzy_cols
+    if matched_cols:
+        built.append(MatchStrategy(
+            column_strategies=[_column_strategy(name) for name in matched_cols],
+        ))
+
+    # Strong keys: one independent exact-match strategy each
+    for key_col in key_cols or ():
+        exact = ColumnMatchStrategy(column=key_col, algorithm=Algorithm.EXACT, threshold=100.0)
+        built.append(MatchStrategy(column_strategies=[exact]))
+
+    return built or None
+
+
+def _build_config(
+    subset_cols, key_cols, fuzzy_cols,
+    algorithm, threshold, normalize_map,
+    survivor, date_column, merge,
+) -> DeduplicationConfig:
+    """Assemble a DeduplicationConfig that mirrors the current GUI selections.
+
+    Empty selections are stored as None. When the selections produce
+    strategies, they are also attached in their config form.
+    """
+    cfg = DeduplicationConfig(
+        survivor_rule=survivor.replace("-", "_"),  # widget label -> underscore form
+        date_column=date_column,
+        merge=merge,
+        subset_columns=subset_cols or None,
+        fuzzy_columns=fuzzy_cols or None,
+        default_algorithm=algorithm,
+        default_threshold=float(threshold),
+        normalize_map=normalize_map or None,
+    )
+
+    built = _build_strategies(
+        subset_cols, key_cols, fuzzy_cols,
+        algorithm, threshold, normalize_map,
+    )
+    if not built:
+        return cfg
+
+    # Convert each runtime strategy into its config counterpart
+    strategy_cfgs = []
+    for strategy in built:
+        column_cfgs = []
+        for cs in strategy.column_strategies:
+            column_cfgs.append(ColumnStrategyConfig(
+                column=cs.column,
+                algorithm=cs.algorithm.value,
+                threshold=cs.threshold,
+                normalizer=cs.normalizer.value if cs.normalizer else None,
+            ))
+        strategy_cfgs.append(StrategyConfig(columns=column_cfgs))
+    cfg.strategies = strategy_cfgs
+    return cfg
+
+
+# ---------------------------------------------------------------------------
+# Match group review card
+# ---------------------------------------------------------------------------
+
+def match_group_card(
+    group: MatchResult,
+    df: pd.DataFrame,
+    group_num: int,
+) -> Optional[bool]:
+    """Render an expandable match group card with side-by-side diff.
+
+    The card lists the group's rows (looked up positionally in ``df``,
+    labelled with 1-based row numbers) and tints cells relative to the
+    group's first row. Groups below 95% confidence start expanded.
+
+    Args:
+        group: The match group to display.
+        df: The DataFrame that ``group.row_indices`` refer to.
+        group_num: 1-based number shown in the card label.
+
+    Returns:
+        True  — user clicked Merge (accept match)
+        False — user clicked Keep Both (reject match)
+        None  — no decision yet
+    """
+    confidence = group.confidence
+    auto_expand = confidence < 95.0
+    matched_on = ", ".join(group.matched_on)
+    n_rows = len(group.row_indices)
+
+    label = (
+        f"Group {group_num}: {n_rows} rows "
+        f"(confidence: {confidence:.0f}%) "
+        f"[{matched_on}]"
+    )
+
+    def _cell_text(value) -> str:
+        """Return a cell as stripped text; missing values (None/NaN/NaT) become ''."""
+        if pd.api.types.is_scalar(value) and pd.isna(value):
+            return ""
+        return str(value).strip()
+
+    with st.expander(label, expanded=auto_expand):
+        # Build comparison DataFrame; internal "_norm_" helper columns are hidden
+        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+        rows_data = []
+        for idx in group.row_indices:
+            source_row = df.iloc[idx]  # fetch the row once, not once per column
+            row = {"_row": idx + 1}
+            for col in display_cols:
+                row[col] = source_row.get(col, "")
+            rows_data.append(row)
+
+        compare_df = pd.DataFrame(rows_data)
+        compare_df = compare_df.set_index("_row")
+
+        # Highlight differences
+        def _highlight_diffs(s: pd.Series) -> list[str]:
+            """Style one column: amber where a value differs from the first row,
+            red where it is blank although the first row has a value."""
+            texts = [_cell_text(val) for val in s]
+            first_val = texts[0] if texts else ""
+            styles = []
+            for val_str in texts:
+                if val_str != first_val and val_str and first_val:
+                    styles.append("background-color: rgba(245, 166, 35, 0.2)")
+                elif not val_str and first_val:
+                    styles.append("background-color: rgba(240, 82, 82, 0.1)")
+                else:
+                    styles.append("")
+            return styles
+
+        styled = compare_df.style.apply(_highlight_diffs, axis=0)
+        st.dataframe(styled, use_container_width=True)
+
+        # Action buttons (third column is intentionally left empty)
+        btn_left, btn_mid, _ = st.columns(3)
+        merge_key = f"merge_{group.group_id}"
+        keep_key = f"keep_{group.group_id}"
+
+        with btn_left:
+            if st.button("Merge", key=merge_key, type="primary"):
+                return True
+        with btn_mid:
+            if st.button("Keep Both", key=keep_key):
+                return False
+
+        # Check session state for previous decisions
+        decisions = st.session_state.get("review_decisions", {})
+        if group.group_id in decisions:
+            decision = decisions[group.group_id]
+            if decision is True:
+                st.success("Decision: Merge")
+            elif decision is False:
+                st.info("Decision: Keep Both")
+
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Results summary + downloads
+# ---------------------------------------------------------------------------
+
+def results_summary(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> None:
+    """Show the headline counts and the CSV download buttons for a result.
+
+    The deduplicated CSV is always offered. The removed-rows and
+    match-groups downloads appear only when there is something to export.
+    """
+    rows_in = result.original_row_count
+    rows_out = len(result.deduplicated_df)
+
+    # Summary metrics, one per column
+    headline = (
+        ("Rows In", rows_in),
+        ("Rows Out", rows_out),
+        ("Removed", rows_in - rows_out),
+        ("Groups", len(result.match_groups)),
+    )
+    for slot, (caption, value) in zip(st.columns(4), headline):
+        slot.metric(caption, value)
+
+    st.divider()
+
+    def _as_csv(frame: pd.DataFrame) -> bytes:
+        # Same export form for every frame: no index column, encoded as utf-8-sig
+        return frame.to_csv(index=False).encode("utf-8-sig")
+
+    # Download buttons
+    dl_left, dl_mid, dl_right = st.columns(3)
+
+    with dl_left:
+        st.download_button(
+            "Download Deduplicated CSV",
+            data=_as_csv(result.deduplicated_df),
+            file_name="deduplicated.csv",
+            mime="text/csv",
+        )
+
+    with dl_mid:
+        if not result.removed_df.empty:
+            st.download_button(
+                "Download Removed Rows",
+                data=_as_csv(result.removed_df),
+                file_name="removed_rows.csv",
+                mime="text/csv",
+            )
+
+    with dl_right:
+        if result.match_groups:
+            st.download_button(
+                "Download Match Groups Report",
+                data=_build_match_groups_csv(result, original_df),
+                file_name="match_groups.csv",
+                mime="text/csv",
+            )
+
+
+def _build_match_groups_csv(
+    result: DeduplicationResult,
+    original_df: pd.DataFrame,
+) -> bytes:
+    """Build the match groups audit CSV as bytes.
+
+    Emits one line per row of every match group: the audit columns
+    (``_group_id`` and ``_original_row`` are 1-based) followed by the row's
+    values from ``original_df``, skipping columns whose name starts with
+    ``_norm_``. An index at or beyond the end of ``original_df`` yields
+    empty data cells.
+
+    Args:
+        result: Deduplication result whose ``match_groups`` are exported.
+        original_df: DataFrame that the groups' row indices refer to
+            (positional lookup).
+
+    Returns:
+        The CSV text without an index column, encoded as utf-8-sig.
+    """
+    # Loop-invariant work is done once up front
+    export_cols = [c for c in original_df.columns if not str(c).startswith("_norm_")]
+    total_rows = len(original_df)
+
+    rows = []
+    for g in result.match_groups:
+        matched_on = ", ".join(g.matched_on)
+        for idx in g.row_indices:
+            # Fetch the source row once per index rather than once per column
+            source_row = original_df.iloc[idx] if idx < total_rows else None
+            row_data = {
+                "_group_id": g.group_id + 1,
+                "_is_survivor": idx == g.survivor_index,
+                "_confidence": g.confidence,
+                "_matched_on": matched_on,
+                "_original_row": idx + 1,
+            }
+            for col in export_cols:
+                row_data[col] = "" if source_row is None else source_row.get(col, "")
+            rows.append(row_data)
+
+    groups_df = pd.DataFrame(rows)
+    return groups_df.to_csv(index=False).encode("utf-8-sig")
				`@@ -0,0 +1 @@`
				`"""Streamlit GUI for the DataTools Deduplicator."""`