From debb0cb516987dc68a8aa1a28981780f718d19ca Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Tue, 28 Apr 2026 23:47:25 +0000
Subject: [PATCH] feat: per-group survivor selection and column cherry-picking
 in GUI

Each match group card now has:
- Radio button to pick which row to keep as the base survivor
- "Customize columns" toggle showing only columns that differ
- Per-column selectbox to pick values from any row in the group
- Decisions stored as {action, survivor_idx, overrides} dicts

Added apply_review_decisions() that builds the deduplicated and
removed-rows DataFrames by applying survivor selection + column
overrides without re-running the dedup engine. Batch actions also use
the new dict format.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/gui/app.py        | 100 +++++++++++++--------
 src/gui/components.py | 202 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 247 insertions(+), 55 deletions(-)

diff --git a/src/gui/app.py b/src/gui/app.py
index 6f95ba1..b2b0b1d 100644
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -21,7 +21,12 @@ if str(_project_root) not in sys.path:
 from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
 from src.core.io import read_file, list_sheets
 from src.core.config import DeduplicationConfig
-from src.gui.components import config_panel, match_group_card, results_summary
+from src.gui.components import (
+    apply_review_decisions,
+    config_panel,
+    match_group_card,
+    results_summary,
+)
 
 
 # ---------------------------------------------------------------------------
@@ -194,21 +199,32 @@ if uploaded is not None:
                 st.subheader("Match Groups")
 
                 # Batch actions
+                def _accept_all():
+                    for g in result.match_groups:
+                        st.session_state["review_decisions"][g.group_id] = {
+                            "action": True,
+                            "survivor_idx": g.survivor_index,
+                            "overrides": {},
+                        }
+
+                def _reject_all():
+                    for g in result.match_groups:
+                        st.session_state["review_decisions"][g.group_id] = {
+                            "action": False,
+                            "survivor_idx": g.survivor_index,
+                            "overrides": {},
+                        }
+
+                def _clear_all():
+                    st.session_state["review_decisions"] = {}
+
                 action_left, action_mid, action_right = st.columns(3)
                 with action_left:
-                    if st.button("Accept All"):
-                        for g in result.match_groups:
-                            st.session_state["review_decisions"][g.group_id] = True
-                        st.rerun()
+                    st.button("Accept All", on_click=_accept_all)
                 with action_mid:
-                    if st.button("Reject All"):
-                        for g in result.match_groups:
-                            st.session_state["review_decisions"][g.group_id] = False
-                        st.rerun()
+                    st.button("Reject All", on_click=_reject_all)
                 with action_right:
-                    if st.button("Clear Decisions"):
-                        st.session_state["review_decisions"] = {}
-                        st.rerun()
+                    st.button("Clear Decisions", on_click=_clear_all)
 
                 # Individual group cards
                 decisions = st.session_state["review_decisions"]
@@ -218,40 +234,39 @@ if uploaded is not None:
                 # Show decision summary
                 if decisions:
                     st.divider()
-                    accepted = sum(1 for v in decisions.values() if v is True)
-                    rejected = sum(1 for v in decisions.values() if v is False)
-                    pending = len(result.match_groups) - len(decisions)
-                    st.caption(
-                        f"Decisions: {accepted} merged, {rejected} kept both, "
-                        f"{pending} pending"
+                    accepted = sum(
+                        1 for v in decisions.values()
+                        if isinstance(v, dict) and v.get("action") is True
                     )
+                    customized = sum(
+                        1 for v in decisions.values()
+                        if isinstance(v, dict) and v.get("action") is True
+                        and v.get("overrides")
+                    )
+                    rejected = sum(
+                        1 for v in decisions.values()
+                        if isinstance(v, dict) and v.get("action") is False
+                    )
+                    pending = len(result.match_groups) - len(decisions)
 
-                    # Re-run dedup with review decisions applied
+                    summary_parts = [f"{accepted} merged"]
+                    if customized:
+                        summary_parts.append(f"{customized} customized")
+                    summary_parts.append(f"{rejected} kept both")
+                    summary_parts.append(f"{pending} pending")
+                    st.caption("Decisions: " + ", ".join(summary_parts))
+
+                    # Apply decisions and offer download
                     if st.button(
                         "Apply Review Decisions & Download",
                         type="primary",
                         use_container_width=True,
                     ):
-                        def _review_callback(group, _df):
-                            gid = group.group_id
-                            if gid in decisions:
-                                return decisions[gid]
-                            return True  # default: accept
-
-                        reviewed_result = deduplicate(
-                            df,
-                            strategies=settings["strategies"],
-                            survivor_rule=settings["survivor_rule"],
-                            date_column=settings["date_column"],
-                            merge=settings["merge"],
-                            preview=False,
-                            review_callback=_review_callback,
+                        reviewed_df, reviewed_removed = apply_review_decisions(
+                            df, result.match_groups, decisions,
                         )
 
-                        # Update result and show downloads
-                        st.session_state["result"] = reviewed_result
-
-                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
+                        csv_bytes = reviewed_df.to_csv(
                             index=False
                         ).encode("utf-8-sig")
                         st.download_button(
@@ -261,6 +276,17 @@ if uploaded is not None:
                             mime="text/csv",
                             key="reviewed_download",
                         )
+                        if not reviewed_removed.empty:
+                            removed_bytes = reviewed_removed.to_csv(
+                                index=False
+                            ).encode("utf-8-sig")
+                            st.download_button(
+                                "Download Reviewed Removed Rows",
+                                data=removed_bytes,
+                                file_name="removed_reviewed.csv",
+                                mime="text/csv",
+                                key="reviewed_removed_download",
+                            )
 
             # Log entries
             if result.log_entries:
diff --git a/src/gui/components.py b/src/gui/components.py
index 3504644..f962335 100644
--- a/src/gui/components.py
+++ b/src/gui/components.py
@@ -258,6 +258,20 @@ def _build_config(
 # Match group review card
 # ---------------------------------------------------------------------------
 
+def _find_differing_cols(
+    group: MatchResult, df: pd.DataFrame, display_cols: list[str],
+) -> list[str]:
+    """Return columns whose stripped string values differ across the group."""
+    # Fetch each row once (not once per column); a column absent from a
+    # row reads as "" via Series.get.
+    rows = [df.iloc[idx] for idx in group.row_indices]
+    return [
+        col
+        for col in display_cols
+        if len({str(row.get(col, "")).strip() for row in rows}) > 1
+    ]
+
+
 def match_group_card(
     group: MatchResult,
     df: pd.DataFrame,
@@ -265,9 +279,11 @@ def match_group_card(
 ) -> None:
     """Render an expandable match group card with side-by-side diff.
 
-    Decisions are stored directly in ``st.session_state["review_decisions"]``
-    via ``on_click`` callbacks so that other expanders keep their state on
-    rerun.
+    Users can pick which row to keep and cherry-pick column values from
+    other rows.  Decisions are stored in
+    ``st.session_state["review_decisions"]`` as dicts::
+
+        {group_id: {"action": bool, "survivor_idx": int, "overrides": {col: val}}}
     """
     confidence = group.confidence
     matched_on = ", ".join(group.matched_on)
@@ -276,7 +292,9 @@ def match_group_card(
 
     decisions = st.session_state.get("review_decisions", {})
     has_decision = gid in decisions
-    decision_val = decisions.get(gid)
+    decision_dict = decisions.get(gid, {})
+    action = decision_dict.get("action") if has_decision else None
+    overrides = decision_dict.get("overrides", {}) if has_decision else {}
 
     # Build label — append decision status if already decided
     label = (
@@ -284,17 +302,21 @@ def match_group_card(
         f"(confidence: {confidence:.0f}%) "
         f"[{matched_on}]"
     )
-    if decision_val is True:
+    if action is True and overrides:
+        label += " — Merged (customized)"
+    elif action is True:
         label += " — Merged"
-    elif decision_val is False:
+    elif action is False:
         label += " — Kept Both"
 
     # Decided groups collapse; undecided groups stay open
     expanded = not has_decision
 
+    display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
+    differing_cols = _find_differing_cols(group, df, display_cols)
+
     with st.expander(label, expanded=expanded):
         # Build comparison DataFrame
-        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
         rows_data = []
         for idx in group.row_indices:
             row = {"_row": idx + 1}
@@ -325,30 +347,107 @@ def match_group_card(
 
         if has_decision:
             # Show current decision with option to undo
-            if decision_val is True:
-                st.success("Decision: Merge")
+            if action is True:
+                msg = "Decision: Merge"
+                if overrides:
+                    msg += f" ({len(overrides)} column(s) customized)"
+                st.success(msg)
             else:
                 st.info("Decision: Keep Both")
 
-            def _undo(g=gid):
+            def _undo(g=gid, diff=differing_cols):
                 st.session_state["review_decisions"].pop(g, None)
+                st.session_state.pop(f"base_row_{g}", None)
+                st.session_state.pop(f"customize_{g}", None)
+                for c in diff:
+                    st.session_state.pop(f"col_{g}_{c}", None)
 
             st.button("Undo", key=f"undo_{gid}", on_click=_undo)
         else:
-            # Action buttons — on_click writes to session state before rerun
-            def _on_merge(g=gid):
-                st.session_state["review_decisions"][g] = True
+            # --- Base row selector ---
+            default_base = (
+                group.row_indices.index(group.survivor_index)
+                if group.survivor_index in group.row_indices
+                else 0
+            )
+
+            def _on_base_change(g=gid, diff=differing_cols):
+                """Reset column pickers when the base row changes."""
+                for c in diff:
+                    st.session_state.pop(f"col_{g}_{c}", None)
+
+            selected_survivor = st.radio(
+                "Base row (keep)",
+                options=group.row_indices,
+                index=default_base,
+                format_func=lambda idx: f"Row {idx + 1}",
+                key=f"base_row_{gid}",
+                horizontal=True,
+                on_change=_on_base_change,
+            )
+
+            # --- Customize columns (progressive disclosure) ---
+            if differing_cols:
+                customize = st.checkbox(
+                    f"Customize columns ({len(differing_cols)} differ)",
+                    key=f"customize_{gid}",
+                    value=False,
+                )
+                if customize:
+                    base_pos = group.row_indices.index(selected_survivor)
+                    st.caption("Pick which row's value to use for each column:")
+                    for col in differing_cols:
+                        def _fmt(idx: int, c: str = col) -> str:
+                            val = str(
+                                st.session_state["df"].iloc[idx].get(c, "")
+                            ).strip()
+                            return f"Row {idx + 1}: {val or '(empty)'}"
+
+                        st.selectbox(
+                            col,
+                            options=group.row_indices,
+                            index=base_pos,
+                            format_func=_fmt,
+                            key=f"col_{gid}_{col}",
+                        )
+
+            # --- Action buttons ---
+            def _on_merge(
+                g=gid, indices=group.row_indices, diff=differing_cols,
+            ):
+                the_df = st.session_state["df"]
+                base_idx = st.session_state.get(f"base_row_{g}", indices[0])
+                ovr: dict[str, str] = {}
+                for c in diff:
+                    col_key = f"col_{g}_{c}"
+                    if col_key in st.session_state:
+                        source_idx = st.session_state[col_key]
+                        if source_idx != base_idx:
+                            ovr[c] = str(the_df.iloc[source_idx].get(c, ""))
+                st.session_state["review_decisions"][g] = {
+                    "action": True,
+                    "survivor_idx": base_idx,
+                    "overrides": ovr,
+                }
 
             def _on_keep(g=gid):
-                st.session_state["review_decisions"][g] = False
+                st.session_state["review_decisions"][g] = {
+                    "action": False,
+                    "survivor_idx": group.survivor_index,
+                    "overrides": {},
+                }
 
             btn_left, btn_mid, _btn_right = st.columns(3)
             with btn_left:
-                st.button("Merge", key=f"merge_{gid}",
-                          type="primary", on_click=_on_merge)
+                st.button(
+                    "Merge", key=f"merge_{gid}",
+                    type="primary", on_click=_on_merge,
+                )
             with btn_mid:
-                st.button("Keep Both", key=f"keep_{gid}",
-                          on_click=_on_keep)
+                st.button(
+                    "Keep Both", key=f"keep_{gid}",
+                    on_click=_on_keep,
+                )
 
 
 # ---------------------------------------------------------------------------
@@ -404,6 +503,73 @@ def results_summary(
             )
 
 
+def apply_review_decisions(
+    original_df: pd.DataFrame,
+    match_groups: list[MatchResult],
+    decisions: dict,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Build the final DataFrames by applying user review decisions.
+
+    Applies per-group survivor selection and column overrides directly to
+    ``original_df`` without re-running the deduplication engine.
+
+    Args:
+        original_df: Source rows; groups refer to them by position.
+        match_groups: Match groups to resolve.
+        decisions: Per-group ``{"action", "survivor_idx", "overrides"}``
+            dicts (or bare bools); groups without an entry are merged.
+
+    Returns:
+        ``(deduplicated_df, removed_df)``, both re-indexed from 0.
+    """
+    remove_indices: set[int] = set()
+    row_overrides: dict[int, dict[str, str]] = {}
+
+    for group in match_groups:
+        decision = decisions.get(group.group_id)
+        if not isinstance(decision, dict):
+            # None -> merge with defaults; a bare bool is just the action.
+            decision = {} if decision is None else {"action": bool(decision)}
+
+        # Keep both -- leave every row of this group in place.
+        if not decision.get("action", True):
+            continue
+
+        # A survivor outside the group would delete the whole group, so
+        # fall back to the engine's pick, then to the group's first row.
+        members = list(group.row_indices)
+        picks = (decision.get("survivor_idx"), group.survivor_index, *members[:1])
+        survivor_idx = next((p for p in picks if p in members), None)
+        if survivor_idx is None:
+            continue  # empty group: nothing to merge
+
+        remove_indices.update(i for i in members if i != survivor_idx)
+        if decision.get("overrides"):
+            row_overrides[survivor_idx] = decision["overrides"]
+
+    keep_indices = [i for i in range(len(original_df)) if i not in remove_indices]
+    deduped = original_df.iloc[keep_indices].copy().reset_index(drop=True)
+
+    # Patch only the overridden cells rather than rebuilding every row.
+    new_pos = {orig: pos for pos, orig in enumerate(keep_indices)}
+    for orig, ovr in row_overrides.items():
+        if orig not in new_pos:
+            continue  # the survivor was removed by another group
+        for col, val in ovr.items():
+            if col in deduped.columns:
+                # Object dtype accepts the override whatever the column held.
+                deduped[col] = deduped[col].astype(object)
+                deduped.at[new_pos[orig], col] = val
+
+    removed = (
+        original_df.iloc[sorted(remove_indices)].copy().reset_index(drop=True)
+        if remove_indices
+        else pd.DataFrame()
+    )
+
+    return deduped, removed
+
+
 def _build_match_groups_csv(
     result: DeduplicationResult,
     original_df: pd.DataFrame,