From debb0cb516987dc68a8aa1a28981780f718d19ca Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 28 Apr 2026 23:47:25 +0000 Subject: [PATCH] feat: per-group survivor selection and column cherry-picking in GUI Each match group card now has: - Radio button to pick which row to keep as the base survivor - "Customize columns" toggle showing only columns that differ - Per-column selectbox to pick values from any row in the group - Decisions stored as {action, survivor_idx, overrides} dicts Added apply_review_decisions() that builds the final DataFrame by applying survivor selection + column overrides without re-running the dedup engine. Batch actions also use the new dict format. Co-Authored-By: Claude Opus 4.6 --- src/gui/app.py | 100 +++++++++++++-------- src/gui/components.py | 202 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 247 insertions(+), 55 deletions(-) diff --git a/src/gui/app.py b/src/gui/app.py index 6f95ba1..b2b0b1d 100644 --- a/src/gui/app.py +++ b/src/gui/app.py @@ -21,7 +21,12 @@ if str(_project_root) not in sys.path: from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult from src.core.io import read_file, list_sheets from src.core.config import DeduplicationConfig -from src.gui.components import config_panel, match_group_card, results_summary +from src.gui.components import ( + apply_review_decisions, + config_panel, + match_group_card, + results_summary, +) # --------------------------------------------------------------------------- @@ -194,21 +199,32 @@ if uploaded is not None: st.subheader("Match Groups") # Batch actions + def _accept_all(): + for g in result.match_groups: + st.session_state["review_decisions"][g.group_id] = { + "action": True, + "survivor_idx": g.survivor_index, + "overrides": {}, + } + + def _reject_all(): + for g in result.match_groups: + st.session_state["review_decisions"][g.group_id] = { + "action": False, + "survivor_idx": g.survivor_index, + "overrides": {}, + } + + def _clear_all(): + st.session_state["review_decisions"] = {} + action_left, action_mid, action_right = st.columns(3) with action_left: - if st.button("Accept All"): - for g in result.match_groups: - st.session_state["review_decisions"][g.group_id] = True - st.rerun() + st.button("Accept All", on_click=_accept_all) with action_mid: - if st.button("Reject All"): - for g in result.match_groups: - st.session_state["review_decisions"][g.group_id] = False - st.rerun() + st.button("Reject All", on_click=_reject_all) with action_right: - if st.button("Clear Decisions"): - st.session_state["review_decisions"] = {} - st.rerun() + st.button("Clear Decisions", on_click=_clear_all) # Individual group cards decisions = st.session_state["review_decisions"] @@ -218,40 +234,39 @@ if uploaded is not None: # Show decision summary if decisions: st.divider() - accepted = sum(1 for v in decisions.values() if v is True) - rejected = sum(1 for v in decisions.values() if v is False) - pending = len(result.match_groups) - len(decisions) - st.caption( - f"Decisions: {accepted} merged, {rejected} kept both, " - f"{pending} pending" + accepted = sum( + 1 for v in decisions.values() + if isinstance(v, dict) and v.get("action") is True ) + customized = sum( + 1 for v in decisions.values() + if isinstance(v, dict) and v.get("action") is True + and v.get("overrides") + ) + rejected = sum( + 1 for v in decisions.values() + if isinstance(v, dict) and v.get("action") is False + ) + pending = len(result.match_groups) - len(decisions) - # Re-run dedup with review decisions applied + summary_parts = [f"{accepted} merged"] + if customized: + summary_parts.append(f"{customized} customized") + summary_parts.append(f"{rejected} kept both") + summary_parts.append(f"{pending} pending") + st.caption("Decisions: " + ", ".join(summary_parts)) + + # Apply decisions and offer download if st.button( "Apply Review Decisions & Download", type="primary", use_container_width=True, ): - def _review_callback(group, _df): - gid = group.group_id - if gid in decisions: - return decisions[gid] - return True # default: accept - - reviewed_result = deduplicate( - df, - strategies=settings["strategies"], - survivor_rule=settings["survivor_rule"], - date_column=settings["date_column"], - merge=settings["merge"], - preview=False, - review_callback=_review_callback, + reviewed_df, reviewed_removed = apply_review_decisions( + df, result.match_groups, decisions, ) - # Update result and show downloads - st.session_state["result"] = reviewed_result - - csv_bytes = reviewed_result.deduplicated_df.to_csv( + csv_bytes = reviewed_df.to_csv( index=False ).encode("utf-8-sig") st.download_button( @@ -261,6 +276,17 @@ if uploaded is not None: mime="text/csv", key="reviewed_download", ) + if not reviewed_removed.empty: + removed_bytes = reviewed_removed.to_csv( + index=False + ).encode("utf-8-sig") + st.download_button( + "Download Reviewed Removed Rows", + data=removed_bytes, + file_name="removed_reviewed.csv", + mime="text/csv", + key="reviewed_removed_download", + ) # Log entries if result.log_entries: diff --git a/src/gui/components.py b/src/gui/components.py index 3504644..f962335 100644 --- a/src/gui/components.py +++ b/src/gui/components.py @@ -258,6 +258,20 @@ def _build_config( # Match group review card # --------------------------------------------------------------------------- +def _find_differing_cols( + group: MatchResult, df: pd.DataFrame, display_cols: list[str], +) -> list[str]: + """Return columns where values differ across rows in the group.""" + differing = [] + for col in display_cols: + values = set() + for idx in group.row_indices: + values.add(str(df.iloc[idx].get(col, "")).strip()) + if len(values) > 1: + differing.append(col) + return differing + + def match_group_card( group: MatchResult, df: pd.DataFrame, @@ -265,9 +279,11 @@ def match_group_card( ) -> None: """Render an expandable match group card with side-by-side diff. - Decisions are stored directly in ``st.session_state["review_decisions"]`` - via ``on_click`` callbacks so that other expanders keep their state on - rerun. + Users can pick which row to keep and cherry-pick column values from + other rows. Decisions are stored in + ``st.session_state["review_decisions"]`` as dicts:: + + {group_id: {"action": bool, "survivor_idx": int, "overrides": {col: val}}} """ confidence = group.confidence matched_on = ", ".join(group.matched_on) @@ -276,7 +292,9 @@ def match_group_card( decisions = st.session_state.get("review_decisions", {}) has_decision = gid in decisions - decision_val = decisions.get(gid) + decision_dict = decisions.get(gid, {}) + action = decision_dict.get("action") if has_decision else None + overrides = decision_dict.get("overrides", {}) if has_decision else {} # Build label — append decision status if already decided label = ( @@ -284,17 +302,21 @@ def match_group_card( f"(confidence: {confidence:.0f}%) " f"[{matched_on}]" ) - if decision_val is True: + if action is True and overrides: + label += " — Merged (customized)" + elif action is True: label += " — Merged" - elif decision_val is False: + elif action is False: label += " — Kept Both" # Decided groups collapse; undecided groups stay open expanded = not has_decision + display_cols = [c for c in df.columns if not str(c).startswith("_norm_")] + differing_cols = _find_differing_cols(group, df, display_cols) + with st.expander(label, expanded=expanded): # Build comparison DataFrame - display_cols = [c for c in df.columns if not str(c).startswith("_norm_")] rows_data = [] for idx in group.row_indices: row = {"_row": idx + 1} @@ -325,30 +347,107 @@ def match_group_card( if has_decision: # Show current decision with option to undo - if decision_val is True: - st.success("Decision: Merge") + if action is True: + msg = "Decision: Merge" + if overrides: + msg += f" ({len(overrides)} column(s) customized)" + st.success(msg) else: st.info("Decision: Keep Both") - def _undo(g=gid): + def _undo(g=gid, diff=differing_cols): st.session_state["review_decisions"].pop(g, None) + st.session_state.pop(f"base_row_{g}", None) + st.session_state.pop(f"customize_{g}", None) + for c in diff: + st.session_state.pop(f"col_{g}_{c}", None) st.button("Undo", key=f"undo_{gid}", on_click=_undo) else: - # Action buttons — on_click writes to session state before rerun - def _on_merge(g=gid): - st.session_state["review_decisions"][g] = True + # --- Base row selector --- + default_base = ( + group.row_indices.index(group.survivor_index) + if group.survivor_index in group.row_indices + else 0 + ) + + def _on_base_change(g=gid, diff=differing_cols): + """Reset column pickers when the base row changes.""" + for c in diff: + st.session_state.pop(f"col_{g}_{c}", None) + + selected_survivor = st.radio( + "Base row (keep)", + options=group.row_indices, + index=default_base, + format_func=lambda idx: f"Row {idx + 1}", + key=f"base_row_{gid}", + horizontal=True, + on_change=_on_base_change, + ) + + # --- Customize columns (progressive disclosure) --- + if differing_cols: + customize = st.checkbox( + f"Customize columns ({len(differing_cols)} differ)", + key=f"customize_{gid}", + value=False, + ) + if customize: + base_pos = group.row_indices.index(selected_survivor) + st.caption("Pick which row's value to use for each column:") + for col in differing_cols: + def _fmt(idx: int, c: str = col) -> str: + val = str( + st.session_state["df"].iloc[idx].get(c, "") + ).strip() + return f"Row {idx + 1}: {val or '(empty)'}" + + st.selectbox( + col, + options=group.row_indices, + index=base_pos, + format_func=_fmt, + key=f"col_{gid}_{col}", + ) + + # --- Action buttons --- + def _on_merge( + g=gid, indices=group.row_indices, diff=differing_cols, + ): + the_df = st.session_state["df"] + base_idx = st.session_state.get(f"base_row_{g}", indices[0]) + ovr: dict[str, str] = {} + for c in diff: + col_key = f"col_{g}_{c}" + if col_key in st.session_state: + source_idx = st.session_state[col_key] + if source_idx != base_idx: + ovr[c] = str(the_df.iloc[source_idx].get(c, "")) + st.session_state["review_decisions"][g] = { + "action": True, + "survivor_idx": base_idx, + "overrides": ovr, + } def _on_keep(g=gid): - st.session_state["review_decisions"][g] = False + st.session_state["review_decisions"][g] = { + "action": False, + "survivor_idx": group.survivor_index, + "overrides": {}, + } btn_left, btn_mid, _btn_right = st.columns(3) with btn_left: - st.button("Merge", key=f"merge_{gid}", - type="primary", on_click=_on_merge) + st.button( + "Merge", key=f"merge_{gid}", + type="primary", on_click=_on_merge, + ) with btn_mid: - st.button("Keep Both", key=f"keep_{gid}", - on_click=_on_keep) + st.button( + "Keep Both", key=f"keep_{gid}", + on_click=_on_keep, + ) # --------------------------------------------------------------------------- @@ -404,6 +503,73 @@ def results_summary( ) +def apply_review_decisions( + original_df: pd.DataFrame, + match_groups: list[MatchResult], + decisions: dict, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Build final DataFrames by applying user review decisions. + + Handles per-group survivor selection and column overrides without + re-running the deduplication engine. + + Returns ``(deduplicated_df, removed_df)``. + """ + remove_indices: set[int] = set() + row_overrides: dict[int, dict[str, str]] = {} + + for group in match_groups: + gid = group.group_id + decision = decisions.get(gid) + + # No decision yet — accept with engine defaults + if decision is None: + survivor_idx = group.survivor_index + for idx in group.row_indices: + if idx != survivor_idx: + remove_indices.add(idx) + continue + + # Keep both — skip this group entirely + if not decision.get("action", True): + continue + + # Merge with user's choices + survivor_idx = decision.get("survivor_idx", group.survivor_index) + ovr = decision.get("overrides", {}) + + for idx in group.row_indices: + if idx != survivor_idx: + remove_indices.add(idx) + + if ovr: + row_overrides[survivor_idx] = ovr + + # Build output DataFrames + keep_indices = [i for i in range(len(original_df)) if i not in remove_indices] + + if row_overrides: + rows = [] + for i in keep_indices: + row = original_df.iloc[i].copy() + if i in row_overrides: + for col, val in row_overrides[i].items(): + if col in row.index: + row[col] = val + rows.append(row) + deduped = pd.DataFrame(rows).reset_index(drop=True) + else: + deduped = original_df.iloc[keep_indices].copy().reset_index(drop=True) + + removed = ( + original_df.iloc[sorted(remove_indices)].copy().reset_index(drop=True) + if remove_indices + else pd.DataFrame() + ) + + return deduped, removed + + def _build_match_groups_csv( result: DeduplicationResult, original_df: pd.DataFrame,