feat: per-group survivor selection and column cherry-picking in GUI

Each match group card now has:
- Radio button to pick which row to keep as the base survivor
- "Customize columns" toggle showing only columns that differ
- Per-column selectbox to pick values from any row in the group
- Decisions stored as {action, survivor_idx, overrides} dicts

Added apply_review_decisions(), which builds the final deduplicated and
removed-row DataFrames by applying survivor selection + column overrides
without re-running the dedup engine. Batch actions also use the new dict format.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:47:25 +00:00
parent 39e139d777
commit debb0cb516
2 changed files with 247 additions and 55 deletions

View File

@@ -258,6 +258,20 @@ def _build_config(
# Match group review card
# ---------------------------------------------------------------------------
def _find_differing_cols(
    group: MatchResult, df: pd.DataFrame, display_cols: list[str],
) -> list[str]:
    """Return columns where values differ across rows in the group.

    Values are compared as stripped strings, so cells that differ only in
    surrounding whitespace count as equal. A column missing from a row is
    treated as the empty string.

    Args:
        group: Match group; only ``group.row_indices`` (positional row
            numbers into ``df``) is read.
        df: Frame the row indices refer to.
        display_cols: Candidate column names, checked in this order.

    Returns:
        The subset of ``display_cols`` with more than one distinct value
        in the group, in the same order as ``display_cols``.
    """
    # Materialise each group row once; looking it up inside the column loop
    # would rebuild the same row Series for every (column, row) pair.
    group_rows = [df.iloc[idx] for idx in group.row_indices]

    differing = []
    for col in display_cols:
        distinct = {str(row.get(col, "")).strip() for row in group_rows}
        if len(distinct) > 1:
            differing.append(col)
    return differing
def match_group_card(
group: MatchResult,
df: pd.DataFrame,
@@ -265,9 +279,11 @@ def match_group_card(
) -> None:
"""Render an expandable match group card with side-by-side diff.
Decisions are stored directly in ``st.session_state["review_decisions"]``
via ``on_click`` callbacks so that other expanders keep their state on
rerun.
Users can pick which row to keep and cherry-pick column values from
other rows. Decisions are stored in
``st.session_state["review_decisions"]`` as dicts::
{group_id: {"action": bool, "survivor_idx": int, "overrides": {col: val}}}
"""
confidence = group.confidence
matched_on = ", ".join(group.matched_on)
@@ -276,7 +292,9 @@ def match_group_card(
decisions = st.session_state.get("review_decisions", {})
has_decision = gid in decisions
decision_val = decisions.get(gid)
decision_dict = decisions.get(gid, {})
action = decision_dict.get("action") if has_decision else None
overrides = decision_dict.get("overrides", {}) if has_decision else {}
# Build label — append decision status if already decided
label = (
@@ -284,17 +302,21 @@ def match_group_card(
f"(confidence: {confidence:.0f}%) "
f"[{matched_on}]"
)
if decision_val is True:
if action is True and overrides:
label += " — Merged (customized)"
elif action is True:
label += " — Merged"
elif decision_val is False:
elif action is False:
label += " — Kept Both"
# Decided groups collapse; undecided groups stay open
expanded = not has_decision
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
differing_cols = _find_differing_cols(group, df, display_cols)
with st.expander(label, expanded=expanded):
# Build comparison DataFrame
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
rows_data = []
for idx in group.row_indices:
row = {"_row": idx + 1}
@@ -325,30 +347,107 @@ def match_group_card(
if has_decision:
# Show current decision with option to undo
if decision_val is True:
st.success("Decision: Merge")
if action is True:
msg = "Decision: Merge"
if overrides:
msg += f" ({len(overrides)} column(s) customized)"
st.success(msg)
else:
st.info("Decision: Keep Both")
def _undo(g=gid):
def _undo(g=gid, diff=differing_cols):
st.session_state["review_decisions"].pop(g, None)
st.session_state.pop(f"base_row_{g}", None)
st.session_state.pop(f"customize_{g}", None)
for c in diff:
st.session_state.pop(f"col_{g}_{c}", None)
st.button("Undo", key=f"undo_{gid}", on_click=_undo)
else:
# Action buttons — on_click writes to session state before rerun
def _on_merge(g=gid):
st.session_state["review_decisions"][g] = True
# --- Base row selector ---
default_base = (
group.row_indices.index(group.survivor_index)
if group.survivor_index in group.row_indices
else 0
)
def _on_base_change(g=gid, diff=differing_cols):
"""Reset column pickers when the base row changes."""
for c in diff:
st.session_state.pop(f"col_{g}_{c}", None)
selected_survivor = st.radio(
"Base row (keep)",
options=group.row_indices,
index=default_base,
format_func=lambda idx: f"Row {idx + 1}",
key=f"base_row_{gid}",
horizontal=True,
on_change=_on_base_change,
)
# --- Customize columns (progressive disclosure) ---
if differing_cols:
customize = st.checkbox(
f"Customize columns ({len(differing_cols)} differ)",
key=f"customize_{gid}",
value=False,
)
if customize:
base_pos = group.row_indices.index(selected_survivor)
st.caption("Pick which row's value to use for each column:")
for col in differing_cols:
def _fmt(idx: int, c: str = col) -> str:
val = str(
st.session_state["df"].iloc[idx].get(c, "")
).strip()
return f"Row {idx + 1}: {val or '(empty)'}"
st.selectbox(
col,
options=group.row_indices,
index=base_pos,
format_func=_fmt,
key=f"col_{gid}_{col}",
)
# --- Action buttons ---
def _on_merge(
g=gid, indices=group.row_indices, diff=differing_cols,
):
the_df = st.session_state["df"]
base_idx = st.session_state.get(f"base_row_{g}", indices[0])
ovr: dict[str, str] = {}
for c in diff:
col_key = f"col_{g}_{c}"
if col_key in st.session_state:
source_idx = st.session_state[col_key]
if source_idx != base_idx:
ovr[c] = str(the_df.iloc[source_idx].get(c, ""))
st.session_state["review_decisions"][g] = {
"action": True,
"survivor_idx": base_idx,
"overrides": ovr,
}
def _on_keep(g=gid):
st.session_state["review_decisions"][g] = False
st.session_state["review_decisions"][g] = {
"action": False,
"survivor_idx": group.survivor_index,
"overrides": {},
}
btn_left, btn_mid, _btn_right = st.columns(3)
with btn_left:
st.button("Merge", key=f"merge_{gid}",
type="primary", on_click=_on_merge)
st.button(
"Merge", key=f"merge_{gid}",
type="primary", on_click=_on_merge,
)
with btn_mid:
st.button("Keep Both", key=f"keep_{gid}",
on_click=_on_keep)
st.button(
"Keep Both", key=f"keep_{gid}",
on_click=_on_keep,
)
# ---------------------------------------------------------------------------
@@ -404,6 +503,73 @@ def results_summary(
)
def apply_review_decisions(
    original_df: pd.DataFrame,
    match_groups: list[MatchResult],
    decisions: dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build final DataFrames by applying user review decisions.

    Handles per-group survivor selection and column overrides without
    re-running the deduplication engine.

    Args:
        original_df: Frame the match groups refer to. Rows are addressed by
            position (``iloc``); the frame itself is not modified.
        match_groups: Groups exposing ``group_id``, ``row_indices`` and
            ``survivor_index``.
        decisions: Maps ``group_id`` to
            ``{"action": bool, "survivor_idx": int, "overrides": {col: val}}``.
            A bare ``True``/``False`` (the earlier decision format) is also
            accepted and means merge-with-engine-defaults / keep-both.
            Groups without an entry are merged using the engine's survivor.

    Returns:
        ``(deduplicated_df, removed_df)``, both re-indexed from 0.
        ``removed_df`` keeps the original columns even when no row was
        removed.
    """
    remove_indices: set[int] = set()
    row_overrides: dict[int, dict[str, str]] = {}

    for group in match_groups:
        decision = decisions.get(group.group_id)

        if isinstance(decision, bool):
            # Earlier bool format: carries no survivor choice or overrides.
            decision = {"action": decision}
        elif decision is None:
            # No decision yet — accept with engine defaults
            decision = {}

        # Keep both — skip this group entirely
        if not decision.get("action", True):
            continue

        # Merge: every row except the chosen survivor is dropped
        survivor_idx = decision.get("survivor_idx", group.survivor_index)
        remove_indices.update(
            idx for idx in group.row_indices if idx != survivor_idx
        )

        overrides = decision.get("overrides") or {}
        if overrides:
            # Merge into a private dict so two groups sharing a survivor both
            # take effect and the caller's decision dict is never aliased.
            row_overrides.setdefault(survivor_idx, {}).update(overrides)

    # Build output DataFrames
    keep_indices = [
        i for i in range(len(original_df)) if i not in remove_indices
    ]
    # A survivor of one group may still be dropped by another group; overrides
    # for such rows have nothing to apply to.
    live_overrides = {
        idx: cols
        for idx, cols in row_overrides.items()
        if idx not in remove_indices
    }

    if live_overrides:
        rows = []
        for i in keep_indices:
            row = original_df.iloc[i].copy()
            for col, val in live_overrides.get(i, {}).items():
                # Ignore overrides for columns the frame does not have
                if col in row.index:
                    row[col] = val
            rows.append(row)
        deduped = pd.DataFrame(rows).reset_index(drop=True)
    else:
        deduped = original_df.iloc[keep_indices].copy().reset_index(drop=True)

    # Selecting with an empty position list yields an empty frame that still
    # has the original columns, so the schema is the same either way.
    removed = (
        original_df.iloc[sorted(remove_indices)].copy().reset_index(drop=True)
    )

    return deduped, removed
def _build_match_groups_csv(
result: DeduplicationResult,
original_df: pd.DataFrame,