feat: per-group survivor selection and column cherry-picking in GUI
Each match group card now has:
- Radio button to pick which row to keep as the base survivor
- "Customize columns" toggle showing only columns that differ
- Per-column selectbox to pick values from any row in the group
- Decisions stored as {action, survivor_idx, overrides} dicts
Added apply_review_decisions() that builds the final DataFrame by
applying survivor selection + column overrides without re-running
the dedup engine. Batch actions also use the new dict format.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
100
src/gui/app.py
100
src/gui/app.py
@@ -21,7 +21,12 @@ if str(_project_root) not in sys.path:
|
|||||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
||||||
from src.core.io import read_file, list_sheets
|
from src.core.io import read_file, list_sheets
|
||||||
from src.core.config import DeduplicationConfig
|
from src.core.config import DeduplicationConfig
|
||||||
from src.gui.components import config_panel, match_group_card, results_summary
|
from src.gui.components import (
|
||||||
|
apply_review_decisions,
|
||||||
|
config_panel,
|
||||||
|
match_group_card,
|
||||||
|
results_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -194,21 +199,32 @@ if uploaded is not None:
|
|||||||
st.subheader("Match Groups")
|
st.subheader("Match Groups")
|
||||||
|
|
||||||
# Batch actions
|
# Batch actions
|
||||||
|
def _accept_all():
|
||||||
|
for g in result.match_groups:
|
||||||
|
st.session_state["review_decisions"][g.group_id] = {
|
||||||
|
"action": True,
|
||||||
|
"survivor_idx": g.survivor_index,
|
||||||
|
"overrides": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _reject_all():
|
||||||
|
for g in result.match_groups:
|
||||||
|
st.session_state["review_decisions"][g.group_id] = {
|
||||||
|
"action": False,
|
||||||
|
"survivor_idx": g.survivor_index,
|
||||||
|
"overrides": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _clear_all():
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
|
||||||
action_left, action_mid, action_right = st.columns(3)
|
action_left, action_mid, action_right = st.columns(3)
|
||||||
with action_left:
|
with action_left:
|
||||||
if st.button("Accept All"):
|
st.button("Accept All", on_click=_accept_all)
|
||||||
for g in result.match_groups:
|
|
||||||
st.session_state["review_decisions"][g.group_id] = True
|
|
||||||
st.rerun()
|
|
||||||
with action_mid:
|
with action_mid:
|
||||||
if st.button("Reject All"):
|
st.button("Reject All", on_click=_reject_all)
|
||||||
for g in result.match_groups:
|
|
||||||
st.session_state["review_decisions"][g.group_id] = False
|
|
||||||
st.rerun()
|
|
||||||
with action_right:
|
with action_right:
|
||||||
if st.button("Clear Decisions"):
|
st.button("Clear Decisions", on_click=_clear_all)
|
||||||
st.session_state["review_decisions"] = {}
|
|
||||||
st.rerun()
|
|
||||||
|
|
||||||
# Individual group cards
|
# Individual group cards
|
||||||
decisions = st.session_state["review_decisions"]
|
decisions = st.session_state["review_decisions"]
|
||||||
@@ -218,40 +234,39 @@ if uploaded is not None:
|
|||||||
# Show decision summary
|
# Show decision summary
|
||||||
if decisions:
|
if decisions:
|
||||||
st.divider()
|
st.divider()
|
||||||
accepted = sum(1 for v in decisions.values() if v is True)
|
accepted = sum(
|
||||||
rejected = sum(1 for v in decisions.values() if v is False)
|
1 for v in decisions.values()
|
||||||
pending = len(result.match_groups) - len(decisions)
|
if isinstance(v, dict) and v.get("action") is True
|
||||||
st.caption(
|
|
||||||
f"Decisions: {accepted} merged, {rejected} kept both, "
|
|
||||||
f"{pending} pending"
|
|
||||||
)
|
)
|
||||||
|
customized = sum(
|
||||||
|
1 for v in decisions.values()
|
||||||
|
if isinstance(v, dict) and v.get("action") is True
|
||||||
|
and v.get("overrides")
|
||||||
|
)
|
||||||
|
rejected = sum(
|
||||||
|
1 for v in decisions.values()
|
||||||
|
if isinstance(v, dict) and v.get("action") is False
|
||||||
|
)
|
||||||
|
pending = len(result.match_groups) - len(decisions)
|
||||||
|
|
||||||
# Re-run dedup with review decisions applied
|
summary_parts = [f"{accepted} merged"]
|
||||||
|
if customized:
|
||||||
|
summary_parts.append(f"{customized} customized")
|
||||||
|
summary_parts.append(f"{rejected} kept both")
|
||||||
|
summary_parts.append(f"{pending} pending")
|
||||||
|
st.caption("Decisions: " + ", ".join(summary_parts))
|
||||||
|
|
||||||
|
# Apply decisions and offer download
|
||||||
if st.button(
|
if st.button(
|
||||||
"Apply Review Decisions & Download",
|
"Apply Review Decisions & Download",
|
||||||
type="primary",
|
type="primary",
|
||||||
use_container_width=True,
|
use_container_width=True,
|
||||||
):
|
):
|
||||||
def _review_callback(group, _df):
|
reviewed_df, reviewed_removed = apply_review_decisions(
|
||||||
gid = group.group_id
|
df, result.match_groups, decisions,
|
||||||
if gid in decisions:
|
|
||||||
return decisions[gid]
|
|
||||||
return True # default: accept
|
|
||||||
|
|
||||||
reviewed_result = deduplicate(
|
|
||||||
df,
|
|
||||||
strategies=settings["strategies"],
|
|
||||||
survivor_rule=settings["survivor_rule"],
|
|
||||||
date_column=settings["date_column"],
|
|
||||||
merge=settings["merge"],
|
|
||||||
preview=False,
|
|
||||||
review_callback=_review_callback,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update result and show downloads
|
csv_bytes = reviewed_df.to_csv(
|
||||||
st.session_state["result"] = reviewed_result
|
|
||||||
|
|
||||||
csv_bytes = reviewed_result.deduplicated_df.to_csv(
|
|
||||||
index=False
|
index=False
|
||||||
).encode("utf-8-sig")
|
).encode("utf-8-sig")
|
||||||
st.download_button(
|
st.download_button(
|
||||||
@@ -261,6 +276,17 @@ if uploaded is not None:
|
|||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
key="reviewed_download",
|
key="reviewed_download",
|
||||||
)
|
)
|
||||||
|
if not reviewed_removed.empty:
|
||||||
|
removed_bytes = reviewed_removed.to_csv(
|
||||||
|
index=False
|
||||||
|
).encode("utf-8-sig")
|
||||||
|
st.download_button(
|
||||||
|
"Download Reviewed Removed Rows",
|
||||||
|
data=removed_bytes,
|
||||||
|
file_name="removed_reviewed.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
key="reviewed_removed_download",
|
||||||
|
)
|
||||||
|
|
||||||
# Log entries
|
# Log entries
|
||||||
if result.log_entries:
|
if result.log_entries:
|
||||||
|
|||||||
@@ -258,6 +258,20 @@ def _build_config(
|
|||||||
# Match group review card
|
# Match group review card
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _find_differing_cols(
    group: MatchResult, df: pd.DataFrame, display_cols: list[str],
) -> list[str]:
    """Return the display columns whose values differ across the group's rows.

    Values are compared as stripped strings, so cells that differ only in
    surrounding whitespace (or in type but not in ``str()`` form) count as
    equal. A column that is absent from a row is compared as ``""``.

    Args:
        group: Match group; only ``group.row_indices`` is read. The indices
            are used positionally (``df.iloc``).
        df: DataFrame the row indices refer to.
        display_cols: Candidate column names to inspect.

    Returns:
        The entries of ``display_cols`` that have more than one distinct
        value within the group, in their original order. Empty when the
        group has no rows.
    """
    # Fetch each row once up front: ``df.iloc[idx]`` builds a fresh Series on
    # every call, so doing it inside the per-column loop repeated the same
    # lookup once per column.
    group_rows = [df.iloc[idx] for idx in group.row_indices]

    return [
        col
        for col in display_cols
        if len({str(row.get(col, "")).strip() for row in group_rows}) > 1
    ]
|
||||||
|
|
||||||
|
|
||||||
def match_group_card(
|
def match_group_card(
|
||||||
group: MatchResult,
|
group: MatchResult,
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
@@ -265,9 +279,11 @@ def match_group_card(
|
|||||||
) -> None:
|
) -> None:
|
||||||
"""Render an expandable match group card with side-by-side diff.
|
"""Render an expandable match group card with side-by-side diff.
|
||||||
|
|
||||||
Decisions are stored directly in ``st.session_state["review_decisions"]``
|
Users can pick which row to keep and cherry-pick column values from
|
||||||
via ``on_click`` callbacks so that other expanders keep their state on
|
other rows. Decisions are stored in
|
||||||
rerun.
|
``st.session_state["review_decisions"]`` as dicts::
|
||||||
|
|
||||||
|
{group_id: {"action": bool, "survivor_idx": int, "overrides": {col: val}}}
|
||||||
"""
|
"""
|
||||||
confidence = group.confidence
|
confidence = group.confidence
|
||||||
matched_on = ", ".join(group.matched_on)
|
matched_on = ", ".join(group.matched_on)
|
||||||
@@ -276,7 +292,9 @@ def match_group_card(
|
|||||||
|
|
||||||
decisions = st.session_state.get("review_decisions", {})
|
decisions = st.session_state.get("review_decisions", {})
|
||||||
has_decision = gid in decisions
|
has_decision = gid in decisions
|
||||||
decision_val = decisions.get(gid)
|
decision_dict = decisions.get(gid, {})
|
||||||
|
action = decision_dict.get("action") if has_decision else None
|
||||||
|
overrides = decision_dict.get("overrides", {}) if has_decision else {}
|
||||||
|
|
||||||
# Build label — append decision status if already decided
|
# Build label — append decision status if already decided
|
||||||
label = (
|
label = (
|
||||||
@@ -284,17 +302,21 @@ def match_group_card(
|
|||||||
f"(confidence: {confidence:.0f}%) "
|
f"(confidence: {confidence:.0f}%) "
|
||||||
f"[{matched_on}]"
|
f"[{matched_on}]"
|
||||||
)
|
)
|
||||||
if decision_val is True:
|
if action is True and overrides:
|
||||||
|
label += " — Merged (customized)"
|
||||||
|
elif action is True:
|
||||||
label += " — Merged"
|
label += " — Merged"
|
||||||
elif decision_val is False:
|
elif action is False:
|
||||||
label += " — Kept Both"
|
label += " — Kept Both"
|
||||||
|
|
||||||
# Decided groups collapse; undecided groups stay open
|
# Decided groups collapse; undecided groups stay open
|
||||||
expanded = not has_decision
|
expanded = not has_decision
|
||||||
|
|
||||||
|
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
|
||||||
|
differing_cols = _find_differing_cols(group, df, display_cols)
|
||||||
|
|
||||||
with st.expander(label, expanded=expanded):
|
with st.expander(label, expanded=expanded):
|
||||||
# Build comparison DataFrame
|
# Build comparison DataFrame
|
||||||
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
|
|
||||||
rows_data = []
|
rows_data = []
|
||||||
for idx in group.row_indices:
|
for idx in group.row_indices:
|
||||||
row = {"_row": idx + 1}
|
row = {"_row": idx + 1}
|
||||||
@@ -325,30 +347,107 @@ def match_group_card(
|
|||||||
|
|
||||||
if has_decision:
|
if has_decision:
|
||||||
# Show current decision with option to undo
|
# Show current decision with option to undo
|
||||||
if decision_val is True:
|
if action is True:
|
||||||
st.success("Decision: Merge")
|
msg = "Decision: Merge"
|
||||||
|
if overrides:
|
||||||
|
msg += f" ({len(overrides)} column(s) customized)"
|
||||||
|
st.success(msg)
|
||||||
else:
|
else:
|
||||||
st.info("Decision: Keep Both")
|
st.info("Decision: Keep Both")
|
||||||
|
|
||||||
def _undo(g=gid):
|
def _undo(g=gid, diff=differing_cols):
|
||||||
st.session_state["review_decisions"].pop(g, None)
|
st.session_state["review_decisions"].pop(g, None)
|
||||||
|
st.session_state.pop(f"base_row_{g}", None)
|
||||||
|
st.session_state.pop(f"customize_{g}", None)
|
||||||
|
for c in diff:
|
||||||
|
st.session_state.pop(f"col_{g}_{c}", None)
|
||||||
|
|
||||||
st.button("Undo", key=f"undo_{gid}", on_click=_undo)
|
st.button("Undo", key=f"undo_{gid}", on_click=_undo)
|
||||||
else:
|
else:
|
||||||
# Action buttons — on_click writes to session state before rerun
|
# --- Base row selector ---
|
||||||
def _on_merge(g=gid):
|
default_base = (
|
||||||
st.session_state["review_decisions"][g] = True
|
group.row_indices.index(group.survivor_index)
|
||||||
|
if group.survivor_index in group.row_indices
|
||||||
|
else 0
|
||||||
|
)
|
||||||
|
|
||||||
|
def _on_base_change(g=gid, diff=differing_cols):
|
||||||
|
"""Reset column pickers when the base row changes."""
|
||||||
|
for c in diff:
|
||||||
|
st.session_state.pop(f"col_{g}_{c}", None)
|
||||||
|
|
||||||
|
selected_survivor = st.radio(
|
||||||
|
"Base row (keep)",
|
||||||
|
options=group.row_indices,
|
||||||
|
index=default_base,
|
||||||
|
format_func=lambda idx: f"Row {idx + 1}",
|
||||||
|
key=f"base_row_{gid}",
|
||||||
|
horizontal=True,
|
||||||
|
on_change=_on_base_change,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Customize columns (progressive disclosure) ---
|
||||||
|
if differing_cols:
|
||||||
|
customize = st.checkbox(
|
||||||
|
f"Customize columns ({len(differing_cols)} differ)",
|
||||||
|
key=f"customize_{gid}",
|
||||||
|
value=False,
|
||||||
|
)
|
||||||
|
if customize:
|
||||||
|
base_pos = group.row_indices.index(selected_survivor)
|
||||||
|
st.caption("Pick which row's value to use for each column:")
|
||||||
|
for col in differing_cols:
|
||||||
|
def _fmt(idx: int, c: str = col) -> str:
|
||||||
|
val = str(
|
||||||
|
st.session_state["df"].iloc[idx].get(c, "")
|
||||||
|
).strip()
|
||||||
|
return f"Row {idx + 1}: {val or '(empty)'}"
|
||||||
|
|
||||||
|
st.selectbox(
|
||||||
|
col,
|
||||||
|
options=group.row_indices,
|
||||||
|
index=base_pos,
|
||||||
|
format_func=_fmt,
|
||||||
|
key=f"col_{gid}_{col}",
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Action buttons ---
|
||||||
|
def _on_merge(
|
||||||
|
g=gid, indices=group.row_indices, diff=differing_cols,
|
||||||
|
):
|
||||||
|
the_df = st.session_state["df"]
|
||||||
|
base_idx = st.session_state.get(f"base_row_{g}", indices[0])
|
||||||
|
ovr: dict[str, str] = {}
|
||||||
|
for c in diff:
|
||||||
|
col_key = f"col_{g}_{c}"
|
||||||
|
if col_key in st.session_state:
|
||||||
|
source_idx = st.session_state[col_key]
|
||||||
|
if source_idx != base_idx:
|
||||||
|
ovr[c] = str(the_df.iloc[source_idx].get(c, ""))
|
||||||
|
st.session_state["review_decisions"][g] = {
|
||||||
|
"action": True,
|
||||||
|
"survivor_idx": base_idx,
|
||||||
|
"overrides": ovr,
|
||||||
|
}
|
||||||
|
|
||||||
def _on_keep(g=gid):
|
def _on_keep(g=gid):
|
||||||
st.session_state["review_decisions"][g] = False
|
st.session_state["review_decisions"][g] = {
|
||||||
|
"action": False,
|
||||||
|
"survivor_idx": group.survivor_index,
|
||||||
|
"overrides": {},
|
||||||
|
}
|
||||||
|
|
||||||
btn_left, btn_mid, _btn_right = st.columns(3)
|
btn_left, btn_mid, _btn_right = st.columns(3)
|
||||||
with btn_left:
|
with btn_left:
|
||||||
st.button("Merge", key=f"merge_{gid}",
|
st.button(
|
||||||
type="primary", on_click=_on_merge)
|
"Merge", key=f"merge_{gid}",
|
||||||
|
type="primary", on_click=_on_merge,
|
||||||
|
)
|
||||||
with btn_mid:
|
with btn_mid:
|
||||||
st.button("Keep Both", key=f"keep_{gid}",
|
st.button(
|
||||||
on_click=_on_keep)
|
"Keep Both", key=f"keep_{gid}",
|
||||||
|
on_click=_on_keep,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -404,6 +503,73 @@ def results_summary(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_review_decisions(
    original_df: pd.DataFrame,
    match_groups: list[MatchResult],
    decisions: dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the final DataFrames by applying the user's review decisions.

    Handles per-group survivor selection and column overrides without
    re-running the deduplication engine.

    Args:
        original_df: The DataFrame the match groups were computed on. Row
            indices in the groups and decisions are positional (``iloc``).
        match_groups: Groups to resolve. Each must expose ``group_id``,
            ``row_indices`` and ``survivor_index``.
        decisions: Mapping of ``group_id`` to a decision. A decision is
            normally ``{"action": bool, "survivor_idx": int,
            "overrides": {col: val}}``. A missing entry means "merge using
            the group's own survivor". A plain ``bool`` is also accepted
            (``True`` merges with the group's own survivor, ``False`` keeps
            every row) so that values stored in the older bool-only format
            do not crash the export.

    Returns:
        ``(deduplicated_df, removed_df)``, both with a fresh 0..n-1 index.
        ``removed_df`` is an empty, column-less DataFrame when nothing was
        removed.
    """
    remove_indices: set[int] = set()
    row_overrides: dict[int, dict[str, str]] = {}

    for group in match_groups:
        decision = decisions.get(group.group_id)

        if isinstance(decision, dict):
            merge = bool(decision.get("action", True))
            survivor_idx = decision.get("survivor_idx", group.survivor_index)
            ovr = decision.get("overrides") or {}
        else:
            # ``None`` = no decision yet -> accept with the group's defaults.
            # A bare bool is the older decision format.
            merge = decision is None or bool(decision)
            survivor_idx = group.survivor_index
            ovr = {}

        # Keep both — leave every row of this group untouched.
        if not merge:
            continue

        # A stored survivor that is not part of the group (stale or missing
        # value) would otherwise cause *all* rows of the group to be removed.
        if survivor_idx not in group.row_indices:
            survivor_idx = group.survivor_index

        remove_indices.update(
            idx for idx in group.row_indices if idx != survivor_idx
        )

        if ovr:
            # Merge rather than overwrite so a row that survives in several
            # groups keeps the overrides chosen in each of them.
            row_overrides.setdefault(survivor_idx, {}).update(ovr)

    # Build output DataFrames
    keep_indices = [
        i for i in range(len(original_df)) if i not in remove_indices
    ]
    deduped = original_df.iloc[keep_indices].copy().reset_index(drop=True)

    if row_overrides:
        # Map original positions to positions in the filtered frame so only
        # the overridden cells are touched; untouched columns keep their dtype.
        new_position = {old: new for new, old in enumerate(keep_indices)}
        for old_pos, col_values in row_overrides.items():
            target = new_position.get(old_pos)
            if target is None:
                # The survivor was itself removed by another group's decision.
                continue
            for col, val in col_values.items():
                if col not in deduped.columns:
                    continue
                # Override values are strings; make the column able to hold
                # one before assigning into e.g. a numeric column.
                if deduped[col].dtype != object:
                    deduped[col] = deduped[col].astype(object)
                deduped.at[target, col] = val

    removed = (
        original_df.iloc[sorted(remove_indices)].copy().reset_index(drop=True)
        if remove_indices
        else pd.DataFrame()
    )

    return deduped, removed
|
||||||
|
|
||||||
|
|
||||||
def _build_match_groups_csv(
|
def _build_match_groups_csv(
|
||||||
result: DeduplicationResult,
|
result: DeduplicationResult,
|
||||||
original_df: pd.DataFrame,
|
original_df: pd.DataFrame,
|
||||||
|
|||||||
Reference in New Issue
Block a user