"""Dedup review widget tests. ``match_group_card`` from ``src.gui.components`` has two modes (decided / undecided) and a Confirm/Undo flow keyed by session_state. We test each state by exercising the parent Deduplicator page end to end and then poking at ``review_decisions`` directly. Why not unit-test ``match_group_card`` in isolation? AppTest needs a real page module, not a function call, so we drive the page and verify the side effects on session_state. This catches integration bugs the unit test couldn't see (e.g., session-state key drift between the page and the component). """ from __future__ import annotations import pandas as pd import pytest from .conftest import collected_text, stash_upload # We need a frame that produces at least one match group. The 3-row # small_csv has two Alice rows that share an email (case-folded) → one # group of two members. def _run_with_results(app): """Drive the page through to the post-Find-Duplicates state. 1. First ``run()`` — page picks up the stashed upload, reads it, and renders the preview + Find Duplicates button. ``result`` is None. 2. Click Find Duplicates and ``run()`` again — page calls ``deduplicate()`` and stashes the result. Match group cards render on this pass. Mirrors what a real user does instead of trying to short-circuit the page by stashing ``result`` directly (the page resets it to None on every new upload). """ app.run() target = next(b for b in app.button if "Find Duplicates" in b.label) target.click().run() class TestMatchGroupCardUndecided: """A freshly-found group has no decision → the card renders the interactive editor + Confirm button.""" def test_card_expander_present(self, app_factory, small_csv_bytes): app = app_factory("1_Deduplicator") stash_upload(app, name="messy.csv", data=small_csv_bytes) _run_with_results(app) # An expander per group. The dedup result should produce # exactly one match group on this fixture. result = app.session_state["result"] assert len(result.match_groups) >= 1, ( "fixture should produce at least one match group" ) # Match group cards use ``st.expander``. AppTest exposes them # via ``app.expander``. labels = [e.label for e in app.expander] assert any("Group 1" in lbl for lbl in labels), ( f"undecided card expander missing; got: {labels}" ) def test_confirm_button_renders_for_undecided_group( self, app_factory, small_csv_bytes, ): app = app_factory("1_Deduplicator") stash_upload(app, name="messy.csv", data=small_csv_bytes) _run_with_results(app) # Confirm button is keyed ``confirm_``. result = app.session_state["result"] gid = result.match_groups[0].group_id labels = [b.label for b in app.button] # Streamlit renders the button label as "Confirm". assert any(lbl == "Confirm" for lbl in labels), ( f"undecided card missing Confirm button; buttons: {labels}" ) class TestBatchActions: """Accept All / Reject All / Clear Decisions are the three batch buttons that mutate ``review_decisions`` across all groups.""" def test_accept_all_populates_decisions(self, app_factory, small_csv_bytes): app = app_factory("1_Deduplicator") stash_upload(app, name="messy.csv", data=small_csv_bytes) _run_with_results(app) target = next(b for b in app.button if b.label == "Accept All") target.click().run() decisions = app.session_state["review_decisions"] result = app.session_state["result"] assert len(decisions) == len(result.match_groups), ( "Accept All should record a decision per group; " f"got {len(decisions)} decisions for " f"{len(result.match_groups)} groups" ) # Each Accept-All decision keeps exactly one row (the survivor). for d in decisions.values(): assert len(d["keep_indices"]) == 1 def test_reject_all_keeps_every_member(self, app_factory, small_csv_bytes): app = app_factory("1_Deduplicator") stash_upload(app, name="messy.csv", data=small_csv_bytes) _run_with_results(app) target = next(b for b in app.button if b.label == "Reject All") target.click().run() decisions = app.session_state["review_decisions"] result = app.session_state["result"] # Reject = keep every member → keep_indices == row_indices. for g in result.match_groups: assert set(decisions[g.group_id]["keep_indices"]) == set(g.row_indices) def test_clear_decisions_wipes_state(self, app_factory, small_csv_bytes): app = app_factory("1_Deduplicator") stash_upload(app, name="messy.csv", data=small_csv_bytes) _run_with_results(app) # Populate decisions via Accept All, then Clear, then verify. accept = next(b for b in app.button if b.label == "Accept All") accept.click().run() assert app.session_state["review_decisions"], ( "precondition failed: Accept All didn't populate" ) clear = next(b for b in app.button if "Clear Decisions" in b.label) clear.click().run() assert app.session_state["review_decisions"] == {} class TestApplyReviewDecisions: """The component-layer ``apply_review_decisions`` function is the actual semantic engine; unit-test it directly. The GUI just feeds its output to a download button.""" def test_keep_all_means_no_rows_removed( self, app_factory, small_csv_bytes, ): from src.gui.components import apply_review_decisions from src.core import deduplicate import io df = pd.read_csv(io.BytesIO(small_csv_bytes), dtype=str, keep_default_na=False) result = deduplicate(df, preview=True) decisions = { g.group_id: { "keep_indices": list(g.row_indices), "overrides": {}, } for g in result.match_groups } deduped, removed = apply_review_decisions(df, result.match_groups, decisions) assert len(deduped) == len(df), ( "Keep-All should preserve every row" ) assert removed.empty def test_merge_decision_drops_losers( self, app_factory, small_csv_bytes, ): from src.gui.components import apply_review_decisions from src.core import deduplicate import io df = pd.read_csv(io.BytesIO(small_csv_bytes), dtype=str, keep_default_na=False) result = deduplicate(df, preview=True) # Merge each group to its first member. decisions = { g.group_id: { "keep_indices": [g.row_indices[0]], "overrides": {}, } for g in result.match_groups } deduped, removed = apply_review_decisions(df, result.match_groups, decisions) expected_removed = sum(len(g.row_indices) - 1 for g in result.match_groups) assert len(removed) == expected_removed assert len(deduped) == len(df) - expected_removed def test_column_override_applies_to_survivor( self, app_factory, small_csv_bytes, ): from src.gui.components import apply_review_decisions from src.core import deduplicate import io df = pd.read_csv(io.BytesIO(small_csv_bytes), dtype=str, keep_default_na=False) result = deduplicate(df, preview=True) group = result.match_groups[0] survivor = group.row_indices[0] decisions = { group.group_id: { "keep_indices": [survivor], "overrides": {"phone": "OVERRIDE_VALUE"}, } } deduped, _ = apply_review_decisions(df, result.match_groups, decisions) # The survivor row in ``deduped`` must carry the override. Find # it via the original (non-loser) name. match = deduped[deduped["phone"] == "OVERRIDE_VALUE"] assert len(match) == 1, ( f"override didn't apply; deduped frame: {deduped.to_dict()}" )