# datatools-dev/tests/gui/test_dedup_review.py

"""Dedup review widget tests.

``match_group_card`` from ``src.gui.components`` has two modes (decided
/ undecided) and a Confirm/Undo flow keyed by session_state. We test
each state by exercising the parent Find Duplicates page end to end and
then poking at ``review_decisions`` directly.

Why not unit-test ``match_group_card`` in isolation? AppTest needs a
real page module, not a function call, so we drive the page and verify
the side effects on session_state. This catches integration bugs the
unit test couldn't see (e.g., session-state key drift between the
page and the component).
"""

from __future__ import annotations

import pandas as pd
import pytest

from .conftest import collected_text, stash_upload


# We need a frame that produces at least one match group. The 3-row
# small_csv has two Alice rows that share an email (case-folded) → one
# group of two members.
def _run_with_results(app):
    """Drive the page through to the post-Find-Duplicates state.

    1. First ``run()`` — page picks up the stashed upload, reads it, and
       renders the preview + Find Duplicates button. ``result`` is None.
    2. Click Find Duplicates and ``run()`` again — page calls
       ``deduplicate()`` and stashes the result. Match group cards
       render on this pass.

    Mirrors what a real user does instead of trying to short-circuit
    the page by stashing ``result`` directly (the page resets it to
    None on every new upload).

    Args:
        app: Page driver exposing ``run()`` and a ``button`` collection
            whose items have a ``label`` and a ``click()`` that returns
            something runnable.

    Raises:
        AssertionError: If the first run rendered no button whose label
            contains "Find Duplicates".
    """
    app.run()
    target = next(
        (b for b in app.button if "Find Duplicates" in b.label), None,
    )
    if target is None:
        # A bare ``next()`` would surface as an opaque StopIteration;
        # report what actually rendered so the failure is diagnosable.
        labels = [b.label for b in app.button]
        raise AssertionError(
            f"Find Duplicates button not rendered; buttons: {labels}"
        )
    target.click().run()


class TestMatchGroupCardUndecided:
    """A freshly-found group has no decision → the card renders the
    interactive editor + Confirm button."""

    def test_card_expander_present(self, app_factory, small_csv_bytes):
        """The first match group shows up as a "Group 1" expander."""
        app = app_factory("1_Deduplicator")
        stash_upload(app, name="messy.csv", data=small_csv_bytes)
        _run_with_results(app)
        # An expander per group, so the fixture has to yield at least
        # one match group for the label check below to mean anything.
        result = app.session_state["result"]
        assert len(result.match_groups) >= 1, (
            "fixture should produce at least one match group"
        )
        # Match group cards use ``st.expander``. AppTest exposes them
        # via ``app.expander``.
        labels = [e.label for e in app.expander]
        assert any("Group 1" in lbl for lbl in labels), (
            f"undecided card expander missing; got: {labels}"
        )

    def test_confirm_button_renders_for_undecided_group(
        self, app_factory, small_csv_bytes,
    ):
        """An undecided group's card offers a button labelled "Confirm"."""
        app = app_factory("1_Deduplicator")
        stash_upload(app, name="messy.csv", data=small_csv_bytes)
        _run_with_results(app)
        # No group means no card and therefore no Confirm button, so
        # state the precondition explicitly instead of relying on an
        # IndexError from indexing into an empty list.
        result = app.session_state["result"]
        assert result.match_groups, (
            "fixture should produce at least one match group"
        )
        # The button is matched by its visible label only; its widget
        # key (presumably ``confirm_<group_id>`` — verify against the
        # component) is deliberately not relied on here.
        labels = [b.label for b in app.button]
        # Streamlit renders the button label as "Confirm".
        assert any(lbl == "Confirm" for lbl in labels), (
            f"undecided card missing Confirm button; buttons: {labels}"
        )


class TestBatchActions:
    """Accept All / Reject All / Clear Decisions are the three batch
    buttons that mutate ``review_decisions`` across all groups."""

    def test_accept_all_populates_decisions(self, app_factory, small_csv_bytes):
        """Accept All records one single-survivor decision per group."""
        app = app_factory("1_Deduplicator")
        stash_upload(app, name="messy.csv", data=small_csv_bytes)
        _run_with_results(app)

        target = next(b for b in app.button if b.label == "Accept All")
        target.click().run()
        decisions = app.session_state["review_decisions"]
        result = app.session_state["result"]
        # With zero groups the checks below would pass vacuously
        # (0 == 0, empty loop), so require at least one group up front.
        assert result.match_groups, (
            "fixture should produce at least one match group"
        )
        assert len(decisions) == len(result.match_groups), (
            "Accept All should record a decision per group; "
            f"got {len(decisions)} decisions for "
            f"{len(result.match_groups)} groups"
        )
        # Each Accept-All decision keeps exactly one row (the survivor).
        for d in decisions.values():
            assert len(d["keep_indices"]) == 1

    def test_reject_all_keeps_every_member(self, app_factory, small_csv_bytes):
        """Reject All records a keep-everything decision for each group."""
        app = app_factory("1_Deduplicator")
        stash_upload(app, name="messy.csv", data=small_csv_bytes)
        _run_with_results(app)

        target = next(b for b in app.button if b.label == "Reject All")
        target.click().run()
        decisions = app.session_state["review_decisions"]
        result = app.session_state["result"]
        # An empty group list would skip the loop and pass without
        # checking anything, so require at least one group up front.
        assert result.match_groups, (
            "fixture should produce at least one match group"
        )
        # Reject = keep every member → keep_indices == row_indices.
        for g in result.match_groups:
            assert set(decisions[g.group_id]["keep_indices"]) == set(g.row_indices)

    def test_clear_decisions_wipes_state(self, app_factory, small_csv_bytes):
        """Clear Decisions empties ``review_decisions`` after it was filled."""
        app = app_factory("1_Deduplicator")
        stash_upload(app, name="messy.csv", data=small_csv_bytes)
        _run_with_results(app)

        # Populate decisions via Accept All, then Clear, then verify.
        accept = next(b for b in app.button if b.label == "Accept All")
        accept.click().run()
        assert app.session_state["review_decisions"], (
            "precondition failed: Accept All didn't populate"
        )

        clear = next(b for b in app.button if "Clear Decisions" in b.label)
        clear.click().run()
        assert app.session_state["review_decisions"] == {}


class TestApplyReviewDecisions:
    """The component-layer ``apply_review_decisions`` function is the
    actual semantic engine; unit-test it directly. The GUI just feeds
    its output to a download button.

    NOTE(review): every test here requests ``app_factory`` but never
    uses it in the body — confirm whether the fixture is needed for its
    setup side effects before dropping it from the signatures.
    """

    @staticmethod
    def _dedup_small_csv(small_csv_bytes):
        """Parse the CSV fixture and run ``deduplicate`` on it.

        Args:
            small_csv_bytes: Raw CSV content of the shared fixture.

        Returns:
            ``(df, result)`` — the parsed frame (every column read as
            ``str``, empty cells kept as empty strings rather than NaN)
            and the value returned by ``deduplicate(df, preview=True)``.

        Raises:
            AssertionError: If the dedup result has no match groups;
                without one the callers' checks would pass vacuously.
        """
        import io

        from src.core import deduplicate

        df = pd.read_csv(
            io.BytesIO(small_csv_bytes), dtype=str, keep_default_na=False,
        )
        result = deduplicate(df, preview=True)
        assert result.match_groups, (
            "fixture should produce at least one match group"
        )
        return df, result

    def test_keep_all_means_no_rows_removed(
        self, app_factory, small_csv_bytes,
    ):
        """Keeping every member of every group removes nothing."""
        from src.gui.components import apply_review_decisions

        df, result = self._dedup_small_csv(small_csv_bytes)
        decisions = {
            g.group_id: {
                "keep_indices": list(g.row_indices),
                "overrides": {},
            }
            for g in result.match_groups
        }
        deduped, removed = apply_review_decisions(df, result.match_groups, decisions)
        assert len(deduped) == len(df), (
            "Keep-All should preserve every row"
        )
        assert removed.empty

    def test_merge_decision_drops_losers(
        self, app_factory, small_csv_bytes,
    ):
        """Keeping one member per group removes all the other members."""
        from src.gui.components import apply_review_decisions

        df, result = self._dedup_small_csv(small_csv_bytes)
        # Merge each group to its first member.
        decisions = {
            g.group_id: {
                "keep_indices": [g.row_indices[0]],
                "overrides": {},
            }
            for g in result.match_groups
        }
        deduped, removed = apply_review_decisions(df, result.match_groups, decisions)
        expected_removed = sum(len(g.row_indices) - 1 for g in result.match_groups)
        assert len(removed) == expected_removed
        assert len(deduped) == len(df) - expected_removed

    def test_column_override_applies_to_survivor(
        self, app_factory, small_csv_bytes,
    ):
        """A column override in a decision shows up in the deduped frame."""
        from src.gui.components import apply_review_decisions

        df, result = self._dedup_small_csv(small_csv_bytes)
        group = result.match_groups[0]
        survivor = group.row_indices[0]
        decisions = {
            group.group_id: {
                "keep_indices": [survivor],
                "overrides": {"phone": "OVERRIDE_VALUE"},
            }
        }
        deduped, _ = apply_review_decisions(df, result.match_groups, decisions)
        # The override must land on exactly one row of ``deduped``. The
        # row is located by the sentinel override value itself, which
        # does not occur in the fixture data otherwise.
        match = deduped[deduped["phone"] == "OVERRIDE_VALUE"]
        assert len(match) == 1, (
            f"override didn't apply; deduped frame: {deduped.to_dict()}"
        )