datatools-dev/tests/gui/test_advanced_panels.py

"""Advanced-options panel tests.

``config_panel`` (in ``src.gui.components``) is the dedup-page's
expander that houses every per-column / per-strategy knob. It's the
densest single widget surface in the GUI, so a session-state key drift
in there cascades into every dedup session.

We exercise it via the Deduplicator page (rendering ``config_panel``
in isolation requires a fake Streamlit context). The page provides
the surrounding state; we poke widgets and verify their effects.
"""

from __future__ import annotations

import pandas as pd
import pytest

from .conftest import stash_upload


GATED_PAGE = "1_Deduplicator"


def _render_page(app_factory, small_csv_bytes):
    """Return the Deduplicator page after one run with ``messy.csv`` stashed.

    ``app_factory`` is called with ``GATED_PAGE`` to build the app, the
    CSV bytes are handed to ``stash_upload`` under the name ``messy.csv``,
    and the app is run once before being returned to the caller.
    """
    page = app_factory(GATED_PAGE)
    stash_upload(page, name="messy.csv", data=small_csv_bytes)
    page.run()
    return page


# ---------------------------------------------------------------------------
# Expander presence + collapsed state
# ---------------------------------------------------------------------------

class TestAdvancedExpander:
    """Presence check for the "Advanced Options" expander on the page."""

    def test_advanced_options_expander_renders(self, app_factory, small_csv_bytes):
        """At least one expander label must contain ``Advanced Options``."""
        app = _render_page(app_factory, small_csv_bytes)
        labels = [expander.label for expander in app.expander]

        # Stop at the first hit, exactly like a short-circuiting ``any``.
        found = False
        for text in labels:
            if "Advanced Options" in text:
                found = True
                break

        assert found, (
            f"Advanced Options expander missing; expanders: {labels}"
        )


# ---------------------------------------------------------------------------
# Algorithm selector
# ---------------------------------------------------------------------------

class TestAlgorithmSelector:
    """Pin the "Fuzzy algorithm" selectbox.

    The dropdown drives ``Algorithm.{LEVENSHTEIN, JARO_WINKLER,
    TOKEN_SET_RATIO}`` on every column. Its default must stay
    ``jaro_winkler`` because the strong-key ``build_default_strategies``
    assumes that value.
    """

    _LABEL = "Fuzzy algorithm"

    def test_default_algorithm_is_jaro_winkler(self, app_factory, small_csv_bytes):
        """Exactly one matching selectbox exists and defaults to jaro_winkler."""
        app = _render_page(app_factory, small_csv_bytes)
        # Match on the exact label; there must be one and only one.
        matches = [box for box in app.selectbox if box.label == self._LABEL]
        assert len(matches) == 1
        selector = matches[0]
        assert selector.value == "jaro_winkler"

    def test_algorithm_options_complete(self, app_factory, small_csv_bytes):
        """The selectbox offers exactly the three known algorithms."""
        app = _render_page(app_factory, small_csv_bytes)
        selector = next(
            box for box in app.selectbox if box.label == self._LABEL
        )
        offered = set(selector.options)
        assert offered == {
            "jaro_winkler", "levenshtein", "token_set_ratio",
        }


# ---------------------------------------------------------------------------
# Threshold slider
# ---------------------------------------------------------------------------

class TestThresholdSlider:
    """Pin the default value and bounds of the similarity threshold slider."""

    @staticmethod
    def _is_similarity_slider(slider):
        """Return True when the slider's label mentions ``Similarity``.

        A missing (``None``) label is treated as an empty string.
        """
        return "Similarity" in (slider.label or "")

    def test_default_threshold_is_85(self, app_factory, small_csv_bytes):
        """Exactly one similarity slider exists and it starts at 85."""
        app = _render_page(app_factory, small_csv_bytes)
        candidates = list(filter(self._is_similarity_slider, app.slider))
        assert len(candidates) == 1
        assert candidates[0].value == 85

    def test_threshold_bounds(self, app_factory, small_csv_bytes):
        """The similarity slider spans 50 to 100."""
        app = _render_page(app_factory, small_csv_bytes)
        threshold = next(filter(self._is_similarity_slider, app.slider))
        assert threshold.min == 50
        assert threshold.max == 100


# ---------------------------------------------------------------------------
# Survivor rule selector
# ---------------------------------------------------------------------------

class TestSurvivorSelector:
    """Pin the "Survivor rule" selectbox: default, options, and a clean
    rerun after choosing ``most-recent``."""

    _LABEL = "Survivor rule"

    def _survivor_box(self, app):
        """Return the first selectbox whose label is ``Survivor rule``."""
        return next(box for box in app.selectbox if box.label == self._LABEL)

    def test_default_is_first(self, app_factory, small_csv_bytes):
        """The rule defaults to ``first``."""
        app = _render_page(app_factory, small_csv_bytes)
        selector = self._survivor_box(app)
        assert selector.value == "first"

    def test_all_four_rules_offered(self, app_factory, small_csv_bytes):
        """The selectbox offers exactly the four survivor rules."""
        app = _render_page(app_factory, small_csv_bytes)
        offered = set(self._survivor_box(app).options)
        assert offered == {
            "first", "last", "most-complete", "most-recent",
        }

    def test_selecting_most_recent_does_not_crash(
        self, app_factory, small_csv_bytes,
    ):
        """Choosing ``most-recent`` is expected to reveal a Date column
        dropdown; only the no-crash invariant is pinned here."""
        app = _render_page(app_factory, small_csv_bytes)
        selector = self._survivor_box(app)
        selector.select("most-recent").run()
        assert not app.exception


# ---------------------------------------------------------------------------
# Merge checkbox
# ---------------------------------------------------------------------------

class TestMergeCheckbox:
    """Pin the "Merge mode" checkbox: off by default, and toggling it on
    both survives a rerun and leaves the page exception-free."""

    def test_merge_default_off(self, app_factory, small_csv_bytes):
        """Exactly one "Merge mode" checkbox exists and it starts unchecked."""
        app = _render_page(app_factory, small_csv_bytes)
        merge_boxes = [c for c in app.checkbox if c.label == "Merge mode"]
        assert len(merge_boxes) == 1
        assert merge_boxes[0].value is False

    def test_toggling_merge_does_not_crash(
        self, app_factory, small_csv_bytes,
    ):
        """Checking "Merge mode" reruns cleanly and the checked state sticks."""
        app = _render_page(app_factory, small_csv_bytes)
        merge = next(c for c in app.checkbox if c.label == "Merge mode")
        merge.check().run()
        assert not app.exception

        # After checking, the value should persist across the rerun via
        # the widget's own key. Look the checkbox up again instead of
        # trusting the pre-rerun handle, so the assertion reflects what
        # the page rendered on the second pass.
        rerendered = [c for c in app.checkbox if c.label == "Merge mode"]
        assert len(rerendered) == 1, (
            f"expected one 'Merge mode' checkbox after rerun; "
            f"got {len(rerendered)}"
        )
        assert rerendered[0].value is True


# ---------------------------------------------------------------------------
# Column multiselects
# ---------------------------------------------------------------------------

class TestColumnMultiselects:
    """Match-on / Strong-keys / Fuzzy multiselects use ``st.multiselect``
    on every column. Empty default = auto-detect."""

    # Labels of the three column pickers every test in this class targets.
    _LABELS = frozenset({"Match on columns", "Strong keys", "Fuzzy columns"})

    @classmethod
    def _column_multiselects(cls, app):
        """Return the multiselects labelled in ``_LABELS``.

        Fails if any of the three labels is absent, so the per-widget
        tests cannot pass vacuously when a widget is renamed or removed.
        """
        found = [ms for ms in app.multiselect if ms.label in cls._LABELS]
        missing = cls._LABELS - {ms.label for ms in found}
        assert not missing, (
            f"column multiselects missing: {sorted(missing)}"
        )
        return found

    def test_three_multiselects_present(self, app_factory, small_csv_bytes):
        """All three column multiselects are rendered."""
        app = _render_page(app_factory, small_csv_bytes)
        labels = {m.label for m in app.multiselect}
        assert self._LABELS <= labels

    def test_defaults_are_empty(self, app_factory, small_csv_bytes):
        """Each column multiselect starts with no selection."""
        app = _render_page(app_factory, small_csv_bytes)
        for ms in self._column_multiselects(app):
            assert ms.value == [], (
                f"{ms.label!r} default should be []; got {ms.value}"
            )

    def test_options_match_dataframe_columns(self, app_factory, small_csv_bytes):
        """Each column multiselect offers the ``df`` columns, in order."""
        app = _render_page(app_factory, small_csv_bytes)
        df_cols = list(app.session_state["df"].columns)
        for ms in self._column_multiselects(app):
            assert list(ms.options) == df_cols, (
                f"{ms.label!r} options {list(ms.options)} != "
                f"dataframe columns {df_cols}"
            )


# ---------------------------------------------------------------------------
# Save / Load config
# ---------------------------------------------------------------------------

class TestConfigSaveLoadButtons:
    """Presence checks for the config save button and config uploader."""

    @staticmethod
    def _any_contains(labels, needle):
        """Return True if ``needle`` is a substring of any label."""
        return any(needle in text for text in labels)

    def test_save_settings_button_present(self, app_factory, small_csv_bytes):
        """Some button label contains ``Save current settings``."""
        app = _render_page(app_factory, small_csv_bytes)
        button_labels = [button.label for button in app.button]
        assert self._any_contains(button_labels, "Save current settings")

    def test_config_file_uploader_present(self, app_factory, small_csv_bytes):
        """Some uploader label contains ``Load config profile``."""
        app = _render_page(app_factory, small_csv_bytes)
        # Uploaders are read from ``app.file_uploader``. The page is
        # expected to carry two of them: the main file
        # (pickup-or-upload) and the config JSON. Only the config one
        # is checked here.
        labels = [uploader.label for uploader in app.file_uploader]
        assert self._any_contains(labels, "Load config profile"), (
            f"config uploader missing; uploaders: {labels}"
        )