datatools-dev/src/gui/components.py

"""Reusable Streamlit widgets for the DataTools GUI."""

from __future__ import annotations

import io
from typing import Optional

import pandas as pd
import streamlit as st

from src.core.dedup import (
    Algorithm,
    ColumnMatchStrategy,
    DeduplicationResult,
    MatchResult,
    MatchStrategy,
    SurvivorRule,
)
from src.core.config import (
    ColumnStrategyConfig,
    DeduplicationConfig,
    StrategyConfig,
)
from src.core.normalizers import NormalizerType


# ---------------------------------------------------------------------------
# App chrome — hide Streamlit default UI for app-like feel
# ---------------------------------------------------------------------------

# Stylesheet injected by hide_streamlit_chrome(). Every rule carries
# `!important`; the selectors target the header bar, the hamburger/main menu,
# the footer and the deploy button, and the last rule sets the main block
# container's top padding to 1rem.
_HIDE_CHROME_CSS = """
<style>
/* Hide Streamlit header bar */
header[data-testid="stHeader"] {
    display: none !important;
}
/* Hide hamburger menu */
button[kind="header"] {
    display: none !important;
}
#MainMenu {
    display: none !important;
}
/* Hide footer */
footer {
    display: none !important;
}
/* Hide deploy button */
[data-testid="stAppDeployButton"] {
    display: none !important;
}
/* Reclaim top padding lost from hidden header */
.stAppViewBlockContainer,
[data-testid="stAppViewBlockContainer"] {
    padding-top: 1rem !important;
}
</style>
"""


def hide_streamlit_chrome() -> None:
    """Strip Streamlit's stock header, menus, footer and deploy button.

    Emits the ``_HIDE_CHROME_CSS`` stylesheet through ``st.markdown`` with
    ``unsafe_allow_html=True`` so the ``<style>`` element is passed to the
    page as HTML.
    """
    st.markdown(
        _HIDE_CHROME_CSS,
        unsafe_allow_html=True,
    )


# ---------------------------------------------------------------------------
# Config panel (advanced options)
# ---------------------------------------------------------------------------

def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander. Returns a settings dict.

    The expander holds the column/algorithm selectors, optional per-column
    normalizer pickers, and controls for loading or saving a JSON config
    profile.  A successfully parsed uploaded profile is stored in
    ``st.session_state["loaded_config"]``; this function does not apply it
    to the widgets.

    Args:
        df: The loaded data. Only its column names are used, to populate
            the column selectors.

    Keys returned:
        strategies: list[MatchStrategy] | None
        survivor_rule: SurvivorRule
        date_column: str | None
        merge: bool
    """
    # Function-scope import: JSON is only needed by the profile load/save
    # controls in this panel.
    import json

    columns = list(df.columns)
    normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)

        with col_a:
            # Only explicit choices are recorded; "auto" and "none" leave the
            # column out of the map.
            normalize_map: dict[str, str] = {}
            # NOTE(review): pickers follow the fuzzy columns when any are
            # selected, whereas _build_strategies iterates "Match on columns"
            # first — confirm that asymmetry is intended.
            target_cols = fuzzy_cols or subset_cols
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)

        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    # getvalue() returns the whole buffer irrespective of the
                    # current read position, so parsing also works when the
                    # upload has already been read once.
                    data = json.loads(config_file.getvalue())
                    loaded = DeduplicationConfig.from_dict(data)
                except Exception as e:
                    # UI boundary: surface any parse/validation failure to
                    # the user instead of crashing the page.
                    st.error(f"Failed to load config: {e}")
                else:
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")

        with cfg_right:
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg.to_dict(), indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }


def _build_strategies(
    subset_cols: list[str],
    key_cols: list[str],
    fuzzy_cols: list[str],
    algorithm: str,
    threshold: int,
    normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
    """Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
    strategies: list[MatchStrategy] = []

    # If user selected columns explicitly, build from those
    if subset_cols or fuzzy_cols:
        target_cols = subset_cols if subset_cols else fuzzy_cols
        fuzzy_set = set(fuzzy_cols)
        col_strats: list[ColumnMatchStrategy] = []
        for col in target_cols:
            norm = None
            if col in normalize_map:
                norm = NormalizerType(normalize_map[col])
            if col in fuzzy_set:
                algo = Algorithm(algorithm)
                thresh = float(threshold)
            else:
                algo = Algorithm.EXACT
                thresh = 100.0
            col_strats.append(ColumnMatchStrategy(
                column=col, algorithm=algo, threshold=thresh, normalizer=norm,
            ))
        strategies.append(MatchStrategy(column_strategies=col_strats))

    # Add strong key strategies
    if key_cols:
        for col in key_cols:
            strategies.append(MatchStrategy(column_strategies=[
                ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
            ]))

    return strategies if strategies else None


def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a ``DeduplicationConfig`` mirroring the current GUI state.

    The survivor label is stored with hyphens replaced by underscores, empty
    selections are stored as ``None``, and the threshold is stored as a
    float.  When the selections yield any strategies (see
    ``_build_strategies``) they are converted to their config counterparts
    and attached as ``cfg.strategies``.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    built = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not built:
        return config

    # Mirror each runtime strategy as its serialisable config equivalent.
    serialised = []
    for strategy in built:
        column_cfgs = []
        for cs in strategy.column_strategies:
            normalizer_name = cs.normalizer.value if cs.normalizer else None
            column_cfgs.append(
                ColumnStrategyConfig(
                    column=cs.column,
                    algorithm=cs.algorithm.value,
                    threshold=cs.threshold,
                    normalizer=normalizer_name,
                )
            )
        serialised.append(StrategyConfig(columns=column_cfgs))
    config.strategies = serialised
    return config


# ---------------------------------------------------------------------------
# Match group review card
# ---------------------------------------------------------------------------

def _find_differing_cols(
    group: MatchResult, df: pd.DataFrame, display_cols: list[str],
) -> list[str]:
    """Return columns where values differ across rows in the group."""
    differing = []
    for col in display_cols:
        values = set()
        for idx in group.row_indices:
            values.add(str(df.iloc[idx].get(col, "")).strip())
        if len(values) > 1:
            differing.append(col)
    return differing


def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> None:
    """Render an expandable match group card with side-by-side diff.

    Users select which rows to keep via checkboxes.  When exactly one row
    is kept they can also cherry-pick column values from the other rows.

    An undecided group is shown expanded with an editable table and a
    Confirm button; a decided group is shown collapsed with a read-only,
    diff-highlighted table and an Undo button.

    Decision format stored in ``st.session_state["review_decisions"]``::

        {group_id: {"keep_indices": [int, ...], "overrides": {col: val}}}

    Args:
        group: Match group to render. Reads ``confidence``, ``matched_on``,
            ``row_indices``, ``group_id`` and ``survivor_index``.
        df: Frame that ``row_indices`` index into by position. Columns
            whose name starts with ``_norm_`` are not displayed.
        group_num: Number shown in the card label.
    """
    confidence = group.confidence
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)
    gid = group.group_id

    decisions = st.session_state.get("review_decisions", {})
    has_decision = gid in decisions
    decision_dict = decisions.get(gid, {})
    keep_indices = decision_dict.get("keep_indices", [])
    overrides = decision_dict.get("overrides", {})

    # Build label — append decision status if already decided
    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )
    if has_decision:
        if len(keep_indices) == n_rows:
            label += " — Kept All"
        elif len(keep_indices) == 1:
            label += " — Merged (customized)" if overrides else " — Merged"
        else:
            label += f" — Split (kept {len(keep_indices)} of {n_rows})"

    # Decided groups collapse; undecided groups stay open
    expanded = not has_decision

    display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
    differing_cols = _find_differing_cols(group, df, display_cols)

    # Fetch each member row once; the tables below read many cells per row.
    group_rows = [df.iloc[idx] for idx in group.row_indices]

    with st.expander(label, expanded=expanded):
        if has_decision:
            # --- Decided state: read-only table with diff highlighting ---
            rows_data = []
            for idx, source in zip(group.row_indices, group_rows):
                row = {"Row": idx + 1}
                for col in display_cols:
                    row[col] = source.get(col, "")
                rows_data.append(row)
            compare_df = pd.DataFrame(rows_data).set_index("Row")

            def _highlight_diffs(s: pd.Series) -> list[str]:
                # Per column: compare every cell with the first row's value.
                # Amber = differs from it, red = empty while it is not.
                styles = []
                first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
                for val in s:
                    val_str = str(val).strip()
                    if val_str != first_val and val_str and first_val:
                        styles.append(
                            "background-color: rgba(245, 166, 35, 0.2)"
                        )
                    elif not val_str and first_val:
                        styles.append(
                            "background-color: rgba(240, 82, 82, 0.1)"
                        )
                    else:
                        styles.append("")
                return styles

            styled = compare_df.style.apply(_highlight_diffs, axis=0)
            st.dataframe(styled, use_container_width=True)

            if len(keep_indices) == n_rows:
                st.info("Decision: Kept All")
            elif len(keep_indices) == 1:
                msg = "Decision: Merge"
                if overrides:
                    msg += f" ({len(overrides)} column(s) customized)"
                st.success(msg)
            else:
                kept = ", ".join(str(i + 1) for i in sorted(keep_indices))
                st.success(
                    f"Decision: Keep rows {kept} "
                    f"(removing {n_rows - len(keep_indices)})"
                )

            def _undo(g=gid):
                # Drop the decision and the editor's widget state so the
                # group reopens with its default selection.
                st.session_state.get("review_decisions", {}).pop(g, None)
                st.session_state.pop(f"editor_{g}", None)

            st.button("Undo", key=f"undo_{gid}", on_click=_undo)

        else:
            # --- Undecided: interactive editor with inline checkboxes & dropdowns ---
            editor_rows = []
            for idx, source in zip(group.row_indices, group_rows):
                # Only the engine's proposed survivor starts out checked.
                row_data = {"Keep": idx == group.survivor_index, "Row": idx + 1}
                for col in display_cols:
                    row_data[col] = str(source.get(col, ""))
                editor_rows.append(row_data)
            editor_df = pd.DataFrame(editor_rows)

            col_config = {
                "Keep": st.column_config.CheckboxColumn(
                    "Keep", default=True, width="small",
                ),
                "Row": st.column_config.NumberColumn("Row", width="small"),
            }
            for col in differing_cols:
                # Dropdown options: the distinct (stripped) values present in
                # the group, in row order, plus an empty choice.
                vals = []
                for source in group_rows:
                    v = str(source.get(col, "")).strip()
                    if v not in vals:
                        vals.append(v)
                if "" not in vals:
                    vals.append("")
                col_config[col] = st.column_config.SelectboxColumn(
                    col, options=vals, required=False,
                )

            # Columns whose values agree across the group are not editable.
            disabled_cols = ["Row"] + [
                c for c in display_cols if c not in differing_cols
            ]

            edited = st.data_editor(
                editor_df,
                column_config=col_config,
                disabled=disabled_cols,
                use_container_width=True,
                hide_index=True,
                key=f"editor_{gid}",
            )

            # Read which rows are checked
            checked = [
                idx
                for i, idx in enumerate(group.row_indices)
                if edited.iloc[i]["Keep"]
            ]

            if differing_cols:
                st.caption(
                    f"Columns with differences (editable): "
                    f"{', '.join(differing_cols)}"
                )

            # Status + surviving rows preview
            if len(checked) == 0:
                st.warning("Select at least one row to keep.")
            else:
                if len(checked) == n_rows:
                    st.caption("Keeping all rows (no duplicates removed)")
                elif len(checked) == 1:
                    st.caption(
                        f"Merging into Row {checked[0] + 1}, "
                        f"removing {n_rows - 1} row(s)"
                    )
                else:
                    st.caption(
                        f"Keeping {len(checked)} rows, "
                        f"removing {n_rows - len(checked)}"
                    )

                # Build preview of surviving rows with edits applied
                checked_positions = [
                    i for i, idx in enumerate(group.row_indices)
                    if idx in checked
                ]
                preview = edited.iloc[checked_positions].drop(
                    columns=["Keep"],
                ).reset_index(drop=True)
                st.markdown("**Surviving rows preview:**")
                st.dataframe(preview, use_container_width=True, hide_index=True)

            # Confirm
            def _on_confirm(
                g=gid, indices=list(group.row_indices),
                diff=differing_cols, surv=group.survivor_index,
                frame=df,
            ):
                # Runs as a button callback, so the edits are read from the
                # editor's widget state rather than from `edited`.
                editor_state = st.session_state.get(f"editor_{g}", {})
                ed_rows = editor_state.get("edited_rows", {})

                # Determine which rows to keep
                keep = []
                for i, idx in enumerate(indices):
                    changes = ed_rows.get(i, {})
                    # Untouched checkboxes keep their initial value.
                    default_keep = idx == surv
                    if changes.get("Keep", default_keep):
                        keep.append(idx)
                if not keep:
                    keep = list(indices)

                # Column overrides (single-survivor merge only)
                ovr: dict[str, str] = {}
                if len(keep) == 1:
                    surv_idx = keep[0]
                    surv_pos = indices.index(surv_idx)
                    surv_changes = ed_rows.get(surv_pos, {})
                    for c in diff:
                        if c in surv_changes:
                            new_val = (
                                str(surv_changes[c])
                                if surv_changes[c] is not None
                                else ""
                            )
                            # Compare against the frame that was rendered in
                            # the editor; only real changes become overrides.
                            orig = str(
                                frame.iloc[surv_idx].get(c, "")
                            ).strip()
                            if new_val.strip() != orig:
                                ovr[c] = new_val

                # The read path tolerates a missing decisions dict, so the
                # write path creates it on first use instead of raising.
                if "review_decisions" not in st.session_state:
                    st.session_state["review_decisions"] = {}
                st.session_state["review_decisions"][g] = {
                    "keep_indices": keep,
                    "overrides": ovr,
                }

            st.button(
                "Confirm",
                key=f"confirm_{gid}",
                type="primary",
                on_click=_on_confirm,
                disabled=(len(checked) == 0),
            )


# ---------------------------------------------------------------------------
# Results summary + downloads
# ---------------------------------------------------------------------------

def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show the headline numbers for a run followed by its CSV downloads.

    Four metrics are rendered (rows in, rows out, removed, groups), then up
    to three download buttons: the deduplicated data (always), the removed
    rows (only when there are any) and the match-groups audit report (only
    when groups exist).  All CSVs are encoded as ``utf-8-sig``.
    """
    deduped = result.deduplicated_df
    rows_in = result.original_row_count
    rows_out = len(deduped)

    # Summary metrics, left to right.
    metrics = (
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    )
    for slot, (title, value) in zip(st.columns(4), metrics):
        slot.metric(title, value)

    st.divider()

    def _as_csv(frame: pd.DataFrame) -> bytes:
        # Shared encoding for the two plain-frame downloads.
        return frame.to_csv(index=False).encode("utf-8-sig")

    # Download buttons
    left_slot, middle_slot, right_slot = st.columns(3)

    with left_slot:
        st.download_button(
            "Download Deduplicated CSV",
            data=_as_csv(deduped),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with middle_slot:
        removed_frame = result.removed_df
        if not removed_frame.empty:
            st.download_button(
                "Download Removed Rows",
                data=_as_csv(removed_frame),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with right_slot:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )


def apply_review_decisions(
    original_df: pd.DataFrame,
    match_groups: list[MatchResult],
    decisions: dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build final DataFrames by applying user review decisions.

    Each group is resolved in one of four ways:

    - **Merge** (1 row kept): single survivor with optional column overrides.
    - **Split** (some rows kept): selected rows survive, others removed.
    - **Keep all** (all rows kept): no rows removed.
    - **No decision**: engine default (single survivor).

    Args:
        original_df: Frame the groups' positional ``row_indices`` refer to.
        match_groups: Groups to resolve; ``group_id``, ``row_indices`` and
            ``survivor_index`` are read from each.
        decisions: Mapping of ``group_id`` to
            ``{"keep_indices": [...], "overrides": {col: val}}``.

    Returns ``(deduplicated_df, removed_df)``, both re-indexed from 0.
    ``removed_df`` is an empty, column-less DataFrame when nothing was
    removed.
    """
    remove_indices: set[int] = set()
    row_overrides: dict[int, dict[str, str]] = {}

    for group in match_groups:
        members = set(group.row_indices)
        decision = decisions.get(group.group_id)

        # No decision yet — accept with engine defaults
        if decision is None:
            keep = {group.survivor_index}
        else:
            # Only indices that belong to this group count. Without the
            # intersection, a decision holding nothing but stale/foreign
            # indices would be non-empty yet still remove every member.
            keep = members.intersection(
                decision.get("keep_indices", group.row_indices)
            )
            # Safety: never remove all rows in a group
            if not keep:
                keep = members

        remove_indices.update(members - keep)

        # Column overrides (only meaningful for single-survivor merge)
        ovr = decision.get("overrides", {}) if decision else {}
        if ovr and len(keep) == 1:
            row_overrides[next(iter(keep))] = ovr

    # Build output DataFrames
    kept = [i for i in range(len(original_df)) if i not in remove_indices]

    if row_overrides:
        # Rebuild row by row so the overrides can be written into the
        # individual row copies before the frame is assembled.
        rows = []
        for i in kept:
            row = original_df.iloc[i].copy()
            for col, val in row_overrides.get(i, {}).items():
                # Overrides for columns the frame does not have are ignored.
                if col in row.index:
                    row[col] = val
            rows.append(row)
        deduped = pd.DataFrame(rows).reset_index(drop=True)
    else:
        deduped = original_df.iloc[kept].copy().reset_index(drop=True)

    removed = (
        original_df.iloc[sorted(remove_indices)].copy().reset_index(drop=True)
        if remove_indices
        else pd.DataFrame()
    )

    return deduped, removed


def _build_match_groups_csv(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> bytes:
    """Build the match groups audit CSV as bytes."""
    rows = []
    for g in result.match_groups:
        for idx in g.row_indices:
            row_data = {
                "_group_id": g.group_id + 1,
                "_is_survivor": idx == g.survivor_index,
                "_confidence": g.confidence,
                "_matched_on": ", ".join(g.matched_on),
                "_original_row": idx + 1,
            }
            for col in original_df.columns:
                if not str(col).startswith("_norm_"):
                    row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
            rows.append(row_data)

    groups_df = pd.DataFrame(rows)
    return groups_df.to_csv(index=False).encode("utf-8-sig")