# datatools-dev/src/gui/pages/4_Missing_Values.py

"""DataTools Missing Value Handler — Streamlit page."""

from __future__ import annotations

import io
import json
import sys
from pathlib import Path

import pandas as pd
import streamlit as st

# Make the repository root importable so the absolute ``src.…`` imports
# below resolve. The root is taken to be four directories above this file
# (…/src/gui/pages/<this file>); it is inserted at the front of sys.path
# only if it is not already present.
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.gui.components import (
    hide_streamlit_chrome,
    pickup_or_upload,
    require_normalization_gate,
)
from src.core.missing import (
    DEFAULT_SENTINELS,
    MissingOptions,
    PRESETS,
    handle_missing,
    profile_missing,
)

# Shared page setup from src.gui.components.
# NOTE(review): judging by the names, these hide Streamlit's default chrome
# and gate the page on a prior normalization step — confirm in the helpers.
hide_streamlit_chrome()
require_normalization_gate()


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

st.title("🕳️ Missing Value Handler")
st.caption(
    "Detect disguised nulls, profile missingness, and apply imputation or "
    "drop strategies. Runs locally — your data never leaves this computer."
)


# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

# `uploaded` is consumed further down through `.name` and `.getvalue()`.
# NOTE(review): presumably `pickup_or_upload` can also hand over a file
# selected on another page — confirm against the helper.
uploaded = pickup_or_upload(
    label="Upload CSV or Excel file",
    key="missing_file_upload",
    types=["csv", "tsv", "xlsx", "xls"],
)

# Nothing below makes sense without a file: show a hint and end this run.
if uploaded is None:
    st.info("Upload a CSV, TSV, or Excel file to begin.")
    st.stop()


@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse the uploaded bytes into a DataFrame.

    Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing-
    value handling is more useful when numeric columns are typed correctly
    (so mean / median / interpolate work without manual coercion).
    Sentinel strings are still detected because they survive in object
    columns where any cell is non-numeric.

    NOTE(review): pandas' default ``na_values`` already turn some strings
    (e.g. ``"NA"``, ``"NULL"``) into NaN at read time, so those never reach
    the sentinel detector — confirm this is intended.

    Args:
        name: Original file name. Only its suffix is used: ``.xlsx`` / ``.xls``
            select the Excel reader, ``.tsv`` selects a tab separator, and
            anything else is read as comma-separated text.
        data: Raw file content.

    Returns:
        The parsed table with pandas' inferred dtypes.

    Raises:
        Exception: Whatever pandas raises for unreadable input; only
            ``UnicodeDecodeError`` from the first decoding attempt is
            handled here.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    # The separator depends only on the suffix, so compute it once.
    sep = "\t" if suffix == ".tsv" else ","
    try:
        # utf-8-sig decodes plain UTF-8 as well and drops a leading BOM, so
        # a separate "utf-8" attempt would never succeed where this fails.
        return pd.read_csv(bio, encoding="utf-8-sig", sep=sep, on_bad_lines="warn")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this last resort cannot
        # fail on decoding; it keeps the same separator and bad-line policy.
        bio.seek(0)
        return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")


# Parse the upload. Any failure is reported in the page via st.error and the
# run is stopped, so nothing below executes without a DataFrame.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
    # Imported lazily: only needed on the error path.
    from src.core.errors import format_for_user
    st.error(
        f"**Could not read `{uploaded.name}`**\n\n"
        f"```\n{format_for_user(e)}\n```"
    )
    st.stop()

# Quick look at what was parsed: size summary plus the first ten rows.
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)

st.divider()

# ---------------------------------------------------------------------------
# Initial profile (read-only)
# ---------------------------------------------------------------------------

st.subheader("Missingness profile")

# Profile the data as loaded, using default MissingOptions; nothing is
# modified here.
initial_profile = profile_missing(df, MissingOptions())
prof_df = initial_profile.to_dataframe()

# Headline numbers, one metric per column, left to right.
_headline_metrics = (
    ("Rows", initial_profile.rows_total),
    ("Cells missing", initial_profile.cells_missing),
    ("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%"),
    ("Complete rows", initial_profile.rows_complete),
)
for _slot, (_label, _value) in zip(st.columns(4), _headline_metrics):
    _slot.metric(_label, _value)

# Per-column breakdown.
st.dataframe(prof_df, use_container_width=True, hide_index=True)

if initial_profile.cells_missing == 0:
    st.success("No missing values or disguised nulls detected. Nothing to handle.")

st.divider()

# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------

st.subheader("Strategy")

preset_label = st.radio(
    "Preset",
    [
        "detect-only (standardize sentinels to NaN, no fill or drop)",
        "safe-fill (numeric → median, categorical → mode)",
        "drop-incomplete (drop any row with missing)",
    ],
    index=0,
    help=(
        "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
        "safe-fill: also fill — numeric columns with median, others with mode. "
        "drop-incomplete: also drop every row that has any missing cell."
    ),
)
# The preset key is the first word of the label (text before the first space).
preset_key = preset_label.split(" ", 1)[0]
# Start from the preset; the widgets below overwrite individual fields.
options = MissingOptions.from_preset(preset_key)

with st.expander("Advanced options"):
    col_a, col_b = st.columns(2)

    with col_a:
        st.markdown("**Detection**")
        options.standardize_sentinels = st.checkbox(
            "Standardize disguised nulls to NaN",
            value=options.standardize_sentinels,
            help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
        )
        sentinels_text = st.text_input(
            "Sentinel values (comma-separated)",
            value=", ".join(options.sentinels),
            disabled=not options.standardize_sentinels,
            help="Matched case-insensitively after stripping whitespace.",
        )
        # Split on commas, trim each entry, and drop empty entries.
        options.sentinels = [
            s.strip() for s in sentinels_text.split(",") if s.strip()
        ]

    with col_b:
        st.markdown("**Strategy override**")
        strat_options = [
            "(use preset)",
            "none", "drop_row", "drop_col", "drop_both",
            "mean", "median", "mode", "constant",
            "ffill", "bfill", "interpolate",
        ]
        strat_choice = st.selectbox(
            "Global strategy",
            strat_options,
            index=0,
            help=(
                "drop_row / drop_col use the thresholds below. "
                "mean / median / interpolate are numeric only — non-numeric "
                "columns fall back to the categorical strategy."
            ),
        )
        # "(use preset)" leaves the preset's strategy untouched.
        if strat_choice != "(use preset)":
            options.strategy = strat_choice  # type: ignore[assignment]

        # Seed the selectbox from the preset, like the other widgets here,
        # so merely rendering this control does not overwrite the preset's
        # categorical strategy. Falls back to the first entry when the
        # preset's value is not one of the offered choices.
        cat_choices = ["mode", "constant", "ffill", "bfill", "none"]
        preset_cat = getattr(options, "categorical_strategy", None)
        cat_strat = st.selectbox(
            "Categorical fallback (for non-numeric columns)",
            cat_choices,
            index=cat_choices.index(preset_cat) if preset_cat in cat_choices else 0,
        )
        options.categorical_strategy = cat_strat  # type: ignore[assignment]

        # Remember whether the fill-value input was rendered here, so the
        # per-column section below does not render a second one.
        fill_input_shown = options.strategy == "constant" or cat_strat == "constant"
        if fill_input_shown:
            fill_val = st.text_input(
                "Constant fill value",
                value="",
                help="Used when strategy = constant. Leave blank to fill with empty string.",
            )
            options.fill_value = fill_val

    st.markdown("**Drop thresholds**")
    col_c, col_d = st.columns(2)
    with col_c:
        options.row_drop_threshold = st.slider(
            "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
            0.0, 1.0, options.row_drop_threshold, 0.05,
        )
    with col_d:
        options.col_drop_threshold = st.slider(
            "Column drop threshold (drop columns with ≥ this fraction missing)",
            0.0, 1.0, options.col_drop_threshold, 0.05,
        )

    st.markdown("**Scope**")
    selected_cols = st.multiselect(
        "Columns to handle (default: all)",
        options=list(df.columns),
        default=list(df.columns),
    )
    skip_cols = st.multiselect(
        "Columns to skip",
        options=list(df.columns),
        default=[],
    )
    # An empty selection is passed on as None rather than an empty list.
    options.columns = selected_cols if selected_cols else None
    options.skip_columns = list(skip_cols)

    st.markdown("**Per-column strategy overrides** (optional)")
    st.caption(
        "Set a different strategy for specific columns. Leave any row blank to "
        "use the global strategy."
    )
    per_col_overrides: dict[str, str] = {}
    # Only columns that the initial profile flagged as having missing values
    # are offered for an override.
    only_missing_cols = [
        r.column for r in initial_profile.columns if r.has_missing
    ]
    if only_missing_cols:
        edit_df = pd.DataFrame({
            "column": only_missing_cols,
            "strategy": ["" for _ in only_missing_cols],
        })
        edited = st.data_editor(
            edit_df,
            use_container_width=True,
            hide_index=True,
            column_config={
                "column": st.column_config.TextColumn("Column", disabled=True),
                "strategy": st.column_config.SelectboxColumn(
                    "Override",
                    options=[
                        "", "drop_row", "drop_col",
                        "mean", "median", "mode", "constant",
                        "ffill", "bfill", "interpolate",
                    ],
                ),
            },
            key="missing_per_col_editor",
        )
        for col_name, chosen in zip(edited["column"], edited["strategy"]):
            # A cleared cell may come back as None or NaN instead of "";
            # NaN is truthy, so accept only real, non-empty strings.
            if isinstance(chosen, str) and chosen:
                per_col_overrides[col_name] = chosen
        options.column_strategies = per_col_overrides  # type: ignore[assignment]

        # A per-column "constant" override needs a fill value too. Ask for it
        # here when the input above was not rendered; otherwise the value
        # entered there is used.
        if "constant" in per_col_overrides.values() and not fill_input_shown:
            options.fill_value = st.text_input(
                "Constant fill value (per-column overrides)",
                value="",
                help=(
                    "Used for columns whose override is constant. "
                    "Leave blank to fill with empty string."
                ),
                key="missing_per_col_fill_value",
            )

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

st.divider()

# The last result is kept in session state so it survives the reruns that
# every widget interaction triggers. It must not outlive the file it was
# computed from: without this check, loading a different file would still
# show (and offer for download) the previous file's results. The built-in
# hash is enough because session state lives in this process only.
_input_fingerprint = (uploaded.name, hash(uploaded.getvalue()))
if st.session_state.get("missing_input_fingerprint") != _input_fingerprint:
    for _stale_key in ("missing_result", "missing_input_name", "missing_options"):
        if _stale_key in st.session_state:
            del st.session_state[_stale_key]
    st.session_state["missing_input_fingerprint"] = _input_fingerprint

if st.button("Handle Missing Values", type="primary", use_container_width=True):
    with st.spinner("Handling..."):
        try:
            result = handle_missing(df, options)
        except (ValueError, OSError) as e:
            # Imported lazily: only needed on the error path.
            from src.core.errors import format_for_user
            st.error(format_for_user(e))
            st.stop()
    # Persist the outcome together with the file name and the options that
    # produced it; the results and downloads sections read these back.
    st.session_state["missing_result"] = result
    st.session_state["missing_input_name"] = uploaded.name
    st.session_state["missing_options"] = options.to_dict()

# Nothing to show until the handler has run at least once for this file.
result = st.session_state.get("missing_result")
if result is None:
    st.info("Choose a strategy and click **Handle Missing Values** to run.")
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

st.subheader("Results")

# Headline counters taken from the handler's result.
m1, m2, m3, m4 = st.columns(4)
m1.metric("Sentinels → NaN", result.sentinels_standardized)
m2.metric("Cells filled", result.cells_filled)
m3.metric("Rows dropped", result.rows_dropped)
m4.metric("Columns dropped", len(result.columns_dropped))

if result.columns_dropped:
    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")

st.markdown("**Missingness — before vs. after**")
before = result.profile_before.to_dataframe().set_index("column")[
    ["missing", "missing_pct"]
].rename(columns={"missing": "before_missing", "missing_pct": "before_pct"})
after = result.profile_after.to_dataframe().set_index("column")[
    ["missing", "missing_pct"]
].rename(columns={"missing": "after_missing", "missing_pct": "after_pct"})
combined = before.join(after, how="outer")
# An outer join may return the union of both indexes sorted lexicographically
# (depending on the pandas version and on whether the indexes differ). Restore
# the "before" order, with any columns that exist only in "after" appended,
# so the table lines up with the profile shown earlier. reindex needs unique
# labels, hence the guard.
if combined.index.is_unique:
    column_order = before.index.append(
        after.index.difference(before.index, sort=False)
    )
    combined = combined.reindex(column_order)
# Columns present on one side only (e.g. dropped columns) have no values on
# the other side; show those gaps as 0.
combined = combined.fillna(0)
st.dataframe(combined, use_container_width=True)

if result.strategy_per_column:
    st.markdown("**Strategy applied per column**")
    strat_df = pd.DataFrame(
        [{"column": c, "strategy": s} for c, s in result.strategy_per_column.items()]
    )
    st.dataframe(strat_df, use_container_width=True, hide_index=True)

if not result.changes.empty:
    st.markdown("**Audit (first 50 changes)**")
    audit_view = result.changes.head(50).copy()
    # Row numbers are displayed 1-based; -1 is rendered as a dash
    # (presumably a change not tied to a single row — confirm in src.core.missing).
    audit_view["row"] = audit_view["row"].apply(lambda x: "—" if x == -1 else x + 1)
    st.dataframe(audit_view, use_container_width=True, hide_index=True)
    if len(result.changes) > 50:
        st.caption(f"… and {len(result.changes) - 50} more (download the full audit below).")

st.markdown("**Handled preview (first 10 rows)**")
st.dataframe(result.handled_df.head(10), use_container_width=True)

# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------

st.divider()
# Output file names are derived from the input file recorded at run time.
stem = Path(st.session_state.get("missing_input_name", "input")).stem

left_slot, middle_slot, right_slot = st.columns(3)

# Handled data as CSV, encoded as utf-8-sig.
left_slot.download_button(
    "Download handled CSV",
    data=result.handled_df.to_csv(index=False).encode("utf-8-sig"),
    file_name=f"{stem}_missing.csv",
    mime="text/csv",
)

# The audit download is only offered when there are recorded changes.
if not result.changes.empty:
    middle_slot.download_button(
        "Download changes audit",
        data=result.changes.to_csv(index=False).encode("utf-8-sig"),
        file_name=f"{stem}_missing_changes.csv",
        mime="text/csv",
    )

# Options as stored at run time; values json cannot encode natively are
# written via str().
saved_options = st.session_state.get("missing_options", {})
right_slot.download_button(
    "Download config JSON",
    data=json.dumps(saved_options, indent=2, default=str).encode("utf-8"),
    file_name="missing_config.json",
    mime="application/json",
)

st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")