# datatools-dev/tests/test_missing.py

"""Tests for src/core/missing.py."""

from __future__ import annotations

import json

import numpy as np
import pandas as pd
import pytest

from src.core.errors import ConfigError, InputValidationError
from src.core.missing import (
    DEFAULT_SENTINELS,
    MissingOptions,
    PRESETS,
    detect_sentinels,
    handle_missing,
    is_missing_like,
    profile_missing,
)


# ---------------------------------------------------------------------------
# is_missing_like
# ---------------------------------------------------------------------------

class TestIsMissingLike:
    """Scalar-level expectations for ``is_missing_like``."""

    def test_none(self):
        """``None`` is expected to be missing-like."""
        assert is_missing_like(None)

    def test_nan(self):
        """A float NaN is expected to be missing-like."""
        assert is_missing_like(np.nan)

    def test_pd_nat(self):
        """pandas' ``NaT`` is expected to be missing-like."""
        assert is_missing_like(pd.NaT)

    def test_empty_string(self):
        """The empty string is expected to be missing-like."""
        assert is_missing_like("")

    def test_whitespace_only(self):
        """Strings made only of spaces, tabs or newlines are missing-like."""
        assert is_missing_like("   ")
        assert is_missing_like("\t\n  ")

    def test_default_sentinels(self):
        """A sample of placeholder strings is recognised without options."""
        for s in ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)"):
            assert is_missing_like(s), f"expected {s!r} to be missing-like"

    def test_case_insensitive(self):
        """Mixed-case spellings of the same sentinel are all recognised."""
        assert is_missing_like("N/A")
        assert is_missing_like("n/A")
        assert is_missing_like("NA")
        assert is_missing_like("na")

    def test_real_value_not_missing(self):
        """Ordinary values, including zero-like ones, are not missing."""
        assert not is_missing_like("hello")
        assert not is_missing_like("0")
        assert not is_missing_like(0)
        assert not is_missing_like(0.0)

    def test_zero_is_not_missing(self):
        """Falsy-but-real values must not be mistaken for missing ones."""
        # Common bug: treating 0 / "0" / False as missing.
        # All three values named above are asserted so the test matches
        # its stated intent.
        assert not is_missing_like(0)
        assert not is_missing_like("0")
        assert not is_missing_like(False)

    def test_custom_sentinels_override(self):
        """With ``sentinels=`` given, "xx" is missing only when it is listed."""
        assert is_missing_like("xx", sentinels=["xx"])
        assert not is_missing_like("xx", sentinels=["zz"])


# ---------------------------------------------------------------------------
# detect_sentinels
# ---------------------------------------------------------------------------

class TestDetectSentinels:
    """Expectations for the label -> count mapping of ``detect_sentinels``."""

    def test_counts_by_label(self):
        """Sentinel hits and whitespace hits are tallied under separate labels."""
        values = pd.Series(["alice", "N/A", "n/a", "NULL", "  ", "", "bob"])
        tally = detect_sentinels(values)
        # Which label 'N/A' and 'n/a' are filed under is not pinned here;
        # only the combined total of the non-whitespace labels is checked.
        labelled_hits = [
            count for label, count in tally.items() if label != "(whitespace)"
        ]
        assert sum(labelled_hits) == 3
        # Both "  " and "" are expected under the "(whitespace)" label.
        assert tally["(whitespace)"] == 2

    def test_skips_real_nan(self):
        """A real NaN is not counted; only the 'N/A' string is."""
        values = pd.Series(["a", np.nan, "N/A"])
        tally = detect_sentinels(values)
        total_hits = sum(tally.values())
        assert total_hits == 1

    def test_no_sentinels_returns_empty(self):
        """A series of ordinary strings yields an empty mapping."""
        values = pd.Series(["alice", "bob", "charlie"])
        tally = detect_sentinels(values)
        assert tally == {}


# ---------------------------------------------------------------------------
# profile_missing
# ---------------------------------------------------------------------------

class TestProfileMissing:
    """Expectations for the profile object returned by ``profile_missing``."""

    def test_basic(self):
        """Sentinels and real NaNs are both counted, per column and overall."""
        frame = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "N/A", "", "Charlie"],
                "age": [30, None, 25, 40, np.nan],
            }
        )
        profile = profile_missing(frame, MissingOptions())
        assert profile.rows_total == 5
        # Index the per-column reports by column name for direct lookup.
        by_name = {}
        for report in profile.columns:
            by_name[report.column] = report
        # name: 'N/A' and '' -> 2 sentinels; age: None and NaN -> 2 missing.
        assert by_name["name"].missing == 2
        assert by_name["age"].missing == 2
        assert profile.cells_missing == 4

    def test_complete_dataframe(self):
        """A frame with no gaps reports zero missing cells and all rows complete."""
        frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        profile = profile_missing(frame, MissingOptions())
        assert profile.cells_missing == 0
        assert profile.rows_complete == 3
        assert profile.rows_with_any_missing == 0

    def test_to_dataframe_columns(self):
        """``to_dataframe`` exposes at least the four core report columns."""
        frame = pd.DataFrame({"x": [1, None]})
        table = profile_missing(frame, MissingOptions()).to_dataframe()
        required = {"column", "missing", "missing_pct", "top_sentinel"}
        assert required.issubset(table.columns)

    def test_disabled_sentinels_only_counts_real_nan(self):
        """With ``standardize_sentinels=False`` only the real NaN is counted."""
        frame = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        options = MissingOptions(standardize_sentinels=False)
        profile = profile_missing(frame, options)
        by_name = {}
        for report in profile.columns:
            by_name[report.column] = report
        # 'N/A' stays an ordinary value, so the NaN is the only missing cell.
        assert by_name["x"].missing == 1


# ---------------------------------------------------------------------------
# handle_missing — sentinel standardization
# ---------------------------------------------------------------------------

class TestSentinelStandardization:
    """Sentinel-to-NaN standardization performed by ``handle_missing``."""

    def test_replaces_sentinels_with_nan(self):
        """Sentinel cells become NaN while real values are left in place."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "-", "  ", "bob"]})
        options = MissingOptions(strategy="none")
        result = handle_missing(frame, options)
        # Three cells qualify: 'N/A', '-', and the whitespace-only string.
        assert result.sentinels_standardized == 3
        assert result.handled_df["x"].isna().sum() == 3
        first_row = result.handled_df.iloc[0]
        assert first_row["x"] == "alice"
        last_row = result.handled_df.iloc[4]
        assert last_row["x"] == "bob"

    def test_audit_records_each_replacement(self):
        """One replaced sentinel produces exactly one ``standardize:`` audit row."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions(strategy="none")
        result = handle_missing(frame, options)
        assert len(result.changes) == 1
        action = result.changes.iloc[0]["action"]
        assert action.startswith("standardize:")

    def test_disabled_keeps_sentinels(self):
        """Turning standardization off leaves the 'N/A' string untouched."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions(standardize_sentinels=False, strategy="none")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 0
        second_row = result.handled_df.iloc[1]
        assert second_row["x"] == "N/A"

    def test_custom_sentinels_extend_default(self):
        """A caller-supplied sentinel appended to the defaults is recognised."""
        frame = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
        extended = list(DEFAULT_SENTINELS) + ["MISSING_DATA"]
        options = MissingOptions(sentinels=extended, strategy="none")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 1


# ---------------------------------------------------------------------------
# handle_missing — fill strategies
# ---------------------------------------------------------------------------

class TestFillStrategies:
    """Fill strategies applied by ``handle_missing``."""

    @pytest.fixture
    def numeric_df(self):
        """Single float column ``x`` with NaN gaps at positions 2 and 4."""
        values = [1.0, 2.0, np.nan, 4.0, np.nan]
        return pd.DataFrame({"x": values})

    def test_mean(self, numeric_df):
        """Gaps are filled with the mean of the observed values."""
        options = MissingOptions(strategy="mean")
        result = handle_missing(numeric_df, options)
        expected = 7.0 / 3.0  # mean of the observed values [1, 2, 4]
        imputed = result.handled_df["x"].iloc[2]
        assert abs(imputed - expected) < 1e-9
        assert result.cells_filled == 2

    def test_median(self, numeric_df):
        """Gaps are filled with the median of the observed values."""
        options = MissingOptions(strategy="median")
        result = handle_missing(numeric_df, options)
        imputed = result.handled_df["x"].iloc[2]
        # The median of [1, 2, 4] is 2.0.
        assert imputed == 2.0

    def test_mode(self):
        """Gaps in a text column are filled with the most frequent value."""
        frame = pd.DataFrame({"x": ["a", "a", "b", None, None]})
        options = MissingOptions(strategy="mode")
        result = handle_missing(frame, options)
        column = result.handled_df["x"]
        assert column.iloc[3] == "a"
        assert column.iloc[4] == "a"
        assert result.cells_filled == 2

    def test_constant_scalar(self, numeric_df):
        """A single ``fill_value`` is written into every gap."""
        options = MissingOptions(strategy="constant", fill_value=99.0)
        result = handle_missing(numeric_df, options)
        column = result.handled_df["x"]
        assert column.iloc[2] == 99.0
        assert column.iloc[4] == 99.0

    def test_constant_per_column(self):
        """``column_fill_values`` supplies a separate constant per column."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
        per_column = {"a": 0, "b": "?"}
        options = MissingOptions(
            strategy="constant",
            column_fill_values=per_column,
        )
        result = handle_missing(frame, options)
        handled = result.handled_df
        assert handled["a"].iloc[1] == 0
        assert handled["b"].iloc[1] == "?"

    def test_ffill(self):
        """Forward fill carries the last observed value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        options = MissingOptions(strategy="ffill")
        result = handle_missing(frame, options)
        assert result.handled_df["x"].tolist() == [1.0, 1.0, 1.0, 4.0]

    def test_bfill(self):
        """Backward fill pulls the next observed value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        options = MissingOptions(strategy="bfill")
        result = handle_missing(frame, options)
        assert result.handled_df["x"].tolist() == [1.0, 4.0, 4.0, 4.0]

    def test_interpolate(self):
        """Interpolation spaces the gap values evenly between 1.0 and 4.0."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        options = MissingOptions(strategy="interpolate")
        result = handle_missing(frame, options)
        assert result.handled_df["x"].tolist() == [1.0, 2.0, 3.0, 4.0]

    def test_numeric_strategy_falls_back_for_categorical(self):
        """A text column under ``median`` is handled by ``categorical_strategy``."""
        frame = pd.DataFrame({"x": ["a", "a", None, "b"]})
        options = MissingOptions(strategy="median", categorical_strategy="mode")
        result = handle_missing(frame, options)
        assert result.strategy_per_column["x"] == "mode"
        assert result.handled_df["x"].iloc[2] == "a"

    def test_per_column_strategy_overrides_global(self):
        """A ``column_strategies`` entry wins over the global strategy."""
        frame = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
        options = MissingOptions(
            strategy="median",
            column_strategies={"b": "constant"},
            fill_value="??",
        )
        result = handle_missing(frame, options)
        handled = result.handled_df
        # Column a keeps the global strategy: the median of [1.0] is 1.0.
        assert handled["a"].iloc[1] == 1.0
        # Column b uses its override and receives the constant.
        assert handled["b"].iloc[1] == "??"

    def test_all_nan_column_safely_skipped(self):
        """An all-NaN column has no statistic to fill with and stays NaN."""
        frame = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
        options = MissingOptions(strategy="mean")
        result = handle_missing(frame, options)
        assert result.cells_filled == 0
        assert result.handled_df["x"].isna().all()


# ---------------------------------------------------------------------------
# handle_missing — drops
# ---------------------------------------------------------------------------

class TestDropStrategies:
    """Row and column dropping performed by ``handle_missing``.

    The expectations below treat thresholds as strictly-greater: a row or
    column is dropped only when its missing fraction exceeds the threshold.
    """

    def test_drop_row_any_missing(self):
        """Threshold 0.0 drops every row that has at least one missing cell."""
        frame = pd.DataFrame(
            {
                "a": [1, 2, np.nan, 4],
                "b": ["x", None, "z", "w"],
            }
        )
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
        result = handle_missing(frame, options)
        # Rows 1 and 2 each contain one gap; rows 0 and 3 are complete.
        assert result.rows_dropped == 2
        assert len(result.handled_df) == 2

    def test_drop_row_default_threshold_never_drops(self):
        """With the threshold left unset (1.0 per this test), nothing is dropped."""
        frame = pd.DataFrame(
            {
                "a": [1, 2, np.nan],
                "b": ["x", "y", None],
            }
        )
        # No row can be more than 100% missing, so even the all-missing
        # last row is expected to survive.
        options = MissingOptions(strategy="drop_row")
        result = handle_missing(frame, options)
        assert result.rows_dropped == 0

    def test_drop_row_partial_threshold(self):
        """Threshold 0.5 drops only rows that are more than half missing."""
        frame = pd.DataFrame(
            {
                "a": [1, np.nan, np.nan, np.nan],
                "b": [10, 20, np.nan, np.nan],
                "c": [100, 200, np.nan, 400],
            }
        )
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
        result = handle_missing(frame, options)
        # Missing fractions per row: 0/3, 1/3, 3/3, 2/3.
        # Rows 0 and 1 stay; rows 2 and 3 exceed 0.5 and are dropped.
        assert result.rows_dropped == 2

    def test_drop_col_threshold(self):
        """Threshold 0.5 drops the column that is 75% missing, not the full one."""
        frame = pd.DataFrame(
            {
                "keep": [1, 2, 3, 4],
                "drop_me": [np.nan, np.nan, np.nan, 1],  # 3 of 4 missing
            }
        )
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
        result = handle_missing(frame, options)
        dropped = result.columns_dropped
        assert "drop_me" in dropped
        assert "keep" not in dropped

    def test_drop_both(self):
        """``drop_both`` removes the empty column and the rows with gaps."""
        frame = pd.DataFrame(
            {
                "keep": [1, 2, 3, 4, 5],
                "drop_col": [np.nan for _ in range(5)],
                "x": [1, np.nan, 3, np.nan, 5],
            }
        )
        options = MissingOptions(
            strategy="drop_both",
            col_drop_threshold=0.99,  # only columns >99% missing go
            row_drop_threshold=0.0,   # any gap in what remains drops the row
        )
        result = handle_missing(frame, options)
        # drop_col is entirely missing, so it exceeds the column threshold.
        assert "drop_col" in result.columns_dropped
        # The expected row count assumes the dropped column no longer
        # counts: only rows 1 and 3 (gaps in x) are removed.
        assert result.rows_dropped == 2

    def test_drop_audit_records_dropped_rows(self):
        """A dropped row leaves exactly one ``drop_row`` entry in the audit."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
        # Threshold 0.99 targets only the fully-missing second row.
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
        result = handle_missing(frame, options)
        audit = result.changes
        is_row_drop = audit["action"] == "drop_row"
        assert len(audit[is_row_drop]) == 1


# ---------------------------------------------------------------------------
# Scope: columns / skip_columns
# ---------------------------------------------------------------------------

class TestScope:
    """Column scoping via the ``columns`` and ``skip_columns`` options."""

    def test_columns_filter(self):
        """Only the column named in ``columns`` is filled."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
        result = handle_missing(frame, options)
        filled_cell = result.handled_df["a"].iloc[0]
        assert filled_cell == 99
        # Column b is out of scope, so its gap must survive.
        untouched_cell = result.handled_df["b"].iloc[0]
        assert pd.isna(untouched_cell)

    def test_skip_columns(self):
        """A column named in ``skip_columns`` is left alone."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(
            skip_columns=["b"], strategy="constant", fill_value=99
        )
        result = handle_missing(frame, options)
        filled_cell = result.handled_df["a"].iloc[0]
        assert filled_cell == 99
        skipped_cell = result.handled_df["b"].iloc[0]
        assert pd.isna(skipped_cell)

    def test_unknown_column_raises(self):
        """Naming a column the frame lacks raises ``InputValidationError``."""
        frame = pd.DataFrame({"a": [1]})
        options = MissingOptions(columns=["does_not_exist"])
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)


# ---------------------------------------------------------------------------
# Presets / config
# ---------------------------------------------------------------------------

class TestPresets:
    """Named presets and file round-tripping of ``MissingOptions``."""

    def test_detect_only_does_not_fill(self):
        """``detect-only`` standardizes sentinels but fills and drops nothing."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions.from_preset("detect-only")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 1
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

    def test_safe_fill_fills(self):
        """``safe-fill`` fills one numeric gap and one text gap."""
        frame = pd.DataFrame(
            {
                "age": [30, np.nan, 25, 40],
                "name": ["a", "a", None, "b"],
            }
        )
        options = MissingOptions.from_preset("safe-fill")
        result = handle_missing(frame, options)
        assert result.cells_filled == 2

    def test_drop_incomplete(self):
        """``drop-incomplete`` removes the single row that has a gap."""
        frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
        options = MissingOptions.from_preset("drop-incomplete")
        result = handle_missing(frame, options)
        assert result.rows_dropped == 1

    def test_unknown_preset_raises(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MissingOptions.from_preset("does-not-exist")

    def test_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` load back with the same settings."""
        saved = MissingOptions.from_preset("safe-fill")
        saved.column_strategies = {"age": "median"}
        cfg_path = tmp_path / "cfg.json"
        saved.to_file(cfg_path)
        restored = MissingOptions.from_file(cfg_path)
        assert restored.strategy == saved.strategy
        assert restored.column_strategies == saved.column_strategies


# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------

class TestValidate:
    """Option and input validation errors."""

    def test_invalid_strategy(self):
        """``validate`` rejects an unknown strategy name."""
        bad_options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            bad_options.validate()

    def test_threshold_out_of_range(self):
        """``validate`` rejects a row threshold of 1.5 with ``ConfigError``."""
        bad_options = MissingOptions(row_drop_threshold=1.5)
        with pytest.raises(ConfigError):
            bad_options.validate()

    def test_handle_missing_validates(self):
        """``handle_missing`` raises on bad options without a ``validate`` call."""
        frame = pd.DataFrame({"x": [1]})
        bad_options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            handle_missing(frame, bad_options)

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            handle_missing(not_a_frame)  # type: ignore[arg-type]


# ---------------------------------------------------------------------------
# End-to-end realistic case
# ---------------------------------------------------------------------------

class TestEndToEnd:
    """End-to-end scenarios combining standardization and filling."""

    def test_messy_customer_export(self):
        """Standardize sentinels, then fill the numeric and text columns."""
        df = pd.DataFrame({
            "customer_id": [1, 2, 3, 4, 5, 6],
            "name": ["Alice", "Bob", "N/A", "  ", "Charlie", None],
            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
            "age":   [30, np.nan, 25, 40, np.nan, 50],
        })
        opts = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="constant",
            fill_value="UNKNOWN",
        )
        res = handle_missing(df, opts)

        # Sentinels standardized: name "N/A" and "  " (2); email "-" and "NULL" (2).
        # The None in name is already a real missing value, not a sentinel.
        # Total = 4.
        assert res.sentinels_standardized == 4
        # name has 3 missing after standardize (N/A, "  ", None) → constant fill
        # email has 2 missing → constant fill
        # age has 2 missing → median (35.0 of [30, 25, 40, 50])
        assert res.cells_filled == 7
        assert res.handled_df["name"].isna().sum() == 0
        assert res.handled_df["email"].isna().sum() == 0
        assert res.handled_df["age"].isna().sum() == 0
        assert (res.handled_df["name"] == "UNKNOWN").sum() == 3
        # Sorted observed ages are [25, 30, 40, 50]; the median is (30 + 40) / 2.
        assert (res.handled_df["age"] == 35.0).sum() == 2

    def test_input_not_mutated(self):
        """``handle_missing`` must leave the caller's DataFrame unchanged."""
        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        # Snapshot taken before the call, compared against the input afterwards.
        df_copy = df.copy()
        handle_missing(df, MissingOptions.from_preset("safe-fill"))
        pd.testing.assert_frame_equal(df, df_copy)