"""Tests for src/core/missing.py.""" from __future__ import annotations import json import numpy as np import pandas as pd import pytest from src.core.errors import ConfigError, InputValidationError from src.core.missing import ( DEFAULT_SENTINELS, MissingOptions, PRESETS, detect_sentinels, handle_missing, is_missing_like, profile_missing, ) # --------------------------------------------------------------------------- # is_missing_like # --------------------------------------------------------------------------- class TestIsMissingLike: def test_none(self): assert is_missing_like(None) def test_nan(self): assert is_missing_like(np.nan) def test_pd_nat(self): assert is_missing_like(pd.NaT) def test_empty_string(self): assert is_missing_like("") def test_whitespace_only(self): assert is_missing_like(" ") assert is_missing_like("\t\n ") def test_default_sentinels(self): for s in ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)"): assert is_missing_like(s), f"expected {s!r} to be missing-like" def test_case_insensitive(self): assert is_missing_like("N/A") assert is_missing_like("n/A") assert is_missing_like("NA") assert is_missing_like("na") def test_real_value_not_missing(self): assert not is_missing_like("hello") assert not is_missing_like("0") assert not is_missing_like(0) assert not is_missing_like(0.0) def test_zero_is_not_missing(self): # Common bug: treating 0 / "0" / False as missing. assert not is_missing_like(0) assert not is_missing_like(False) def test_custom_sentinels_override(self): assert is_missing_like("xx", sentinels=["xx"]) assert not is_missing_like("xx", sentinels=["zz"]) # --------------------------------------------------------------------------- # detect_sentinels # --------------------------------------------------------------------------- class TestDetectSentinels: def test_counts_by_label(self): s = pd.Series(["alice", "N/A", "n/a", "NULL", " ", "", "bob"]) counts = detect_sentinels(s) # "n/a" matches both 'N/A' and 'n/a' under casefold; the canonical # label that wins is whichever is in the DEFAULT_SENTINELS list. assert sum(v for k, v in counts.items() if k != "(whitespace)") == 3 assert counts["(whitespace)"] == 2 def test_skips_real_nan(self): s = pd.Series(["a", np.nan, "N/A"]) counts = detect_sentinels(s) assert sum(counts.values()) == 1 def test_no_sentinels_returns_empty(self): s = pd.Series(["alice", "bob", "charlie"]) assert detect_sentinels(s) == {} # --------------------------------------------------------------------------- # profile_missing # --------------------------------------------------------------------------- class TestProfileMissing: def test_basic(self): df = pd.DataFrame({ "name": ["Alice", "Bob", "N/A", "", "Charlie"], "age": [30, None, 25, 40, np.nan], }) prof = profile_missing(df, MissingOptions()) assert prof.rows_total == 5 # name: '' + 'N/A' = 2 sentinels; age: 2 NaN report_by_col = {r.column: r for r in prof.columns} assert report_by_col["name"].missing == 2 assert report_by_col["age"].missing == 2 assert prof.cells_missing == 4 def test_complete_dataframe(self): df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}) prof = profile_missing(df, MissingOptions()) assert prof.cells_missing == 0 assert prof.rows_complete == 3 assert prof.rows_with_any_missing == 0 def test_to_dataframe_columns(self): df = pd.DataFrame({"x": [1, None]}) prof = profile_missing(df, MissingOptions()) out = prof.to_dataframe() assert set(out.columns) >= {"column", "missing", "missing_pct", "top_sentinel"} def test_disabled_sentinels_only_counts_real_nan(self): df = pd.DataFrame({"x": ["N/A", "alice", np.nan]}) opts = MissingOptions(standardize_sentinels=False) prof = profile_missing(df, opts) report_by_col = {r.column: r for r in prof.columns} # Only the real NaN counts; 'N/A' is left alone. assert report_by_col["x"].missing == 1 # --------------------------------------------------------------------------- # handle_missing — sentinel standardization # --------------------------------------------------------------------------- class TestSentinelStandardization: def test_replaces_sentinels_with_nan(self): df = pd.DataFrame({"x": ["alice", "N/A", "-", " ", "bob"]}) res = handle_missing(df, MissingOptions(strategy="none")) # 'N/A' + '-' + whitespace-only = 3 assert res.sentinels_standardized == 3 assert res.handled_df["x"].isna().sum() == 3 assert res.handled_df.iloc[0]["x"] == "alice" assert res.handled_df.iloc[4]["x"] == "bob" def test_audit_records_each_replacement(self): df = pd.DataFrame({"x": ["alice", "N/A", "bob"]}) res = handle_missing(df, MissingOptions(strategy="none")) assert len(res.changes) == 1 assert res.changes.iloc[0]["action"].startswith("standardize:") def test_disabled_keeps_sentinels(self): df = pd.DataFrame({"x": ["alice", "N/A", "bob"]}) opts = MissingOptions(standardize_sentinels=False, strategy="none") res = handle_missing(df, opts) assert res.sentinels_standardized == 0 assert res.handled_df.iloc[1]["x"] == "N/A" def test_custom_sentinels_extend_default(self): df = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]}) opts = MissingOptions( sentinels=[*DEFAULT_SENTINELS, "MISSING_DATA"], strategy="none", ) res = handle_missing(df, opts) assert res.sentinels_standardized == 1 # --------------------------------------------------------------------------- # handle_missing — fill strategies # --------------------------------------------------------------------------- class TestFillStrategies: @pytest.fixture def numeric_df(self): return pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0, np.nan]}) def test_mean(self, numeric_df): res = handle_missing(numeric_df, MissingOptions(strategy="mean")) # mean of [1, 2, 4] = 7/3 filled = res.handled_df["x"].iloc[2] assert abs(filled - 7.0 / 3.0) < 1e-9 assert res.cells_filled == 2 def test_median(self, numeric_df): res = handle_missing(numeric_df, MissingOptions(strategy="median")) # median of [1, 2, 4] = 2.0 assert res.handled_df["x"].iloc[2] == 2.0 def test_mode(self): df = pd.DataFrame({"x": ["a", "a", "b", None, None]}) res = handle_missing(df, MissingOptions(strategy="mode")) assert res.handled_df["x"].iloc[3] == "a" assert res.handled_df["x"].iloc[4] == "a" assert res.cells_filled == 2 def test_constant_scalar(self, numeric_df): res = handle_missing( numeric_df, MissingOptions(strategy="constant", fill_value=99.0), ) assert res.handled_df["x"].iloc[2] == 99.0 assert res.handled_df["x"].iloc[4] == 99.0 def test_constant_per_column(self): df = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]}) opts = MissingOptions( strategy="constant", column_fill_values={"a": 0, "b": "?"}, ) res = handle_missing(df, opts) assert res.handled_df["a"].iloc[1] == 0 assert res.handled_df["b"].iloc[1] == "?" def test_ffill(self): df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]}) res = handle_missing(df, MissingOptions(strategy="ffill")) assert list(res.handled_df["x"]) == [1.0, 1.0, 1.0, 4.0] def test_bfill(self): df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]}) res = handle_missing(df, MissingOptions(strategy="bfill")) assert list(res.handled_df["x"]) == [1.0, 4.0, 4.0, 4.0] def test_interpolate(self): df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]}) res = handle_missing(df, MissingOptions(strategy="interpolate")) assert list(res.handled_df["x"]) == [1.0, 2.0, 3.0, 4.0] def test_numeric_strategy_falls_back_for_categorical(self): df = pd.DataFrame({"x": ["a", "a", None, "b"]}) opts = MissingOptions(strategy="median", categorical_strategy="mode") res = handle_missing(df, opts) assert res.strategy_per_column["x"] == "mode" assert res.handled_df["x"].iloc[2] == "a" def test_per_column_strategy_overrides_global(self): df = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]}) opts = MissingOptions( strategy="median", column_strategies={"b": "constant"}, fill_value="??", ) res = handle_missing(df, opts) assert res.handled_df["a"].iloc[1] == 1.0 # median of [1.0] assert res.handled_df["b"].iloc[1] == "??" def test_all_nan_column_safely_skipped(self): df = pd.DataFrame({"x": [np.nan, np.nan, np.nan]}) res = handle_missing(df, MissingOptions(strategy="mean")) assert res.cells_filled == 0 assert res.handled_df["x"].isna().all() # --------------------------------------------------------------------------- # handle_missing — drops # --------------------------------------------------------------------------- class TestDropStrategies: def test_drop_row_any_missing(self): # Strict-greater: threshold 0.0 → drop any row with any missing. df = pd.DataFrame({ "a": [1, 2, np.nan, 4], "b": ["x", None, "z", "w"], }) opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.0) res = handle_missing(df, opts) # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean. assert res.rows_dropped == 2 assert len(res.handled_df) == 2 def test_drop_row_default_threshold_never_drops(self): # Default 1.0 = never drop — no fraction exceeds 100%. df = pd.DataFrame({ "a": [1, 2, np.nan], "b": ["x", "y", None], }) opts = MissingOptions(strategy="drop_row") # threshold defaults to 1.0 res = handle_missing(df, opts) assert res.rows_dropped == 0 def test_drop_row_partial_threshold(self): df = pd.DataFrame({ "a": [1, np.nan, np.nan, np.nan], "b": [10, 20, np.nan, np.nan], "c": [100, 200, np.nan, 400], }) # Strict-greater: threshold 0.5 → drop rows with > 50% missing. opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.5) res = handle_missing(df, opts) # row 0: 0/3, row 1: 1/3 (0.33) -> keep # row 2: 3/3 (1.0) -> drop, row 3: 2/3 (0.67) -> drop assert res.rows_dropped == 2 def test_drop_col_threshold(self): df = pd.DataFrame({ "keep": [1, 2, 3, 4], "drop_me": [np.nan, np.nan, np.nan, 1], # 75% missing }) # Strict-greater: 0.5 → drop columns with > 50% missing. opts = MissingOptions(strategy="drop_col", col_drop_threshold=0.5) res = handle_missing(df, opts) assert "drop_me" in res.columns_dropped assert "keep" not in res.columns_dropped def test_drop_both(self): df = pd.DataFrame({ "keep": [1, 2, 3, 4, 5], "drop_col": [np.nan] * 5, "x": [1, np.nan, 3, np.nan, 5], }) opts = MissingOptions( strategy="drop_both", col_drop_threshold=0.99, # >99% missing → drop column row_drop_threshold=0.0, # any missing in remaining cols → drop row ) res = handle_missing(df, opts) # drop_col is 100% missing → dropped assert "drop_col" in res.columns_dropped # Remaining scope (keep + x): rows 1 and 3 have a missing x → drop. assert res.rows_dropped == 2 def test_drop_audit_records_dropped_rows(self): df = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]}) # Drop the fully-missing row (frac > 0.99). opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.99) res = handle_missing(df, opts) drop_records = res.changes[res.changes["action"] == "drop_row"] assert len(drop_records) == 1 # --------------------------------------------------------------------------- # Scope: columns / skip_columns # --------------------------------------------------------------------------- class TestScope: def test_columns_filter(self): df = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]}) opts = MissingOptions(columns=["a"], strategy="constant", fill_value=99) res = handle_missing(df, opts) assert res.handled_df["a"].iloc[0] == 99 # b should be untouched assert pd.isna(res.handled_df["b"].iloc[0]) def test_skip_columns(self): df = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]}) opts = MissingOptions(skip_columns=["b"], strategy="constant", fill_value=99) res = handle_missing(df, opts) assert res.handled_df["a"].iloc[0] == 99 assert pd.isna(res.handled_df["b"].iloc[0]) def test_unknown_column_raises(self): df = pd.DataFrame({"a": [1]}) opts = MissingOptions(columns=["does_not_exist"]) with pytest.raises(InputValidationError): handle_missing(df, opts) # --------------------------------------------------------------------------- # Presets / config # --------------------------------------------------------------------------- class TestPresets: def test_detect_only_does_not_fill(self): df = pd.DataFrame({"x": ["alice", "N/A", "bob"]}) opts = MissingOptions.from_preset("detect-only") res = handle_missing(df, opts) assert res.sentinels_standardized == 1 assert res.cells_filled == 0 assert res.rows_dropped == 0 def test_safe_fill_fills(self): df = pd.DataFrame({"age": [30, np.nan, 25, 40], "name": ["a", "a", None, "b"]}) opts = MissingOptions.from_preset("safe-fill") res = handle_missing(df, opts) assert res.cells_filled == 2 def test_drop_incomplete(self): df = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]}) opts = MissingOptions.from_preset("drop-incomplete") res = handle_missing(df, opts) assert res.rows_dropped == 1 def test_unknown_preset_raises(self): with pytest.raises(ConfigError): MissingOptions.from_preset("does-not-exist") def test_roundtrip_to_file(self, tmp_path): opts = MissingOptions.from_preset("safe-fill") opts.column_strategies = {"age": "median"} path = tmp_path / "cfg.json" opts.to_file(path) loaded = MissingOptions.from_file(path) assert loaded.strategy == opts.strategy assert loaded.column_strategies == opts.column_strategies # --------------------------------------------------------------------------- # Validation # --------------------------------------------------------------------------- class TestValidate: def test_invalid_strategy(self): opts = MissingOptions(strategy="bogus") # type: ignore[arg-type] with pytest.raises(InputValidationError): opts.validate() def test_threshold_out_of_range(self): opts = MissingOptions(row_drop_threshold=1.5) with pytest.raises(ConfigError): opts.validate() def test_handle_missing_validates(self): df = pd.DataFrame({"x": [1]}) opts = MissingOptions(strategy="bogus") # type: ignore[arg-type] with pytest.raises(InputValidationError): handle_missing(df, opts) def test_non_dataframe_input(self): with pytest.raises(InputValidationError): handle_missing([1, 2, 3]) # type: ignore[arg-type] # --------------------------------------------------------------------------- # End-to-end realistic case # --------------------------------------------------------------------------- class TestEndToEnd: def test_messy_customer_export(self): df = pd.DataFrame({ "customer_id": [1, 2, 3, 4, 5, 6], "name": ["Alice", "Bob", "N/A", " ", "Charlie", None], "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"], "age": [30, np.nan, 25, 40, np.nan, 50], }) opts = MissingOptions( standardize_sentinels=True, strategy="median", categorical_strategy="constant", fill_value="UNKNOWN", ) res = handle_missing(df, opts) # Sentinels: name "N/A"," ",None; email "-","NULL". (None is real-NaN, not sentinel.) # Whitespace + 'N/A' on name = 2; '-' + 'NULL' on email = 2. Total = 4. assert res.sentinels_standardized == 4 # name has 3 missing after standardize (N/A, " ", None) → constant fill # email has 2 missing → constant fill # age has 2 missing → median (32.5 of [30, 25, 40, 50]) assert res.cells_filled == 7 assert res.handled_df["name"].isna().sum() == 0 assert res.handled_df["email"].isna().sum() == 0 assert res.handled_df["age"].isna().sum() == 0 assert (res.handled_df["name"] == "UNKNOWN").sum() == 3 assert (res.handled_df["age"] == 35.0).sum() == 2 # median of [30, 25, 40, 50] def test_input_not_mutated(self): df = pd.DataFrame({"x": ["N/A", "alice", np.nan]}) df_copy = df.copy() handle_missing(df, MissingOptions.from_preset("safe-fill")) pd.testing.assert_frame_equal(df, df_copy)