"""Acceptance corpus for the Fix Missing Values tool. Loads every fixture in ``test-cases/missing-corpus/test_data/`` and asserts the documented behaviour. The fixtures are split into: * ``uc##`` — three target-client use cases (Shopify operator, marketing analyst, consultant intake). * ``ec##`` — edge cases the engine must handle without surprise: all-NaN columns, zeros that aren't missing, Excel errors, unicode whitespace, mixed dtypes, padding, single row/column, every default sentinel, per-column constants, drop thresholds, leading-NaN ffill, numeric-strategy fallback for non-numeric columns, headers-only, idempotency. Each test runs through the public API (``handle_missing``) so any regression in the engine surfaces here. Fixture files double as living documentation for what the tool is supposed to do. """ from __future__ import annotations import io from pathlib import Path import numpy as np import pandas as pd import pytest from src.core.missing import ( MissingOptions, handle_missing, is_missing_like, profile_missing, ) CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "missing-corpus" TEST_DATA = CORPUS / "test_data" def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame: """Load a corpus CSV. By default we let pandas infer dtypes — that's the most realistic intake path (Excel exports keep numeric columns numeric). A handful of cases pass ``dtype_str=True`` to keep sentinels visible in columns that would otherwise be coerced to float. """ path = TEST_DATA / name if dtype_str: return pd.read_csv(path, dtype=str, keep_default_na=False) return pd.read_csv(path) # --------------------------------------------------------------------------- # Use case 1 — Shopify operator: detect-only # --------------------------------------------------------------------------- class TestUC01ShopifyExport: """SMB operator standardizes disguised nulls before reimporting.""" def test_detect_only_replaces_sentinels(self): df = _read("uc01_shopify_export.csv", dtype_str=True) opts = MissingOptions.from_preset("detect-only") res = handle_missing(df, opts) # Spot-check known sentinels from the fixture assert res.sentinels_standardized > 0 assert res.cells_filled == 0 assert res.rows_dropped == 0 # Fields that contained 'N/A', '-', 'NULL', '(blank)', '#N/A', # 'n/a', '?', '(none)' should now be NaN. for row, col in [ (1, "phone"), # 'N/A' (2, "city"), # '-' (3, "total_orders"), # 'NULL' (5, "phone"), # ' ' (5, "last_order_date"), # '(blank)' (6, "last_order_date"), # '#N/A' (7, "phone"), # 'n/a' (8, "city"), # '?' (9, "total_orders"), # '(none)' ]: assert pd.isna(res.handled_df.iloc[row][col]), ( f"Expected NaN at row {row} col {col}, got " f"{res.handled_df.iloc[row][col]!r}" ) def test_real_values_preserved(self): df = _read("uc01_shopify_export.csv", dtype_str=True) res = handle_missing(df, MissingOptions.from_preset("detect-only")) # First row should be untouched. assert res.handled_df.iloc[0]["first_name"] == "Alice" assert res.handled_df.iloc[0]["email"] == "alice@shop.com" assert res.handled_df.iloc[0]["lifetime_value"] == "1240.50" def test_audit_log_complete(self): df = _read("uc01_shopify_export.csv", dtype_str=True) res = handle_missing(df, MissingOptions.from_preset("detect-only")) # One audit row per sentinel replacement. assert len(res.changes) == res.sentinels_standardized assert set(res.changes["action"].apply(lambda s: s.startswith("standardize:"))) == {True} # --------------------------------------------------------------------------- # Use case 2 — Marketing analyst: safe-fill # --------------------------------------------------------------------------- class TestUC02MarketingAudience: """Marketer fills numeric columns with median, categorical with mode.""" def test_safe_fill_clears_all_missing(self): df = _read("uc02_marketing_audience.csv") opts = MissingOptions.from_preset("safe-fill") res = handle_missing(df, opts) # Every cell in scope should be filled. assert res.profile_after.cells_missing == 0 assert res.cells_filled > 0 def test_numeric_uses_median_categorical_uses_mode(self): df = _read("uc02_marketing_audience.csv") opts = MissingOptions.from_preset("safe-fill") res = handle_missing(df, opts) # 'age' is numeric → median strategy assert res.strategy_per_column["age"] == "median" # 'segment' / 'region' / 'source' are object → mode fallback assert res.strategy_per_column["segment"] == "mode" assert res.strategy_per_column["region"] == "mode" def test_per_column_override(self): df = _read("uc02_marketing_audience.csv") opts = MissingOptions.from_preset("safe-fill") opts.column_strategies = {"source": "constant"} opts.column_fill_values = {"source": "unknown"} res = handle_missing(df, opts) # Cells previously holding sentinels in 'source' should now equal "unknown". assert (res.handled_df["source"] == "unknown").sum() >= 3 def test_consent_real_false_not_dropped(self): # 'consent' column has empty cells but also explicit "true"; mode fill # must not silently change a real "true" to anything else. df = _read("uc02_marketing_audience.csv") res = handle_missing(df, MissingOptions.from_preset("safe-fill")) original_trues = (df["consent"] == "true").sum() result_trues = (res.handled_df["consent"] == "true").sum() # Filled rows can become "true" (mode) but should not lose existing trues. assert result_trues >= original_trues # --------------------------------------------------------------------------- # Use case 3 — Consultant intake: threshold drops + fill # --------------------------------------------------------------------------- class TestUC03ConsultantIntake: """Drop sparse columns and rows, then fill the survivors.""" def test_drop_col_removes_legacy_fields(self): df = _read("uc03_consultant_intake.csv", dtype_str=True) # internal_id_legacy and beta_field are 100% missing — drop them. opts = MissingOptions( standardize_sentinels=True, strategy="drop_col", col_drop_threshold=0.99, ) res = handle_missing(df, opts) assert "internal_id_legacy" in res.columns_dropped assert "beta_field" in res.columns_dropped def test_drop_row_removes_mostly_empty_respondents(self): df = _read("uc03_consultant_intake.csv", dtype_str=True) opts = MissingOptions( standardize_sentinels=True, strategy="drop_both", col_drop_threshold=0.99, # drop the legacy / beta cols first row_drop_threshold=0.5, # then drop rows with >50% missing ) res = handle_missing(df, opts) # R-002, R-005, R-007, R-010 are mostly-empty respondents. assert res.rows_dropped >= 4 # Non-empty respondents survive. kept_ids = set(res.handled_df["respondent_id"].tolist()) for survivor in ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012"): assert survivor in kept_ids # --------------------------------------------------------------------------- # Edge cases # --------------------------------------------------------------------------- class TestEC01AllNanColumn: def test_fill_skips_all_nan_column(self): df = _read("ec01_all_nan_column.csv") res = handle_missing(df, MissingOptions(strategy="mean")) # Mean of all-NaN is NaN — engine must NOT fabricate a value. assert res.handled_df["deprecated_field"].isna().all() assert res.cells_filled == 0 def test_drop_col_catches_all_nan(self): df = _read("ec01_all_nan_column.csv") res = handle_missing( df, MissingOptions(strategy="drop_col", col_drop_threshold=0.99), ) assert "deprecated_field" in res.columns_dropped assert "name" not in res.columns_dropped class TestEC02NoMissing: def test_clean_file_is_noop(self): df = _read("ec02_no_missing.csv") res = handle_missing(df, MissingOptions.from_preset("safe-fill")) assert res.sentinels_standardized == 0 assert res.cells_filled == 0 assert res.rows_dropped == 0 pd.testing.assert_frame_equal(res.handled_df, df) class TestEC03ZeroIsNotMissing: def test_zero_preserved(self): df = _read("ec03_zero_is_not_missing.csv") res = handle_missing(df, MissingOptions.from_preset("safe-fill")) # Original zeros remain zero. assert (res.handled_df["balance"] == 0).sum() == (df["balance"] == 0).sum() assert (res.handled_df["count"] == 0).sum() == (df["count"] == 0).sum() # No spurious changes recorded. assert res.cells_filled == 0 assert res.sentinels_standardized == 0 def test_is_missing_like_zero_predicate(self): # Direct predicate check — zeros, false, "0" must all be non-missing. assert not is_missing_like(0) assert not is_missing_like(0.0) assert not is_missing_like(False) assert not is_missing_like("0") assert not is_missing_like("0.00") class TestEC04ExcelErrors: def test_excel_error_sentinels_recognized(self): df = _read("ec04_excel_errors.csv", dtype_str=True) res = handle_missing(df, MissingOptions(strategy="none")) # 6 error sentinels in the fixture: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL! assert res.sentinels_standardized == 6 class TestEC05UnicodeWhitespace: def test_nbsp_and_ideographic_space_count_as_missing(self): df = _read("ec05_unicode_whitespace.csv", dtype_str=True) res = handle_missing(df, MissingOptions(strategy="none")) # rows 1, 2, 4 contain NBSP / tab / ideographic space respectively assert res.handled_df["note"].isna().sum() == 3 assert res.handled_df.iloc[0]["note"] == "hello" assert res.handled_df.iloc[3]["note"] == "real" class TestEC06MixedDtypes: def test_mixed_column_falls_back_to_mode(self): # Read with native dtypes so 'real_num' stays numeric. df = _read("ec06_mixed_dtypes.csv") opts = MissingOptions( standardize_sentinels=True, strategy="median", categorical_strategy="mode", ) res = handle_missing(df, opts) # mixed_col holds 'N/A' / 'hello' alongside numbers → object dtype, # median falls back to mode. assert res.strategy_per_column["mixed_col"] == "mode" # real_num is float dtype → median runs. assert res.strategy_per_column["real_num"] == "median" class TestEC07RealDataWithPadding: def test_padded_real_data_not_treated_as_missing(self): df = _read("ec07_real_data_with_padding.csv", dtype_str=True) res = handle_missing(df, MissingOptions(strategy="none")) # Only row 1 (name=" ") and row 2 (city=blank) should become NaN. # " Alice ", " Bob ", " SF" must remain. assert res.handled_df.iloc[0]["name"] == " Alice " assert res.handled_df.iloc[2]["name"] == " Bob " assert res.handled_df.iloc[3]["city"] == " SF" class TestEC08SingleRow: def test_single_row_handles_cleanly(self): df = _read("ec08_single_row.csv", dtype_str=True) # detect-only res = handle_missing(df, MissingOptions(strategy="none")) assert res.sentinels_standardized == 2 # 'N/A' + '' # safe-fill on a one-row file: median/mode of a single value is itself. res2 = handle_missing(df, MissingOptions.from_preset("safe-fill")) assert res2.handled_df.iloc[0]["name"] == "Alice" class TestEC09SingleColumn: def test_single_column_works(self): df = _read("ec09_single_column.csv", dtype_str=True) res = handle_missing(df, MissingOptions(strategy="none")) # 'N/A', whitespace-only ' ', '-' = 3 sentinels assert res.sentinels_standardized == 3 assert res.handled_df["value"].isna().sum() == 3 class TestEC10AllSentinelVariants: def test_every_default_sentinel_recognized(self): df = _read("ec10_all_sentinel_variants.csv", dtype_str=True) res = handle_missing(df, MissingOptions(strategy="none")) # 20 sentinels + 1 real value assert res.sentinels_standardized == 20 # The 'real_value' row stays. assert (res.handled_df["sentinel_value"] == "real_value").sum() == 1 class TestEC11ConstantPerColumn: def test_per_column_fill_values(self): df = _read("ec11_constant_per_column.csv", dtype_str=True) opts = MissingOptions( strategy="constant", column_fill_values={ "country": "USA", "salary": "0", "department": "Unassigned", }, ) res = handle_missing(df, opts) # Fixture has 1 UK row + 2 USA rows + 2 blanks. Filling blanks with # "USA" yields 4 USA total; UK is preserved. assert (res.handled_df["country"] == "USA").sum() == 4 assert (res.handled_df["country"] == "UK").sum() == 1 assert (res.handled_df["department"] == "Unassigned").sum() >= 2 class TestEC12DropThresholdBoundary: def test_threshold_one_never_drops(self): # threshold 1.0 + strict-greater = never drop. df = _read("ec12_drop_threshold_boundary.csv") opts = MissingOptions(strategy="drop_row", row_drop_threshold=1.0) res = handle_missing(df, opts) assert res.rows_dropped == 0 def test_threshold_just_under_one_drops_fully_missing(self): # threshold 0.99: drop only fully-missing rows (frac > 0.99 → frac == 1.0). df = _read("ec12_drop_threshold_boundary.csv") opts = MissingOptions( strategy="drop_row", row_drop_threshold=0.99, columns=["a", "b", "c", "d"], # exclude id from the scope ) res = handle_missing(df, opts) # Only row 3 (id=4, all four are NaN) qualifies. assert res.rows_dropped == 1 def test_threshold_half_drops_majority_missing(self): df = _read("ec12_drop_threshold_boundary.csv") opts = MissingOptions( strategy="drop_row", row_drop_threshold=0.5, columns=["a", "b", "c", "d"], ) res = handle_missing(df, opts) # Missing fractions across [a,b,c,d]: # row 0: 0/4=0.0 keep # row 1: 2/4=0.5 keep (strict >, not equal) # row 2: 3/4=0.75 drop # row 3: 4/4=1.0 drop # row 4: 2/4=0.5 keep assert res.rows_dropped == 2 def test_threshold_zero_drops_any_missing(self): df = _read("ec12_drop_threshold_boundary.csv") opts = MissingOptions( strategy="drop_row", row_drop_threshold=0.0, columns=["a", "b", "c", "d"], ) res = handle_missing(df, opts) # Every body row except row 0 has at least one missing. assert res.rows_dropped == 4 class TestEC13FfillLeadingNan: def test_leading_nan_run_survives_ffill(self): df = _read("ec13_ffill_leading_nan.csv") res = handle_missing(df, MissingOptions(strategy="ffill")) # First two rows (leading NaN) remain NaN — there's nothing to fill from. assert pd.isna(res.handled_df["price"].iloc[0]) assert pd.isna(res.handled_df["price"].iloc[1]) # Mid-series gets filled forward. assert res.handled_df["price"].iloc[3] == 100.0 assert res.handled_df["price"].iloc[4] == 100.0 # Trailing NaN gets filled by the last seen value. assert res.handled_df["price"].iloc[6] == 150.0 class TestEC14InterpolateFallback: def test_interpolate_on_non_numeric_falls_back(self): df = _read("ec14_interpolate_fallback.csv", dtype_str=True) opts = MissingOptions( strategy="interpolate", categorical_strategy="mode", ) res = handle_missing(df, opts) # All columns are object dtype here → fallback to mode. assert res.strategy_per_column["category"] == "mode" assert res.strategy_per_column["value"] == "mode" class TestEC15HeadersOnly: def test_empty_body_does_not_crash(self): df = _read("ec15_headers_only.csv") # All operations must be no-ops on an empty body. for preset in ("detect-only", "safe-fill", "drop-incomplete"): res = handle_missing(df, MissingOptions.from_preset(preset)) assert len(res.handled_df) == 0 assert res.cells_filled == 0 assert res.rows_dropped == 0 class TestEC16Idempotency: def test_safe_fill_is_idempotent(self): df = _read("ec16_idempotent_apply.csv", dtype_str=True) opts = MissingOptions.from_preset("safe-fill") first = handle_missing(df, opts) second = handle_missing(first.handled_df, opts) # Second pass should make no further changes. pd.testing.assert_frame_equal( second.handled_df.reset_index(drop=True), first.handled_df.reset_index(drop=True), ) assert second.cells_filled == 0 assert second.sentinels_standardized == 0 def test_detect_only_is_idempotent(self): df = _read("ec16_idempotent_apply.csv", dtype_str=True) opts = MissingOptions.from_preset("detect-only") first = handle_missing(df, opts) second = handle_missing(first.handled_df, opts) assert second.sentinels_standardized == 0 # --------------------------------------------------------------------------- # Whole-corpus property tests # --------------------------------------------------------------------------- ALL_FIXTURES = sorted(p.name for p in TEST_DATA.glob("*.csv")) @pytest.mark.parametrize("fixture", ALL_FIXTURES) def test_handle_missing_does_not_mutate_input(fixture): """Every fixture must leave the input DataFrame untouched.""" df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False) if df.empty and len(df.columns) == 0: pytest.skip(f"{fixture}: completely empty file") snapshot = df.copy(deep=True) handle_missing(df, MissingOptions.from_preset("safe-fill")) pd.testing.assert_frame_equal(df, snapshot) @pytest.mark.parametrize("fixture", ALL_FIXTURES) def test_profile_runs_on_every_fixture(fixture): """``profile_missing`` must succeed on every corpus file.""" df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False) prof = profile_missing(df, MissingOptions()) assert prof.rows_total == len(df) assert prof.cells_total == len(df) * len(df.columns)