# datatools-dev/tests/test_missing_corpus.py

"""Acceptance corpus for the Missing Value Handler.

Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
asserts the documented behaviour. The fixtures are split into:

  * ``uc##`` — three target-client use cases (Shopify operator,
    marketing analyst, consultant intake).
  * ``ec##`` — edge cases the engine must handle without surprise:
    all-NaN columns, zeros that aren't missing, Excel errors, unicode
    whitespace, mixed dtypes, padding, single row/column, every default
    sentinel, per-column constants, drop thresholds, leading-NaN ffill,
    numeric-strategy fallback for non-numeric columns, headers-only,
    idempotency.

Each test runs through the public API (``handle_missing``) so any
regression in the engine surfaces here. Fixture files double as living
documentation for what the tool is supposed to do.
"""

from __future__ import annotations

import io
from pathlib import Path

import numpy as np
import pandas as pd
import pytest

from src.core.missing import (
    MissingOptions,
    handle_missing,
    is_missing_like,
    profile_missing,
)

# Corpus root is resolved relative to this file: two levels up, then
# ``test-cases/missing-corpus``; the CSV fixtures live in ``test_data``.
CORPUS = Path(__file__).resolve().parents[1].joinpath("test-cases", "missing-corpus")
TEST_DATA = CORPUS.joinpath("test_data")


def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
    """Read the corpus fixture ``name`` from ``TEST_DATA`` into a DataFrame.

    With the default ``dtype_str=False`` pandas infers dtypes and applies
    its default NA parsing, which mirrors a typical intake path. With
    ``dtype_str=True`` every column is read as ``str`` and pandas' default
    NA parsing is switched off, so sentinel strings stay visible as text
    instead of being coerced away.
    """
    # Extra read_csv keywords are only needed for the "everything as text" mode.
    reader_kwargs = {"dtype": str, "keep_default_na": False} if dtype_str else {}
    return pd.read_csv(TEST_DATA / name, **reader_kwargs)


# ---------------------------------------------------------------------------
# Use case 1 — Shopify operator: detect-only
# ---------------------------------------------------------------------------

class TestUC01ShopifyExport:
    """Shopify operator normalises disguised nulls ahead of a reimport."""

    def test_detect_only_replaces_sentinels(self):
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions.from_preset("detect-only"))

        # detect-only: sentinels are standardized, nothing is filled or dropped.
        assert outcome.sentinels_standardized > 0
        assert outcome.cells_filled == 0
        assert outcome.rows_dropped == 0

        # (row position, column, sentinel the fixture holds there) — each of
        # these cells is expected to be NaN after standardization.
        disguised_cells = (
            (1, "phone", "N/A"),
            (2, "city", "-"),
            (3, "total_orders", "NULL"),
            (5, "phone", " "),
            (5, "last_order_date", "(blank)"),
            (6, "last_order_date", "#N/A"),
            (7, "phone", "n/a"),
            (8, "city", "?"),
            (9, "total_orders", "(none)"),
        )
        for row, col, _sentinel in disguised_cells:
            cell = outcome.handled_df.iloc[row][col]
            assert pd.isna(cell), (
                f"Expected NaN at row {row} col {col}, got "
                f"{cell!r}"
            )

    def test_real_values_preserved(self):
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions.from_preset("detect-only"))

        # The first record holds only real values and must come back as-is.
        first_row = outcome.handled_df.iloc[0]
        untouched = {
            "first_name": "Alice",
            "email": "alice@shop.com",
            "lifetime_value": "1240.50",
        }
        for col, value in untouched.items():
            assert first_row[col] == value

    def test_audit_log_complete(self):
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions.from_preset("detect-only"))

        # The audit log carries exactly one row per standardized sentinel ...
        assert len(outcome.changes) == outcome.sentinels_standardized
        # ... and every logged action is a "standardize:" entry.
        prefix_flags = {
            action.startswith("standardize:")
            for action in outcome.changes["action"]
        }
        assert prefix_flags == {True}


# ---------------------------------------------------------------------------
# Use case 2 — Marketing analyst: safe-fill
# ---------------------------------------------------------------------------

class TestUC02MarketingAudience:
    """Marketer fills numeric columns with median, categorical with mode."""

    def test_safe_fill_clears_all_missing(self):
        """safe-fill leaves no missing cells and reports at least one fill."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        res = handle_missing(df, opts)
        # Every cell in scope should be filled.
        assert res.profile_after.cells_missing == 0
        assert res.cells_filled > 0

    def test_numeric_uses_median_categorical_uses_mode(self):
        """The per-column strategy report reflects the dtype-based choice."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        res = handle_missing(df, opts)
        # 'age' is numeric → median strategy
        assert res.strategy_per_column["age"] == "median"
        # 'segment' / 'region' are object → mode fallback
        assert res.strategy_per_column["segment"] == "mode"
        assert res.strategy_per_column["region"] == "mode"

    def test_per_column_override(self):
        """A per-column constant override fills 'source' with "unknown"."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        opts.column_strategies = {"source": "constant"}
        opts.column_fill_values = {"source": "unknown"}
        res = handle_missing(df, opts)
        # Cells previously holding sentinels in 'source' should now equal "unknown".
        assert (res.handled_df["source"] == "unknown").sum() >= 3

    def test_consent_real_false_not_dropped(self):
        """Explicit "true" consent values survive the fill at their own rows.

        The fixture is read with inferred dtypes, and pandas may parse a
        true/false/blank column into Python bools inside an object column.
        Comparing against the string ``"true"`` would then match nothing
        and the check would pass vacuously, so values are normalised to
        lower-case text before comparison.
        """
        df = _read("uc02_marketing_audience.csv")
        res = handle_missing(df, MissingOptions.from_preset("safe-fill"))

        def is_true(col: pd.Series) -> pd.Series:
            # Works for both "true" strings and parsed True booleans.
            return col.astype(str).str.lower() == "true"

        was_true = is_true(df["consent"])
        now_true = is_true(res.handled_df["consent"])

        # Guard against a vacuous pass: the fixture is documented to hold
        # explicit true values in this column.
        assert was_true.any(), "fixture should contain explicit true consent values"
        # Filled rows can become "true" (mode) but should not lose existing trues.
        assert now_true.sum() >= was_true.sum()
        # A count alone can hide a flipped value behind a newly filled one,
        # so also check row by row; a dropped row counts as a lost value.
        still_true = now_true.reindex(was_true.index, fill_value=False)
        assert still_true[was_true].all()


# ---------------------------------------------------------------------------
# Use case 3 — Consultant intake: threshold drops + fill
# ---------------------------------------------------------------------------

class TestUC03ConsultantIntake:
    """Sparse columns and rows are dropped via thresholds."""

    def test_drop_col_removes_legacy_fields(self):
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        # Both legacy columns are 100% missing, so a 0.99 threshold drops them.
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_col",
            col_drop_threshold=0.99,
        )
        dropped = handle_missing(frame, options).columns_dropped
        for legacy_col in ("internal_id_legacy", "beta_field"):
            assert legacy_col in dropped

    def test_drop_row_removes_mostly_empty_respondents(self):
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_both",
            col_drop_threshold=0.99,  # legacy / beta columns go first
            row_drop_threshold=0.5,   # then rows with >50% missing
        )
        outcome = handle_missing(frame, options)

        # R-002, R-005, R-007, R-010 are the mostly-empty respondents.
        assert outcome.rows_dropped >= 4

        # Respondents with real answers must still be present.
        expected_survivors = {"R-001", "R-003", "R-006", "R-008", "R-009", "R-012"}
        kept_ids = set(outcome.handled_df["respondent_id"].tolist())
        assert not (expected_survivors - kept_ids)


# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------

class TestEC01AllNanColumn:
    def test_fill_skips_all_nan_column(self):
        frame = _read("ec01_all_nan_column.csv")
        outcome = handle_missing(frame, MissingOptions(strategy="mean"))
        # The mean of an all-NaN column is NaN, so no value may be invented:
        # the column has no non-missing cell afterwards and nothing is filled.
        assert not outcome.handled_df["deprecated_field"].notna().any()
        assert outcome.cells_filled == 0

    def test_drop_col_catches_all_nan(self):
        frame = _read("ec01_all_nan_column.csv")
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.99)
        dropped = handle_missing(frame, options).columns_dropped
        assert "deprecated_field" in dropped
        assert "name" not in dropped


class TestEC02NoMissing:
    def test_clean_file_is_noop(self):
        frame = _read("ec02_no_missing.csv")
        options = MissingOptions.from_preset("safe-fill")
        outcome = handle_missing(frame, options)
        # A file without missing values yields zero counters ...
        for counter in ("sentinels_standardized", "cells_filled", "rows_dropped"):
            assert getattr(outcome, counter) == 0
        # ... and an output frame equal to the input.
        pd.testing.assert_frame_equal(outcome.handled_df, frame)


class TestEC03ZeroIsNotMissing:
    def test_zero_preserved(self):
        frame = _read("ec03_zero_is_not_missing.csv")
        outcome = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        # The number of zeros per column is the same before and after.
        for col in ("balance", "count"):
            assert outcome.handled_df[col].eq(0).sum() == frame[col].eq(0).sum()
        # And the engine reports no changes at all.
        assert outcome.cells_filled == 0
        assert outcome.sentinels_standardized == 0

    def test_is_missing_like_zero_predicate(self):
        # Predicate-level check: zero-ish and false-ish values are real data.
        real_values = (0, 0.0, False, "0", "0.00")
        for value in real_values:
            assert not is_missing_like(value)


class TestEC04ExcelErrors:
    def test_excel_error_sentinels_recognized(self):
        frame = _read("ec04_excel_errors.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # Error sentinels present in the fixture, one entry per occurrence.
        excel_errors = ["#N/A", "#NULL!", "#VALUE!", "#N/A", "#N/A", "#NULL!"]
        assert outcome.sentinels_standardized == len(excel_errors)


class TestEC05UnicodeWhitespace:
    def test_nbsp_and_ideographic_space_count_as_missing(self):
        frame = _read("ec05_unicode_whitespace.csv", dtype_str=True)
        handled = handle_missing(frame, MissingOptions(strategy="none")).handled_df
        # Rows 1, 2 and 4 hold NBSP / tab / ideographic space respectively,
        # so exactly three notes end up missing.
        assert handled["note"].isna().sum() == 3
        # The rows with real text keep their notes.
        for position, text in ((0, "hello"), (3, "real")):
            assert handled.iloc[position]["note"] == text


class TestEC06MixedDtypes:
    def test_mixed_column_falls_back_to_mode(self):
        # Native dtype inference keeps 'real_num' numeric.
        frame = _read("ec06_mixed_dtypes.csv")
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="mode",
        )
        chosen = handle_missing(frame, options).strategy_per_column
        # mixed_col mixes 'N/A' / 'hello' with numbers (object dtype), so the
        # median request falls back to mode; real_num is float and keeps median.
        expected = {"mixed_col": "mode", "real_num": "median"}
        for col, strategy in expected.items():
            assert chosen[col] == strategy


class TestEC07RealDataWithPadding:
    """Padded real values survive; whitespace-only and blank cells do not."""

    def test_padded_real_data_not_treated_as_missing(self):
        """Padding around real text is kept while empty cells become NaN.

        Checks both directions so the test cannot pass when the engine
        simply standardizes nothing.
        """
        df = _read("ec07_real_data_with_padding.csv", dtype_str=True)
        res = handle_missing(df, MissingOptions(strategy="none"))
        handled = res.handled_df
        # "  Alice  ", " Bob ", "  SF" must remain, padding included.
        assert handled.iloc[0]["name"] == "  Alice  "
        assert handled.iloc[2]["name"] == " Bob "
        assert handled.iloc[3]["city"] == "  SF"
        # Row 1 (name="  ") and row 2 (city=blank) are the cells the fixture
        # documents as genuinely empty; they should become NaN.
        assert pd.isna(handled.iloc[1]["name"])
        assert pd.isna(handled.iloc[2]["city"])


class TestEC08SingleRow:
    def test_single_row_handles_cleanly(self):
        frame = _read("ec08_single_row.csv", dtype_str=True)

        # Detection pass: the single row holds two sentinels ('N/A' and '').
        detected = handle_missing(frame, MissingOptions(strategy="none"))
        assert detected.sentinels_standardized == 2

        # Fill pass on the same one-row frame: the real name is left alone.
        filled = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        only_row = filled.handled_df.iloc[0]
        assert only_row["name"] == "Alice"


class TestEC09SingleColumn:
    def test_single_column_works(self):
        frame = _read("ec09_single_column.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # 'N/A', a whitespace-only ' ' and '-' make three sentinels.
        expected_missing = 3
        assert outcome.sentinels_standardized == expected_missing
        assert outcome.handled_df["value"].isna().sum() == expected_missing


class TestEC10AllSentinelVariants:
    def test_every_default_sentinel_recognized(self):
        frame = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # The fixture lists 20 sentinels plus a single real value.
        assert outcome.sentinels_standardized == 20
        # That one real value is still there afterwards.
        survivors = outcome.handled_df["sentinel_value"] == "real_value"
        assert survivors.sum() == 1


class TestEC11ConstantPerColumn:
    def test_per_column_fill_values(self):
        frame = _read("ec11_constant_per_column.csv", dtype_str=True)
        fill_values = {
            "country": "USA",
            "salary": "0",
            "department": "Unassigned",
        }
        options = MissingOptions(strategy="constant", column_fill_values=fill_values)
        handled = handle_missing(frame, options).handled_df

        # Fixture: 1 UK row, 2 USA rows, 2 blanks. Blanks filled with "USA"
        # give 4 USA rows in total while the UK row is kept.
        country_counts = {"USA": 4, "UK": 1}
        for country, count in country_counts.items():
            assert (handled["country"] == country).sum() == count
        assert (handled["department"] == "Unassigned").sum() >= 2


class TestEC12DropThresholdBoundary:
    @staticmethod
    def _rows_dropped(threshold, *, scoped=True):
        """Run ``drop_row`` at ``threshold`` on the fixture; return rows dropped.

        With ``scoped=True`` the scope is limited to columns a-d (``id`` is
        excluded); otherwise no ``columns`` argument is passed at all.
        """
        frame = _read("ec12_drop_threshold_boundary.csv")
        option_kwargs = {"strategy": "drop_row", "row_drop_threshold": threshold}
        if scoped:
            option_kwargs["columns"] = ["a", "b", "c", "d"]
        return handle_missing(frame, MissingOptions(**option_kwargs)).rows_dropped

    def test_threshold_one_never_drops(self):
        # The comparison is strict-greater, so a threshold of 1.0 drops nothing.
        assert self._rows_dropped(1.0, scoped=False) == 0

    def test_threshold_just_under_one_drops_fully_missing(self):
        # 0.99 only catches rows whose missing fraction is exactly 1.0;
        # that is row 3 (id=4), where all four scoped columns are NaN.
        assert self._rows_dropped(0.99) == 1

    def test_threshold_half_drops_majority_missing(self):
        # Missing fraction over [a, b, c, d] per row:
        #   row 0: 0/4 = 0.0   kept
        #   row 1: 2/4 = 0.5   kept (strict >, equality does not drop)
        #   row 2: 3/4 = 0.75  dropped
        #   row 3: 4/4 = 1.0   dropped
        #   row 4: 2/4 = 0.5   kept
        assert self._rows_dropped(0.5) == 2

    def test_threshold_zero_drops_any_missing(self):
        # All body rows but row 0 have at least one missing cell.
        assert self._rows_dropped(0.0) == 4


class TestEC13FfillLeadingNan:
    def test_leading_nan_run_survives_ffill(self):
        frame = _read("ec13_ffill_leading_nan.csv")
        prices = handle_missing(frame, MissingOptions(strategy="ffill")).handled_df["price"]
        # The two leading NaNs have nothing before them to copy, so they stay.
        for position in (0, 1):
            assert pd.isna(prices.iloc[position])
        # Gaps in the middle take the previous value; so does the trailing NaN.
        forward_filled = ((3, 100.0), (4, 100.0), (6, 150.0))
        for position, expected_price in forward_filled:
            assert prices.iloc[position] == expected_price


class TestEC14InterpolateFallback:
    def test_interpolate_on_non_numeric_falls_back(self):
        frame = _read("ec14_interpolate_fallback.csv", dtype_str=True)
        options = MissingOptions(strategy="interpolate", categorical_strategy="mode")
        chosen = handle_missing(frame, options).strategy_per_column
        # Read as text, every column is object dtype, so interpolate is
        # replaced by the categorical strategy (mode).
        for col in ("category", "value"):
            assert chosen[col] == "mode"


class TestEC15HeadersOnly:
    def test_empty_body_does_not_crash(self):
        frame = _read("ec15_headers_only.csv")
        # With no body rows every preset has to behave as a no-op.
        presets = ["detect-only", "safe-fill", "drop-incomplete"]
        for preset_name in presets:
            outcome = handle_missing(frame, MissingOptions.from_preset(preset_name))
            assert outcome.handled_df.shape[0] == 0
            assert outcome.cells_filled == 0
            assert outcome.rows_dropped == 0


class TestEC16Idempotency:
    def test_safe_fill_is_idempotent(self):
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("safe-fill")
        pass_one = handle_missing(frame, options)
        pass_two = handle_missing(pass_one.handled_df, options)

        # Feeding the output back in must change nothing: same frame
        # (indexes normalised before comparing) and zero counters.
        after_two = pass_two.handled_df.reset_index(drop=True)
        after_one = pass_one.handled_df.reset_index(drop=True)
        pd.testing.assert_frame_equal(after_two, after_one)
        assert pass_two.cells_filled == 0
        assert pass_two.sentinels_standardized == 0

    def test_detect_only_is_idempotent(self):
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("detect-only")
        pass_one = handle_missing(frame, options)
        # A second detection pass over the output finds no more sentinels.
        pass_two = handle_missing(pass_one.handled_df, options)
        assert pass_two.sentinels_standardized == 0


# ---------------------------------------------------------------------------
# Whole-corpus property tests
# ---------------------------------------------------------------------------

# File names (not paths) of every CSV fixture in the corpus, in sorted order.
ALL_FIXTURES = [csv_path.name for csv_path in TEST_DATA.glob("*.csv")]
ALL_FIXTURES.sort()


@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_handle_missing_does_not_mutate_input(fixture):
    """Every fixture must leave the input DataFrame untouched.

    The fixture is loaded as text, deep-copied, run through the safe-fill
    preset, and then compared against the copy. A completely empty file
    is skipped because there is no frame to protect.
    """
    try:
        df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # read_csv raises on a file with no header and no data instead of
        # returning a column-less frame, so the skip has to happen here.
        pytest.skip(f"{fixture}: completely empty file")
    if df.empty and len(df.columns) == 0:
        pytest.skip(f"{fixture}: completely empty file")
    snapshot = df.copy(deep=True)
    handle_missing(df, MissingOptions.from_preset("safe-fill"))
    pd.testing.assert_frame_equal(df, snapshot)


@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_profile_runs_on_every_fixture(fixture):
    """``profile_missing`` runs on each corpus file and reports its size.

    The reported row total must equal the frame's row count and the cell
    total must equal rows times columns.
    """
    frame = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    n_rows, n_cols = len(frame), len(frame.columns)
    report = profile_missing(frame, MissingOptions())
    assert report.rows_total == n_rows
    assert report.cells_total == n_rows * n_cols