Files
datatools-dev/tests/test_missing.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

463 lines
18 KiB
Python

"""Tests for src/core/missing.py."""
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import pytest
from src.core.errors import ConfigError, InputValidationError
from src.core.missing import (
DEFAULT_SENTINELS,
MissingOptions,
PRESETS,
detect_sentinels,
handle_missing,
is_missing_like,
profile_missing,
)
# ---------------------------------------------------------------------------
# is_missing_like
# ---------------------------------------------------------------------------
class TestIsMissingLike:
    """Scalar-level checks for ``is_missing_like``."""

    def test_none(self):
        """``None`` is reported as missing."""
        assert is_missing_like(None)

    def test_nan(self):
        """A float NaN is reported as missing."""
        assert is_missing_like(np.nan)

    def test_pd_nat(self):
        """pandas' ``NaT`` is reported as missing."""
        assert is_missing_like(pd.NaT)

    def test_empty_string(self):
        """The empty string is reported as missing."""
        assert is_missing_like("")

    def test_whitespace_only(self):
        """Strings made only of whitespace are reported as missing."""
        for blank in (" ", "\t\n "):
            assert is_missing_like(blank)

    def test_default_sentinels(self):
        """A sample of the built-in sentinel spellings is recognised."""
        samples = ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)")
        for s in samples:
            assert is_missing_like(s), f"expected {s!r} to be missing-like"

    def test_case_insensitive(self):
        """Upper, lower and mixed-case spellings are all recognised."""
        for variant in ("N/A", "n/A", "NA", "na"):
            assert is_missing_like(variant)

    def test_real_value_not_missing(self):
        """Ordinary values, including zero-like ones, are not missing."""
        for real_value in ("hello", "0", 0, 0.0):
            assert not is_missing_like(real_value)

    def test_zero_is_not_missing(self):
        """Falsy-but-real values must not be mistaken for missing ones."""
        # Guards against the common bug of treating 0 / False as missing.
        for falsy in (0, False):
            assert not is_missing_like(falsy)

    def test_custom_sentinels_override(self):
        """A caller-supplied sentinel list decides what counts as missing."""
        matches = is_missing_like("xx", sentinels=["xx"])
        assert matches
        mismatches = is_missing_like("xx", sentinels=["zz"])
        assert not mismatches
# ---------------------------------------------------------------------------
# detect_sentinels
# ---------------------------------------------------------------------------
class TestDetectSentinels:
    """Checks for the per-label sentinel counts from ``detect_sentinels``."""

    def test_counts_by_label(self):
        """Sentinel tokens and blank cells are tallied separately."""
        series = pd.Series(["alice", "N/A", "n/a", "NULL", " ", "", "bob"])
        tally = detect_sentinels(series)
        # 'N/A' and 'n/a' collapse together under casefold, and which
        # canonical label they land on depends on DEFAULT_SENTINELS, so the
        # test sums every non-whitespace bucket instead of naming labels.
        token_total = sum(
            count for label, count in tally.items() if label != "(whitespace)"
        )
        assert token_total == 3
        assert tally["(whitespace)"] == 2

    def test_skips_real_nan(self):
        """A genuine NaN is not counted as a sentinel."""
        series = pd.Series(["a", np.nan, "N/A"])
        tally = detect_sentinels(series)
        assert sum(tally.values()) == 1

    def test_no_sentinels_returns_empty(self):
        """A series with only real values gives an empty mapping."""
        series = pd.Series(["alice", "bob", "charlie"])
        assert detect_sentinels(series) == {}
# ---------------------------------------------------------------------------
# profile_missing
# ---------------------------------------------------------------------------
class TestProfileMissing:
    """Checks for the read-only missingness profile from ``profile_missing``."""

    def test_basic(self):
        """Per-column and overall missing counts include sentinels and NaN."""
        frame = pd.DataFrame({
            "name": ["Alice", "Bob", "N/A", "", "Charlie"],
            "age": [30, None, 25, 40, np.nan],
        })
        profile = profile_missing(frame, MissingOptions())
        assert profile.rows_total == 5
        # name has '' and 'N/A' (2 sentinels); age has 2 real NaN values.
        by_column = {}
        for report in profile.columns:
            by_column[report.column] = report
        assert by_column["name"].missing == 2
        assert by_column["age"].missing == 2
        assert profile.cells_missing == 4

    def test_complete_dataframe(self):
        """A frame with no gaps reports zero missing cells and rows."""
        frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        profile = profile_missing(frame, MissingOptions())
        assert profile.cells_missing == 0
        assert profile.rows_complete == 3
        assert profile.rows_with_any_missing == 0

    def test_to_dataframe_columns(self):
        """The tabular export has at least the core report columns."""
        frame = pd.DataFrame({"x": [1, None]})
        profile = profile_missing(frame, MissingOptions())
        exported = profile.to_dataframe()
        required = {"column", "missing", "missing_pct", "top_sentinel"}
        assert required <= set(exported.columns)

    def test_disabled_sentinels_only_counts_real_nan(self):
        """With standardization off, only the real NaN counts as missing."""
        frame = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        options = MissingOptions(standardize_sentinels=False)
        profile = profile_missing(frame, options)
        by_column = {}
        for report in profile.columns:
            by_column[report.column] = report
        # 'N/A' is left alone, so the single NaN is the only missing cell.
        assert by_column["x"].missing == 1
# ---------------------------------------------------------------------------
# handle_missing — sentinel standardization
# ---------------------------------------------------------------------------
class TestSentinelStandardization:
    """Checks that ``handle_missing`` converts sentinel tokens to NaN."""

    def test_replaces_sentinels_with_nan(self):
        """Sentinels become NaN while real values are left in place."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "-", " ", "bob"]})
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # Three cells qualify: 'N/A', '-' and the whitespace-only one.
        assert outcome.sentinels_standardized == 3
        cleaned = outcome.handled_df
        assert cleaned["x"].isna().sum() == 3
        assert cleaned.iloc[0]["x"] == "alice"
        assert cleaned.iloc[4]["x"] == "bob"

    def test_audit_records_each_replacement(self):
        """One replacement gives one ``standardize:`` audit record."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        audit = outcome.changes
        assert len(audit) == 1
        assert audit.iloc[0]["action"].startswith("standardize:")

    def test_disabled_keeps_sentinels(self):
        """Turning standardization off leaves sentinel text untouched."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions(standardize_sentinels=False, strategy="none")
        outcome = handle_missing(frame, options)
        assert outcome.sentinels_standardized == 0
        assert outcome.handled_df.iloc[1]["x"] == "N/A"

    def test_custom_sentinels_extend_default(self):
        """A token appended to the default list is standardized too."""
        frame = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
        extended = list(DEFAULT_SENTINELS) + ["MISSING_DATA"]
        options = MissingOptions(sentinels=extended, strategy="none")
        outcome = handle_missing(frame, options)
        assert outcome.sentinels_standardized == 1
# ---------------------------------------------------------------------------
# handle_missing — fill strategies
# ---------------------------------------------------------------------------
class TestFillStrategies:
    """Exercises each fill strategy accepted by ``handle_missing``."""

    @pytest.fixture
    def numeric_df(self):
        """Single float column with gaps at positions 2 and 4."""
        return pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0, np.nan]})

    def test_mean(self, numeric_df):
        """Gaps are filled with the mean of the observed values."""
        outcome = handle_missing(numeric_df, MissingOptions(strategy="mean"))
        # Observed values are 1, 2 and 4, so the mean is 7/3.
        expected_mean = 7.0 / 3.0
        filled_value = outcome.handled_df["x"].iloc[2]
        assert abs(filled_value - expected_mean) < 1e-9
        assert outcome.cells_filled == 2

    def test_median(self, numeric_df):
        """Gaps are filled with the median of the observed values."""
        outcome = handle_missing(numeric_df, MissingOptions(strategy="median"))
        # Observed values are 1, 2 and 4, so the median is 2.0.
        assert outcome.handled_df["x"].iloc[2] == 2.0

    def test_mode(self):
        """Gaps in a text column are filled with the most frequent value."""
        frame = pd.DataFrame({"x": ["a", "a", "b", None, None]})
        outcome = handle_missing(frame, MissingOptions(strategy="mode"))
        column = outcome.handled_df["x"]
        assert column.iloc[3] == "a"
        assert column.iloc[4] == "a"
        assert outcome.cells_filled == 2

    def test_constant_scalar(self, numeric_df):
        """One ``fill_value`` is used for every gap."""
        options = MissingOptions(strategy="constant", fill_value=99.0)
        outcome = handle_missing(numeric_df, options)
        column = outcome.handled_df["x"]
        assert column.iloc[2] == 99.0
        assert column.iloc[4] == 99.0

    def test_constant_per_column(self):
        """``column_fill_values`` supplies a separate constant per column."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
        options = MissingOptions(
            strategy="constant",
            column_fill_values={"a": 0, "b": "?"},
        )
        outcome = handle_missing(frame, options)
        assert outcome.handled_df["a"].iloc[1] == 0
        assert outcome.handled_df["b"].iloc[1] == "?"

    def test_ffill(self):
        """Forward fill carries the last observed value forward."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        outcome = handle_missing(frame, MissingOptions(strategy="ffill"))
        assert outcome.handled_df["x"].tolist() == [1.0, 1.0, 1.0, 4.0]

    def test_bfill(self):
        """Backward fill pulls the next observed value backward."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        outcome = handle_missing(frame, MissingOptions(strategy="bfill"))
        assert outcome.handled_df["x"].tolist() == [1.0, 4.0, 4.0, 4.0]

    def test_interpolate(self):
        """Interpolation fills the gap between 1.0 and 4.0 with 2.0 and 3.0."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        outcome = handle_missing(frame, MissingOptions(strategy="interpolate"))
        assert outcome.handled_df["x"].tolist() == [1.0, 2.0, 3.0, 4.0]

    def test_numeric_strategy_falls_back_for_categorical(self):
        """A text column under a numeric strategy uses ``categorical_strategy``."""
        frame = pd.DataFrame({"x": ["a", "a", None, "b"]})
        options = MissingOptions(strategy="median", categorical_strategy="mode")
        outcome = handle_missing(frame, options)
        assert outcome.strategy_per_column["x"] == "mode"
        assert outcome.handled_df["x"].iloc[2] == "a"

    def test_per_column_strategy_overrides_global(self):
        """An entry in ``column_strategies`` wins over the global strategy."""
        frame = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
        options = MissingOptions(
            strategy="median",
            column_strategies={"b": "constant"},
            fill_value="??",
        )
        outcome = handle_missing(frame, options)
        # Column a keeps the global strategy: the median of [1.0] is 1.0.
        assert outcome.handled_df["a"].iloc[1] == 1.0
        assert outcome.handled_df["b"].iloc[1] == "??"

    def test_all_nan_column_safely_skipped(self):
        """A column with nothing observed is left fully NaN and unfilled."""
        frame = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
        outcome = handle_missing(frame, MissingOptions(strategy="mean"))
        assert outcome.cells_filled == 0
        assert outcome.handled_df["x"].isna().all()
# ---------------------------------------------------------------------------
# handle_missing — drops
# ---------------------------------------------------------------------------
class TestDropStrategies:
    """Checks for the row- and column-dropping strategies.

    Thresholds are strict-greater: a row or column is dropped only when its
    missing fraction exceeds the configured threshold.
    """

    def test_drop_row_any_missing(self):
        """Threshold 0.0 drops every row that has at least one gap."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan, 4],
            "b": ["x", None, "z", "w"],
        })
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
        outcome = handle_missing(frame, options)
        # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean.
        assert outcome.rows_dropped == 2
        assert len(outcome.handled_df) == 2

    def test_drop_row_default_threshold_never_drops(self):
        """With the default threshold (1.0), no row can exceed it."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan],
            "b": ["x", "y", None],
        })
        # row_drop_threshold is left at its default of 1.0.
        options = MissingOptions(strategy="drop_row")
        outcome = handle_missing(frame, options)
        assert outcome.rows_dropped == 0

    def test_drop_row_partial_threshold(self):
        """Threshold 0.5 drops only rows that are more than half missing."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan, np.nan],
            "b": [10, 20, np.nan, np.nan],
            "c": [100, 200, np.nan, 400],
        })
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
        outcome = handle_missing(frame, options)
        # Missing fractions per row:
        #   row 0: 0/3 -> keep        row 1: 1/3 (0.33) -> keep
        #   row 2: 3/3 (1.0) -> drop  row 3: 2/3 (0.67) -> drop
        assert outcome.rows_dropped == 2

    def test_drop_col_threshold(self):
        """Threshold 0.5 drops the 75%-missing column and keeps the full one."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4],
            "drop_me": [np.nan, np.nan, np.nan, 1],  # 75% missing
        })
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
        outcome = handle_missing(frame, options)
        dropped = outcome.columns_dropped
        assert "drop_me" in dropped
        assert "keep" not in dropped

    def test_drop_both(self):
        """``drop_both`` removes the empty column, then rows with gaps."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4, 5],
            "drop_col": [np.nan] * 5,
            "x": [1, np.nan, 3, np.nan, 5],
        })
        options = MissingOptions(
            strategy="drop_both",
            col_drop_threshold=0.99,  # only columns >99% missing are dropped
            row_drop_threshold=0.0,   # any gap in the remaining columns drops the row
        )
        outcome = handle_missing(frame, options)
        # drop_col is 100% missing, so it goes.
        assert "drop_col" in outcome.columns_dropped
        # In the remaining columns (keep + x), rows 1 and 3 lack an x value.
        assert outcome.rows_dropped == 2

    def test_drop_audit_records_dropped_rows(self):
        """A dropped row leaves exactly one ``drop_row`` audit record."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
        # Only the fully-missing row exceeds the 0.99 threshold.
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
        outcome = handle_missing(frame, options)
        audit = outcome.changes
        row_drops = audit[audit["action"] == "drop_row"]
        assert len(row_drops) == 1
# ---------------------------------------------------------------------------
# Scope: columns / skip_columns
# ---------------------------------------------------------------------------
class TestScope:
    """Checks for limiting work with ``columns`` and ``skip_columns``."""

    def test_columns_filter(self):
        """Only the columns named in ``columns`` are filled."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
        cleaned = handle_missing(frame, options).handled_df
        assert cleaned["a"].iloc[0] == 99
        # Column b is outside the scope, so its gap must remain.
        assert pd.isna(cleaned["b"].iloc[0])

    def test_skip_columns(self):
        """Columns named in ``skip_columns`` are left untouched."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(
            skip_columns=["b"], strategy="constant", fill_value=99
        )
        cleaned = handle_missing(frame, options).handled_df
        assert cleaned["a"].iloc[0] == 99
        assert pd.isna(cleaned["b"].iloc[0])

    def test_unknown_column_raises(self):
        """Naming a column absent from the frame raises ``InputValidationError``."""
        frame = pd.DataFrame({"a": [1]})
        options = MissingOptions(columns=["does_not_exist"])
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)
# ---------------------------------------------------------------------------
# Presets / config
# ---------------------------------------------------------------------------
class TestPresets:
    """Checks for named presets and config-file round-tripping."""

    def test_detect_only_does_not_fill(self):
        """``detect-only`` standardizes sentinels but fills and drops nothing."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions.from_preset("detect-only")
        outcome = handle_missing(frame, options)
        assert outcome.sentinels_standardized == 1
        assert outcome.cells_filled == 0
        assert outcome.rows_dropped == 0

    def test_safe_fill_fills(self):
        """``safe-fill`` fills the one numeric gap and the one text gap."""
        frame = pd.DataFrame({
            "age": [30, np.nan, 25, 40],
            "name": ["a", "a", None, "b"],
        })
        options = MissingOptions.from_preset("safe-fill")
        outcome = handle_missing(frame, options)
        assert outcome.cells_filled == 2

    def test_drop_incomplete(self):
        """``drop-incomplete`` removes the single row that has a gap."""
        frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
        options = MissingOptions.from_preset("drop-incomplete")
        outcome = handle_missing(frame, options)
        assert outcome.rows_dropped == 1

    def test_unknown_preset_raises(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MissingOptions.from_preset("does-not-exist")

    def test_roundtrip_to_file(self, tmp_path):
        """Saved options load back with the same strategy settings."""
        original = MissingOptions.from_preset("safe-fill")
        original.column_strategies = {"age": "median"}
        config_path = tmp_path / "cfg.json"
        original.to_file(config_path)
        restored = MissingOptions.from_file(config_path)
        assert restored.strategy == original.strategy
        assert restored.column_strategies == original.column_strategies
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
class TestValidate:
    """Checks for option and input validation errors."""

    def test_invalid_strategy(self):
        """``validate`` rejects an unknown strategy name."""
        options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            options.validate()

    def test_threshold_out_of_range(self):
        """``validate`` rejects a row-drop threshold of 1.5."""
        options = MissingOptions(row_drop_threshold=1.5)
        with pytest.raises(ConfigError):
            options.validate()

    def test_handle_missing_validates(self):
        """``handle_missing`` itself raises on invalid options."""
        frame = pd.DataFrame({"x": [1]})
        options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            handle_missing(not_a_frame)  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# End-to-end realistic case
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Realistic end-to-end scenarios for ``handle_missing``."""

    def test_messy_customer_export(self):
        """Standardize sentinels, then fill text and numeric gaps in one pass."""
        df = pd.DataFrame({
            "customer_id": [1, 2, 3, 4, 5, 6],
            "name": ["Alice", "Bob", "N/A", " ", "Charlie", None],
            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
            "age": [30, np.nan, 25, 40, np.nan, 50],
        })
        opts = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="constant",
            fill_value="UNKNOWN",
        )
        res = handle_missing(df, opts)
        # Sentinel tokens: "N/A" and " " in name (2) plus "-" and "NULL" in
        # email (2) = 4. The None in name is already a real missing value,
        # so it is not counted as a standardized sentinel.
        assert res.sentinels_standardized == 4
        # Cells filled after standardization:
        #   name:  3 missing (N/A, " ", None) -> constant fill
        #   email: 2 missing                  -> constant fill
        #   age:   2 missing                  -> median of [25, 30, 40, 50] = 35.0
        assert res.cells_filled == 7
        assert res.handled_df["name"].isna().sum() == 0
        assert res.handled_df["email"].isna().sum() == 0
        assert res.handled_df["age"].isna().sum() == 0
        assert (res.handled_df["name"] == "UNKNOWN").sum() == 3
        # Both age gaps get the median, (30 + 40) / 2 = 35.0.
        assert (res.handled_df["age"] == 35.0).sum() == 2

    def test_input_not_mutated(self):
        """``handle_missing`` must leave the caller's DataFrame unchanged."""
        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        df_copy = df.copy()
        handle_missing(df, MissingOptions.from_preset("safe-fill"))
        pd.testing.assert_frame_equal(df, df_copy)