Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
463 lines
18 KiB
Python
463 lines
18 KiB
Python
"""Tests for src/core/missing.py."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.errors import ConfigError, InputValidationError
|
|
from src.core.missing import (
|
|
DEFAULT_SENTINELS,
|
|
MissingOptions,
|
|
PRESETS,
|
|
detect_sentinels,
|
|
handle_missing,
|
|
is_missing_like,
|
|
profile_missing,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# is_missing_like
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestIsMissingLike:
    """Expectations for ``is_missing_like`` on nulls, blanks, sentinels and real values."""

    def test_none(self):
        """``None`` is expected to be missing-like."""
        assert is_missing_like(None)

    def test_nan(self):
        """A float NaN is expected to be missing-like."""
        assert is_missing_like(np.nan)

    def test_pd_nat(self):
        """``pd.NaT`` is expected to be missing-like."""
        assert is_missing_like(pd.NaT)

    def test_empty_string(self):
        """The empty string is expected to be missing-like."""
        assert is_missing_like("")

    def test_whitespace_only(self):
        """Strings made only of whitespace are expected to be missing-like."""
        for blank in (" ", "\t\n "):
            assert is_missing_like(blank)

    def test_default_sentinels(self):
        """A sample of common placeholder tokens is recognised without configuration."""
        placeholders = ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)")
        for s in placeholders:
            assert is_missing_like(s), f"expected {s!r} to be missing-like"

    def test_case_insensitive(self):
        """Upper-, lower- and mixed-case spellings are all expected to match."""
        for spelling in ("N/A", "n/A", "NA", "na"):
            assert is_missing_like(spelling)

    def test_real_value_not_missing(self):
        """Ordinary values, including zero-like ones, must not be flagged."""
        for real_value in ("hello", "0", 0, 0.0):
            assert not is_missing_like(real_value)

    def test_zero_is_not_missing(self):
        """Falsy-but-real values must not be flagged."""
        # Common bug: treating 0 / "0" / False as missing.
        for falsy in (0, False):
            assert not is_missing_like(falsy)

    def test_custom_sentinels_override(self):
        """An explicit ``sentinels`` list decides whether a token matches."""
        token = "xx"
        assert is_missing_like(token, sentinels=["xx"])
        assert not is_missing_like(token, sentinels=["zz"])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# detect_sentinels
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestDetectSentinels:
    """Expectations for the per-label counts returned by ``detect_sentinels``."""

    def test_counts_by_label(self):
        """Sentinel tokens and whitespace-only cells are tallied separately."""
        series = pd.Series(["alice", "N/A", "n/a", "NULL", " ", "", "bob"])
        counts = detect_sentinels(series)
        # "n/a" matches both 'N/A' and 'n/a' under casefold; the canonical
        # label that wins is whichever is in the DEFAULT_SENTINELS list.
        # So only the total across the non-whitespace labels is asserted.
        non_whitespace_total = sum(
            count for label, count in counts.items() if label != "(whitespace)"
        )
        assert non_whitespace_total == 3
        assert counts["(whitespace)"] == 2

    def test_skips_real_nan(self):
        """A genuine NaN is not counted; only the 'N/A' string is."""
        series = pd.Series(["a", np.nan, "N/A"])
        counts = detect_sentinels(series)
        assert sum(counts.values()) == 1

    def test_no_sentinels_returns_empty(self):
        """A series without sentinel tokens yields an empty mapping."""
        series = pd.Series(["alice", "bob", "charlie"])
        assert detect_sentinels(series) == {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# profile_missing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestProfileMissing:
    """Expectations for the report produced by ``profile_missing``."""

    def test_basic(self):
        """Sentinels and real NaNs both count towards the per-column totals."""
        frame = pd.DataFrame({
            "name": ["Alice", "Bob", "N/A", "", "Charlie"],
            "age": [30, None, 25, 40, np.nan],
        })
        profile = profile_missing(frame, MissingOptions())
        assert profile.rows_total == 5
        # name: '' + 'N/A' = 2 sentinels; age: 2 NaN
        by_column = {entry.column: entry for entry in profile.columns}
        assert by_column["name"].missing == 2
        assert by_column["age"].missing == 2
        assert profile.cells_missing == 4

    def test_complete_dataframe(self):
        """A frame with no gaps reports zero missing cells and all rows complete."""
        frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        profile = profile_missing(frame, MissingOptions())
        assert profile.cells_missing == 0
        assert profile.rows_complete == 3
        assert profile.rows_with_any_missing == 0

    def test_to_dataframe_columns(self):
        """``to_dataframe`` exposes at least the core report columns."""
        frame = pd.DataFrame({"x": [1, None]})
        profile = profile_missing(frame, MissingOptions())
        table = profile.to_dataframe()
        required = {"column", "missing", "missing_pct", "top_sentinel"}
        assert required <= set(table.columns)

    def test_disabled_sentinels_only_counts_real_nan(self):
        """With ``standardize_sentinels=False`` only the real NaN is counted."""
        frame = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        options = MissingOptions(standardize_sentinels=False)
        profile = profile_missing(frame, options)
        by_column = {entry.column: entry for entry in profile.columns}
        # Only the real NaN counts; 'N/A' is left alone.
        assert by_column["x"].missing == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# handle_missing — sentinel standardization
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestSentinelStandardization:
    """Expectations for the sentinel-to-NaN step of ``handle_missing``."""

    def test_replaces_sentinels_with_nan(self):
        """Sentinel and whitespace-only cells become NaN; real values survive."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "-", " ", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        cleaned = result.handled_df
        # 'N/A' + '-' + whitespace-only = 3
        assert result.sentinels_standardized == 3
        assert cleaned["x"].isna().sum() == 3
        assert cleaned.iloc[0]["x"] == "alice"
        assert cleaned.iloc[4]["x"] == "bob"

    def test_audit_records_each_replacement(self):
        """One replacement yields one audit row whose action starts with 'standardize:'."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        audit = result.changes
        assert len(audit) == 1
        assert audit.iloc[0]["action"].startswith("standardize:")

    def test_disabled_keeps_sentinels(self):
        """With standardization off, the 'N/A' cell is left as-is."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions(standardize_sentinels=False, strategy="none")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 0
        assert result.handled_df.iloc[1]["x"] == "N/A"

    def test_custom_sentinels_extend_default(self):
        """A caller-supplied token appended to the defaults is standardized."""
        frame = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
        extended = [*DEFAULT_SENTINELS, "MISSING_DATA"]
        options = MissingOptions(sentinels=extended, strategy="none")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# handle_missing — fill strategies
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestFillStrategies:
    """Expectations for each fill strategy accepted by ``handle_missing``."""

    @pytest.fixture
    def numeric_df(self):
        """Single float column with two gaps (positions 2 and 4)."""
        return pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0, np.nan]})

    def test_mean(self, numeric_df):
        """Gaps are filled with the mean of the observed values."""
        result = handle_missing(numeric_df, MissingOptions(strategy="mean"))
        # mean of [1, 2, 4] = 7/3
        expected_mean = 7.0 / 3.0
        filled_cell = result.handled_df["x"].iloc[2]
        assert abs(filled_cell - expected_mean) < 1e-9
        assert result.cells_filled == 2

    def test_median(self, numeric_df):
        """Gaps are filled with the median of the observed values."""
        result = handle_missing(numeric_df, MissingOptions(strategy="median"))
        # median of [1, 2, 4] = 2.0
        assert result.handled_df["x"].iloc[2] == 2.0

    def test_mode(self):
        """Gaps are filled with the most frequent observed value."""
        frame = pd.DataFrame({"x": ["a", "a", "b", None, None]})
        result = handle_missing(frame, MissingOptions(strategy="mode"))
        column = result.handled_df["x"]
        assert column.iloc[3] == "a"
        assert column.iloc[4] == "a"
        assert result.cells_filled == 2

    def test_constant_scalar(self, numeric_df):
        """A single ``fill_value`` is written into every gap."""
        options = MissingOptions(strategy="constant", fill_value=99.0)
        result = handle_missing(numeric_df, options)
        column = result.handled_df["x"]
        assert column.iloc[2] == 99.0
        assert column.iloc[4] == 99.0

    def test_constant_per_column(self):
        """``column_fill_values`` supplies a different constant per column."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
        options = MissingOptions(
            strategy="constant",
            column_fill_values={"a": 0, "b": "?"},
        )
        result = handle_missing(frame, options)
        assert result.handled_df["a"].iloc[1] == 0
        assert result.handled_df["b"].iloc[1] == "?"

    def test_ffill(self):
        """Forward fill carries the previous observed value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="ffill"))
        assert list(result.handled_df["x"]) == [1.0, 1.0, 1.0, 4.0]

    def test_bfill(self):
        """Backward fill carries the next observed value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="bfill"))
        assert list(result.handled_df["x"]) == [1.0, 4.0, 4.0, 4.0]

    def test_interpolate(self):
        """Interpolation fills the gaps with evenly spaced values."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="interpolate"))
        assert list(result.handled_df["x"]) == [1.0, 2.0, 3.0, 4.0]

    def test_numeric_strategy_falls_back_for_categorical(self):
        """A string column under a numeric strategy uses ``categorical_strategy``."""
        frame = pd.DataFrame({"x": ["a", "a", None, "b"]})
        options = MissingOptions(strategy="median", categorical_strategy="mode")
        result = handle_missing(frame, options)
        assert result.strategy_per_column["x"] == "mode"
        assert result.handled_df["x"].iloc[2] == "a"

    def test_per_column_strategy_overrides_global(self):
        """An entry in ``column_strategies`` wins over the global strategy."""
        frame = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
        options = MissingOptions(
            strategy="median",
            column_strategies={"b": "constant"},
            fill_value="??",
        )
        result = handle_missing(frame, options)
        filled = result.handled_df
        assert filled["a"].iloc[1] == 1.0  # median of [1.0]
        assert filled["b"].iloc[1] == "??"

    def test_all_nan_column_safely_skipped(self):
        """A column with no observed values is left untouched by ``mean``."""
        frame = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
        result = handle_missing(frame, MissingOptions(strategy="mean"))
        assert result.cells_filled == 0
        assert result.handled_df["x"].isna().all()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# handle_missing — drops
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestDropStrategies:
    """Expectations for the row/column drop strategies of ``handle_missing``."""

    def test_drop_row_any_missing(self):
        """Threshold 0.0 drops every row that has at least one gap."""
        # Strict-greater: threshold 0.0 → drop any row with any missing.
        frame = pd.DataFrame({
            "a": [1, 2, np.nan, 4],
            "b": ["x", None, "z", "w"],
        })
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
        result = handle_missing(frame, options)
        # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean.
        assert result.rows_dropped == 2
        assert len(result.handled_df) == 2

    def test_drop_row_default_threshold_never_drops(self):
        """Without an explicit threshold even a fully-missing row is kept."""
        # Default 1.0 = never drop — no fraction exceeds 100%.
        frame = pd.DataFrame({
            "a": [1, 2, np.nan],
            "b": ["x", "y", None],
        })
        options = MissingOptions(strategy="drop_row")  # threshold defaults to 1.0
        result = handle_missing(frame, options)
        assert result.rows_dropped == 0

    def test_drop_row_partial_threshold(self):
        """Threshold 0.5 drops only rows that are more than half missing."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan, np.nan],
            "b": [10, 20, np.nan, np.nan],
            "c": [100, 200, np.nan, 400],
        })
        # Strict-greater: threshold 0.5 → drop rows with > 50% missing.
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
        result = handle_missing(frame, options)
        # row 0: 0/3, row 1: 1/3 (0.33) -> keep
        # row 2: 3/3 (1.0) -> drop, row 3: 2/3 (0.67) -> drop
        assert result.rows_dropped == 2

    def test_drop_col_threshold(self):
        """Threshold 0.5 drops the 75%-missing column and keeps the full one."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4],
            "drop_me": [np.nan, np.nan, np.nan, 1],  # 75% missing
        })
        # Strict-greater: 0.5 → drop columns with > 50% missing.
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
        result = handle_missing(frame, options)
        dropped = result.columns_dropped
        assert "drop_me" in dropped
        assert "keep" not in dropped

    def test_drop_both(self):
        """``drop_both`` removes the empty column, then rows with remaining gaps."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4, 5],
            "drop_col": [np.nan] * 5,
            "x": [1, np.nan, 3, np.nan, 5],
        })
        options = MissingOptions(
            strategy="drop_both",
            col_drop_threshold=0.99,  # >99% missing → drop column
            row_drop_threshold=0.0,  # any missing in remaining cols → drop row
        )
        result = handle_missing(frame, options)
        # drop_col is 100% missing → dropped
        assert "drop_col" in result.columns_dropped
        # Remaining scope (keep + x): rows 1 and 3 have a missing x → drop.
        assert result.rows_dropped == 2

    def test_drop_audit_records_dropped_rows(self):
        """Each dropped row shows up as one 'drop_row' entry in the audit."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
        # Drop the fully-missing row (frac > 0.99).
        options = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
        result = handle_missing(frame, options)
        audit = result.changes
        is_row_drop = audit["action"] == "drop_row"
        assert len(audit[is_row_drop]) == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scope: columns / skip_columns
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestScope:
    """Expectations for the ``columns`` / ``skip_columns`` scoping options."""

    def test_columns_filter(self):
        """Only the columns listed in ``columns`` are filled."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
        filled = handle_missing(frame, options).handled_df
        assert filled["a"].iloc[0] == 99
        # b should be untouched
        assert pd.isna(filled["b"].iloc[0])

    def test_skip_columns(self):
        """Columns listed in ``skip_columns`` keep their gaps."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(
            skip_columns=["b"], strategy="constant", fill_value=99
        )
        filled = handle_missing(frame, options).handled_df
        assert filled["a"].iloc[0] == 99
        assert pd.isna(filled["b"].iloc[0])

    def test_unknown_column_raises(self):
        """Naming a column absent from the frame raises ``InputValidationError``."""
        frame = pd.DataFrame({"a": [1]})
        options = MissingOptions(columns=["does_not_exist"])
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Presets / config
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestPresets:
    """Expectations for named presets and config-file round-tripping."""

    def test_detect_only_does_not_fill(self):
        """'detect-only' standardizes the sentinel but neither fills nor drops."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        options = MissingOptions.from_preset("detect-only")
        result = handle_missing(frame, options)
        assert result.sentinels_standardized == 1
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

    def test_safe_fill_fills(self):
        """'safe-fill' fills the one numeric and the one string gap."""
        frame = pd.DataFrame(
            {"age": [30, np.nan, 25, 40], "name": ["a", "a", None, "b"]}
        )
        options = MissingOptions.from_preset("safe-fill")
        result = handle_missing(frame, options)
        assert result.cells_filled == 2

    def test_drop_incomplete(self):
        """'drop-incomplete' removes the single row that has a gap."""
        frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
        options = MissingOptions.from_preset("drop-incomplete")
        result = handle_missing(frame, options)
        assert result.rows_dropped == 1

    def test_unknown_preset_raises(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MissingOptions.from_preset("does-not-exist")

    def test_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` load back with the same strategy settings."""
        original = MissingOptions.from_preset("safe-fill")
        original.column_strategies = {"age": "median"}
        config_path = tmp_path / "cfg.json"
        original.to_file(config_path)
        restored = MissingOptions.from_file(config_path)
        assert restored.strategy == original.strategy
        assert restored.column_strategies == original.column_strategies
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Validation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestValidate:
    """Expectations for option and input validation errors."""

    def test_invalid_strategy(self):
        """An unknown strategy name fails ``validate`` with ``InputValidationError``."""
        bad_options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            bad_options.validate()

    def test_threshold_out_of_range(self):
        """A row-drop threshold of 1.5 fails ``validate`` with ``ConfigError``."""
        bad_options = MissingOptions(row_drop_threshold=1.5)
        with pytest.raises(ConfigError):
            bad_options.validate()

    def test_handle_missing_validates(self):
        """``handle_missing`` raises the same error for an unknown strategy."""
        frame = pd.DataFrame({"x": [1]})
        bad_options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            handle_missing(frame, bad_options)

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame raises ``InputValidationError``."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            handle_missing(not_a_frame)  # type: ignore[arg-type]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# End-to-end realistic case
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEndToEnd:
    """End-to-end checks on a realistic, messy input frame."""

    def test_messy_customer_export(self):
        """Standardize sentinels, then fill numeric gaps by median and text gaps by constant."""
        df = pd.DataFrame({
            "customer_id": [1, 2, 3, 4, 5, 6],
            "name": ["Alice", "Bob", "N/A", " ", "Charlie", None],
            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
            "age": [30, np.nan, 25, 40, np.nan, 50],
        })
        opts = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="constant",
            fill_value="UNKNOWN",
        )
        res = handle_missing(df, opts)
        handled = res.handled_df

        # Sentinels: name "N/A"," ",None; email "-","NULL". (None is real-NaN, not sentinel.)
        # Whitespace + 'N/A' on name = 2; '-' + 'NULL' on email = 2. Total = 4.
        assert res.sentinels_standardized == 4
        # name has 3 missing after standardize (N/A, " ", None) → constant fill
        # email has 2 missing → constant fill
        # age has 2 missing → median (35.0 of [30, 25, 40, 50]: midpoint of 30 and 40)
        assert res.cells_filled == 7
        assert handled["name"].isna().sum() == 0
        assert handled["email"].isna().sum() == 0
        assert handled["age"].isna().sum() == 0
        assert (handled["name"] == "UNKNOWN").sum() == 3
        assert (handled["age"] == 35.0).sum() == 2  # median of [30, 25, 40, 50]

    def test_input_not_mutated(self):
        """The caller's frame must compare equal to a pre-call copy afterwards."""
        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        df_copy = df.copy()
        handle_missing(df, MissingOptions.from_preset("safe-fill"))
        pd.testing.assert_frame_equal(df, df_copy)
|