Files
datatools-dev/tests/test_missing_corpus.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

464 lines
19 KiB
Python

"""Acceptance corpus for the Missing Value Handler.
Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
asserts the documented behaviour. The fixtures are split into:
* ``uc##`` — three target-client use cases (Shopify operator,
marketing analyst, consultant intake).
* ``ec##`` — edge cases the engine must handle without surprise:
all-NaN columns, zeros that aren't missing, Excel errors, unicode
whitespace, mixed dtypes, padding, single row/column, every default
sentinel, per-column constants, drop thresholds, leading-NaN ffill,
numeric-strategy fallback for non-numeric columns, headers-only,
idempotency.
Each test runs through the public API (``handle_missing``) so any
regression in the engine surfaces here. Fixture files double as living
documentation for what the tool is supposed to do.
"""
from __future__ import annotations
import io
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from src.core.missing import (
MissingOptions,
handle_missing,
is_missing_like,
profile_missing,
)
# Corpus root: "<parent of this file's directory>/test-cases/missing-corpus".
CORPUS = Path(__file__).resolve().parents[1].joinpath("test-cases", "missing-corpus")
# Directory containing the CSV fixtures that the tests load by file name.
TEST_DATA = CORPUS.joinpath("test_data")
def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
    """Read the corpus fixture ``name`` from ``TEST_DATA`` into a DataFrame.

    With ``dtype_str=False`` (the default) the file goes through a plain
    ``pd.read_csv`` call, so pandas infers dtypes and applies its default
    NA parsing. With ``dtype_str=True`` every column is read as ``str``
    and default NA parsing is switched off, which keeps sentinel text
    visible as literal strings instead of being coerced away.
    """
    # Only the string-preserving mode needs extra reader arguments.
    reader_kwargs = {"dtype": str, "keep_default_na": False} if dtype_str else {}
    return pd.read_csv(TEST_DATA / name, **reader_kwargs)
# ---------------------------------------------------------------------------
# Use case 1 — Shopify operator: detect-only
# ---------------------------------------------------------------------------
class TestUC01ShopifyExport:
    """Use case 1: an SMB operator standardizes disguised nulls before reimport."""

    def test_detect_only_replaces_sentinels(self):
        """The detect-only preset converts sentinels to NaN and does nothing else."""
        source = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(source, MissingOptions.from_preset("detect-only"))

        # Sentinels are standardized, but nothing is filled or dropped.
        assert outcome.sentinels_standardized > 0
        assert outcome.cells_filled == 0
        assert outcome.rows_dropped == 0

        # (row position, column) pairs whose fixture text is a sentinel;
        # the trailing comment shows the raw value in the CSV.
        sentinel_cells = [
            (1, "phone"),            # 'N/A'
            (2, "city"),             # '-'
            (3, "total_orders"),     # 'NULL'
            (5, "phone"),            # whitespace-only
            (5, "last_order_date"),  # '(blank)'
            (6, "last_order_date"),  # '#N/A'
            (7, "phone"),            # 'n/a'
            (8, "city"),             # '?'
            (9, "total_orders"),     # '(none)'
        ]
        for row, col in sentinel_cells:
            cell = outcome.handled_df.iloc[row][col]
            assert pd.isna(cell), f"Expected NaN at row {row} col {col}, got {cell!r}"

    def test_real_values_preserved(self):
        """Genuine values in the first fixture row pass through unchanged."""
        source = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(source, MissingOptions.from_preset("detect-only"))

        first_row = outcome.handled_df.iloc[0]
        assert first_row["first_name"] == "Alice"
        assert first_row["email"] == "alice@shop.com"
        assert first_row["lifetime_value"] == "1240.50"

    def test_audit_log_complete(self):
        """The change log holds exactly one 'standardize:' row per sentinel."""
        source = _read("uc01_shopify_export.csv", dtype_str=True)
        outcome = handle_missing(source, MissingOptions.from_preset("detect-only"))

        audit = outcome.changes
        assert len(audit) == outcome.sentinels_standardized
        # Comparing against {True} also rejects an empty audit log.
        prefix_flags = {action.startswith("standardize:") for action in audit["action"]}
        assert prefix_flags == {True}
# ---------------------------------------------------------------------------
# Use case 2 — Marketing analyst: safe-fill
# ---------------------------------------------------------------------------
class TestUC02MarketingAudience:
    """Use case 2: a marketer fills numeric columns by median, categorical by mode."""

    def test_safe_fill_clears_all_missing(self):
        """After safe-fill the post-run profile reports no missing cells."""
        audience = _read("uc02_marketing_audience.csv")
        outcome = handle_missing(audience, MissingOptions.from_preset("safe-fill"))

        assert outcome.profile_after.cells_missing == 0
        assert outcome.cells_filled > 0

    def test_numeric_uses_median_categorical_uses_mode(self):
        """The strategy chosen per column follows the column's dtype."""
        audience = _read("uc02_marketing_audience.csv")
        outcome = handle_missing(audience, MissingOptions.from_preset("safe-fill"))

        chosen = outcome.strategy_per_column
        # 'age' is numeric, so the median strategy applies.
        assert chosen["age"] == "median"
        # 'segment' and 'region' are object columns, so mode is the fallback.
        assert chosen["segment"] == "mode"
        assert chosen["region"] == "mode"

    def test_per_column_override(self):
        """A per-column constant strategy overrides the preset for that column."""
        audience = _read("uc02_marketing_audience.csv")
        options = MissingOptions.from_preset("safe-fill")
        options.column_strategies = {"source": "constant"}
        options.column_fill_values = {"source": "unknown"}

        outcome = handle_missing(audience, options)

        # Cells in 'source' that held sentinels should now read "unknown".
        filled_with_constant = outcome.handled_df["source"] == "unknown"
        assert filled_with_constant.sum() >= 3

    def test_consent_real_false_not_dropped(self):
        """Mode fill must not turn an existing "true" consent into anything else."""
        # NOTE(review): the fixture is loaded with inferred dtypes. If pandas
        # parses 'consent' as booleans rather than the string "true", both
        # counts below are 0 and the check is vacuous - confirm against the
        # fixture.
        audience = _read("uc02_marketing_audience.csv")
        outcome = handle_missing(audience, MissingOptions.from_preset("safe-fill"))

        trues_before = (audience["consent"] == "true").sum()
        trues_after = (outcome.handled_df["consent"] == "true").sum()
        # Filled cells may become "true" (the mode), so the count can only grow.
        assert trues_after >= trues_before
# ---------------------------------------------------------------------------
# Use case 3 — Consultant intake: threshold drops + fill
# ---------------------------------------------------------------------------
class TestUC03ConsultantIntake:
    """Use case 3: drop sparse columns and rows, then work with the survivors."""

    def test_drop_col_removes_legacy_fields(self):
        """Columns that are 100% missing are dropped at a 0.99 threshold."""
        intake = _read("uc03_consultant_intake.csv", dtype_str=True)
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_col",
            col_drop_threshold=0.99,
        )

        dropped = handle_missing(intake, options).columns_dropped

        # internal_id_legacy and beta_field are entirely missing in the fixture.
        assert "internal_id_legacy" in dropped
        assert "beta_field" in dropped

    def test_drop_row_removes_mostly_empty_respondents(self):
        """drop_both removes mostly-empty respondents and keeps the rest."""
        intake = _read("uc03_consultant_intake.csv", dtype_str=True)
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_both",
            col_drop_threshold=0.99,  # drop the legacy / beta columns first
            row_drop_threshold=0.5,   # then drop rows with >50% missing
        )

        outcome = handle_missing(intake, options)

        # R-002, R-005, R-007 and R-010 are the mostly-empty respondents.
        assert outcome.rows_dropped >= 4

        # Respondents with real answers must still be present.
        remaining = set(outcome.handled_df["respondent_id"].tolist())
        for respondent in ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012"):
            assert respondent in remaining
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEC01AllNanColumn:
    """Edge case: a column in which every cell is missing."""

    def test_fill_skips_all_nan_column(self):
        """Mean fill must not invent a value for an all-NaN column."""
        frame = _read("ec01_all_nan_column.csv")
        outcome = handle_missing(frame, MissingOptions(strategy="mean"))

        # The mean of an all-NaN column is NaN, so the column stays empty.
        assert outcome.handled_df["deprecated_field"].isna().all()
        assert outcome.cells_filled == 0

    def test_drop_col_catches_all_nan(self):
        """drop_col at 0.99 removes the all-NaN column and keeps 'name'."""
        frame = _read("ec01_all_nan_column.csv")
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.99)

        dropped = handle_missing(frame, options).columns_dropped

        assert "deprecated_field" in dropped
        assert "name" not in dropped
class TestEC02NoMissing:
    """Edge case: a file with nothing missing."""

    def test_clean_file_is_noop(self):
        """safe-fill records no changes and returns a frame equal to the input."""
        pristine = _read("ec02_no_missing.csv")
        outcome = handle_missing(pristine, MissingOptions.from_preset("safe-fill"))

        assert outcome.sentinels_standardized == 0
        assert outcome.cells_filled == 0
        assert outcome.rows_dropped == 0
        pd.testing.assert_frame_equal(outcome.handled_df, pristine)
class TestEC03ZeroIsNotMissing:
    """Edge case: zeros are real data, not missing values."""

    def test_zero_preserved(self):
        """safe-fill leaves every zero in place and records no changes."""
        frame = _read("ec03_zero_is_not_missing.csv")
        outcome = handle_missing(frame, MissingOptions.from_preset("safe-fill"))

        # The number of zeros per column is the same before and after.
        for column in ("balance", "count"):
            zeros_after = (outcome.handled_df[column] == 0).sum()
            zeros_before = (frame[column] == 0).sum()
            assert zeros_after == zeros_before

        # No fills or sentinel replacements may be reported.
        assert outcome.cells_filled == 0
        assert outcome.sentinels_standardized == 0

    def test_is_missing_like_zero_predicate(self):
        """Zero, False, and zero-like strings are not missing for the predicate."""
        for candidate in (0, 0.0, False, "0", "0.00"):
            assert not is_missing_like(candidate)
class TestEC04ExcelErrors:
    """Edge case: Excel error strings used as sentinels."""

    def test_excel_error_sentinels_recognized(self):
        """Every Excel error sentinel in the fixture is standardized."""
        frame = _read("ec04_excel_errors.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))

        # The fixture holds six: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL!
        expected_sentinels = 6
        assert outcome.sentinels_standardized == expected_sentinels
class TestEC05UnicodeWhitespace:
    """Edge case: cells that contain only unicode whitespace."""

    def test_nbsp_and_ideographic_space_count_as_missing(self):
        """Whitespace-only notes become NaN; real notes are kept."""
        frame = _read("ec05_unicode_whitespace.csv", dtype_str=True)
        notes = handle_missing(frame, MissingOptions(strategy="none")).handled_df["note"]

        # Rows 1, 2 and 4 hold NBSP / tab / ideographic space respectively.
        assert notes.isna().sum() == 3
        assert notes.iloc[0] == "hello"
        assert notes.iloc[3] == "real"
class TestEC06MixedDtypes:
    """Edge case: a column mixing numbers with text."""

    def test_mixed_column_falls_back_to_mode(self):
        """Median applies to the float column; the mixed column uses mode."""
        # Native dtypes are used so that 'real_num' stays numeric.
        frame = _read("ec06_mixed_dtypes.csv")
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="mode",
        )

        chosen = handle_missing(frame, options).strategy_per_column

        # mixed_col carries 'N/A' / 'hello' next to numbers, so it is object
        # dtype and the median request falls back to mode.
        assert chosen["mixed_col"] == "mode"
        # real_num is float dtype, so median runs as requested.
        assert chosen["real_num"] == "median"
class TestEC07RealDataWithPadding:
    """Edge case: real values surrounded by padding whitespace."""

    def test_padded_real_data_not_treated_as_missing(self):
        """Padded values are kept verbatim rather than standardized to NaN."""
        frame = _read("ec07_real_data_with_padding.csv", dtype_str=True)
        handled = handle_missing(frame, MissingOptions(strategy="none")).handled_df

        # Only row 1 (whitespace-only name) and row 2 (blank city) should
        # become NaN; the padded values below must survive untouched.
        names = handled["name"]
        assert names.iloc[0] == " Alice "
        assert names.iloc[2] == " Bob "
        assert handled["city"].iloc[3] == " SF"
class TestEC08SingleRow:
    """Edge case: a file with exactly one body row."""

    def test_single_row_handles_cleanly(self):
        """Detection and safe-fill both work on a one-row frame."""
        frame = _read("ec08_single_row.csv", dtype_str=True)

        # Detection only: the row holds two sentinels, 'N/A' and ''.
        detected = handle_missing(frame, MissingOptions(strategy="none"))
        assert detected.sentinels_standardized == 2

        # safe-fill: the median/mode of a single value is that value itself.
        filled = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert filled.handled_df["name"].iloc[0] == "Alice"
class TestEC09SingleColumn:
    """Edge case: a file with exactly one column."""

    def test_single_column_works(self):
        """All three sentinels in the lone column are standardized to NaN."""
        frame = _read("ec09_single_column.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))

        # 'N/A', a whitespace-only cell, and '-' make three sentinels.
        expected = 3
        assert outcome.sentinels_standardized == expected
        assert outcome.handled_df["value"].isna().sum() == expected
class TestEC10AllSentinelVariants:
    """Edge case: one fixture row per default sentinel spelling."""

    def test_every_default_sentinel_recognized(self):
        """All 20 sentinels are standardized; the single real value is kept."""
        frame = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))

        # The fixture contains 20 sentinels plus 1 real value.
        assert outcome.sentinels_standardized == 20

        # The 'real_value' row must still be there.
        surviving = outcome.handled_df["sentinel_value"] == "real_value"
        assert surviving.sum() == 1
class TestEC11ConstantPerColumn:
    """Edge case: constant fill with a different value per column."""

    def test_per_column_fill_values(self):
        """Each column's blanks receive that column's own constant."""
        frame = _read("ec11_constant_per_column.csv", dtype_str=True)
        fill_values = {
            "country": "USA",
            "salary": "0",
            "department": "Unassigned",
        }
        options = MissingOptions(strategy="constant", column_fill_values=fill_values)

        handled = handle_missing(frame, options).handled_df

        # The fixture has 1 UK row, 2 USA rows and 2 blanks. Filling the
        # blanks with "USA" gives 4 USA rows, and the UK row is untouched.
        country = handled["country"]
        assert (country == "USA").sum() == 4
        assert (country == "UK").sum() == 1
        assert (handled["department"] == "Unassigned").sum() >= 2
class TestEC12DropThresholdBoundary:
    """Edge case: row-drop thresholds at and around their boundaries.

    The expectations below rely on a strict-greater comparison: a row is
    dropped only when its missing fraction exceeds the threshold.
    """

    @staticmethod
    def _rows_dropped(threshold, scope=None):
        """Run drop_row on the boundary fixture and return ``rows_dropped``.

        ``scope`` is passed as ``columns`` only when given, so the
        unscoped case uses the ``MissingOptions`` default.
        """
        frame = _read("ec12_drop_threshold_boundary.csv")
        option_kwargs = {"strategy": "drop_row", "row_drop_threshold": threshold}
        if scope is not None:
            option_kwargs["columns"] = scope
        return handle_missing(frame, MissingOptions(**option_kwargs)).rows_dropped

    def test_threshold_one_never_drops(self):
        # No fraction can exceed 1.0, so nothing is ever dropped.
        assert self._rows_dropped(1.0) == 0

    def test_threshold_just_under_one_drops_fully_missing(self):
        # At 0.99 only fully-missing rows qualify (frac > 0.99 means
        # frac == 1.0). 'id' is kept out of the scope.
        dropped = self._rows_dropped(0.99, ["a", "b", "c", "d"])
        # Only row 3 (id=4, all four scoped cells NaN) qualifies.
        assert dropped == 1

    def test_threshold_half_drops_majority_missing(self):
        dropped = self._rows_dropped(0.5, ["a", "b", "c", "d"])
        # Missing fractions across [a, b, c, d]:
        #   row 0: 0/4 = 0.0   keep
        #   row 1: 2/4 = 0.5   keep (strict >, not equal)
        #   row 2: 3/4 = 0.75  drop
        #   row 3: 4/4 = 1.0   drop
        #   row 4: 2/4 = 0.5   keep
        assert dropped == 2

    def test_threshold_zero_drops_any_missing(self):
        dropped = self._rows_dropped(0.0, ["a", "b", "c", "d"])
        # Every body row except row 0 has at least one missing cell.
        assert dropped == 4
class TestEC13FfillLeadingNan:
    """Edge case: forward fill when the series starts with NaN."""

    def test_leading_nan_run_survives_ffill(self):
        """Leading NaNs stay NaN; later gaps take the last seen value."""
        frame = _read("ec13_ffill_leading_nan.csv")
        price = handle_missing(frame, MissingOptions(strategy="ffill")).handled_df["price"]

        # The first two rows have nothing before them to fill from.
        for position in (0, 1):
            assert pd.isna(price.iloc[position])

        # Gaps in the middle of the series are filled forward.
        assert price.iloc[3] == 100.0
        assert price.iloc[4] == 100.0

        # The trailing NaN takes the last value seen.
        assert price.iloc[6] == 150.0
class TestEC14InterpolateFallback:
    """Edge case: interpolation requested on non-numeric columns."""

    def test_interpolate_on_non_numeric_falls_back(self):
        """Object-dtype columns use the categorical strategy instead."""
        frame = _read("ec14_interpolate_fallback.csv", dtype_str=True)
        options = MissingOptions(strategy="interpolate", categorical_strategy="mode")

        chosen = handle_missing(frame, options).strategy_per_column

        # Read as strings, every column is object dtype, so mode is used.
        for column in ("category", "value"):
            assert chosen[column] == "mode"
class TestEC15HeadersOnly:
    """Edge case: a CSV with a header row and no body rows."""

    @pytest.mark.parametrize(
        "preset", ["detect-only", "safe-fill", "drop-incomplete"]
    )
    def test_empty_body_does_not_crash(self, preset):
        """Each preset must be a no-op on an empty body.

        The test is parametrized per preset, not looped inside one test,
        so a failure in one preset cannot hide the others and the failing
        preset appears in the test id.
        """
        df = _read("ec15_headers_only.csv")
        res = handle_missing(df, MissingOptions.from_preset(preset))

        assert len(res.handled_df) == 0
        assert res.cells_filled == 0
        assert res.rows_dropped == 0
class TestEC16Idempotency:
    """Edge case: a second run over already-handled data changes nothing."""

    def test_safe_fill_is_idempotent(self):
        """A second safe-fill pass yields an equal frame and zero changes."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("safe-fill")

        pass_one = handle_missing(frame, options)
        pass_two = handle_missing(pass_one.handled_df, options)

        # Indexes are reset on both sides so only the contents are compared.
        after_second = pass_two.handled_df.reset_index(drop=True)
        after_first = pass_one.handled_df.reset_index(drop=True)
        pd.testing.assert_frame_equal(after_second, after_first)

        assert pass_two.cells_filled == 0
        assert pass_two.sentinels_standardized == 0

    def test_detect_only_is_idempotent(self):
        """A second detect-only pass finds no sentinels left to standardize."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("detect-only")

        pass_one = handle_missing(frame, options)
        pass_two = handle_missing(pass_one.handled_df, options)

        assert pass_two.sentinels_standardized == 0
# ---------------------------------------------------------------------------
# Whole-corpus property tests
# ---------------------------------------------------------------------------
# File names of every CSV directly inside TEST_DATA, sorted so that the
# parametrized test ids come out in a stable order.
ALL_FIXTURES = sorted([csv_path.name for csv_path in TEST_DATA.glob("*.csv")])
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_handle_missing_does_not_mutate_input(fixture):
    """``handle_missing`` must leave the caller's DataFrame unchanged."""
    # String mode keeps the sentinel text in the frame handed to the engine.
    original = _read(fixture, dtype_str=True)
    if len(original.columns) == 0:
        pytest.skip(f"{fixture}: completely empty file")

    # Take a copy before the call, then compare the input against it.
    before = original.copy()
    handle_missing(original, MissingOptions.from_preset("safe-fill"))
    pd.testing.assert_frame_equal(original, before)
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_profile_runs_on_every_fixture(fixture):
    """``profile_missing`` runs on every fixture and reports matching totals."""
    frame = _read(fixture, dtype_str=True)
    profile = profile_missing(frame, MissingOptions())

    # The totals must agree with the frame's own shape.
    row_count, column_count = frame.shape
    assert profile.rows_total == row_count
    assert profile.cells_total == row_count * column_count