feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions

View File

@@ -253,16 +253,20 @@ class TestEncodingOverride:
class TestEncodingDecodeFailedFromRepair:
def test_decode_replaced_action_surfaces_error_finding(self, tmp_path):
# Create a file with a UTF-8 BOM but cp1252 body bytes — utf-8-sig
# fails on byte 0x80 (€ in cp1252).
def test_lying_bom_recovered_and_flagged(self, tmp_path):
# File has a UTF-8 BOM but the body bytes are cp1252 (0x80 = € in
# cp1252; not a valid UTF-8 continuation byte). Detector should
# recover transparently to cp1252 and surface an
# ``encoding_lying_bom`` warn so the user knows.
f = tmp_path / "lying_bom.csv"
f.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")
findings = analyze(f)
ids = {x.id for x in findings}
assert "encoding_decode_failed" in ids
bad = next(x for x in findings if x.id == "encoding_decode_failed")
assert bad.severity == "error"
assert "encoding_lying_bom" in ids
bad = next(x for x in findings if x.id == "encoding_lying_bom")
assert bad.severity == "warn"
# Decode should have succeeded — no replacement-character finding.
assert "encoding_decode_failed" not in ids
class TestMixedLineEndings:

374
tests/test_column_mapper.py Normal file
View File

@@ -0,0 +1,374 @@
"""Tests for src/core/column_mapper.py."""
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import pytest
from src.core.errors import ConfigError, InputValidationError
from src.core.column_mapper import (
MapOptions,
PRESETS,
TargetField,
TargetSchema,
coerce_series,
infer_mapping,
map_columns,
)
# ---------------------------------------------------------------------------
# infer_mapping — fuzzy matcher
# ---------------------------------------------------------------------------
class TestInferMapping:
    """Exercises ``infer_mapping``, which pairs source headers with schema targets."""

    def test_exact_normalized_match(self):
        """Headers that differ only by case and separator map to their targets."""
        frame = pd.DataFrame({"First Name": [], "Last Name": []})
        targets = [TargetField(name="first_name"), TargetField(name="last_name")]
        inferred = infer_mapping(frame, TargetSchema(fields=targets))
        expected = {"First Name": "first_name", "Last Name": "last_name"}
        assert inferred == expected

    def test_alias_match(self):
        """A header listed in a field's ``aliases`` resolves to that field."""
        email = TargetField(name="email", aliases=["EmailAddr", "email_address"])
        frame = pd.DataFrame({"EmailAddr": []})
        inferred = infer_mapping(frame, TargetSchema(fields=[email]))
        assert inferred == {"EmailAddr": "email"}

    def test_below_threshold_excluded(self):
        """A header unrelated to the only target produces an empty mapping."""
        frame = pd.DataFrame({"xyz": []})
        target = TargetSchema(fields=[TargetField(name="email")])
        assert infer_mapping(frame, target, threshold=0.6) == {}

    def test_target_matched_at_most_once(self):
        """Two candidate headers compete for one target; only one may claim it."""
        frame = pd.DataFrame({"first_name": [], "fname": []})
        target = TargetSchema(fields=[TargetField(name="first_name")])
        inferred = infer_mapping(frame, target)
        # The exact header takes the target, leaving "fname" without a pairing.
        assert inferred == {"first_name": "first_name"}

    def test_threshold_zero_matches_anything(self):
        """With a zero threshold the single target is still claimed exactly once."""
        frame = pd.DataFrame({"a": [], "b": []})
        target = TargetSchema(fields=[TargetField(name="z")])
        inferred = infer_mapping(frame, target, threshold=0.0)
        assert len(inferred) == 1
# ---------------------------------------------------------------------------
# coerce_series
# ---------------------------------------------------------------------------
class TestCoerceSeries:
    """Exercises ``coerce_series``, which returns ``(coerced_series, failure_count)``."""

    def test_integer_clean(self):
        """Numeric strings all convert; no failures are counted."""
        coerced, n_failed = coerce_series(pd.Series(["1", "2", "3"]), "integer")
        assert list(coerced) == [1, 2, 3]
        assert n_failed == 0

    def test_integer_with_failure(self):
        """A non-numeric cell is counted as a failure and becomes missing."""
        coerced, n_failed = coerce_series(pd.Series(["1", "bad", "3"]), "integer")
        assert n_failed == 1
        assert pd.isna(coerced.iloc[1])

    def test_float_with_thousands_sep(self):
        """Plain decimal strings convert to floats.

        Despite the test name, no thousands separators are used here; that
        handling belongs to the format standardizer.
        """
        raw = pd.Series(["1.5", "2.0", "3.25"])
        coerced, n_failed = coerce_series(raw, "float")
        assert n_failed == 0
        assert coerced.iloc[2] == 3.25

    def test_boolean_truthy_falsy(self):
        """Mixed-case words and 1/0 digits convert to booleans."""
        raw = pd.Series(["true", "false", "Yes", "no", "1", "0"])
        coerced, n_failed = coerce_series(raw, "boolean")
        assert n_failed == 0
        expected = [True, False, True, False, True, False]
        assert list(coerced) == expected

    def test_boolean_unknown_value_fails(self):
        """An unrecognised token is a failure and becomes missing."""
        coerced, n_failed = coerce_series(pd.Series(["true", "maybe"]), "boolean")
        assert n_failed == 1
        assert pd.isna(coerced.iloc[1])

    def test_date_iso_format(self):
        """ISO ``YYYY-MM-DD`` strings parse into values exposing ``.year``."""
        raw = pd.Series(["2025-01-15", "2025-02-20"])
        coerced, n_failed = coerce_series(raw, "date")
        assert n_failed == 0
        assert coerced.iloc[0].year == 2025

    def test_date_failure(self):
        """An unparseable date is a failure and becomes missing."""
        raw = pd.Series(["2025-01-15", "garbage"])
        coerced, n_failed = coerce_series(raw, "date")
        assert n_failed == 1
        assert pd.isna(coerced.iloc[1])

    def test_string_passthrough(self):
        """Integers coerced to ``"string"`` come back with the ``string`` dtype."""
        coerced, n_failed = coerce_series(pd.Series([1, 2, 3]), "string")
        assert n_failed == 0
        assert coerced.dtype.name == "string"

    def test_auto_returns_unchanged(self):
        """``"auto"`` hands back the very same Series object."""
        raw = pd.Series([1, 2])
        coerced, n_failed = coerce_series(raw, "auto")
        assert n_failed == 0
        assert coerced is raw

    def test_unknown_dtype_raises(self):
        """An unsupported dtype name is rejected with ``InputValidationError``."""
        with pytest.raises(InputValidationError):
            coerce_series(pd.Series([1]), "bogus")  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# map_columns — explicit mapping
# ---------------------------------------------------------------------------
class TestMapColumnsExplicit:
    """``map_columns`` driven purely by an explicit ``mapping`` (no schema)."""

    def test_simple_rename(self):
        """Both columns are renamed and the rename count is reported."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        result = map_columns(frame, MapOptions(mapping={"a": "alpha", "b": "beta"}))
        assert list(result.mapped_df.columns) == ["alpha", "beta"]
        assert result.columns_renamed == 2

    def test_unknown_source_raises(self):
        """Mapping a source column that is absent from the frame is rejected."""
        frame = pd.DataFrame({"a": [1]})
        options = MapOptions(mapping={"missing": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_duplicate_target_raises(self):
        """Two sources pointing at the same target name are rejected."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_unmapped_keep(self):
        """``unmapped="keep"`` retains the extra column and reports it."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="keep")
        result = map_columns(frame, options)
        assert "b" in result.mapped_df.columns
        assert result.unmapped_kept == ["b"]

    def test_unmapped_drop(self):
        """``unmapped="drop"`` removes the extra column and reports it."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="drop")
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["alpha"]
        assert result.columns_dropped == ["b"]

    def test_unmapped_error(self):
        """``unmapped="error"`` rejects a frame that has an unmapped column."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="error")
        with pytest.raises(InputValidationError):
            map_columns(frame, options)
# ---------------------------------------------------------------------------
# map_columns — schema + auto-inference
# ---------------------------------------------------------------------------
class TestMapColumnsWithSchema:
    """``map_columns`` driven by a ``TargetSchema``, with and without inference."""

    def test_auto_infer_renames(self):
        """Inference renames both headers and records the inferred pairs."""
        df = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name"), TargetField(name="last_name"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True)
        res = map_columns(df, opts)
        assert "first_name" in res.mapped_df.columns
        assert "last_name" in res.mapped_df.columns
        assert res.inferred_pairs == {
            "First Name": "first_name",
            "Last Name": "last_name",
        }

    def test_explicit_overrides_inferred(self):
        """An explicit pair claims the target; the other candidate stays unmapped."""
        df = pd.DataFrame({"name": ["A"], "fname": ["B"]})
        schema = TargetSchema(fields=[TargetField(name="first_name")])
        opts = MapOptions(
            schema=schema,
            mapping={"fname": "first_name"},
            auto_infer=True,
        )
        res = map_columns(df, opts)
        assert res.mapping["fname"] == "first_name"
        assert "name" not in res.mapping

    def test_required_missing_raises(self):
        """A required target with no source raises when enforcement is on."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=True)
        with pytest.raises(InputValidationError):
            map_columns(df, opts)

    def test_required_missing_with_default_added(self):
        """A field with a ``default`` and no source is added, filled with it."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="source", required=False, default="import"),
        ])
        opts = MapOptions(schema=schema, auto_infer=False)
        res = map_columns(df, opts)
        assert "source" in res.mapped_df.columns
        assert res.mapped_df.iloc[0]["source"] == "import"
        assert res.columns_added == ["source"]

    def test_required_missing_disabled(self):
        """With enforcement off, the missing required target is reported instead."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=False)
        res = map_columns(df, opts)
        assert "email" in res.missing_required_targets

    def test_reorder_to_schema(self):
        """``reorder_to_schema`` emits columns in the schema's field order."""
        df = pd.DataFrame({"z": [1], "a": [2], "m": [3]})
        schema = TargetSchema(fields=[
            TargetField(name="a"), TargetField(name="m"), TargetField(name="z"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True)
        res = map_columns(df, opts)
        assert list(res.mapped_df.columns) == ["a", "m", "z"]

    def test_coerce_types(self):
        """``coerce_types`` converts per schema dtype and counts failures per column."""
        df = pd.DataFrame({
            "age": ["30", "bad", "40"],
            "active": ["true", "no", "yes"],
        })
        schema = TargetSchema(fields=[
            TargetField(name="age", dtype="integer"),
            TargetField(name="active", dtype="boolean"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True, coerce_types=True)
        res = map_columns(df, opts)
        assert res.mapped_df["age"].iloc[0] == 30
        # Check both polarities explicitly. The previous ``x is True or x``
        # form reduced to a truthiness check on row 0 only, so a coercion that
        # mapped everything to True would have passed unnoticed.
        active = res.mapped_df["active"]
        assert bool(active.iloc[0]) is True
        assert bool(active.iloc[1]) is False
        assert res.coercion_failures == {"age": 1}
# ---------------------------------------------------------------------------
# Presets
# ---------------------------------------------------------------------------
class TestPresets:
    """Named option presets obtained through ``MapOptions.from_preset``."""

    def test_strict_schema_drops_and_coerces_and_reorders(self):
        """``strict-schema`` keeps only schema columns, in schema order."""
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]})
        options = MapOptions.from_preset("strict-schema")
        options.schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["first_name", "email"]
        assert result.columns_dropped == ["extra"]

    def test_lenient_keeps_extras(self):
        """``lenient-schema`` leaves a non-schema column in the output."""
        frame = pd.DataFrame({"First Name": ["A"], "extra": [1]})
        options = MapOptions.from_preset("lenient-schema")
        options.schema = TargetSchema(fields=[TargetField(name="first_name")])
        result = map_columns(frame, options)
        assert "extra" in result.mapped_df.columns

    def test_unknown_preset(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MapOptions.from_preset("does-not-exist")
# ---------------------------------------------------------------------------
# Schema serialization
# ---------------------------------------------------------------------------
class TestSchemaIO:
    """Dict and file (de)serialisation of ``TargetSchema`` and ``MapOptions``."""

    def test_roundtrip_dict(self):
        """``to_dict`` followed by ``from_dict`` preserves names, required, default."""
        original = TargetSchema(fields=[
            TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]),
            TargetField(name="y", default="z"),
        ])
        restored = TargetSchema.from_dict(original.to_dict())
        assert restored.field_names() == ["x", "y"]
        first, second = restored.fields[0], restored.fields[1]
        assert first.required is True
        assert second.default == "z"

    def test_from_dict_string_field(self):
        """Bare strings are accepted as shorthand field entries."""
        # Allow shorthand: bare string defaults to dtype=auto.
        restored = TargetSchema.from_dict({"fields": ["a", "b"]})
        assert restored.field_names() == ["a", "b"]

    def test_from_dict_unknown_dtype_raises(self):
        """An unsupported ``dtype`` in a field entry raises ``ConfigError``."""
        payload = {"fields": [{"name": "x", "dtype": "bogus"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_from_dict_missing_name_raises(self):
        """A field entry without ``name`` raises ``ConfigError``."""
        payload = {"fields": [{"dtype": "string"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_options_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` load back with the same settings."""
        saved = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="x", dtype="string")]),
            mapping={"a": "x"},
            unmapped="drop",
            coerce_types=True,
            reorder_to_schema=True,
        )
        cfg_path = tmp_path / "cfg.json"
        saved.to_file(cfg_path)
        restored = MapOptions.from_file(cfg_path)
        assert restored.mapping == {"a": "x"}
        assert restored.unmapped == "drop"
        assert restored.coerce_types is True
        assert restored.schema is not None
        assert restored.schema.field_names() == ["x"]
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
class TestValidation:
    """Rejection of malformed options and non-DataFrame input."""

    def test_invalid_unmapped_strategy(self):
        """An unknown ``unmapped`` strategy fails ``validate()``."""
        bad_options = MapOptions(unmapped="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            bad_options.validate()

    def test_threshold_out_of_range(self):
        """A ``fuzzy_threshold`` of 1.5 fails ``validate()`` with ``ConfigError``."""
        bad_options = MapOptions(fuzzy_threshold=1.5)
        with pytest.raises(ConfigError):
            bad_options.validate()

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        with pytest.raises(InputValidationError):
            map_columns([1, 2, 3])  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# Idempotency
# ---------------------------------------------------------------------------
class TestIdempotency:
    """Re-running the mapper is stable, and the caller's frame is left alone."""

    def test_double_apply_is_stable(self):
        """Mapping already-mapped output with the same options changes nothing."""
        options = MapOptions(
            schema=TargetSchema(fields=[
                TargetField(name="first_name"),
                TargetField(name="email"),
            ]),
            auto_infer=True,
            reorder_to_schema=True,
        )
        source = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]})
        once = map_columns(source, options)
        twice = map_columns(once.mapped_df, options)
        pd.testing.assert_frame_equal(twice.mapped_df, once.mapped_df)

    def test_input_not_mutated(self):
        """The frame passed in compares equal to a deep copy taken beforehand."""
        source = pd.DataFrame({"a": [1], "b": [2]})
        before = source.copy(deep=True)
        map_columns(source, MapOptions(mapping={"a": "x"}))
        pd.testing.assert_frame_equal(source, before)

View File

@@ -0,0 +1,240 @@
"""Acceptance corpus for the Column Mapper.
Loads every fixture in ``test-cases/column-mapper-corpus/test_data/``
and asserts the documented behaviour against the documented schema.
"""
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
import pytest
from src.core.errors import InputValidationError
from src.core.column_mapper import (
MapOptions,
TargetField,
TargetSchema,
map_columns,
)
# Corpus root: two directory levels above this file, then
# ``test-cases/column-mapper-corpus``.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "column-mapper-corpus",
)
# CSV inputs and JSON target schemas live in sibling sub-directories.
TEST_DATA = CORPUS.joinpath("test_data")
SCHEMAS = CORPUS.joinpath("schemas")
def _read(name: str) -> pd.DataFrame:
    """Load the corpus CSV called *name* from ``TEST_DATA`` with pandas defaults."""
    csv_path = TEST_DATA / name
    return pd.read_csv(csv_path)
def _schema(name: str) -> TargetSchema:
    """Load the target schema file called *name* from ``SCHEMAS``."""
    schema_path = SCHEMAS / name
    return TargetSchema.from_file(schema_path)
# ---------------------------------------------------------------------------
# UC01 — CRM import
# ---------------------------------------------------------------------------
class TestUC01CrmImport:
    """UC01: a CRM export mapped onto its target schema with ``strict-schema``."""

    def test_strict_schema_round_trip(self):
        """Required fields, the ``owner`` default, dropping and ordering all hold."""
        source = _read("uc01_crm_import.csv")
        target = _schema("uc01_crm_target.json")
        options = MapOptions.from_preset("strict-schema")
        options.schema = target
        result = map_columns(source, options)
        mapped = result.mapped_df

        # Every required target is present after the run.
        required_names = [field.name for field in target.fields if field.required]
        for name in required_names:
            assert name in mapped.columns

        # The 'owner' column is added and filled with its default.
        assert "owner" in result.columns_added
        assert (mapped["owner"] == "unassigned").all()

        # Strict preset: nothing unmapped survives.
        assert result.unmapped_kept == []

        # Leading columns follow the schema's field order.
        schema_order = [field.name for field in target.fields]
        leading = list(mapped.columns)[: len(schema_order)]
        assert leading == schema_order

    def test_types_coerced_from_strings(self):
        """Revenue ends up integer-typed and the created date datetime-typed."""
        source = _read("uc01_crm_import.csv")
        options = MapOptions.from_preset("strict-schema")
        options.schema = _schema("uc01_crm_target.json")
        mapped = map_columns(source, options).mapped_df
        # annual_rev → integer (was numeric strings in the source).
        assert pd.api.types.is_integer_dtype(mapped["annual_rev"])
        # created_date → datetime64.
        assert pd.api.types.is_datetime64_any_dtype(mapped["created_date"])
# ---------------------------------------------------------------------------
# UC02 — Multi-vendor unification
# ---------------------------------------------------------------------------
class TestUC02MultiVendor:
    """UC02: three vendor exports are mapped onto one canonical schema."""

    @pytest.mark.parametrize("vendor", ["a", "b", "c"])
    def test_each_vendor_normalises_to_canonical(self, vendor):
        """Every required canonical field is present after mapping one vendor."""
        df = _read(f"uc02_vendor_{vendor}.csv")
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("lenient-schema")
        opts.schema = schema
        opts.fuzzy_threshold = 0.5  # vendor C uses obscure aliases ("FName", "Tel")
        res = map_columns(df, opts)
        # Every required canonical field landed in the output.
        for f in schema.fields:
            if f.required:
                assert f.name in res.mapped_df.columns, (
                    f"vendor {vendor}: missing {f.name}; mapping={res.mapping}"
                )

    def test_concatenated_vendors_share_schema(self):
        """Mapped vendor frames stack into one frame with the canonical columns.

        The point of unification: after each vendor goes through the mapper,
        the resulting frames stack cleanly and no source row is lost.
        """
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("strict-schema")
        opts.schema = schema
        opts.fuzzy_threshold = 0.5
        sources = [_read(f"uc02_vendor_{v}.csv") for v in ("a", "b", "c")]
        frames = [map_columns(src, opts).mapped_df for src in sources]
        unified = pd.concat(frames, ignore_index=True)
        assert list(unified.columns) == [f.name for f in schema.fields]
        # Total rows = sum of *inputs*. Summing the mapped frames instead would
        # hold for any concat and could never detect rows lost during mapping.
        assert len(unified) == sum(len(src) for src in sources)
# ---------------------------------------------------------------------------
# UC03 — Type coercion
# ---------------------------------------------------------------------------
class TestUC03TypeCoercion:
    """UC03: type coercion under the ``lenient-schema`` preset."""

    def test_documented_failures_are_reported(self):
        """Each of ``age``, ``score`` and ``joined`` records exactly one failure."""
        source = _read("uc03_type_coercion.csv")
        options = MapOptions.from_preset("lenient-schema")
        options.schema = _schema("uc03_types.json")
        failures = map_columns(source, options).coercion_failures
        # Bad rows survive as NaN, with counts recorded.
        assert failures.get("age") == 1
        assert failures.get("score") == 1
        assert failures.get("joined") == 1

    def test_coerced_dtypes(self):
        """Output dtypes follow the schema; the failed float cell is missing."""
        source = _read("uc03_type_coercion.csv")
        options = MapOptions.from_preset("lenient-schema")
        options.schema = _schema("uc03_types.json")
        mapped = map_columns(source, options).mapped_df
        assert pd.api.types.is_integer_dtype(mapped["id"])
        assert mapped["active"].dtype.name == "boolean"
        assert pd.api.types.is_datetime64_any_dtype(mapped["joined"])
        # Float failures NaN-ify.
        assert pd.isna(mapped["score"].iloc[1])
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEC01DuplicateTarget:
    """EC01: two source columns mapped to the same target name."""

    def test_two_sources_to_same_target_raises(self):
        """The colliding explicit mapping is rejected."""
        source = _read("ec01_duplicate_target.csv")
        colliding = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(source, colliding)
class TestEC02UnicodeColumns:
    """EC02: non-ASCII source headers in an explicit mapping."""

    def test_japanese_column_renamed(self):
        """Japanese headers are renamed; the unmapped ``Email`` column remains."""
        source = _read("ec02_unicode_columns.csv")
        options = MapOptions(mapping={"名前": "name", "価格": "price"})
        out_columns = map_columns(source, options).mapped_df.columns
        assert "name" in out_columns
        assert "price" in out_columns
        # Email passes through (unmapped, kept by default).
        assert "Email" in out_columns
class TestEC03WhitespaceHeaders:
    """EC03: source headers padded with leading/trailing whitespace."""

    def test_header_whitespace_does_not_block_match(self):
        """All three aliased fields are still mapped by inference."""
        source = _read("ec03_whitespace_headers.csv")
        fields = [
            TargetField(name="first_name", aliases=["First Name"]),
            TargetField(name="last_name", aliases=["Last Name"]),
            TargetField(name="email", aliases=["EmailAddr"]),
        ]
        options = MapOptions(schema=TargetSchema(fields=fields), auto_infer=True)
        result = map_columns(source, options)
        # All three columns should map despite the leading/trailing spaces.
        assert len(result.mapping) == 3
class TestEC04NoMatch:
    """EC04: a source whose headers resemble none of the schema targets."""

    def test_zero_inferred_with_no_match(self):
        """Nothing is inferred, and ``keep`` leaves every source column in place."""
        source = _read("ec04_no_match.csv")
        target = TargetSchema(fields=[
            TargetField(name="email"), TargetField(name="phone"),
        ])
        options = MapOptions(schema=target, auto_infer=True, unmapped="keep")
        result = map_columns(source, options)
        assert result.inferred_pairs == {}
        # Source columns survive as-is under keep.
        assert set(source.columns).issubset(set(result.mapped_df.columns))

    def test_no_match_with_unmapped_error(self):
        """With ``unmapped="error"`` the same unmatched input is rejected."""
        source = _read("ec04_no_match.csv")
        options = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="email")]),
            auto_infer=True,
            unmapped="error",
            enforce_required=False,
        )
        with pytest.raises(InputValidationError):
            map_columns(source, options)
class TestEC05RequiredMissing:
    """EC05: a required target that the source cannot supply."""

    def test_required_missing_raises(self):
        """With enforcement on, the missing required target raises."""
        source = _read("ec05_required_missing.csv")
        target = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        options = MapOptions(schema=target, auto_infer=True, enforce_required=True)
        with pytest.raises(InputValidationError):
            map_columns(source, options)

    def test_disable_enforce_surfaces_in_result(self):
        """With enforcement off, ``email`` is listed as a missing required target."""
        source = _read("ec05_required_missing.csv")
        target = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        options = MapOptions(schema=target, auto_infer=True, enforce_required=False)
        result = map_columns(source, options)
        assert "email" in result.missing_required_targets
# ---------------------------------------------------------------------------
# Whole-corpus property tests
# ---------------------------------------------------------------------------
# Names of every CSV in the corpus data directory, in sorted order.
ALL_FIXTURES = sorted(csv_path.name for csv_path in TEST_DATA.glob("*.csv"))


@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_map_columns_does_not_mutate_input(fixture):
    """A default-options run leaves the caller's frame equal to a prior deep copy."""
    source = pd.read_csv(TEST_DATA / fixture)
    before = source.copy(deep=True)
    try:
        # Identity run with default options.
        map_columns(source, MapOptions())
    except InputValidationError:
        # Rejection is acceptable (reportedly ec01 / ec05) — only mutation
        # of the input matters to this test.
        pass
    pd.testing.assert_frame_equal(source, before)

View File

@@ -169,8 +169,23 @@ class TestMojibake:
assert actual.equals(expected), "14 mojibake default (no repair) differs"
def test_fixed_variant(self):
# --fix-mojibake is Tier 2; the cleaner does not implement it. Mark xfail.
pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
"""Mojibake auto-repair (ftfy-backed) restores the original text.
Skipped automatically when ftfy is not installed — the engine
falls back to a no-op in that case and the diff would never close.
"""
try:
import ftfy # noqa: F401
except ImportError:
pytest.skip("ftfy not installed — install ftfy to enable mojibake repair")
from src.core.fixes import repair_mojibake
df = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
expected = _read_csv_strict(EXPECTED / "14_mojibake__fixed.csv")
repaired, _ = repair_mojibake(df)
actual = repaired.reset_index(drop=True)
assert actual.equals(expected), "14 mojibake fixed variant differs"
class TestEmptyFile:

View File

@@ -14,12 +14,11 @@ What's tested
REJECT / LOW_CONFIDENCE.
3. The decoded DataFrame matches the canonical reference content.
Cases where the current implementation is known to fail (charset-
normalizer label drift on byte-equivalent encodings, ``repair_bytes``
NUL-strip destroying UTF-16, the "lying BOM" pathological case) are
marked ``xfail`` so they surface in the report as documented gaps.
A future fix that makes the case pass will flip xfail to xpass and the
test owner can drop the marker.
Detection arbiter (cp1250→cp1252, mac_iceland→mac_roman, lying-BOM
recovery) and a language-aware probe (Cyrillic / EE-Latin coverage)
together close every documented gap; the ``KNOWN_*_FAILURES`` dicts
below are kept empty as a tripwire — re-add an entry only when a real
limitation surfaces.
"""
from __future__ import annotations
@@ -41,27 +40,9 @@ REFERENCE_DIR = CORPUS / "reference"
# Known failures the analyzer does not yet handle correctly. Each entry
# has a one-line reason — drop the entry once a fix lands.
KNOWN_DETECTION_FAILURES = {
"E03_western_basic_cp1252.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
"E04_western_basic_latin1.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
"E05_western_basic_latin9.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
"E06_western_basic_macroman.csv": "returns mac_iceland (same family) instead of mac_roman",
"E11_western_extended_cp1252.csv": "charset-normalizer returns cp1250 for cp1252 content",
"E15_eastern_european_iso88592.csv": "charset-normalizer returns cp1258 for ISO-8859-2 content",
"E18_cyrillic_koi8r.csv": "charset-normalizer returns shift_jis_2004 for KOI8-R content",
}
KNOWN_DETECTION_FAILURES: dict[str, str] = {}
KNOWN_DECODE_FAILURES = {
"E03_western_basic_cp1252.csv": "decoded as cp1250 — different mapping at 0xF1 (ñ vs ń)",
"E04_western_basic_latin1.csv": "decoded as cp1250 — different mapping at 0xF1",
"E05_western_basic_latin9.csv": "decoded as cp1250 — different mapping at 0xF1",
"E10_western_extended_utf8.csv": "byte-level smart-quote fold rewrites U+201C/U+201D to ASCII before parse",
"E11_western_extended_cp1252.csv": "wrong encoding + smart-quote fold",
"E12_western_extended_utf16le.csv": "byte-level smart-quote fold rewrites U+201C/U+201D before parse",
"E15_eastern_european_iso88592.csv": "wrong encoding (cp1258 != ISO-8859-2)",
"E18_cyrillic_koi8r.csv": "wrong encoding (shift_jis_2004 != KOI8-R)",
"E30_pathological_lying_bom.csv": "utf-8-sig fails on cp1252 body bytes; needs lying-BOM recovery",
}
KNOWN_DECODE_FAILURES: dict[str, str] = {}
def _normalize_encoding(name: str) -> str:
@@ -164,7 +145,12 @@ def _decodable_entries():
],
)
def test_decoded_matches_reference(entry):
df, _, _ = _load_for_analysis(CORPUS / entry["filename"], sample_rows=1000)
# The reference files preserve smart quotes — disable byte-level
# smart-quote folding so this round-trip identity test isn't
# confounded by the analyzer's deliberate parser-safety fold.
df, _, _ = _load_for_analysis(
CORPUS / entry["filename"], sample_rows=1000, fold_quotes=False,
)
ref_text = REFERENCES[entry["canonical_content_id"]]
ref_rows = list(csv.reader(io.StringIO(ref_text)))
if not ref_rows:

View File

@@ -230,8 +230,27 @@ class TestRepairMojibake:
class TestRepairMojibakeNoFtfy:
@pytest.mark.skipif(_HAS_FTFY, reason="ftfy installed — exercises the no-op path")
def test_returns_input_unchanged_without_ftfy(self):
def test_returns_input_unchanged_without_ftfy(self, monkeypatch):
"""Exercise the no-op path regardless of whether ftfy is installed.
``repair_mojibake`` lazy-imports ftfy inside the function body, so
we hide ``ftfy`` from ``sys.modules`` and from import resolution
before calling. The function must then degrade to ``(df, 0)``
without raising.
"""
import sys
import builtins
monkeypatch.delitem(sys.modules, "ftfy", raising=False)
real_import = builtins.__import__
def fake_import(name, *args, **kwargs):
if name == "ftfy" or name.startswith("ftfy."):
raise ImportError("ftfy hidden by test")
return real_import(name, *args, **kwargs)
monkeypatch.setattr(builtins, "__import__", fake_import)
df = pd.DataFrame({"x": ["café"]})
out, changed = repair_mojibake(df)
assert changed == 0

View File

@@ -0,0 +1,105 @@
"""Acceptance corpus for international format standardization.
Stresses the rework's three pillars on a single mixed-locale fixture:
* Per-row country column drives phone parsing.
* ``currency_decimal="auto"`` resolves comma-decimal locales.
* Streaming entry point handles the same content unchanged.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from src.core.format_standardize import (
FieldType,
StandardizeOptions,
standardize_dataframe,
standardize_file,
)
# International corpus directory: two directory levels above this file, then
# ``test-cases/format-cleaner-corpus/international``.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "format-cleaner-corpus", "international",
)
# The single mixed-locale CSV that every test in this module reads.
FIXTURE = CORPUS.joinpath("intl_phones_addresses.csv")
@pytest.fixture(scope="module")
def df():
    """Module-scoped fixture: the corpus CSV read with string dtype.

    ``keep_default_na=False`` stops pandas from applying its default
    NA-marker list when parsing cells.
    """
    return pd.read_csv(FIXTURE, keep_default_na=False, dtype=str)
@pytest.fixture(scope="module")
def options():
    """Module-scoped fixture: the standardizer options shared by every test.

    Types three columns (name, phone, price), takes the phone region from the
    ``country`` column, preserves currency codes, and sets the currency
    decimal mode to ``"auto"``.
    """
    typed_columns = {
        "name": FieldType.NAME,
        "phone": FieldType.PHONE,
        "price": FieldType.CURRENCY,
    }
    return StandardizeOptions(
        column_types=typed_columns,
        phone_country_column="country",
        currency_preserve_code=True,
        currency_decimal="auto",
    )
class TestPhonesByRegion:
    """Phone output must carry the calling code of each row's own country."""

    def test_every_row_lands_on_correct_e164_prefix(self, df, options):
        """Each standardized phone starts with the prefix for its ``country``."""
        # Each row's country column drives the per-row region used for phone
        # parsing — the correct "+" prefix is the acceptance bar.
        standardized = standardize_dataframe(df, options).standardized_df
        # ISO-2 country → expected E.164 country-code prefix.
        prefix_for_country = {
            "US": "+1", "GB": "+44", "RU": "+7", "ES": "+34",
            "FR": "+33", "JP": "+81", "DE": "+49", "IT": "+39",
            "CN": "+86", "IN": "+91", "EG": "+20", "AU": "+61",
            "BR": "+55", "MX": "+52", "KR": "+82", "TR": "+90",
            "IL": "+972", "PL": "+48", "DK": "+45", "SE": "+46",
        }
        # Collect every (country, wanted prefix, actual phone) that disagrees
        # so one failure message lists all offenders at once.
        bad: list[tuple[str, str, str]] = [
            (country, prefix_for_country[country], phone)
            for country, phone in zip(standardized["country"], standardized["phone"])
            if not phone.startswith(prefix_for_country[country])
        ]
        assert not bad, f"phone prefix mismatches: {bad}"
class TestCurrencyByLocale:
    """Currency parsing with ``currency_decimal="auto"`` on the mixed-locale fixture."""

    def test_eu_decimal_comma_resolves_under_auto(self, df, options):
        """No comma remains in the price of any decimal-comma-country row."""
        standardized = standardize_dataframe(df, options).standardized_df
        # Spain, France, Germany, Italy, Brazil, Sweden all use decimal
        # comma. Verify a clean numeric result post-standardization.
        comma_countries = ["ES", "FR", "DE", "IT", "BR", "SE"]
        is_comma_locale = df["country"].isin(comma_countries)
        for i in df.index[is_comma_locale]:
            val = standardized.loc[i, "price"]
            # Either ``CODE NNN.NN`` or bare ``NNN.NN`` — but the comma
            # in the source must have become a dot in the output.
            assert "," not in val, (
                f"row {i} ({df.loc[i, 'country']}): comma persisted in {val!r}"
            )

    def test_brl_real_prefix_recognised(self, df, options):
        """The first Brazilian row's price contains the ``BRL`` code."""
        standardized = standardize_dataframe(df, options).standardized_df
        brazil_rows = standardized[standardized["country"] == "BR"]
        first_brazil = brazil_rows.iloc[0]
        assert "BRL" in first_brazil["price"]
class TestStreamingMatchesInMemory:
    """The chunked ``standardize_file`` path must agree with the in-memory path."""

    def test_same_output_via_streaming(self, tmp_path, df, options):
        """Typed columns written by streaming equal the in-memory results."""
        expected_frame = standardize_dataframe(df, options).standardized_df
        out_path = tmp_path / "out.csv"
        # A chunk size of 7 splits the fixture across several chunks.
        stream_result = standardize_file(FIXTURE, out_path, options, chunk_size=7)
        assert stream_result.rows_processed == len(df)
        streamed = pd.read_csv(out_path, dtype=str, keep_default_na=False)
        # Compare typed columns only — others pass through.
        for col in options.column_types:
            from_stream = streamed[col].tolist()
            from_memory = expected_frame[col].astype(str).tolist()
            assert from_stream == from_memory, (
                f"column {col} differs between in-memory and streaming"
            )

View File

@@ -110,16 +110,16 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
"FD13": PASSTHROUGH,
"FD14": PASSTHROUGH,
"FD15": PASSTHROUGH,
# excel serial → 2024-01-15 (xfail — not implemented)
# excel serial dates (numeric days since 1899-12-30)
"FD22": "2024-01-15",
"FD23": "2024-01-15",
# unix timestamp seconds / millis → 2024-01-15 (xfail)
# unix timestamps (seconds, milliseconds)
"FD24": "2024-01-15",
"FD25": "2024-01-15",
# partial precision — corpus preserves it
"FD26": "2024-01",
"FD27": "2024-01", # xfail — text precision
"FD28": "2024-Q1", # xfail — quarter
"FD27": "2024-01", # text precision month
"FD28": "2024-Q1", # quarter
"FD29": "2024",
# 2-digit year cutoff (per docs: 1969 wins over 2069)
"FD30": "1969-01-15",
@@ -135,7 +135,7 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
"FD37": "2024-01-15",
# garbage → pass through (corpus 0.3 boundary table)
# FD38/39/40 → PASSTHROUGH default
# locale-specific month names (xfail — not shipped)
# locale-specific month names (en/fr/de via month_locales)
"FD41": "2024-01-15",
"FD42": "2024-01-15",
# timezone — corpus 3.3 says fixed-offset only

View File

@@ -0,0 +1,301 @@
"""Tests for the format-standardizer rework: cache, vectorized dispatch,
per-row country, audit cap, and streaming entry point."""
from __future__ import annotations
import csv
from pathlib import Path
import pandas as pd
import pytest
from src.core.format_standardize import (
FieldType,
StandardizeOptions,
StreamingStandardizeResult,
_normalize_region,
standardize_dataframe,
standardize_file,
)
# ---------------------------------------------------------------------------
# Per-row country / region
# ---------------------------------------------------------------------------
class TestPerRowCountry:
    """Phone parsing where the region comes from a per-row country column."""

    def test_phone_uses_per_row_country(self):
        """Each number is normalised using the country given on its own row."""
        frame = pd.DataFrame({
            "phone": ["020 7946 0958", "03-3210-7000", "(415) 555-1234"],
            "country": ["GB", "JP", "US"],
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        phones = standardize_dataframe(frame, settings).standardized_df["phone"].tolist()
        # Expected prefixes, in row order: GB, JP, US.
        for position, expected_prefix in enumerate(("+44", "+81", "+1")):
            assert phones[position].startswith(expected_prefix)

    def test_phone_country_full_name_resolved(self):
        """A spelled-out country name is accepted in the country column."""
        frame = pd.DataFrame({
            "phone": ["020 7946 0958"],
            "country": ["United Kingdom"],
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone.startswith("+44")

    def test_blank_country_falls_back_to_default(self):
        """An empty country cell defers to ``phone_region``."""
        frame = pd.DataFrame({
            "phone": ["(415) 555-1234"],
            "country": [""],  # blank → use default region
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
            phone_region="US",
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone == "+14155551234"

    def test_unknown_country_column_raises(self):
        """Naming a country column absent from the frame raises InputValidationError."""
        from src.core.errors import InputValidationError

        frame = pd.DataFrame({"phone": ["x"]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="missing_col",
        )
        with pytest.raises(InputValidationError):
            standardize_dataframe(frame, settings)
class TestNormalizeRegion:
    """Unit checks for the ``_normalize_region`` helper."""

    def test_iso2_passthrough(self):
        """ISO-2 input comes back upper-cased, ignoring case and padding."""
        for raw, expected in (("US", "US"), ("us", "US"), (" jp ", "JP")):
            assert _normalize_region(raw) == expected

    def test_iso3_mapped(self):
        """ISO-3 codes map to their ISO-2 equivalents."""
        for raw, expected in (("USA", "US"), ("GBR", "GB"), ("JPN", "JP")):
            assert _normalize_region(raw) == expected

    def test_full_name(self):
        """Country names, including local spellings, resolve to ISO-2."""
        cases = (
            ("United States", "US"),
            ("Japan", "JP"),
            ("Brazil", "BR"),
            ("brasil", "BR"),
            ("España", "ES"),
        )
        for raw, expected in cases:
            assert _normalize_region(raw) == expected

    def test_blank_or_unknown(self):
        """Blank, ``None`` and unrecognised input all yield ``None``."""
        for raw in ("", " ", None, "xyz-no-such-country"):
            assert _normalize_region(raw) is None
# ---------------------------------------------------------------------------
# Audit cap
# ---------------------------------------------------------------------------
class TestAuditCap:
    """Behaviour of ``audit_max_rows`` on the in-memory path."""

    def test_cap_truncates_change_rows(self):
        """The audit table stops at the cap while the change counter does not."""
        numbers = [f"(415) 555-12{i:02d}" for i in range(50)]
        frame = pd.DataFrame({"phone": numbers})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=5,
        )
        result = standardize_dataframe(frame, settings)
        # Every changed cell is counted; only the audit rows are limited.
        assert result.cells_changed == 50
        assert len(result.changes) == 5

    def test_unbounded_audit(self):
        """``audit_max_rows=None`` records one audit row per changed cell."""
        numbers = [f"(415) 555-12{i:02d}" for i in range(20)]
        frame = pd.DataFrame({"phone": numbers})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        result = standardize_dataframe(frame, settings)
        assert len(result.changes) == 20
# ---------------------------------------------------------------------------
# Cache + vectorized dispatch (correctness)
# ---------------------------------------------------------------------------
class TestCacheCorrectness:
    """Results must be the same whether or not the cache is in play."""

    def test_repeated_phone_consistent(self):
        """A value repeated 1000 times standardizes identically every time."""
        copies = 1000
        frame = pd.DataFrame({"phone": ["(415) 555-1234"] * copies})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        result = standardize_dataframe(frame, settings)
        phones = result.standardized_df["phone"]
        assert (phones == "+14155551234").all()
        assert result.cells_changed == copies

    def test_cache_disabled_still_works(self):
        """``cache_size=0`` still produces the normalised number."""
        frame = pd.DataFrame({"phone": ["(415) 555-1234", "020 7946 0958"]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            cache_size=0,  # disabled
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone == "+14155551234"
# ---------------------------------------------------------------------------
# Streaming standardize_file
# ---------------------------------------------------------------------------
class TestStandardizeFile:
    """Tests for the chunked ``standardize_file`` entry point.

    Input files are always written with ``encoding="utf-8"``.
    ``Path.write_text`` otherwise uses the platform's locale encoding, which
    would make the bytes of the non-ASCII currency symbols platform-dependent.
    """

    def test_basic_streaming(self, tmp_path):
        """Four rows in chunks of two give two chunks and per-row phone prefixes."""
        inp = tmp_path / "in.csv"
        # Contains £, ¥ and € — the explicit encoding matters here.
        inp.write_text(
            "phone,country,price\n"
            "(415) 555-1234,US,$1500.00\n"
            "020 7946 0958,GB,£99.99\n"
            "03-3210-7000,JP,¥12000\n"
            "+33 1 42 86 82 00,FR,€850.50\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE, "price": FieldType.CURRENCY},
            phone_country_column="country",
            currency_preserve_code=True,
        )
        res = standardize_file(inp, out, opts, chunk_size=2)
        assert isinstance(res, StreamingStandardizeResult)
        assert res.rows_processed == 4
        assert res.chunks_processed == 2
        assert out.exists()
        out_df = pd.read_csv(out, dtype=str, keep_default_na=False)
        assert out_df["phone"].iloc[0].startswith("+1")
        assert out_df["phone"].iloc[1].startswith("+44")
        assert out_df["phone"].iloc[2].startswith("+81")
        assert out_df["phone"].iloc[3].startswith("+33")

    def test_audit_capped_across_chunks(self, tmp_path):
        """The audit cap applies to the whole run, not to each chunk."""
        # 60 rows, audit cap 10, chunks of 20 → audit must stop at 10.
        inp = tmp_path / "in.csv"
        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(60)]
        inp.write_text("".join(rows), encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=10,
        )
        res = standardize_file(inp, out, opts, chunk_size=20)
        # Audit file exists and has exactly 10 data rows + 1 header.
        audit_lines = res.audit_path.read_text().splitlines()
        assert len(audit_lines) - 1 == 10

    def test_audit_row_indices_are_global(self, tmp_path):
        """Audit row numbers are absolute file positions, not chunk-local."""
        inp = tmp_path / "in.csv"
        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(30)]
        inp.write_text("".join(rows), encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        res = standardize_file(inp, out, opts, chunk_size=10)
        audit = pd.read_csv(res.audit_path)
        # Rows should be 0..29, monotonically increasing.
        assert audit["row"].tolist() == list(range(30))

    def test_progress_callback_fires(self, tmp_path):
        """The callback runs once per chunk with cumulative (rows, chunks)."""
        inp = tmp_path / "in.csv"
        inp.write_text(
            "phone\n" + "\n".join("(415) 555-1234" for _ in range(20)) + "\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
        seen: list[tuple[int, int]] = []

        def cb(rows, chunks):
            seen.append((rows, chunks))

        standardize_file(inp, out, opts, chunk_size=5, progress_callback=cb)
        # 20 rows / 5 per chunk = 4 calls; the last reports the totals.
        assert len(seen) == 4
        assert seen[-1] == (20, 4)

    def test_progress_callback_exception_does_not_abort(self, tmp_path):
        """A callback that raises must not stop the run."""
        inp = tmp_path / "in.csv"
        inp.write_text("phone\n(415) 555-1234\n", encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})

        def bad_cb(*a, **k):
            raise RuntimeError("boom")

        # Must not raise.
        res = standardize_file(inp, out, opts, chunk_size=1, progress_callback=bad_cb)
        assert res.rows_processed == 1

    def test_missing_input_raises_clean_error(self, tmp_path):
        """A nonexistent input path raises the project's ``FileAccessError``."""
        from src.core.errors import FileAccessError

        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
        with pytest.raises(FileAccessError):
            standardize_file(
                tmp_path / "missing.csv",
                tmp_path / "out.csv",
                opts,
            )
# ---------------------------------------------------------------------------
# International coverage smoke
# ---------------------------------------------------------------------------
class TestInternationalCoverage:
    """Parametrised smoke tests over a spread of countries and currencies."""

    @pytest.mark.parametrize("number,country,prefix", [
        ("020 7946 0958", "GB", "+44"),
        ("03-3210-7000", "JP", "+81"),
        ("+49 30 12345678", "DE", "+49"),
        ("01 42 86 82 00", "FR", "+33"),
        ("+39 06 6982", "IT", "+39"),
        ("+34 91 411 1111", "ES", "+34"),
        ("+86 10 1234 5678", "CN", "+86"),
        ("+91 11 2345 6789", "IN", "+91"),
        ("+61 2 9374 4000", "AU", "+61"),
        ("11 3071 0000", "BR", "+55"),
        ("+52 55 5555 0000", "MX", "+52"),
        ("+82 2 2287 0114", "KR", "+82"),
    ])
    def test_phone_via_per_row_region(self, number, country, prefix):
        """A one-row frame with a country column yields the expected prefix."""
        frame = pd.DataFrame({"phone": [number], "country": [country]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        out = standardize_dataframe(frame, settings).standardized_df["phone"].iloc[0]
        assert out.startswith(prefix), (
            f"{number!r} ({country}): expected to start with {prefix}, got {out!r}"
        )

    @pytest.mark.parametrize("price,want_code", [
        ("$1,500.00", "USD"),
        ("€850,50", "EUR"),
        ("£99.99", "GBP"),
        ("¥12000", "JPY"),
        ("R$ 250,00", "BRL"),
        ("CHF 1200.00", "CHF"),
    ])
    def test_currency_codes_detected(self, price, want_code):
        """With code preservation on, the output contains the expected code."""
        frame = pd.DataFrame({"price": [price]})
        settings = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_preserve_code=True,
            currency_decimal="auto",  # international mode
        )
        result = standardize_dataframe(frame, settings)
        standardized_price = result.standardized_df["price"].iloc[0]
        assert want_code in standardized_price

View File

@@ -8,10 +8,8 @@ These cover edges that existing suites missed:
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
- ``findings_by_tool`` on an empty list.
- BOM that appears mid-cell rather than at file start.
The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
§4.17) is *not yet implemented* and is captured here as a known-gap xfail
so it's surfaced rather than silently missing.
- The collapse-whitespace heuristic for numeric/date/phone-shaped cells
(spec §4.17), now wired in via ``_smart_collapse_whitespace``.
"""
from __future__ import annotations

462
tests/test_missing.py Normal file
View File

@@ -0,0 +1,462 @@
"""Tests for src/core/missing.py."""
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import pytest
from src.core.errors import ConfigError, InputValidationError
from src.core.missing import (
DEFAULT_SENTINELS,
MissingOptions,
PRESETS,
detect_sentinels,
handle_missing,
is_missing_like,
profile_missing,
)
# ---------------------------------------------------------------------------
# is_missing_like
# ---------------------------------------------------------------------------
class TestIsMissingLike:
    """Scalar-level checks for ``is_missing_like``."""

    def test_none(self):
        """``None`` is missing."""
        value = None
        assert is_missing_like(value)

    def test_nan(self):
        """Float NaN is missing."""
        value = np.nan
        assert is_missing_like(value)

    def test_pd_nat(self):
        """pandas ``NaT`` is missing."""
        value = pd.NaT
        assert is_missing_like(value)

    def test_empty_string(self):
        """The empty string is missing."""
        value = ""
        assert is_missing_like(value)

    def test_whitespace_only(self):
        """Strings made only of whitespace are missing."""
        for blank in (" ", "\t\n "):
            assert is_missing_like(blank)

    def test_default_sentinels(self):
        """A sample of the default sentinel tokens is recognised."""
        sample = ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)")
        for s in sample:
            assert is_missing_like(s), f"expected {s!r} to be missing-like"

    def test_case_insensitive(self):
        """Sentinel matching ignores letter case."""
        for token in ("N/A", "n/A", "NA", "na"):
            assert is_missing_like(token)

    def test_real_value_not_missing(self):
        """Ordinary text and zero-valued data are not missing."""
        for value in ("hello", "0", 0, 0.0):
            assert not is_missing_like(value)

    def test_zero_is_not_missing(self):
        """Falsy-but-real values must not be mistaken for missing."""
        # Common bug: treating 0 / "0" / False as missing.
        for value in (0, False):
            assert not is_missing_like(value)

    def test_custom_sentinels_override(self):
        """An explicit ``sentinels`` list decides what counts as a sentinel."""
        candidate = "xx"
        assert is_missing_like(candidate, sentinels=["xx"])
        assert not is_missing_like(candidate, sentinels=["zz"])
# ---------------------------------------------------------------------------
# detect_sentinels
# ---------------------------------------------------------------------------
class TestDetectSentinels:
    """Per-series sentinel counting via ``detect_sentinels``."""

    def test_counts_by_label(self):
        """Sentinel tokens and blank cells are tallied under separate labels."""
        series = pd.Series(["alice", "N/A", "n/a", "NULL", " ", "", "bob"])
        counts = detect_sentinels(series)
        # Which canonical label 'N/A' and 'n/a' are filed under is not pinned
        # here; only the combined total of the non-whitespace labels is.
        labelled = {label: n for label, n in counts.items() if label != "(whitespace)"}
        assert sum(labelled.values()) == 3
        assert counts["(whitespace)"] == 2

    def test_skips_real_nan(self):
        """A genuine NaN is not counted as a sentinel."""
        series = pd.Series(["a", np.nan, "N/A"])
        counts = detect_sentinels(series)
        total = sum(counts.values())
        assert total == 1

    def test_no_sentinels_returns_empty(self):
        """A clean series yields an empty mapping."""
        series = pd.Series(["alice", "bob", "charlie"])
        counts = detect_sentinels(series)
        assert counts == {}
# ---------------------------------------------------------------------------
# profile_missing
# ---------------------------------------------------------------------------
class TestProfileMissing:
    """Frame-level profiling via ``profile_missing``."""

    def test_basic(self):
        """Sentinels and NaNs are both counted, per column and in total."""
        frame = pd.DataFrame({
            "name": ["Alice", "Bob", "N/A", "", "Charlie"],
            "age": [30, None, 25, 40, np.nan],
        })
        profile = profile_missing(frame, MissingOptions())
        assert profile.rows_total == 5
        # name: '' + 'N/A' = 2 sentinels; age: 2 NaN
        by_column = {}
        for report in profile.columns:
            by_column[report.column] = report
        assert by_column["name"].missing == 2
        assert by_column["age"].missing == 2
        assert profile.cells_missing == 4

    def test_complete_dataframe(self):
        """A frame with no gaps reports zero missing and all rows complete."""
        frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        profile = profile_missing(frame, MissingOptions())
        assert profile.cells_missing == 0
        assert profile.rows_complete == 3
        assert profile.rows_with_any_missing == 0

    def test_to_dataframe_columns(self):
        """``to_dataframe`` exposes at least the four documented columns."""
        frame = pd.DataFrame({"x": [1, None]})
        table = profile_missing(frame, MissingOptions()).to_dataframe()
        required = {"column", "missing", "missing_pct", "top_sentinel"}
        assert set(table.columns) >= required

    def test_disabled_sentinels_only_counts_real_nan(self):
        """With sentinel handling off, only the true NaN is counted."""
        frame = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        settings = MissingOptions(standardize_sentinels=False)
        profile = profile_missing(frame, settings)
        by_column = {}
        for report in profile.columns:
            by_column[report.column] = report
        # Only the real NaN counts; 'N/A' is left alone.
        assert by_column["x"].missing == 1
# ---------------------------------------------------------------------------
# handle_missing — sentinel standardization
# ---------------------------------------------------------------------------
class TestSentinelStandardization:
    """``handle_missing`` turning disguised nulls into NaN (no fill strategy)."""

    def test_replaces_sentinels_with_nan(self):
        """Sentinel and whitespace-only cells become NaN; real values stay."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "-", " ", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        cleaned = result.handled_df
        # 'N/A' + '-' + whitespace-only = 3
        assert result.sentinels_standardized == 3
        assert cleaned["x"].isna().sum() == 3
        assert cleaned.iloc[0]["x"] == "alice"
        assert cleaned.iloc[4]["x"] == "bob"

    def test_audit_records_each_replacement(self):
        """One audit row is written per replacement, tagged ``standardize:``."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        audit = result.changes
        assert len(audit) == 1
        assert audit.iloc[0]["action"].startswith("standardize:")

    def test_disabled_keeps_sentinels(self):
        """Turning standardization off leaves sentinel text untouched."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        settings = MissingOptions(standardize_sentinels=False, strategy="none")
        result = handle_missing(frame, settings)
        assert result.sentinels_standardized == 0
        assert result.handled_df.iloc[1]["x"] == "N/A"

    def test_custom_sentinels_extend_default(self):
        """A token appended to the default list is treated as a sentinel."""
        frame = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
        extended = [*DEFAULT_SENTINELS, "MISSING_DATA"]
        settings = MissingOptions(sentinels=extended, strategy="none")
        result = handle_missing(frame, settings)
        assert result.sentinels_standardized == 1
# ---------------------------------------------------------------------------
# handle_missing — fill strategies
# ---------------------------------------------------------------------------
class TestFillStrategies:
    """One test per fill strategy offered by ``handle_missing``."""

    @pytest.fixture
    def numeric_df(self):
        """Five-row float column with gaps at positions 2 and 4."""
        values = [1.0, 2.0, np.nan, 4.0, np.nan]
        return pd.DataFrame({"x": values})

    def test_mean(self, numeric_df):
        """Gaps take the mean of the observed values."""
        result = handle_missing(numeric_df, MissingOptions(strategy="mean"))
        # mean of [1, 2, 4] = 7/3
        expected = 7.0 / 3.0
        filled = result.handled_df["x"].iloc[2]
        assert abs(filled - expected) < 1e-9
        assert result.cells_filled == 2

    def test_median(self, numeric_df):
        """Gaps take the median of the observed values."""
        result = handle_missing(numeric_df, MissingOptions(strategy="median"))
        # median of [1, 2, 4] = 2.0
        filled = result.handled_df["x"].iloc[2]
        assert filled == 2.0

    def test_mode(self):
        """Gaps take the most frequent observed value."""
        frame = pd.DataFrame({"x": ["a", "a", "b", None, None]})
        result = handle_missing(frame, MissingOptions(strategy="mode"))
        column = result.handled_df["x"]
        assert column.iloc[3] == "a"
        assert column.iloc[4] == "a"
        assert result.cells_filled == 2

    def test_constant_scalar(self, numeric_df):
        """A single ``fill_value`` is written into every gap."""
        settings = MissingOptions(strategy="constant", fill_value=99.0)
        result = handle_missing(numeric_df, settings)
        column = result.handled_df["x"]
        assert column.iloc[2] == 99.0
        assert column.iloc[4] == 99.0

    def test_constant_per_column(self):
        """``column_fill_values`` supplies a different constant per column."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
        settings = MissingOptions(
            strategy="constant",
            column_fill_values={"a": 0, "b": "?"},
        )
        cleaned = handle_missing(frame, settings).handled_df
        assert cleaned["a"].iloc[1] == 0
        assert cleaned["b"].iloc[1] == "?"

    def test_ffill(self):
        """Forward fill carries the previous value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="ffill"))
        assert result.handled_df["x"].tolist() == [1.0, 1.0, 1.0, 4.0]

    def test_bfill(self):
        """Backward fill pulls the next value into the gaps."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="bfill"))
        assert result.handled_df["x"].tolist() == [1.0, 4.0, 4.0, 4.0]

    def test_interpolate(self):
        """Interpolation fills the gaps evenly between the neighbours."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="interpolate"))
        assert result.handled_df["x"].tolist() == [1.0, 2.0, 3.0, 4.0]

    def test_numeric_strategy_falls_back_for_categorical(self):
        """A numeric strategy on a text column uses ``categorical_strategy``."""
        frame = pd.DataFrame({"x": ["a", "a", None, "b"]})
        settings = MissingOptions(strategy="median", categorical_strategy="mode")
        result = handle_missing(frame, settings)
        assert result.strategy_per_column["x"] == "mode"
        assert result.handled_df["x"].iloc[2] == "a"

    def test_per_column_strategy_overrides_global(self):
        """``column_strategies`` entries win over the global strategy."""
        frame = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
        settings = MissingOptions(
            strategy="median",
            column_strategies={"b": "constant"},
            fill_value="??",
        )
        cleaned = handle_missing(frame, settings).handled_df
        assert cleaned["a"].iloc[1] == 1.0  # median of [1.0]
        assert cleaned["b"].iloc[1] == "??"

    def test_all_nan_column_safely_skipped(self):
        """A column with no observed values is left entirely NaN."""
        frame = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
        result = handle_missing(frame, MissingOptions(strategy="mean"))
        assert result.cells_filled == 0
        assert result.handled_df["x"].isna().all()
# ---------------------------------------------------------------------------
# handle_missing — drops
# ---------------------------------------------------------------------------
class TestDropStrategies:
    """Row and column dropping; thresholds are compared strictly-greater."""

    def test_drop_row_any_missing(self):
        """Threshold 0.0 drops every row that has at least one gap."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan, 4],
            "b": ["x", None, "z", "w"],
        })
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
        result = handle_missing(frame, settings)
        # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean.
        assert result.rows_dropped == 2
        assert len(result.handled_df) == 2

    def test_drop_row_default_threshold_never_drops(self):
        """With the default threshold (1.0) nothing can exceed it."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan],
            "b": ["x", "y", None],
        })
        settings = MissingOptions(strategy="drop_row")  # threshold defaults to 1.0
        result = handle_missing(frame, settings)
        assert result.rows_dropped == 0

    def test_drop_row_partial_threshold(self):
        """Threshold 0.5 drops rows whose missing fraction is above one half."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan, np.nan],
            "b": [10, 20, np.nan, np.nan],
            "c": [100, 200, np.nan, 400],
        })
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
        result = handle_missing(frame, settings)
        # row 0: 0/3, row 1: 1/3 (0.33) -> keep
        # row 2: 3/3 (1.0) -> drop, row 3: 2/3 (0.67) -> drop
        assert result.rows_dropped == 2

    def test_drop_col_threshold(self):
        """Threshold 0.5 drops the 75%-missing column and keeps the full one."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4],
            "drop_me": [np.nan, np.nan, np.nan, 1],  # 75% missing
        })
        settings = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
        dropped = handle_missing(frame, settings).columns_dropped
        assert "drop_me" in dropped
        assert "keep" not in dropped

    def test_drop_both(self):
        """Column and row drops together in one ``drop_both`` run."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4, 5],
            "drop_col": [np.nan] * 5,
            "x": [1, np.nan, 3, np.nan, 5],
        })
        settings = MissingOptions(
            strategy="drop_both",
            col_drop_threshold=0.99,  # >99% missing → drop column
            row_drop_threshold=0.0,   # any missing in remaining cols → drop row
        )
        result = handle_missing(frame, settings)
        # drop_col is 100% missing → dropped
        assert "drop_col" in result.columns_dropped
        # Remaining scope (keep + x): rows 1 and 3 have a missing x → drop.
        assert result.rows_dropped == 2

    def test_drop_audit_records_dropped_rows(self):
        """Each dropped row leaves a ``drop_row`` entry in the audit."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
        # Drop the fully-missing row (frac > 0.99).
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
        audit = handle_missing(frame, settings).changes
        is_row_drop = audit["action"] == "drop_row"
        assert len(audit[is_row_drop]) == 1
# ---------------------------------------------------------------------------
# Scope: columns / skip_columns
# ---------------------------------------------------------------------------
class TestScope:
    """Restricting ``handle_missing`` with ``columns`` / ``skip_columns``."""

    def test_columns_filter(self):
        """Only columns listed in ``columns`` are filled."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        settings = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
        cleaned = handle_missing(frame, settings).handled_df
        assert cleaned["a"].iloc[0] == 99
        # b should be untouched
        assert pd.isna(cleaned["b"].iloc[0])

    def test_skip_columns(self):
        """Columns listed in ``skip_columns`` are left as they were."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        settings = MissingOptions(skip_columns=["b"], strategy="constant", fill_value=99)
        cleaned = handle_missing(frame, settings).handled_df
        assert cleaned["a"].iloc[0] == 99
        assert pd.isna(cleaned["b"].iloc[0])

    def test_unknown_column_raises(self):
        """Naming a column that does not exist raises InputValidationError."""
        frame = pd.DataFrame({"a": [1]})
        settings = MissingOptions(columns=["does_not_exist"])
        with pytest.raises(InputValidationError):
            handle_missing(frame, settings)
# ---------------------------------------------------------------------------
# Presets / config
# ---------------------------------------------------------------------------
class TestPresets:
    """Named presets and JSON round-tripping of ``MissingOptions``."""

    def test_detect_only_does_not_fill(self):
        """``detect-only`` standardizes sentinels but neither fills nor drops."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        assert result.sentinels_standardized == 1
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

    def test_safe_fill_fills(self):
        """``safe-fill`` fills the single gap in each of the two columns."""
        frame = pd.DataFrame({"age": [30, np.nan, 25, 40], "name": ["a", "a", None, "b"]})
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert result.cells_filled == 2

    def test_drop_incomplete(self):
        """``drop-incomplete`` removes the one row that has a gap."""
        frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
        result = handle_missing(frame, MissingOptions.from_preset("drop-incomplete"))
        assert result.rows_dropped == 1

    def test_unknown_preset_raises(self):
        """An unrecognised preset name raises ConfigError."""
        with pytest.raises(ConfigError):
            MissingOptions.from_preset("does-not-exist")

    def test_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` load back with the same settings."""
        original = MissingOptions.from_preset("safe-fill")
        original.column_strategies = {"age": "median"}
        config_path = tmp_path / "cfg.json"
        original.to_file(config_path)
        restored = MissingOptions.from_file(config_path)
        assert restored.strategy == original.strategy
        assert restored.column_strategies == original.column_strategies
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
class TestValidate:
    """Rejection of bad options and bad input."""

    def test_invalid_strategy(self):
        """An unknown strategy name fails ``validate`` with InputValidationError."""
        settings = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            settings.validate()

    def test_threshold_out_of_range(self):
        """A row threshold of 1.5 fails ``validate`` with ConfigError."""
        settings = MissingOptions(row_drop_threshold=1.5)
        with pytest.raises(ConfigError):
            settings.validate()

    def test_handle_missing_validates(self):
        """``handle_missing`` rejects invalid options itself."""
        frame = pd.DataFrame({"x": [1]})
        settings = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            handle_missing(frame, settings)

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            handle_missing(not_a_frame)  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# End-to-end realistic case
# ---------------------------------------------------------------------------
class TestEndToEnd:
    """Realistic whole-frame scenarios for ``handle_missing``."""

    def test_messy_customer_export(self):
        """Sentinel cleanup plus median/constant fill on a mixed-type export."""
        df = pd.DataFrame({
            "customer_id": [1, 2, 3, 4, 5, 6],
            "name": ["Alice", "Bob", "N/A", " ", "Charlie", None],
            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
            "age": [30, np.nan, 25, 40, np.nan, 50],
        })
        opts = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="constant",
            fill_value="UNKNOWN",
        )
        res = handle_missing(df, opts)
        # Sentinels: name has 'N/A' and whitespace-only (2); email has '-'
        # and 'NULL' (2). Total = 4. The None in name is already a real
        # missing value, so it is not counted as a sentinel.
        assert res.sentinels_standardized == 4
        # name has 3 missing after standardize (N/A, " ", None) → constant fill
        # email has 2 missing → constant fill
        # age has 2 missing → median (35.0 of [30, 25, 40, 50])
        assert res.cells_filled == 7
        assert res.handled_df["name"].isna().sum() == 0
        assert res.handled_df["email"].isna().sum() == 0
        assert res.handled_df["age"].isna().sum() == 0
        assert (res.handled_df["name"] == "UNKNOWN").sum() == 3
        assert (res.handled_df["age"] == 35.0).sum() == 2  # median of [30, 25, 40, 50]

    def test_input_not_mutated(self):
        """The caller's DataFrame must be unchanged after the call."""
        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        df_copy = df.copy()
        handle_missing(df, MissingOptions.from_preset("safe-fill"))
        pd.testing.assert_frame_equal(df, df_copy)

View File

@@ -0,0 +1,463 @@
"""Acceptance corpus for the Missing Value Handler.
Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
asserts the documented behaviour. The fixtures are split into:
* ``uc##`` — three target-client use cases (Shopify operator,
marketing analyst, consultant intake).
* ``ec##`` — edge cases the engine must handle without surprise:
all-NaN columns, zeros that aren't missing, Excel errors, unicode
whitespace, mixed dtypes, padding, single row/column, every default
sentinel, per-column constants, drop thresholds, leading-NaN ffill,
numeric-strategy fallback for non-numeric columns, headers-only,
idempotency.
Each test runs through the public API (``handle_missing``) so any
regression in the engine surfaces here. Fixture files double as living
documentation for what the tool is supposed to do.
"""
from __future__ import annotations
import io
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from src.core.missing import (
MissingOptions,
handle_missing,
is_missing_like,
profile_missing,
)
# Root of the missing-value acceptance corpus, resolved relative to this
# file: one directory above the directory containing this module, then
# ``test-cases/missing-corpus``.
CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "missing-corpus"
# Directory holding the fixture CSVs that ``_read`` loads by file name.
TEST_DATA = CORPUS / "test_data"
def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
    """Read the corpus fixture ``name`` from ``TEST_DATA``.

    With the default ``dtype_str=False`` pandas infers column dtypes, which
    mirrors the usual intake path. Passing ``dtype_str=True`` reads every
    cell as a string and disables pandas' default NA parsing, so sentinel
    text stays visible to the code under test.
    """
    read_kwargs = {"dtype": str, "keep_default_na": False} if dtype_str else {}
    return pd.read_csv(TEST_DATA / name, **read_kwargs)
# ---------------------------------------------------------------------------
# Use case 1 — Shopify operator: detect-only
# ---------------------------------------------------------------------------
class TestUC01ShopifyExport:
    """Use case 1: an operator normalises disguised nulls before re-import."""

    def test_detect_only_replaces_sentinels(self):
        """``detect-only`` turns the known sentinel cells into NaN and nothing more."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        settings = MissingOptions.from_preset("detect-only")
        res = handle_missing(frame, settings)
        assert res.sentinels_standardized > 0
        assert res.cells_filled == 0
        assert res.rows_dropped == 0
        # (row position, column) of cells the fixture is known to fill with
        # a sentinel; the trailing comment shows the original text.
        sentinel_cells = [
            (1, "phone"),            # 'N/A'
            (2, "city"),             # '-'
            (3, "total_orders"),     # 'NULL'
            (5, "phone"),            # ' '
            (5, "last_order_date"),  # '(blank)'
            (6, "last_order_date"),  # '#N/A'
            (7, "phone"),            # 'n/a'
            (8, "city"),             # '?'
            (9, "total_orders"),     # '(none)'
        ]
        for row, col in sentinel_cells:
            assert pd.isna(res.handled_df.iloc[row][col]), (
                f"Expected NaN at row {row} col {col}, got "
                f"{res.handled_df.iloc[row][col]!r}"
            )

    def test_real_values_preserved(self):
        """Genuine values in the first row survive untouched."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        first_row = result.handled_df.iloc[0]
        assert first_row["first_name"] == "Alice"
        assert first_row["email"] == "alice@shop.com"
        assert first_row["lifetime_value"] == "1240.50"

    def test_audit_log_complete(self):
        """The audit has one ``standardize:`` row per replaced sentinel."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        assert len(result.changes) == result.sentinels_standardized
        # Every action must carry the prefix; an empty audit fails as well.
        prefixed = [
            action.startswith("standardize:") for action in result.changes["action"]
        ]
        assert set(prefixed) == {True}
# ---------------------------------------------------------------------------
# Use case 2 — Marketing analyst: safe-fill
# ---------------------------------------------------------------------------
class TestUC02MarketingAudience:
    """Use case 2: a marketer fills numeric gaps by median and text gaps by mode."""

    def test_safe_fill_clears_all_missing(self):
        """After ``safe-fill`` the post-run profile reports no missing cells."""
        frame = _read("uc02_marketing_audience.csv")
        settings = MissingOptions.from_preset("safe-fill")
        result = handle_missing(frame, settings)
        assert result.profile_after.cells_missing == 0
        assert result.cells_filled > 0

    def test_numeric_uses_median_categorical_uses_mode(self):
        """The strategy chosen per column follows the column's kind."""
        frame = _read("uc02_marketing_audience.csv")
        settings = MissingOptions.from_preset("safe-fill")
        chosen = handle_missing(frame, settings).strategy_per_column
        # 'age' is numeric → median strategy
        assert chosen["age"] == "median"
        # 'segment' and 'region' are text columns → mode fallback
        assert chosen["segment"] == "mode"
        assert chosen["region"] == "mode"

    def test_per_column_override(self):
        """A per-column constant override takes effect on top of the preset."""
        frame = _read("uc02_marketing_audience.csv")
        settings = MissingOptions.from_preset("safe-fill")
        settings.column_strategies = {"source": "constant"}
        settings.column_fill_values = {"source": "unknown"}
        cleaned = handle_missing(frame, settings).handled_df
        # At least three 'source' cells should now hold the override value.
        filled_with_override = (cleaned["source"] == "unknown").sum()
        assert filled_with_override >= 3

    def test_consent_real_false_not_dropped(self):
        """Filling 'consent' must not reduce the number of existing "true" cells."""
        # NOTE(review): the comparison is against the string "true"; if pandas
        # infers this column as booleans on load, both counts would be 0 and
        # the check would pass trivially — confirm against the fixture.
        frame = _read("uc02_marketing_audience.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        trues_before = (frame["consent"] == "true").sum()
        trues_after = (result.handled_df["consent"] == "true").sum()
        # Filled cells may add to the count, but none may be lost.
        assert trues_after >= trues_before
# ---------------------------------------------------------------------------
# Use case 3 — Consultant intake: threshold drops + fill
# ---------------------------------------------------------------------------
class TestUC03ConsultantIntake:
    """Consultant-intake scenario: threshold-based column and row drops."""

    def test_drop_col_removes_legacy_fields(self):
        """Columns that are entirely missing are dropped at a 0.99 threshold."""
        intake = _read("uc03_consultant_intake.csv", dtype_str=True)
        settings = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_col",
            col_drop_threshold=0.99,
        )
        outcome = handle_missing(intake, settings)
        # Per the fixture, both of these columns are 100% missing.
        for column in ("internal_id_legacy", "beta_field"):
            assert column in outcome.columns_dropped

    def test_drop_row_removes_mostly_empty_respondents(self):
        """``drop_both`` removes sparse columns, then rows above 50% missing."""
        intake = _read("uc03_consultant_intake.csv", dtype_str=True)
        settings = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_both",
            col_drop_threshold=0.99,  # legacy / beta columns go first
            row_drop_threshold=0.5,   # then rows with >50% missing
        )
        outcome = handle_missing(intake, settings)
        # Per the fixture, R-002, R-005, R-007 and R-010 are mostly empty.
        assert outcome.rows_dropped >= 4
        # Respondents with real answers must still be present.
        surviving_ids = set(outcome.handled_df["respondent_id"].tolist())
        expected_survivors = ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012")
        for respondent in expected_survivors:
            assert respondent in surviving_ids
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEC01AllNanColumn:
    """Edge case: a column in which every cell is missing."""

    def test_fill_skips_all_nan_column(self):
        """Mean fill must not invent a value when the column has no data."""
        frame = _read("ec01_all_nan_column.csv")
        outcome = handle_missing(frame, MissingOptions(strategy="mean"))
        # The mean of an all-NaN column is NaN, so nothing can be filled.
        deprecated = outcome.handled_df["deprecated_field"]
        assert deprecated.isna().all()
        assert outcome.cells_filled == 0

    def test_drop_col_catches_all_nan(self):
        """``drop_col`` removes the all-NaN column but keeps populated ones."""
        frame = _read("ec01_all_nan_column.csv")
        settings = MissingOptions(strategy="drop_col", col_drop_threshold=0.99)
        outcome = handle_missing(frame, settings)
        dropped = outcome.columns_dropped
        assert "deprecated_field" in dropped
        assert "name" not in dropped
class TestEC02NoMissing:
    """Edge case: a file that contains no missing values at all."""

    def test_clean_file_is_noop(self):
        """Safe-fill on clean data changes nothing and reports zero activity."""
        clean = _read("ec02_no_missing.csv")
        outcome = handle_missing(clean, MissingOptions.from_preset("safe-fill"))
        # All three activity counters stay at zero.
        assert outcome.sentinels_standardized == 0
        assert outcome.cells_filled == 0
        assert outcome.rows_dropped == 0
        # The output frame equals the input frame.
        pd.testing.assert_frame_equal(outcome.handled_df, clean)
class TestEC03ZeroIsNotMissing:
    """Edge case: zero-like values are real data, not missing markers."""

    def test_zero_preserved(self):
        """Safe-fill keeps every original zero and records no changes."""
        frame = _read("ec03_zero_is_not_missing.csv")
        outcome = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        handled = outcome.handled_df
        # The number of zeros per column is unchanged.
        assert (handled["balance"] == 0).sum() == (frame["balance"] == 0).sum()
        assert (handled["count"] == 0).sum() == (frame["count"] == 0).sum()
        # Nothing was filled or standardized.
        assert outcome.cells_filled == 0
        assert outcome.sentinels_standardized == 0

    def test_is_missing_like_zero_predicate(self):
        """``is_missing_like`` rejects zeros, ``False`` and zero-like strings."""
        for zero_like in (0, 0.0, False, "0", "0.00"):
            assert not is_missing_like(zero_like)
class TestEC04ExcelErrors:
    """Edge case: Excel error literals used as missing markers."""

    def test_excel_error_sentinels_recognized(self):
        """All Excel error sentinels in the fixture are counted."""
        sheet = _read("ec04_excel_errors.csv", dtype_str=True)
        outcome = handle_missing(sheet, MissingOptions(strategy="none"))
        # Per the fixture: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL!
        expected_sentinels = 6
        assert outcome.sentinels_standardized == expected_sentinels
class TestEC05UnicodeWhitespace:
    """Edge case: cells holding only Unicode whitespace."""

    def test_nbsp_and_ideographic_space_count_as_missing(self):
        """Whitespace-only notes become NaN; real notes are kept."""
        frame = _read("ec05_unicode_whitespace.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        handled = outcome.handled_df
        # Per the fixture, rows 1, 2 and 4 hold NBSP / tab / ideographic space.
        assert handled["note"].isna().sum() == 3
        # Rows 0 and 3 carry real text and must be unchanged.
        assert handled.iloc[0]["note"] == "hello"
        assert handled.iloc[3]["note"] == "real"
class TestEC06MixedDtypes:
    """Edge case: a column mixing numbers with text."""

    def test_mixed_column_falls_back_to_mode(self):
        """Median applies to numeric columns; mixed columns fall back to mode."""
        # Native dtypes are used so that 'real_num' stays numeric.
        frame = _read("ec06_mixed_dtypes.csv")
        settings = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="mode",
        )
        chosen = handle_missing(frame, settings).strategy_per_column
        # 'mixed_col' holds text next to numbers, so median cannot apply.
        assert chosen["mixed_col"] == "mode"
        # 'real_num' is a float column, so median is used as requested.
        assert chosen["real_num"] == "median"
class TestEC07RealDataWithPadding:
    """Edge case: real values surrounded by padding whitespace."""

    def test_padded_real_data_not_treated_as_missing(self):
        """Padded but non-empty cells are preserved verbatim."""
        frame = _read("ec07_real_data_with_padding.csv", dtype_str=True)
        handled = handle_missing(frame, MissingOptions(strategy="none")).handled_df
        # Only the whitespace-only / blank cells may turn into NaN; the
        # padded real values below must come back unchanged.
        preserved_cells = (
            (0, "name", " Alice "),
            (2, "name", " Bob "),
            (3, "city", " SF"),
        )
        for position, column, value in preserved_cells:
            assert handled.iloc[position][column] == value
class TestEC08SingleRow:
    """Edge case: a file with exactly one data row."""

    def test_single_row_handles_cleanly(self):
        """Both detection and safe-fill work on a one-row frame."""
        frame = _read("ec08_single_row.csv", dtype_str=True)
        # Detection only: the row holds 'N/A' and '' per the fixture.
        detected = handle_missing(frame, MissingOptions(strategy="none"))
        assert detected.sentinels_standardized == 2
        # Safe-fill: the real name in the single row must survive.
        filled = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert filled.handled_df.iloc[0]["name"] == "Alice"
class TestEC09SingleColumn:
    """Edge case: a file with exactly one column."""

    def test_single_column_works(self):
        """Sentinels in a one-column frame are counted and turned into NaN."""
        frame = _read("ec09_single_column.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # Per the fixture: 'N/A', a whitespace-only cell and '-'.
        expected_sentinels = 3
        assert outcome.sentinels_standardized == expected_sentinels
        assert outcome.handled_df["value"].isna().sum() == expected_sentinels
class TestEC10AllSentinelVariants:
    """Edge case: one row per default sentinel spelling."""

    def test_every_default_sentinel_recognized(self):
        """All 20 sentinel rows are standardized; the real value is kept."""
        frame = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
        outcome = handle_missing(frame, MissingOptions(strategy="none"))
        # Per the fixture: 20 sentinels plus one genuine value.
        assert outcome.sentinels_standardized == 20
        real_rows = outcome.handled_df["sentinel_value"] == "real_value"
        assert real_rows.sum() == 1
class TestEC11ConstantPerColumn:
    """Edge case: constant fill with a different value per column."""

    def test_per_column_fill_values(self):
        """Each column's blanks receive that column's configured constant."""
        frame = _read("ec11_constant_per_column.csv", dtype_str=True)
        fill_values = {
            "country": "USA",
            "salary": "0",
            "department": "Unassigned",
        }
        settings = MissingOptions(
            strategy="constant",
            column_fill_values=fill_values,
        )
        handled = handle_missing(frame, settings).handled_df
        # Per the fixture: 1 UK row, 2 USA rows and 2 blanks, so filling
        # the blanks with "USA" gives 4 USA rows and leaves UK untouched.
        country = handled["country"]
        assert (country == "USA").sum() == 4
        assert (country == "UK").sum() == 1
        assert (handled["department"] == "Unassigned").sum() >= 2
class TestEC12DropThresholdBoundary:
    """Edge case: row-drop threshold boundaries (comparison is strict ``>``)."""

    def test_threshold_one_never_drops(self):
        """A threshold of 1.0 can never be exceeded, so no row is dropped."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=1.0)
        outcome = handle_missing(frame, settings)
        assert outcome.rows_dropped == 0

    def test_threshold_just_under_one_drops_fully_missing(self):
        """At 0.99 only rows that are entirely missing exceed the threshold."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        settings = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.99,
            columns=["a", "b", "c", "d"],  # 'id' is left out of scope
        )
        outcome = handle_missing(frame, settings)
        # Per the fixture, only row 3 (id=4) is NaN in all four columns.
        assert outcome.rows_dropped == 1

    def test_threshold_half_drops_majority_missing(self):
        """At 0.5 only rows with more than half their cells missing drop."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        settings = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.5,
            columns=["a", "b", "c", "d"],
        )
        outcome = handle_missing(frame, settings)
        # Missing fraction over [a, b, c, d], per the fixture:
        #   row 0: 0/4 = 0.00 -> keep
        #   row 1: 2/4 = 0.50 -> keep (equal is not greater)
        #   row 2: 3/4 = 0.75 -> drop
        #   row 3: 4/4 = 1.00 -> drop
        #   row 4: 2/4 = 0.50 -> keep
        assert outcome.rows_dropped == 2

    def test_threshold_zero_drops_any_missing(self):
        """At 0.0 any row with at least one missing cell is dropped."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        settings = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.0,
            columns=["a", "b", "c", "d"],
        )
        outcome = handle_missing(frame, settings)
        # Per the fixture, only row 0 is complete.
        assert outcome.rows_dropped == 4
class TestEC13FfillLeadingNan:
    """Edge case: forward fill on a series that starts with NaN."""

    def test_leading_nan_run_survives_ffill(self):
        """Leading NaNs stay; later gaps take the last value seen."""
        frame = _read("ec13_ffill_leading_nan.csv")
        outcome = handle_missing(frame, MissingOptions(strategy="ffill"))
        price = outcome.handled_df["price"]
        # Nothing precedes the first two rows, so they cannot be filled.
        for position in (0, 1):
            assert pd.isna(price.iloc[position])
        # Gaps in the middle are filled forward.
        for position in (3, 4):
            assert price.iloc[position] == 100.0
        # The trailing gap takes the last observed value.
        assert price.iloc[6] == 150.0
class TestEC14InterpolateFallback:
    """Edge case: interpolation requested on non-numeric columns."""

    def test_interpolate_on_non_numeric_falls_back(self):
        """Object columns cannot be interpolated and use mode instead."""
        frame = _read("ec14_interpolate_fallback.csv", dtype_str=True)
        settings = MissingOptions(
            strategy="interpolate",
            categorical_strategy="mode",
        )
        chosen = handle_missing(frame, settings).strategy_per_column
        # Everything was read as strings, so both columns fall back.
        for column in ("category", "value"):
            assert chosen[column] == "mode"
class TestEC15HeadersOnly:
    """Edge case: a file with a header row and no data rows."""

    def test_empty_body_does_not_crash(self):
        """Every preset is a no-op on a frame without rows."""
        frame = _read("ec15_headers_only.csv")
        preset_names = ("detect-only", "safe-fill", "drop-incomplete")
        for preset_name in preset_names:
            outcome = handle_missing(frame, MissingOptions.from_preset(preset_name))
            assert len(outcome.handled_df) == 0
            assert outcome.cells_filled == 0
            assert outcome.rows_dropped == 0
class TestEC16Idempotency:
    """Edge case: running the handler on its own output changes nothing."""

    def test_safe_fill_is_idempotent(self):
        """A second safe-fill pass yields the same frame and zero activity."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        settings = MissingOptions.from_preset("safe-fill")
        pass_one = handle_missing(frame, settings)
        pass_two = handle_missing(pass_one.handled_df, settings)
        # Compare with fresh indexes so only the cell contents matter.
        after_second = pass_two.handled_df.reset_index(drop=True)
        after_first = pass_one.handled_df.reset_index(drop=True)
        pd.testing.assert_frame_equal(after_second, after_first)
        assert pass_two.cells_filled == 0
        assert pass_two.sentinels_standardized == 0

    def test_detect_only_is_idempotent(self):
        """A second detect-only pass finds no sentinels left to standardize."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        settings = MissingOptions.from_preset("detect-only")
        pass_one = handle_missing(frame, settings)
        pass_two = handle_missing(pass_one.handled_df, settings)
        assert pass_two.sentinels_standardized == 0
# ---------------------------------------------------------------------------
# Whole-corpus property tests
# ---------------------------------------------------------------------------
# File names of every CSV fixture in TEST_DATA, sorted so that the
# parametrized test IDs below come out in a stable order.
ALL_FIXTURES = sorted([csv_path.name for csv_path in TEST_DATA.glob("*.csv")])
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_handle_missing_does_not_mutate_input(fixture):
    """Running safe-fill on any fixture leaves the caller's frame unchanged."""
    source = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    has_no_columns = len(source.columns) == 0
    if source.empty and has_no_columns:
        pytest.skip(f"{fixture}: completely empty file")
    # Take a deep copy before the call and compare the input against it after.
    before = source.copy(deep=True)
    handle_missing(source, MissingOptions.from_preset("safe-fill"))
    pd.testing.assert_frame_equal(source, before)
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_profile_runs_on_every_fixture(fixture):
    """``profile_missing`` runs on each corpus file and reports its size."""
    source = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    profile = profile_missing(source, MissingOptions())
    row_count, column_count = source.shape
    assert profile.rows_total == row_count
    assert profile.cells_total == row_count * column_count

324
tests/test_pipeline.py Normal file
View File

@@ -0,0 +1,324 @@
"""Tests for src/core/pipeline.py."""
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import pytest
from src.core.errors import ConfigError, InputValidationError
from src.core.pipeline import (
Pipeline,
PipelineResult,
SOFT_DEPENDENCIES,
Step,
StepResult,
TOOL_ADAPTERS,
TOOL_NAMES,
recommended_pipeline,
run_pipeline,
validate_pipeline,
)
# ---------------------------------------------------------------------------
# Step / Pipeline construction
# ---------------------------------------------------------------------------
class TestStep:
    """Construction and defaults of a single pipeline ``Step``."""

    def test_unknown_tool_raises(self):
        """An unrecognised tool name is rejected with ``ConfigError``."""
        with pytest.raises(ConfigError):
            Step(tool="bogus_tool")

    def test_default_options_empty_dict(self):
        """A step built from a tool name alone has no options and is enabled."""
        step = Step(tool="text_clean")
        assert step.options == {}
        assert step.enabled is True

    def test_display_name_falls_back_to_tool(self):
        """``display_name`` returns the custom name, else the tool name."""
        unnamed = Step(tool="dedup")
        assert unnamed.display_name() == "dedup"
        named = Step(tool="dedup", name="Final dedup")
        assert named.display_name() == "Final dedup"
class TestPipelineSerialization:
    """Round-tripping a ``Pipeline`` through dicts and JSON files."""

    def test_roundtrip_dict(self):
        """``to_dict`` followed by ``from_dict`` keeps steps and options."""
        original = Pipeline(steps=[
            Step("text_clean", {"trim": True}),
            Step("dedup", {"survivor_rule": "first"}),
        ])
        restored = Pipeline.from_dict(original.to_dict())
        assert len(restored.steps) == 2
        first_step, second_step = restored.steps[0], restored.steps[1]
        assert first_step.tool == "text_clean"
        assert second_step.options["survivor_rule"] == "first"

    def test_roundtrip_file(self, tmp_path):
        """``to_file`` followed by ``from_file`` keeps the step's tool."""
        target = tmp_path / "p.json"
        Pipeline(steps=[Step("text_clean")]).to_file(target)
        restored = Pipeline.from_file(target)
        assert restored.steps[0].tool == "text_clean"

    def test_from_dict_missing_steps_key(self):
        """A dict without a ``steps`` key raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            Pipeline.from_dict({})

    def test_from_dict_missing_tool(self):
        """A step entry without a ``tool`` key raises ``ConfigError``."""
        step_without_tool = {"options": {}}
        with pytest.raises(ConfigError):
            Pipeline.from_dict({"steps": [step_without_tool]})
# ---------------------------------------------------------------------------
# recommended_pipeline
# ---------------------------------------------------------------------------
class TestRecommendedPipeline:
    """Behaviour of the ``recommended_pipeline`` factory."""

    def test_default_order(self):
        """With no arguments the factory yields the four default tools in order."""
        tools = [step.tool for step in recommended_pipeline().steps]
        assert tools == [
            "text_clean", "format_standardize", "missing", "dedup",
        ]

    def test_default_passes_validation(self):
        """The default pipeline produces no validation warnings."""
        assert validate_pipeline(recommended_pipeline()) == []

    def test_include_overrides_default(self):
        """``include`` replaces the default tool list."""
        pipeline = recommended_pipeline(include=["text_clean", "missing"])
        tools = [step.tool for step in pipeline.steps]
        assert tools == ["text_clean", "missing"]

    def test_options_seed_reaches_step(self):
        """Per-tool options passed to the factory end up on the matching step."""
        seed = {"text_clean": {"trim": False}}
        pipeline = recommended_pipeline(options=seed)
        assert pipeline.steps[0].options == {"trim": False}

    def test_unknown_tool_raises(self):
        """An unknown name in ``include`` raises ``InputValidationError``."""
        with pytest.raises(InputValidationError):
            recommended_pipeline(include=["bogus"])

    def test_can_place_column_map_first_or_last(self):
        """``column_map`` may appear early or late without any warning."""
        placements = (
            ["column_map", "text_clean", "format_standardize", "missing", "dedup"],
            ["text_clean", "format_standardize", "missing", "column_map", "dedup"],
        )
        # Build both pipelines first, then validate each one.
        pipelines = [recommended_pipeline(include=tools) for tools in placements]
        for pipeline in pipelines:
            assert validate_pipeline(pipeline) == []
# ---------------------------------------------------------------------------
# validate_pipeline — soft dependencies
# ---------------------------------------------------------------------------
class TestValidatePipeline:
    """Soft-dependency warnings produced by ``validate_pipeline``."""

    def test_in_order_no_warnings(self):
        """The recommended ordering validates without warnings."""
        assert validate_pipeline(recommended_pipeline()) == []

    def test_dedup_before_text_clean_warns(self):
        """Dedup ahead of text_clean yields exactly one warning naming both."""
        pipeline = Pipeline(steps=[Step("dedup"), Step("text_clean")])
        warnings_found = validate_pipeline(pipeline)
        assert len(warnings_found) == 1
        message = warnings_found[0]
        assert "dedup" in message and "text_clean" in message

    def test_format_before_text_clean_warns(self):
        """format_standardize ahead of text_clean is mentioned in a warning."""
        pipeline = Pipeline(steps=[Step("format_standardize"), Step("text_clean")])
        warnings_found = validate_pipeline(pipeline)
        assert any("format_standardize" in message for message in warnings_found)

    def test_disabled_steps_ignored(self):
        """A disabled out-of-order step does not trigger a warning."""
        pipeline = Pipeline(steps=[
            Step("dedup", enabled=False),
            Step("text_clean"),
        ])
        assert validate_pipeline(pipeline) == []

    def test_duplicate_tool_does_not_double_warn(self):
        """Repeating the same tool (two-pass cleaning) produces no warnings."""
        two_pass = [Step("text_clean"), Step("text_clean")]
        assert validate_pipeline(Pipeline(steps=two_pass)) == []
# ---------------------------------------------------------------------------
# run_pipeline — execution
# ---------------------------------------------------------------------------
@pytest.fixture
def messy_df():
    """Return a five-row frame of untidy name / phone / country strings."""
    names = [" Alice ", "BOB", "N/A", "", "charlie "]
    phones = [
        "(415) 555-1234",
        "+44 20 7946 0958",
        "03-3210-7000",
        "",
        "(415) 555-1234",
    ]
    countries = ["US", "GB", "JP", "", "US"]
    return pd.DataFrame({"name": names, "phone": phones, "country": countries})
class TestRunPipeline:
    """End-to-end execution behaviour of ``run_pipeline``."""

    def test_recommended_pipeline_runs_end_to_end(self, messy_df):
        """The default pipeline runs, shrinks the frame and emits no warnings."""
        tool_options = {
            "format_standardize": {
                "column_types": {"phone": "phone"},
                "phone_country_column": "country",
            },
            "missing": {"strategy": "none"},
        }
        outcome = run_pipeline(messy_df, recommended_pipeline(options=tool_options))
        assert isinstance(outcome, PipelineResult)
        assert outcome.initial_rows == 5
        # The final dedup step is expected to drop at least one row (the
        # fixture repeats one phone number).
        assert outcome.final_rows < outcome.initial_rows
        assert outcome.warnings == []

    def test_initial_df_not_mutated(self, messy_df):
        """The caller's frame is identical before and after the run."""
        before = messy_df.copy(deep=True)
        run_pipeline(messy_df, recommended_pipeline())
        pd.testing.assert_frame_equal(messy_df, before)

    def test_disabled_step_skipped(self, messy_df):
        """Disabled steps are reported as skipped; enabled ones are not."""
        pipeline = Pipeline(steps=[
            Step("text_clean", enabled=False),
            Step("missing", options={"strategy": "none"}),
        ])
        outcome = run_pipeline(messy_df, pipeline)
        disabled_result, enabled_result = outcome.step_results[0], outcome.step_results[1]
        assert disabled_result.skipped is True
        assert enabled_result.skipped is False

    def test_step_results_ordered_and_timed(self, messy_df):
        """One timed result per step, reported in pipeline order."""
        pipeline = recommended_pipeline(options={"missing": {"strategy": "none"}})
        outcome = run_pipeline(messy_df, pipeline)
        assert len(outcome.step_results) == 4
        for step_result in outcome.step_results:
            assert step_result.elapsed_seconds >= 0
        executed_tools = [step_result.step.tool for step_result in outcome.step_results]
        assert executed_tools == [
            "text_clean", "format_standardize", "missing", "dedup",
        ]

    def test_warnings_returned_but_run_proceeds(self, messy_df):
        """Ordering warnings are advisory: every step still executes."""
        pipeline = Pipeline(steps=[
            Step("dedup"),
            Step("text_clean"),
        ])
        outcome = run_pipeline(messy_df, pipeline)
        assert outcome.warnings  # at least one warning was returned
        # No step was skipped despite the warning.
        assert not any(step_result.skipped for step_result in outcome.step_results)

    def test_progress_callback_fires_per_step(self, messy_df):
        """``on_step_complete`` receives one ``StepResult`` per step."""
        received: list[StepResult] = []
        pipeline = Pipeline(steps=[
            Step("text_clean"),
            Step("missing", options={"strategy": "none"}),
        ])
        run_pipeline(messy_df, pipeline, on_step_complete=received.append)
        assert len(received) == 2
        for item in received:
            assert isinstance(item, StepResult)

    def test_progress_callback_exception_does_not_abort(self, messy_df):
        """A raising progress callback does not stop the run."""

        def exploding_callback(_step_result):
            raise RuntimeError("boom")

        pipeline = Pipeline(steps=[Step("text_clean")])
        # This call itself must not raise.
        outcome = run_pipeline(messy_df, pipeline, on_step_complete=exploding_callback)
        assert outcome.final_rows == 5

    def test_stop_on_error_default(self, messy_df):
        """By default a failing step propagates its exception."""
        # Pointing format_standardize at a missing column forces the error.
        bad_options = {"column_types": {"does_not_exist": "phone"}}
        pipeline = Pipeline(steps=[Step("format_standardize", options=bad_options)])
        with pytest.raises(InputValidationError):
            run_pipeline(messy_df, pipeline)

    def test_continue_on_error_carries_previous_df(self, messy_df):
        """With ``stop_on_error=False`` later steps run after a failure."""
        pipeline = Pipeline(steps=[
            Step("text_clean"),
            Step("format_standardize", options={
                "column_types": {"does_not_exist": "phone"},
            }),
            Step("missing", options={"strategy": "none"}),
        ])
        outcome = run_pipeline(messy_df, pipeline, stop_on_error=False)
        # The second step records its error; the third completes cleanly.
        failed_step, following_step = outcome.step_results[1], outcome.step_results[2]
        assert failed_step.error is not None
        assert following_step.error is None
        assert outcome.final_rows == 5

    def test_non_dataframe_input(self):
        """Passing something other than a DataFrame raises ``InputValidationError``."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            run_pipeline(not_a_frame, recommended_pipeline())  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# Per-tool adapter sanity
# ---------------------------------------------------------------------------
class TestAdapters:
    """Sanity checks on the per-tool entries of ``TOOL_ADAPTERS``."""

    @pytest.mark.parametrize("tool", TOOL_NAMES)
    def test_adapter_with_default_options_runs(self, tool, messy_df):
        """Every adapter accepts empty options and returns ``(df, summary)``."""
        adapter = TOOL_ADAPTERS[tool]
        frame_out, summary = adapter(messy_df, {})
        assert isinstance(frame_out, pd.DataFrame)
        assert isinstance(summary, dict)

    def test_format_standardize_adapter_passes_column_types(self, messy_df):
        """``column_types`` reaches the tool: only 'phone' is processed."""
        adapter = TOOL_ADAPTERS["format_standardize"]
        _, summary = adapter(messy_df, {"column_types": {"phone": "phone"}})
        assert summary["columns_processed"] == ["phone"]

    def test_dedup_adapter_with_unknown_survivor_rule_raises(self, messy_df):
        """An unrecognised ``survivor_rule`` raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            TOOL_ADAPTERS["dedup"](messy_df, {"survivor_rule": "bogus"})
# ---------------------------------------------------------------------------
# SOFT_DEPENDENCIES integrity
# ---------------------------------------------------------------------------
class TestSoftDependencies:
    """Integrity checks on the ``SOFT_DEPENDENCIES`` table."""

    def test_every_pair_uses_known_tools(self):
        """Both ends of every dependency are registered tool names."""
        for before_tool, after_tool, _ in SOFT_DEPENDENCIES:
            assert before_tool in TOOL_NAMES
            assert after_tool in TOOL_NAMES

    def test_all_reasons_non_empty(self):
        """Every dependency carries a non-trivial string explanation."""
        for _, _, reason in SOFT_DEPENDENCIES:
            assert reason and isinstance(reason, str)
            # A real sentence is expected: strictly more than 20 characters.
            assert len(reason) > 20

    def test_dependencies_form_a_dag(self):
        """The dependency graph has no cycles (Kahn's algorithm)."""
        from collections import defaultdict, deque

        # Adjacency list plus incoming-edge count for each tool.
        successors: dict[str, list[str]] = defaultdict(list)
        pending: dict[str, int] = {tool: 0 for tool in TOOL_NAMES}
        for before_tool, after_tool, _ in SOFT_DEPENDENCIES:
            successors[before_tool].append(after_tool)
            pending[after_tool] += 1

        # Start from tools with no prerequisites and release each
        # successor once all of its prerequisites have been emitted.
        ready = deque(tool for tool, count in pending.items() if count == 0)
        order = []
        while ready:
            current = ready.popleft()
            order.append(current)
            for successor in successors[current]:
                pending[successor] -= 1
                if pending[successor] == 0:
                    ready.append(successor)

        # A tool caught in a cycle never reaches zero and is never emitted.
        assert len(order) == len(TOOL_NAMES), (
            f"SOFT_DEPENDENCIES contain a cycle; topo order={order}"
        )