feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -253,16 +253,20 @@ class TestEncodingOverride:
|
||||
|
||||
|
||||
class TestEncodingDecodeFailedFromRepair:
|
||||
def test_decode_replaced_action_surfaces_error_finding(self, tmp_path):
|
||||
# Create a file with a UTF-8 BOM but cp1252 body bytes — utf-8-sig
|
||||
# fails on byte 0x80 (€ in cp1252).
|
||||
def test_lying_bom_recovered_and_flagged(self, tmp_path):
|
||||
# File has a UTF-8 BOM but the body bytes are cp1252 (0x80 = € in
|
||||
# cp1252; not a valid UTF-8 continuation byte). Detector should
|
||||
# recover transparently to cp1252 and surface an
|
||||
# ``encoding_lying_bom`` warn so the user knows.
|
||||
f = tmp_path / "lying_bom.csv"
|
||||
f.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")
|
||||
findings = analyze(f)
|
||||
ids = {x.id for x in findings}
|
||||
assert "encoding_decode_failed" in ids
|
||||
bad = next(x for x in findings if x.id == "encoding_decode_failed")
|
||||
assert bad.severity == "error"
|
||||
assert "encoding_lying_bom" in ids
|
||||
bad = next(x for x in findings if x.id == "encoding_lying_bom")
|
||||
assert bad.severity == "warn"
|
||||
# Decode should have succeeded — no replacement-character finding.
|
||||
assert "encoding_decode_failed" not in ids
|
||||
|
||||
|
||||
class TestMixedLineEndings:
|
||||
|
||||
374
tests/test_column_mapper.py
Normal file
374
tests/test_column_mapper.py
Normal file
@@ -0,0 +1,374 @@
|
||||
"""Tests for src/core/column_mapper.py."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.errors import ConfigError, InputValidationError
|
||||
from src.core.column_mapper import (
|
||||
MapOptions,
|
||||
PRESETS,
|
||||
TargetField,
|
||||
TargetSchema,
|
||||
coerce_series,
|
||||
infer_mapping,
|
||||
map_columns,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# infer_mapping — fuzzy matcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInferMapping:
    """Checks for ``infer_mapping``, which pairs source headers with target fields."""

    def test_exact_normalized_match(self):
        """``First Name`` / ``Last Name`` headers resolve to the snake_case targets."""
        frame = pd.DataFrame({"First Name": [], "Last Name": []})
        targets = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="last_name"),
        ])
        inferred = infer_mapping(frame, targets)
        assert inferred == {"First Name": "first_name", "Last Name": "last_name"}

    def test_alias_match(self):
        """A header that appears in a field's ``aliases`` list maps to that field."""
        frame = pd.DataFrame({"EmailAddr": []})
        email_field = TargetField(name="email", aliases=["EmailAddr", "email_address"])
        inferred = infer_mapping(frame, TargetSchema(fields=[email_field]))
        assert inferred == {"EmailAddr": "email"}

    def test_below_threshold_excluded(self):
        """With ``threshold=0.6`` the header ``xyz`` is not paired with ``email``."""
        frame = pd.DataFrame({"xyz": []})
        targets = TargetSchema(fields=[TargetField(name="email")])
        inferred = infer_mapping(frame, targets, threshold=0.6)
        assert inferred == {}

    def test_target_matched_at_most_once(self):
        """Two plausible sources compete for one target; only one pairing results."""
        frame = pd.DataFrame({"first_name": [], "fname": []})
        targets = TargetSchema(fields=[TargetField(name="first_name")])
        inferred = infer_mapping(frame, targets)
        # Exact match wins; "fname" stays unmapped.
        assert inferred == {"first_name": "first_name"}

    def test_threshold_zero_matches_anything(self):
        """At ``threshold=0.0`` the single target still receives exactly one source."""
        frame = pd.DataFrame({"a": [], "b": []})
        targets = TargetSchema(fields=[TargetField(name="z")])
        inferred = infer_mapping(frame, targets, threshold=0.0)
        assert len(inferred) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# coerce_series
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCoerceSeries:
    """Checks for ``coerce_series``, which returns ``(converted_series, failure_count)``."""

    def test_integer_clean(self):
        """Numeric strings convert to integers with zero failures."""
        converted, n_failed = coerce_series(pd.Series(["1", "2", "3"]), "integer")
        assert list(converted) == [1, 2, 3]
        assert n_failed == 0

    def test_integer_with_failure(self):
        """A non-numeric entry is counted as a failure and becomes a missing value."""
        converted, n_failed = coerce_series(pd.Series(["1", "bad", "3"]), "integer")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_float_with_thousands_sep(self):
        """Plain decimal strings convert to floats.

        Despite the test name, no thousands separator is exercised here.
        """
        # Plain floats; thousands-sep handling is for format standardizer.
        raw = pd.Series(["1.5", "2.0", "3.25"])
        converted, n_failed = coerce_series(raw, "float")
        assert n_failed == 0
        assert converted.iloc[2] == 3.25

    def test_boolean_truthy_falsy(self):
        """The listed truthy/falsy spellings all convert without failures."""
        raw = pd.Series(["true", "false", "Yes", "no", "1", "0"])
        converted, n_failed = coerce_series(raw, "boolean")
        assert n_failed == 0
        assert list(converted) == [True, False, True, False, True, False]

    def test_boolean_unknown_value_fails(self):
        """An unrecognised token (``maybe``) is one failure and yields a missing value."""
        converted, n_failed = coerce_series(pd.Series(["true", "maybe"]), "boolean")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_date_iso_format(self):
        """ISO ``YYYY-MM-DD`` strings parse into values exposing a ``year`` attribute."""
        raw = pd.Series(["2025-01-15", "2025-02-20"])
        converted, n_failed = coerce_series(raw, "date")
        assert n_failed == 0
        assert converted.iloc[0].year == 2025

    def test_date_failure(self):
        """An unparseable date string is one failure and yields a missing value."""
        raw = pd.Series(["2025-01-15", "garbage"])
        converted, n_failed = coerce_series(raw, "date")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_string_passthrough(self):
        """Integers coerced to ``string`` come back with the pandas ``string`` dtype."""
        converted, n_failed = coerce_series(pd.Series([1, 2, 3]), "string")
        assert n_failed == 0
        assert converted.dtype.name == "string"

    def test_auto_returns_unchanged(self):
        """``auto`` hands back the very same Series object, not a copy."""
        original = pd.Series([1, 2])
        converted, n_failed = coerce_series(original, "auto")
        assert n_failed == 0
        assert converted is original

    def test_unknown_dtype_raises(self):
        """An unsupported dtype name is rejected with ``InputValidationError``."""
        single = pd.Series([1])
        with pytest.raises(InputValidationError):
            coerce_series(single, "bogus")  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# map_columns — explicit mapping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMapColumnsExplicit:
    """``map_columns`` driven purely by an explicit ``mapping`` dict (no schema)."""

    def test_simple_rename(self):
        """Both columns are renamed and the rename counter reflects that."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        result = map_columns(frame, MapOptions(mapping={"a": "alpha", "b": "beta"}))
        assert list(result.mapped_df.columns) == ["alpha", "beta"]
        assert result.columns_renamed == 2

    def test_unknown_source_raises(self):
        """Mapping a source column that is absent from the frame is an input error."""
        frame = pd.DataFrame({"a": [1]})
        options = MapOptions(mapping={"missing": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_duplicate_target_raises(self):
        """Two sources pointing at the same target name is an input error."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_unmapped_keep(self):
        """``unmapped="keep"`` leaves the extra column in place and reports it."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="keep")
        result = map_columns(frame, options)
        assert "b" in result.mapped_df.columns
        assert result.unmapped_kept == ["b"]

    def test_unmapped_drop(self):
        """``unmapped="drop"`` removes the extra column and reports it as dropped."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="drop")
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["alpha"]
        assert result.columns_dropped == ["b"]

    def test_unmapped_error(self):
        """``unmapped="error"`` refuses to run while an unmapped column remains."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="error")
        with pytest.raises(InputValidationError):
            map_columns(frame, options)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# map_columns — schema + auto-inference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMapColumnsWithSchema:
    """``map_columns`` with a ``TargetSchema``: inference, required fields, ordering, coercion."""

    def test_auto_infer_renames(self):
        """With ``auto_infer`` on, headers are renamed and the inferred pairs reported."""
        frame = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="last_name"),
        ])
        result = map_columns(frame, MapOptions(schema=targets, auto_infer=True))
        output_columns = result.mapped_df.columns
        assert "first_name" in output_columns
        assert "last_name" in output_columns
        assert result.inferred_pairs == {"First Name": "first_name", "Last Name": "last_name"}

    def test_explicit_overrides_inferred(self):
        """An explicit pair claims the target, so ``name`` is not mapped to it."""
        frame = pd.DataFrame({"name": ["A"], "fname": ["B"]})
        targets = TargetSchema(fields=[TargetField(name="first_name")])
        options = MapOptions(
            schema=targets,
            mapping={"fname": "first_name"},
            auto_infer=True,
        )
        final_mapping = map_columns(frame, options).mapping
        assert final_mapping["fname"] == "first_name"
        assert "name" not in final_mapping

    def test_required_missing_raises(self):
        """A required target with no source raises when enforcement is on."""
        frame = pd.DataFrame({"first_name": ["A"]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        options = MapOptions(schema=targets, auto_infer=False, enforce_required=True)
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_required_missing_with_default_added(self):
        """A schema field with a ``default`` and no source is added as a filled column."""
        frame = pd.DataFrame({"first_name": ["A"]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="source", required=False, default="import"),
        ])
        result = map_columns(frame, MapOptions(schema=targets, auto_infer=False))
        assert "source" in result.mapped_df.columns
        assert result.mapped_df.iloc[0]["source"] == "import"
        assert result.columns_added == ["source"]

    def test_required_missing_disabled(self):
        """With enforcement off, the missing required target is reported, not raised."""
        frame = pd.DataFrame({"first_name": ["A"]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        options = MapOptions(schema=targets, auto_infer=False, enforce_required=False)
        result = map_columns(frame, options)
        assert "email" in result.missing_required_targets

    def test_reorder_to_schema(self):
        """``reorder_to_schema`` emits columns in schema order, not source order."""
        frame = pd.DataFrame({"z": [1], "a": [2], "m": [3]})
        targets = TargetSchema(fields=[
            TargetField(name="a"),
            TargetField(name="m"),
            TargetField(name="z"),
        ])
        options = MapOptions(schema=targets, auto_infer=True, reorder_to_schema=True)
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["a", "m", "z"]

    def test_coerce_types(self):
        """Schema dtypes are applied and per-column failure counts are reported."""
        frame = pd.DataFrame({"age": ["30", "bad", "40"], "active": ["true", "no", "yes"]})
        targets = TargetSchema(fields=[
            TargetField(name="age", dtype="integer"),
            TargetField(name="active", dtype="boolean"),
        ])
        options = MapOptions(schema=targets, auto_infer=True, coerce_types=True)
        result = map_columns(frame, options)
        mapped = result.mapped_df
        assert mapped["age"].iloc[0] == 30
        # Accept the ``True`` singleton or any other truthy scalar.
        first_active = mapped["active"].iloc[0]
        assert first_active is True or first_active
        # Only "bad" in ``age`` fails; ``active`` contributes no entry.
        assert result.coercion_failures == {"age": 1}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Presets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPresets:
    """Named option presets obtained through ``MapOptions.from_preset``."""

    def test_strict_schema_drops_and_coerces_and_reorders(self):
        """``strict-schema`` outputs only schema columns, in order, and drops the extra."""
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        options = MapOptions.from_preset("strict-schema")
        options.schema = targets
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["first_name", "email"]
        assert result.columns_dropped == ["extra"]

    def test_lenient_keeps_extras(self):
        """``lenient-schema`` leaves a column outside the schema in the output."""
        frame = pd.DataFrame({"First Name": ["A"], "extra": [1]})
        options = MapOptions.from_preset("lenient-schema")
        options.schema = TargetSchema(fields=[TargetField(name="first_name")])
        result = map_columns(frame, options)
        assert "extra" in result.mapped_df.columns

    def test_unknown_preset(self):
        """An unrecognised preset name is a configuration error."""
        with pytest.raises(ConfigError):
            MapOptions.from_preset("does-not-exist")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema serialization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSchemaIO:
    """Serialisation round-trips for ``TargetSchema`` and ``MapOptions``."""

    def test_roundtrip_dict(self):
        """``to_dict`` then ``from_dict`` preserves names, ``required`` and ``default``."""
        original = TargetSchema(fields=[
            TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]),
            TargetField(name="y", default="z"),
        ])
        restored = TargetSchema.from_dict(original.to_dict())
        assert restored.field_names() == ["x", "y"]
        first_field, second_field = restored.fields[0], restored.fields[1]
        assert first_field.required is True
        assert second_field.default == "z"

    def test_from_dict_string_field(self):
        """Bare strings are accepted as field entries."""
        # Allow shorthand: bare string defaults to dtype=auto.
        restored = TargetSchema.from_dict({"fields": ["a", "b"]})
        assert restored.field_names() == ["a", "b"]

    def test_from_dict_unknown_dtype_raises(self):
        """An unsupported dtype in a field spec is a configuration error."""
        payload = {"fields": [{"name": "x", "dtype": "bogus"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_from_dict_missing_name_raises(self):
        """A field spec without ``name`` is a configuration error."""
        payload = {"fields": [{"dtype": "string"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_options_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` reload with the same settings and schema."""
        options = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="x", dtype="string")]),
            mapping={"a": "x"},
            unmapped="drop",
            coerce_types=True,
            reorder_to_schema=True,
        )
        config_path = tmp_path / "cfg.json"
        options.to_file(config_path)
        restored = MapOptions.from_file(config_path)
        assert restored.mapping == {"a": "x"}
        assert restored.unmapped == "drop"
        assert restored.coerce_types is True
        assert restored.schema is not None
        assert restored.schema.field_names() == ["x"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestValidation:
    """Rejection of invalid options and invalid input objects."""

    def test_invalid_unmapped_strategy(self):
        """An unknown ``unmapped`` strategy fails ``validate`` with an input error."""
        bad_options = MapOptions(unmapped="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            bad_options.validate()

    def test_threshold_out_of_range(self):
        """A ``fuzzy_threshold`` of 1.5 fails ``validate`` with a configuration error."""
        bad_options = MapOptions(fuzzy_threshold=1.5)
        with pytest.raises(ConfigError):
            bad_options.validate()

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is an input error."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            map_columns(not_a_frame)  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Idempotency
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIdempotency:
    """Re-running the mapper is stable, and the caller's frame is left untouched."""

    def test_double_apply_is_stable(self):
        """Feeding the mapped output back through the same options changes nothing."""
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]})
        targets = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="email"),
        ])
        options = MapOptions(schema=targets, auto_infer=True, reorder_to_schema=True)
        once = map_columns(frame, options)
        twice = map_columns(once.mapped_df, options)
        pd.testing.assert_frame_equal(twice.mapped_df, once.mapped_df)

    def test_input_not_mutated(self):
        """The input frame compares equal to a deep copy taken before the call."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        before = frame.copy(deep=True)
        map_columns(frame, MapOptions(mapping={"a": "x"}))
        pd.testing.assert_frame_equal(frame, before)
|
||||
240
tests/test_column_mapper_corpus.py
Normal file
240
tests/test_column_mapper_corpus.py
Normal file
@@ -0,0 +1,240 @@
|
||||
"""Acceptance corpus for the Column Mapper.
|
||||
|
||||
Loads every fixture in ``test-cases/column-mapper-corpus/test_data/``
|
||||
and asserts the documented behaviour against the documented schema.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.errors import InputValidationError
|
||||
from src.core.column_mapper import (
|
||||
MapOptions,
|
||||
TargetField,
|
||||
TargetSchema,
|
||||
map_columns,
|
||||
)
|
||||
|
||||
# Corpus root lives two levels up from this file, under test-cases/column-mapper-corpus.
CORPUS = Path(__file__).resolve().parents[1].joinpath("test-cases", "column-mapper-corpus")
# CSV fixtures consumed by the tests below.
TEST_DATA = CORPUS.joinpath("test_data")
# JSON target-schema definitions that accompany the fixtures.
SCHEMAS = CORPUS.joinpath("schemas")
|
||||
|
||||
|
||||
def _read(name: str) -> pd.DataFrame:
    """Load the CSV fixture called *name* from ``TEST_DATA`` with pandas defaults."""
    fixture_path = TEST_DATA / name
    return pd.read_csv(fixture_path)
|
||||
|
||||
|
||||
def _schema(name: str) -> TargetSchema:
    """Load the target schema file called *name* from ``SCHEMAS``."""
    schema_path = SCHEMAS / name
    return TargetSchema.from_file(schema_path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UC01 — CRM import
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC01CrmImport:
    """UC01: CRM import fixture mapped with the ``strict-schema`` preset."""

    def _run_strict(self):
        """Map the UC01 fixture under ``strict-schema``; return ``(schema, result)``."""
        df = _read("uc01_crm_import.csv")
        schema = _schema("uc01_crm_target.json")
        opts = MapOptions.from_preset("strict-schema")
        opts.schema = schema
        return schema, map_columns(df, opts)

    def test_strict_schema_round_trip(self):
        """Required targets exist, the default is filled, extras go, order follows schema."""
        schema, result = self._run_strict()
        mapped = result.mapped_df

        # Every required target is present after the run.
        for required_name in [fld.name for fld in schema.fields if fld.required]:
            assert required_name in mapped.columns

        # 'owner' default added.
        assert "owner" in result.columns_added
        assert (mapped["owner"] == "unassigned").all()

        # No unmapped survivors (strict preset drops extras).
        assert result.unmapped_kept == []

        # Reordered to schema order.
        schema_order = [fld.name for fld in schema.fields]
        leading_columns = list(mapped.columns)[: len(schema_order)]
        assert leading_columns == schema_order

    def test_types_coerced_from_strings(self):
        """The numeric and date columns come out with integer / datetime dtypes."""
        _, result = self._run_strict()
        mapped = result.mapped_df
        # annual_rev → integer (was numeric strings in the source).
        assert pd.api.types.is_integer_dtype(mapped["annual_rev"])
        # created_date → datetime64.
        assert pd.api.types.is_datetime64_any_dtype(mapped["created_date"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UC02 — Multi-vendor unification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC02MultiVendor:
    """UC02: three vendor exports normalised onto one canonical schema."""

    @pytest.mark.parametrize("vendor", ["a", "b", "c"])
    def test_each_vendor_normalises_to_canonical(self, vendor):
        """Each vendor file yields every required canonical column."""
        df = _read(f"uc02_vendor_{vendor}.csv")
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("lenient-schema")
        opts.schema = schema
        # vendor C uses obscure aliases ("FName", "Tel")
        opts.fuzzy_threshold = 0.5
        res = map_columns(df, opts)
        output_columns = res.mapped_df.columns
        # Every required canonical field landed in the output.
        for f in schema.fields:
            if not f.required:
                continue
            assert f.name in output_columns, (
                f"vendor {vendor}: missing {f.name}; mapping={res.mapping}"
            )

    def test_concatenated_vendors_share_schema(self):
        """Strictly mapped vendor frames concatenate into the canonical column list."""
        # The point of unification: after each vendor goes through the
        # mapper, the resulting frames stack cleanly.
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("strict-schema")
        opts.schema = schema
        opts.fuzzy_threshold = 0.5

        frames = []
        for v in ("a", "b", "c"):
            vendor_result = map_columns(_read(f"uc02_vendor_{v}.csv"), opts)
            frames.append(vendor_result.mapped_df)

        unified = pd.concat(frames, ignore_index=True)
        canonical_names = [fld.name for fld in schema.fields]
        assert list(unified.columns) == canonical_names
        # Total rows = sum of inputs.
        assert len(unified) == sum(map(len, frames))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# UC03 — Type coercion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC03TypeCoercion:
    """UC03: type coercion of the fixture under the ``lenient-schema`` preset."""

    def _run_lenient(self):
        """Map the UC03 fixture against its schema and return the result object."""
        df = _read("uc03_type_coercion.csv")
        opts = MapOptions.from_preset("lenient-schema")
        opts.schema = _schema("uc03_types.json")
        return map_columns(df, opts)

    def test_documented_failures_are_reported(self):
        """Each of ``age``, ``score`` and ``joined`` reports exactly one failure."""
        failures = self._run_lenient().coercion_failures
        # Bad rows survive as NaN, with counts recorded.
        for column in ("age", "score", "joined"):
            assert failures.get(column) == 1

    def test_coerced_dtypes(self):
        """Output columns carry the schema's dtypes; the bad float becomes missing."""
        out = self._run_lenient().mapped_df
        assert pd.api.types.is_integer_dtype(out["id"])
        assert out["active"].dtype.name == "boolean"
        assert pd.api.types.is_datetime64_any_dtype(out["joined"])
        # Float failures NaN-ify.
        assert pd.isna(out["score"].iloc[1])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEC01DuplicateTarget:
    """EC01: an explicit mapping that sends two sources to one target."""

    def test_two_sources_to_same_target_raises(self):
        """Mapping both ``a`` and ``b`` to ``x`` is rejected as an input error."""
        frame = _read("ec01_duplicate_target.csv")
        colliding = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, colliding)
|
||||
|
||||
|
||||
class TestEC02UnicodeColumns:
    """EC02: non-ASCII (Japanese) source headers in an explicit mapping."""

    def test_japanese_column_renamed(self):
        """Japanese headers rename to ASCII targets; the unmapped column survives."""
        frame = _read("ec02_unicode_columns.csv")
        options = MapOptions(mapping={"名前": "name", "価格": "price"})
        output_columns = map_columns(frame, options).mapped_df.columns
        assert "name" in output_columns
        assert "price" in output_columns
        # Email passes through (unmapped, kept by default).
        assert "Email" in output_columns
|
||||
|
||||
|
||||
class TestEC03WhitespaceHeaders:
    """EC03: headers padded with whitespace must still be matched by inference."""

    def test_header_whitespace_does_not_block_match(self):
        """All three fixture headers end up in the final mapping."""
        frame = _read("ec03_whitespace_headers.csv")
        alias_by_target = [
            ("first_name", "First Name"),
            ("last_name", "Last Name"),
            ("email", "EmailAddr"),
        ]
        targets = TargetSchema(fields=[
            TargetField(name=target, aliases=[alias])
            for target, alias in alias_by_target
        ])
        result = map_columns(frame, MapOptions(schema=targets, auto_infer=True))
        # All three columns should map despite the leading/trailing spaces.
        assert len(result.mapping) == 3
|
||||
|
||||
|
||||
class TestEC04NoMatch:
    """EC04: a fixture whose headers match none of the schema targets."""

    def test_zero_inferred_with_no_match(self):
        """Nothing is inferred, and ``keep`` leaves every source column in place."""
        frame = _read("ec04_no_match.csv")
        targets = TargetSchema(fields=[
            TargetField(name="email"),
            TargetField(name="phone"),
        ])
        options = MapOptions(schema=targets, auto_infer=True, unmapped="keep")
        result = map_columns(frame, options)
        assert result.inferred_pairs == {}
        # Source columns survive as-is under keep.
        source_names = set(frame.columns)
        assert source_names <= set(result.mapped_df.columns)

    def test_no_match_with_unmapped_error(self):
        """With ``unmapped="error"`` the unmatched columns trigger an input error."""
        frame = _read("ec04_no_match.csv")
        options = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="email")]),
            auto_infer=True,
            unmapped="error",
            enforce_required=False,
        )
        with pytest.raises(InputValidationError):
            map_columns(frame, options)
|
||||
|
||||
|
||||
class TestEC05RequiredMissing:
    """EC05: a fixture that lacks a source for a required target."""

    @staticmethod
    def _both_required():
        """Build a schema in which ``first_name`` and ``email`` are both required."""
        return TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])

    def test_required_missing_raises(self):
        """With enforcement on, the missing required target raises an input error."""
        frame = _read("ec05_required_missing.csv")
        options = MapOptions(
            schema=self._both_required(), auto_infer=True, enforce_required=True,
        )
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_disable_enforce_surfaces_in_result(self):
        """With enforcement off, ``email`` is listed in ``missing_required_targets``."""
        frame = _read("ec05_required_missing.csv")
        options = MapOptions(
            schema=self._both_required(), auto_infer=True, enforce_required=False,
        )
        result = map_columns(frame, options)
        assert "email" in result.missing_required_targets
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Whole-corpus property tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# File names of every CSV fixture in the corpus, sorted for a stable test order.
ALL_FIXTURES = sorted(csv_path.name for csv_path in TEST_DATA.glob("*.csv"))


@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_map_columns_does_not_mutate_input(fixture):
    """Run the mapper with default options on each fixture; the input must be unchanged.

    An ``InputValidationError`` from the run is tolerated, because only
    mutation of the input frame is under test here.
    """
    frame = _read(fixture)
    before = frame.copy(deep=True)
    try:
        # identity run; default options.
        map_columns(frame, MapOptions())
    except InputValidationError:
        # ec01 / ec05 raise here — fine, mutation is what we care about.
        pass
    pd.testing.assert_frame_equal(frame, before)
|
||||
@@ -169,8 +169,23 @@ class TestMojibake:
|
||||
assert actual.equals(expected), "14 mojibake default (no repair) differs"
|
||||
|
||||
def test_fixed_variant(self):
|
||||
# --fix-mojibake is Tier 2; the cleaner does not implement it. Mark xfail.
|
||||
pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
|
||||
"""Mojibake auto-repair (ftfy-backed) restores the original text.
|
||||
|
||||
Skipped automatically when ftfy is not installed — the engine
|
||||
falls back to a no-op in that case and the diff would never close.
|
||||
"""
|
||||
try:
|
||||
import ftfy # noqa: F401
|
||||
except ImportError:
|
||||
pytest.skip("ftfy not installed — install ftfy to enable mojibake repair")
|
||||
|
||||
from src.core.fixes import repair_mojibake
|
||||
|
||||
df = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
|
||||
expected = _read_csv_strict(EXPECTED / "14_mojibake__fixed.csv")
|
||||
repaired, _ = repair_mojibake(df)
|
||||
actual = repaired.reset_index(drop=True)
|
||||
assert actual.equals(expected), "14 mojibake fixed variant differs"
|
||||
|
||||
|
||||
class TestEmptyFile:
|
||||
|
||||
@@ -14,12 +14,11 @@ What's tested
|
||||
REJECT / LOW_CONFIDENCE.
|
||||
3. The decoded DataFrame matches the canonical reference content.
|
||||
|
||||
Cases where the current implementation is known to fail (charset-
|
||||
normalizer label drift on byte-equivalent encodings, ``repair_bytes``
|
||||
NUL-strip destroying UTF-16, the "lying BOM" pathological case) are
|
||||
marked ``xfail`` so they surface in the report as documented gaps.
|
||||
A future fix that makes the case pass will flip xfail to xpass and the
|
||||
test owner can drop the marker.
|
||||
Detection arbiter (cp1250→cp1252, mac_iceland→mac_roman, lying-BOM
|
||||
recovery) and a language-aware probe (Cyrillic / EE-Latin coverage)
|
||||
together close every documented gap; the ``KNOWN_*_FAILURES`` dicts
|
||||
below are kept empty as a tripwire — re-add an entry only when a real
|
||||
limitation surfaces.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -41,27 +40,9 @@ REFERENCE_DIR = CORPUS / "reference"
|
||||
|
||||
# Known failures the analyzer does not yet handle correctly. Each entry
|
||||
# has a one-line reason — drop the entry once a fix lands.
|
||||
KNOWN_DETECTION_FAILURES = {
|
||||
"E03_western_basic_cp1252.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
|
||||
"E04_western_basic_latin1.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
|
||||
"E05_western_basic_latin9.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
|
||||
"E06_western_basic_macroman.csv": "returns mac_iceland (same family) instead of mac_roman",
|
||||
"E11_western_extended_cp1252.csv": "charset-normalizer returns cp1250 for cp1252 content",
|
||||
"E15_eastern_european_iso88592.csv": "charset-normalizer returns cp1258 for ISO-8859-2 content",
|
||||
"E18_cyrillic_koi8r.csv": "charset-normalizer returns shift_jis_2004 for KOI8-R content",
|
||||
}
|
||||
KNOWN_DETECTION_FAILURES: dict[str, str] = {}
|
||||
|
||||
KNOWN_DECODE_FAILURES = {
|
||||
"E03_western_basic_cp1252.csv": "decoded as cp1250 — different mapping at 0xF1 (ñ vs ń)",
|
||||
"E04_western_basic_latin1.csv": "decoded as cp1250 — different mapping at 0xF1",
|
||||
"E05_western_basic_latin9.csv": "decoded as cp1250 — different mapping at 0xF1",
|
||||
"E10_western_extended_utf8.csv": "byte-level smart-quote fold rewrites U+201C/U+201D to ASCII before parse",
|
||||
"E11_western_extended_cp1252.csv": "wrong encoding + smart-quote fold",
|
||||
"E12_western_extended_utf16le.csv": "byte-level smart-quote fold rewrites U+201C/U+201D before parse",
|
||||
"E15_eastern_european_iso88592.csv": "wrong encoding (cp1258 != ISO-8859-2)",
|
||||
"E18_cyrillic_koi8r.csv": "wrong encoding (shift_jis_2004 != KOI8-R)",
|
||||
"E30_pathological_lying_bom.csv": "utf-8-sig fails on cp1252 body bytes; needs lying-BOM recovery",
|
||||
}
|
||||
KNOWN_DECODE_FAILURES: dict[str, str] = {}
|
||||
|
||||
|
||||
def _normalize_encoding(name: str) -> str:
|
||||
@@ -164,7 +145,12 @@ def _decodable_entries():
|
||||
],
|
||||
)
|
||||
def test_decoded_matches_reference(entry):
|
||||
df, _, _ = _load_for_analysis(CORPUS / entry["filename"], sample_rows=1000)
|
||||
# The reference files preserve smart quotes — disable byte-level
|
||||
# smart-quote folding so this round-trip identity test isn't
|
||||
# confounded by the analyzer's deliberate parser-safety fold.
|
||||
df, _, _ = _load_for_analysis(
|
||||
CORPUS / entry["filename"], sample_rows=1000, fold_quotes=False,
|
||||
)
|
||||
ref_text = REFERENCES[entry["canonical_content_id"]]
|
||||
ref_rows = list(csv.reader(io.StringIO(ref_text)))
|
||||
if not ref_rows:
|
||||
|
||||
@@ -230,8 +230,27 @@ class TestRepairMojibake:
|
||||
|
||||
|
||||
class TestRepairMojibakeNoFtfy:
    """Behaviour of ``repair_mojibake`` when ``ftfy`` cannot be imported."""

    def test_returns_input_unchanged_without_ftfy(self, monkeypatch):
        """Exercise the no-op path regardless of whether ftfy is installed.

        ``repair_mojibake`` lazy-imports ftfy inside the function body, so
        we hide ``ftfy`` from ``sys.modules`` and from import resolution
        before calling. The function must then degrade to ``(df, 0)``
        without raising.
        """
        import builtins
        import sys

        # Drop any cached module so the lazy import has to go through
        # ``builtins.__import__``, which is patched just below.
        monkeypatch.delitem(sys.modules, "ftfy", raising=False)
        real_import = builtins.__import__

        def fake_import(name, *args, **kwargs):
            # Block the package and its submodules; delegate everything else.
            if name == "ftfy" or name.startswith("ftfy."):
                raise ImportError("ftfy hidden by test")
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", fake_import)

        df = pd.DataFrame({"x": ["café"]})
        out, changed = repair_mojibake(df)
        assert changed == 0
        # The no-op path must hand the data back untouched.
        assert out.equals(df)
|
||||
|
||||
105
tests/test_format_intl_corpus.py
Normal file
105
tests/test_format_intl_corpus.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""Acceptance corpus for international format standardization.
|
||||
|
||||
Stresses the rework's three pillars on a single mixed-locale fixture:
|
||||
* Per-row country column drives phone parsing.
|
||||
* ``currency_decimal="auto"`` resolves comma-decimal locales.
|
||||
* Streaming entry point handles the same content unchanged.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.format_standardize import (
|
||||
FieldType,
|
||||
StandardizeOptions,
|
||||
standardize_dataframe,
|
||||
standardize_file,
|
||||
)
|
||||
|
||||
# International corpus directory, resolved from the grandparent directory of
# this file, and the single mixed-locale CSV every test here reads.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "format-cleaner-corpus", "international"
)
FIXTURE = CORPUS.joinpath("intl_phones_addresses.csv")
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def df():
    """Load the international fixture once per module with every cell as text.

    ``keep_default_na=False`` stops pandas from turning blank cells or
    NA-looking strings into NaN.
    """
    frame = pd.read_csv(FIXTURE, keep_default_na=False, dtype=str)
    return frame
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def options():
    """Shared options: typed name/phone/price columns in international mode.

    Phone regions come from the ``country`` column, currency codes are kept
    and the decimal separator is auto-detected.
    """
    typed_columns = {
        "name": FieldType.NAME,
        "phone": FieldType.PHONE,
        "price": FieldType.CURRENCY,
    }
    return StandardizeOptions(
        column_types=typed_columns,
        phone_country_column="country",
        currency_preserve_code=True,
        currency_decimal="auto",
    )
|
||||
|
||||
|
||||
class TestPhonesByRegion:
    """Phone normalisation driven by each row's ``country`` value."""

    def test_every_row_lands_on_correct_e164_prefix(self, df, options):
        """Every standardized phone must start with its country's prefix."""
        # ISO-2 country code -> E.164 prefix the output is expected to carry.
        prefix_for_country = {
            "US": "+1", "GB": "+44", "RU": "+7", "ES": "+34",
            "FR": "+33", "JP": "+81", "DE": "+49", "IT": "+39",
            "CN": "+86", "IN": "+91", "EG": "+20", "AU": "+61",
            "BR": "+55", "MX": "+52", "KR": "+82", "TR": "+90",
            "IL": "+972", "PL": "+48", "DK": "+45", "SE": "+46",
        }
        standardized = standardize_dataframe(df, options).standardized_df
        # Collect (country, wanted prefix, actual phone) for every miss so a
        # failure reports all offending rows at once.
        bad: list[tuple[str, str, str]] = [
            (country, prefix_for_country[country], phone)
            for country, phone in zip(
                standardized["country"], standardized["phone"]
            )
            if not phone.startswith(prefix_for_country[country])
        ]
        assert not bad, f"phone prefix mismatches: {bad}"
|
||||
|
||||
|
||||
class TestCurrencyByLocale:
    """Currency parsing under ``currency_decimal="auto"``."""

    def test_eu_decimal_comma_resolves_under_auto(self, df, options):
        """No comma may survive in prices from decimal-comma countries."""
        # Spain, France, Germany, Italy, Brazil and Sweden write the decimal
        # separator as a comma.
        comma_decimal_countries = ["ES", "FR", "DE", "IT", "BR", "SE"]
        standardized = standardize_dataframe(df, options).standardized_df
        uses_comma = df["country"].isin(comma_decimal_countries)
        for i in df.index[uses_comma]:
            val = standardized.loc[i, "price"]
            # Output may be ``CODE NNN.NN`` or bare ``NNN.NN``; either way the
            # source comma has to be gone.
            assert "," not in val, (
                f"row {i} ({df.loc[i, 'country']}): comma persisted in {val!r}"
            )

    def test_brl_real_prefix_recognised(self, df, options):
        """The first Brazilian row's price must carry the ``BRL`` code."""
        standardized = standardize_dataframe(df, options).standardized_df
        is_brazil = standardized["country"] == "BR"
        first_br = standardized[is_brazil].iloc[0]
        assert "BRL" in first_br["price"]
|
||||
|
||||
|
||||
class TestStreamingMatchesInMemory:
    """The chunked file path must agree with the in-memory path."""

    def test_same_output_via_streaming(self, tmp_path, df, options):
        """Stream the fixture in 7-row chunks and compare the typed columns."""
        expected = standardize_dataframe(df, options).standardized_df
        target = tmp_path / "out.csv"
        # chunk_size=7 forces chunk boundaries inside the fixture.
        result = standardize_file(FIXTURE, target, options, chunk_size=7)
        assert result.rows_processed == len(df)
        streamed = pd.read_csv(target, dtype=str, keep_default_na=False)
        # Only the typed columns are compared; the rest pass through.
        for col in options.column_types:
            in_memory_values = expected[col].astype(str).tolist()
            assert streamed[col].tolist() == in_memory_values, (
                f"column {col} differs between in-memory and streaming"
            )
|
||||
@@ -110,16 +110,16 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
|
||||
"FD13": PASSTHROUGH,
|
||||
"FD14": PASSTHROUGH,
|
||||
"FD15": PASSTHROUGH,
|
||||
# excel serial → 2024-01-15 (xfail — not implemented)
|
||||
# excel serial dates (numeric days since 1899-12-30)
|
||||
"FD22": "2024-01-15",
|
||||
"FD23": "2024-01-15",
|
||||
# unix timestamp seconds / millis → 2024-01-15 (xfail)
|
||||
# unix timestamps (seconds, milliseconds)
|
||||
"FD24": "2024-01-15",
|
||||
"FD25": "2024-01-15",
|
||||
# partial precision — corpus preserves it
|
||||
"FD26": "2024-01",
|
||||
"FD27": "2024-01", # xfail — text precision
|
||||
"FD28": "2024-Q1", # xfail — quarter
|
||||
"FD27": "2024-01", # text precision month
|
||||
"FD28": "2024-Q1", # quarter
|
||||
"FD29": "2024",
|
||||
# 2-digit year cutoff (per docs: 1969 wins over 2069)
|
||||
"FD30": "1969-01-15",
|
||||
@@ -135,7 +135,7 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
|
||||
"FD37": "2024-01-15",
|
||||
# garbage → pass through (corpus 0.3 boundary table)
|
||||
# FD38/39/40 → PASSTHROUGH default
|
||||
# locale-specific month names (xfail — not shipped)
|
||||
# locale-specific month names (en/fr/de via month_locales)
|
||||
"FD41": "2024-01-15",
|
||||
"FD42": "2024-01-15",
|
||||
# timezone — corpus 3.3 says fixed-offset only
|
||||
|
||||
301
tests/test_format_streaming.py
Normal file
301
tests/test_format_streaming.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""Tests for the format-standardizer rework: cache, vectorized dispatch,
|
||||
per-row country, audit cap, and streaming entry point."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.format_standardize import (
|
||||
FieldType,
|
||||
StandardizeOptions,
|
||||
StreamingStandardizeResult,
|
||||
_normalize_region,
|
||||
standardize_dataframe,
|
||||
standardize_file,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-row country / region
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPerRowCountry:
    """Phone parsing where the region comes from a per-row country column."""

    def test_phone_uses_per_row_country(self):
        """GB, JP and US rows each get their own calling-code prefix."""
        frame = pd.DataFrame({
            "phone": ["020 7946 0958", "03-3210-7000", "(415) 555-1234"],
            "country": ["GB", "JP", "US"],
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        phones = standardize_dataframe(frame, settings).standardized_df["phone"].tolist()
        for position, wanted in enumerate(("+44", "+81", "+1")):
            assert phones[position].startswith(wanted)

    def test_phone_country_full_name_resolved(self):
        """A full country name in the country column drives the region too."""
        frame = pd.DataFrame({
            "phone": ["020 7946 0958"],
            "country": ["United Kingdom"],
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone.startswith("+44")

    def test_blank_country_falls_back_to_default(self):
        """An empty country cell falls back to ``phone_region``."""
        frame = pd.DataFrame({
            "phone": ["(415) 555-1234"],
            "country": [""],  # blank -> use default region
        })
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
            phone_region="US",
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone == "+14155551234"

    def test_unknown_country_column_raises(self):
        """Naming a country column that is absent is an input error."""
        from src.core.errors import InputValidationError

        frame = pd.DataFrame({"phone": ["x"]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="missing_col",
        )
        with pytest.raises(InputValidationError):
            standardize_dataframe(frame, settings)
|
||||
|
||||
|
||||
class TestNormalizeRegion:
    """Table-style checks for ``_normalize_region``."""

    def test_iso2_passthrough(self):
        """ISO-2 codes come back upper-cased and stripped."""
        cases = {"US": "US", "us": "US", " jp ": "JP"}
        for raw, wanted in cases.items():
            assert _normalize_region(raw) == wanted

    def test_iso3_mapped(self):
        """ISO-3 codes map to their ISO-2 equivalent."""
        cases = {"USA": "US", "GBR": "GB", "JPN": "JP"}
        for raw, wanted in cases.items():
            assert _normalize_region(raw) == wanted

    def test_full_name(self):
        """Country names, including local spellings, resolve to ISO-2."""
        cases = {
            "United States": "US",
            "Japan": "JP",
            "Brazil": "BR",
            "brasil": "BR",
            "España": "ES",
        }
        for raw, wanted in cases.items():
            assert _normalize_region(raw) == wanted

    def test_blank_or_unknown(self):
        """Blank, ``None`` and unrecognised input all yield ``None``."""
        for raw in ("", " ", None, "xyz-no-such-country"):
            assert _normalize_region(raw) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Audit cap
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAuditCap:
    """``audit_max_rows`` limits the audit table, not the change counter."""

    def test_cap_truncates_change_rows(self):
        """50 changed cells with a cap of 5: counter is 50, audit has 5 rows."""
        numbers = [f"(415) 555-12{i:02d}" for i in range(50)]
        frame = pd.DataFrame({"phone": numbers})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=5,
        )
        result = standardize_dataframe(frame, settings)
        # cells_changed counts every change; only the audit table is capped.
        assert result.cells_changed == 50
        assert len(result.changes) == 5

    def test_unbounded_audit(self):
        """``audit_max_rows=None`` records every change."""
        numbers = [f"(415) 555-12{i:02d}" for i in range(20)]
        frame = pd.DataFrame({"phone": numbers})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        result = standardize_dataframe(frame, settings)
        assert len(result.changes) == 20
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cache + vectorized dispatch (correctness)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCacheCorrectness:
    """Results must not depend on whether the value cache is in play."""

    def test_repeated_phone_consistent(self):
        """1000 identical inputs all normalise to the same E.164 string."""
        frame = pd.DataFrame({"phone": ["(415) 555-1234"] * 1000})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        result = standardize_dataframe(frame, settings)
        all_match = (result.standardized_df["phone"] == "+14155551234").all()
        assert all_match
        assert result.cells_changed == 1000

    def test_cache_disabled_still_works(self):
        """``cache_size=0`` turns the cache off without breaking parsing."""
        frame = pd.DataFrame({"phone": ["(415) 555-1234", "020 7946 0958"]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            cache_size=0,  # disabled
        )
        result = standardize_dataframe(frame, settings)
        first_phone = result.standardized_df["phone"].iloc[0]
        assert first_phone == "+14155551234"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Streaming standardize_file
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestStandardizeFile:
    """End-to-end checks for the chunked ``standardize_file`` entry point.

    Every input file is written with an explicit UTF-8 encoding so the tests
    do not depend on the platform's locale encoding; the output is read back
    with pandas, whose CSV reader defaults to UTF-8.
    """

    def test_basic_streaming(self, tmp_path):
        """Four rows in chunks of two: counters and per-row phone prefixes."""
        inp = tmp_path / "in.csv"
        # The currency symbols are non-ASCII, so the encoding must be pinned;
        # ``write_text`` would otherwise use the locale default.
        inp.write_text(
            "phone,country,price\n"
            "(415) 555-1234,US,$1500.00\n"
            "020 7946 0958,GB,£99.99\n"
            "03-3210-7000,JP,¥12000\n"
            "+33 1 42 86 82 00,FR,€850.50\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE, "price": FieldType.CURRENCY},
            phone_country_column="country",
            currency_preserve_code=True,
        )
        res = standardize_file(inp, out, opts, chunk_size=2)
        assert isinstance(res, StreamingStandardizeResult)
        assert res.rows_processed == 4
        assert res.chunks_processed == 2
        assert out.exists()
        out_df = pd.read_csv(out, dtype=str, keep_default_na=False)
        for position, prefix in enumerate(("+1", "+44", "+81", "+33")):
            assert out_df["phone"].iloc[position].startswith(prefix)

    def test_audit_capped_across_chunks(self, tmp_path):
        """60 rows, audit cap 10, chunks of 20: the audit must stop at 10."""
        inp = tmp_path / "in.csv"
        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(60)]
        inp.write_text("".join(rows), encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=10,
        )
        res = standardize_file(inp, out, opts, chunk_size=20)
        # Audit file exists and has exactly 10 data rows + 1 header.
        audit_lines = res.audit_path.read_text(encoding="utf-8").splitlines()
        assert len(audit_lines) - 1 == 10

    def test_audit_row_indices_are_global(self, tmp_path):
        """Audit row numbers are absolute file positions, not chunk-local."""
        inp = tmp_path / "in.csv"
        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(30)]
        inp.write_text("".join(rows), encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            audit_max_rows=None,
        )
        res = standardize_file(inp, out, opts, chunk_size=10)
        audit = pd.read_csv(res.audit_path)
        # Rows should be 0..29, monotonically increasing.
        assert audit["row"].tolist() == list(range(30))

    def test_progress_callback_fires(self, tmp_path):
        """20 rows in chunks of 5: four callbacks, the last one (20, 4)."""
        inp = tmp_path / "in.csv"
        inp.write_text("phone\n" + "(415) 555-1234\n" * 20, encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
        seen: list[tuple[int, int]] = []

        def cb(rows, chunks):
            seen.append((rows, chunks))

        standardize_file(inp, out, opts, chunk_size=5, progress_callback=cb)
        assert len(seen) == 4
        assert seen[-1] == (20, 4)

    def test_progress_callback_exception_does_not_abort(self, tmp_path):
        """A raising progress callback must not stop the run."""
        inp = tmp_path / "in.csv"
        inp.write_text("phone\n(415) 555-1234\n", encoding="utf-8")
        out = tmp_path / "out.csv"
        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})

        def bad_cb(*a, **k):
            raise RuntimeError("boom")

        # Must not raise.
        res = standardize_file(inp, out, opts, chunk_size=1, progress_callback=bad_cb)
        assert res.rows_processed == 1

    def test_missing_input_raises_clean_error(self, tmp_path):
        """A non-existent input path surfaces as ``FileAccessError``."""
        from src.core.errors import FileAccessError

        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
        with pytest.raises(FileAccessError):
            standardize_file(
                tmp_path / "missing.csv",
                tmp_path / "out.csv",
                opts,
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# International coverage smoke
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInternationalCoverage:
    """Smoke coverage across countries and currency notations."""

    @pytest.mark.parametrize("number,country,prefix", [
        ("020 7946 0958", "GB", "+44"),
        ("03-3210-7000", "JP", "+81"),
        ("+49 30 12345678", "DE", "+49"),
        ("01 42 86 82 00", "FR", "+33"),
        ("+39 06 6982", "IT", "+39"),
        ("+34 91 411 1111", "ES", "+34"),
        ("+86 10 1234 5678", "CN", "+86"),
        ("+91 11 2345 6789", "IN", "+91"),
        ("+61 2 9374 4000", "AU", "+61"),
        ("11 3071 0000", "BR", "+55"),
        ("+52 55 5555 0000", "MX", "+52"),
        ("+82 2 2287 0114", "KR", "+82"),
    ])
    def test_phone_via_per_row_region(self, number, country, prefix):
        """A single-row frame gets the prefix matching its country cell."""
        frame = pd.DataFrame({"phone": [number], "country": [country]})
        settings = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )
        out = standardize_dataframe(frame, settings).standardized_df["phone"].iloc[0]
        assert out.startswith(prefix), (
            f"{number!r} ({country}): expected to start with {prefix}, got {out!r}"
        )

    @pytest.mark.parametrize("price,want_code", [
        ("$1,500.00", "USD"),
        ("€850,50", "EUR"),
        ("£99.99", "GBP"),
        ("¥12000", "JPY"),
        ("R$ 250,00", "BRL"),
        ("CHF 1200.00", "CHF"),
    ])
    def test_currency_codes_detected(self, price, want_code):
        """The preserved currency code appears in the standardized price."""
        frame = pd.DataFrame({"price": [price]})
        settings = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_preserve_code=True,
            currency_decimal="auto",  # international mode
        )
        standardized_price = (
            standardize_dataframe(frame, settings).standardized_df["price"].iloc[0]
        )
        assert want_code in standardized_price
|
||||
@@ -8,10 +8,8 @@ These cover edges that existing suites missed:
|
||||
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
|
||||
- ``findings_by_tool`` on an empty list.
|
||||
- BOM that appears mid-cell rather than at file start.
|
||||
|
||||
The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
|
||||
§4.17) is *not yet implemented* and is captured here as a known-gap xfail
|
||||
so it's surfaced rather than silently missing.
|
||||
- The collapse-whitespace heuristic for numeric/date/phone-shaped cells
|
||||
(spec §4.17), now wired in via ``_smart_collapse_whitespace``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
462
tests/test_missing.py
Normal file
462
tests/test_missing.py
Normal file
@@ -0,0 +1,462 @@
|
||||
"""Tests for src/core/missing.py."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.errors import ConfigError, InputValidationError
|
||||
from src.core.missing import (
|
||||
DEFAULT_SENTINELS,
|
||||
MissingOptions,
|
||||
PRESETS,
|
||||
detect_sentinels,
|
||||
handle_missing,
|
||||
is_missing_like,
|
||||
profile_missing,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# is_missing_like
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestIsMissingLike:
    """Scalar-level checks for ``is_missing_like``."""

    def test_none(self):
        """``None`` is missing."""
        assert is_missing_like(None)

    def test_nan(self):
        """Float NaN is missing."""
        assert is_missing_like(np.nan)

    def test_pd_nat(self):
        """``pd.NaT`` is missing."""
        assert is_missing_like(pd.NaT)

    def test_empty_string(self):
        """The empty string is missing."""
        assert is_missing_like("")

    def test_whitespace_only(self):
        """Strings made only of whitespace are missing."""
        for blank in (" ", "\t\n "):
            assert is_missing_like(blank)

    def test_default_sentinels(self):
        """A sample of the default sentinel spellings is recognised."""
        samples = ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)")
        for s in samples:
            assert is_missing_like(s), f"expected {s!r} to be missing-like"

    def test_case_insensitive(self):
        """Sentinel matching ignores letter case."""
        for spelling in ("N/A", "n/A", "NA", "na"):
            assert is_missing_like(spelling)

    def test_real_value_not_missing(self):
        """Ordinary values, including zero-like ones, are not missing."""
        for value in ("hello", "0", 0, 0.0):
            assert not is_missing_like(value)

    def test_zero_is_not_missing(self):
        """Guard against the common bug of treating 0 / False as missing."""
        for value in (0, False):
            assert not is_missing_like(value)

    def test_custom_sentinels_override(self):
        """An explicit ``sentinels`` list decides what counts as a sentinel."""
        assert is_missing_like("xx", sentinels=["xx"])
        assert not is_missing_like("xx", sentinels=["zz"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# detect_sentinels
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectSentinels:
    """Per-label sentinel counts returned by ``detect_sentinels``."""

    def test_counts_by_label(self):
        """Three labelled sentinels plus two whitespace/empty cells."""
        series = pd.Series(["alice", "N/A", "n/a", "NULL", " ", "", "bob"])
        counts = detect_sentinels(series)
        # "N/A" and "n/a" may share one label after case folding, so only the
        # total across the non-whitespace labels is pinned.
        labelled = {k: v for k, v in counts.items() if k != "(whitespace)"}
        assert sum(labelled.values()) == 3
        assert counts["(whitespace)"] == 2

    def test_skips_real_nan(self):
        """A real NaN is not counted as a sentinel; only ``N/A`` is."""
        series = pd.Series(["a", np.nan, "N/A"])
        total = sum(detect_sentinels(series).values())
        assert total == 1

    def test_no_sentinels_returns_empty(self):
        """A clean series produces an empty mapping."""
        series = pd.Series(["alice", "bob", "charlie"])
        assert detect_sentinels(series) == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# profile_missing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestProfileMissing:
    """Frame-level profile produced by ``profile_missing``."""

    def test_basic(self):
        """Sentinels in a text column and NaN in a numeric one both count."""
        frame = pd.DataFrame({
            "name": ["Alice", "Bob", "N/A", "", "Charlie"],
            "age": [30, None, 25, 40, np.nan],
        })
        profile = profile_missing(frame, MissingOptions())
        assert profile.rows_total == 5
        # name: '' + 'N/A' = 2 sentinels; age: 2 NaN
        by_column = {report.column: report for report in profile.columns}
        for column_name in ("name", "age"):
            assert by_column[column_name].missing == 2
        assert profile.cells_missing == 4

    def test_complete_dataframe(self):
        """A frame without gaps reports zero missing and all rows complete."""
        frame = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
        profile = profile_missing(frame, MissingOptions())
        assert profile.cells_missing == 0
        assert profile.rows_complete == 3
        assert profile.rows_with_any_missing == 0

    def test_to_dataframe_columns(self):
        """``to_dataframe`` exposes at least the four core report columns."""
        frame = pd.DataFrame({"x": [1, None]})
        table = profile_missing(frame, MissingOptions()).to_dataframe()
        required = {"column", "missing", "missing_pct", "top_sentinel"}
        assert set(table.columns) >= required

    def test_disabled_sentinels_only_counts_real_nan(self):
        """With sentinel standardization off only the real NaN is counted."""
        frame = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        settings = MissingOptions(standardize_sentinels=False)
        profile = profile_missing(frame, settings)
        by_column = {report.column: report for report in profile.columns}
        # Only the real NaN counts; 'N/A' is left alone.
        assert by_column["x"].missing == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# handle_missing — sentinel standardization
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSentinelStandardization:
    """Sentinel-to-NaN replacement performed by ``handle_missing``."""

    def test_replaces_sentinels_with_nan(self):
        """Sentinels and whitespace-only cells become NaN; real values stay."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "-", " ", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        # 'N/A' + '-' + whitespace-only = 3
        assert result.sentinels_standardized == 3
        handled = result.handled_df
        assert handled["x"].isna().sum() == 3
        assert handled.iloc[0]["x"] == "alice"
        assert handled.iloc[4]["x"] == "bob"

    def test_audit_records_each_replacement(self):
        """One replaced sentinel yields one ``standardize:`` audit row."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        result = handle_missing(frame, MissingOptions(strategy="none"))
        assert len(result.changes) == 1
        first_action = result.changes.iloc[0]["action"]
        assert first_action.startswith("standardize:")

    def test_disabled_keeps_sentinels(self):
        """With standardization off the sentinel text is left in place."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        settings = MissingOptions(standardize_sentinels=False, strategy="none")
        result = handle_missing(frame, settings)
        assert result.sentinels_standardized == 0
        assert result.handled_df.iloc[1]["x"] == "N/A"

    def test_custom_sentinels_extend_default(self):
        """A sentinel appended to the default list is standardized too."""
        frame = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
        extended = [*DEFAULT_SENTINELS, "MISSING_DATA"]
        settings = MissingOptions(sentinels=extended, strategy="none")
        result = handle_missing(frame, settings)
        assert result.sentinels_standardized == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# handle_missing — fill strategies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFillStrategies:
    """Fill strategies applied by ``handle_missing``."""

    @pytest.fixture
    def numeric_df(self):
        """Float column with two gaps, at positions 2 and 4."""
        return pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0, np.nan]})

    def test_mean(self, numeric_df):
        """Gaps take the mean of the observed values [1, 2, 4]."""
        result = handle_missing(numeric_df, MissingOptions(strategy="mean"))
        filled = result.handled_df["x"].iloc[2]
        expected_mean = 7.0 / 3.0
        assert abs(filled - expected_mean) < 1e-9
        assert result.cells_filled == 2

    def test_median(self, numeric_df):
        """Gaps take the median of [1, 2, 4], i.e. 2.0."""
        result = handle_missing(numeric_df, MissingOptions(strategy="median"))
        assert result.handled_df["x"].iloc[2] == 2.0

    def test_mode(self):
        """Gaps take the most frequent value."""
        frame = pd.DataFrame({"x": ["a", "a", "b", None, None]})
        result = handle_missing(frame, MissingOptions(strategy="mode"))
        column = result.handled_df["x"]
        for position in (3, 4):
            assert column.iloc[position] == "a"
        assert result.cells_filled == 2

    def test_constant_scalar(self, numeric_df):
        """A scalar ``fill_value`` is written into every gap."""
        settings = MissingOptions(strategy="constant", fill_value=99.0)
        result = handle_missing(numeric_df, settings)
        column = result.handled_df["x"]
        for position in (2, 4):
            assert column.iloc[position] == 99.0

    def test_constant_per_column(self):
        """``column_fill_values`` supplies a different constant per column."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
        settings = MissingOptions(
            strategy="constant",
            column_fill_values={"a": 0, "b": "?"},
        )
        handled = handle_missing(frame, settings).handled_df
        assert handled["a"].iloc[1] == 0
        assert handled["b"].iloc[1] == "?"

    def test_ffill(self):
        """Forward fill repeats the last observed value."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="ffill"))
        assert result.handled_df["x"].tolist() == [1.0, 1.0, 1.0, 4.0]

    def test_bfill(self):
        """Backward fill pulls the next observed value back."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="bfill"))
        assert result.handled_df["x"].tolist() == [1.0, 4.0, 4.0, 4.0]

    def test_interpolate(self):
        """Interpolation fills the gap between 1.0 and 4.0 with 2.0 and 3.0."""
        frame = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
        result = handle_missing(frame, MissingOptions(strategy="interpolate"))
        assert result.handled_df["x"].tolist() == [1.0, 2.0, 3.0, 4.0]

    def test_numeric_strategy_falls_back_for_categorical(self):
        """A numeric strategy on a text column uses ``categorical_strategy``."""
        frame = pd.DataFrame({"x": ["a", "a", None, "b"]})
        settings = MissingOptions(strategy="median", categorical_strategy="mode")
        result = handle_missing(frame, settings)
        assert result.strategy_per_column["x"] == "mode"
        assert result.handled_df["x"].iloc[2] == "a"

    def test_per_column_strategy_overrides_global(self):
        """``column_strategies`` wins over the global strategy for that column."""
        frame = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
        settings = MissingOptions(
            strategy="median",
            column_strategies={"b": "constant"},
            fill_value="??",
        )
        handled = handle_missing(frame, settings).handled_df
        assert handled["a"].iloc[1] == 1.0  # median of [1.0]
        assert handled["b"].iloc[1] == "??"

    def test_all_nan_column_safely_skipped(self):
        """A column with no observed values is left as NaN, nothing filled."""
        frame = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
        result = handle_missing(frame, MissingOptions(strategy="mean"))
        assert result.cells_filled == 0
        assert result.handled_df["x"].isna().all()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# handle_missing — drops
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDropStrategies:
    """Row/column dropping; thresholds are compared strictly-greater."""

    def test_drop_row_any_missing(self):
        """Threshold 0.0 drops every row that has any missing cell."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan, 4],
            "b": ["x", None, "z", "w"],
        })
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
        result = handle_missing(frame, settings)
        # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean.
        assert result.rows_dropped == 2
        assert len(result.handled_df) == 2

    def test_drop_row_default_threshold_never_drops(self):
        """The default threshold of 1.0 can never be exceeded."""
        frame = pd.DataFrame({
            "a": [1, 2, np.nan],
            "b": ["x", "y", None],
        })
        settings = MissingOptions(strategy="drop_row")  # threshold defaults to 1.0
        result = handle_missing(frame, settings)
        assert result.rows_dropped == 0

    def test_drop_row_partial_threshold(self):
        """Threshold 0.5 drops only rows with more than half their cells missing."""
        frame = pd.DataFrame({
            "a": [1, np.nan, np.nan, np.nan],
            "b": [10, 20, np.nan, np.nan],
            "c": [100, 200, np.nan, 400],
        })
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
        result = handle_missing(frame, settings)
        # row 0: 0/3, row 1: 1/3 (0.33) -> keep
        # row 2: 3/3 (1.0) -> drop, row 3: 2/3 (0.67) -> drop
        assert result.rows_dropped == 2

    def test_drop_col_threshold(self):
        """Threshold 0.5 drops the 75%-missing column and keeps the full one."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4],
            "drop_me": [np.nan, np.nan, np.nan, 1],  # 75% missing
        })
        settings = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
        dropped = handle_missing(frame, settings).columns_dropped
        assert "drop_me" in dropped
        assert "keep" not in dropped

    def test_drop_both(self):
        """Columns are dropped first, then rows are judged on what remains."""
        frame = pd.DataFrame({
            "keep": [1, 2, 3, 4, 5],
            "drop_col": [np.nan] * 5,
            "x": [1, np.nan, 3, np.nan, 5],
        })
        settings = MissingOptions(
            strategy="drop_both",
            col_drop_threshold=0.99,  # >99% missing -> drop column
            row_drop_threshold=0.0,   # any missing in remaining cols -> drop row
        )
        result = handle_missing(frame, settings)
        # drop_col is 100% missing -> dropped
        assert "drop_col" in result.columns_dropped
        # Remaining scope (keep + x): rows 1 and 3 have a missing x -> drop.
        assert result.rows_dropped == 2

    def test_drop_audit_records_dropped_rows(self):
        """Each dropped row leaves one ``drop_row`` entry in the audit."""
        frame = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
        # Drop the fully-missing row (frac > 0.99).
        settings = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
        changes = handle_missing(frame, settings).changes
        is_row_drop = changes["action"] == "drop_row"
        assert len(changes[is_row_drop]) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scope: columns / skip_columns
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestScope:
    """``columns`` / ``skip_columns`` limit which columns are handled."""

    def test_columns_filter(self):
        """Only the columns listed in ``columns`` are filled."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
        result = handle_missing(frame, options)
        assert result.handled_df["a"].iloc[0] == 99
        # "b" was not requested, so its gap must still be there.
        assert pd.isna(result.handled_df["b"].iloc[0])

    def test_skip_columns(self):
        """Columns listed in ``skip_columns`` are left alone."""
        frame = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
        options = MissingOptions(
            skip_columns=["b"], strategy="constant", fill_value=99,
        )
        result = handle_missing(frame, options)
        assert result.handled_df["a"].iloc[0] == 99
        assert pd.isna(result.handled_df["b"].iloc[0])

    def test_unknown_column_raises(self):
        """Naming a column that is not in the frame is rejected."""
        frame = pd.DataFrame({"a": [1]})
        options = MissingOptions(columns=["does_not_exist"])
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Presets / config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPresets:
    """Named presets and JSON round-tripping of ``MissingOptions``."""

    def test_detect_only_does_not_fill(self):
        """detect-only standardizes the sentinel but fills and drops nothing."""
        frame = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        assert result.sentinels_standardized == 1
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

    def test_safe_fill_fills(self):
        """safe-fill fills the one numeric gap and the one text gap."""
        frame = pd.DataFrame({
            "age": [30, np.nan, 25, 40],
            "name": ["a", "a", None, "b"],
        })
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert result.cells_filled == 2

    def test_drop_incomplete(self):
        """drop-incomplete removes the single row that has a gap."""
        frame = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
        result = handle_missing(
            frame, MissingOptions.from_preset("drop-incomplete"),
        )
        assert result.rows_dropped == 1

    def test_unknown_preset_raises(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MissingOptions.from_preset("does-not-exist")

    def test_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` load back with the same settings."""
        original = MissingOptions.from_preset("safe-fill")
        original.column_strategies = {"age": "median"}
        config_path = tmp_path / "cfg.json"
        original.to_file(config_path)
        restored = MissingOptions.from_file(config_path)
        assert restored.strategy == original.strategy
        assert restored.column_strategies == original.column_strategies
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestValidate:
    """Option and input validation errors."""

    def test_invalid_strategy(self):
        """``validate`` rejects an unknown strategy name."""
        options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            options.validate()

    def test_threshold_out_of_range(self):
        """``validate`` rejects a row threshold of 1.5 with ``ConfigError``."""
        options = MissingOptions(row_drop_threshold=1.5)
        with pytest.raises(ConfigError):
            options.validate()

    def test_handle_missing_validates(self):
        """``handle_missing`` surfaces the same error for a bad strategy."""
        frame = pd.DataFrame({"x": [1]})
        options = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            handle_missing(frame, options)

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        with pytest.raises(InputValidationError):
            handle_missing([1, 2, 3])  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end realistic case
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEndToEnd:
    """End-to-end scenarios that run through the public ``handle_missing``."""

    def test_messy_customer_export(self):
        """Sentinel clean-up, then median / constant fills, on one frame."""
        df = pd.DataFrame({
            "customer_id": [1, 2, 3, 4, 5, 6],
            "name": ["Alice", "Bob", "N/A", " ", "Charlie", None],
            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
            "age": [30, np.nan, 25, 40, np.nan, 50],
        })
        opts = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="constant",
            fill_value="UNKNOWN",
        )
        res = handle_missing(df, opts)

        # Sentinel replacements: 'N/A' and the whitespace-only cell in name
        # (2), '-' and 'NULL' in email (2). The None in name is already a
        # real missing value, so it is not counted as a sentinel. Total = 4.
        assert res.sentinels_standardized == 4
        # Fills after standardization:
        #   name  -> 3 cells (N/A, whitespace, None), constant fill
        #   email -> 2 cells, constant fill
        #   age   -> 2 cells, median of [30, 25, 40, 50] = 35.0
        assert res.cells_filled == 7
        assert res.handled_df["name"].isna().sum() == 0
        assert res.handled_df["email"].isna().sum() == 0
        assert res.handled_df["age"].isna().sum() == 0
        assert (res.handled_df["name"] == "UNKNOWN").sum() == 3
        assert (res.handled_df["age"] == 35.0).sum() == 2  # median of [30, 25, 40, 50]

    def test_input_not_mutated(self):
        """The caller's DataFrame is identical before and after the call."""
        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
        df_copy = df.copy()
        handle_missing(df, MissingOptions.from_preset("safe-fill"))
        pd.testing.assert_frame_equal(df, df_copy)
|
||||
463
tests/test_missing_corpus.py
Normal file
463
tests/test_missing_corpus.py
Normal file
@@ -0,0 +1,463 @@
|
||||
"""Acceptance corpus for the Missing Value Handler.
|
||||
|
||||
Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
|
||||
asserts the documented behaviour. The fixtures are split into:
|
||||
|
||||
* ``uc##`` — three target-client use cases (Shopify operator,
|
||||
marketing analyst, consultant intake).
|
||||
* ``ec##`` — edge cases the engine must handle without surprise:
|
||||
all-NaN columns, zeros that aren't missing, Excel errors, unicode
|
||||
whitespace, mixed dtypes, padding, single row/column, every default
|
||||
sentinel, per-column constants, drop thresholds, leading-NaN ffill,
|
||||
numeric-strategy fallback for non-numeric columns, headers-only,
|
||||
idempotency.
|
||||
|
||||
Each test runs through the public API (``handle_missing``) so any
|
||||
regression in the engine surfaces here. Fixture files double as living
|
||||
documentation for what the tool is supposed to do.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.missing import (
|
||||
MissingOptions,
|
||||
handle_missing,
|
||||
is_missing_like,
|
||||
profile_missing,
|
||||
)
|
||||
|
||||
# Fixture locations, resolved from this file's own path: the corpus lives in
# "test-cases/missing-corpus" under the directory two levels above this file.
CORPUS = Path(__file__).resolve().parents[1].joinpath("test-cases", "missing-corpus")
TEST_DATA = CORPUS.joinpath("test_data")
|
||||
|
||||
|
||||
def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
    """Load the corpus CSV called *name* from ``TEST_DATA``.

    With the default ``dtype_str=False`` pandas infers dtypes, which mirrors
    a realistic intake where numeric columns stay numeric. With
    ``dtype_str=True`` every column is read as ``str`` and pandas' default
    NA parsing is switched off, so sentinel text stays visible instead of
    being coerced to NaN/float at load time.
    """
    csv_path = TEST_DATA / name
    if not dtype_str:
        return pd.read_csv(csv_path)
    return pd.read_csv(csv_path, dtype=str, keep_default_na=False)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Use case 1 — Shopify operator: detect-only
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC01ShopifyExport:
    """SMB operator standardizes disguised nulls before reimporting."""

    def test_detect_only_replaces_sentinels(self):
        """detect-only turns the fixture's known sentinels into NaN."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        # Sentinels are standardized; nothing is filled or dropped.
        assert result.sentinels_standardized > 0
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

        # (row position, column) pairs that hold a sentinel in the fixture;
        # the trailing comment on each entry shows the raw cell text.
        sentinel_cells = [
            (1, "phone"),            # 'N/A'
            (2, "city"),             # '-'
            (3, "total_orders"),     # 'NULL'
            (5, "phone"),            # ' '
            (5, "last_order_date"),  # '(blank)'
            (6, "last_order_date"),  # '#N/A'
            (7, "phone"),            # 'n/a'
            (8, "city"),             # '?'
            (9, "total_orders"),     # '(none)'
        ]
        for row, col in sentinel_cells:
            cell = result.handled_df.iloc[row][col]
            assert pd.isna(cell), (
                f"Expected NaN at row {row} col {col}, got "
                f"{cell!r}"
            )

    def test_real_values_preserved(self):
        """Genuine values in the first fixture row come through unchanged."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        first_row = result.handled_df.iloc[0]
        assert first_row["first_name"] == "Alice"
        assert first_row["email"] == "alice@shop.com"
        assert first_row["lifetime_value"] == "1240.50"

    def test_audit_log_complete(self):
        """The change log has one ``standardize:`` entry per replacement."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))
        assert len(result.changes) == result.sentinels_standardized
        # Every action must carry the prefix; an empty log also fails here.
        prefix_flags = {
            action.startswith("standardize:")
            for action in result.changes["action"]
        }
        assert prefix_flags == {True}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Use case 2 — Marketing analyst: safe-fill
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC02MarketingAudience:
    """Marketer fills numeric columns with median, categorical with mode."""

    def test_safe_fill_clears_all_missing(self):
        """After safe-fill the post-run profile reports no missing cells."""
        frame = _read("uc02_marketing_audience.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert result.profile_after.cells_missing == 0
        assert result.cells_filled > 0

    def test_numeric_uses_median_categorical_uses_mode(self):
        """The reported per-column strategy follows the column's dtype."""
        frame = _read("uc02_marketing_audience.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        chosen = result.strategy_per_column
        # Numeric column -> median.
        assert chosen["age"] == "median"
        # Object columns -> mode fallback.
        assert chosen["segment"] == "mode"
        assert chosen["region"] == "mode"

    def test_per_column_override(self):
        """A per-column strategy and fill value override the preset."""
        frame = _read("uc02_marketing_audience.csv")
        options = MissingOptions.from_preset("safe-fill")
        options.column_strategies = {"source": "constant"}
        options.column_fill_values = {"source": "unknown"}
        result = handle_missing(frame, options)
        # The former sentinel cells in 'source' now read "unknown".
        filled_as_unknown = (result.handled_df["source"] == "unknown").sum()
        assert filled_as_unknown >= 3

    def test_consent_real_false_not_dropped(self):
        """Mode fill may add "true" values to 'consent' but never remove one."""
        frame = _read("uc02_marketing_audience.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        trues_before = (frame["consent"] == "true").sum()
        trues_after = (result.handled_df["consent"] == "true").sum()
        # Filled cells can become "true"; existing ones must not disappear.
        assert trues_after >= trues_before
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Use case 3 — Consultant intake: threshold drops + fill
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestUC03ConsultantIntake:
    """Drop sparse columns and rows, then fill the survivors."""

    def test_drop_col_removes_legacy_fields(self):
        """Both fully-missing columns are reported as dropped."""
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        # internal_id_legacy and beta_field are 100% missing in the fixture.
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_col",
            col_drop_threshold=0.99,
        )
        dropped = handle_missing(frame, options).columns_dropped
        assert "internal_id_legacy" in dropped
        assert "beta_field" in dropped

    def test_drop_row_removes_mostly_empty_respondents(self):
        """drop_both removes sparse respondents and keeps the complete ones."""
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_both",
            col_drop_threshold=0.99,  # the legacy / beta columns go first
            row_drop_threshold=0.5,   # then rows with >50% missing
        )
        result = handle_missing(frame, options)
        # R-002, R-005, R-007, R-010 are mostly-empty respondents.
        assert result.rows_dropped >= 4
        # Respondents with real answers must still be present.
        remaining_ids = set(result.handled_df["respondent_id"].tolist())
        expected_survivors = ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012")
        for respondent in expected_survivors:
            assert respondent in remaining_ids
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEC01AllNanColumn:
    """Edge case: a column with no values at all."""

    def test_fill_skips_all_nan_column(self):
        """Mean fill leaves an all-NaN column empty and counts no fills."""
        frame = _read("ec01_all_nan_column.csv")
        result = handle_missing(frame, MissingOptions(strategy="mean"))
        # The mean of an all-NaN column is NaN; no value may be invented.
        assert result.handled_df["deprecated_field"].isna().all()
        assert result.cells_filled == 0

    def test_drop_col_catches_all_nan(self):
        """drop_col at 0.99 removes the empty column but keeps 'name'."""
        frame = _read("ec01_all_nan_column.csv")
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.99)
        result = handle_missing(frame, options)
        assert "deprecated_field" in result.columns_dropped
        assert "name" not in result.columns_dropped
|
||||
|
||||
|
||||
class TestEC02NoMissing:
    """Edge case: a file with nothing to clean."""

    def test_clean_file_is_noop(self):
        """safe-fill on clean data changes no counters and no cells."""
        frame = _read("ec02_no_missing.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert result.sentinels_standardized == 0
        assert result.cells_filled == 0
        assert result.rows_dropped == 0
        pd.testing.assert_frame_equal(result.handled_df, frame)
|
||||
|
||||
|
||||
class TestEC03ZeroIsNotMissing:
    """Edge case: zeros and falsy values are real data, not gaps."""

    def test_zero_preserved(self):
        """The number of zeros per column is the same before and after."""
        frame = _read("ec03_zero_is_not_missing.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        for column in ("balance", "count"):
            zeros_after = (result.handled_df[column] == 0).sum()
            zeros_before = (frame[column] == 0).sum()
            assert zeros_after == zeros_before
        # Nothing should have been recorded as changed.
        assert result.cells_filled == 0
        assert result.sentinels_standardized == 0

    def test_is_missing_like_zero_predicate(self):
        """``is_missing_like`` is false for zeros, False and zero-like text."""
        for value in (0, 0.0, False, "0", "0.00"):
            assert not is_missing_like(value)
|
||||
|
||||
|
||||
class TestEC04ExcelErrors:
    """Edge case: Excel error strings used as disguised nulls."""

    def test_excel_error_sentinels_recognized(self):
        """All six Excel error cells in the fixture are standardized."""
        frame = _read("ec04_excel_errors.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))
        # Fixture contents: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL!
        assert result.sentinels_standardized == 6
|
||||
|
||||
|
||||
class TestEC05UnicodeWhitespace:
    """Edge case: cells that contain only non-ASCII or control whitespace."""

    def test_nbsp_and_ideographic_space_count_as_missing(self):
        """Whitespace-only notes become NaN; real text is untouched."""
        frame = _read("ec05_unicode_whitespace.csv", dtype_str=True)
        notes = handle_missing(frame, MissingOptions(strategy="none")).handled_df["note"]
        # Rows 1, 2 and 4 hold NBSP / tab / ideographic space respectively.
        assert notes.isna().sum() == 3
        assert notes.iloc[0] == "hello"
        assert notes.iloc[3] == "real"
|
||||
|
||||
|
||||
class TestEC06MixedDtypes:
    """Edge case: a column mixing numbers with text."""

    def test_mixed_column_falls_back_to_mode(self):
        """Median applies to the float column; the mixed one uses mode."""
        # Native dtype inference keeps 'real_num' numeric.
        frame = _read("ec06_mixed_dtypes.csv")
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="mode",
        )
        chosen = handle_missing(frame, options).strategy_per_column
        # mixed_col holds 'N/A' / 'hello' next to numbers -> object dtype,
        # so the median request falls back to mode.
        assert chosen["mixed_col"] == "mode"
        # real_num is float dtype, so median runs as requested.
        assert chosen["real_num"] == "median"
|
||||
|
||||
|
||||
class TestEC07RealDataWithPadding:
    """Edge case: real values surrounded by padding whitespace."""

    def test_padded_real_data_not_treated_as_missing(self):
        """Padded values are kept exactly as read, padding included."""
        frame = _read("ec07_real_data_with_padding.csv", dtype_str=True)
        handled = handle_missing(frame, MissingOptions(strategy="none")).handled_df
        # Only row 1 (whitespace-only name) and row 2 (blank city) should
        # turn into NaN; the padded real values below must remain.
        assert handled.iloc[0]["name"] == " Alice "
        assert handled.iloc[2]["name"] == " Bob "
        assert handled.iloc[3]["city"] == " SF"
|
||||
|
||||
|
||||
class TestEC08SingleRow:
    """Edge case: a file with exactly one data row."""

    def test_single_row_handles_cleanly(self):
        """Both detect-only and safe-fill behave sensibly on one row."""
        frame = _read("ec08_single_row.csv", dtype_str=True)
        # Detection pass only: the row holds 'N/A' and '' as sentinels.
        detected = handle_missing(frame, MissingOptions(strategy="none"))
        assert detected.sentinels_standardized == 2
        # safe-fill on a one-row file: the real name must stay as it is.
        filled = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert filled.handled_df.iloc[0]["name"] == "Alice"
|
||||
|
||||
|
||||
class TestEC09SingleColumn:
    """Edge case: a file with exactly one column."""

    def test_single_column_works(self):
        """The three sentinel cells are counted and end up as NaN."""
        frame = _read("ec09_single_column.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))
        # Sentinels in the fixture: 'N/A', a whitespace-only cell, and '-'.
        expected_sentinels = 3
        assert result.sentinels_standardized == expected_sentinels
        assert result.handled_df["value"].isna().sum() == expected_sentinels
|
||||
|
||||
|
||||
class TestEC10AllSentinelVariants:
    """Edge case: one row per default sentinel spelling."""

    def test_every_default_sentinel_recognized(self):
        """All 20 sentinel rows are standardized; the real row survives."""
        frame = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))
        # The fixture holds 20 sentinels plus a single genuine value.
        assert result.sentinels_standardized == 20
        surviving = (result.handled_df["sentinel_value"] == "real_value").sum()
        assert surviving == 1
|
||||
|
||||
|
||||
class TestEC11ConstantPerColumn:
    """Edge case: a different constant fill value for each column."""

    def test_per_column_fill_values(self):
        """Each column's blanks are filled with that column's own constant."""
        frame = _read("ec11_constant_per_column.csv", dtype_str=True)
        per_column_constants = {
            "country": "USA",
            "salary": "0",
            "department": "Unassigned",
        }
        options = MissingOptions(
            strategy="constant",
            column_fill_values=per_column_constants,
        )
        handled = handle_missing(frame, options).handled_df
        # The fixture has 1 UK row, 2 USA rows and 2 blanks; filling the
        # blanks with "USA" gives 4 USA rows while UK is left as it was.
        assert (handled["country"] == "USA").sum() == 4
        assert (handled["country"] == "UK").sum() == 1
        assert (handled["department"] == "Unassigned").sum() >= 2
|
||||
|
||||
|
||||
class TestEC12DropThresholdBoundary:
    """Edge case: row-drop behaviour exactly at the threshold boundaries.

    All expectations assume a strict ``>`` comparison against the threshold.
    """

    def test_threshold_one_never_drops(self):
        """No fraction can exceed 1.0, so a threshold of 1.0 drops nothing."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        options = MissingOptions(strategy="drop_row", row_drop_threshold=1.0)
        assert handle_missing(frame, options).rows_dropped == 0

    def test_threshold_just_under_one_drops_fully_missing(self):
        """At 0.99 only a row missing every in-scope cell is dropped."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        options = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.99,
            columns=["a", "b", "c", "d"],  # keep the id column out of scope
        )
        result = handle_missing(frame, options)
        # Only row 3 (id=4) has all four scoped cells missing.
        assert result.rows_dropped == 1

    def test_threshold_half_drops_majority_missing(self):
        """At 0.5, rows at exactly 50% missing stay; rows above it go."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        options = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.5,
            columns=["a", "b", "c", "d"],
        )
        result = handle_missing(frame, options)
        # Missing fraction over [a, b, c, d]:
        #   row 0: 0/4 = 0.0   keep
        #   row 1: 2/4 = 0.5   keep (strict >, equality is not enough)
        #   row 2: 3/4 = 0.75  drop
        #   row 3: 4/4 = 1.0   drop
        #   row 4: 2/4 = 0.5   keep
        assert result.rows_dropped == 2

    def test_threshold_zero_drops_any_missing(self):
        """At 0.0 every row with at least one in-scope gap is dropped."""
        frame = _read("ec12_drop_threshold_boundary.csv")
        options = MissingOptions(
            strategy="drop_row",
            row_drop_threshold=0.0,
            columns=["a", "b", "c", "d"],
        )
        result = handle_missing(frame, options)
        # Row 0 is the only body row without a missing cell.
        assert result.rows_dropped == 4
|
||||
|
||||
|
||||
class TestEC13FfillLeadingNan:
    """Edge case: forward fill when the series starts with NaN."""

    def test_leading_nan_run_survives_ffill(self):
        """Leading gaps stay empty; later gaps take the last seen price."""
        frame = _read("ec13_ffill_leading_nan.csv")
        prices = handle_missing(frame, MissingOptions(strategy="ffill")).handled_df["price"]
        # The first two rows have nothing before them to fill from.
        assert pd.isna(prices.iloc[0])
        assert pd.isna(prices.iloc[1])
        # Gaps in the middle are filled forward.
        assert prices.iloc[3] == 100.0
        assert prices.iloc[4] == 100.0
        # The trailing gap takes the last observed value.
        assert prices.iloc[6] == 150.0
|
||||
|
||||
|
||||
class TestEC14InterpolateFallback:
    """Edge case: interpolate requested on non-numeric columns."""

    def test_interpolate_on_non_numeric_falls_back(self):
        """With every column read as text, interpolate falls back to mode."""
        frame = _read("ec14_interpolate_fallback.csv", dtype_str=True)
        options = MissingOptions(
            strategy="interpolate",
            categorical_strategy="mode",
        )
        chosen = handle_missing(frame, options).strategy_per_column
        # dtype_str=True makes both columns object dtype.
        assert chosen["category"] == "mode"
        assert chosen["value"] == "mode"
|
||||
|
||||
|
||||
class TestEC15HeadersOnly:
    """Edge case: a CSV that has a header row and no data rows."""

    def test_empty_body_does_not_crash(self):
        """Every preset runs on the empty body and reports no work done."""
        frame = _read("ec15_headers_only.csv")
        preset_names = ("detect-only", "safe-fill", "drop-incomplete")
        for preset_name in preset_names:
            result = handle_missing(frame, MissingOptions.from_preset(preset_name))
            assert len(result.handled_df) == 0
            assert result.cells_filled == 0
            assert result.rows_dropped == 0
|
||||
|
||||
|
||||
class TestEC16Idempotency:
    """Edge case: running the same preset twice changes nothing further."""

    def test_safe_fill_is_idempotent(self):
        """A second safe-fill pass over its own output is a no-op."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("safe-fill")
        pass_one = handle_missing(frame, options)
        pass_two = handle_missing(pass_one.handled_df, options)
        # Compare on a fresh index so only the cell contents matter.
        pd.testing.assert_frame_equal(
            pass_two.handled_df.reset_index(drop=True),
            pass_one.handled_df.reset_index(drop=True),
        )
        assert pass_two.cells_filled == 0
        assert pass_two.sentinels_standardized == 0

    def test_detect_only_is_idempotent(self):
        """A second detect-only pass finds no sentinels left."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("detect-only")
        pass_one = handle_missing(frame, options)
        pass_two = handle_missing(pass_one.handled_df, options)
        assert pass_two.sentinels_standardized == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Whole-corpus property tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Names of all CSV fixtures directly inside TEST_DATA, sorted so the
# parametrized test IDs come out in a stable order.
ALL_FIXTURES = sorted(csv_file.name for csv_file in TEST_DATA.glob("*.csv"))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_handle_missing_does_not_mutate_input(fixture):
    """Every fixture must leave the input DataFrame untouched."""
    frame = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    has_no_columns = len(frame.columns) == 0
    if frame.empty and has_no_columns:
        pytest.skip(f"{fixture}: completely empty file")
    # Deep copy taken before the call is the reference for "unchanged".
    before = frame.copy(deep=True)
    handle_missing(frame, MissingOptions.from_preset("safe-fill"))
    pd.testing.assert_frame_equal(frame, before)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_profile_runs_on_every_fixture(fixture):
    """``profile_missing`` must succeed on every corpus file."""
    frame = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    profile = profile_missing(frame, MissingOptions())
    # The profile's totals must match the frame's actual dimensions.
    row_count = len(frame)
    assert profile.rows_total == row_count
    assert profile.cells_total == row_count * len(frame.columns)
|
||||
324
tests/test_pipeline.py
Normal file
324
tests/test_pipeline.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""Tests for src/core/pipeline.py."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.errors import ConfigError, InputValidationError
|
||||
from src.core.pipeline import (
|
||||
Pipeline,
|
||||
PipelineResult,
|
||||
SOFT_DEPENDENCIES,
|
||||
Step,
|
||||
StepResult,
|
||||
TOOL_ADAPTERS,
|
||||
TOOL_NAMES,
|
||||
recommended_pipeline,
|
||||
run_pipeline,
|
||||
validate_pipeline,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step / Pipeline construction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestStep:
    """Construction and defaults of a single pipeline ``Step``."""

    def test_unknown_tool_raises(self):
        """A tool name the pipeline does not know raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            Step(tool="bogus_tool")

    def test_default_options_empty_dict(self):
        """A bare step has empty options and is enabled."""
        step = Step(tool="text_clean")
        assert step.options == {}
        assert step.enabled is True

    def test_display_name_falls_back_to_tool(self):
        """``display_name`` is the explicit name if given, else the tool."""
        unnamed = Step(tool="dedup")
        assert unnamed.display_name() == "dedup"
        named = Step(tool="dedup", name="Final dedup")
        assert named.display_name() == "Final dedup"
|
||||
|
||||
|
||||
class TestPipelineSerialization:
    """Dict / file round-trips and malformed-input errors for ``Pipeline``."""

    def test_roundtrip_dict(self):
        """``to_dict`` followed by ``from_dict`` keeps tools and options."""
        original = Pipeline(steps=[
            Step("text_clean", {"trim": True}),
            Step("dedup", {"survivor_rule": "first"}),
        ])
        restored = Pipeline.from_dict(original.to_dict())
        assert len(restored.steps) == 2
        assert restored.steps[0].tool == "text_clean"
        assert restored.steps[1].options["survivor_rule"] == "first"

    def test_roundtrip_file(self, tmp_path):
        """``to_file`` followed by ``from_file`` keeps the step's tool."""
        original = Pipeline(steps=[Step("text_clean")])
        json_path = tmp_path / "p.json"
        original.to_file(json_path)
        restored = Pipeline.from_file(json_path)
        assert restored.steps[0].tool == "text_clean"

    def test_from_dict_missing_steps_key(self):
        """A dict without a ``steps`` key raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            Pipeline.from_dict({})

    def test_from_dict_missing_tool(self):
        """A step entry without a ``tool`` key raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            Pipeline.from_dict({"steps": [{"options": {}}]})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# recommended_pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRecommendedPipeline:
    """Behaviour of the ``recommended_pipeline`` factory."""

    def test_default_order(self):
        """Without arguments the factory yields the four default tools."""
        pipeline = recommended_pipeline()
        tool_order = [step.tool for step in pipeline.steps]
        assert tool_order == [
            "text_clean", "format_standardize", "missing", "dedup",
        ]

    def test_default_passes_validation(self):
        """The default pipeline produces no validation warnings."""
        assert validate_pipeline(recommended_pipeline()) == []

    def test_include_overrides_default(self):
        """``include`` selects exactly the listed tools, in that order."""
        pipeline = recommended_pipeline(include=["text_clean", "missing"])
        tool_order = [step.tool for step in pipeline.steps]
        assert tool_order == ["text_clean", "missing"]

    def test_options_seed_reaches_step(self):
        """Options keyed by tool name end up on the matching step."""
        pipeline = recommended_pipeline(options={"text_clean": {"trim": False}})
        assert pipeline.steps[0].options == {"trim": False}

    def test_unknown_tool_raises(self):
        """An unknown tool in ``include`` raises ``InputValidationError``."""
        with pytest.raises(InputValidationError):
            recommended_pipeline(include=["bogus"])

    def test_can_place_column_map_first_or_last(self):
        """column_map may sit at the start or late in the pipeline."""
        # Placement at the very beginning.
        early = recommended_pipeline(include=[
            "column_map", "text_clean", "format_standardize", "missing", "dedup",
        ])
        # Placement after the cleaning tools (just before dedup).
        late = recommended_pipeline(include=[
            "text_clean", "format_standardize", "missing", "column_map", "dedup",
        ])
        # No soft-dependency rule names column_map, so neither order warns.
        assert validate_pipeline(early) == []
        assert validate_pipeline(late) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# validate_pipeline — soft dependencies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestValidatePipeline:
    """Soft-dependency warnings returned by ``validate_pipeline``."""

    def test_in_order_no_warnings(self):
        """The recommended order validates without warnings."""
        assert validate_pipeline(recommended_pipeline()) == []

    def test_dedup_before_text_clean_warns(self):
        """dedup ahead of text_clean yields one warning naming both tools."""
        pipeline = Pipeline(steps=[Step("dedup"), Step("text_clean")])
        warnings_found = validate_pipeline(pipeline)
        assert len(warnings_found) == 1
        message = warnings_found[0]
        assert "dedup" in message and "text_clean" in message

    def test_format_before_text_clean_warns(self):
        """format_standardize ahead of text_clean is mentioned in a warning."""
        pipeline = Pipeline(steps=[Step("format_standardize"), Step("text_clean")])
        warnings_found = validate_pipeline(pipeline)
        assert any("format_standardize" in message for message in warnings_found)

    def test_disabled_steps_ignored(self):
        """A disabled out-of-order step does not trigger a warning."""
        pipeline = Pipeline(steps=[
            Step("dedup", enabled=False),
            Step("text_clean"),
        ])
        assert validate_pipeline(pipeline) == []

    def test_duplicate_tool_does_not_double_warn(self):
        """Repeating text_clean (two-pass cleaning) produces no warnings."""
        pipeline = Pipeline(steps=[
            Step("text_clean"),
            Step("text_clean"),
        ])
        assert validate_pipeline(pipeline) == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# run_pipeline — execution
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
def messy_df():
    """Return a five-row frame of deliberately untidy contact data.

    The columns are ``name``, ``phone`` and ``country``.  Values include
    padded whitespace, mixed case, an ``"N/A"`` placeholder and empty
    strings; the first and last rows share the same phone and country.
    """
    names = [" Alice ", "BOB", "N/A", "", "charlie "]
    phones = ["(415) 555-1234", "+44 20 7946 0958", "03-3210-7000", "", "(415) 555-1234"]
    countries = ["US", "GB", "JP", "", "US"]
    return pd.DataFrame({
        "name": names,
        "phone": phones,
        "country": countries,
    })
|
||||
|
||||
|
||||
class TestRunPipeline:
    """Execution behaviour of ``run_pipeline`` against the ``messy_df`` fixture."""

    def test_recommended_pipeline_runs_end_to_end(self, messy_df):
        """The recommended pipeline returns a PipelineResult, drops rows, and warns nothing."""
        format_options = {
            "column_types": {"phone": "phone"},
            "phone_country_column": "country",
        }
        pipeline = recommended_pipeline(options={
            "format_standardize": format_options,
            "missing": {"strategy": "none"},
        })
        result = run_pipeline(messy_df, pipeline)
        assert isinstance(result, PipelineResult)
        assert result.initial_rows == 5
        # The closing dedup step is expected to collapse the Alice/charlie
        # pair, which share the same phone number.
        assert result.final_rows < result.initial_rows
        assert result.warnings == []

    def test_initial_df_not_mutated(self, messy_df):
        """The caller's frame is left exactly as it was before the run."""
        before = messy_df.copy(deep=True)
        run_pipeline(messy_df, recommended_pipeline())
        pd.testing.assert_frame_equal(messy_df, before)

    def test_disabled_step_skipped(self, messy_df):
        """A disabled step is reported as skipped; an enabled one is not."""
        steps = [
            Step("text_clean", enabled=False),
            Step("missing", options={"strategy": "none"}),
        ]
        result = run_pipeline(messy_df, Pipeline(steps=steps))
        outcomes = result.step_results
        assert outcomes[0].skipped is True
        assert outcomes[1].skipped is False

    def test_step_results_ordered_and_timed(self, messy_df):
        """Step results come back in pipeline order with non-negative timings."""
        pipeline = recommended_pipeline(options={
            "missing": {"strategy": "none"},
        })
        result = run_pipeline(messy_df, pipeline)
        outcomes = result.step_results
        assert len(outcomes) == 4
        assert all(outcome.elapsed_seconds >= 0 for outcome in outcomes)
        tools_run = [outcome.step.tool for outcome in outcomes]
        assert tools_run == [
            "text_clean", "format_standardize", "missing", "dedup",
        ]

    def test_warnings_returned_but_run_proceeds(self, messy_df):
        """Ordering warnings are surfaced on the result without blocking any step."""
        steps = [
            Step("dedup"),
            Step("text_clean"),
        ]
        result = run_pipeline(messy_df, Pipeline(steps=steps))
        assert result.warnings  # at least one warning was reported
        # Neither step was skipped despite the warning.
        assert not any(outcome.skipped for outcome in result.step_results)

    def test_progress_callback_fires_per_step(self, messy_df):
        """``on_step_complete`` receives one StepResult for each step."""
        completed: list[StepResult] = []
        steps = [
            Step("text_clean"),
            Step("missing", options={"strategy": "none"}),
        ]
        run_pipeline(messy_df, Pipeline(steps=steps), on_step_complete=completed.append)
        assert len(completed) == 2
        assert all(isinstance(item, StepResult) for item in completed)

    def test_progress_callback_exception_does_not_abort(self, messy_df):
        """An exception raised inside the progress callback does not propagate."""
        def exploding_callback(_step_result):
            raise RuntimeError("boom")

        pipeline = Pipeline(steps=[Step("text_clean")])
        # The call below must complete without raising.
        result = run_pipeline(messy_df, pipeline, on_step_complete=exploding_callback)
        assert result.final_rows == 5

    def test_stop_on_error_default(self, messy_df):
        """Without ``stop_on_error`` given, a failing step raises out of the run."""
        # Pointing format_standardize at a column that does not exist
        # forces the step to fail.
        failing_step = Step("format_standardize", options={
            "column_types": {"does_not_exist": "phone"},
        })
        pipeline = Pipeline(steps=[failing_step])
        with pytest.raises(InputValidationError):
            run_pipeline(messy_df, pipeline)

    def test_continue_on_error_carries_previous_df(self, messy_df):
        """With ``stop_on_error=False`` a failed step is recorded and later steps run."""
        failing_step = Step("format_standardize", options={
            "column_types": {"does_not_exist": "phone"},
        })
        steps = [
            Step("text_clean"),
            failing_step,
            Step("missing", options={"strategy": "none"}),
        ]
        result = run_pipeline(messy_df, Pipeline(steps=steps), stop_on_error=False)
        outcomes = result.step_results
        # The second step carries the error; the third still executed cleanly.
        assert outcomes[1].error is not None
        assert outcomes[2].error is None
        assert result.final_rows == 5

    def test_non_dataframe_input(self):
        """Passing something other than a DataFrame raises InputValidationError."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            run_pipeline(not_a_frame, recommended_pipeline())  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-tool adapter sanity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAdapters:
    """Sanity checks on the per-tool callables registered in ``TOOL_ADAPTERS``."""

    @pytest.mark.parametrize("tool", TOOL_NAMES)
    def test_adapter_with_default_options_runs(self, tool, messy_df):
        """Every adapter accepts an empty options dict and returns ``(df, summary)``."""
        adapter = TOOL_ADAPTERS[tool]
        out_df, summary = adapter(messy_df, {})
        assert isinstance(out_df, pd.DataFrame)
        assert isinstance(summary, dict)

    def test_format_standardize_adapter_passes_column_types(self, messy_df):
        """``column_types`` given in the options shows up as the processed columns."""
        options = {"column_types": {"phone": "phone"}}
        _, summary = TOOL_ADAPTERS["format_standardize"](messy_df, options)
        assert summary["columns_processed"] == ["phone"]

    def test_dedup_adapter_with_unknown_survivor_rule_raises(self, messy_df):
        """An unrecognised ``survivor_rule`` is rejected with ConfigError."""
        options = {"survivor_rule": "bogus"}
        with pytest.raises(ConfigError):
            TOOL_ADAPTERS["dedup"](messy_df, options)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SOFT_DEPENDENCIES integrity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSoftDependencies:
    """Integrity checks on the ``SOFT_DEPENDENCIES`` table itself."""

    def test_every_pair_uses_known_tools(self):
        """Both ends of every dependency are members of ``TOOL_NAMES``."""
        for earlier, later, _reason in SOFT_DEPENDENCIES:
            assert earlier in TOOL_NAMES
            assert later in TOOL_NAMES

    def test_all_reasons_non_empty(self):
        """Every dependency carries a non-trivial string explanation."""
        for _earlier, _later, reason in SOFT_DEPENDENCIES:
            assert reason and isinstance(reason, str)
            # A reason should read like a sentence: more than 20 characters.
            assert len(reason) > 20

    def test_dependencies_form_a_dag(self):
        """The dependency edges admit a topological ordering (no cycles)."""
        # Kahn-style check: repeatedly release tools that have no remaining
        # unmet predecessors.  If every tool is eventually released, an
        # ordering satisfying every (earlier, later) pair exists.
        from collections import defaultdict, deque

        successors: dict[str, list[str]] = defaultdict(list)
        unmet: dict[str, int] = dict.fromkeys(TOOL_NAMES, 0)
        for earlier, later, _reason in SOFT_DEPENDENCIES:
            successors[earlier].append(later)
            unmet[later] += 1

        # Seed the queue with tools that nothing needs to precede.
        ready = deque(tool for tool in unmet if unmet[tool] == 0)
        order = []
        while ready:
            tool = ready.popleft()
            order.append(tool)
            for follower in successors[tool]:
                unmet[follower] -= 1
                if unmet[follower] == 0:
                    ready.append(follower)

        # Any tool stuck with unmet predecessors never reaches ``order``.
        assert len(order) == len(TOOL_NAMES), (
            f"SOFT_DEPENDENCIES contain a cycle; topo order={order}"
        )
|
||||
Reference in New Issue
Block a user