Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
375 lines
14 KiB
Python
375 lines
14 KiB
Python
"""Tests for src/core/column_mapper.py."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.errors import ConfigError, InputValidationError
|
|
from src.core.column_mapper import (
|
|
MapOptions,
|
|
PRESETS,
|
|
TargetField,
|
|
TargetSchema,
|
|
coerce_series,
|
|
infer_mapping,
|
|
map_columns,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# infer_mapping — fuzzy matcher
# ---------------------------------------------------------------------------
|
|
|
|
class TestInferMapping:
    """Exercise ``infer_mapping`` against small, empty-column frames."""

    def test_exact_normalized_match(self):
        """'First Name' / 'Last Name' resolve to first_name / last_name."""
        frame = pd.DataFrame({"First Name": [], "Last Name": []})
        targets = [TargetField(name="first_name"), TargetField(name="last_name")]
        inferred = infer_mapping(frame, TargetSchema(fields=targets))
        expected = {"First Name": "first_name", "Last Name": "last_name"}
        assert inferred == expected

    def test_alias_match(self):
        """A source header listed in a field's aliases maps to that field."""
        frame = pd.DataFrame({"EmailAddr": []})
        email_field = TargetField(name="email", aliases=["EmailAddr", "email_address"])
        inferred = infer_mapping(frame, TargetSchema(fields=[email_field]))
        assert inferred == {"EmailAddr": "email"}

    def test_below_threshold_excluded(self):
        """With threshold=0.6, 'xyz' is not paired with the 'email' target."""
        frame = pd.DataFrame({"xyz": []})
        target = TargetSchema(fields=[TargetField(name="email")])
        inferred = infer_mapping(frame, target, threshold=0.6)
        assert inferred == {}

    def test_target_matched_at_most_once(self):
        """Two candidate sources for one target yield a single pairing."""
        frame = pd.DataFrame({"first_name": [], "fname": []})
        target = TargetSchema(fields=[TargetField(name="first_name")])
        inferred = infer_mapping(frame, target)
        # The exact-name column takes the target; "fname" is left out.
        assert inferred == {"first_name": "first_name"}

    def test_threshold_zero_matches_anything(self):
        """At threshold=0.0 the lone target 'z' is paired with exactly one column."""
        frame = pd.DataFrame({"a": [], "b": []})
        target = TargetSchema(fields=[TargetField(name="z")])
        inferred = infer_mapping(frame, target, threshold=0.0)
        assert len(inferred) == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# coerce_series
# ---------------------------------------------------------------------------
|
|
|
|
class TestCoerceSeries:
    """Exercise ``coerce_series`` for each dtype name used by these tests.

    Every call returns a ``(converted, failures)`` pair; the tests below
    check the converted values and the failure count.
    """

    def test_integer_clean(self):
        """Digit strings convert to 1, 2, 3 with no failures."""
        raw = pd.Series(["1", "2", "3"])
        converted, failures = coerce_series(raw, "integer")
        assert list(converted) == [1, 2, 3]
        assert failures == 0

    def test_integer_with_failure(self):
        """A non-numeric entry counts as one failure and becomes NA."""
        raw = pd.Series(["1", "bad", "3"])
        converted, failures = coerce_series(raw, "integer")
        assert failures == 1
        assert pd.isna(converted.iloc[1])

    def test_float_with_thousands_sep(self):
        """Plain decimal strings convert cleanly to floats."""
        # Only plain floats here; thousands-separator handling belongs to
        # the format standardizer.
        raw = pd.Series(["1.5", "2.0", "3.25"])
        converted, failures = coerce_series(raw, "float")
        assert failures == 0
        assert converted.iloc[2] == 3.25

    def test_boolean_truthy_falsy(self):
        """true/false, Yes/no and 1/0 spellings convert without failures."""
        raw = pd.Series(["true", "false", "Yes", "no", "1", "0"])
        converted, failures = coerce_series(raw, "boolean")
        assert failures == 0
        assert list(converted) == [True, False, True, False, True, False]

    def test_boolean_unknown_value_fails(self):
        """An unrecognised token ('maybe') is one failure and becomes NA."""
        raw = pd.Series(["true", "maybe"])
        converted, failures = coerce_series(raw, "boolean")
        assert failures == 1
        assert pd.isna(converted.iloc[1])

    def test_date_iso_format(self):
        """ISO ``YYYY-MM-DD`` strings parse; the first value has year 2025."""
        raw = pd.Series(["2025-01-15", "2025-02-20"])
        converted, failures = coerce_series(raw, "date")
        assert failures == 0
        assert converted.iloc[0].year == 2025

    def test_date_failure(self):
        """An unparseable date is one failure and becomes NA."""
        raw = pd.Series(["2025-01-15", "garbage"])
        converted, failures = coerce_series(raw, "date")
        assert failures == 1
        assert pd.isna(converted.iloc[1])

    def test_string_passthrough(self):
        """Integers coerced to "string" come back with the ``string`` dtype."""
        raw = pd.Series([1, 2, 3])
        converted, failures = coerce_series(raw, "string")
        assert failures == 0
        assert converted.dtype.name == "string"

    def test_auto_returns_unchanged(self):
        """dtype "auto" hands back the very same Series object."""
        raw = pd.Series([1, 2])
        converted, failures = coerce_series(raw, "auto")
        assert failures == 0
        assert converted is raw

    def test_unknown_dtype_raises(self):
        """An unsupported dtype name raises ``InputValidationError``."""
        single = pd.Series([1])
        with pytest.raises(InputValidationError):
            coerce_series(single, "bogus")  # type: ignore[arg-type]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# map_columns — explicit mapping
# ---------------------------------------------------------------------------
|
|
|
|
class TestMapColumnsExplicit:
    """Exercise ``map_columns`` when the caller supplies the mapping directly."""

    def test_simple_rename(self):
        """Both columns are renamed and the rename counter reports 2."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha", "b": "beta"})
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["alpha", "beta"]
        assert result.columns_renamed == 2

    def test_unknown_source_raises(self):
        """Mapping a source column absent from the frame is rejected."""
        frame = pd.DataFrame({"a": [1]})
        options = MapOptions(mapping={"missing": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_duplicate_target_raises(self):
        """Two sources pointing at the same target name are rejected."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_unmapped_keep(self):
        """unmapped="keep" retains column 'b' and reports it as kept."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="keep")
        result = map_columns(frame, options)
        assert "b" in result.mapped_df.columns
        assert result.unmapped_kept == ["b"]

    def test_unmapped_drop(self):
        """unmapped="drop" removes column 'b' and reports it as dropped."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="drop")
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["alpha"]
        assert result.columns_dropped == ["b"]

    def test_unmapped_error(self):
        """unmapped="error" raises when a column has no mapping."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        options = MapOptions(mapping={"a": "alpha"}, unmapped="error")
        with pytest.raises(InputValidationError):
            map_columns(frame, options)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# map_columns — schema + auto-inference
# ---------------------------------------------------------------------------
|
|
|
|
class TestMapColumnsWithSchema:
    """Exercise ``map_columns`` when a ``TargetSchema`` drives the mapping."""

    def test_auto_infer_renames(self):
        """Inferred pairs are applied as renames and reported on the result."""
        frame = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]})
        targets = [TargetField(name="first_name"), TargetField(name="last_name")]
        options = MapOptions(schema=TargetSchema(fields=targets), auto_infer=True)
        result = map_columns(frame, options)
        output_columns = result.mapped_df.columns
        assert "first_name" in output_columns
        assert "last_name" in output_columns
        assert result.inferred_pairs == {"First Name": "first_name", "Last Name": "last_name"}

    def test_explicit_overrides_inferred(self):
        """An explicit fname -> first_name entry keeps 'name' out of the mapping."""
        frame = pd.DataFrame({"name": ["A"], "fname": ["B"]})
        target = TargetSchema(fields=[TargetField(name="first_name")])
        options = MapOptions(
            schema=target,
            mapping={"fname": "first_name"},
            auto_infer=True,
        )
        result = map_columns(frame, options)
        assert result.mapping["fname"] == "first_name"
        assert "name" not in result.mapping

    def test_required_missing_raises(self):
        """With enforce_required=True, an absent required target raises."""
        frame = pd.DataFrame({"first_name": ["A"]})
        required_fields = [
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ]
        options = MapOptions(
            schema=TargetSchema(fields=required_fields),
            auto_infer=False,
            enforce_required=True,
        )
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_required_missing_with_default_added(self):
        """An absent optional field with a default is added, filled with it."""
        frame = pd.DataFrame({"first_name": ["A"]})
        fields = [
            TargetField(name="first_name", required=True),
            TargetField(name="source", required=False, default="import"),
        ]
        options = MapOptions(schema=TargetSchema(fields=fields), auto_infer=False)
        result = map_columns(frame, options)
        assert "source" in result.mapped_df.columns
        assert result.mapped_df.iloc[0]["source"] == "import"
        assert result.columns_added == ["source"]

    def test_required_missing_disabled(self):
        """With enforce_required=False, the gap is reported instead of raised."""
        frame = pd.DataFrame({"first_name": ["A"]})
        required_fields = [
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ]
        options = MapOptions(
            schema=TargetSchema(fields=required_fields),
            auto_infer=False,
            enforce_required=False,
        )
        result = map_columns(frame, options)
        assert "email" in result.missing_required_targets

    def test_reorder_to_schema(self):
        """reorder_to_schema=True emits columns in schema order (a, m, z)."""
        frame = pd.DataFrame({"z": [1], "a": [2], "m": [3]})
        ordered_fields = [TargetField(name="a"), TargetField(name="m"), TargetField(name="z")]
        options = MapOptions(
            schema=TargetSchema(fields=ordered_fields),
            auto_infer=True,
            reorder_to_schema=True,
        )
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["a", "m", "z"]

    def test_coerce_types(self):
        """coerce_types=True converts values and records per-column failures."""
        frame = pd.DataFrame({"age": ["30", "bad", "40"], "active": ["true", "no", "yes"]})
        typed_fields = [
            TargetField(name="age", dtype="integer"),
            TargetField(name="active", dtype="boolean"),
        ]
        options = MapOptions(
            schema=TargetSchema(fields=typed_fields),
            auto_infer=True,
            coerce_types=True,
        )
        result = map_columns(frame, options)
        mapped = result.mapped_df
        assert mapped["age"].iloc[0] == 30
        # Truthiness rather than identity, so any truthy scalar passes.
        first_active = mapped["active"].iloc[0]
        assert first_active
        assert result.coercion_failures == {"age": 1}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Presets
# ---------------------------------------------------------------------------
|
|
|
|
class TestPresets:
    """Exercise ``MapOptions.from_preset`` and the behaviour of named presets."""

    def test_strict_schema_drops_and_coerces_and_reorders(self):
        """"strict-schema" keeps only schema columns and reports the dropped extra."""
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]})
        required_fields = [
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ]
        options = MapOptions.from_preset("strict-schema")
        # The preset is built without a schema; attach one afterwards.
        options.schema = TargetSchema(fields=required_fields)
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["first_name", "email"]
        assert result.columns_dropped == ["extra"]

    def test_lenient_keeps_extras(self):
        """"lenient-schema" leaves the unmapped 'extra' column in the output."""
        frame = pd.DataFrame({"First Name": ["A"], "extra": [1]})
        options = MapOptions.from_preset("lenient-schema")
        options.schema = TargetSchema(fields=[TargetField(name="first_name")])
        result = map_columns(frame, options)
        assert "extra" in result.mapped_df.columns

    def test_unknown_preset(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MapOptions.from_preset("does-not-exist")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Schema serialization
# ---------------------------------------------------------------------------
|
|
|
|
class TestSchemaIO:
    """Exercise dict / file round-trips for ``TargetSchema`` and ``MapOptions``."""

    def test_roundtrip_dict(self):
        """to_dict -> from_dict preserves field names, ``required`` and ``default``."""
        original = TargetSchema(fields=[
            TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]),
            TargetField(name="y", default="z"),
        ])
        restored = TargetSchema.from_dict(original.to_dict())
        assert restored.field_names() == ["x", "y"]
        first_field, second_field = restored.fields[0], restored.fields[1]
        assert first_field.required is True
        assert second_field.default == "z"

    def test_from_dict_string_field(self):
        """Bare-string field entries are accepted as shorthand."""
        # Shorthand form: a bare string stands for a field with dtype=auto.
        restored = TargetSchema.from_dict({"fields": ["a", "b"]})
        assert restored.field_names() == ["a", "b"]

    def test_from_dict_unknown_dtype_raises(self):
        """A field entry with an unsupported dtype raises ``ConfigError``."""
        payload = {"fields": [{"name": "x", "dtype": "bogus"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_from_dict_missing_name_raises(self):
        """A field entry without a ``name`` key raises ``ConfigError``."""
        payload = {"fields": [{"dtype": "string"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_options_roundtrip_to_file(self, tmp_path):
        """Options saved with to_file load back with the same checked settings."""
        config_path = tmp_path / "cfg.json"
        saved = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="x", dtype="string")]),
            mapping={"a": "x"},
            unmapped="drop",
            coerce_types=True,
            reorder_to_schema=True,
        )
        saved.to_file(config_path)
        restored = MapOptions.from_file(config_path)
        assert restored.mapping == {"a": "x"}
        assert restored.unmapped == "drop"
        assert restored.coerce_types is True
        assert restored.schema is not None
        assert restored.schema.field_names() == ["x"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
|
|
|
|
class TestValidation:
    """Exercise rejection of bad options and bad input by the mapper."""

    def test_invalid_unmapped_strategy(self):
        """validate() rejects an unknown ``unmapped`` strategy."""
        # Construction happens outside the raises block: only validate()
        # is expected to raise.
        options = MapOptions(unmapped="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            options.validate()

    def test_threshold_out_of_range(self):
        """validate() rejects fuzzy_threshold=1.5 with ``ConfigError``."""
        options = MapOptions(fuzzy_threshold=1.5)
        with pytest.raises(ConfigError):
            options.validate()

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            map_columns(not_a_frame)  # type: ignore[arg-type]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Idempotency
# ---------------------------------------------------------------------------
|
|
|
|
class TestIdempotency:
    """Exercise repeat-application stability and input immutability."""

    def test_double_apply_is_stable(self):
        """Mapping an already-mapped frame again yields an equal frame."""
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]})
        targets = [TargetField(name="first_name"), TargetField(name="email")]
        options = MapOptions(
            schema=TargetSchema(fields=targets),
            auto_infer=True,
            reorder_to_schema=True,
        )
        once = map_columns(frame, options)
        twice = map_columns(once.mapped_df, options)
        pd.testing.assert_frame_equal(twice.mapped_df, once.mapped_df)

    def test_input_not_mutated(self):
        """The caller's DataFrame equals its pre-call deep copy afterwards."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        before = frame.copy(deep=True)
        map_columns(frame, MapOptions(mapping={"a": "x"}))
        pd.testing.assert_frame_equal(frame, before)
|