Files
datatools-dev/tests/test_column_mapper.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

375 lines
14 KiB
Python

"""Tests for src/core/column_mapper.py."""
from __future__ import annotations
import json
import numpy as np
import pandas as pd
import pytest
from src.core.errors import ConfigError, InputValidationError
from src.core.column_mapper import (
MapOptions,
PRESETS,
TargetField,
TargetSchema,
coerce_series,
infer_mapping,
map_columns,
)
# ---------------------------------------------------------------------------
# infer_mapping — fuzzy matcher
# ---------------------------------------------------------------------------
class TestInferMapping:
    """Checks for ``infer_mapping``, the fuzzy source-column to target-field matcher."""

    def test_exact_normalized_match(self):
        """Title-cased, space-separated headers map onto snake_case targets."""
        target = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="last_name"),
        ])
        frame = pd.DataFrame({"First Name": [], "Last Name": []})
        inferred = infer_mapping(frame, target)
        assert inferred == {"First Name": "first_name", "Last Name": "last_name"}

    def test_alias_match(self):
        """A header listed among a field's aliases maps to that field."""
        email_field = TargetField(
            name="email",
            aliases=["EmailAddr", "email_address"],
        )
        frame = pd.DataFrame({"EmailAddr": []})
        inferred = infer_mapping(frame, TargetSchema(fields=[email_field]))
        assert inferred == {"EmailAddr": "email"}

    def test_below_threshold_excluded(self):
        """A header unrelated to the only target yields an empty mapping at 0.6."""
        target = TargetSchema(fields=[TargetField(name="email")])
        frame = pd.DataFrame({"xyz": []})
        inferred = infer_mapping(frame, target, threshold=0.6)
        assert inferred == {}

    def test_target_matched_at_most_once(self):
        """Two candidate headers compete for one target; only one is mapped."""
        target = TargetSchema(fields=[TargetField(name="first_name")])
        frame = pd.DataFrame({"first_name": [], "fname": []})
        inferred = infer_mapping(frame, target)
        # Exact match wins; "fname" stays unmapped.
        assert inferred == {"first_name": "first_name"}

    def test_threshold_zero_matches_anything(self):
        """With threshold 0.0 the single target is still claimed exactly once."""
        target = TargetSchema(fields=[TargetField(name="z")])
        frame = pd.DataFrame({"a": [], "b": []})
        inferred = infer_mapping(frame, target, threshold=0.0)
        assert len(inferred) == 1
# ---------------------------------------------------------------------------
# coerce_series
# ---------------------------------------------------------------------------
class TestCoerceSeries:
    """Checks for ``coerce_series``, which returns ``(converted, failure_count)``."""

    def test_integer_clean(self):
        """Digit strings convert to integers with no failures."""
        raw = pd.Series(["1", "2", "3"])
        converted, n_failed = coerce_series(raw, "integer")
        assert n_failed == 0
        assert list(converted) == [1, 2, 3]

    def test_integer_with_failure(self):
        """An unparseable entry is counted once and becomes a missing value."""
        raw = pd.Series(["1", "bad", "3"])
        converted, n_failed = coerce_series(raw, "integer")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_float_with_thousands_sep(self):
        """Plain decimal strings convert to floats.

        NOTE(review): despite the test name, no thousands separators are
        exercised here; see the inline comment below.
        """
        # Plain floats; thousands-sep handling is for format standardizer.
        raw = pd.Series(["1.5", "2.0", "3.25"])
        converted, n_failed = coerce_series(raw, "float")
        assert n_failed == 0
        assert converted.iloc[2] == 3.25

    def test_boolean_truthy_falsy(self):
        """true/false, Yes/no and 1/0 spellings all convert without failures."""
        raw = pd.Series(["true", "false", "Yes", "no", "1", "0"])
        expected = [True, False, True, False, True, False]
        converted, n_failed = coerce_series(raw, "boolean")
        assert n_failed == 0
        assert list(converted) == expected

    def test_boolean_unknown_value_fails(self):
        """An unrecognised token is counted as a failure and becomes missing."""
        raw = pd.Series(["true", "maybe"])
        converted, n_failed = coerce_series(raw, "boolean")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_date_iso_format(self):
        """ISO ``YYYY-MM-DD`` strings convert to values exposing ``.year``."""
        raw = pd.Series(["2025-01-15", "2025-02-20"])
        converted, n_failed = coerce_series(raw, "date")
        assert n_failed == 0
        assert converted.iloc[0].year == 2025

    def test_date_failure(self):
        """A non-date string is counted as a failure and becomes missing."""
        raw = pd.Series(["2025-01-15", "garbage"])
        converted, n_failed = coerce_series(raw, "date")
        assert n_failed == 1
        assert pd.isna(converted.iloc[1])

    def test_string_passthrough(self):
        """Integers coerced to "string" end up with a dtype named "string"."""
        raw = pd.Series([1, 2, 3])
        converted, n_failed = coerce_series(raw, "string")
        assert n_failed == 0
        assert converted.dtype.name == "string"

    def test_auto_returns_unchanged(self):
        """The "auto" dtype hands back the very same Series object."""
        raw = pd.Series([1, 2])
        converted, n_failed = coerce_series(raw, "auto")
        assert n_failed == 0
        assert converted is raw

    def test_unknown_dtype_raises(self):
        """An unsupported dtype name raises ``InputValidationError``."""
        single = pd.Series([1])
        with pytest.raises(InputValidationError):
            coerce_series(single, "bogus")  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# map_columns — explicit mapping
# ---------------------------------------------------------------------------
class TestMapColumnsExplicit:
    """Checks for ``map_columns`` driven by an explicit source-to-target mapping."""

    @staticmethod
    def _two_column_frame():
        """Build the one-row ``a``/``b`` frame shared by most tests here."""
        return pd.DataFrame({"a": [1], "b": [2]})

    def test_simple_rename(self):
        """Both columns are renamed and the rename counter reflects that."""
        options = MapOptions(mapping={"a": "alpha", "b": "beta"})
        result = map_columns(self._two_column_frame(), options)
        assert list(result.mapped_df.columns) == ["alpha", "beta"]
        assert result.columns_renamed == 2

    def test_unknown_source_raises(self):
        """Mapping a source column absent from the frame is rejected."""
        frame = pd.DataFrame({"a": [1]})
        options = MapOptions(mapping={"missing": "x"})
        with pytest.raises(InputValidationError):
            map_columns(frame, options)

    def test_duplicate_target_raises(self):
        """Two sources pointing at the same target name is rejected."""
        options = MapOptions(mapping={"a": "x", "b": "x"})
        with pytest.raises(InputValidationError):
            map_columns(self._two_column_frame(), options)

    def test_unmapped_keep(self):
        """``unmapped="keep"`` retains the extra column and reports it."""
        options = MapOptions(mapping={"a": "alpha"}, unmapped="keep")
        result = map_columns(self._two_column_frame(), options)
        assert "b" in result.mapped_df.columns
        assert result.unmapped_kept == ["b"]

    def test_unmapped_drop(self):
        """``unmapped="drop"`` removes the extra column and reports it."""
        options = MapOptions(mapping={"a": "alpha"}, unmapped="drop")
        result = map_columns(self._two_column_frame(), options)
        assert list(result.mapped_df.columns) == ["alpha"]
        assert result.columns_dropped == ["b"]

    def test_unmapped_error(self):
        """``unmapped="error"`` raises when a column has no mapping."""
        options = MapOptions(mapping={"a": "alpha"}, unmapped="error")
        with pytest.raises(InputValidationError):
            map_columns(self._two_column_frame(), options)
# ---------------------------------------------------------------------------
# map_columns — schema + auto-inference
# ---------------------------------------------------------------------------
class TestMapColumnsWithSchema:
    """Checks for ``map_columns`` when a ``TargetSchema`` is supplied.

    Covers auto-inference, explicit-mapping precedence, required-field
    enforcement, default-filled columns, schema reordering and type coercion.
    """

    def test_auto_infer_renames(self):
        """Inferred pairs are applied as renames and reported on the result."""
        df = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="last_name"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True)
        res = map_columns(df, opts)
        assert "first_name" in res.mapped_df.columns
        assert "last_name" in res.mapped_df.columns
        assert res.inferred_pairs == {
            "First Name": "first_name",
            "Last Name": "last_name",
        }

    def test_explicit_overrides_inferred(self):
        """An explicit mapping claims the target; the other column is not mapped."""
        df = pd.DataFrame({"name": ["A"], "fname": ["B"]})
        schema = TargetSchema(fields=[TargetField(name="first_name")])
        opts = MapOptions(
            schema=schema,
            mapping={"fname": "first_name"},
            auto_infer=True,
        )
        res = map_columns(df, opts)
        assert res.mapping["fname"] == "first_name"
        assert "name" not in res.mapping

    def test_required_missing_raises(self):
        """A required target with no source raises when enforcement is on."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=True)
        with pytest.raises(InputValidationError):
            map_columns(df, opts)

    def test_required_missing_with_default_added(self):
        """A schema field with a default and no source is added, filled with it."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="source", required=False, default="import"),
        ])
        opts = MapOptions(schema=schema, auto_infer=False)
        res = map_columns(df, opts)
        assert "source" in res.mapped_df.columns
        assert res.mapped_df.iloc[0]["source"] == "import"
        assert res.columns_added == ["source"]

    def test_required_missing_disabled(self):
        """With enforcement off, missing required targets are reported, not raised."""
        df = pd.DataFrame({"first_name": ["A"]})
        schema = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=False)
        res = map_columns(df, opts)
        assert "email" in res.missing_required_targets

    def test_reorder_to_schema(self):
        """``reorder_to_schema`` emits columns in schema field order."""
        df = pd.DataFrame({"z": [1], "a": [2], "m": [3]})
        schema = TargetSchema(fields=[
            TargetField(name="a"),
            TargetField(name="m"),
            TargetField(name="z"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True)
        res = map_columns(df, opts)
        assert list(res.mapped_df.columns) == ["a", "m", "z"]

    def test_coerce_types(self):
        """``coerce_types`` converts columns per schema dtype and counts failures."""
        df = pd.DataFrame({
            "age": ["30", "bad", "40"],
            "active": ["true", "no", "yes"],
        })
        schema = TargetSchema(fields=[
            TargetField(name="age", dtype="integer"),
            TargetField(name="active", dtype="boolean"),
        ])
        opts = MapOptions(schema=schema, auto_infer=True, coerce_types=True)
        res = map_columns(df, opts)
        assert res.mapped_df["age"].iloc[0] == 30
        first_active = res.mapped_df["active"].iloc[0]
        # The previous ``x is True or x`` check reduced to plain truthiness,
        # which the uncoerced string "true" also satisfies. Reject a leftover
        # string explicitly so the assertion proves coercion actually ran.
        assert not isinstance(first_active, str)
        assert bool(first_active) is True
        # Only "bad" in the age column fails; every boolean token is valid.
        assert res.coercion_failures == {"age": 1}
# ---------------------------------------------------------------------------
# Presets
# ---------------------------------------------------------------------------
class TestPresets:
    """Checks for the named option bundles built by ``MapOptions.from_preset``."""

    def test_strict_schema_drops_and_coerces_and_reorders(self):
        """"strict-schema" keeps only schema columns, in schema order."""
        target = TargetSchema(fields=[
            TargetField(name="first_name", required=True),
            TargetField(name="email", required=True),
        ])
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]})
        options = MapOptions.from_preset("strict-schema")
        options.schema = target
        result = map_columns(frame, options)
        assert list(result.mapped_df.columns) == ["first_name", "email"]
        assert result.columns_dropped == ["extra"]

    def test_lenient_keeps_extras(self):
        """"lenient-schema" leaves a column outside the schema in the output."""
        target = TargetSchema(fields=[TargetField(name="first_name")])
        frame = pd.DataFrame({"First Name": ["A"], "extra": [1]})
        options = MapOptions.from_preset("lenient-schema")
        options.schema = target
        result = map_columns(frame, options)
        assert "extra" in result.mapped_df.columns

    def test_unknown_preset(self):
        """An unrecognised preset name raises ``ConfigError``."""
        with pytest.raises(ConfigError):
            MapOptions.from_preset("does-not-exist")
# ---------------------------------------------------------------------------
# Schema serialization
# ---------------------------------------------------------------------------
class TestSchemaIO:
    """Checks for dict and file (de)serialisation of schemas and options."""

    def test_roundtrip_dict(self):
        """``to_dict`` then ``from_dict`` preserves names, required and default."""
        original = TargetSchema(fields=[
            TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]),
            TargetField(name="y", default="z"),
        ])
        restored = TargetSchema.from_dict(original.to_dict())
        first, second = restored.fields[0], restored.fields[1]
        assert restored.field_names() == ["x", "y"]
        assert first.required is True
        assert second.default == "z"

    def test_from_dict_string_field(self):
        """Bare strings are accepted as field entries."""
        # Allow shorthand: bare string defaults to dtype=auto.
        payload = {"fields": ["a", "b"]}
        restored = TargetSchema.from_dict(payload)
        assert restored.field_names() == ["a", "b"]

    def test_from_dict_unknown_dtype_raises(self):
        """A field entry with an unsupported dtype raises ``ConfigError``."""
        payload = {"fields": [{"name": "x", "dtype": "bogus"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_from_dict_missing_name_raises(self):
        """A field entry without a name raises ``ConfigError``."""
        payload = {"fields": [{"dtype": "string"}]}
        with pytest.raises(ConfigError):
            TargetSchema.from_dict(payload)

    def test_options_roundtrip_to_file(self, tmp_path):
        """Options written with ``to_file`` are recovered by ``from_file``."""
        options = MapOptions(
            schema=TargetSchema(fields=[TargetField(name="x", dtype="string")]),
            mapping={"a": "x"},
            unmapped="drop",
            coerce_types=True,
            reorder_to_schema=True,
        )
        config_path = tmp_path / "cfg.json"
        options.to_file(config_path)
        restored = MapOptions.from_file(config_path)
        assert restored.mapping == {"a": "x"}
        assert restored.unmapped == "drop"
        assert restored.coerce_types is True
        assert restored.schema is not None
        assert restored.schema.field_names() == ["x"]
# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------
class TestValidation:
    """Checks that invalid options and inputs are rejected with typed errors."""

    def test_invalid_unmapped_strategy(self):
        """An unknown ``unmapped`` strategy fails ``validate()``."""
        bad_options = MapOptions(unmapped="bogus")  # type: ignore[arg-type]
        with pytest.raises(InputValidationError):
            bad_options.validate()

    def test_threshold_out_of_range(self):
        """A ``fuzzy_threshold`` of 1.5 fails ``validate()`` with ``ConfigError``."""
        bad_options = MapOptions(fuzzy_threshold=1.5)
        with pytest.raises(ConfigError):
            bad_options.validate()

    def test_non_dataframe_input(self):
        """Passing a plain list instead of a DataFrame is rejected."""
        not_a_frame = [1, 2, 3]
        with pytest.raises(InputValidationError):
            map_columns(not_a_frame)  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# Idempotency
# ---------------------------------------------------------------------------
class TestIdempotency:
    """Checks that mapping is repeatable and leaves its input untouched."""

    def test_double_apply_is_stable(self):
        """Re-mapping an already mapped frame yields an equal frame."""
        target = TargetSchema(fields=[
            TargetField(name="first_name"),
            TargetField(name="email"),
        ])
        options = MapOptions(schema=target, auto_infer=True, reorder_to_schema=True)
        frame = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]})
        once = map_columns(frame, options)
        twice = map_columns(once.mapped_df, options)
        pd.testing.assert_frame_equal(twice.mapped_df, once.mapped_df)

    def test_input_not_mutated(self):
        """The caller's frame still equals a deep copy taken before the call."""
        frame = pd.DataFrame({"a": [1], "b": [2]})
        before = frame.copy(deep=True)
        options = MapOptions(mapping={"a": "x"})
        map_columns(frame, options)
        pd.testing.assert_frame_equal(frame, before)