"""Tests for src/core/column_mapper.py.""" from __future__ import annotations import json import numpy as np import pandas as pd import pytest from src.core.errors import ConfigError, InputValidationError from src.core.column_mapper import ( MapOptions, PRESETS, TargetField, TargetSchema, coerce_series, infer_mapping, map_columns, ) # --------------------------------------------------------------------------- # infer_mapping — fuzzy matcher # --------------------------------------------------------------------------- class TestInferMapping: def test_exact_normalized_match(self): df = pd.DataFrame({"First Name": [], "Last Name": []}) schema = TargetSchema(fields=[ TargetField(name="first_name"), TargetField(name="last_name"), ]) m = infer_mapping(df, schema) assert m == {"First Name": "first_name", "Last Name": "last_name"} def test_alias_match(self): df = pd.DataFrame({"EmailAddr": []}) schema = TargetSchema(fields=[ TargetField(name="email", aliases=["EmailAddr", "email_address"]), ]) m = infer_mapping(df, schema) assert m == {"EmailAddr": "email"} def test_below_threshold_excluded(self): df = pd.DataFrame({"xyz": []}) schema = TargetSchema(fields=[TargetField(name="email")]) m = infer_mapping(df, schema, threshold=0.6) assert m == {} def test_target_matched_at_most_once(self): df = pd.DataFrame({"first_name": [], "fname": []}) schema = TargetSchema(fields=[TargetField(name="first_name")]) m = infer_mapping(df, schema) # Exact match wins; "fname" stays unmapped. assert m == {"first_name": "first_name"} def test_threshold_zero_matches_anything(self): df = pd.DataFrame({"a": [], "b": []}) schema = TargetSchema(fields=[TargetField(name="z")]) m = infer_mapping(df, schema, threshold=0.0) assert len(m) == 1 # --------------------------------------------------------------------------- # coerce_series # --------------------------------------------------------------------------- class TestCoerceSeries: def test_integer_clean(self): s = pd.Series(["1", "2", "3"]) out, fails = coerce_series(s, "integer") assert list(out) == [1, 2, 3] assert fails == 0 def test_integer_with_failure(self): s = pd.Series(["1", "bad", "3"]) out, fails = coerce_series(s, "integer") assert fails == 1 assert pd.isna(out.iloc[1]) def test_float_with_thousands_sep(self): # Plain floats; thousands-sep handling is for format standardizer. s = pd.Series(["1.5", "2.0", "3.25"]) out, fails = coerce_series(s, "float") assert fails == 0 assert out.iloc[2] == 3.25 def test_boolean_truthy_falsy(self): s = pd.Series(["true", "false", "Yes", "no", "1", "0"]) out, fails = coerce_series(s, "boolean") assert fails == 0 assert list(out) == [True, False, True, False, True, False] def test_boolean_unknown_value_fails(self): s = pd.Series(["true", "maybe"]) out, fails = coerce_series(s, "boolean") assert fails == 1 assert pd.isna(out.iloc[1]) def test_date_iso_format(self): s = pd.Series(["2025-01-15", "2025-02-20"]) out, fails = coerce_series(s, "date") assert fails == 0 assert out.iloc[0].year == 2025 def test_date_failure(self): s = pd.Series(["2025-01-15", "garbage"]) out, fails = coerce_series(s, "date") assert fails == 1 assert pd.isna(out.iloc[1]) def test_string_passthrough(self): s = pd.Series([1, 2, 3]) out, fails = coerce_series(s, "string") assert fails == 0 assert out.dtype.name == "string" def test_auto_returns_unchanged(self): s = pd.Series([1, 2]) out, fails = coerce_series(s, "auto") assert fails == 0 assert out is s def test_unknown_dtype_raises(self): with pytest.raises(InputValidationError): coerce_series(pd.Series([1]), "bogus") # type: ignore[arg-type] # --------------------------------------------------------------------------- # map_columns — explicit mapping # --------------------------------------------------------------------------- class TestMapColumnsExplicit: def test_simple_rename(self): df = pd.DataFrame({"a": [1], "b": [2]}) opts = MapOptions(mapping={"a": "alpha", "b": "beta"}) res = map_columns(df, opts) assert list(res.mapped_df.columns) == ["alpha", "beta"] assert res.columns_renamed == 2 def test_unknown_source_raises(self): df = pd.DataFrame({"a": [1]}) opts = MapOptions(mapping={"missing": "x"}) with pytest.raises(InputValidationError): map_columns(df, opts) def test_duplicate_target_raises(self): df = pd.DataFrame({"a": [1], "b": [2]}) opts = MapOptions(mapping={"a": "x", "b": "x"}) with pytest.raises(InputValidationError): map_columns(df, opts) def test_unmapped_keep(self): df = pd.DataFrame({"a": [1], "b": [2]}) opts = MapOptions(mapping={"a": "alpha"}, unmapped="keep") res = map_columns(df, opts) assert "b" in res.mapped_df.columns assert res.unmapped_kept == ["b"] def test_unmapped_drop(self): df = pd.DataFrame({"a": [1], "b": [2]}) opts = MapOptions(mapping={"a": "alpha"}, unmapped="drop") res = map_columns(df, opts) assert list(res.mapped_df.columns) == ["alpha"] assert res.columns_dropped == ["b"] def test_unmapped_error(self): df = pd.DataFrame({"a": [1], "b": [2]}) opts = MapOptions(mapping={"a": "alpha"}, unmapped="error") with pytest.raises(InputValidationError): map_columns(df, opts) # --------------------------------------------------------------------------- # map_columns — schema + auto-inference # --------------------------------------------------------------------------- class TestMapColumnsWithSchema: def test_auto_infer_renames(self): df = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]}) schema = TargetSchema(fields=[ TargetField(name="first_name"), TargetField(name="last_name"), ]) opts = MapOptions(schema=schema, auto_infer=True) res = map_columns(df, opts) assert "first_name" in res.mapped_df.columns assert "last_name" in res.mapped_df.columns assert res.inferred_pairs == {"First Name": "first_name", "Last Name": "last_name"} def test_explicit_overrides_inferred(self): df = pd.DataFrame({"name": ["A"], "fname": ["B"]}) schema = TargetSchema(fields=[TargetField(name="first_name")]) opts = MapOptions( schema=schema, mapping={"fname": "first_name"}, auto_infer=True, ) res = map_columns(df, opts) assert res.mapping["fname"] == "first_name" assert "name" not in res.mapping def test_required_missing_raises(self): df = pd.DataFrame({"first_name": ["A"]}) schema = TargetSchema(fields=[ TargetField(name="first_name", required=True), TargetField(name="email", required=True), ]) opts = MapOptions(schema=schema, auto_infer=False, enforce_required=True) with pytest.raises(InputValidationError): map_columns(df, opts) def test_required_missing_with_default_added(self): df = pd.DataFrame({"first_name": ["A"]}) schema = TargetSchema(fields=[ TargetField(name="first_name", required=True), TargetField(name="source", required=False, default="import"), ]) opts = MapOptions(schema=schema, auto_infer=False) res = map_columns(df, opts) assert "source" in res.mapped_df.columns assert res.mapped_df.iloc[0]["source"] == "import" assert res.columns_added == ["source"] def test_required_missing_disabled(self): df = pd.DataFrame({"first_name": ["A"]}) schema = TargetSchema(fields=[ TargetField(name="first_name", required=True), TargetField(name="email", required=True), ]) opts = MapOptions(schema=schema, auto_infer=False, enforce_required=False) res = map_columns(df, opts) assert "email" in res.missing_required_targets def test_reorder_to_schema(self): df = pd.DataFrame({"z": [1], "a": [2], "m": [3]}) schema = TargetSchema(fields=[ TargetField(name="a"), TargetField(name="m"), TargetField(name="z"), ]) opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True) res = map_columns(df, opts) assert list(res.mapped_df.columns) == ["a", "m", "z"] def test_coerce_types(self): df = pd.DataFrame({"age": ["30", "bad", "40"], "active": ["true", "no", "yes"]}) schema = TargetSchema(fields=[ TargetField(name="age", dtype="integer"), TargetField(name="active", dtype="boolean"), ]) opts = MapOptions(schema=schema, auto_infer=True, coerce_types=True) res = map_columns(df, opts) assert res.mapped_df["age"].iloc[0] == 30 assert res.mapped_df["active"].iloc[0] is True or res.mapped_df["active"].iloc[0] assert res.coercion_failures == {"age": 1} # --------------------------------------------------------------------------- # Presets # --------------------------------------------------------------------------- class TestPresets: def test_strict_schema_drops_and_coerces_and_reorders(self): df = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]}) schema = TargetSchema(fields=[ TargetField(name="first_name", required=True), TargetField(name="email", required=True), ]) opts = MapOptions.from_preset("strict-schema") opts.schema = schema res = map_columns(df, opts) assert list(res.mapped_df.columns) == ["first_name", "email"] assert res.columns_dropped == ["extra"] def test_lenient_keeps_extras(self): df = pd.DataFrame({"First Name": ["A"], "extra": [1]}) schema = TargetSchema(fields=[TargetField(name="first_name")]) opts = MapOptions.from_preset("lenient-schema") opts.schema = schema res = map_columns(df, opts) assert "extra" in res.mapped_df.columns def test_unknown_preset(self): with pytest.raises(ConfigError): MapOptions.from_preset("does-not-exist") # --------------------------------------------------------------------------- # Schema serialization # --------------------------------------------------------------------------- class TestSchemaIO: def test_roundtrip_dict(self): schema = TargetSchema(fields=[ TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]), TargetField(name="y", default="z"), ]) d = schema.to_dict() loaded = TargetSchema.from_dict(d) assert loaded.field_names() == ["x", "y"] assert loaded.fields[0].required is True assert loaded.fields[1].default == "z" def test_from_dict_string_field(self): # Allow shorthand: bare string defaults to dtype=auto. loaded = TargetSchema.from_dict({"fields": ["a", "b"]}) assert loaded.field_names() == ["a", "b"] def test_from_dict_unknown_dtype_raises(self): with pytest.raises(ConfigError): TargetSchema.from_dict({"fields": [{"name": "x", "dtype": "bogus"}]}) def test_from_dict_missing_name_raises(self): with pytest.raises(ConfigError): TargetSchema.from_dict({"fields": [{"dtype": "string"}]}) def test_options_roundtrip_to_file(self, tmp_path): schema = TargetSchema(fields=[TargetField(name="x", dtype="string")]) opts = MapOptions( schema=schema, mapping={"a": "x"}, unmapped="drop", coerce_types=True, reorder_to_schema=True, ) path = tmp_path / "cfg.json" opts.to_file(path) loaded = MapOptions.from_file(path) assert loaded.mapping == {"a": "x"} assert loaded.unmapped == "drop" assert loaded.coerce_types is True assert loaded.schema is not None assert loaded.schema.field_names() == ["x"] # --------------------------------------------------------------------------- # Validation # --------------------------------------------------------------------------- class TestValidation: def test_invalid_unmapped_strategy(self): opts = MapOptions(unmapped="bogus") # type: ignore[arg-type] with pytest.raises(InputValidationError): opts.validate() def test_threshold_out_of_range(self): opts = MapOptions(fuzzy_threshold=1.5) with pytest.raises(ConfigError): opts.validate() def test_non_dataframe_input(self): with pytest.raises(InputValidationError): map_columns([1, 2, 3]) # type: ignore[arg-type] # --------------------------------------------------------------------------- # Idempotency # --------------------------------------------------------------------------- class TestIdempotency: def test_double_apply_is_stable(self): df = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]}) schema = TargetSchema(fields=[ TargetField(name="first_name"), TargetField(name="email"), ]) opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True) first = map_columns(df, opts) second = map_columns(first.mapped_df, opts) pd.testing.assert_frame_equal(second.mapped_df, first.mapped_df) def test_input_not_mutated(self): df = pd.DataFrame({"a": [1], "b": [2]}) snapshot = df.copy(deep=True) map_columns(df, MapOptions(mapping={"a": "x"})) pd.testing.assert_frame_equal(df, snapshot)