Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
634 lines
23 KiB
Python
634 lines
23 KiB
Python
"""DataTools Column Mapper.

Rename columns, enforce a target schema, coerce types, drop / add /
reorder columns. Designed for the three buyer profiles the toolkit
already serves:

1. **Schema enforcement** — analyst receives a CSV that has to fit a
   known target shape (a CRM import format, a database schema, a
   mailing-list contract). Map source columns to target names, coerce
   each to the declared type, drop the extras, fail clearly when a
   required target field is missing.
2. **Multi-source unification** — operator merges vendor/partner
   exports where every file uses different column names ("First Name"
   / "first_name" / "FirstName"). The fuzzy auto-mapper proposes a
   mapping; the user reviews and overrides.
3. **Type coercion** — quick conversion of mis-typed columns (string
   "123" → int, "true"/"yes" → bool, "2024-01-15" → date) without
   leaving the tool, with errors surfaced row-by-row.

Public API
----------
Types:
    TargetField, TargetSchema, ColumnMapping, MapOptions, MapResult,
    ColumnDtype

Functions:
    map_columns(df, options) -> MapResult
    infer_mapping(df, schema, *, threshold=0.6) -> dict[src, target]
    coerce_series(series, dtype) -> (Series, n_failures)

Presets:
    PRESETS = {"rename-only", "strict-schema", "lenient-schema"}
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from dataclasses import asdict, dataclass, field
|
||
from pathlib import Path
|
||
from typing import Any, Iterable, Literal, Optional
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
from loguru import logger
|
||
from pandas.api import types as pdtypes
|
||
|
||
from .errors import ConfigError, InputValidationError, ensure_choice, ensure_dataframe
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Types
|
||
# ---------------------------------------------------------------------------
|
||
|
||
ColumnDtype = Literal[
|
||
"string",
|
||
"integer",
|
||
"float",
|
||
"boolean",
|
||
"date",
|
||
"datetime",
|
||
"category",
|
||
"auto", # leave dtype alone
|
||
]
|
||
|
||
_VALID_DTYPES: frozenset[str] = frozenset({
|
||
"string", "integer", "float", "boolean", "date", "datetime",
|
||
"category", "auto",
|
||
})
|
||
|
||
|
||
@dataclass
class TargetField:
    """A single column of the target schema.

    When a required field has no matching source column, the problem is
    reported through ``MapResult.missing_required_targets`` instead of
    a NaN column being created silently.
    """

    # Column name the field carries in the mapped output.
    name: str
    # Type applied by ``map_columns`` when coercion is enabled; "auto"
    # leaves the column's dtype untouched.
    dtype: ColumnDtype = "auto"
    # Whether the field must be backed by a source column (or a default).
    required: bool = False
    # Alternative names scored like ``name`` during fuzzy inference.
    aliases: list[str] = field(default_factory=list)
    # Fill value used when no source column maps to this field.
    default: Any = None
|
||
|
||
|
||
@dataclass
class TargetSchema:
    """Ordered list of target fields. Ordering survives into the result DataFrame."""

    fields: list[TargetField]

    def field_names(self) -> list[str]:
        """Return the field names in schema order."""
        return [f.name for f in self.fields]

    def get(self, name: str) -> Optional[TargetField]:
        """Return the first field called *name*, or ``None`` when absent."""
        return next((f for f in self.fields if f.name == name), None)

    def to_dict(self) -> dict:
        """Return a plain-dict form of the schema (``{"fields": [...]}``)."""
        return {"fields": [asdict(f) for f in self.fields]}

    def to_file(self, path: str | Path) -> Path:
        """Write the schema as JSON to *path* and return the path written."""
        out = Path(path)
        # JSON is UTF-8 by specification; never depend on the platform
        # default encoding.
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_dict(cls, data: dict) -> TargetSchema:
        """Build a schema from its dict form.

        Each entry of ``data["fields"]`` is either a bare field name or
        a dict with a ``name`` and optional ``dtype`` / ``required`` /
        ``aliases`` / ``default`` keys.

        Raises:
            ConfigError: ``fields`` is missing or not a list-like, an
                entry has no ``name``, or an entry declares an unknown
                dtype.
        """
        if not isinstance(data, dict) or "fields" not in data:
            raise ConfigError(
                "Target schema must contain a 'fields' list",
                operation="TargetSchema.from_dict",
                suggestion='Example: {"fields": [{"name": "email", "dtype": "string", "required": true}, ...]}',
            )
        raw_fields = data["fields"]
        # A bare string would otherwise be iterated character by
        # character, yielding one bogus field per letter.
        if isinstance(raw_fields, (str, bytes)) or not hasattr(raw_fields, "__iter__"):
            raise ConfigError(
                "Target schema 'fields' must be a list of field definitions",
                operation="TargetSchema.from_dict",
                suggestion='Example: {"fields": [{"name": "email", "dtype": "string", "required": true}, ...]}',
            )

        fields = []
        for entry in raw_fields:
            if isinstance(entry, str):
                fields.append(TargetField(name=entry))
                continue
            if not isinstance(entry, dict) or "name" not in entry:
                raise ConfigError(
                    f"Schema field is missing 'name': {entry!r}",
                    operation="TargetSchema.from_dict",
                )
            dtype = entry.get("dtype", "auto")
            if dtype not in _VALID_DTYPES:
                raise ConfigError(
                    f"Schema field {entry['name']!r}: unknown dtype {dtype!r}",
                    operation="TargetSchema.from_dict",
                    suggestion=f"Valid: {sorted(_VALID_DTYPES)}",
                )
            # ``"aliases": "cust_id"`` means one alias, not seven
            # single-character aliases; ``null`` means none.
            aliases = entry.get("aliases") or []
            if isinstance(aliases, str):
                aliases = [aliases]
            fields.append(TargetField(
                name=entry["name"],
                dtype=dtype,
                required=bool(entry.get("required", False)),
                aliases=list(aliases),
                default=entry.get("default"),
            ))
        return cls(fields=fields)

    @classmethod
    def from_file(cls, path: str | Path) -> TargetSchema:
        """Load a schema from the JSON file at *path* (read as UTF-8)."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Fuzzy column-name matching
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Whitespace, punctuation, and case all vary across vendors. We normalise
|
||
# both sides to a token list before comparing.
|
||
_NORM_RE = re.compile(r"[^a-z0-9]+")
|
||
|
||
|
||
def _normalize_name(name: str) -> str:
|
||
"""Lowercase, strip non-alphanumerics — ``First Name`` → ``firstname``."""
|
||
if not isinstance(name, str):
|
||
return ""
|
||
return _NORM_RE.sub("", name.strip().lower())
|
||
|
||
|
||
def _token_set(name: str) -> frozenset[str]:
|
||
"""Tokenise a column name on non-alphanumeric boundaries."""
|
||
if not isinstance(name, str):
|
||
return frozenset()
|
||
parts = [p for p in _NORM_RE.split(name.strip().lower()) if p]
|
||
return frozenset(parts)
|
||
|
||
|
||
def _name_similarity(a: str, b: str) -> float:
|
||
"""Cheap similarity score in [0.0, 1.0].
|
||
|
||
Combines exact-after-normalisation, token Jaccard, and SequenceMatcher
|
||
ratio. A real fuzzy library (rapidfuzz) is already a project
|
||
dependency for the deduplicator — we use it when available, fall
|
||
back to stdlib ``difflib`` otherwise so the mapper works in trimmed
|
||
builds.
|
||
"""
|
||
if not a or not b:
|
||
return 0.0
|
||
na, nb = _normalize_name(a), _normalize_name(b)
|
||
if na == nb:
|
||
return 1.0
|
||
|
||
ta, tb = _token_set(a), _token_set(b)
|
||
jaccard = (len(ta & tb) / len(ta | tb)) if (ta or tb) else 0.0
|
||
|
||
try:
|
||
from rapidfuzz import fuzz
|
||
seq = fuzz.ratio(na, nb) / 100.0
|
||
except ImportError:
|
||
from difflib import SequenceMatcher
|
||
seq = SequenceMatcher(None, na, nb).ratio()
|
||
|
||
return max(jaccard, seq)
|
||
|
||
|
||
def infer_mapping(
    df: pd.DataFrame,
    schema: TargetSchema,
    *,
    threshold: float = 0.6,
) -> dict[str, str]:
    """Best-guess source-column → target-field mapping.

    Returns a dict keyed by source-column label. A source column is
    omitted from the result when no candidate scores at or above
    *threshold*. Each target is matched at most once: the
    highest-scoring source wins, ties broken by source-column order in
    *df*.

    Aliases declared on a :class:`TargetField` are scored as if they
    were target names — useful for vendor-specific synonyms
    (``["customer_id", "cust_id", "client_no"]``).

    Keys are the labels exactly as they appear in ``df.columns`` so the
    result can be passed straight to ``DataFrame.rename``.
    """
    ensure_dataframe(df, function="infer_mapping")
    sources = list(df.columns)

    # Score every (source, target) pair, keeping those that reach the
    # threshold. The column position is recorded up front so the
    # tie-break needs no per-key ``list.index`` scan, which was
    # quadratic and raised ValueError for non-string labels.
    candidates: list[tuple[float, int, Any, str]] = []
    for position, src in enumerate(sources):
        for tgt in schema.fields:
            best = max(
                _name_similarity(src, candidate)
                for candidate in (tgt.name, *tgt.aliases)
            )
            if best >= threshold:
                candidates.append((best, position, src, tgt.name))

    # Best score first, then leftmost source column. The sort is stable,
    # so remaining ties keep schema order.
    candidates.sort(key=lambda c: (-c[0], c[1]))

    # Greedy walk: each source and each target is claimed at most once.
    mapping: dict[str, str] = {}
    used_targets: set[str] = set()
    for _score, _position, src, tgt_name in candidates:
        if src in mapping or tgt_name in used_targets:
            continue
        mapping[src] = tgt_name
        used_targets.add(tgt_name)
    return mapping
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Type coercion
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_TRUTHY = frozenset({"true", "t", "yes", "y", "1"})
|
||
_FALSY = frozenset({"false", "f", "no", "n", "0"})
|
||
|
||
|
||
def _coerce_boolean(value: Any) -> Any:
|
||
if isinstance(value, bool):
|
||
return value
|
||
if value is None or (isinstance(value, float) and pd.isna(value)):
|
||
return pd.NA
|
||
if isinstance(value, (int, float)):
|
||
return bool(value)
|
||
if isinstance(value, str):
|
||
v = value.strip().lower()
|
||
if v in _TRUTHY:
|
||
return True
|
||
if v in _FALSY:
|
||
return False
|
||
raise ValueError(f"cannot coerce to boolean: {value!r}")
|
||
|
||
|
||
def coerce_series(series: pd.Series, dtype: ColumnDtype) -> tuple[pd.Series, int]:
    """Coerce *series* to *dtype*, returning ``(coerced, n_failures)``.

    Failures are counted but never raised — the caller (``map_columns``)
    surfaces them through ``MapResult.coercion_failures`` so the user
    can inspect which rows didn't fit. A failure is a cell that held a
    value before coercion and is missing afterwards. ``"auto"`` returns
    the input unchanged.

    For ``"integer"``, fractional values are rounded; infinities and
    floats outside the int64 range cannot be represented and are
    counted as failures instead of aborting the whole column.

    Raises:
        InputValidationError: *dtype* is not a known dtype name.
    """
    if dtype == "auto":
        return series, 0
    if dtype == "string":
        return series.astype("string"), 0
    if dtype == "category":
        return series.astype("category"), 0
    if dtype == "integer":
        numeric = pd.to_numeric(series, errors="coerce")
        if pdtypes.is_float_dtype(numeric.dtype):
            # ±inf and out-of-range floats make the Int64 cast raise;
            # blank them out so they are reported as failures.
            values = numeric.to_numpy(dtype="float64", na_value=np.nan)
            with np.errstate(invalid="ignore"):
                unfit = (values >= 2.0 ** 63) | (values < -(2.0 ** 63))
            if unfit.any():
                numeric = numeric.mask(unfit)
        # Use nullable Int64 so NaN entries don't get cast to floats.
        rounded = numeric.round().astype("Int64")
        # Failures = original non-NaN cells whose coercion produced NA.
        failed = (rounded.isna() & series.notna()).sum()
        return rounded, int(failed)
    if dtype == "float":
        coerced = pd.to_numeric(series, errors="coerce").astype("Float64")
        failed = (coerced.isna() & series.notna()).sum()
        return coerced, int(failed)
    if dtype == "boolean":
        out: list[Any] = []
        failed = 0
        for v in series.tolist():
            try:
                out.append(_coerce_boolean(v))
            except ValueError:
                out.append(pd.NA)
                failed += 1
        return pd.Series(out, index=series.index, dtype="boolean"), failed
    if dtype in {"date", "datetime"}:
        coerced = pd.to_datetime(series, errors="coerce", utc=False)
        failed = (coerced.isna() & series.notna()).sum()
        if dtype == "date":
            # Drop the time component but keep dtype as datetime64 so
            # downstream operations (delta, sort) still work.
            coerced = coerced.dt.normalize()
        return coerced, int(failed)
    raise InputValidationError(
        f"Unknown dtype {dtype!r}",
        operation="coerce_series",
        suggestion=f"Valid: {sorted(_VALID_DTYPES)}",
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Options / result dataclasses
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Strategy for handling source columns that don't appear in the target
|
||
# schema. ``keep`` preserves them at the end of the output; ``drop``
|
||
# removes them; ``error`` raises an InputValidationError.
|
||
UnmappedStrategy = Literal["keep", "drop", "error"]

# Named option bundles; each value is a kwargs dict for ``MapOptions``.
PRESETS: dict[str, dict[str, Any]] = {
    # Rename only: keep everything else exactly as it arrived.
    "rename-only": dict(
        auto_infer=True,
        unmapped="keep",
        coerce_types=False,
        reorder_to_schema=False,
    ),
    # Project onto the schema: drop extras, coerce, reorder.
    "strict-schema": dict(
        auto_infer=True,
        unmapped="drop",
        coerce_types=True,
        reorder_to_schema=True,
    ),
    # As strict, but unmapped source columns survive.
    "lenient-schema": dict(
        auto_infer=True,
        unmapped="keep",
        coerce_types=True,
        reorder_to_schema=True,
    ),
}
|
||
|
||
|
||
@dataclass
class MapOptions:
    """Toggles for column mapping.

    Defaults match the ``rename-only`` preset: best-effort fuzzy match
    against the schema (if provided), keep unmapped source columns
    after the mapped ones, no type coercion, no reorder.
    """

    # Either pass an explicit ``mapping`` dict or a ``schema`` (and let
    # the engine infer the mapping). Explicit mapping wins when both
    # are set.
    mapping: dict[str, str] = field(default_factory=dict)
    schema: Optional[TargetSchema] = None

    # When True (default), missing entries in ``mapping`` are filled in
    # by ``infer_mapping`` against ``schema``. When False, only the
    # explicit mapping is honoured.
    auto_infer: bool = True
    fuzzy_threshold: float = 0.6

    # What to do with source columns that aren't in the mapping.
    unmapped: UnmappedStrategy = "keep"

    # Apply target-field dtypes from the schema after rename.
    coerce_types: bool = False

    # Reorder output to match schema.fields order. Unmapped survivors
    # (when unmapped="keep") are appended at the end in their original
    # source order.
    reorder_to_schema: bool = False

    # Required-target enforcement. When True (default), a required
    # target field that has neither a source column nor a schema
    # default raises an InputValidationError. When False, such fields
    # are only reported in ``MapResult.missing_required_targets``.
    enforce_required: bool = True

    @classmethod
    def from_preset(cls, name: str) -> MapOptions:
        """Build options from a named entry of ``PRESETS``.

        Raises:
            ConfigError: *name* is not a known preset.
        """
        if name not in PRESETS:
            raise ConfigError(
                f"Unknown preset '{name}'",
                operation="MapOptions.from_preset",
                suggestion=f"Available: {sorted(PRESETS)}",
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> MapOptions:
        """Build options from a plain dict, ignoring unknown keys.

        A ``schema`` value given as a dict is converted through
        ``TargetSchema.from_dict``.
        """
        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        if "schema" in kwargs and isinstance(kwargs["schema"], dict):
            kwargs["schema"] = TargetSchema.from_dict(kwargs["schema"])
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict; ``schema`` is included only when set."""
        out: dict[str, Any] = {
            "mapping": dict(self.mapping),
            "auto_infer": self.auto_infer,
            "fuzzy_threshold": self.fuzzy_threshold,
            "unmapped": self.unmapped,
            "coerce_types": self.coerce_types,
            "reorder_to_schema": self.reorder_to_schema,
            "enforce_required": self.enforce_required,
        }
        if self.schema is not None:
            out["schema"] = self.schema.to_dict()
        return out

    def to_file(self, path: str | Path) -> Path:
        """Write the options as JSON to *path* and return the path written."""
        out = Path(path)
        # JSON is UTF-8 by specification; never depend on the platform
        # default encoding.
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> MapOptions:
        """Load options from the JSON file at *path* (read as UTF-8)."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))

    def validate(self) -> None:
        """Check option values.

        Raises:
            ConfigError: ``fuzzy_threshold`` is not a number in
                [0.0, 1.0]. ``unmapped`` is checked by ``ensure_choice``.
        """
        ensure_choice(
            self.unmapped, name="unmapped",
            choices=("keep", "drop", "error"),
            function="MapOptions.validate",
        )
        # A non-numeric threshold (e.g. the string "0.6" from a
        # hand-edited config) must surface as a config error rather
        # than a bare TypeError from the comparison.
        try:
            in_range = bool(0.0 <= self.fuzzy_threshold <= 1.0)
        except TypeError:
            in_range = False
        if not in_range:
            raise ConfigError(
                f"fuzzy_threshold must be in [0.0, 1.0], got {self.fuzzy_threshold!r}",
                operation="MapOptions.validate",
            )
|
||
|
||
|
||
@dataclass
class MapResult:
    """Everything ``map_columns`` reports back to its caller."""

    # The transformed frame.
    mapped_df: pd.DataFrame
    # Effective source → target pairs that were applied.
    mapping: dict[str, str]
    # Subset of ``mapping`` that was auto-inferred.
    inferred_pairs: dict[str, str]
    # Number of mapping pairs whose source and target names differ.
    columns_renamed: int
    # Source columns removed from the output.
    columns_dropped: list[str]
    # Schema fields added to the output and filled with a default value.
    columns_added: list[str]
    # column → number of rows that failed coercion.
    coercion_failures: dict[str, int]
    # Unmapped source columns kept in the output.
    unmapped_kept: list[str]
    # Required schema fields with neither a source column nor a default.
    missing_required_targets: list[str]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main entry point
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def map_columns(
    df: pd.DataFrame,
    options: Optional[MapOptions] = None,
) -> MapResult:
    """Apply *options* to *df* and return a :class:`MapResult`.

    Pipeline placement (recommended, not enforced)
    ----------------------------------------------
    Two natural slots:
    * **Early** — header alignment for multi-vendor unification.
      Each vendor uses different column names; rename to a canonical
      schema before any other tool runs.
    * **Late** — schema enforcement for output. After cleaning, coerce
      types and project to the target shape (CRM import contract,
      database schema). Run after format / missing so the coerced
      data is canonical first.
    The pipeline runner does not enforce a position; place by use case.

    Pipeline:
        1. Compose mapping (explicit ``options.mapping`` ∪ inferred
           pairs from ``options.schema``).
        2. Reject duplicate target names — two source columns mapped to
           the same target is a user error, not a silent overwrite.
        3. Decide what to do with unmapped source columns
           (``keep`` / ``drop`` / ``error``), and reject a rename whose
           target collides with a kept unmapped column.
        4. Rename, then handle missing required targets, then coerce
           types, then reorder.

    The input frame is not modified; work happens on a copy.

    Raises:
        InputValidationError: the mapping names a column that is not in
            *df*; several sources share one target; a target collides
            with a kept unmapped column; unmapped columns exist and
            ``unmapped="error"``; or a required target is missing while
            ``enforce_required`` is True.
    """
    ensure_dataframe(df, function="map_columns")
    options = options or MapOptions()
    options.validate()

    # ------------------------------------------------------------------
    # 1. Compose the effective mapping
    # ------------------------------------------------------------------
    explicit = dict(options.mapping)
    inferred: dict[str, str] = {}
    if options.schema is not None and options.auto_infer:
        all_inferred = infer_mapping(df, options.schema, threshold=options.fuzzy_threshold)
        # Explicit user pairings always win.
        used_targets = set(explicit.values())
        for src, tgt in all_inferred.items():
            if src in explicit or tgt in used_targets:
                continue
            inferred[src] = tgt
            used_targets.add(tgt)

    mapping: dict[str, str] = {**inferred, **explicit}

    # ------------------------------------------------------------------
    # 2. Validate mapping coherence
    # ------------------------------------------------------------------
    unknown_sources = [s for s in mapping if s not in df.columns]
    if unknown_sources:
        raise InputValidationError(
            f"Mapping references columns not in input: {unknown_sources}",
            operation="map_columns",
            suggestion=f"Available source columns: {list(df.columns)}",
        )
    target_counts: dict[str, int] = {}
    for tgt in mapping.values():
        target_counts[tgt] = target_counts.get(tgt, 0) + 1
    duplicates = [t for t, n in target_counts.items() if n > 1]
    if duplicates:
        raise InputValidationError(
            f"Multiple source columns mapped to the same target(s): {duplicates}",
            operation="map_columns",
            suggestion="Each target name must be unique. Drop or rename the conflicting source columns.",
        )

    # ------------------------------------------------------------------
    # 3. Handle unmapped source columns
    # ------------------------------------------------------------------
    unmapped_sources = [c for c in df.columns if c not in mapping]
    unmapped_kept: list[str] = []
    columns_dropped: list[str] = []
    if unmapped_sources:
        if options.unmapped == "drop":
            columns_dropped = list(unmapped_sources)
        elif options.unmapped == "error":
            raise InputValidationError(
                f"Source columns have no mapping and unmapped='error': {unmapped_sources}",
                operation="map_columns",
                suggestion=(
                    "Either add explicit mapping entries, set unmapped='keep' / 'drop', "
                    "or include the columns in the target schema."
                ),
            )
        else:
            unmapped_kept = list(unmapped_sources)

    # A target equal to a column that survives un-renamed (e.g.
    # "First Name" → "first_name" while a "first_name" column is kept)
    # would give the output two columns with one label; coercion and
    # reordering cannot address such a column unambiguously.
    kept_labels = set(unmapped_kept)
    collisions = [tgt for tgt in mapping.values() if tgt in kept_labels]
    if collisions:
        raise InputValidationError(
            f"Mapping target(s) collide with unmapped source columns that are kept: {collisions}",
            operation="map_columns",
            suggestion=(
                "Map or drop the existing column(s) of that name, "
                "or set unmapped='drop'."
            ),
        )

    # ------------------------------------------------------------------
    # 4. Apply rename and drop
    # ------------------------------------------------------------------
    out = df.copy()
    if columns_dropped:
        out = out.drop(columns=columns_dropped)
    if mapping:
        out = out.rename(columns=mapping)
    # Identity pairs (source already carries the target name) are not renames.
    columns_renamed = sum(1 for src, tgt in mapping.items() if src != tgt)

    # ------------------------------------------------------------------
    # 5. Handle the schema's required + default fields
    # ------------------------------------------------------------------
    columns_added: list[str] = []
    missing_required: list[str] = []
    if options.schema is not None:
        present = set(out.columns)
        for tf in options.schema.fields:
            if tf.name in present:
                continue
            if tf.required and tf.default is None:
                # Reported (and raised below when enforced); never
                # fabricated as an all-NA column.
                missing_required.append(tf.name)
                continue
            # Add with default value (NaN if no default).
            out[tf.name] = tf.default if tf.default is not None else pd.NA
            columns_added.append(tf.name)

    if missing_required and options.enforce_required:
        raise InputValidationError(
            f"Required target field(s) missing from input: {missing_required}",
            operation="map_columns",
            suggestion=(
                "Either add explicit mapping entries, lower fuzzy_threshold, "
                "supply a default in the schema, or set enforce_required=False."
            ),
        )

    # ------------------------------------------------------------------
    # 6. Coerce types per the schema
    # ------------------------------------------------------------------
    coercion_failures: dict[str, int] = {}
    if options.coerce_types and options.schema is not None:
        for tf in options.schema.fields:
            if tf.name not in out.columns or tf.dtype == "auto":
                continue
            try:
                series, fails = coerce_series(out[tf.name], tf.dtype)
            except (ValueError, TypeError) as e:
                # Best effort: a column that cannot be converted at all
                # keeps its original dtype and the problem is logged.
                logger.warning(
                    "map_columns: coerce of {!r} → {} failed: {}",
                    tf.name, tf.dtype, e,
                )
                continue
            out[tf.name] = series
            if fails:
                coercion_failures[tf.name] = fails

    # ------------------------------------------------------------------
    # 7. Reorder
    # ------------------------------------------------------------------
    if options.reorder_to_schema and options.schema is not None:
        ordered = [f.name for f in options.schema.fields if f.name in out.columns]
        # Append survivors (kept-unmapped originals) in their pre-rename order.
        survivors = [c for c in out.columns if c not in ordered]
        out = out.loc[:, ordered + survivors]

    return MapResult(
        mapped_df=out,
        mapping=mapping,
        inferred_pairs=inferred,
        columns_renamed=columns_renamed,
        columns_dropped=columns_dropped,
        columns_added=columns_added,
        coercion_failures=coercion_failures,
        unmapped_kept=unmapped_kept,
        missing_required_targets=missing_required,
    )
|