Files
datatools-dev/tests/test_column_mapper_corpus.py
Michael 966af8ef94 feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00

241 lines
8.9 KiB
Python

"""Acceptance corpus for the Column Mapper.
Loads every fixture in ``test-cases/column-mapper-corpus/test_data/``
and asserts the documented behaviour against the documented schema.
"""
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
import pytest
from src.core.errors import InputValidationError
from src.core.column_mapper import (
MapOptions,
TargetField,
TargetSchema,
map_columns,
)
# Corpus root: the "test-cases/column-mapper-corpus" directory under the
# grandparent directory of this test module.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "column-mapper-corpus"
)
# Directory holding the CSV fixtures.
TEST_DATA = CORPUS.joinpath("test_data")
# Directory holding the target-schema files.
SCHEMAS = CORPUS.joinpath("schemas")
def _read(name: str) -> pd.DataFrame:
    """Load the CSV fixture called *name* from the ``TEST_DATA`` directory."""
    fixture_path = TEST_DATA / name
    return pd.read_csv(fixture_path)
def _schema(name: str) -> TargetSchema:
    """Build a ``TargetSchema`` from the file called *name* in ``SCHEMAS``."""
    schema_path = SCHEMAS / name
    return TargetSchema.from_file(schema_path)
# ---------------------------------------------------------------------------
# UC01 — CRM import
# ---------------------------------------------------------------------------
class TestUC01CrmImport:
    """UC01 checks: the CRM import fixture run through the ``strict-schema`` preset."""

    @staticmethod
    def _map_uc01():
        """Map the UC01 fixture against its target schema.

        Returns a ``(schema, result)`` pair so each test can inspect both.
        """
        source = _read("uc01_crm_import.csv")
        target = _schema("uc01_crm_target.json")
        options = MapOptions.from_preset("strict-schema")
        options.schema = target
        return target, map_columns(source, options)

    def test_strict_schema_round_trip(self):
        """Required targets, the defaulted column, extras and column order."""
        target, result = self._map_uc01()

        # Every required target is present after the run.
        for field in target.fields:
            if not field.required:
                continue
            assert field.name in result.mapped_df.columns

        # 'owner' default added.
        assert "owner" in result.columns_added
        owner_is_default = result.mapped_df["owner"] == "unassigned"
        assert owner_is_default.all()

        # No unmapped survivors (strict preset drops extras).
        assert result.unmapped_kept == []

        # Reordered to schema order.
        schema_order = [field.name for field in target.fields]
        leading_columns = list(result.mapped_df.columns)[: len(schema_order)]
        assert leading_columns == schema_order

    def test_types_coerced_from_strings(self):
        """Typed targets come out with the expected pandas dtypes."""
        _, result = self._map_uc01()
        mapped = result.mapped_df

        # annual_rev → integer (was numeric strings in the source).
        assert pd.api.types.is_integer_dtype(mapped["annual_rev"])
        # created_date → datetime64.
        assert pd.api.types.is_datetime64_any_dtype(mapped["created_date"])
# ---------------------------------------------------------------------------
# UC02 — Multi-vendor unification
# ---------------------------------------------------------------------------
class TestUC02MultiVendor:
    """UC02 checks: three vendor exports mapped onto one canonical schema."""

    @pytest.mark.parametrize("vendor", ["a", "b", "c"])
    def test_each_vendor_normalises_to_canonical(self, vendor):
        """Each vendor file yields every required canonical field."""
        df = _read(f"uc02_vendor_{vendor}.csv")
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("lenient-schema")
        opts.schema = schema
        opts.fuzzy_threshold = 0.5  # vendor C uses obscure aliases ("FName", "Tel")
        res = map_columns(df, opts)

        # Every required canonical field landed in the output.
        for f in schema.fields:
            if f.required:
                assert f.name in res.mapped_df.columns, (
                    f"vendor {vendor}: missing {f.name}; mapping={res.mapping}"
                )

    def test_concatenated_vendors_share_schema(self):
        """Mapped vendor frames stack into one frame in schema column order."""
        # The point of unification: after each vendor goes through the
        # mapper, the resulting frames stack cleanly.
        schema = _schema("uc02_canonical.json")
        opts = MapOptions.from_preset("strict-schema")
        opts.schema = schema
        opts.fuzzy_threshold = 0.5

        # Keep the source frames so the row-count check below compares against
        # the real inputs rather than against the mapper's own outputs.
        sources = [_read(f"uc02_vendor_{v}.csv") for v in ("a", "b", "c")]
        frames = [map_columns(src, opts).mapped_df for src in sources]
        unified = pd.concat(frames, ignore_index=True)

        assert list(unified.columns) == [f.name for f in schema.fields]
        # Total rows = sum of inputs. Summing the mapped frames instead would
        # always hold for a concat, so it could never catch lost or added rows.
        assert len(unified) == sum(len(src) for src in sources)
# ---------------------------------------------------------------------------
# UC03 — Type coercion
# ---------------------------------------------------------------------------
class TestUC03TypeCoercion:
    """UC03 checks: type coercion of the fixture under the ``lenient-schema`` preset."""

    @staticmethod
    def _map_uc03():
        """Map the UC03 fixture against its typed schema and return the result."""
        source = _read("uc03_type_coercion.csv")
        typed_schema = _schema("uc03_types.json")
        options = MapOptions.from_preset("lenient-schema")
        options.schema = typed_schema
        return map_columns(source, options)

    def test_documented_failures_are_reported(self):
        """One coercion failure is recorded for each of the three bad columns."""
        result = self._map_uc03()

        # Bad rows survive as NaN, with counts recorded.
        for column in ("age", "score", "joined"):
            assert result.coercion_failures.get(column) == 1

    def test_coerced_dtypes(self):
        """Coerced columns carry the expected dtypes; the bad float is NaN."""
        mapped = self._map_uc03().mapped_df

        assert pd.api.types.is_integer_dtype(mapped["id"])
        assert mapped["active"].dtype.name == "boolean"
        assert pd.api.types.is_datetime64_any_dtype(mapped["joined"])

        # Float failures NaN-ify.
        second_score = mapped["score"].iloc[1]
        assert pd.isna(second_score)
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEC01DuplicateTarget:
    """EC01 check: two source columns mapped onto one target name."""

    def test_two_sources_to_same_target_raises(self):
        """An explicit mapping with a repeated target raises ``InputValidationError``."""
        source = _read("ec01_duplicate_target.csv")
        # Both "a" and "b" point at the same target "x".
        colliding = {"a": "x", "b": "x"}
        options = MapOptions(mapping=colliding)

        with pytest.raises(InputValidationError):
            map_columns(source, options)
class TestEC02UnicodeColumns:
    """EC02 check: explicit mapping of non-ASCII source headers."""

    def test_japanese_column_renamed(self):
        """Japanese headers are renamed to their targets; ``Email`` stays."""
        renames = {"名前": "name", "価格": "price"}
        result = map_columns(
            _read("ec02_unicode_columns.csv"),
            MapOptions(mapping=renames),
        )
        output_columns = result.mapped_df.columns

        for target in ("name", "price"):
            assert target in output_columns
        # Email passes through (unmapped, kept by default).
        assert "Email" in output_columns
class TestEC03WhitespaceHeaders:
    """EC03 check: auto-inference over the whitespace-headers fixture."""

    def test_header_whitespace_does_not_block_match(self):
        """All three aliased targets end up in the result mapping."""
        source = _read("ec03_whitespace_headers.csv")

        # Target name -> the single alias registered for it.
        alias_for = {
            "first_name": "First Name",
            "last_name": "Last Name",
            "email": "EmailAddr",
        }
        target_fields = [
            TargetField(name=target, aliases=[alias])
            for target, alias in alias_for.items()
        ]
        options = MapOptions(
            schema=TargetSchema(fields=target_fields),
            auto_infer=True,
        )
        result = map_columns(source, options)

        # All three columns should map despite the leading/trailing spaces.
        assert len(result.mapping) == 3
class TestEC04NoMatch:
    """EC04 checks: a fixture whose headers match none of the schema targets."""

    def test_zero_inferred_with_no_match(self):
        """With ``unmapped="keep"`` nothing is inferred and source columns remain."""
        source = _read("ec04_no_match.csv")
        targets = [TargetField(name=label) for label in ("email", "phone")]
        options = MapOptions(
            schema=TargetSchema(fields=targets),
            auto_infer=True,
            unmapped="keep",
        )
        result = map_columns(source, options)

        assert result.inferred_pairs == {}
        # Source columns survive as-is under keep.
        assert set(source.columns).issubset(result.mapped_df.columns)

    def test_no_match_with_unmapped_error(self):
        """With ``unmapped="error"`` the run raises ``InputValidationError``."""
        source = _read("ec04_no_match.csv")
        email_only = TargetSchema(fields=[TargetField(name="email")])
        options = MapOptions(
            schema=email_only,
            auto_infer=True,
            unmapped="error",
            enforce_required=False,
        )

        with pytest.raises(InputValidationError):
            map_columns(source, options)
class TestEC05RequiredMissing:
    """EC05 checks: required targets against the required-missing fixture."""

    @staticmethod
    def _required_schema():
        """Return a schema whose ``first_name`` and ``email`` fields are required."""
        required_fields = [
            TargetField(name=label, required=True)
            for label in ("first_name", "email")
        ]
        return TargetSchema(fields=required_fields)

    def test_required_missing_raises(self):
        """With ``enforce_required=True`` the run raises ``InputValidationError``."""
        source = _read("ec05_required_missing.csv")
        options = MapOptions(
            schema=self._required_schema(),
            auto_infer=True,
            enforce_required=True,
        )

        with pytest.raises(InputValidationError):
            map_columns(source, options)

    def test_disable_enforce_surfaces_in_result(self):
        """With ``enforce_required=False`` the gap is listed on the result."""
        source = _read("ec05_required_missing.csv")
        options = MapOptions(
            schema=self._required_schema(),
            auto_infer=True,
            enforce_required=False,
        )
        result = map_columns(source, options)

        assert "email" in result.missing_required_targets
# ---------------------------------------------------------------------------
# Whole-corpus property tests
# ---------------------------------------------------------------------------
# Names of every CSV fixture in the corpus, sorted so parametrised test ids
# come out in a stable order.
ALL_FIXTURES = sorted(csv_path.name for csv_path in TEST_DATA.glob("*.csv"))


@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_map_columns_does_not_mutate_input(fixture):
    """``map_columns`` with default options leaves the caller's frame unchanged."""
    original = _read(fixture)
    before = original.copy(deep=True)

    try:
        # Identity run; default options.
        map_columns(original, MapOptions())
    except InputValidationError:
        # A rejected fixture is fine (the original note names ec01 / ec05);
        # only mutation of the input matters for this test.
        pass

    pd.testing.assert_frame_equal(original, before)