feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler  src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper          src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner        src/core/pipeline.py + cli_pipeline.py + GUI
     with soft tool-dependency graph (recommended, not enforced) and
     JSON save/load for repeatable weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 /
  lying-BOM) via tied-confidence arbiter + Cyrillic / EE-Latin coverage
  probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS,
    deploy.py URL-substitution script, auto-generated robots.txt +
    sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md,
    NEXT-STEPS.md — full strategy + measurement + deployment + master
    checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0 xfailed

Tier-1 corpora added:
  • missing-corpus 3 use cases + 16 edge cases
  • column-mapper-corpus 3 use cases + 5 edge cases
  • format-cleaner intl 20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -253,16 +253,20 @@ class TestEncodingOverride:


 class TestEncodingDecodeFailedFromRepair:
-    def test_decode_replaced_action_surfaces_error_finding(self, tmp_path):
-        # Create a file with a UTF-8 BOM but cp1252 body bytes — utf-8-sig
-        # fails on byte 0x80 (€ in cp1252).
+    def test_lying_bom_recovered_and_flagged(self, tmp_path):
+        # File has a UTF-8 BOM but the body bytes are cp1252 (0x80 = € in
+        # cp1252; not a valid UTF-8 continuation byte). Detector should
+        # recover transparently to cp1252 and surface an
+        # ``encoding_lying_bom`` warn so the user knows.
        f = tmp_path / "lying_bom.csv"
        f.write_bytes(b"\xef\xbb\xbfid,name\n1,\x80100\n")
        findings = analyze(f)
        ids = {x.id for x in findings}
-        assert "encoding_decode_failed" in ids
-        bad = next(x for x in findings if x.id == "encoding_decode_failed")
-        assert bad.severity == "error"
+        assert "encoding_lying_bom" in ids
+        bad = next(x for x in findings if x.id == "encoding_lying_bom")
+        assert bad.severity == "warn"
+        # Decode should have succeeded — no replacement-character finding.
+        assert "encoding_decode_failed" not in ids


 class TestMixedLineEndings:
--- a/tests/test_column_mapper.py
+++ b/tests/test_column_mapper.py
@@ -0,0 +1,374 @@
+"""Tests for src/core/column_mapper.py."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from src.core.errors import ConfigError, InputValidationError
+from src.core.column_mapper import (
+    MapOptions,
+    PRESETS,
+    TargetField,
+    TargetSchema,
+    coerce_series,
+    infer_mapping,
+    map_columns,
+)
+
+
+# ---------------------------------------------------------------------------
+# infer_mapping — fuzzy matcher
+# ---------------------------------------------------------------------------
+
+class TestInferMapping:
+    def test_exact_normalized_match(self):
+        df = pd.DataFrame({"First Name": [], "Last Name": []})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name"), TargetField(name="last_name"),
+        ])
+        m = infer_mapping(df, schema)
+        assert m == {"First Name": "first_name", "Last Name": "last_name"}
+
+    def test_alias_match(self):
+        df = pd.DataFrame({"EmailAddr": []})
+        schema = TargetSchema(fields=[
+            TargetField(name="email", aliases=["EmailAddr", "email_address"]),
+        ])
+        m = infer_mapping(df, schema)
+        assert m == {"EmailAddr": "email"}
+
+    def test_below_threshold_excluded(self):
+        df = pd.DataFrame({"xyz": []})
+        schema = TargetSchema(fields=[TargetField(name="email")])
+        m = infer_mapping(df, schema, threshold=0.6)
+        assert m == {}
+
+    def test_target_matched_at_most_once(self):
+        df = pd.DataFrame({"first_name": [], "fname": []})
+        schema = TargetSchema(fields=[TargetField(name="first_name")])
+        m = infer_mapping(df, schema)
+        # Exact match wins; "fname" stays unmapped.
+        assert m == {"first_name": "first_name"}
+
+    def test_threshold_zero_matches_anything(self):
+        df = pd.DataFrame({"a": [], "b": []})
+        schema = TargetSchema(fields=[TargetField(name="z")])
+        m = infer_mapping(df, schema, threshold=0.0)
+        assert len(m) == 1
+
+
+# ---------------------------------------------------------------------------
+# coerce_series
+# ---------------------------------------------------------------------------
+
+class TestCoerceSeries:
+    def test_integer_clean(self):
+        s = pd.Series(["1", "2", "3"])
+        out, fails = coerce_series(s, "integer")
+        assert list(out) == [1, 2, 3]
+        assert fails == 0
+
+    def test_integer_with_failure(self):
+        s = pd.Series(["1", "bad", "3"])
+        out, fails = coerce_series(s, "integer")
+        assert fails == 1
+        assert pd.isna(out.iloc[1])
+
+    def test_float_with_thousands_sep(self):
+        # Plain floats; thousands-sep handling is for format standardizer.
+        s = pd.Series(["1.5", "2.0", "3.25"])
+        out, fails = coerce_series(s, "float")
+        assert fails == 0
+        assert out.iloc[2] == 3.25
+
+    def test_boolean_truthy_falsy(self):
+        s = pd.Series(["true", "false", "Yes", "no", "1", "0"])
+        out, fails = coerce_series(s, "boolean")
+        assert fails == 0
+        assert list(out) == [True, False, True, False, True, False]
+
+    def test_boolean_unknown_value_fails(self):
+        s = pd.Series(["true", "maybe"])
+        out, fails = coerce_series(s, "boolean")
+        assert fails == 1
+        assert pd.isna(out.iloc[1])
+
+    def test_date_iso_format(self):
+        s = pd.Series(["2025-01-15", "2025-02-20"])
+        out, fails = coerce_series(s, "date")
+        assert fails == 0
+        assert out.iloc[0].year == 2025
+
+    def test_date_failure(self):
+        s = pd.Series(["2025-01-15", "garbage"])
+        out, fails = coerce_series(s, "date")
+        assert fails == 1
+        assert pd.isna(out.iloc[1])
+
+    def test_string_passthrough(self):
+        s = pd.Series([1, 2, 3])
+        out, fails = coerce_series(s, "string")
+        assert fails == 0
+        assert out.dtype.name == "string"
+
+    def test_auto_returns_unchanged(self):
+        s = pd.Series([1, 2])
+        out, fails = coerce_series(s, "auto")
+        assert fails == 0
+        assert out is s
+
+    def test_unknown_dtype_raises(self):
+        with pytest.raises(InputValidationError):
+            coerce_series(pd.Series([1]), "bogus")  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# map_columns — explicit mapping
+# ---------------------------------------------------------------------------
+
+class TestMapColumnsExplicit:
+    def test_simple_rename(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        opts = MapOptions(mapping={"a": "alpha", "b": "beta"})
+        res = map_columns(df, opts)
+        assert list(res.mapped_df.columns) == ["alpha", "beta"]
+        assert res.columns_renamed == 2
+
+    def test_unknown_source_raises(self):
+        df = pd.DataFrame({"a": [1]})
+        opts = MapOptions(mapping={"missing": "x"})
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+    def test_duplicate_target_raises(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        opts = MapOptions(mapping={"a": "x", "b": "x"})
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+    def test_unmapped_keep(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        opts = MapOptions(mapping={"a": "alpha"}, unmapped="keep")
+        res = map_columns(df, opts)
+        assert "b" in res.mapped_df.columns
+        assert res.unmapped_kept == ["b"]
+
+    def test_unmapped_drop(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        opts = MapOptions(mapping={"a": "alpha"}, unmapped="drop")
+        res = map_columns(df, opts)
+        assert list(res.mapped_df.columns) == ["alpha"]
+        assert res.columns_dropped == ["b"]
+
+    def test_unmapped_error(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        opts = MapOptions(mapping={"a": "alpha"}, unmapped="error")
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+
+# ---------------------------------------------------------------------------
+# map_columns — schema + auto-inference
+# ---------------------------------------------------------------------------
+
+class TestMapColumnsWithSchema:
+    def test_auto_infer_renames(self):
+        df = pd.DataFrame({"First Name": ["A"], "Last Name": ["B"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name"), TargetField(name="last_name"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True)
+        res = map_columns(df, opts)
+        assert "first_name" in res.mapped_df.columns
+        assert "last_name" in res.mapped_df.columns
+        assert res.inferred_pairs == {"First Name": "first_name", "Last Name": "last_name"}
+
+    def test_explicit_overrides_inferred(self):
+        df = pd.DataFrame({"name": ["A"], "fname": ["B"]})
+        schema = TargetSchema(fields=[TargetField(name="first_name")])
+        opts = MapOptions(
+            schema=schema,
+            mapping={"fname": "first_name"},
+            auto_infer=True,
+        )
+        res = map_columns(df, opts)
+        assert res.mapping["fname"] == "first_name"
+        assert "name" not in res.mapping
+
+    def test_required_missing_raises(self):
+        df = pd.DataFrame({"first_name": ["A"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="email", required=True),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=True)
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+    def test_required_missing_with_default_added(self):
+        df = pd.DataFrame({"first_name": ["A"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="source", required=False, default="import"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=False)
+        res = map_columns(df, opts)
+        assert "source" in res.mapped_df.columns
+        assert res.mapped_df.iloc[0]["source"] == "import"
+        assert res.columns_added == ["source"]
+
+    def test_required_missing_disabled(self):
+        df = pd.DataFrame({"first_name": ["A"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="email", required=True),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=False, enforce_required=False)
+        res = map_columns(df, opts)
+        assert "email" in res.missing_required_targets
+
+    def test_reorder_to_schema(self):
+        df = pd.DataFrame({"z": [1], "a": [2], "m": [3]})
+        schema = TargetSchema(fields=[
+            TargetField(name="a"), TargetField(name="m"), TargetField(name="z"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True)
+        res = map_columns(df, opts)
+        assert list(res.mapped_df.columns) == ["a", "m", "z"]
+
+    def test_coerce_types(self):
+        df = pd.DataFrame({"age": ["30", "bad", "40"], "active": ["true", "no", "yes"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="age", dtype="integer"),
+            TargetField(name="active", dtype="boolean"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, coerce_types=True)
+        res = map_columns(df, opts)
+        assert res.mapped_df["age"].iloc[0] == 30
+        assert res.mapped_df["active"].iloc[0] is True or res.mapped_df["active"].iloc[0]
+        assert res.coercion_failures == {"age": 1}
+
+
+# ---------------------------------------------------------------------------
+# Presets
+# ---------------------------------------------------------------------------
+
+class TestPresets:
+    def test_strict_schema_drops_and_coerces_and_reorders(self):
+        df = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"], "extra": [1]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="email", required=True),
+        ])
+        opts = MapOptions.from_preset("strict-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+        assert list(res.mapped_df.columns) == ["first_name", "email"]
+        assert res.columns_dropped == ["extra"]
+
+    def test_lenient_keeps_extras(self):
+        df = pd.DataFrame({"First Name": ["A"], "extra": [1]})
+        schema = TargetSchema(fields=[TargetField(name="first_name")])
+        opts = MapOptions.from_preset("lenient-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+        assert "extra" in res.mapped_df.columns
+
+    def test_unknown_preset(self):
+        with pytest.raises(ConfigError):
+            MapOptions.from_preset("does-not-exist")
+
+
+# ---------------------------------------------------------------------------
+# Schema serialization
+# ---------------------------------------------------------------------------
+
+class TestSchemaIO:
+    def test_roundtrip_dict(self):
+        schema = TargetSchema(fields=[
+            TargetField(name="x", dtype="integer", required=True, aliases=["X", "X "]),
+            TargetField(name="y", default="z"),
+        ])
+        d = schema.to_dict()
+        loaded = TargetSchema.from_dict(d)
+        assert loaded.field_names() == ["x", "y"]
+        assert loaded.fields[0].required is True
+        assert loaded.fields[1].default == "z"
+
+    def test_from_dict_string_field(self):
+        # Allow shorthand: bare string defaults to dtype=auto.
+        loaded = TargetSchema.from_dict({"fields": ["a", "b"]})
+        assert loaded.field_names() == ["a", "b"]
+
+    def test_from_dict_unknown_dtype_raises(self):
+        with pytest.raises(ConfigError):
+            TargetSchema.from_dict({"fields": [{"name": "x", "dtype": "bogus"}]})
+
+    def test_from_dict_missing_name_raises(self):
+        with pytest.raises(ConfigError):
+            TargetSchema.from_dict({"fields": [{"dtype": "string"}]})
+
+    def test_options_roundtrip_to_file(self, tmp_path):
+        schema = TargetSchema(fields=[TargetField(name="x", dtype="string")])
+        opts = MapOptions(
+            schema=schema,
+            mapping={"a": "x"},
+            unmapped="drop",
+            coerce_types=True,
+            reorder_to_schema=True,
+        )
+        path = tmp_path / "cfg.json"
+        opts.to_file(path)
+        loaded = MapOptions.from_file(path)
+        assert loaded.mapping == {"a": "x"}
+        assert loaded.unmapped == "drop"
+        assert loaded.coerce_types is True
+        assert loaded.schema is not None
+        assert loaded.schema.field_names() == ["x"]
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+class TestValidation:
+    def test_invalid_unmapped_strategy(self):
+        opts = MapOptions(unmapped="bogus")  # type: ignore[arg-type]
+        with pytest.raises(InputValidationError):
+            opts.validate()
+
+    def test_threshold_out_of_range(self):
+        opts = MapOptions(fuzzy_threshold=1.5)
+        with pytest.raises(ConfigError):
+            opts.validate()
+
+    def test_non_dataframe_input(self):
+        with pytest.raises(InputValidationError):
+            map_columns([1, 2, 3])  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Idempotency
+# ---------------------------------------------------------------------------
+
+class TestIdempotency:
+    def test_double_apply_is_stable(self):
+        df = pd.DataFrame({"First Name": ["A"], "Email": ["a@x"]})
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name"),
+            TargetField(name="email"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, reorder_to_schema=True)
+        first = map_columns(df, opts)
+        second = map_columns(first.mapped_df, opts)
+        pd.testing.assert_frame_equal(second.mapped_df, first.mapped_df)
+
+    def test_input_not_mutated(self):
+        df = pd.DataFrame({"a": [1], "b": [2]})
+        snapshot = df.copy(deep=True)
+        map_columns(df, MapOptions(mapping={"a": "x"}))
+        pd.testing.assert_frame_equal(df, snapshot)
--- a/tests/test_column_mapper_corpus.py
+++ b/tests/test_column_mapper_corpus.py
@@ -0,0 +1,240 @@
+"""Acceptance corpus for the Column Mapper.
+
+Loads every fixture in ``test-cases/column-mapper-corpus/test_data/``
+and asserts the documented behaviour against the documented schema.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from src.core.errors import InputValidationError
+from src.core.column_mapper import (
+    MapOptions,
+    TargetField,
+    TargetSchema,
+    map_columns,
+)
+
+CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "column-mapper-corpus"
+TEST_DATA = CORPUS / "test_data"
+SCHEMAS = CORPUS / "schemas"
+
+
+def _read(name: str) -> pd.DataFrame:
+    return pd.read_csv(TEST_DATA / name)
+
+
+def _schema(name: str) -> TargetSchema:
+    return TargetSchema.from_file(SCHEMAS / name)
+
+
+# ---------------------------------------------------------------------------
+# UC01 — CRM import
+# ---------------------------------------------------------------------------
+
+class TestUC01CrmImport:
+    def test_strict_schema_round_trip(self):
+        df = _read("uc01_crm_import.csv")
+        schema = _schema("uc01_crm_target.json")
+        opts = MapOptions.from_preset("strict-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+
+        # Every required target is present after the run.
+        for f in schema.fields:
+            if f.required:
+                assert f.name in res.mapped_df.columns
+
+        # 'owner' default added.
+        assert "owner" in res.columns_added
+        assert (res.mapped_df["owner"] == "unassigned").all()
+
+        # No unmapped survivors (strict preset drops extras).
+        assert res.unmapped_kept == []
+
+        # Reordered to schema order.
+        expected_prefix = [f.name for f in schema.fields]
+        assert list(res.mapped_df.columns)[: len(expected_prefix)] == expected_prefix
+
+    def test_types_coerced_from_strings(self):
+        df = _read("uc01_crm_import.csv")
+        schema = _schema("uc01_crm_target.json")
+        opts = MapOptions.from_preset("strict-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+        # annual_rev → integer (was numeric strings in the source).
+        assert pd.api.types.is_integer_dtype(res.mapped_df["annual_rev"])
+        # created_date → datetime64.
+        assert pd.api.types.is_datetime64_any_dtype(res.mapped_df["created_date"])
+
+
+# ---------------------------------------------------------------------------
+# UC02 — Multi-vendor unification
+# ---------------------------------------------------------------------------
+
+class TestUC02MultiVendor:
+    @pytest.mark.parametrize("vendor", ["a", "b", "c"])
+    def test_each_vendor_normalises_to_canonical(self, vendor):
+        df = _read(f"uc02_vendor_{vendor}.csv")
+        schema = _schema("uc02_canonical.json")
+        opts = MapOptions.from_preset("lenient-schema")
+        opts.schema = schema
+        opts.fuzzy_threshold = 0.5  # vendor C uses obscure aliases ("FName", "Tel")
+        res = map_columns(df, opts)
+        # Every required canonical field landed in the output.
+        for f in schema.fields:
+            if f.required:
+                assert f.name in res.mapped_df.columns, (
+                    f"vendor {vendor}: missing {f.name}; mapping={res.mapping}"
+                )
+
+    def test_concatenated_vendors_share_schema(self):
+        # The point of unification: after each vendor goes through the
+        # mapper, the resulting frames stack cleanly.
+        schema = _schema("uc02_canonical.json")
+        opts = MapOptions.from_preset("strict-schema")
+        opts.schema = schema
+        opts.fuzzy_threshold = 0.5
+        frames = [
+            map_columns(_read(f"uc02_vendor_{v}.csv"), opts).mapped_df
+            for v in ("a", "b", "c")
+        ]
+        unified = pd.concat(frames, ignore_index=True)
+        assert list(unified.columns) == [f.name for f in schema.fields]
+        # Total rows = sum of inputs.
+        assert len(unified) == sum(len(f) for f in frames)
+
+
+# ---------------------------------------------------------------------------
+# UC03 — Type coercion
+# ---------------------------------------------------------------------------
+
+class TestUC03TypeCoercion:
+    def test_documented_failures_are_reported(self):
+        df = _read("uc03_type_coercion.csv")
+        schema = _schema("uc03_types.json")
+        opts = MapOptions.from_preset("lenient-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+        # Bad rows survive as NaN, with counts recorded.
+        assert res.coercion_failures.get("age") == 1
+        assert res.coercion_failures.get("score") == 1
+        assert res.coercion_failures.get("joined") == 1
+
+    def test_coerced_dtypes(self):
+        df = _read("uc03_type_coercion.csv")
+        schema = _schema("uc03_types.json")
+        opts = MapOptions.from_preset("lenient-schema")
+        opts.schema = schema
+        res = map_columns(df, opts)
+        out = res.mapped_df
+        assert pd.api.types.is_integer_dtype(out["id"])
+        assert out["active"].dtype.name == "boolean"
+        assert pd.api.types.is_datetime64_any_dtype(out["joined"])
+        # Float failures NaN-ify.
+        assert pd.isna(out["score"].iloc[1])
+
+
+# ---------------------------------------------------------------------------
+# Edge cases
+# ---------------------------------------------------------------------------
+
+class TestEC01DuplicateTarget:
+    def test_two_sources_to_same_target_raises(self):
+        df = _read("ec01_duplicate_target.csv")
+        opts = MapOptions(mapping={"a": "x", "b": "x"})
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+
+class TestEC02UnicodeColumns:
+    def test_japanese_column_renamed(self):
+        df = _read("ec02_unicode_columns.csv")
+        opts = MapOptions(mapping={"名前": "name", "価格": "price"})
+        res = map_columns(df, opts)
+        assert "name" in res.mapped_df.columns
+        assert "price" in res.mapped_df.columns
+        # Email passes through (unmapped, kept by default).
+        assert "Email" in res.mapped_df.columns
+
+
+class TestEC03WhitespaceHeaders:
+    def test_header_whitespace_does_not_block_match(self):
+        df = _read("ec03_whitespace_headers.csv")
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", aliases=["First Name"]),
+            TargetField(name="last_name", aliases=["Last Name"]),
+            TargetField(name="email", aliases=["EmailAddr"]),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True)
+        res = map_columns(df, opts)
+        # All three columns should map despite the leading/trailing spaces.
+        assert len(res.mapping) == 3
+
+
+class TestEC04NoMatch:
+    def test_zero_inferred_with_no_match(self):
+        df = _read("ec04_no_match.csv")
+        schema = TargetSchema(fields=[
+            TargetField(name="email"), TargetField(name="phone"),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, unmapped="keep")
+        res = map_columns(df, opts)
+        assert res.inferred_pairs == {}
+        # Source columns survive as-is under keep.
+        assert set(df.columns) <= set(res.mapped_df.columns)
+
+    def test_no_match_with_unmapped_error(self):
+        df = _read("ec04_no_match.csv")
+        schema = TargetSchema(fields=[TargetField(name="email")])
+        opts = MapOptions(
+            schema=schema, auto_infer=True, unmapped="error",
+            enforce_required=False,
+        )
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+
+class TestEC05RequiredMissing:
+    def test_required_missing_raises(self):
+        df = _read("ec05_required_missing.csv")
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="email", required=True),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, enforce_required=True)
+        with pytest.raises(InputValidationError):
+            map_columns(df, opts)
+
+    def test_disable_enforce_surfaces_in_result(self):
+        df = _read("ec05_required_missing.csv")
+        schema = TargetSchema(fields=[
+            TargetField(name="first_name", required=True),
+            TargetField(name="email", required=True),
+        ])
+        opts = MapOptions(schema=schema, auto_infer=True, enforce_required=False)
+        res = map_columns(df, opts)
+        assert "email" in res.missing_required_targets
+
+
+# ---------------------------------------------------------------------------
+# Whole-corpus property tests
+# ---------------------------------------------------------------------------
+
+ALL_FIXTURES = sorted(p.name for p in TEST_DATA.glob("*.csv"))
+
+
+@pytest.mark.parametrize("fixture", ALL_FIXTURES)
+def test_map_columns_does_not_mutate_input(fixture):
+    df = pd.read_csv(TEST_DATA / fixture)
+    snapshot = df.copy(deep=True)
+    try:
+        map_columns(df, MapOptions())  # identity run; default options.
+    except InputValidationError:
+        pass  # ec01 / ec05 raise here — fine, mutation is what we care about.
+    pd.testing.assert_frame_equal(df, snapshot)
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -169,8 +169,23 @@ class TestMojibake:
        assert actual.equals(expected), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
-        # --fix-mojibake is Tier 2; the cleaner does not implement it. Mark xfail.
-        pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
+        """Mojibake auto-repair (ftfy-backed) restores the original text.
+
+        Skipped automatically when ftfy is not installed — the engine
+        falls back to a no-op in that case and the diff would never close.
+        """
+        try:
+            import ftfy  # noqa: F401
+        except ImportError:
+            pytest.skip("ftfy not installed — install ftfy to enable mojibake repair")
+
+        from src.core.fixes import repair_mojibake
+
+        df = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
+        expected = _read_csv_strict(EXPECTED / "14_mojibake__fixed.csv")
+        repaired, _ = repair_mojibake(df)
+        actual = repaired.reset_index(drop=True)
+        assert actual.equals(expected), "14 mojibake fixed variant differs"


 class TestEmptyFile:
--- a/tests/test_encodings_corpus.py
+++ b/tests/test_encodings_corpus.py
@@ -14,12 +14,11 @@ What's tested
   REJECT / LOW_CONFIDENCE.
 3. The decoded DataFrame matches the canonical reference content.

-Cases where the current implementation is known to fail (charset-
-normalizer label drift on byte-equivalent encodings, ``repair_bytes``
-NUL-strip destroying UTF-16, the "lying BOM" pathological case) are
-marked ``xfail`` so they surface in the report as documented gaps.
-A future fix that makes the case pass will flip xfail to xpass and the
-test owner can drop the marker.
+Detection arbiter (cp1250→cp1252, mac_iceland→mac_roman, lying-BOM
+recovery) and a language-aware probe (Cyrillic / EE-Latin coverage)
+together close every documented gap; the ``KNOWN_*_FAILURES`` dicts
+below are kept empty as a tripwire — re-add an entry only when a real
+limitation surfaces.
 """

 from __future__ import annotations
@@ -41,27 +40,9 @@ REFERENCE_DIR = CORPUS / "reference"

 # Known failures the analyzer does not yet handle correctly. Each entry
 # has a one-line reason — drop the entry once a fix lands.
-KNOWN_DETECTION_FAILURES = {
-    "E03_western_basic_cp1252.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
-    "E04_western_basic_latin1.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
-    "E05_western_basic_latin9.csv": "charset-normalizer returns cp1250 for byte-equivalent content",
-    "E06_western_basic_macroman.csv": "returns mac_iceland (same family) instead of mac_roman",
-    "E11_western_extended_cp1252.csv": "charset-normalizer returns cp1250 for cp1252 content",
-    "E15_eastern_european_iso88592.csv": "charset-normalizer returns cp1258 for ISO-8859-2 content",
-    "E18_cyrillic_koi8r.csv": "charset-normalizer returns shift_jis_2004 for KOI8-R content",
-}
+KNOWN_DETECTION_FAILURES: dict[str, str] = {}

-KNOWN_DECODE_FAILURES = {
-    "E03_western_basic_cp1252.csv": "decoded as cp1250 — different mapping at 0xF1 (ñ vs ń)",
-    "E04_western_basic_latin1.csv": "decoded as cp1250 — different mapping at 0xF1",
-    "E05_western_basic_latin9.csv": "decoded as cp1250 — different mapping at 0xF1",
-    "E10_western_extended_utf8.csv": "byte-level smart-quote fold rewrites U+201C/U+201D to ASCII before parse",
-    "E11_western_extended_cp1252.csv": "wrong encoding + smart-quote fold",
-    "E12_western_extended_utf16le.csv": "byte-level smart-quote fold rewrites U+201C/U+201D before parse",
-    "E15_eastern_european_iso88592.csv": "wrong encoding (cp1258 != ISO-8859-2)",
-    "E18_cyrillic_koi8r.csv": "wrong encoding (shift_jis_2004 != KOI8-R)",
-    "E30_pathological_lying_bom.csv": "utf-8-sig fails on cp1252 body bytes; needs lying-BOM recovery",
-}
+KNOWN_DECODE_FAILURES: dict[str, str] = {}


 def _normalize_encoding(name: str) -> str:
@@ -164,7 +145,12 @@ def _decodable_entries():
    ],
 )
 def test_decoded_matches_reference(entry):
-    df, _, _ = _load_for_analysis(CORPUS / entry["filename"], sample_rows=1000)
+    # The reference files preserve smart quotes — disable byte-level
+    # smart-quote folding so this round-trip identity test isn't
+    # confounded by the analyzer's deliberate parser-safety fold.
+    df, _, _ = _load_for_analysis(
+        CORPUS / entry["filename"], sample_rows=1000, fold_quotes=False,
+    )
    ref_text = REFERENCES[entry["canonical_content_id"]]
    ref_rows = list(csv.reader(io.StringIO(ref_text)))
    if not ref_rows:
--- a/tests/test_fixes_unit.py
+++ b/tests/test_fixes_unit.py
@@ -230,8 +230,27 @@ class TestRepairMojibake:


 class TestRepairMojibakeNoFtfy:
-    @pytest.mark.skipif(_HAS_FTFY, reason="ftfy installed — exercises the no-op path")
-    def test_returns_input_unchanged_without_ftfy(self):
+    def test_returns_input_unchanged_without_ftfy(self, monkeypatch):
+        """Exercise the no-op path regardless of whether ftfy is installed.
+
+        ``repair_mojibake`` lazy-imports ftfy inside the function body, so
+        we hide ``ftfy`` from ``sys.modules`` and from import resolution
+        before calling. The function must then degrade to ``(df, 0)``
+        without raising.
+        """
+        import sys
+        import builtins
+
+        monkeypatch.delitem(sys.modules, "ftfy", raising=False)
+        real_import = builtins.__import__
+
+        def fake_import(name, *args, **kwargs):
+            if name == "ftfy" or name.startswith("ftfy."):
+                raise ImportError("ftfy hidden by test")
+            return real_import(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", fake_import)
+
        df = pd.DataFrame({"x": ["cafÃ©"]})
        out, changed = repair_mojibake(df)
        assert changed == 0
--- a/tests/test_format_intl_corpus.py
+++ b/tests/test_format_intl_corpus.py
@@ -0,0 +1,105 @@
+"""Acceptance corpus for international format standardization.
+
+Stresses the rework's three pillars on a single mixed-locale fixture:
+  * Per-row country column drives phone parsing.
+  * ``currency_decimal="auto"`` resolves comma-decimal locales.
+  * Streaming entry point handles the same content unchanged.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from src.core.format_standardize import (
+    FieldType,
+    StandardizeOptions,
+    standardize_dataframe,
+    standardize_file,
+)
+
+CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus" / "international"
+FIXTURE = CORPUS / "intl_phones_addresses.csv"
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pd.read_csv(FIXTURE, dtype=str, keep_default_na=False)
+
+
+@pytest.fixture(scope="module")
+def options():
+    return StandardizeOptions(
+        column_types={
+            "name": FieldType.NAME,
+            "phone": FieldType.PHONE,
+            "price": FieldType.CURRENCY,
+        },
+        phone_country_column="country",
+        currency_preserve_code=True,
+        currency_decimal="auto",
+    )
+
+
+class TestPhonesByRegion:
+    def test_every_row_lands_on_correct_e164_prefix(self, df, options):
+        # The per-row country column selects the region handed to
+        # phonenumbers.parse, so every output must carry that country's code.
+        res = standardize_dataframe(df, options)
+        out = res.standardized_df
+        # ISO-2 → expected E.164 country code prefix
+        prefix_for_country = {
+            "US": "+1", "GB": "+44", "RU": "+7", "ES": "+34",
+            "FR": "+33", "JP": "+81", "DE": "+49", "IT": "+39",
+            "CN": "+86", "IN": "+91", "EG": "+20", "AU": "+61",
+            "BR": "+55", "MX": "+52", "KR": "+82", "TR": "+90",
+            "IL": "+972", "PL": "+48", "DK": "+45", "SE": "+46",
+        }
+        # Gather (country, expected prefix, actual phone) for each miss.
+        bad: list[tuple[str, str, str]] = [
+            (row["country"], prefix_for_country[row["country"]], row["phone"])
+            for _, row in out.iterrows()
+            if not row["phone"].startswith(prefix_for_country[row["country"]])
+        ]
+        assert not bad, f"phone prefix mismatches: {bad}"
+
+
+class TestCurrencyByLocale:
+    def test_eu_decimal_comma_resolves_under_auto(self, df, options):
+        res = standardize_dataframe(df, options)
+        # Spain, France, Germany, Italy, Brazil, Sweden all use decimal
+        # comma. Verify a clean numeric result post-standardization.
+        comma_locales = ["ES", "FR", "DE", "IT", "BR", "SE"]
+        eu_idx = df.index[df["country"].isin(comma_locales)]
+        # An empty selection would let the loop below pass vacuously.
+        assert len(eu_idx) > 0, "fixture has no decimal-comma rows"
+        for i in eu_idx:
+            val = res.standardized_df.loc[i, "price"]
+            # Either ``CODE NNN.NN`` or bare ``NNN.NN``: no comma may survive.
+            assert "," not in val, (
+                f"row {i} ({df.loc[i, 'country']}): comma persisted in {val!r}"
+            )
+
+    def test_brl_real_prefix_recognised(self, df, options):
+        res = standardize_dataframe(df, options)
+        br_row = res.standardized_df[res.standardized_df["country"] == "BR"].iloc[0]
+        assert "BRL" in br_row["price"]
+
+
+class TestStreamingMatchesInMemory:
+    def test_same_output_via_streaming(self, tmp_path, df, options):
+        # Streaming the same fixture through standardize_file should
+        # yield the same typed-column values as the in-memory path.
+        in_mem = standardize_dataframe(df, options).standardized_df
+        out = tmp_path / "out.csv"
+        # Use a chunk size that splits the 20-row fixture mid-way.
+        res = standardize_file(FIXTURE, out, options, chunk_size=7)
+        assert res.rows_processed == len(df)
+        streamed = pd.read_csv(out, dtype=str, keep_default_na=False)
+        # Compare typed columns only — others pass through.
+        for col in options.column_types:
+            assert streamed[col].tolist() == in_mem[col].astype(str).tolist(), (
+                f"column {col} differs between in-memory and streaming"
+            )
--- a/tests/test_format_standardize_corpus.py
+++ b/tests/test_format_standardize_corpus.py
@@ -110,16 +110,16 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
    "FD13": PASSTHROUGH,
    "FD14": PASSTHROUGH,
    "FD15": PASSTHROUGH,
-    # excel serial → 2024-01-15 (xfail — not implemented)
+    # excel serial dates (numeric days since 1899-12-30)
    "FD22": "2024-01-15",
    "FD23": "2024-01-15",
-    # unix timestamp seconds / millis → 2024-01-15 (xfail)
+    # unix timestamps (seconds, milliseconds)
    "FD24": "2024-01-15",
    "FD25": "2024-01-15",
    # partial precision — corpus preserves it
    "FD26": "2024-01",
-    "FD27": "2024-01",       # xfail — text precision
-    "FD28": "2024-Q1",       # xfail — quarter
+    "FD27": "2024-01",       # text precision month
+    "FD28": "2024-Q1",       # quarter
    "FD29": "2024",
    # 2-digit year cutoff (per docs: 1969 wins over 2069)
    "FD30": "1969-01-15",
@@ -135,7 +135,7 @@ _DATE_EXPECTED_MDY: dict[str, object] = {
    "FD37": "2024-01-15",
    # garbage → pass through (corpus 0.3 boundary table)
    # FD38/39/40 → PASSTHROUGH default
-    # locale-specific month names (xfail — not shipped)
+    # locale-specific month names (en/fr/de via month_locales)
    "FD41": "2024-01-15",
    "FD42": "2024-01-15",
    # timezone — corpus 3.3 says fixed-offset only
--- a/tests/test_format_streaming.py
+++ b/tests/test_format_streaming.py
@@ -0,0 +1,301 @@
+"""Tests for the format-standardizer rework: cache, vectorized dispatch,
+per-row country, audit cap, and streaming entry point."""
+
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from src.core.format_standardize import (
+    FieldType,
+    StandardizeOptions,
+    StreamingStandardizeResult,
+    _normalize_region,
+    standardize_dataframe,
+    standardize_file,
+)
+
+
+# ---------------------------------------------------------------------------
+# Per-row country / region
+# ---------------------------------------------------------------------------
+
+class TestPerRowCountry:
+    def test_phone_uses_per_row_country(self):
+        df = pd.DataFrame({
+            "phone": ["020 7946 0958", "03-3210-7000", "(415) 555-1234"],
+            "country": ["GB", "JP", "US"],
+        })
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            phone_country_column="country",
+        )
+        res = standardize_dataframe(df, opts)
+        out = res.standardized_df["phone"].tolist()
+        assert out[0].startswith("+44")
+        assert out[1].startswith("+81")
+        assert out[2].startswith("+1")
+
+    def test_phone_country_full_name_resolved(self):
+        df = pd.DataFrame({
+            "phone": ["020 7946 0958"],
+            "country": ["United Kingdom"],
+        })
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            phone_country_column="country",
+        )
+        res = standardize_dataframe(df, opts)
+        assert res.standardized_df["phone"].iloc[0].startswith("+44")
+
+    def test_blank_country_falls_back_to_default(self):
+        df = pd.DataFrame({
+            "phone": ["(415) 555-1234"],
+            "country": [""],  # blank → use default region
+        })
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            phone_country_column="country",
+            phone_region="US",
+        )
+        res = standardize_dataframe(df, opts)
+        assert res.standardized_df["phone"].iloc[0] == "+14155551234"
+
+    def test_unknown_country_column_raises(self):
+        df = pd.DataFrame({"phone": ["x"]})
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            phone_country_column="missing_col",
+        )
+        from src.core.errors import InputValidationError
+        with pytest.raises(InputValidationError):
+            standardize_dataframe(df, opts)
+
+
+class TestNormalizeRegion:
+    def test_iso2_passthrough(self):
+        assert _normalize_region("US") == "US"
+        assert _normalize_region("us") == "US"
+        assert _normalize_region(" jp ") == "JP"
+
+    def test_iso3_mapped(self):
+        assert _normalize_region("USA") == "US"
+        assert _normalize_region("GBR") == "GB"
+        assert _normalize_region("JPN") == "JP"
+
+    def test_full_name(self):
+        assert _normalize_region("United States") == "US"
+        assert _normalize_region("Japan") == "JP"
+        assert _normalize_region("Brazil") == "BR"
+        assert _normalize_region("brasil") == "BR"
+        assert _normalize_region("España") == "ES"
+
+    def test_blank_or_unknown(self):
+        assert _normalize_region("") is None
+        assert _normalize_region("   ") is None
+        assert _normalize_region(None) is None
+        assert _normalize_region("xyz-no-such-country") is None
+
+
+# ---------------------------------------------------------------------------
+# Audit cap
+# ---------------------------------------------------------------------------
+
+class TestAuditCap:
+    def test_cap_truncates_change_rows(self):
+        df = pd.DataFrame({
+            "phone": [f"(415) 555-12{i:02d}" for i in range(50)],
+        })
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            audit_max_rows=5,
+        )
+        res = standardize_dataframe(df, opts)
+        # cells_changed counts everything; the audit table is capped.
+        assert res.cells_changed == 50
+        assert len(res.changes) == 5
+
+    def test_unbounded_audit(self):
+        df = pd.DataFrame({
+            "phone": [f"(415) 555-12{i:02d}" for i in range(20)],
+        })
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            audit_max_rows=None,
+        )
+        res = standardize_dataframe(df, opts)
+        assert len(res.changes) == 20
+
+
+# ---------------------------------------------------------------------------
+# Cache + vectorized dispatch (correctness)
+# ---------------------------------------------------------------------------
+
+class TestCacheCorrectness:
+    def test_repeated_phone_consistent(self):
+        # 1000 copies of the same phone should produce identical output.
+        df = pd.DataFrame({"phone": ["(415) 555-1234"] * 1000})
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            audit_max_rows=None,
+        )
+        res = standardize_dataframe(df, opts)
+        assert (res.standardized_df["phone"] == "+14155551234").all()
+        assert res.cells_changed == 1000
+
+    def test_cache_disabled_still_works(self):
+        df = pd.DataFrame({"phone": ["(415) 555-1234", "020 7946 0958"]})
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            cache_size=0,  # disabled
+        )
+        res = standardize_dataframe(df, opts)
+        assert res.standardized_df["phone"].iloc[0] == "+14155551234"
+
+
+# ---------------------------------------------------------------------------
+# Streaming standardize_file
+# ---------------------------------------------------------------------------
+
+class TestStandardizeFile:
+    def test_basic_streaming(self, tmp_path):
+        inp = tmp_path / "in.csv"
+        # Pin UTF-8 so the £/¥/€ bytes never depend on the locale default.
+        inp.write_text(
+            "phone,country,price\n"
+            "(415) 555-1234,US,$1500.00\n"
+            "020 7946 0958,GB,£99.99\n"
+            "03-3210-7000,JP,¥12000\n"
+            "+33 1 42 86 82 00,FR,€850.50\n",
+            encoding="utf-8",
+        )
+        out = tmp_path / "out.csv"
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE, "price": FieldType.CURRENCY},
+            phone_country_column="country",
+            currency_preserve_code=True,
+        )
+        res = standardize_file(inp, out, opts, chunk_size=2)
+        assert isinstance(res, StreamingStandardizeResult)
+        assert res.rows_processed == 4
+        assert res.chunks_processed == 2
+        assert out.exists()
+        out_df = pd.read_csv(out, dtype=str, keep_default_na=False)
+        for i, prefix in enumerate(("+1", "+44", "+81", "+33")):
+            assert out_df["phone"].iloc[i].startswith(prefix)
+
+    def test_audit_capped_across_chunks(self, tmp_path):
+        # 60 rows, audit cap 10, chunks of 20 → audit must stop at 10.
+        inp = tmp_path / "in.csv"
+        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(60)]
+        inp.write_text("".join(rows))
+        out = tmp_path / "out.csv"
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            audit_max_rows=10,
+        )
+        res = standardize_file(inp, out, opts, chunk_size=20)
+        # Audit file exists and has exactly 10 data rows + 1 header.
+        audit_lines = res.audit_path.read_text().splitlines()
+        assert len(audit_lines) - 1 == 10
+
+    def test_audit_row_indices_are_global(self, tmp_path):
+        # Audit row numbers must reflect absolute file position, not chunk-local.
+        inp = tmp_path / "in.csv"
+        rows = ["phone\n"] + [f"(415) 555-12{i:02d}\n" for i in range(30)]
+        inp.write_text("".join(rows))
+        out = tmp_path / "out.csv"
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            audit_max_rows=None,
+        )
+        res = standardize_file(inp, out, opts, chunk_size=10)
+        audit = pd.read_csv(res.audit_path)
+        # Rows should be 0..29, monotonically increasing.
+        assert audit["row"].tolist() == list(range(30))
+
+    def test_progress_callback_fires(self, tmp_path):
+        inp = tmp_path / "in.csv"
+        inp.write_text("phone\n" + "\n".join("(415) 555-1234" for _ in range(20)) + "\n")
+        out = tmp_path / "out.csv"
+        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
+        seen: list[tuple[int, int]] = []
+        def cb(rows, chunks):
+            seen.append((rows, chunks))
+        standardize_file(inp, out, opts, chunk_size=5, progress_callback=cb)
+        assert len(seen) == 4
+        assert seen[-1] == (20, 4)
+
+    def test_progress_callback_exception_does_not_abort(self, tmp_path):
+        inp = tmp_path / "in.csv"
+        inp.write_text("phone\n(415) 555-1234\n")
+        out = tmp_path / "out.csv"
+        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
+        def bad_cb(*a, **k):
+            raise RuntimeError("boom")
+        # Must not raise.
+        res = standardize_file(inp, out, opts, chunk_size=1, progress_callback=bad_cb)
+        assert res.rows_processed == 1
+
+    def test_missing_input_raises_clean_error(self, tmp_path):
+        from src.core.errors import FileAccessError
+        opts = StandardizeOptions(column_types={"phone": FieldType.PHONE})
+        with pytest.raises(FileAccessError):
+            standardize_file(
+                tmp_path / "missing.csv",
+                tmp_path / "out.csv",
+                opts,
+            )
+
+
+# ---------------------------------------------------------------------------
+# International coverage smoke
+# ---------------------------------------------------------------------------
+
+class TestInternationalCoverage:
+    @pytest.mark.parametrize("number,country,prefix", [
+        ("020 7946 0958", "GB", "+44"),
+        ("03-3210-7000", "JP", "+81"),
+        ("+49 30 12345678", "DE", "+49"),
+        ("01 42 86 82 00", "FR", "+33"),
+        ("+39 06 6982", "IT", "+39"),
+        ("+34 91 411 1111", "ES", "+34"),
+        ("+86 10 1234 5678", "CN", "+86"),
+        ("+91 11 2345 6789", "IN", "+91"),
+        ("+61 2 9374 4000", "AU", "+61"),
+        ("11 3071 0000", "BR", "+55"),
+        ("+52 55 5555 0000", "MX", "+52"),
+        ("+82 2 2287 0114", "KR", "+82"),
+    ])
+    def test_phone_via_per_row_region(self, number, country, prefix):
+        df = pd.DataFrame({"phone": [number], "country": [country]})
+        opts = StandardizeOptions(
+            column_types={"phone": FieldType.PHONE},
+            phone_country_column="country",
+        )
+        res = standardize_dataframe(df, opts)
+        out = res.standardized_df["phone"].iloc[0]
+        assert out.startswith(prefix), (
+            f"{number!r} ({country}): expected to start with {prefix}, got {out!r}"
+        )
+
+    @pytest.mark.parametrize("price,want_code", [
+        ("$1,500.00", "USD"),
+        ("€850,50", "EUR"),
+        ("£99.99", "GBP"),
+        ("¥12000", "JPY"),
+        ("R$ 250,00", "BRL"),
+        ("CHF 1200.00", "CHF"),
+    ])
+    def test_currency_codes_detected(self, price, want_code):
+        df = pd.DataFrame({"price": [price]})
+        opts = StandardizeOptions(
+            column_types={"price": FieldType.CURRENCY},
+            currency_preserve_code=True,
+            currency_decimal="auto",  # international mode
+        )
+        res = standardize_dataframe(df, opts)
+        assert want_code in res.standardized_df["price"].iloc[0]
--- a/tests/test_gap_coverage.py
+++ b/tests/test_gap_coverage.py
@@ -8,10 +8,8 @@ These cover edges that existing suites missed:
 - ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
 - ``findings_by_tool`` on an empty list.
 - BOM that appears mid-cell rather than at file start.
-
-The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
-§4.17) is *not yet implemented* and is captured here as a known-gap xfail
-so it's surfaced rather than silently missing.
+- The collapse-whitespace heuristic for numeric/date/phone-shaped cells
+  (spec §4.17), now wired in via ``_smart_collapse_whitespace``.
 """

 from __future__ import annotations
--- a/tests/test_missing.py
+++ b/tests/test_missing.py
@@ -0,0 +1,462 @@
+"""Tests for src/core/missing.py."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from src.core.errors import ConfigError, InputValidationError
+from src.core.missing import (
+    DEFAULT_SENTINELS,
+    MissingOptions,
+    PRESETS,
+    detect_sentinels,
+    handle_missing,
+    is_missing_like,
+    profile_missing,
+)
+
+
+# ---------------------------------------------------------------------------
+# is_missing_like
+# ---------------------------------------------------------------------------
+
+class TestIsMissingLike:
+    def test_none(self):
+        assert is_missing_like(None)
+
+    def test_nan(self):
+        assert is_missing_like(np.nan)
+
+    def test_pd_nat(self):
+        assert is_missing_like(pd.NaT)
+
+    def test_empty_string(self):
+        assert is_missing_like("")
+
+    def test_whitespace_only(self):
+        assert is_missing_like("   ")
+        assert is_missing_like("\t\n  ")
+
+    def test_default_sentinels(self):
+        for s in ("N/A", "n/a", "NULL", "null", "-", "--", "?", "TBD", "(blank)"):
+            assert is_missing_like(s), f"expected {s!r} to be missing-like"
+
+    def test_case_insensitive(self):
+        assert is_missing_like("N/A")
+        assert is_missing_like("n/A")
+        assert is_missing_like("NA")
+        assert is_missing_like("na")
+
+    def test_real_value_not_missing(self):
+        assert not is_missing_like("hello")
+        assert not is_missing_like("0")
+        assert not is_missing_like(0)
+        assert not is_missing_like(0.0)
+
+    def test_zero_is_not_missing(self):
+        # Common bug: treating 0 / "0" / False as missing.
+        assert not is_missing_like(0)
+        assert not is_missing_like(False)
+
+    def test_custom_sentinels_override(self):
+        assert is_missing_like("xx", sentinels=["xx"])
+        assert not is_missing_like("xx", sentinels=["zz"])
+
+
+# ---------------------------------------------------------------------------
+# detect_sentinels
+# ---------------------------------------------------------------------------
+
+class TestDetectSentinels:
+    def test_counts_by_label(self):
+        s = pd.Series(["alice", "N/A", "n/a", "NULL", "  ", "", "bob"])
+        counts = detect_sentinels(s)
+        # "n/a" matches both 'N/A' and 'n/a' under casefold; the canonical
+        # label that wins is whichever is in the DEFAULT_SENTINELS list.
+        assert sum(v for k, v in counts.items() if k != "(whitespace)") == 3
+        assert counts["(whitespace)"] == 2
+
+    def test_skips_real_nan(self):
+        s = pd.Series(["a", np.nan, "N/A"])
+        counts = detect_sentinels(s)
+        assert sum(counts.values()) == 1
+
+    def test_no_sentinels_returns_empty(self):
+        s = pd.Series(["alice", "bob", "charlie"])
+        assert detect_sentinels(s) == {}
+
+
+# ---------------------------------------------------------------------------
+# profile_missing
+# ---------------------------------------------------------------------------
+
+class TestProfileMissing:
+    def test_basic(self):
+        df = pd.DataFrame({
+            "name": ["Alice", "Bob", "N/A", "", "Charlie"],
+            "age":  [30, None, 25, 40, np.nan],
+        })
+        prof = profile_missing(df, MissingOptions())
+        assert prof.rows_total == 5
+        # name: '' + 'N/A' = 2 sentinels; age: 2 NaN
+        report_by_col = {r.column: r for r in prof.columns}
+        assert report_by_col["name"].missing == 2
+        assert report_by_col["age"].missing == 2
+        assert prof.cells_missing == 4
+
+    def test_complete_dataframe(self):
+        df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
+        prof = profile_missing(df, MissingOptions())
+        assert prof.cells_missing == 0
+        assert prof.rows_complete == 3
+        assert prof.rows_with_any_missing == 0
+
+    def test_to_dataframe_columns(self):
+        df = pd.DataFrame({"x": [1, None]})
+        prof = profile_missing(df, MissingOptions())
+        out = prof.to_dataframe()
+        assert set(out.columns) >= {"column", "missing", "missing_pct", "top_sentinel"}
+
+    def test_disabled_sentinels_only_counts_real_nan(self):
+        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
+        opts = MissingOptions(standardize_sentinels=False)
+        prof = profile_missing(df, opts)
+        report_by_col = {r.column: r for r in prof.columns}
+        # Only the real NaN counts; 'N/A' is left alone.
+        assert report_by_col["x"].missing == 1
+
+
+# ---------------------------------------------------------------------------
+# handle_missing — sentinel standardization
+# ---------------------------------------------------------------------------
+
+class TestSentinelStandardization:
+    def test_replaces_sentinels_with_nan(self):
+        df = pd.DataFrame({"x": ["alice", "N/A", "-", "  ", "bob"]})
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # 'N/A' + '-' + whitespace-only = 3
+        assert res.sentinels_standardized == 3
+        assert res.handled_df["x"].isna().sum() == 3
+        assert res.handled_df.iloc[0]["x"] == "alice"
+        assert res.handled_df.iloc[4]["x"] == "bob"
+
+    def test_audit_records_each_replacement(self):
+        df = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        assert len(res.changes) == 1
+        assert res.changes.iloc[0]["action"].startswith("standardize:")
+
+    def test_disabled_keeps_sentinels(self):
+        df = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
+        opts = MissingOptions(standardize_sentinels=False, strategy="none")
+        res = handle_missing(df, opts)
+        assert res.sentinels_standardized == 0
+        assert res.handled_df.iloc[1]["x"] == "N/A"
+
+    def test_custom_sentinels_extend_default(self):
+        df = pd.DataFrame({"x": ["alice", "MISSING_DATA", "bob"]})
+        opts = MissingOptions(
+            sentinels=[*DEFAULT_SENTINELS, "MISSING_DATA"],
+            strategy="none",
+        )
+        res = handle_missing(df, opts)
+        assert res.sentinels_standardized == 1
+
+
+# ---------------------------------------------------------------------------
+# handle_missing — fill strategies
+# ---------------------------------------------------------------------------
+
+class TestFillStrategies:
+    @pytest.fixture
+    def numeric_df(self):
+        return pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0, np.nan]})
+
+    def test_mean(self, numeric_df):
+        res = handle_missing(numeric_df, MissingOptions(strategy="mean"))
+        # mean of [1, 2, 4] = 7/3
+        filled = res.handled_df["x"].iloc[2]
+        assert abs(filled - 7.0 / 3.0) < 1e-9
+        assert res.cells_filled == 2
+
+    def test_median(self, numeric_df):
+        res = handle_missing(numeric_df, MissingOptions(strategy="median"))
+        # median of [1, 2, 4] = 2.0
+        assert res.handled_df["x"].iloc[2] == 2.0
+
+    def test_mode(self):
+        df = pd.DataFrame({"x": ["a", "a", "b", None, None]})
+        res = handle_missing(df, MissingOptions(strategy="mode"))
+        assert res.handled_df["x"].iloc[3] == "a"
+        assert res.handled_df["x"].iloc[4] == "a"
+        assert res.cells_filled == 2
+
+    def test_constant_scalar(self, numeric_df):
+        res = handle_missing(
+            numeric_df,
+            MissingOptions(strategy="constant", fill_value=99.0),
+        )
+        assert res.handled_df["x"].iloc[2] == 99.0
+        assert res.handled_df["x"].iloc[4] == 99.0
+
+    def test_constant_per_column(self):
+        df = pd.DataFrame({"a": [1, np.nan], "b": ["x", None]})
+        opts = MissingOptions(
+            strategy="constant",
+            column_fill_values={"a": 0, "b": "?"},
+        )
+        res = handle_missing(df, opts)
+        assert res.handled_df["a"].iloc[1] == 0
+        assert res.handled_df["b"].iloc[1] == "?"
+
+    def test_ffill(self):
+        df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
+        res = handle_missing(df, MissingOptions(strategy="ffill"))
+        assert list(res.handled_df["x"]) == [1.0, 1.0, 1.0, 4.0]
+
+    def test_bfill(self):
+        df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
+        res = handle_missing(df, MissingOptions(strategy="bfill"))
+        assert list(res.handled_df["x"]) == [1.0, 4.0, 4.0, 4.0]
+
+    def test_interpolate(self):
+        df = pd.DataFrame({"x": [1.0, np.nan, np.nan, 4.0]})
+        res = handle_missing(df, MissingOptions(strategy="interpolate"))
+        assert list(res.handled_df["x"]) == [1.0, 2.0, 3.0, 4.0]
+
+    def test_numeric_strategy_falls_back_for_categorical(self):
+        df = pd.DataFrame({"x": ["a", "a", None, "b"]})
+        opts = MissingOptions(strategy="median", categorical_strategy="mode")
+        res = handle_missing(df, opts)
+        assert res.strategy_per_column["x"] == "mode"
+        assert res.handled_df["x"].iloc[2] == "a"
+
+    def test_per_column_strategy_overrides_global(self):
+        df = pd.DataFrame({"a": [1.0, np.nan], "b": ["x", None]})
+        opts = MissingOptions(
+            strategy="median",
+            column_strategies={"b": "constant"},
+            fill_value="??",
+        )
+        res = handle_missing(df, opts)
+        assert res.handled_df["a"].iloc[1] == 1.0  # median of [1.0]
+        assert res.handled_df["b"].iloc[1] == "??"
+
+    def test_all_nan_column_safely_skipped(self):
+        df = pd.DataFrame({"x": [np.nan, np.nan, np.nan]})
+        res = handle_missing(df, MissingOptions(strategy="mean"))
+        assert res.cells_filled == 0
+        assert res.handled_df["x"].isna().all()
+
+
+# ---------------------------------------------------------------------------
+# handle_missing — drops
+# ---------------------------------------------------------------------------
+
+class TestDropStrategies:
+    def test_drop_row_any_missing(self):
+        # Strict-greater: threshold 0.0 → drop any row with any missing.
+        df = pd.DataFrame({
+            "a": [1, 2, np.nan, 4],
+            "b": ["x", None, "z", "w"],
+        })
+        opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.0)
+        res = handle_missing(df, opts)
+        # Rows 1 and 2 each have one missing cell; rows 0 and 3 are clean.
+        assert res.rows_dropped == 2
+        assert len(res.handled_df) == 2
+
+    def test_drop_row_default_threshold_never_drops(self):
+        # Default 1.0 = never drop — no fraction exceeds 100%.
+        df = pd.DataFrame({
+            "a": [1, 2, np.nan],
+            "b": ["x", "y", None],
+        })
+        opts = MissingOptions(strategy="drop_row")  # threshold defaults to 1.0
+        res = handle_missing(df, opts)
+        assert res.rows_dropped == 0
+
+    def test_drop_row_partial_threshold(self):
+        df = pd.DataFrame({
+            "a": [1, np.nan, np.nan, np.nan],
+            "b": [10, 20, np.nan, np.nan],
+            "c": [100, 200, np.nan, 400],
+        })
+        # Strict-greater: threshold 0.5 → drop rows with > 50% missing.
+        opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.5)
+        res = handle_missing(df, opts)
+        # row 0: 0/3, row 1: 1/3 (0.33) -> keep
+        # row 2: 3/3 (1.0) -> drop, row 3: 2/3 (0.67) -> drop
+        assert res.rows_dropped == 2
+
+    def test_drop_col_threshold(self):
+        df = pd.DataFrame({
+            "keep": [1, 2, 3, 4],
+            "drop_me": [np.nan, np.nan, np.nan, 1],  # 75% missing
+        })
+        # Strict-greater: 0.5 → drop columns with > 50% missing.
+        opts = MissingOptions(strategy="drop_col", col_drop_threshold=0.5)
+        res = handle_missing(df, opts)
+        assert "drop_me" in res.columns_dropped
+        assert "keep" not in res.columns_dropped
+
+    def test_drop_both(self):
+        df = pd.DataFrame({
+            "keep": [1, 2, 3, 4, 5],
+            "drop_col": [np.nan] * 5,
+            "x": [1, np.nan, 3, np.nan, 5],
+        })
+        opts = MissingOptions(
+            strategy="drop_both",
+            col_drop_threshold=0.99,  # >99% missing → drop column
+            row_drop_threshold=0.0,   # any missing in remaining cols → drop row
+        )
+        res = handle_missing(df, opts)
+        # drop_col is 100% missing → dropped
+        assert "drop_col" in res.columns_dropped
+        # Remaining scope (keep + x): rows 1 and 3 have a missing x → drop.
+        assert res.rows_dropped == 2
+
+    def test_drop_audit_records_dropped_rows(self):
+        df = pd.DataFrame({"a": [1, np.nan], "b": [2, np.nan]})
+        # Drop the fully-missing row (frac > 0.99).
+        opts = MissingOptions(strategy="drop_row", row_drop_threshold=0.99)
+        res = handle_missing(df, opts)
+        drop_records = res.changes[res.changes["action"] == "drop_row"]
+        assert len(drop_records) == 1
+
+
+# ---------------------------------------------------------------------------
+# Scope: columns / skip_columns
+# ---------------------------------------------------------------------------
+
+class TestScope:
+    def test_columns_filter(self):
+        df = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
+        opts = MissingOptions(columns=["a"], strategy="constant", fill_value=99)
+        res = handle_missing(df, opts)
+        assert res.handled_df["a"].iloc[0] == 99
+        # b should be untouched
+        assert pd.isna(res.handled_df["b"].iloc[0])
+
+    def test_skip_columns(self):
+        df = pd.DataFrame({"a": [np.nan, 2], "b": [np.nan, 4]})
+        opts = MissingOptions(skip_columns=["b"], strategy="constant", fill_value=99)
+        res = handle_missing(df, opts)
+        assert res.handled_df["a"].iloc[0] == 99
+        assert pd.isna(res.handled_df["b"].iloc[0])
+
+    def test_unknown_column_raises(self):
+        df = pd.DataFrame({"a": [1]})
+        opts = MissingOptions(columns=["does_not_exist"])
+        with pytest.raises(InputValidationError):
+            handle_missing(df, opts)
+
+
+# ---------------------------------------------------------------------------
+# Presets / config
+# ---------------------------------------------------------------------------
+
+class TestPresets:
+    def test_detect_only_does_not_fill(self):
+        df = pd.DataFrame({"x": ["alice", "N/A", "bob"]})
+        opts = MissingOptions.from_preset("detect-only")
+        res = handle_missing(df, opts)
+        assert res.sentinels_standardized == 1
+        assert res.cells_filled == 0
+        assert res.rows_dropped == 0
+
+    def test_safe_fill_fills(self):
+        df = pd.DataFrame({"age": [30, np.nan, 25, 40], "name": ["a", "a", None, "b"]})
+        opts = MissingOptions.from_preset("safe-fill")
+        res = handle_missing(df, opts)
+        assert res.cells_filled == 2
+
+    def test_drop_incomplete(self):
+        df = pd.DataFrame({"a": [1, np.nan, 3], "b": [10, 20, 30]})
+        opts = MissingOptions.from_preset("drop-incomplete")
+        res = handle_missing(df, opts)
+        assert res.rows_dropped == 1
+
+    def test_unknown_preset_raises(self):
+        with pytest.raises(ConfigError):
+            MissingOptions.from_preset("does-not-exist")
+
+    def test_roundtrip_to_file(self, tmp_path):
+        opts = MissingOptions.from_preset("safe-fill")
+        opts.column_strategies = {"age": "median"}
+        path = tmp_path / "cfg.json"
+        opts.to_file(path)
+        loaded = MissingOptions.from_file(path)
+        assert loaded.strategy == opts.strategy
+        assert loaded.column_strategies == opts.column_strategies
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+class TestValidate:
+    def test_invalid_strategy(self):
+        opts = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
+        with pytest.raises(InputValidationError):
+            opts.validate()
+
+    def test_threshold_out_of_range(self):
+        opts = MissingOptions(row_drop_threshold=1.5)
+        with pytest.raises(ConfigError):
+            opts.validate()
+
+    def test_handle_missing_validates(self):
+        df = pd.DataFrame({"x": [1]})
+        opts = MissingOptions(strategy="bogus")  # type: ignore[arg-type]
+        with pytest.raises(InputValidationError):
+            handle_missing(df, opts)
+
+    def test_non_dataframe_input(self):
+        with pytest.raises(InputValidationError):
+            handle_missing([1, 2, 3])  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# End-to-end realistic case
+# ---------------------------------------------------------------------------
+
+class TestEndToEnd:
+    def test_messy_customer_export(self):
+        df = pd.DataFrame({
+            "customer_id": [1, 2, 3, 4, 5, 6],
+            "name": ["Alice", "Bob", "N/A", "  ", "Charlie", None],
+            "email": ["a@x.com", "-", "c@x.com", "d@x.com", "NULL", "f@x.com"],
+            "age":   [30, np.nan, 25, 40, np.nan, 50],
+        })
+        opts = MissingOptions(
+            standardize_sentinels=True,
+            strategy="median",
+            categorical_strategy="constant",
+            fill_value="UNKNOWN",
+        )
+        res = handle_missing(df, opts)
+
+        # Missing-like cells: name "N/A", "  ", None; email "-", "NULL". None is already NaN, so it is not a sentinel.
+        # Whitespace + 'N/A' on name = 2; '-' + 'NULL' on email = 2.  Total = 4.
+        assert res.sentinels_standardized == 4
+        # name has 3 missing after standardize (N/A, "  ", None) → constant fill
+        # email has 2 missing → constant fill
+        # age has 2 missing → median (35.0 of [30, 25, 40, 50])
+        assert res.cells_filled == 7
+        assert res.handled_df["name"].isna().sum() == 0
+        assert res.handled_df["email"].isna().sum() == 0
+        assert res.handled_df["age"].isna().sum() == 0
+        assert (res.handled_df["name"] == "UNKNOWN").sum() == 3
+        assert (res.handled_df["age"] == 35.0).sum() == 2  # median of [30, 25, 40, 50]
+
+    def test_input_not_mutated(self):
+        df = pd.DataFrame({"x": ["N/A", "alice", np.nan]})
+        df_copy = df.copy()
+        handle_missing(df, MissingOptions.from_preset("safe-fill"))
+        pd.testing.assert_frame_equal(df, df_copy)
--- a/tests/test_missing_corpus.py
+++ b/tests/test_missing_corpus.py
@@ -0,0 +1,463 @@
+"""Acceptance corpus for the Missing Value Handler.
+
+Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
+asserts the documented behaviour. The fixtures are split into:
+
+  * ``uc##`` — three target-client use cases (Shopify operator,
+    marketing analyst, consultant intake).
+  * ``ec##`` — edge cases the engine must handle without surprise:
+    all-NaN columns, zeros that aren't missing, Excel errors, unicode
+    whitespace, mixed dtypes, padding, single row/column, every default
+    sentinel, per-column constants, drop thresholds, leading-NaN ffill,
+    numeric-strategy fallback for non-numeric columns, headers-only,
+    idempotency.
+
+Each test runs through the public API (``handle_missing``) so any
+regression in the engine surfaces here. Fixture files double as living
+documentation for what the tool is supposed to do.
+"""
+
+from __future__ import annotations
+
+import io
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from src.core.missing import (
+    MissingOptions,
+    handle_missing,
+    is_missing_like,
+    profile_missing,
+)
+
+CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "missing-corpus"
+TEST_DATA = CORPUS / "test_data"
+
+
+def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
+    """Load a corpus CSV.
+
+    By default we let pandas infer dtypes — that's the most realistic
+    intake path (Excel exports keep numeric columns numeric). A handful
+    of cases pass ``dtype_str=True`` to keep sentinels visible in
+    columns that would otherwise be coerced to float.
+    """
+    path = TEST_DATA / name
+    if dtype_str:
+        return pd.read_csv(path, dtype=str, keep_default_na=False)
+    return pd.read_csv(path)
+
+
+# ---------------------------------------------------------------------------
+# Use case 1 — Shopify operator: detect-only
+# ---------------------------------------------------------------------------
+
+class TestUC01ShopifyExport:
+    """SMB operator standardizes disguised nulls before reimporting."""
+
+    def test_detect_only_replaces_sentinels(self):
+        df = _read("uc01_shopify_export.csv", dtype_str=True)
+        opts = MissingOptions.from_preset("detect-only")
+        res = handle_missing(df, opts)
+        # Detect-only: sentinels are standardized, nothing is filled or dropped.
+        assert res.sentinels_standardized > 0
+        assert res.cells_filled == 0
+        assert res.rows_dropped == 0
+
+        # Spot-check: cells that held 'N/A', '-', 'NULL', ' ', '(blank)',
+        # '#N/A', 'n/a', '?', '(none)' in the fixture should now be NaN.
+        for row, col in [
+            (1, "phone"),       # 'N/A'
+            (2, "city"),        # '-'
+            (3, "total_orders"),  # 'NULL'
+            (5, "phone"),       # ' '
+            (5, "last_order_date"),  # '(blank)'
+            (6, "last_order_date"),  # '#N/A'
+            (7, "phone"),       # 'n/a'
+            (8, "city"),        # '?'
+            (9, "total_orders"),  # '(none)'
+        ]:
+            assert pd.isna(res.handled_df.iloc[row][col]), (
+                f"Expected NaN at row {row} col {col}, got "
+                f"{res.handled_df.iloc[row][col]!r}"
+            )
+
+    def test_real_values_preserved(self):
+        df = _read("uc01_shopify_export.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions.from_preset("detect-only"))
+        # First row should be untouched.
+        assert res.handled_df.iloc[0]["first_name"] == "Alice"
+        assert res.handled_df.iloc[0]["email"] == "alice@shop.com"
+        assert res.handled_df.iloc[0]["lifetime_value"] == "1240.50"
+
+    def test_audit_log_complete(self):
+        df = _read("uc01_shopify_export.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions.from_preset("detect-only"))
+        # One audit row per sentinel replacement.
+        assert len(res.changes) == res.sentinels_standardized
+        assert set(res.changes["action"].apply(lambda s: s.startswith("standardize:"))) == {True}
+
+
+# ---------------------------------------------------------------------------
+# Use case 2 — Marketing analyst: safe-fill
+# ---------------------------------------------------------------------------
+
+class TestUC02MarketingAudience:
+    """Marketer fills numeric columns with median, categorical with mode."""
+
+    def test_safe_fill_clears_all_missing(self):
+        df = _read("uc02_marketing_audience.csv")
+        opts = MissingOptions.from_preset("safe-fill")
+        res = handle_missing(df, opts)
+        # Every cell in scope should be filled.
+        assert res.profile_after.cells_missing == 0
+        assert res.cells_filled > 0
+
+    def test_numeric_uses_median_categorical_uses_mode(self):
+        df = _read("uc02_marketing_audience.csv")
+        opts = MissingOptions.from_preset("safe-fill")
+        res = handle_missing(df, opts)
+        # 'age' is numeric → median strategy
+        assert res.strategy_per_column["age"] == "median"
+        # 'segment' / 'region' are object → mode fallback ('source' is not asserted here)
+        assert res.strategy_per_column["segment"] == "mode"
+        assert res.strategy_per_column["region"] == "mode"
+
+    def test_per_column_override(self):
+        df = _read("uc02_marketing_audience.csv")
+        opts = MissingOptions.from_preset("safe-fill")
+        opts.column_strategies = {"source": "constant"}
+        opts.column_fill_values = {"source": "unknown"}
+        res = handle_missing(df, opts)
+        # Cells previously holding sentinels in 'source' should now equal "unknown".
+        assert (res.handled_df["source"] == "unknown").sum() >= 3
+
+    def test_consent_real_false_not_dropped(self):
+        # NOTE(review): the test name says "false" but the check counts "true" values — confirm intent.
+        # 'consent' has empty cells and explicit "true"; the fill must not remove a real "true".
+        df = _read("uc02_marketing_audience.csv")
+        res = handle_missing(df, MissingOptions.from_preset("safe-fill"))
+        original_trues = (df["consent"] == "true").sum()
+        result_trues = (res.handled_df["consent"] == "true").sum()
+        # Filled rows can become "true" (mode) but should not lose existing trues.
+        assert result_trues >= original_trues
+
+
+# ---------------------------------------------------------------------------
+# Use case 3 — Consultant intake: threshold drops + fill
+# ---------------------------------------------------------------------------
+
+class TestUC03ConsultantIntake:
+    """Drop sparse columns and rows, then fill the survivors."""
+
+    def test_drop_col_removes_legacy_fields(self):
+        df = _read("uc03_consultant_intake.csv", dtype_str=True)
+        # internal_id_legacy and beta_field are 100% missing — drop them.
+        opts = MissingOptions(
+            standardize_sentinels=True,
+            strategy="drop_col",
+            col_drop_threshold=0.99,
+        )
+        res = handle_missing(df, opts)
+        assert "internal_id_legacy" in res.columns_dropped
+        assert "beta_field" in res.columns_dropped
+
+    def test_drop_row_removes_mostly_empty_respondents(self):
+        df = _read("uc03_consultant_intake.csv", dtype_str=True)
+        opts = MissingOptions(
+            standardize_sentinels=True,
+            strategy="drop_both",
+            col_drop_threshold=0.99,  # drop the legacy / beta cols first
+            row_drop_threshold=0.5,   # then drop rows with >50% missing
+        )
+        res = handle_missing(df, opts)
+        # R-002, R-005, R-007, R-010 are mostly-empty respondents.
+        assert res.rows_dropped >= 4
+        # Non-empty respondents survive.
+        kept_ids = set(res.handled_df["respondent_id"].tolist())
+        for survivor in ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012"):
+            assert survivor in kept_ids
+
+
+# ---------------------------------------------------------------------------
+# Edge cases
+# ---------------------------------------------------------------------------
+
+class TestEC01AllNanColumn:
+    def test_fill_skips_all_nan_column(self):
+        df = _read("ec01_all_nan_column.csv")
+        res = handle_missing(df, MissingOptions(strategy="mean"))
+        # Mean of all-NaN is NaN — engine must NOT fabricate a value.
+        assert res.handled_df["deprecated_field"].isna().all()
+        assert res.cells_filled == 0
+
+    def test_drop_col_catches_all_nan(self):
+        df = _read("ec01_all_nan_column.csv")
+        res = handle_missing(
+            df, MissingOptions(strategy="drop_col", col_drop_threshold=0.99),
+        )
+        assert "deprecated_field" in res.columns_dropped
+        assert "name" not in res.columns_dropped
+
+
+class TestEC02NoMissing:
+    def test_clean_file_is_noop(self):
+        df = _read("ec02_no_missing.csv")
+        res = handle_missing(df, MissingOptions.from_preset("safe-fill"))
+        assert res.sentinels_standardized == 0
+        assert res.cells_filled == 0
+        assert res.rows_dropped == 0
+        pd.testing.assert_frame_equal(res.handled_df, df)
+
+
+class TestEC03ZeroIsNotMissing:
+    def test_zero_preserved(self):
+        df = _read("ec03_zero_is_not_missing.csv")
+        res = handle_missing(df, MissingOptions.from_preset("safe-fill"))
+        # Original zeros remain zero.
+        assert (res.handled_df["balance"] == 0).sum() == (df["balance"] == 0).sum()
+        assert (res.handled_df["count"] == 0).sum() == (df["count"] == 0).sum()
+        # No spurious changes recorded.
+        assert res.cells_filled == 0
+        assert res.sentinels_standardized == 0
+
+    def test_is_missing_like_zero_predicate(self):
+        # Direct predicate check — zeros, false, "0" must all be non-missing.
+        assert not is_missing_like(0)
+        assert not is_missing_like(0.0)
+        assert not is_missing_like(False)
+        assert not is_missing_like("0")
+        assert not is_missing_like("0.00")
+
+
+class TestEC04ExcelErrors:
+    def test_excel_error_sentinels_recognized(self):
+        df = _read("ec04_excel_errors.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # 6 error sentinels in the fixture: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL!
+        assert res.sentinels_standardized == 6
+
+
+class TestEC05UnicodeWhitespace:
+    def test_nbsp_and_ideographic_space_count_as_missing(self):
+        df = _read("ec05_unicode_whitespace.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # rows 1, 2, 4 contain NBSP / tab / ideographic space respectively
+        assert res.handled_df["note"].isna().sum() == 3
+        assert res.handled_df.iloc[0]["note"] == "hello"
+        assert res.handled_df.iloc[3]["note"] == "real"
+
+
+class TestEC06MixedDtypes:
+    def test_mixed_column_falls_back_to_mode(self):
+        # Read with native dtypes so 'real_num' stays numeric.
+        df = _read("ec06_mixed_dtypes.csv")
+        opts = MissingOptions(
+            standardize_sentinels=True,
+            strategy="median",
+            categorical_strategy="mode",
+        )
+        res = handle_missing(df, opts)
+        # mixed_col holds 'N/A' / 'hello' alongside numbers → object dtype,
+        # median falls back to mode.
+        assert res.strategy_per_column["mixed_col"] == "mode"
+        # real_num is float dtype → median runs.
+        assert res.strategy_per_column["real_num"] == "median"
+
+
+class TestEC07RealDataWithPadding:
+    def test_padded_real_data_not_treated_as_missing(self):
+        df = _read("ec07_real_data_with_padding.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # Only row 1 (name="  ") and row 2 (city=blank) should become NaN.
+        # "  Alice  ", " Bob ", "  SF" must remain.
+        assert res.handled_df.iloc[0]["name"] == "  Alice  "
+        assert res.handled_df.iloc[2]["name"] == " Bob "
+        assert res.handled_df.iloc[3]["city"] == "  SF"
+
+
+class TestEC08SingleRow:
+    def test_single_row_handles_cleanly(self):
+        df = _read("ec08_single_row.csv", dtype_str=True)
+        # detect-only
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        assert res.sentinels_standardized == 2  # 'N/A' + ''
+        # safe-fill on a one-row file: median/mode of a single value is itself.
+        res2 = handle_missing(df, MissingOptions.from_preset("safe-fill"))
+        assert res2.handled_df.iloc[0]["name"] == "Alice"
+
+
+class TestEC09SingleColumn:
+    def test_single_column_works(self):
+        df = _read("ec09_single_column.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # 'N/A', whitespace-only ' ', '-' = 3 sentinels
+        assert res.sentinels_standardized == 3
+        assert res.handled_df["value"].isna().sum() == 3
+
+
+class TestEC10AllSentinelVariants:
+    def test_every_default_sentinel_recognized(self):
+        df = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
+        res = handle_missing(df, MissingOptions(strategy="none"))
+        # 20 sentinels + 1 real value
+        assert res.sentinels_standardized == 20
+        # The 'real_value' row stays.
+        assert (res.handled_df["sentinel_value"] == "real_value").sum() == 1
+
+
+class TestEC11ConstantPerColumn:
+    def test_per_column_fill_values(self):
+        df = _read("ec11_constant_per_column.csv", dtype_str=True)
+        opts = MissingOptions(
+            strategy="constant",
+            column_fill_values={
+                "country": "USA",
+                "salary": "0",
+                "department": "Unassigned",
+            },
+        )
+        res = handle_missing(df, opts)
+        # Fixture has 1 UK row + 2 USA rows + 2 blanks. Filling blanks with
+        # "USA" yields 4 USA total; UK is preserved.
+        assert (res.handled_df["country"] == "USA").sum() == 4
+        assert (res.handled_df["country"] == "UK").sum() == 1
+        assert (res.handled_df["department"] == "Unassigned").sum() >= 2
+
+
+class TestEC12DropThresholdBoundary:
+    def test_threshold_one_never_drops(self):
+        # threshold 1.0 + strict-greater = never drop.
+        df = _read("ec12_drop_threshold_boundary.csv")
+        opts = MissingOptions(strategy="drop_row", row_drop_threshold=1.0)
+        res = handle_missing(df, opts)
+        assert res.rows_dropped == 0
+
+    def test_threshold_just_under_one_drops_fully_missing(self):
+        # threshold 0.99: drop only fully-missing rows (frac > 0.99 → frac == 1.0).
+        df = _read("ec12_drop_threshold_boundary.csv")
+        opts = MissingOptions(
+            strategy="drop_row",
+            row_drop_threshold=0.99,
+            columns=["a", "b", "c", "d"],  # exclude id from the scope
+        )
+        res = handle_missing(df, opts)
+        # Only row 3 (id=4, all four are NaN) qualifies.
+        assert res.rows_dropped == 1
+
+    def test_threshold_half_drops_majority_missing(self):
+        df = _read("ec12_drop_threshold_boundary.csv")
+        opts = MissingOptions(
+            strategy="drop_row",
+            row_drop_threshold=0.5,
+            columns=["a", "b", "c", "d"],
+        )
+        res = handle_missing(df, opts)
+        # Missing fractions across [a,b,c,d]:
+        #   row 0: 0/4=0.0   keep
+        #   row 1: 2/4=0.5   keep (strict >, not equal)
+        #   row 2: 3/4=0.75  drop
+        #   row 3: 4/4=1.0   drop
+        #   row 4: 2/4=0.5   keep
+        assert res.rows_dropped == 2
+
+    def test_threshold_zero_drops_any_missing(self):
+        df = _read("ec12_drop_threshold_boundary.csv")
+        opts = MissingOptions(
+            strategy="drop_row",
+            row_drop_threshold=0.0,
+            columns=["a", "b", "c", "d"],
+        )
+        res = handle_missing(df, opts)
+        # Every body row except row 0 has at least one missing.
+        assert res.rows_dropped == 4
+
+
+class TestEC13FfillLeadingNan:
+    def test_leading_nan_run_survives_ffill(self):
+        df = _read("ec13_ffill_leading_nan.csv")
+        res = handle_missing(df, MissingOptions(strategy="ffill"))
+        # First two rows (leading NaN) remain NaN — there's nothing to fill from.
+        assert pd.isna(res.handled_df["price"].iloc[0])
+        assert pd.isna(res.handled_df["price"].iloc[1])
+        # Mid-series gets filled forward.
+        assert res.handled_df["price"].iloc[3] == 100.0
+        assert res.handled_df["price"].iloc[4] == 100.0
+        # Trailing NaN gets filled by the last seen value.
+        assert res.handled_df["price"].iloc[6] == 150.0
+
+
+class TestEC14InterpolateFallback:
+    def test_interpolate_on_non_numeric_falls_back(self):
+        df = _read("ec14_interpolate_fallback.csv", dtype_str=True)
+        opts = MissingOptions(
+            strategy="interpolate",
+            categorical_strategy="mode",
+        )
+        res = handle_missing(df, opts)
+        # All columns are object dtype here → fallback to mode.
+        assert res.strategy_per_column["category"] == "mode"
+        assert res.strategy_per_column["value"] == "mode"
+
+
+class TestEC15HeadersOnly:
+    def test_empty_body_does_not_crash(self):
+        df = _read("ec15_headers_only.csv")
+        # All operations must be no-ops on an empty body.
+        for preset in ("detect-only", "safe-fill", "drop-incomplete"):
+            res = handle_missing(df, MissingOptions.from_preset(preset))
+            assert len(res.handled_df) == 0
+            assert res.cells_filled == 0
+            assert res.rows_dropped == 0
+
+
+class TestEC16Idempotency:
+    def test_safe_fill_is_idempotent(self):
+        df = _read("ec16_idempotent_apply.csv", dtype_str=True)
+        opts = MissingOptions.from_preset("safe-fill")
+        first = handle_missing(df, opts)
+        second = handle_missing(first.handled_df, opts)
+        # Second pass should make no further changes.
+        pd.testing.assert_frame_equal(
+            second.handled_df.reset_index(drop=True),
+            first.handled_df.reset_index(drop=True),
+        )
+        assert second.cells_filled == 0
+        assert second.sentinels_standardized == 0
+
+    def test_detect_only_is_idempotent(self):
+        df = _read("ec16_idempotent_apply.csv", dtype_str=True)
+        opts = MissingOptions.from_preset("detect-only")
+        first = handle_missing(df, opts)
+        second = handle_missing(first.handled_df, opts)
+        assert second.sentinels_standardized == 0
+
+
+# ---------------------------------------------------------------------------
+# Whole-corpus property tests
+# ---------------------------------------------------------------------------
+
+ALL_FIXTURES = sorted(p.name for p in TEST_DATA.glob("*.csv"))
+
+
+@pytest.mark.parametrize("fixture", ALL_FIXTURES)
+def test_handle_missing_does_not_mutate_input(fixture):
+    """Every fixture must leave the input DataFrame untouched."""
+    df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
+    if df.empty and len(df.columns) == 0:
+        pytest.skip(f"{fixture}: completely empty file")
+    snapshot = df.copy(deep=True)
+    handle_missing(df, MissingOptions.from_preset("safe-fill"))
+    pd.testing.assert_frame_equal(df, snapshot)
+
+
+@pytest.mark.parametrize("fixture", ALL_FIXTURES)
+def test_profile_runs_on_every_fixture(fixture):
+    """``profile_missing`` must succeed on every corpus file."""
+    df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
+    prof = profile_missing(df, MissingOptions())
+    assert prof.rows_total == len(df)
+    assert prof.cells_total == len(df) * len(df.columns)
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -0,0 +1,324 @@
+"""Tests for src/core/pipeline.py."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from src.core.errors import ConfigError, InputValidationError
+from src.core.pipeline import (
+    Pipeline,
+    PipelineResult,
+    SOFT_DEPENDENCIES,
+    Step,
+    StepResult,
+    TOOL_ADAPTERS,
+    TOOL_NAMES,
+    recommended_pipeline,
+    run_pipeline,
+    validate_pipeline,
+)
+
+
+# ---------------------------------------------------------------------------
+# Step / Pipeline construction
+# ---------------------------------------------------------------------------
+
+class TestStep:
+    def test_unknown_tool_raises(self):
+        with pytest.raises(ConfigError):
+            Step(tool="bogus_tool")
+
+    def test_default_options_empty_dict(self):
+        s = Step(tool="text_clean")
+        assert s.options == {}
+        assert s.enabled is True
+
+    def test_display_name_falls_back_to_tool(self):
+        assert Step(tool="dedup").display_name() == "dedup"
+        assert Step(tool="dedup", name="Final dedup").display_name() == "Final dedup"
+
+
+class TestPipelineSerialization:
+    def test_roundtrip_dict(self):
+        p = Pipeline(steps=[
+            Step("text_clean", {"trim": True}),
+            Step("dedup", {"survivor_rule": "first"}),
+        ])
+        out = p.to_dict()
+        loaded = Pipeline.from_dict(out)
+        assert len(loaded.steps) == 2
+        assert loaded.steps[0].tool == "text_clean"
+        assert loaded.steps[1].options["survivor_rule"] == "first"
+
+    def test_roundtrip_file(self, tmp_path):
+        p = Pipeline(steps=[Step("text_clean")])
+        path = tmp_path / "p.json"
+        p.to_file(path)
+        loaded = Pipeline.from_file(path)
+        assert loaded.steps[0].tool == "text_clean"
+
+    def test_from_dict_missing_steps_key(self):
+        with pytest.raises(ConfigError):
+            Pipeline.from_dict({})
+
+    def test_from_dict_missing_tool(self):
+        with pytest.raises(ConfigError):
+            Pipeline.from_dict({"steps": [{"options": {}}]})
+
+
+# ---------------------------------------------------------------------------
+# recommended_pipeline
+# ---------------------------------------------------------------------------
+
+class TestRecommendedPipeline:
+    def test_default_order(self):
+        p = recommended_pipeline()
+        assert [s.tool for s in p.steps] == [
+            "text_clean", "format_standardize", "missing", "dedup",
+        ]
+
+    def test_default_passes_validation(self):
+        p = recommended_pipeline()
+        assert validate_pipeline(p) == []
+
+    def test_include_overrides_default(self):
+        p = recommended_pipeline(include=["text_clean", "missing"])
+        assert [s.tool for s in p.steps] == ["text_clean", "missing"]
+
+    def test_options_seed_reaches_step(self):
+        p = recommended_pipeline(options={"text_clean": {"trim": False}})
+        assert p.steps[0].options == {"trim": False}
+
+    def test_unknown_tool_raises(self):
+        with pytest.raises(InputValidationError):
+            recommended_pipeline(include=["bogus"])
+
+    def test_can_place_column_map_first_or_last(self):
+        # Both placements must be acceptable per the docstring.
+        first = recommended_pipeline(include=[
+            "column_map", "text_clean", "format_standardize", "missing", "dedup",
+        ])
+        last = recommended_pipeline(include=[
+            "text_clean", "format_standardize", "missing", "column_map", "dedup",
+        ])
+        # No soft-dependency rule names column_map, so neither warns.
+        assert validate_pipeline(first) == []
+        assert validate_pipeline(last) == []
+
+
+# ---------------------------------------------------------------------------
+# validate_pipeline — soft dependencies
+# ---------------------------------------------------------------------------
+
+class TestValidatePipeline:
+    def test_in_order_no_warnings(self):
+        p = recommended_pipeline()
+        assert validate_pipeline(p) == []
+
+    def test_dedup_before_text_clean_warns(self):
+        p = Pipeline(steps=[Step("dedup"), Step("text_clean")])
+        ws = validate_pipeline(p)
+        assert len(ws) == 1
+        assert "dedup" in ws[0] and "text_clean" in ws[0]
+
+    def test_format_before_text_clean_warns(self):
+        p = Pipeline(steps=[Step("format_standardize"), Step("text_clean")])
+        ws = validate_pipeline(p)
+        assert any("format_standardize" in w for w in ws)
+
+    def test_disabled_steps_ignored(self):
+        # Disabled dedup-first should not trigger a warning.
+        p = Pipeline(steps=[
+            Step("dedup", enabled=False),
+            Step("text_clean"),
+        ])
+        assert validate_pipeline(p) == []
+
+    def test_duplicate_tool_does_not_double_warn(self):
+        # text_clean twice (legitimate: two-pass cleaning) shouldn't
+        # generate redundant warnings.
+        p = Pipeline(steps=[
+            Step("text_clean"),
+            Step("text_clean"),
+        ])
+        assert validate_pipeline(p) == []
+
+
+# ---------------------------------------------------------------------------
+# run_pipeline — execution
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def messy_df():
+    """Five-row frame with padded names, mixed phone formats and blank cells."""
+    names = ["  Alice  ", "BOB", "N/A", "", "charlie "]
+    phones = ["(415) 555-1234", "+44 20 7946 0958", "03-3210-7000", "", "(415) 555-1234"]
+    countries = ["US", "GB", "JP", "", "US"]
+    return pd.DataFrame({"name": names, "phone": phones, "country": countries})
+
+
+class TestRunPipeline:
+    """End-to-end behaviour of ``run_pipeline`` on the ``messy_df`` fixture."""
+
+    def test_recommended_pipeline_runs_end_to_end(self, messy_df):
+        tool_options = {
+            "format_standardize": {
+                "column_types": {"phone": "phone"},
+                "phone_country_column": "country",
+            },
+            "missing": {"strategy": "none"},
+        }
+        result = run_pipeline(messy_df, recommended_pipeline(options=tool_options))
+        assert isinstance(result, PipelineResult)
+        assert result.initial_rows == 5
+        # Rows 0 and 4 share a phone, so the trailing dedup should drop one.
+        assert result.final_rows < result.initial_rows
+        assert result.warnings == []
+
+    def test_initial_df_not_mutated(self, messy_df):
+        before = messy_df.copy(deep=True)
+        run_pipeline(messy_df, recommended_pipeline())
+        pd.testing.assert_frame_equal(messy_df, before)
+
+    def test_disabled_step_skipped(self, messy_df):
+        """A disabled step is flagged as skipped; the enabled one is not."""
+        steps = [
+            Step("text_clean", enabled=False),
+            Step("missing", options={"strategy": "none"}),
+        ]
+        outcomes = run_pipeline(messy_df, Pipeline(steps=steps)).step_results
+        assert outcomes[0].skipped is True
+        assert outcomes[1].skipped is False
+
+    def test_step_results_ordered_and_timed(self, messy_df):
+        expected_order = ["text_clean", "format_standardize", "missing", "dedup"]
+        pipeline = recommended_pipeline(options={"missing": {"strategy": "none"}})
+        outcomes = run_pipeline(messy_df, pipeline).step_results
+        assert len(outcomes) == 4
+        for outcome in outcomes:
+            assert outcome.elapsed_seconds >= 0
+        assert [outcome.step.tool for outcome in outcomes] == expected_order
+
+    def test_warnings_returned_but_run_proceeds(self, messy_df):
+        """An out-of-order pipeline reports warnings yet runs every step."""
+        out_of_order = Pipeline(steps=[Step("dedup"), Step("text_clean")])
+        result = run_pipeline(messy_df, out_of_order)
+        assert result.warnings  # non-empty
+        # No step may have been skipped on account of the warning.
+        assert not any(outcome.skipped for outcome in result.step_results)
+
+    def test_progress_callback_fires_per_step(self, messy_df):
+        """``on_step_complete`` receives one StepResult per executed step."""
+        received: list[StepResult] = []
+        pipeline = Pipeline(steps=[
+            Step("text_clean"),
+            Step("missing", options={"strategy": "none"}),
+        ])
+        run_pipeline(messy_df, pipeline, on_step_complete=received.append)
+        assert len(received) == 2
+        assert all(isinstance(item, StepResult) for item in received)
+
+    def test_progress_callback_exception_does_not_abort(self, messy_df):
+        def explode(_step_result):
+            raise RuntimeError("boom")
+
+        single_step = Pipeline(steps=[Step("text_clean")])
+        # The callback's RuntimeError must not escape run_pipeline.
+        result = run_pipeline(messy_df, single_step, on_step_complete=explode)
+        assert result.final_rows == 5
+
+    def test_stop_on_error_default(self, messy_df):
+        """Without ``stop_on_error=False`` a failing step's error propagates."""
+        # Typing a column the frame does not have forces the failure.
+        bad_options = {"column_types": {"does_not_exist": "phone"}}
+        failing = Pipeline(steps=[Step("format_standardize", options=bad_options)])
+        with pytest.raises(InputValidationError):
+            run_pipeline(messy_df, failing)
+
+    def test_continue_on_error_carries_previous_df(self, messy_df):
+        """With ``stop_on_error=False`` later steps still run after a failure."""
+        bad_options = {"column_types": {"does_not_exist": "phone"}}
+        pipeline = Pipeline(steps=[
+            Step("text_clean"),
+            Step("format_standardize", options=bad_options),
+            Step("missing", options={"strategy": "none"}),
+        ])
+        result = run_pipeline(messy_df, pipeline, stop_on_error=False)
+        outcomes = result.step_results
+        # The middle step records its error; the one after it has none.
+        assert outcomes[1].error is not None
+        assert outcomes[2].error is None
+        assert result.final_rows == 5
+
+    def test_non_dataframe_input(self):
+        not_a_frame = [1, 2, 3]
+        with pytest.raises(InputValidationError):
+            run_pipeline(not_a_frame, recommended_pipeline())  # type: ignore[arg-type]
+
+
+# ---------------------------------------------------------------------------
+# Per-tool adapter sanity
+# ---------------------------------------------------------------------------
+
+class TestAdapters:
+    """Direct calls into the per-tool callables held in ``TOOL_ADAPTERS``."""
+
+    @pytest.mark.parametrize("tool", TOOL_NAMES)
+    def test_adapter_with_default_options_runs(self, tool, messy_df):
+        """Every adapter accepts ``{}`` and returns a (DataFrame, dict) pair."""
+        frame, summary = TOOL_ADAPTERS[tool](messy_df, {})
+        assert isinstance(frame, pd.DataFrame)
+        assert isinstance(summary, dict)
+
+    def test_format_standardize_adapter_passes_column_types(self, messy_df):
+        options = {"column_types": {"phone": "phone"}}
+        _, summary = TOOL_ADAPTERS["format_standardize"](messy_df, options)
+        assert summary["columns_processed"] == ["phone"]
+
+    def test_dedup_adapter_with_unknown_survivor_rule_raises(self, messy_df):
+        with pytest.raises(ConfigError):
+            TOOL_ADAPTERS["dedup"](messy_df, {"survivor_rule": "bogus"})
+
+
+# ---------------------------------------------------------------------------
+# SOFT_DEPENDENCIES integrity
+# ---------------------------------------------------------------------------
+
+class TestSoftDependencies:
+    """Structural sanity checks on the ``SOFT_DEPENDENCIES`` table."""
+
+    def test_every_pair_uses_known_tools(self):
+        for before, after, _reason in SOFT_DEPENDENCIES:
+            assert before in TOOL_NAMES
+            assert after in TOOL_NAMES
+
+    def test_all_reasons_non_empty(self):
+        for _before, _after, reason in SOFT_DEPENDENCIES:
+            assert reason and isinstance(reason, str)
+            # A reason should read as a sentence: longer than 20 chars.
+            assert len(reason) > 20
+
+    def test_dependencies_form_a_dag(self):
+        """Kahn's algorithm must be able to order every tool (no cycles)."""
+        from collections import defaultdict, deque
+        successors: dict[str, list[str]] = defaultdict(list)
+        pending: dict[str, int] = dict.fromkeys(TOOL_NAMES, 0)
+        for before, after, _reason in SOFT_DEPENDENCIES:
+            successors[before].append(after)
+            pending[after] += 1
+        ready = deque(tool for tool, count in pending.items() if count == 0)
+        order = []
+        while ready:
+            tool = ready.popleft()
+            order.append(tool)
+            for follower in successors[tool]:
+                pending[follower] -= 1
+                if pending[follower] == 0:
+                    ready.append(follower)
+        assert len(order) == len(TOOL_NAMES), (
+            f"SOFT_DEPENDENCIES contain a cycle; topo order={order}"
+        )