diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..e93a5da
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,20 @@
+; pytest configuration shared by tox, run_tests.py, and direct pytest calls.
+
+[pytest]
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+
+# Custom markers used by run_tests.py --quick and the e2e/install groupings.
+markers =
+    slow: tests that take longer than ~1s (skipped under --quick)
+    e2e: end-to-end CLI / integration tests
+    install: import / dependency sanity tests
+    fixture_sweep: parametrized sweep over the test-cases/ folder
+
+# Warnings discipline: fail on unexpected DeprecationWarning from our own
+# code, but tolerate third-party deprecations that we can't fix.
+filterwarnings =
+    error::DeprecationWarning:src
+    ignore::DeprecationWarning
diff --git a/run_tests.py b/run_tests.py
new file mode 100755
index 0000000..d801ad0
--- /dev/null
+++ b/run_tests.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""DataTools test runner — single entry point with category flags.
+
+Examples
+--------
+Run everything (default)::
+
+    python run_tests.py
+
+Run a single tool's tests::
+
+    python run_tests.py --tool dedup
+    python run_tests.py --tool text_clean
+    python run_tests.py --tool analyze
+    python run_tests.py --tool io
+    python run_tests.py --tool cli
+
+Categories::
+
+    python run_tests.py --unit          # unit tests only (no e2e, no install)
+    python run_tests.py --e2e           # end-to-end smoke tests
+    python run_tests.py --install       # install / dependency sanity
+    python run_tests.py --fixtures      # corpus + dropped-file sweep
+    python run_tests.py --coverage      # add a coverage report
+    python run_tests.py --quick         # skip @pytest.mark.slow
+    python run_tests.py -v / --verbose  # verbose pytest output
+
+Multiple flags compose. ``--tool X --quick`` runs that tool's quick tests.
+
+Dropping a new fixture into ``test-cases/`` is automatic: the fixture sweep
+test (``tests/test_fixtures_sweep.py``) parametrizes over every CSV/XLSX in
+that directory (excluding ``text-cleaner-corpus/`` which has its own suite).
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent
+
+# Tool name -> matching pytest -k expression. Keep aligned with test_*.py
+# filenames; run_tests.py --tool foo translates to ``-k foo``.
+_TOOL_MAP: dict[str, str] = {
+    "dedup": "test_dedup or test_cli.py",
+    "text_clean": "test_text_clean or test_cli_text_clean or test_corpus",
+    "analyze": "test_analyze or test_cli_analyze",
+    "io": "test_io",
+    "cli": "test_cli or test_cli_text_clean or test_cli_analyze",
+    "config": "test_config",
+    "normalizers": "test_normalizers",
+}
+
+_CATEGORY_PATHS: dict[str, list[str]] = {
+    "unit": ["tests/"],          # all tests are unit unless marked otherwise
+    "e2e": ["tests/test_e2e.py"],
+    "install": ["tests/test_install.py"],
+    "fixtures": ["tests/test_corpus.py", "tests/test_fixtures_sweep.py"],
+}
+
+
+def _build_pytest_args(args: argparse.Namespace) -> list[str]:
+    cmd: list[str] = [sys.executable, "-m", "pytest"]
+
+    # Verbosity
+    if args.verbose:
+        cmd.append("-vv")
+    else:
+        cmd.append("-q")
+
+    # Coverage
+    if args.coverage:
+        cmd.extend(["--cov=src", "--cov-report=term-missing"])
+
+    # Quick: skip anything marked slow.
+    if args.quick:
+        cmd.extend(["-m", "not slow"])
+
+    # Tool filter via -k expression.
+    if args.tool:
+        if args.tool not in _TOOL_MAP:
+            print(
+                f"unknown --tool '{args.tool}'. "
+                f"available: {', '.join(sorted(_TOOL_MAP))}",
+                file=sys.stderr,
+            )
+            sys.exit(2)
+        cmd.extend(["-k", _TOOL_MAP[args.tool]])
+
+    # Category selection (--unit/--e2e/--install/--fixtures). When several
+    # categories are requested they're OR'd by passing all paths.
+    paths: list[str] = []
+    selected_categories = [
+        c for c in ("unit", "e2e", "install", "fixtures")
+        if getattr(args, c)
+    ]
+    if selected_categories:
+        for cat in selected_categories:
+            paths.extend(_CATEGORY_PATHS[cat])
+    elif args.path:
+        paths.extend(args.path)
+    else:
+        paths.append("tests/")
+
+    cmd.extend(paths)
+    return cmd
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="run_tests.py",
+        description=(
+            "DataTools test runner. With no flags runs every test. Use "
+            "--tool to scope to one tool, --unit/--e2e/--install/--fixtures "
+            "to scope by category. Combine flags freely."
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Available tools: " + ", ".join(sorted(_TOOL_MAP)) + "\n\n"
+            "To add a new fixture-driven test: drop a CSV or XLSX into "
+            "test-cases/ and re-run. tests/test_fixtures_sweep.py picks up "
+            "new files automatically — no test code changes required."
+        ),
+    )
+    parser.add_argument("--tool", help="Limit tests to one tool.")
+    parser.add_argument("--unit", action="store_true",
+                        help="Unit tests only (default scope).")
+    parser.add_argument("--e2e", action="store_true",
+                        help="End-to-end CLI/integration smoke tests.")
+    parser.add_argument("--install", action="store_true",
+                        help="Install / import / entry-point sanity tests.")
+    parser.add_argument("--fixtures", action="store_true",
+                        help="Run the corpus + dropped-fixture sweep.")
+    parser.add_argument("--coverage", action="store_true",
+                        help="Emit a coverage report (term-missing).")
+    parser.add_argument("--quick", action="store_true",
+                        help="Skip tests marked @pytest.mark.slow.")
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="Verbose pytest output.")
+    parser.add_argument("path", nargs="*",
+                        help="Optional explicit test paths (override category).")
+
+    args = parser.parse_args(argv)
+
+    # Ensure we run from the project root so relative imports / paths work.
+    cwd_target = PROJECT_ROOT
+    if Path.cwd() != cwd_target:
+        print(f"running from {cwd_target}")
+
+    if shutil.which("pytest") is None and not _python_has_pytest():
+        print(
+            "pytest is not installed. Install dev deps:\n"
+            "  pip install -r requirements-dev.txt",
+            file=sys.stderr,
+        )
+        return 2
+
+    cmd = _build_pytest_args(args)
+    if args.verbose:
+        print("→", " ".join(cmd))
+    proc = subprocess.run(cmd, cwd=cwd_target)
+    return proc.returncode
+
+
+def _python_has_pytest() -> bool:
+    try:
+        __import__("pytest")
+        return True
+    except ImportError:
+        return False
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
new file mode 100644
index 0000000..59c2a3f
--- /dev/null
+++ b/tests/test_e2e.py
@@ -0,0 +1,143 @@
+"""End-to-end smoke tests.
+
+Round-trips through the CLI binaries with real fixture inputs to catch
+glue-code breakage that pure unit tests miss: argv parsing, file I/O, log
+configuration, exit codes, and the integration between the analyzer, the
+pre-parse repair, and pandas.
+
+These are intentionally lightweight — one happy path per CLI plus a
+couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and
+``test_fixtures_sweep.py``.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+pytestmark = pytest.mark.e2e
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent
+CORPUS_KITCHEN_SINK = (
+    PROJECT_ROOT / "test-cases" / "text-cleaner-corpus" / "test_data" / "20_kitchen_sink.csv"
+)
+
+
+def _run(*args: str, cwd: Path | None = None, **kwargs):
+    return subprocess.run(
+        [sys.executable, *args],
+        capture_output=True, text=True, timeout=60,
+        cwd=cwd or PROJECT_ROOT,
+        **kwargs,
+    )
+
+
+# ---------------------------------------------------------------------------
+# cli_analyze — full round-trip
+# ---------------------------------------------------------------------------
+
+class TestAnalyzeCliE2E:
+    def test_table_output_on_kitchen_sink(self):
+        if not CORPUS_KITCHEN_SINK.exists():
+            pytest.skip("kitchen-sink fixture missing")
+        proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK))
+        assert proc.returncode == 0, proc.stderr
+        # Rich tables wrap; assert on stable substrings.
+        assert "Text Cleaner" in proc.stdout
+        assert "csv_bom_stripped" in proc.stdout or "smart_quotes" in proc.stdout
+
+    def test_json_output_parses(self):
+        if not CORPUS_KITCHEN_SINK.exists():
+            pytest.skip("kitchen-sink fixture missing")
+        proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), "--json")
+        assert proc.returncode == 0, proc.stderr
+        data = json.loads(proc.stdout)
+        assert isinstance(data, list) and len(data) > 0
+        for item in data:
+            assert {"id", "severity", "tool", "count", "description"} <= set(item)
+
+
+# ---------------------------------------------------------------------------
+# cli_text_clean — full round-trip
+# ---------------------------------------------------------------------------
+
+class TestTextCleanCliE2E:
+    def test_apply_writes_cleaned_file(self, tmp_path):
+        # Build a small dirty CSV: NBSP padding + smart quotes.
+        src = tmp_path / "dirty.csv"
+        src.write_text(
+            "id,name,note\n"
+            "1, Alice ,“hello”\n"
+            "2,  Bob  ,it’s fine\n",
+            encoding="utf-8",
+        )
+        out = tmp_path / "out.csv"
+        proc = _run(
+            "-m", "src.cli_text_clean", str(src),
+            "--apply", "--output", str(out),
+        )
+        assert proc.returncode == 0, proc.stderr
+        assert out.exists(), "cleaned file was not written"
+        cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
+        # NBSP padding stripped
+        assert cleaned.iloc[0]["name"] == "Alice"
+        assert cleaned.iloc[1]["name"] == "Bob"
+        # Smart quotes folded
+        assert cleaned.iloc[0]["note"] == '"hello"'
+        assert cleaned.iloc[1]["note"] == "it's fine"
+
+    def test_preview_does_not_write(self, tmp_path):
+        src = tmp_path / "input.csv"
+        src.write_text("id,name\n1,Alice\n", encoding="utf-8")
+        # Without --apply, no output file should appear.
+        proc = _run("-m", "src.cli_text_clean", str(src))
+        assert proc.returncode == 0
+        # Default output path next to input — must not exist.
+        default_out = src.with_name(src.stem + "_cleaned.csv")
+        assert not default_out.exists()
+
+
+# ---------------------------------------------------------------------------
+# cli (dedup) — full round-trip
+# ---------------------------------------------------------------------------
+
+class TestDedupCliE2E:
+    def test_apply_removes_duplicates(self, tmp_path):
+        src = tmp_path / "dups.csv"
+        src.write_text(
+            "name,email\n"
+            "Alice,alice@x.com\n"
+            "Alice,alice@x.com\n"
+            "Bob,bob@x.com\n",
+            encoding="utf-8",
+        )
+        out = tmp_path / "deduped.csv"
+        proc = _run(
+            "-m", "src.cli", str(src),
+            "--apply", "--output", str(out),
+        )
+        assert proc.returncode == 0, proc.stderr
+        assert out.exists()
+        result = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
+        assert len(result) == 2  # Alice deduped, Bob unique
+
+
+# ---------------------------------------------------------------------------
+# run_tests.py self-test — sanity check the runner itself works
+# ---------------------------------------------------------------------------
+
+class TestRunTestsE2E:
+    def test_tool_filter_runs_subset(self):
+        proc = _run("run_tests.py", "--tool", "config", "-v")
+        assert proc.returncode == 0, proc.stderr
+        # Check we limited the run via -k.
+        assert "config" in proc.stdout.lower()
+
+    def test_unknown_tool_exits_2(self):
+        proc = _run("run_tests.py", "--tool", "no_such_tool")
+        assert proc.returncode == 2
diff --git a/tests/test_fixtures_sweep.py b/tests/test_fixtures_sweep.py
new file mode 100644
index 0000000..fb41a66
--- /dev/null
+++ b/tests/test_fixtures_sweep.py
@@ -0,0 +1,156 @@
+"""Automated sweep over every fixture in ``test-cases/``.
+
+Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
+next time pytest runs — no test code changes required. Each fixture goes
+through three smoke tests:
+
+1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
+   quotes, rogue delimiters) must not crash, and produced bytes must be
+   valid for ``pd.read_csv``.
+2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
+   :class:`Finding` objects without raising.
+3. **Text cleaner runs cleanly and preserves schema.** Default-config
+   ``clean_dataframe`` must not change row count and must return the same
+   number of columns it started with.
+
+The sweep skips files inside ``text-cleaner-corpus/`` because that subdir
+has its own dedicated test (``test_corpus.py``) with byte-exact expected
+outputs.
+"""
+
+from __future__ import annotations
+
+import io
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from src.core.analyze import Finding, analyze
+from src.core.io import detect_delimiter, detect_encoding, repair_bytes
+from src.core.text_clean import clean_dataframe
+
+
+TEST_CASES_DIR = Path(__file__).resolve().parent.parent / "test-cases"
+
+# Subdirectories in test-cases/ that are exercised by their own dedicated
+# tests. The sweep ignores these so we don't double-test or fight expected
+# byte-exact outputs.
+_EXCLUDED_SUBDIRS = {"text-cleaner-corpus"}
+
+# File suffixes we know how to load.
+_SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xlsx", ".xls"}
+
+
+def _discover_fixtures() -> list[Path]:
+    """Return every fixture file under test-cases/ that the sweep should run.
+
+    Walks one level deep — CSV/XLSX directly inside test-cases/ are picked
+    up; files in excluded subdirectories are not.
+    """
+    if not TEST_CASES_DIR.is_dir():
+        return []
+    out: list[Path] = []
+    for entry in sorted(TEST_CASES_DIR.iterdir()):
+        if entry.is_dir():
+            if entry.name in _EXCLUDED_SUBDIRS:
+                continue
+            for sub in sorted(entry.rglob("*")):
+                if sub.is_file() and sub.suffix.lower() in _SUPPORTED_SUFFIXES:
+                    out.append(sub)
+            continue
+        if entry.is_file() and entry.suffix.lower() in _SUPPORTED_SUFFIXES:
+            out.append(entry)
+    return out
+
+
+_FIXTURES = _discover_fixtures()
+
+
+def _fixture_id(path: Path) -> str:
+    """Pretty pytest id derived from the filename, keeping subdirs visible."""
+    rel = path.relative_to(TEST_CASES_DIR)
+    return str(rel)
+
+
+# Skip the entire module gracefully when no fixtures are present, instead of
+# emitting a "no tests collected" failure.
+pytestmark = [
+    pytest.mark.fixture_sweep,
+    pytest.mark.skipif(
+        not _FIXTURES,
+        reason="no fixtures found under test-cases/ — drop a CSV/XLSX in to enable the sweep",
+    ),
+]
+
+
+def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
+    """Read *path* with the same robust pipeline analyze() uses.
+
+    Returns ``(df, repair_result)`` where repair_result is None for Excel.
+    """
+    suffix = path.suffix.lower()
+    if suffix in (".xlsx", ".xls"):
+        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine="openpyxl")
+        return df, None
+    enc = detect_encoding(path)
+    delim = detect_delimiter(path, enc)
+    raw = path.read_bytes()
+    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
+    df = pd.read_csv(
+        io.BytesIO(repair.repaired_bytes),
+        encoding="utf-8", delimiter=delim,
+        dtype=str, keep_default_na=False, on_bad_lines="warn",
+    )
+    return df, repair
+
+
+@pytest.mark.parametrize("fixture", _FIXTURES, ids=[_fixture_id(p) for p in _FIXTURES])
+class TestFixtureSweep:
+    """Smoke tests that every fixture in ``test-cases/`` must pass."""
+
+    def test_repair_and_load(self, fixture: Path) -> None:
+        df, _ = _read_with_repair(fixture)
+        assert isinstance(df, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
+        assert len(df.columns) >= 1, f"{fixture.name}: zero columns after parse"
+
+    def test_analyze_runs(self, fixture: Path) -> None:
+        df, repair = _read_with_repair(fixture)
+        findings = analyze(df, repair_result=repair)
+        assert isinstance(findings, list)
+        for f in findings:
+            assert isinstance(f, Finding), (
+                f"{fixture.name}: analyze() returned a non-Finding ({type(f)})"
+            )
+
+    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
+        df, _ = _read_with_repair(fixture)
+        before_rows = len(df)
+        before_cols = len(df.columns)
+        result = clean_dataframe(df)
+        assert len(result.cleaned_df) == before_rows, (
+            f"{fixture.name}: row count changed "
+            f"({before_rows} -> {len(result.cleaned_df)})"
+        )
+        assert len(result.cleaned_df.columns) == before_cols, (
+            f"{fixture.name}: column count changed "
+            f"({before_cols} -> {len(result.cleaned_df.columns)})"
+        )
+
+    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
+        df, _ = _read_with_repair(fixture)
+        once = clean_dataframe(df).cleaned_df.reset_index(drop=True)
+        twice = clean_dataframe(once).cleaned_df.reset_index(drop=True)
+        assert once.equals(twice), (
+            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
+        )
+
+
+def test_at_least_one_fixture_present() -> None:
+    """Smoke check: every project should ship at least one fixture so the
+    sweep is not silently skipped on a clean checkout. Adjust the threshold
+    only if intentionally moving fixtures elsewhere."""
+    assert len(_FIXTURES) > 0, (
+        "No fixtures found under test-cases/. "
+        "Drop a CSV or XLSX file into the directory and re-run."
+    )
diff --git a/tests/test_gap_coverage.py b/tests/test_gap_coverage.py
new file mode 100644
index 0000000..079652a
--- /dev/null
+++ b/tests/test_gap_coverage.py
@@ -0,0 +1,161 @@
+"""Tests added to close gaps surfaced by the test audit.
+
+These cover edges that existing suites missed:
+
+- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
+- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
+- ``analyze()`` over a path-based Excel file.
+- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
+- ``findings_by_tool`` on an empty list.
+- BOM that appears mid-cell rather than at file start.
+
+The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
+§4.17) is *not yet implemented* and is captured here as a known-gap xfail
+so it's surfaced rather than silently missing.
+"""
+
+from __future__ import annotations
+
+import io
+
+import pandas as pd
+import pytest
+
+from src.core.analyze import analyze, findings_by_tool
+from src.core.io import RepairAction, repair_bytes
+from src.core.text_clean import CleanOptions, clean_dataframe
+
+
+# ---------------------------------------------------------------------------
+# clean_headers toggle
+# ---------------------------------------------------------------------------
+
+class TestCleanHeadersToggle:
+    def test_default_cleans_headers(self):
+        df = pd.DataFrame({"  id  ": [1], "Email​": ["a@b.com"]})
+        result = clean_dataframe(df)
+        assert list(result.cleaned_df.columns) == ["id", "Email"]
+
+    def test_disable_preserves_dirty_headers(self):
+        df = pd.DataFrame({"  id  ": [1], "Email​": ["a@b.com"]})
+        result = clean_dataframe(df, CleanOptions(clean_headers=False))
+        assert list(result.cleaned_df.columns) == ["  id  ", "Email​"]
+
+    def test_disable_still_cleans_data_cells(self):
+        df = pd.DataFrame({"name": ["  Alice  ", "Bob "]})
+        result = clean_dataframe(df, CleanOptions(clean_headers=False))
+        assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"]
+
+
+# ---------------------------------------------------------------------------
+# repair_bytes — non-comma delimiters and combined fixes
+# ---------------------------------------------------------------------------
+
+class TestRepairBytesDelimiters:
+    def test_tab_delimited_smart_quote_fold(self):
+        raw = "id\tnote\n1\t“hi”\n".encode("utf-8")
+        result = repair_bytes(raw, delimiter="\t")
+        text = result.repaired_bytes.decode("utf-8")
+        assert "“" not in text and "”" not in text
+        assert "\t" in text  # delimiter preserved
+
+    def test_semicolon_delimited_unrepairable_extras(self):
+        raw = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
+        result = repair_bytes(raw, delimiter=";")
+        # Extra-field row with no clear merge candidate is logged unrepairable.
+        assert 3 in result.unrepairable_lines
+
+
+class TestRepairBytesCombinedFixes:
+    def test_bom_plus_nul_plus_smart_quotes(self):
+        raw = (
+            b"\xef\xbb\xbf"
+            b"id,note\n"
+            b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n"
+        )
+        result = repair_bytes(raw)
+        kinds = {a.kind for a in result.actions}
+        assert {"strip_bom", "strip_nul", "fold_smart_quote"} <= kinds
+        # Resulting bytes parse cleanly.
+        df = pd.read_csv(io.BytesIO(result.repaired_bytes))
+        assert df.iloc[0]["note"] == 'Hello "world"'
+
+
+# ---------------------------------------------------------------------------
+# analyze() — path-based Excel and large-sample edges
+# ---------------------------------------------------------------------------
+
+class TestAnalyzeXlsxPath:
+    def test_excel_path_runs_without_repair(self, tmp_path):
+        path = tmp_path / "small.xlsx"
+        df = pd.DataFrame({
+            "id": ["1", "2"],
+            "name": ["  Alice  ", "Bob"],   # padding in xlsx
+        })
+        df.to_excel(path, index=False, engine="openpyxl")
+        findings = analyze(path)
+        ids = {f.id for f in findings}
+        assert "whitespace_padding" in ids
+        # Excel skips csv_* findings — no pre-parse repair on xlsx.
+        assert not any(i.startswith("csv_") for i in ids)
+
+
+class TestAnalyzeSampleRowsEdge:
+    def test_sample_rows_larger_than_df(self):
+        df = pd.DataFrame({"x": ["  pad  ", "clean"]})
+        # sample_rows=1000 but df has only 2 rows; must not crash.
+        findings = analyze(df, sample_rows=1000)
+        assert any(f.id == "whitespace_padding" for f in findings)
+
+
+class TestAnalyzeMidCellBom:
+    def test_bom_inside_cell_treated_as_zero_width(self):
+        df = pd.DataFrame({"name": ["Hel﻿lo"]})
+        findings = analyze(df)
+        assert any(f.id == "zero_width_or_invisible" for f in findings)
+
+
+# ---------------------------------------------------------------------------
+# findings_by_tool — edge cases
+# ---------------------------------------------------------------------------
+
+class TestFindingsByToolEdges:
+    def test_empty_list_returns_empty_dict(self):
+        assert findings_by_tool([]) == {}
+
+    def test_only_toolless_findings_returns_empty_dict(self):
+        from src.core.analyze import Finding
+        # Construct a Finding with no tool — like csv_unrepairable_rows.
+        f = Finding(
+            id="x", severity="info", tool="", count=1,
+            description="d",
+        )
+        assert findings_by_tool([f]) == {}
+
+
+# ---------------------------------------------------------------------------
+# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
+# ---------------------------------------------------------------------------
+
+class TestNumericPhoneWhitespaceGap:
+    """Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
+    whitespace in cells that look numeric, dated, or phone-shaped.
+
+    Currently unconditional. Marked xfail so the suite tracks the gap
+    without silently allowing regressions on the cells that *do* get
+    correctly collapsed.
+    """
+
+    @pytest.mark.xfail(
+        reason=(
+            "Heuristic not yet implemented — collapse_whitespace runs on every "
+            "string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
+        ),
+        strict=True,
+    )
+    def test_phone_internal_double_space_preserved(self):
+        df = pd.DataFrame({"phone": ["(555)  123-4567"]})  # double space inside
+        result = clean_dataframe(df)
+        # Spec requires the double space to survive because the cell looks
+        # phone-shaped. Today the cleaner collapses it.
+        assert result.cleaned_df.iloc[0]["phone"] == "(555)  123-4567"
diff --git a/tests/test_install.py b/tests/test_install.py
new file mode 100644
index 0000000..9ed9fc8
--- /dev/null
+++ b/tests/test_install.py
@@ -0,0 +1,173 @@
+"""Install / dependency / entry-point sanity tests.
+
+These tests answer the question: "after running ``pip install -r
+requirements.txt`` on a fresh machine, can the user actually use this
+project?" They run on every supported platform — the asserts touch only
+public APIs and CLI ``--help`` exits, never any platform-specific paths.
+
+If a future dependency upgrade or refactor breaks an import that's used by
+the CLI or the GUI, these tests catch it before the rest of the suite even
+gets a chance to run.
+"""
+
+from __future__ import annotations
+
+import importlib
+import subprocess
+import sys
+
+import pytest
+
+pytestmark = pytest.mark.install
+
+
+# ---------------------------------------------------------------------------
+# Required dependencies
+# ---------------------------------------------------------------------------
+
+# Top-level packages that must import cleanly. If any of these fails, the
+# user's install is broken — fail loudly with the offender's name.
+_REQUIRED_DEPS = [
+    "pandas",
+    "numpy",
+    "openpyxl",
+    "rapidfuzz",
+    "charset_normalizer",
+    "loguru",
+    "tqdm",
+    "typer",
+    "phonenumbers",
+    "streamlit",
+]
+
+
+@pytest.mark.parametrize("module", _REQUIRED_DEPS)
+def test_required_dependency_imports(module: str) -> None:
+    importlib.import_module(module)
+
+
+# ---------------------------------------------------------------------------
+# Project package imports
+# ---------------------------------------------------------------------------
+
+_PROJECT_MODULES = [
+    "src",
+    "src.core",
+    "src.core.io",
+    "src.core.text_clean",
+    "src.core.dedup",
+    "src.core.normalizers",
+    "src.core.analyze",
+    "src.core.config",
+    "src.cli",
+    "src.cli_text_clean",
+    "src.cli_analyze",
+    "src.gui.components",
+]
+
+
+@pytest.mark.parametrize("module", _PROJECT_MODULES)
+def test_project_module_imports(module: str) -> None:
+    importlib.import_module(module)
+
+
+# ---------------------------------------------------------------------------
+# Public API surface
+# ---------------------------------------------------------------------------
+
+def test_core_public_api_present() -> None:
+    """Spot-check the symbols re-exported via ``src.core``.
+
+    Catches an accidental rename or drop in ``src/core/__init__.py``.
+    """
+    import src.core as core
+
+    expected = [
+        # I/O
+        "read_file", "write_file", "list_sheets",
+        "detect_encoding", "detect_delimiter", "detect_header_row",
+        "read_csv_repaired", "repair_bytes",
+        "RepairAction", "RepairResult",
+        # Analyzer
+        "Finding", "analyze", "findings_by_tool", "to_dict",
+        # Text cleaner
+        "CleanOptions", "CleanResult", "clean_dataframe", "clean_value",
+        "smart_title_case", "sentence_case", "apply_case",
+        # Dedup
+        "deduplicate", "build_default_strategies",
+        "Algorithm", "SurvivorRule", "MatchStrategy", "MatchResult",
+        "DeduplicationResult",
+        # Normalizers
+        "normalize_email", "normalize_phone", "normalize_name",
+        "normalize_address", "normalize_string", "get_normalizer",
+        "NormalizerType",
+    ]
+    missing = [name for name in expected if not hasattr(core, name)]
+    assert not missing, f"src.core is missing public symbols: {missing}"
+
+
+# ---------------------------------------------------------------------------
+# CLI entry points
+# ---------------------------------------------------------------------------
+
+def _cli_help(module: str) -> subprocess.CompletedProcess:
+    """Run ``python -m <module> --help`` and return the CompletedProcess.
+
+    Captures both stdout and stderr so tests can inspect either; uses a
+    short timeout so a hung CLI fails fast on CI.
+    """
+    return subprocess.run(
+        [sys.executable, "-m", module, "--help"],
+        capture_output=True, text=True, timeout=30,
+    )
+
+
+@pytest.mark.parametrize("cli_module", [
+    "src.cli",
+    "src.cli_text_clean",
+    "src.cli_analyze",
+])
+def test_cli_help_exits_zero(cli_module: str) -> None:
+    proc = _cli_help(cli_module)
+    assert proc.returncode == 0, (
+        f"{cli_module} --help exited {proc.returncode}.\n"
+        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
+    )
+    # Help output must mention the command name or at least include "Usage:".
+    combined = (proc.stdout + proc.stderr).lower()
+    assert "usage" in combined, (
+        f"{cli_module} --help did not produce a Usage line"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Streamlit GUI entry point
+# ---------------------------------------------------------------------------
+
+def test_streamlit_app_module_compiles() -> None:
+    """Ensure ``src/gui/app.py`` is at least syntactically valid Python.
+
+    A full Streamlit launch is too heavy for the install layer; that's
+    covered by the e2e suite.
+    """
+    import ast
+    from pathlib import Path
+
+    app_path = Path(__file__).resolve().parent.parent / "src" / "gui" / "app.py"
+    assert app_path.exists(), f"missing {app_path}"
+    ast.parse(app_path.read_text(encoding="utf-8"))
+
+
+# ---------------------------------------------------------------------------
+# Test runner sanity
+# ---------------------------------------------------------------------------
+
+def test_run_tests_help_works() -> None:
+    """``python run_tests.py --help`` should describe the available flags."""
+    proc = subprocess.run(
+        [sys.executable, "run_tests.py", "--help"],
+        capture_output=True, text=True, timeout=30,
+    )
+    assert proc.returncode == 0
+    assert "--tool" in proc.stdout
+    assert "--fixtures" in proc.stdout
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..4ad8a36
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,67 @@
+; Cross-platform test automation for DataTools.
+;
+; Drives the pytest suite under multiple Python versions on Linux, macOS,
+; and Windows. Use:
+;
+;   tox                    # all envs
+;   tox -e py312           # one Python version
+;   tox -e e2e             # CLI smoke tests
+;   tox -e install         # import / dependency sanity
+;   tox -e lint            # static checks (mypy / ruff if installed)
+;   tox -e coverage        # full suite with coverage report
+;
+; Adding a new fixture: drop the CSV/XLSX into test-cases/ and re-run.
+; tests/test_fixtures_sweep.py picks new files up automatically.
+
+[tox]
+envlist = py310, py311, py312, py313, install, e2e
+skip_missing_interpreters = true
+isolated_build = false
+
+[testenv]
+description = Run the full pytest suite under {envname}.
+deps =
+    -r requirements.txt
+    -r requirements-dev.txt
+commands =
+    python run_tests.py {posargs}
+passenv =
+    HOME
+    USER
+    LANG
+    LC_ALL
+    PATH
+setenv =
+    PYTHONIOENCODING = utf-8
+    PYTHONUTF8 = 1
+
+[testenv:install]
+description = Verify imports and CLI entry points work after a fresh install.
+commands =
+    python run_tests.py --install -v
+
+[testenv:e2e]
+description = End-to-end CLI smoke tests against real fixtures.
+commands =
+    python run_tests.py --e2e -v
+
+[testenv:fixtures]
+description = Sweep test-cases/ for any newly-dropped fixtures.
+commands =
+    python run_tests.py --fixtures -v
+
+[testenv:coverage]
+description = Full suite with coverage report.
+commands =
+    python run_tests.py --coverage
+
+[testenv:lint]
+description = Static checks (run only if the optional tools are installed).
+deps =
+    -r requirements.txt
+    ruff>=0.5; python_version >= "3.10"
+    mypy>=1.10; python_version >= "3.10"
+allowlist_externals = sh
+commands =
+    sh -c "command -v ruff && ruff check src/ tests/ || echo 'ruff not installed; skipping'"
+    sh -c "command -v mypy && mypy src/ || echo 'mypy not installed; skipping'"