Adds a top-level test infrastructure layer addressing four needs at once:
a single command to run anything, cross-platform automation, install/e2e
sanity, and zero-config pickup of new fixtures dropped into test-cases/.
Top-level runner — run_tests.py
python run_tests.py # everything (default)
python run_tests.py --tool dedup # one tool's tests
python run_tests.py --unit # category scopes
python run_tests.py --e2e # end-to-end CLI
python run_tests.py --install # import / dependency sanity
python run_tests.py --fixtures # corpus + dropped-file sweep
python run_tests.py --coverage # term-missing report
python run_tests.py --quick # skip @pytest.mark.slow
Tools: analyze, cli, config, dedup, io, normalizers, text_clean.
Cross-platform — tox.ini
Envs for py310-py313 plus install / e2e / fixtures / coverage / lint.
Forces UTF-8 (PYTHONUTF8=1, PYTHONIOENCODING=utf-8) so identical fixture
bytes parse the same on Linux/macOS/Windows.
Shared config — pytest.ini
testpaths, python_files conventions, custom markers (slow, e2e, install,
fixture_sweep), warning filters that fail on our own DeprecationWarnings
while tolerating third-party ones.
New test layers
tests/test_install.py — required deps import; project modules import;
src.core public API surface; CLI --help exits 0; streamlit app.py
parses as valid Python; run_tests.py --help works.
tests/test_e2e.py — CLI roundtrips: cli_analyze table + JSON, cli_text_clean
--apply writes a real file with NBSP/smart-quote folded, dedup CLI
removes duplicates, run_tests.py self-tests.
tests/test_fixtures_sweep.py — parametrizes over every CSV/TSV/XLSX
inside test-cases/ (excluding text-cleaner-corpus/, which has its own
suite). Each fixture must: load through repair_bytes, run analyze()
cleanly, and survive clean_dataframe() with row/col counts unchanged
plus idempotency. Drop a CSV in, re-run — no test code changes needed.
tests/test_gap_coverage.py — closes audit gaps: clean_headers=False
toggle, repair_bytes with tab/semicolon delimiters, BOM+NUL+smart-
quote combined-fix scenario, analyze() over an XLSX path, sample_rows
larger than the DataFrame, mid-cell BOM, findings_by_tool edges, plus
a strict xfail documenting the known §4.17 numeric/phone whitespace
heuristic gap.
Test count
Before: 288 passed + 1 xfailed
After: 475 passed + 2 xfailed (the second xfail is the documented
collapse_whitespace gap on phone-shaped cells; spec §4.17 calls
for a heuristic that hasn't been implemented yet).
Functional gaps surfaced (not fixed in this commit):
- Text cleaner: collapse_whitespace runs unconditionally on every string
cell, including phone/numeric/date-shaped ones. Spec §4.17 requires a
skip heuristic. Captured as strict xfail so the gap stays visible.
- io.read_file does not run pre-parse repair; only analyze() and direct
callers of read_csv_repaired() get it. CLI tool pages and the dedup
CLI miss the safety net.
- Analyzer has no mixed_line_endings detector or near_duplicate_rows
detector; both planned but require additional plumbing.
- GUI tool pages each have their own uploader instead of picking up the
home-page upload through session_state.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
178 lines
5.9 KiB
Python
Executable File
178 lines
5.9 KiB
Python
Executable File
#!/usr/bin/env python3
"""DataTools test runner — single entry point with category flags.

Examples
--------
Run everything (default)::

    python run_tests.py

Run a single tool's tests::

    python run_tests.py --tool dedup
    python run_tests.py --tool text_clean
    python run_tests.py --tool analyze
    python run_tests.py --tool io
    python run_tests.py --tool cli

Categories::

    python run_tests.py --unit          # everything under tests/ (same scope as the default)
    python run_tests.py --e2e           # end-to-end smoke tests
    python run_tests.py --install       # install / dependency sanity
    python run_tests.py --fixtures      # corpus + dropped-file sweep
    python run_tests.py --coverage      # add a coverage report
    python run_tests.py --quick         # skip @pytest.mark.slow
    python run_tests.py -v / --verbose  # verbose pytest output

Multiple flags compose. ``--tool X --quick`` runs that tool's quick tests.

Dropping a new fixture into ``test-cases/`` is automatic: the fixture sweep
test (``tests/test_fixtures_sweep.py``) parametrizes over every CSV/XLSX in
that directory (excluding ``text-cleaner-corpus/`` which has its own suite).
"""

from __future__ import annotations

import argparse
import importlib.util
import shutil
import subprocess
import sys
from pathlib import Path

# Absolute directory that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parent

# Tool name -> pytest ``-k`` expression. ``run_tests.py --tool foo`` passes
# ``-k <expression mapped to foo>`` to pytest, so keep the expressions aligned
# with the test_*.py filenames.
_TOOL_MAP: dict[str, str] = {
    "dedup": "test_dedup or test_cli.py",
    "text_clean": "test_text_clean or test_cli_text_clean or test_corpus",
    "analyze": "test_analyze or test_cli_analyze",
    "io": "test_io",
    "cli": "test_cli or test_cli_text_clean or test_cli_analyze",
    "config": "test_config",
    "normalizers": "test_normalizers",
}

# Category flag -> paths handed to pytest when that flag is set.
_CATEGORY_PATHS: dict[str, list[str]] = {
    "unit": ["tests/"],  # all tests are unit unless marked otherwise
    "e2e": ["tests/test_e2e.py"],
    "install": ["tests/test_install.py"],
    "fixtures": ["tests/test_corpus.py", "tests/test_fixtures_sweep.py"],
}


def _build_pytest_args(args: argparse.Namespace) -> list[str]:
    """Translate parsed runner flags into a ``python -m pytest`` command line.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``verbose``, ``coverage``, ``quick``, ``tool``, ``unit``, ``e2e``,
            ``install``, ``fixtures`` and ``path``.

    Returns:
        The full argument vector, starting with ``sys.executable -m pytest``.

    Raises:
        SystemExit: With status 2 when ``args.tool`` is not a key of
            ``_TOOL_MAP``.
    """
    cmd: list[str] = [sys.executable, "-m", "pytest"]

    # Verbosity
    cmd.append("-vv" if args.verbose else "-q")

    # Coverage
    if args.coverage:
        cmd.extend(["--cov=src", "--cov-report=term-missing"])

    # Quick: skip anything marked slow.
    if args.quick:
        cmd.extend(["-m", "not slow"])

    # Tool filter via -k expression.
    if args.tool:
        if args.tool not in _TOOL_MAP:
            print(
                f"unknown --tool '{args.tool}'. "
                f"available: {', '.join(sorted(_TOOL_MAP))}",
                file=sys.stderr,
            )
            sys.exit(2)
        cmd.extend(["-k", _TOOL_MAP[args.tool]])

    # Path selection. Explicit positional paths win over category flags, as
    # the ``path`` help text promises ("override category"). Otherwise the
    # requested categories (--unit/--e2e/--install/--fixtures) are OR'd by
    # passing all of their paths; with nothing requested, run all of tests/.
    selected_categories = [
        c for c in ("unit", "e2e", "install", "fixtures")
        if getattr(args, c)
    ]
    if args.path:
        paths = list(args.path)
    elif selected_categories:
        paths = [p for cat in selected_categories for p in _CATEGORY_PATHS[cat]]
    else:
        paths = ["tests/"]

    cmd.extend(paths)
    return cmd


def main(argv: list[str] | None = None) -> int:
    """Parse runner flags, launch pytest from the project root, report status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        The pytest subprocess's return code, or 2 when pytest is not
        importable by the current interpreter.

    Raises:
        SystemExit: Raised by argparse on ``--help`` or invalid arguments,
            and by ``_build_pytest_args`` for an unknown ``--tool``.
    """
    parser = argparse.ArgumentParser(
        prog="run_tests.py",
        description=(
            "DataTools test runner. With no flags runs every test. Use "
            "--tool to scope to one tool, --unit/--e2e/--install/--fixtures "
            "to scope by category. Combine flags freely."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Available tools: " + ", ".join(sorted(_TOOL_MAP)) + "\n\n"
            "To add a new fixture-driven test: drop a CSV or XLSX into "
            "test-cases/ and re-run. tests/test_fixtures_sweep.py picks up "
            "new files automatically — no test code changes required."
        ),
    )
    parser.add_argument("--tool", help="Limit tests to one tool.")
    parser.add_argument("--unit", action="store_true",
                        help="Unit tests only (default scope).")
    parser.add_argument("--e2e", action="store_true",
                        help="End-to-end CLI/integration smoke tests.")
    parser.add_argument("--install", action="store_true",
                        help="Install / import / entry-point sanity tests.")
    parser.add_argument("--fixtures", action="store_true",
                        help="Run the corpus + dropped-fixture sweep.")
    parser.add_argument("--coverage", action="store_true",
                        help="Emit a coverage report (term-missing).")
    parser.add_argument("--quick", action="store_true",
                        help="Skip tests marked @pytest.mark.slow.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose pytest output.")
    parser.add_argument("path", nargs="*",
                        help="Optional explicit test paths (override category).")

    args = parser.parse_args(argv)

    # pytest is always launched with the project root as its working
    # directory so relative imports / paths work; announce that when the
    # caller is somewhere else. Flushed so the line precedes the child's
    # output when stdout is piped.
    if Path.cwd() != PROJECT_ROOT:
        print(f"running from {PROJECT_ROOT}", flush=True)

    # The command below is ``sys.executable -m pytest``, so the only thing
    # that matters is whether *this* interpreter can import pytest. A
    # ``pytest`` executable on PATH may belong to a different Python.
    if not _python_has_pytest():
        print(
            "pytest is not installed. Install dev deps:\n"
            " pip install -r requirements-dev.txt",
            file=sys.stderr,
        )
        return 2

    cmd = _build_pytest_args(args)
    if args.verbose:
        print("→", " ".join(cmd), flush=True)
    proc = subprocess.run(cmd, cwd=PROJECT_ROOT)
    return proc.returncode


def _python_has_pytest() -> bool:
    """Return True when the current interpreter can locate the pytest package.

    Uses ``importlib.util.find_spec`` so pytest is only looked up, not
    imported into the runner process.
    """
    try:
        return importlib.util.find_spec("pytest") is not None
    except (ImportError, ValueError):
        # find_spec raises ValueError when an already-imported module has no
        # usable __spec__; treat any lookup failure as "not available".
        return False


# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())