Files
datatools-dev/run_tests.py
Michael 82d7fef21e feat(gate): CSV-normalization gate with confidence-tiered findings
Adds a Review & Normalize page that sits between upload and every tool
page. The analyzer now tags each finding with confidence (high/medium/low)
and a fix_action; the gate auto-applies high-confidence fixes, surfaces
medium/low ones for user review, and blocks tool pages on error-level
findings until resolved or waived.

Core (src/core/):
  - analyze.py: Finding gains confidence, fix_action, pre_applied; new
    detectors for encoding_uncertain, encoding_decode_failed; new top-
    level encoding_override parameter.
  - fixes.py: registry of fix algorithms keyed by fix_action id.
  - normalize.py: auto_fix(), apply_decisions(), is_normalized(), and
    the NormalizationResult / Decision dataclasses the gate consumes.
  - io.py: detect_encoding tries strict UTF-8 first; repair_bytes now
    transcodes UTF-16/32 to UTF-8 before NUL-strip (fixes UTF-16 corruption)
    and normalizes line endings (fixes bare-CR parser crash); empty file
    handled gracefully instead of EmptyDataError traceback.

GUI (src/gui/):
  - pages/0_Review.py: gate page with per-finding decision controls,
    encoding override picker (16 codepages + custom), and Advanced output
    options (encoding, delimiter, line terminator) on the download.
  - components.py: require_normalization_gate() helper.
  - pages/1-9: gate guard wired on every tool page.

Test corpora:
  - test-cases/encodings-corpus/: 31 encoded CSV fixtures + 9 reference
    UTF-8 files + manifest, synced from Business/DataTools.
  - test-cases/text-cleaner-corpus/test_data/17: synced malformed input
    (unquoted $1,500.00) for the unquoted-delimiter detector.

Tests (94 new):
  - test_normalize.py (48): finding fields, fix registry, auto_fix scope,
    decision paths, gate idempotency, output-options helper.
  - test_encodings_corpus.py (90, 16 xfailed): parametric detection +
    decode + analyzer-no-crash sweep against the manifest.
  - test_analyze.py: encoding override + encoding_uncertain detectors.
  - test_corpus.py: pre-parse repair in the strict reader.

run_tests.py: new aliases --tool normalize, --tool encodings, --tool gate;
encodings corpus added to --fixtures category.

Docs: USER-GUIDE §3.3 covers the gate workflow, encoding override, and
output options; TECHNICAL §10.2.1-10.2.4 documents the analyzer schema,
gate API, Review page, and pre-parse repair pipeline; CLI-REFERENCE adds
the analyzer JSON schema with the new fields; README links to all of it.

Suite: 765 passed, 17 xfailed (was 458 passed).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 20:35:27 +00:00

185 lines
6.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""DataTools test runner — single entry point with category flags.
Examples
--------
Run everything (default)::
python run_tests.py
Run a single tool's tests::
python run_tests.py --tool dedup
python run_tests.py --tool text_clean
python run_tests.py --tool analyze
python run_tests.py --tool io
python run_tests.py --tool cli
Categories::
python run_tests.py --unit # unit tests only (no e2e, no install)
python run_tests.py --e2e # end-to-end smoke tests
python run_tests.py --install # install / dependency sanity
python run_tests.py --fixtures # corpus + dropped-file sweep
python run_tests.py --coverage # add a coverage report
python run_tests.py --quick # skip @pytest.mark.slow
python run_tests.py -v / --verbose # verbose pytest output
Multiple flags compose. ``--tool X --quick`` runs that tool's quick tests.
Dropping a new fixture into ``test-cases/`` is automatic: the fixture sweep
test (``tests/test_fixtures_sweep.py``) parametrizes over every CSV/XLSX in
that directory (excluding ``text-cleaner-corpus/`` which has its own suite).
"""
from __future__ import annotations
import argparse
import shutil
import subprocess
import sys
from pathlib import Path
# Directory that holds this script; pytest is launched with it as the
# working directory (see ``main``).
PROJECT_ROOT = Path(__file__).resolve().parent

# ``--tool NAME`` -> the ``-k`` expression handed to pytest for that tool.
# Each expression names the test_*.py modules belonging to the tool, so keep
# it aligned with the filenames under tests/. Several names are aliases that
# resolve to overlapping (or identical) expressions.
_TOOL_MAP: dict[str, str] = {
    # Individual tools.
    "dedup": "test_dedup or test_cli.py",
    "text_clean": "test_text_clean or test_cli_text_clean or test_corpus",
    "analyze": "test_analyze or test_cli_analyze",
    "io": "test_io",
    "cli": "test_cli or test_cli_text_clean or test_cli_analyze",
    "config": "test_config",
    "normalizers": "test_normalizers",
    # Normalization gate and encoding handling.
    "normalize": "test_normalize",
    "encodings": "test_encodings_corpus or test_io",
    "gate": "test_normalize",
}
# Category flag (--unit / --e2e / --install / --fixtures) -> the pytest
# path arguments that flag contributes to the command line.
_CATEGORY_PATHS: dict[str, list[str]] = {
    # "unit" points at the whole tests/ directory: every test counts as a
    # unit test unless it is marked otherwise.
    "unit": ["tests/"],
    "e2e": ["tests/test_e2e.py"],
    "install": ["tests/test_install.py"],
    # Corpus-driven suites plus the sweep over dropped fixture files.
    "fixtures": [
        "tests/test_corpus.py",
        "tests/test_fixtures_sweep.py",
        "tests/test_encodings_corpus.py",
    ],
}
def _build_pytest_args(args: argparse.Namespace) -> list[str]:
    """Translate the parsed runner flags into a full pytest command line.

    Parameters
    ----------
    args:
        Namespace produced by the parser in ``main``. The attributes read
        here are ``verbose``, ``coverage``, ``quick``, ``tool``, ``path``
        and the category switches ``unit``/``e2e``/``install``/``fixtures``.

    Returns
    -------
    list[str]
        ``[sys.executable, "-m", "pytest", ...]`` followed by the option
        flags and finally the test paths.

    Notes
    -----
    An unknown ``--tool`` value prints the available names to stderr and
    terminates the process with exit status 2 via ``sys.exit``.
    """
    cmd: list[str] = [sys.executable, "-m", "pytest"]

    # Verbosity: exactly one of -vv / -q is always passed.
    cmd.append("-vv" if args.verbose else "-q")

    # Coverage
    if args.coverage:
        cmd.extend(["--cov=src", "--cov-report=term-missing"])

    # Quick: skip anything marked slow.
    if args.quick:
        cmd.extend(["-m", "not slow"])

    # Tool filter via -k expression.
    if args.tool:
        if args.tool not in _TOOL_MAP:
            print(
                f"unknown --tool '{args.tool}'. "
                f"available: {', '.join(sorted(_TOOL_MAP))}",
                file=sys.stderr,
            )
            sys.exit(2)
        cmd.extend(["-k", _TOOL_MAP[args.tool]])

    # Path selection. Explicit positional paths win over category flags, as
    # the CLI help promises ("override category"); previously they were
    # silently dropped whenever any category flag was present.
    if args.path:
        paths: list[str] = list(args.path)
    else:
        # Several categories are OR'd together by passing all their paths.
        paths = [
            path
            for category in ("unit", "e2e", "install", "fixtures")
            if getattr(args, category)
            for path in _CATEGORY_PATHS[category]
        ]
        if not paths:
            # Nothing requested: run the whole suite.
            paths = ["tests/"]

    cmd.extend(paths)
    return cmd
def main(argv: list[str] | None = None) -> int:
    """Parse the runner flags, launch pytest from the project root, and
    report its exit status.

    Parameters
    ----------
    argv:
        Argument list to parse. ``None`` lets argparse fall back to the
        process command line.

    Returns
    -------
    int
        The return code of the pytest subprocess, or ``2`` when pytest
        cannot be imported by the current interpreter.

    Notes
    -----
    An unknown ``--tool`` value ends the process inside
    ``_build_pytest_args`` (exit status 2) instead of returning here.
    """
    parser = argparse.ArgumentParser(
        prog="run_tests.py",
        description=(
            "DataTools test runner. With no flags runs every test. Use "
            "--tool to scope to one tool, --unit/--e2e/--install/--fixtures "
            "to scope by category. Combine flags freely."
        ),
        # Raw formatter keeps the explicit newlines in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Available tools: " + ", ".join(sorted(_TOOL_MAP)) + "\n\n"
            "To add a new fixture-driven test: drop a CSV or XLSX into "
            "test-cases/ and re-run. tests/test_fixtures_sweep.py picks up "
            "new files automatically — no test code changes required."
        ),
    )
    parser.add_argument("--tool", help="Limit tests to one tool.")
    parser.add_argument("--unit", action="store_true",
                        help="Unit tests only (default scope).")
    parser.add_argument("--e2e", action="store_true",
                        help="End-to-end CLI/integration smoke tests.")
    parser.add_argument("--install", action="store_true",
                        help="Install / import / entry-point sanity tests.")
    parser.add_argument("--fixtures", action="store_true",
                        help="Run the corpus + dropped-fixture sweep.")
    parser.add_argument("--coverage", action="store_true",
                        help="Emit a coverage report (term-missing).")
    parser.add_argument("--quick", action="store_true",
                        help="Skip tests marked @pytest.mark.slow.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose pytest output.")
    parser.add_argument("path", nargs="*",
                        help="Optional explicit test paths (override category).")
    args = parser.parse_args(argv)

    # Ensure we run from the project root so relative imports / paths work.
    cwd_target = PROJECT_ROOT
    if Path.cwd() != cwd_target:
        print(f"running from {cwd_target}")

    # The run below is ``sys.executable -m pytest``, so the only thing that
    # matters is whether *this* interpreter can import pytest. A ``pytest``
    # script found on PATH may belong to a different environment and must
    # not suppress the install hint.
    if not _python_has_pytest():
        print(
            "pytest is not installed. Install dev deps:\n"
            "  pip install -r requirements-dev.txt",
            file=sys.stderr,
        )
        return 2

    cmd = _build_pytest_args(args)
    if args.verbose:
        # Echo the exact command before handing control to pytest.
        print("", " ".join(cmd))
    proc = subprocess.run(cmd, cwd=cwd_target)
    return proc.returncode
def _python_has_pytest() -> bool:
    """Report whether the running interpreter can locate the pytest package.

    Returns
    -------
    bool
        ``True`` when an import spec for ``pytest`` is found, else ``False``.

    The lookup uses ``importlib.util.find_spec`` so pytest itself is not
    imported into the runner process just to answer the question.
    """
    import importlib.util

    try:
        return importlib.util.find_spec("pytest") is not None
    except (ImportError, ValueError):
        # find_spec raises ValueError when an already-loaded module carries
        # no usable __spec__; treat any lookup failure as "not available".
        return False
# Script entry point: run the CLI and hand its return code to the shell.
if __name__ == "__main__":
    raise SystemExit(main())