Files
datatools-dev/tests/test_install.py
Michael 4687cf87b4 test: single-command runner, cross-platform automation, fixture auto-discovery
Adds a top-level test infrastructure layer addressing four needs at once:
a single command to run anything, cross-platform automation, install/e2e
sanity, and zero-config pickup of new fixtures dropped into test-cases/.

Top-level runner — run_tests.py
  python run_tests.py                # everything (default)
  python run_tests.py --tool dedup   # one tool's tests
  python run_tests.py --unit         # category scopes
  python run_tests.py --e2e          # end-to-end CLI
  python run_tests.py --install      # import / dependency sanity
  python run_tests.py --fixtures     # corpus + dropped-file sweep
  python run_tests.py --coverage     # term-missing report
  python run_tests.py --quick        # skip @pytest.mark.slow
Tools: analyze, cli, config, dedup, io, normalizers, text_clean.

Cross-platform — tox.ini
  Envs for py310-py313 plus install / e2e / fixtures / coverage / lint.
  Forces UTF-8 (PYTHONUTF8=1, PYTHONIOENCODING=utf-8) so identical fixture
  bytes parse the same on Linux/macOS/Windows.

Shared config — pytest.ini
  testpaths, python_files conventions, custom markers (slow, e2e, install,
  fixture_sweep), warning filters that fail on our own DeprecationWarnings
  while tolerating third-party ones.

New test layers
  tests/test_install.py — required deps import; project modules import;
    src.core public API surface; CLI --help exits 0; streamlit app.py
    parses as valid Python; run_tests.py --help works.
  tests/test_e2e.py — CLI roundtrips: cli_analyze table + JSON, cli_text_clean
    --apply writes a real file with NBSP/smart-quote folded, dedup CLI
    removes duplicates, run_tests.py self-tests.
  tests/test_fixtures_sweep.py — parametrizes over every CSV/TSV/XLSX
    inside test-cases/ (excluding text-cleaner-corpus/, which has its own
    suite). Each fixture must: load through repair_bytes, run analyze()
    cleanly, and survive clean_dataframe() with row/col counts unchanged
    plus idempotency. Drop a CSV in, re-run — no test code changes needed.
  tests/test_gap_coverage.py — closes audit gaps: clean_headers=False
    toggle, repair_bytes with tab/semicolon delimiters, BOM+NUL+smart-
    quote combined-fix scenario, analyze() over an XLSX path, sample_rows
    larger than the DataFrame, mid-cell BOM, findings_by_tool edges, plus
    a strict xfail documenting the known §4.17 numeric/phone whitespace
    heuristic gap.

Test count
  Before: 288 passed + 1 xfailed
  After:  475 passed + 2 xfailed (the second xfail is the documented
          collapse_whitespace gap on phone-shaped cells; spec §4.17 calls
          for a heuristic that hasn't been implemented yet).

Functional gaps surfaced (not fixed in this commit):
  - Text cleaner: collapse_whitespace runs unconditionally on every string
    cell, including phone/numeric/date-shaped ones. Spec §4.17 requires a
    skip heuristic. Captured as strict xfail so the gap stays visible.
  - io.read_file does not run pre-parse repair; only analyze() and direct
    callers of read_csv_repaired() get it. CLI tool pages and the dedup
    CLI miss the safety net.
  - Analyzer has no mixed_line_endings detector or near_duplicate_rows
    detector; both planned but require additional plumbing.
  - GUI tool pages each have their own uploader instead of picking up the
    home-page upload through session_state.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:01:06 +00:00

174 lines
5.5 KiB
Python

"""Install / dependency / entry-point sanity tests.
These tests answer the question: "after running ``pip install -r
requirements.txt`` on a fresh machine, can the user actually use this
project?" They run on every supported platform — the asserts touch only
public APIs and CLI ``--help`` exits, never any platform-specific paths.
If a future dependency upgrade or refactor breaks an import that's used by
the CLI or the GUI, these tests catch it before the rest of the suite even
gets a chance to run.
"""
from __future__ import annotations
import importlib
import subprocess
import sys
import pytest
# Module-level ``pytestmark``: pytest applies the ``install`` marker to every
# test collected from this file.
pytestmark = pytest.mark.install
# ---------------------------------------------------------------------------
# Required dependencies
# ---------------------------------------------------------------------------
# Top-level packages that must import cleanly. If any of these fails, the
# user's install is broken — fail loudly with the offender's name.
_REQUIRED_DEPS = [
"pandas",
"numpy",
"openpyxl",
"rapidfuzz",
"charset_normalizer",
"loguru",
"tqdm",
"typer",
"phonenumbers",
"streamlit",
]
@pytest.mark.parametrize("module", _REQUIRED_DEPS)
def test_required_dependency_imports(module: str) -> None:
    """Import one required third-party package by its top-level name.

    There is no explicit assertion: any exception raised by the import
    propagates and fails the parametrized case for ``module``.
    """
    importlib.import_module(module)
# ---------------------------------------------------------------------------
# Project package imports
# ---------------------------------------------------------------------------
_PROJECT_MODULES = [
"src",
"src.core",
"src.core.io",
"src.core.text_clean",
"src.core.dedup",
"src.core.normalizers",
"src.core.analyze",
"src.core.config",
"src.cli",
"src.cli_text_clean",
"src.cli_analyze",
"src.gui.components",
]
@pytest.mark.parametrize("module", _PROJECT_MODULES)
def test_project_module_imports(module: str) -> None:
    """Import one of the project's own modules by dotted name.

    There is no explicit assertion: any exception raised while importing
    ``module`` propagates and fails that parametrized case.
    """
    importlib.import_module(module)
# ---------------------------------------------------------------------------
# Public API surface
# ---------------------------------------------------------------------------
def test_core_public_api_present() -> None:
    """Check that ``src.core`` still exposes the expected public names.

    Guards against a symbol being renamed or dropped from
    ``src/core/__init__.py``. Only attribute presence is checked, not the
    type or behaviour of each symbol.
    """
    import src.core as core

    # Grouped by area for readability; dict order is insertion order, so the
    # flattened sequence below matches a single flat list of these names.
    expected_by_area = {
        "I/O": [
            "read_file", "write_file", "list_sheets",
            "detect_encoding", "detect_delimiter", "detect_header_row",
            "read_csv_repaired", "repair_bytes",
            "RepairAction", "RepairResult",
        ],
        "Analyzer": [
            "Finding", "analyze", "findings_by_tool", "to_dict",
        ],
        "Text cleaner": [
            "CleanOptions", "CleanResult", "clean_dataframe", "clean_value",
            "smart_title_case", "sentence_case", "apply_case",
        ],
        "Dedup": [
            "deduplicate", "build_default_strategies",
            "Algorithm", "SurvivorRule", "MatchStrategy", "MatchResult",
            "DeduplicationResult",
        ],
        "Normalizers": [
            "normalize_email", "normalize_phone", "normalize_name",
            "normalize_address", "normalize_string", "get_normalizer",
            "NormalizerType",
        ],
    }
    missing = [
        symbol
        for symbols in expected_by_area.values()
        for symbol in symbols
        if not hasattr(core, symbol)
    ]
    assert not missing, f"src.core is missing public symbols: {missing}"
# ---------------------------------------------------------------------------
# CLI entry points
# ---------------------------------------------------------------------------
def _cli_help(module: str) -> subprocess.CompletedProcess:
"""Run ``python -m <module> --help`` and return the CompletedProcess.
Captures both stdout and stderr so tests can inspect either; uses a
short timeout so a hung CLI fails fast on CI.
"""
return subprocess.run(
[sys.executable, "-m", module, "--help"],
capture_output=True, text=True, timeout=30,
)
@pytest.mark.parametrize(
    "cli_module",
    ["src.cli", "src.cli_text_clean", "src.cli_analyze"],
)
def test_cli_help_exits_zero(cli_module: str) -> None:
    """Each CLI module must answer ``--help`` with exit code 0 and usage text.

    On failure the captured stdout and stderr are included in the assertion
    message so the cause is visible directly in the test report.
    """
    proc = _cli_help(cli_module)
    exit_report = (
        f"{cli_module} --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    assert proc.returncode == 0, exit_report

    # Case-insensitive search for "usage" across both streams; the command
    # name itself is not checked.
    help_text = (proc.stdout + proc.stderr).lower()
    usage_missing = f"{cli_module} --help did not produce a Usage line"
    assert "usage" in help_text, usage_missing
# ---------------------------------------------------------------------------
# Streamlit GUI entry point
# ---------------------------------------------------------------------------
def test_streamlit_app_module_compiles() -> None:
    """Ensure ``src/gui/app.py`` is at least syntactically valid Python.

    The file is located relative to this test module (two directories up,
    then ``src/gui/app.py``) and parsed with ``ast.parse``; nothing in it is
    executed. A full Streamlit launch is too heavy for the install layer, so
    only parsing is checked here (the original note defers launching to the
    e2e suite, which cannot be confirmed from this file).

    Raises:
        AssertionError: If ``app.py`` does not exist at the expected path.
        SyntaxError: If the file is not valid Python source.
    """
    import ast
    from pathlib import Path

    app_path = Path(__file__).resolve().parent.parent / "src" / "gui" / "app.py"
    assert app_path.exists(), f"missing {app_path}"
    # Hand the parser raw bytes so it applies the normal source-encoding rules
    # (UTF-8 default, BOM, PEP 263 declaration) itself, and pass the real path
    # so a SyntaxError names the file instead of "<unknown>".
    ast.parse(app_path.read_bytes(), filename=str(app_path))
# ---------------------------------------------------------------------------
# Test runner sanity
# ---------------------------------------------------------------------------
def test_run_tests_help_works() -> None:
    """``python run_tests.py --help`` should describe the available flags.

    The runner script is resolved relative to this test file rather than the
    current working directory, so the test does not depend on where pytest
    was launched from. The child process also runs with the project root as
    its working directory.

    Raises:
        AssertionError: If the runner is missing, exits non-zero, or its
            help text lacks the ``--tool`` or ``--fixtures`` flags.
        subprocess.TimeoutExpired: If the command runs longer than 30 seconds.
    """
    from pathlib import Path

    # Assumes run_tests.py sits in the parent of this file's directory, the
    # same root the Streamlit entry-point test uses — confirm if the layout
    # changes.
    project_root = Path(__file__).resolve().parent.parent
    runner = project_root / "run_tests.py"
    assert runner.exists(), f"missing {runner}"

    proc = subprocess.run(
        [sys.executable, str(runner), "--help"],
        capture_output=True, text=True, timeout=30,
        cwd=project_root,
    )
    # Include the captured output so a non-zero exit is diagnosable from the
    # test report alone.
    assert proc.returncode == 0, (
        f"run_tests.py --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    assert "--tool" in proc.stdout
    assert "--fixtures" in proc.stdout