# datatools-dev/tests/test_install.py

"""Install / dependency / entry-point sanity tests.

These tests answer the question: "after running ``pip install -r
requirements.txt`` on a fresh machine, can the user actually use this
project?" They run on every supported platform — the asserts touch only
public APIs and CLI ``--help`` exits, never any platform-specific paths.

If a future dependency upgrade or refactor breaks an import that's used by
the CLI or the GUI, these tests catch it before the rest of the suite even
gets a chance to run.
"""

from __future__ import annotations

import importlib
import subprocess
import sys

import pytest

# Module-level ``pytestmark``: pytest applies the ``install`` marker to every
# test collected from this file, so this layer can be selected or excluded
# with ``-m install`` / ``-m "not install"``.
# NOTE(review): presumably ``install`` is registered in the project's pytest
# configuration — confirm, otherwise pytest warns about an unknown marker.
pytestmark = pytest.mark.install


# ---------------------------------------------------------------------------
# Required dependencies
# ---------------------------------------------------------------------------

# Top-level packages that must import cleanly. If any of these fails, the
# user's install is broken — fail loudly with the offender's name.
_REQUIRED_DEPS = [
    "pandas", "numpy", "openpyxl", "rapidfuzz", "charset_normalizer",
    "loguru", "tqdm", "typer", "phonenumbers", "streamlit",
]


@pytest.mark.parametrize("module", _REQUIRED_DEPS)
def test_required_dependency_imports(module: str) -> None:
    """Check that one required third-party package can be imported.

    Parametrized over ``_REQUIRED_DEPS``, so each package is its own test
    case and the failing package name shows up in the test id. Whatever
    exception the import raises propagates and fails that case.
    """
    importlib.import_module(name=module)


# ---------------------------------------------------------------------------
# Project package imports
# ---------------------------------------------------------------------------

_PROJECT_MODULES = [
    "src",
    "src.core", "src.core.io", "src.core.text_clean", "src.core.dedup",
    "src.core.normalizers", "src.core.analyze", "src.core.config",
    "src.cli", "src.cli_text_clean", "src.cli_analyze",
    "src.gui.components",
]


@pytest.mark.parametrize("module", _PROJECT_MODULES)
def test_project_module_imports(module: str) -> None:
    """Check that one of the project's own modules can be imported.

    Parametrized over ``_PROJECT_MODULES``; each dotted module path is a
    separate test case. Any exception raised while importing propagates
    and fails the case for that module.
    """
    importlib.import_module(name=module)


# ---------------------------------------------------------------------------
# Public API surface
# ---------------------------------------------------------------------------

def test_core_public_api_present() -> None:
    """Verify that ``src.core`` still exposes the expected public names.

    Every listed name is probed with ``hasattr`` on the imported package.
    All absent names are gathered first, so a single failure reports the
    complete list rather than stopping at the first missing symbol.
    """
    import src.core as core

    # Names grouped by the area they belong to; the grouping is only for
    # readability — the check flattens the groups in declaration order.
    symbols_by_area = {
        "I/O": (
            "read_file", "write_file", "list_sheets",
            "detect_encoding", "detect_delimiter", "detect_header_row",
            "read_csv_repaired", "repair_bytes",
            "RepairAction", "RepairResult",
        ),
        "Analyzer": (
            "Finding", "analyze", "findings_by_tool", "to_dict",
        ),
        "Text cleaner": (
            "CleanOptions", "CleanResult", "clean_dataframe", "clean_value",
            "smart_title_case", "sentence_case", "apply_case",
        ),
        "Dedup": (
            "deduplicate", "build_default_strategies",
            "Algorithm", "SurvivorRule", "MatchStrategy", "MatchResult",
            "DeduplicationResult",
        ),
        "Normalizers": (
            "normalize_email", "normalize_phone", "normalize_name",
            "normalize_address", "normalize_string", "get_normalizer",
            "NormalizerType",
        ),
    }

    missing = [
        symbol
        for area_symbols in symbols_by_area.values()
        for symbol in area_symbols
        if not hasattr(core, symbol)
    ]
    assert not missing, f"src.core is missing public symbols: {missing}"


# ---------------------------------------------------------------------------
# CLI entry points
# ---------------------------------------------------------------------------

def _cli_help(module: str) -> subprocess.CompletedProcess:
    """Run ``python -m <module> --help`` and return the CompletedProcess.

    Captures both stdout and stderr so tests can inspect either; uses a
    short timeout so a hung CLI fails fast on CI.
    """
    return subprocess.run(
        [sys.executable, "-m", module, "--help"],
        capture_output=True, text=True, timeout=30,
    )


@pytest.mark.parametrize("cli_module", [
    "src.cli",
    "src.cli_text_clean",
    "src.cli_analyze",
])
def test_cli_help_exits_zero(cli_module: str) -> None:
    """Check that each CLI module answers ``--help`` successfully.

    Two things are asserted for every parametrized module: the process
    exits with status 0, and the word "usage" (any letter case) appears
    somewhere in the combined stdout/stderr text.
    """
    result = _cli_help(cli_module)

    # On failure, dump both streams so the cause is visible in the report.
    assert result.returncode == 0, (
        f"{cli_module} --help exited {result.returncode}.\n"
        f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    )

    # Search both streams, case-insensitively, for the usage banner.
    help_text = "".join((result.stdout, result.stderr)).lower()
    assert "usage" in help_text, (
        f"{cli_module} --help did not produce a Usage line"
    )


# ---------------------------------------------------------------------------
# Streamlit GUI entry point
# ---------------------------------------------------------------------------

def test_streamlit_app_module_compiles() -> None:
    """Ensure ``src/gui/app.py`` is at least syntactically valid Python.

    The file is located relative to this test file (two directories up,
    then ``src/gui/app.py``), read as UTF-8 and parsed with ``ast.parse``.
    Nothing is executed, so Streamlit is never started.

    A full Streamlit launch is too heavy for the install layer; that's
    covered by the e2e suite.
    """
    import ast
    from pathlib import Path

    app_path = Path(__file__).resolve().parent.parent / "src" / "gui" / "app.py"
    assert app_path.exists(), f"missing {app_path}"
    # Pass the real path so a SyntaxError names app.py instead of "<unknown>".
    ast.parse(app_path.read_text(encoding="utf-8"), filename=str(app_path))


# ---------------------------------------------------------------------------
# Test runner sanity
# ---------------------------------------------------------------------------

def test_run_tests_help_works() -> None:
    """``python run_tests.py --help`` should describe the available flags.

    The script is run with the project root (the parent of the directory
    containing this file) as the working directory, so the relative
    ``run_tests.py`` path resolves no matter where pytest was launched.
    NOTE(review): this assumes ``run_tests.py`` lives in the project root —
    confirm against the repository layout.

    Asserts a zero exit status and that both ``--tool`` and ``--fixtures``
    appear in the captured stdout; each failure message includes the
    captured streams.
    """
    from pathlib import Path

    project_root = Path(__file__).resolve().parent.parent
    proc = subprocess.run(
        [sys.executable, "run_tests.py", "--help"],
        capture_output=True, text=True, timeout=30,
        cwd=str(project_root),
    )
    streams = f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    assert proc.returncode == 0, (
        f"run_tests.py --help exited {proc.returncode}.\n{streams}"
    )
    assert "--tool" in proc.stdout, (
        f"run_tests.py --help does not mention --tool.\n{streams}"
    )
    assert "--fixtures" in proc.stdout, (
        f"run_tests.py --help does not mention --fixtures.\n{streams}"
    )