Adds a top-level test infrastructure layer addressing four needs at once:
a single command to run anything, cross-platform automation, install/e2e
sanity, and zero-config pickup of new fixtures dropped into test-cases/.
Top-level runner — run_tests.py
python run_tests.py # everything (default)
python run_tests.py --tool dedup # one tool's tests
python run_tests.py --unit # category scopes
python run_tests.py --e2e # end-to-end CLI
python run_tests.py --install # import / dependency sanity
python run_tests.py --fixtures # corpus + dropped-file sweep
python run_tests.py --coverage # term-missing report
python run_tests.py --quick # skip @pytest.mark.slow
Tools: analyze, cli, config, dedup, io, normalizers, text_clean.
Cross-platform — tox.ini
Envs for py310-py313 plus install / e2e / fixtures / coverage / lint.
Forces UTF-8 (PYTHONUTF8=1, PYTHONIOENCODING=utf-8) so identical fixture
bytes parse the same on Linux/macOS/Windows.
Shared config — pytest.ini
testpaths, python_files conventions, custom markers (slow, e2e, install,
fixture_sweep), warning filters that fail on our own DeprecationWarnings
while tolerating third-party ones.
New test layers
tests/test_install.py — required deps import; project modules import;
src.core public API surface; CLI --help exits 0; streamlit app.py
parses as valid Python; run_tests.py --help works.
tests/test_e2e.py — CLI roundtrips: cli_analyze table + JSON, cli_text_clean
--apply writes a real file with NBSP/smart-quote folded, dedup CLI
removes duplicates, run_tests.py self-tests.
tests/test_fixtures_sweep.py — parametrizes over every CSV/TSV/XLSX
inside test-cases/ (excluding text-cleaner-corpus/, which has its own
suite). Each fixture must: load through repair_bytes, run analyze()
cleanly, and survive clean_dataframe() with row/col counts unchanged
plus idempotency. Drop a CSV in, re-run — no test code changes needed.
tests/test_gap_coverage.py — closes audit gaps: clean_headers=False
toggle, repair_bytes with tab/semicolon delimiters, BOM+NUL+smart-
quote combined-fix scenario, analyze() over an XLSX path, sample_rows
larger than the DataFrame, mid-cell BOM, findings_by_tool edges, plus
a strict xfail documenting the known §4.17 numeric/phone whitespace
heuristic gap.
Test count
Before: 288 passed + 1 xfailed
After: 475 passed + 2 xfailed (the second xfail is the documented
collapse_whitespace gap on phone-shaped cells; spec §4.17 calls
for a heuristic that hasn't been implemented yet).
Functional gaps surfaced (not fixed in this commit):
- Text cleaner: collapse_whitespace runs unconditionally on every string
cell, including phone/numeric/date-shaped ones. Spec §4.17 requires a
skip heuristic. Captured as strict xfail so the gap stays visible.
- io.read_file does not run pre-parse repair; only analyze() and direct
callers of read_csv_repaired() get it. CLI tool pages and the dedup
CLI miss the safety net.
- Analyzer has no mixed_line_endings detector or near_duplicate_rows
detector; both planned but require additional plumbing.
- GUI tool pages each have their own uploader instead of picking up the
home-page upload through session_state.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
174 lines
5.5 KiB
Python
174 lines
5.5 KiB
Python
"""Install / dependency / entry-point sanity tests.
|
|
|
|
These tests answer the question: "after running ``pip install -r
|
|
requirements.txt`` on a fresh machine, can the user actually use this
|
|
project?" They run on every supported platform — the asserts touch only
|
|
public APIs and CLI ``--help`` exits, never any platform-specific paths.
|
|
|
|
If a future dependency upgrade or refactor breaks an import that's used by
|
|
the CLI or the GUI, these tests catch it before the rest of the suite even
|
|
gets a chance to run.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib
|
|
import subprocess
|
|
import sys
|
|
|
|
import pytest
|
|
|
|
# Module-level ``pytestmark``: applies the ``install`` marker to every test
# defined in this file.
pytestmark = pytest.mark.install
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Required dependencies
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Top-level packages that must import cleanly. If any of these fails, the
|
|
# user's install is broken — fail loudly with the offender's name.
|
|
_REQUIRED_DEPS = [
|
|
"pandas",
|
|
"numpy",
|
|
"openpyxl",
|
|
"rapidfuzz",
|
|
"charset_normalizer",
|
|
"loguru",
|
|
"tqdm",
|
|
"typer",
|
|
"phonenumbers",
|
|
"streamlit",
|
|
]
|
|
|
|
|
|
@pytest.mark.parametrize("module", _REQUIRED_DEPS)
def test_required_dependency_imports(module: str) -> None:
    """Import a single required third-party package by name.

    Runs once per entry in ``_REQUIRED_DEPS``. Any exception raised while
    importing propagates and fails the case for that package.
    """
    importlib.import_module(name=module)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Project package imports
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_PROJECT_MODULES = [
|
|
"src",
|
|
"src.core",
|
|
"src.core.io",
|
|
"src.core.text_clean",
|
|
"src.core.dedup",
|
|
"src.core.normalizers",
|
|
"src.core.analyze",
|
|
"src.core.config",
|
|
"src.cli",
|
|
"src.cli_text_clean",
|
|
"src.cli_analyze",
|
|
"src.gui.components",
|
|
]
|
|
|
|
|
|
@pytest.mark.parametrize("module", _PROJECT_MODULES)
def test_project_module_imports(module: str) -> None:
    """Import a single project package or module by its dotted name.

    Runs once per entry in ``_PROJECT_MODULES``. Any exception raised while
    importing propagates and fails the case for that module.
    """
    importlib.import_module(name=module)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API surface
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_core_public_api_present() -> None:
    """Verify that ``src.core`` still exposes a sample of its public names.

    Guards against a symbol being renamed or dropped by accident in
    ``src/core/__init__.py``. Every name listed below must be an attribute
    of the imported package; all absent names are reported together.
    """
    import src.core as core

    # Grouped by feature area for readability only; the check itself is a
    # flat, in-order walk over every name.
    symbols_by_area = {
        "io": (
            "read_file",
            "write_file",
            "list_sheets",
            "detect_encoding",
            "detect_delimiter",
            "detect_header_row",
            "read_csv_repaired",
            "repair_bytes",
            "RepairAction",
            "RepairResult",
        ),
        "analyzer": (
            "Finding",
            "analyze",
            "findings_by_tool",
            "to_dict",
        ),
        "text_cleaner": (
            "CleanOptions",
            "CleanResult",
            "clean_dataframe",
            "clean_value",
            "smart_title_case",
            "sentence_case",
            "apply_case",
        ),
        "dedup": (
            "deduplicate",
            "build_default_strategies",
            "Algorithm",
            "SurvivorRule",
            "MatchStrategy",
            "MatchResult",
            "DeduplicationResult",
        ),
        "normalizers": (
            "normalize_email",
            "normalize_phone",
            "normalize_name",
            "normalize_address",
            "normalize_string",
            "get_normalizer",
            "NormalizerType",
        ),
    }

    missing = [
        symbol
        for symbols in symbols_by_area.values()
        for symbol in symbols
        if not hasattr(core, symbol)
    ]
    assert not missing, f"src.core is missing public symbols: {missing}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI entry points
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _cli_help(module: str) -> subprocess.CompletedProcess:
|
|
"""Run ``python -m <module> --help`` and return the CompletedProcess.
|
|
|
|
Captures both stdout and stderr so tests can inspect either; uses a
|
|
short timeout so a hung CLI fails fast on CI.
|
|
"""
|
|
return subprocess.run(
|
|
[sys.executable, "-m", module, "--help"],
|
|
capture_output=True, text=True, timeout=30,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize("cli_module", [
    "src.cli",
    "src.cli_text_clean",
    "src.cli_analyze",
])
def test_cli_help_exits_zero(cli_module: str) -> None:
    """Each CLI module must answer ``--help`` with exit code 0 and usage text.

    On a non-zero exit the failure message includes the captured stdout and
    stderr so the cause is visible directly in the test report.
    """
    result = _cli_help(cli_module)

    exit_report = (
        f"{cli_module} --help exited {result.returncode}.\n"
        f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
    )
    assert result.returncode == 0, exit_report

    # The word "usage" (any letter case) must appear somewhere in the
    # output; both streams are searched as one string.
    help_text = "".join((result.stdout, result.stderr)).lower()
    assert "usage" in help_text, (
        f"{cli_module} --help did not produce a Usage line"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Streamlit GUI entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_streamlit_app_module_compiles() -> None:
    """Check that ``src/gui/app.py`` exists and parses as Python source.

    The file is only read and parsed, never executed. Launching Streamlit
    for real is too heavy for the install layer and is left to the e2e
    suite.
    """
    import ast
    from pathlib import Path

    # Two levels up from this file: the directory that contains ``src/``.
    base_dir = Path(__file__).resolve().parents[1]
    app_path = base_dir.joinpath("src", "gui", "app.py")
    assert app_path.exists(), f"missing {app_path}"

    source = app_path.read_text(encoding="utf-8")
    # ast.parse raises SyntaxError on invalid source, which fails the test.
    ast.parse(source)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Test runner sanity
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_run_tests_help_works() -> None:
    """``python run_tests.py --help`` should describe the available flags.

    The runner script is located relative to this test file (one directory
    above the directory containing it) instead of relative to the current
    working directory, so the result does not depend on where pytest was
    started from. The child process is also run from that directory.

    Raises:
        subprocess.TimeoutExpired: If the runner has not exited after
            30 seconds.
    """
    from pathlib import Path

    project_root = Path(__file__).resolve().parent.parent
    runner = project_root / "run_tests.py"
    assert runner.exists(), f"missing {runner}"

    proc = subprocess.run(
        [sys.executable, str(runner), "--help"],
        capture_output=True, text=True, timeout=30,
        cwd=project_root,
    )
    # Include the captured streams so a failing runner is diagnosable from
    # the test report alone.
    assert proc.returncode == 0, (
        f"run_tests.py --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    assert "--tool" in proc.stdout
    assert "--fixtures" in proc.stdout
|