; pytest configuration shared by tox, run_tests.py, and direct pytest calls.

[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*

# Custom markers used by run_tests.py --quick and the e2e/install groupings.
markers =
    slow: tests that take longer than ~1s (skipped under --quick)
    e2e: end-to-end CLI / integration tests
    install: import / dependency sanity tests
    fixture_sweep: parametrized sweep over the test-cases/ folder

# Warnings discipline: fail on unexpected DeprecationWarning from our own
# code, but tolerate third-party deprecations that we can't fix.
#
# Order matters: when a warning matches more than one entry, pytest applies
# the LAST matching one. The blanket "ignore" therefore has to come first and
# the stricter rule for our own package last; the other way round the
# "error" rule is shadowed and never fires.
filterwarnings =
    ignore::DeprecationWarning
    error::DeprecationWarning:src
#!/usr/bin/env python3
"""DataTools test runner — single entry point with category flags.

Examples
--------
Run everything (default)::

    python run_tests.py

Run a single tool's tests::

    python run_tests.py --tool dedup
    python run_tests.py --tool text_clean
    python run_tests.py --tool analyze
    python run_tests.py --tool io
    python run_tests.py --tool cli

Categories::

    python run_tests.py --unit       # unit tests only (no e2e, no install)
    python run_tests.py --e2e        # end-to-end smoke tests
    python run_tests.py --install    # install / dependency sanity
    python run_tests.py --fixtures   # corpus + dropped-file sweep
    python run_tests.py --coverage   # add a coverage report
    python run_tests.py --quick      # skip @pytest.mark.slow
    python run_tests.py -v / --verbose   # verbose pytest output

Multiple flags compose. ``--tool X --quick`` runs that tool's quick tests.
Explicit positional paths replace the paths implied by category flags.

Dropping a new fixture into ``test-cases/`` is automatic: the fixture sweep
test (``tests/test_fixtures_sweep.py``) parametrizes over every CSV/XLSX in
that directory (excluding ``text-cleaner-corpus/`` which has its own suite).
"""

from __future__ import annotations

import argparse
import importlib.util
import subprocess
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent

# Tool name -> pytest ``-k`` expression. ``-k`` does substring matching on
# test/module names, so keep these aligned with the test_*.py filenames.
# "test_cli.py" (with the extension) pins the dedup CLI module without also
# matching test_cli_text_clean.py / test_cli_analyze.py.
_TOOL_MAP: dict[str, str] = {
    "dedup": "test_dedup or test_cli.py",
    "text_clean": "test_text_clean or test_cli_text_clean or test_corpus",
    "analyze": "test_analyze or test_cli_analyze",
    "io": "test_io",
    "cli": "test_cli or test_cli_text_clean or test_cli_analyze",
    "config": "test_config",
    "normalizers": "test_normalizers",
}

# Category flag -> pytest paths it selects.
_CATEGORY_PATHS: dict[str, list[str]] = {
    "unit": ["tests/"],  # narrowed to "unit only" with a marker filter, see below
    "e2e": ["tests/test_e2e.py"],
    "install": ["tests/test_install.py"],
    "fixtures": ["tests/test_corpus.py", "tests/test_fixtures_sweep.py"],
}

# Evaluation order of the category flags (also the order paths are emitted).
_CATEGORIES: tuple[str, ...] = ("unit", "e2e", "install", "fixtures")

# Markers that ``--unit`` must deselect for "unit tests only" to hold.
_NON_UNIT_MARKERS: tuple[str, ...] = ("e2e", "install")


def _build_pytest_args(args: argparse.Namespace) -> list[str]:
    """Translate parsed runner flags into a ``python -m pytest`` command line.

    Args:
        args: Namespace produced by the parser in :func:`main`. Reads
            ``verbose``, ``coverage``, ``quick``, ``tool``, the four category
            booleans (``unit``/``e2e``/``install``/``fixtures``) and ``path``.

    Returns:
        The full argv list, starting with ``sys.executable``.

    Raises:
        SystemExit: with code 2 when ``args.tool`` is not a key of
            ``_TOOL_MAP`` (a message listing the valid tools goes to stderr).
    """
    cmd: list[str] = [sys.executable, "-m", "pytest"]

    # Verbosity
    cmd.append("-vv" if args.verbose else "-q")

    # Coverage
    if args.coverage:
        cmd.extend(["--cov=src", "--cov-report=term-missing"])

    selected = [c for c in _CATEGORIES if getattr(args, c, False)]

    # Marker filter. pytest honours only the last ``-m`` it is given, so every
    # term is collected here and emitted as ONE and-joined expression.
    marker_terms: list[str] = []
    if args.quick:
        marker_terms.append("not slow")
    if "unit" in selected:
        # "unit" shares tests/ with the other suites; deselect e2e/install
        # by marker unless the caller explicitly asked for them as well.
        marker_terms.extend(
            f"not {marker}" for marker in _NON_UNIT_MARKERS if marker not in selected
        )
    if marker_terms:
        cmd.extend(["-m", " and ".join(marker_terms)])

    # Tool filter via -k expression.
    if args.tool:
        if args.tool not in _TOOL_MAP:
            print(
                f"unknown --tool '{args.tool}'. "
                f"available: {', '.join(sorted(_TOOL_MAP))}",
                file=sys.stderr,
            )
            sys.exit(2)
        cmd.extend(["-k", _TOOL_MAP[args.tool]])

    # Path selection. Explicit positional paths win (as the --help text
    # promises); otherwise the requested categories are OR'd by passing all
    # of their paths; otherwise the whole tests/ tree runs.
    explicit_paths = list(getattr(args, "path", None) or [])
    if explicit_paths:
        paths = explicit_paths
    elif selected:
        paths = [p for cat in selected for p in _CATEGORY_PATHS[cat]]
    else:
        paths = ["tests/"]

    cmd.extend(paths)
    return cmd


def main(argv: list[str] | None = None) -> int:
    """Parse runner flags, run pytest from the project root, return its exit code.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        pytest's return code, or 2 when pytest is not importable by the
        interpreter running this script.
    """
    parser = argparse.ArgumentParser(
        prog="run_tests.py",
        description=(
            "DataTools test runner. With no flags runs every test. Use "
            "--tool to scope to one tool, --unit/--e2e/--install/--fixtures "
            "to scope by category. Combine flags freely."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Available tools: " + ", ".join(sorted(_TOOL_MAP)) + "\n\n"
            "To add a new fixture-driven test: drop a CSV or XLSX into "
            "test-cases/ and re-run. tests/test_fixtures_sweep.py picks up "
            "new files automatically — no test code changes required."
        ),
    )
    parser.add_argument("--tool", help="Limit tests to one tool.")
    parser.add_argument("--unit", action="store_true",
                        help="Unit tests only (excludes e2e and install tests).")
    parser.add_argument("--e2e", action="store_true",
                        help="End-to-end CLI/integration smoke tests.")
    parser.add_argument("--install", action="store_true",
                        help="Install / import / entry-point sanity tests.")
    parser.add_argument("--fixtures", action="store_true",
                        help="Run the corpus + dropped-fixture sweep.")
    parser.add_argument("--coverage", action="store_true",
                        help="Emit a coverage report (term-missing).")
    parser.add_argument("--quick", action="store_true",
                        help="Skip tests marked @pytest.mark.slow.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose pytest output.")
    parser.add_argument("path", nargs="*",
                        help="Optional explicit test paths (override category).")

    args = parser.parse_args(argv)

    # pytest is launched with cwd=PROJECT_ROOT so relative test paths and
    # ``src`` imports resolve no matter where the runner was invoked from.
    cwd_target = PROJECT_ROOT
    if Path.cwd() != cwd_target:
        print(f"running from {cwd_target}")

    # The run uses ``sys.executable -m pytest``, so what matters is whether
    # THIS interpreter can import pytest, not whether some pytest is on PATH.
    if not _python_has_pytest():
        print(
            "pytest is not installed. Install dev deps:\n"
            "  pip install -r requirements-dev.txt",
            file=sys.stderr,
        )
        return 2

    cmd = _build_pytest_args(args)
    if args.verbose:
        # ASCII-only prefix: a non-ASCII arrow raises UnicodeEncodeError when
        # stdout is a pipe using a legacy code page (e.g. cp1252 on Windows).
        print("$", " ".join(cmd))
    proc = subprocess.run(cmd, cwd=cwd_target)
    return proc.returncode


def _python_has_pytest() -> bool:
    """Return True when the current interpreter can locate the pytest package.

    Uses ``find_spec`` so pytest is only looked up, not imported, in the
    runner process.
    """
    return importlib.util.find_spec("pytest") is not None


if __name__ == "__main__":
    sys.exit(main())
"""End-to-end smoke tests.

Round-trips through the CLI binaries with real fixture inputs to catch
glue-code breakage that pure unit tests miss: argv parsing, file I/O, log
configuration, exit codes, and the integration between the analyzer, the
pre-parse repair, and pandas.

These are intentionally lightweight — one happy path per CLI plus a
couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and
``test_fixtures_sweep.py``.
"""

from __future__ import annotations

import json
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd
import pytest

pytestmark = pytest.mark.e2e

PROJECT_ROOT = Path(__file__).resolve().parent.parent
CORPUS_KITCHEN_SINK = (
    PROJECT_ROOT / "test-cases" / "text-cleaner-corpus" / "test_data" / "20_kitchen_sink.csv"
)


def _run(*args: str, cwd: Path | None = None, **kwargs) -> subprocess.CompletedProcess[str]:
    """Run ``sys.executable`` with *args* and capture its output as text.

    Args:
        *args: Arguments placed after the interpreter (e.g. ``"-m", "pkg"``).
        cwd: Working directory for the child; defaults to ``PROJECT_ROOT``.
        **kwargs: Extra ``subprocess.run`` options. ``timeout`` defaults to
            60 seconds and ``env`` to a copy of the current environment.

    Returns:
        The ``CompletedProcess`` with ``stdout``/``stderr`` decoded as UTF-8.
    """
    # Pin the child's stdio encoding and decode with the same codec, so
    # non-ASCII CLI output round-trips even where the locale codec is not
    # UTF-8 (mirrors the PYTHONIOENCODING setting in tox.ini).
    env = dict(kwargs.pop("env", None) or os.environ)
    env["PYTHONIOENCODING"] = "utf-8"
    kwargs.setdefault("timeout", 60)
    return subprocess.run(
        [sys.executable, *args],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        cwd=cwd or PROJECT_ROOT,
        env=env,
        **kwargs,
    )


# ---------------------------------------------------------------------------
# cli_analyze — full round-trip
# ---------------------------------------------------------------------------

class TestAnalyzeCliE2E:
    """Round-trips of ``python -m src.cli_analyze`` on the kitchen-sink fixture."""

    def test_table_output_on_kitchen_sink(self):
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")
        proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK))
        assert proc.returncode == 0, proc.stderr
        # Rich tables wrap; assert on stable substrings.
        assert "Text Cleaner" in proc.stdout
        assert "csv_bom_stripped" in proc.stdout or "smart_quotes" in proc.stdout

    def test_json_output_parses(self):
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")
        proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), "--json")
        assert proc.returncode == 0, proc.stderr
        data = json.loads(proc.stdout)
        assert isinstance(data, list) and len(data) > 0
        for item in data:
            assert {"id", "severity", "tool", "count", "description"} <= set(item)


# ---------------------------------------------------------------------------
# cli_text_clean — full round-trip
# ---------------------------------------------------------------------------

class TestTextCleanCliE2E:
    """Round-trips of ``python -m src.cli_text_clean``."""

    def test_apply_writes_cleaned_file(self, tmp_path):
        # Build a small dirty CSV: NBSP padding + smart quotes. The special
        # characters are written as escapes so they stay visible in review
        # and cannot be normalised away by an editor or a copy/paste.
        src = tmp_path / "dirty.csv"
        src.write_text(
            "id,name,note\n"
            "1,\u00a0Alice\u00a0,\u201chello\u201d\n"
            "2,\u00a0Bob\u00a0,it\u2019s fine\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        proc = _run(
            "-m", "src.cli_text_clean", str(src),
            "--apply", "--output", str(out),
        )
        assert proc.returncode == 0, proc.stderr
        assert out.exists(), "cleaned file was not written"
        cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
        # NBSP padding stripped
        assert cleaned.iloc[0]["name"] == "Alice"
        assert cleaned.iloc[1]["name"] == "Bob"
        # Smart quotes folded
        assert cleaned.iloc[0]["note"] == '"hello"'
        assert cleaned.iloc[1]["note"] == "it's fine"

    def test_preview_does_not_write(self, tmp_path):
        src = tmp_path / "input.csv"
        src.write_text("id,name\n1,Alice\n", encoding="utf-8")
        # Without --apply, no output file should appear.
        proc = _run("-m", "src.cli_text_clean", str(src))
        assert proc.returncode == 0, proc.stderr
        # Default output path next to input — must not exist.
        default_out = src.with_name(src.stem + "_cleaned.csv")
        assert not default_out.exists()


# ---------------------------------------------------------------------------
# cli (dedup) — full round-trip
# ---------------------------------------------------------------------------

class TestDedupCliE2E:
    """Round-trip of ``python -m src.cli`` (dedup)."""

    def test_apply_removes_duplicates(self, tmp_path):
        src = tmp_path / "dups.csv"
        src.write_text(
            "name,email\n"
            "Alice,alice@x.com\n"
            "Alice,alice@x.com\n"
            "Bob,bob@x.com\n",
            encoding="utf-8",
        )
        out = tmp_path / "deduped.csv"
        proc = _run(
            "-m", "src.cli", str(src),
            "--apply", "--output", str(out),
        )
        assert proc.returncode == 0, proc.stderr
        assert out.exists()
        result = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
        assert len(result) == 2  # Alice deduped, Bob unique


# ---------------------------------------------------------------------------
# run_tests.py self-test — sanity check the runner itself works
# ---------------------------------------------------------------------------

class TestRunTestsE2E:
    """Sanity checks of the ``run_tests.py`` runner itself."""

    def test_tool_filter_runs_subset(self):
        proc = _run("run_tests.py", "--tool", "config", "-v")
        assert proc.returncode == 0, proc.stderr
        # Check we limited the run via -k.
        assert "config" in proc.stdout.lower()

    def test_unknown_tool_exits_2(self):
        proc = _run("run_tests.py", "--tool", "no_such_tool")
        assert proc.returncode == 2
"""Automated sweep over every fixture in ``test-cases/``.

Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
next time pytest runs — no test code changes required. Each fixture goes
through these smoke tests:

1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
   quotes, rogue delimiters) must not crash, and produced bytes must be
   valid for ``pd.read_csv``.
2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
   :class:`Finding` objects without raising.
3. **Text cleaner runs cleanly and preserves schema.** Default-config
   ``clean_dataframe`` must not change row count and must return the same
   number of columns it started with.
4. **Text cleaner is idempotent.** Cleaning an already-cleaned frame must
   not change it again.

The sweep skips files inside ``text-cleaner-corpus/`` because that subdir
has its own dedicated test (``test_corpus.py``) with byte-exact expected
outputs.
"""

from __future__ import annotations

import io
from pathlib import Path

import pandas as pd
import pytest

from src.core.analyze import Finding, analyze
from src.core.io import detect_delimiter, detect_encoding, repair_bytes
from src.core.text_clean import clean_dataframe


TEST_CASES_DIR = Path(__file__).resolve().parent.parent / "test-cases"

# Subdirectories in test-cases/ that are exercised by their own dedicated
# tests. The sweep ignores these so we don't double-test or fight expected
# byte-exact outputs.
_EXCLUDED_SUBDIRS = {"text-cleaner-corpus"}

# File suffixes we know how to load.
_SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xlsx", ".xls"}


def _is_supported_file(path: Path) -> bool:
    """Return True for a regular file whose suffix the sweep can load."""
    return path.is_file() and path.suffix.lower() in _SUPPORTED_SUFFIXES


def _discover_fixtures() -> list[Path]:
    """Return every fixture file under test-cases/ that the sweep should run.

    Files directly inside test-cases/ are picked up, and every non-excluded
    top-level subdirectory is searched recursively. Only top-level directory
    names are compared against ``_EXCLUDED_SUBDIRS``. Results are sorted so
    parametrization order is stable.
    """
    if not TEST_CASES_DIR.is_dir():
        return []
    found: list[Path] = []
    for entry in sorted(TEST_CASES_DIR.iterdir()):
        if entry.is_dir():
            if entry.name not in _EXCLUDED_SUBDIRS:
                found.extend(p for p in sorted(entry.rglob("*")) if _is_supported_file(p))
        elif _is_supported_file(entry):
            found.append(entry)
    return found


_FIXTURES = _discover_fixtures()


def _fixture_id(path: Path) -> str:
    """Pretty pytest id derived from the filename, keeping subdirs visible.

    Uses forward slashes on every platform so ids (and ``-k`` filters built
    from them) are the same on Windows and POSIX.
    """
    return path.relative_to(TEST_CASES_DIR).as_posix()


# Module-wide marker only. The "no fixtures" skip is attached to the sweep
# class below rather than to the module, so that
# test_at_least_one_fixture_present still runs (and fails) on an empty
# test-cases/ directory instead of being skipped along with everything else.
pytestmark = pytest.mark.fixture_sweep


def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
    """Read *path* through byte-level repair (CSV/TSV) or directly (Excel).

    Returns ``(df, repair_result)`` where repair_result is None for Excel.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl only reads the OOXML formats; for legacy .xls let pandas
        # pick the engine from the extension instead of forcing openpyxl.
        engine = "openpyxl" if suffix == ".xlsx" else None
        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine=engine)
        return df, None
    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8", delimiter=delim,
        dtype=str, keep_default_na=False, on_bad_lines="warn",
    )
    return df, repair


@pytest.mark.skipif(
    not _FIXTURES,
    reason="no fixtures found under test-cases/ — drop a CSV/XLSX in to enable the sweep",
)
@pytest.mark.parametrize("fixture", _FIXTURES, ids=[_fixture_id(p) for p in _FIXTURES])
class TestFixtureSweep:
    """Smoke tests that every fixture in ``test-cases/`` must pass."""

    def test_repair_and_load(self, fixture: Path) -> None:
        df, _ = _read_with_repair(fixture)
        assert isinstance(df, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
        assert len(df.columns) >= 1, f"{fixture.name}: zero columns after parse"

    def test_analyze_runs(self, fixture: Path) -> None:
        df, repair = _read_with_repair(fixture)
        findings = analyze(df, repair_result=repair)
        assert isinstance(findings, list)
        for f in findings:
            assert isinstance(f, Finding), (
                f"{fixture.name}: analyze() returned a non-Finding ({type(f)})"
            )

    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
        df, _ = _read_with_repair(fixture)
        before_rows = len(df)
        before_cols = len(df.columns)
        result = clean_dataframe(df)
        assert len(result.cleaned_df) == before_rows, (
            f"{fixture.name}: row count changed "
            f"({before_rows} -> {len(result.cleaned_df)})"
        )
        assert len(result.cleaned_df.columns) == before_cols, (
            f"{fixture.name}: column count changed "
            f"({before_cols} -> {len(result.cleaned_df.columns)})"
        )

    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
        df, _ = _read_with_repair(fixture)
        once = clean_dataframe(df).cleaned_df.reset_index(drop=True)
        twice = clean_dataframe(once).cleaned_df.reset_index(drop=True)
        assert once.equals(twice), (
            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
        )


def test_at_least_one_fixture_present() -> None:
    """Smoke check: every project should ship at least one fixture so the
    sweep is not silently skipped on a clean checkout. Adjust the threshold
    only if intentionally moving fixtures elsewhere."""
    assert len(_FIXTURES) > 0, (
        "No fixtures found under test-cases/. "
        "Drop a CSV or XLSX file into the directory and re-run."
    )
"""Tests added to close gaps surfaced by the test audit.

These cover edges that existing suites missed:

- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
- ``analyze()`` over a path-based Excel file.
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
- ``findings_by_tool`` on an empty list.
- BOM that appears mid-cell rather than at file start.

The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
§4.17) is *not yet implemented* and is captured here as a known-gap xfail
so it's surfaced rather than silently missing.

Invisible or typographic characters in the test data are written as
``\\uXXXX`` escapes so they survive editors, diffs and copy/paste.
"""

from __future__ import annotations

import io

import pandas as pd
import pytest

from src.core.analyze import analyze, findings_by_tool
from src.core.io import RepairAction, repair_bytes
from src.core.text_clean import CleanOptions, clean_dataframe


# ---------------------------------------------------------------------------
# clean_headers toggle
# ---------------------------------------------------------------------------

class TestCleanHeadersToggle:
    """Header cleaning can be switched off without affecting cell cleaning."""

    def test_default_cleans_headers(self):
        # Second header carries a trailing zero-width space (U+200B).
        df = pd.DataFrame({" id ": [1], "Email\u200b": ["a@b.com"]})
        result = clean_dataframe(df)
        assert list(result.cleaned_df.columns) == ["id", "Email"]

    def test_disable_preserves_dirty_headers(self):
        df = pd.DataFrame({" id ": [1], "Email\u200b": ["a@b.com"]})
        result = clean_dataframe(df, CleanOptions(clean_headers=False))
        assert list(result.cleaned_df.columns) == [" id ", "Email\u200b"]

    def test_disable_still_cleans_data_cells(self):
        df = pd.DataFrame({"name": [" Alice ", "Bob "]})
        result = clean_dataframe(df, CleanOptions(clean_headers=False))
        assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"]


# ---------------------------------------------------------------------------
# repair_bytes — non-comma delimiters and combined fixes
# ---------------------------------------------------------------------------

class TestRepairBytesDelimiters:
    """``repair_bytes`` with tab and semicolon delimiters."""

    def test_tab_delimited_smart_quote_fold(self):
        raw = "id\tnote\n1\t\u201chi\u201d\n".encode("utf-8")
        result = repair_bytes(raw, delimiter="\t")
        text = result.repaired_bytes.decode("utf-8")
        assert "\u201c" not in text and "\u201d" not in text
        assert "\t" in text  # delimiter preserved

    def test_semicolon_delimited_unrepairable_extras(self):
        raw = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
        result = repair_bytes(raw, delimiter=";")
        # Extra-field row with no clear merge candidate is logged unrepairable.
        assert 3 in result.unrepairable_lines


class TestRepairBytesCombinedFixes:
    """Several byte-level repairs applied to the same input."""

    def test_bom_plus_nul_plus_smart_quotes(self):
        raw = (
            b"\xef\xbb\xbf"
            b"id,note\n"
            b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n"
        )
        result = repair_bytes(raw)
        kinds = {a.kind for a in result.actions}
        assert {"strip_bom", "strip_nul", "fold_smart_quote"} <= kinds
        # Resulting bytes parse cleanly.
        df = pd.read_csv(io.BytesIO(result.repaired_bytes))
        assert df.iloc[0]["note"] == 'Hello "world"'


# ---------------------------------------------------------------------------
# analyze() — path-based Excel and large-sample edges
# ---------------------------------------------------------------------------

class TestAnalyzeXlsxPath:
    """``analyze()`` called with a path to an .xlsx file."""

    def test_excel_path_runs_without_repair(self, tmp_path):
        path = tmp_path / "small.xlsx"
        df = pd.DataFrame({
            "id": ["1", "2"],
            "name": [" Alice ", "Bob"],  # padding in xlsx
        })
        df.to_excel(path, index=False, engine="openpyxl")
        findings = analyze(path)
        ids = {f.id for f in findings}
        assert "whitespace_padding" in ids
        # Excel skips csv_* findings — no pre-parse repair on xlsx.
        assert not any(i.startswith("csv_") for i in ids)


class TestAnalyzeSampleRowsEdge:
    """``sample_rows`` larger than the frame."""

    def test_sample_rows_larger_than_df(self):
        df = pd.DataFrame({"x": [" pad ", "clean"]})
        # sample_rows=1000 but df has only 2 rows; must not crash.
        findings = analyze(df, sample_rows=1000)
        assert any(f.id == "whitespace_padding" for f in findings)


class TestAnalyzeMidCellBom:
    """A BOM code point in the middle of a cell value."""

    def test_bom_inside_cell_treated_as_zero_width(self):
        # U+FEFF between "Hel" and "lo": a BOM that is not at file start.
        df = pd.DataFrame({"name": ["Hel\ufefflo"]})
        findings = analyze(df)
        assert any(f.id == "zero_width_or_invisible" for f in findings)


# ---------------------------------------------------------------------------
# findings_by_tool — edge cases
# ---------------------------------------------------------------------------

class TestFindingsByToolEdges:
    """``findings_by_tool`` on degenerate inputs."""

    def test_empty_list_returns_empty_dict(self):
        assert findings_by_tool([]) == {}

    def test_only_toolless_findings_returns_empty_dict(self):
        from src.core.analyze import Finding
        # Construct a Finding with no tool — like csv_unrepairable_rows.
        f = Finding(
            id="x", severity="info", tool="", count=1,
            description="d",
        )
        assert findings_by_tool([f]) == {}


# ---------------------------------------------------------------------------
# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
# ---------------------------------------------------------------------------

class TestNumericPhoneWhitespaceGap:
    """Spec §4.17: ``collapse_whitespace`` should NOT collapse internal
    whitespace in cells that look numeric, dated, or phone-shaped.

    Currently unconditional. Marked xfail so the suite tracks the gap
    without silently allowing regressions on the cells that *do* get
    correctly collapsed.
    """

    @pytest.mark.xfail(
        reason=(
            "Heuristic not yet implemented — collapse_whitespace runs on every "
            "string cell, including phone-shaped ones. See TEST-CASES.md §4.17."
        ),
        strict=True,
    )
    def test_phone_internal_double_space_preserved(self):
        # The double space is built explicitly: if it were reduced to a
        # single space the cleaner would leave the value untouched and this
        # strict xfail would XPASS (i.e. fail the suite).
        phone = "(555)" + " " * 2 + "123-4567"
        df = pd.DataFrame({"phone": [phone]})
        result = clean_dataframe(df)
        # Spec requires the double space to survive because the cell looks
        # phone-shaped. Today the cleaner collapses it.
        assert result.cleaned_df.iloc[0]["phone"] == phone
"""Install / dependency / entry-point sanity tests.

These tests answer the question: "after running ``pip install -r
requirements.txt`` on a fresh machine, can the user actually use this
project?" They run on every supported platform — the asserts touch only
public APIs and CLI ``--help`` exits, never any platform-specific paths.

If a future dependency upgrade or refactor breaks an import that's used by
the CLI or the GUI, these tests catch it before the rest of the suite even
gets a chance to run.
"""

from __future__ import annotations

import ast
import importlib
import subprocess
import sys
from pathlib import Path

import pytest

pytestmark = pytest.mark.install

# Subprocesses are anchored here so ``-m src.<cli>`` and ``run_tests.py``
# resolve regardless of the directory pytest was started from.
PROJECT_ROOT = Path(__file__).resolve().parent.parent


# ---------------------------------------------------------------------------
# Required dependencies
# ---------------------------------------------------------------------------

# Top-level packages that must import cleanly. If any of these fails, the
# user's install is broken — fail loudly with the offender's name.
_REQUIRED_DEPS = [
    "pandas",
    "numpy",
    "openpyxl",
    "rapidfuzz",
    "charset_normalizer",
    "loguru",
    "tqdm",
    "typer",
    "phonenumbers",
    "streamlit",
]


@pytest.mark.parametrize("module", _REQUIRED_DEPS)
def test_required_dependency_imports(module: str) -> None:
    """Each third-party dependency must be importable."""
    importlib.import_module(module)


# ---------------------------------------------------------------------------
# Project package imports
# ---------------------------------------------------------------------------

_PROJECT_MODULES = [
    "src",
    "src.core",
    "src.core.io",
    "src.core.text_clean",
    "src.core.dedup",
    "src.core.normalizers",
    "src.core.analyze",
    "src.core.config",
    "src.cli",
    "src.cli_text_clean",
    "src.cli_analyze",
    "src.gui.components",
]


@pytest.mark.parametrize("module", _PROJECT_MODULES)
def test_project_module_imports(module: str) -> None:
    """Each project module must be importable."""
    importlib.import_module(module)


# ---------------------------------------------------------------------------
# Public API surface
# ---------------------------------------------------------------------------

def test_core_public_api_present() -> None:
    """Spot-check the symbols re-exported via ``src.core``.

    Catches an accidental rename or drop in ``src/core/__init__.py``.
    """
    import src.core as core

    expected = [
        # I/O
        "read_file", "write_file", "list_sheets",
        "detect_encoding", "detect_delimiter", "detect_header_row",
        "read_csv_repaired", "repair_bytes",
        "RepairAction", "RepairResult",
        # Analyzer
        "Finding", "analyze", "findings_by_tool", "to_dict",
        # Text cleaner
        "CleanOptions", "CleanResult", "clean_dataframe", "clean_value",
        "smart_title_case", "sentence_case", "apply_case",
        # Dedup
        "deduplicate", "build_default_strategies",
        "Algorithm", "SurvivorRule", "MatchStrategy", "MatchResult",
        "DeduplicationResult",
        # Normalizers
        "normalize_email", "normalize_phone", "normalize_name",
        "normalize_address", "normalize_string", "get_normalizer",
        "NormalizerType",
    ]
    missing = [name for name in expected if not hasattr(core, name)]
    assert not missing, f"src.core is missing public symbols: {missing}"


# ---------------------------------------------------------------------------
# CLI entry points
# ---------------------------------------------------------------------------

def _cli_help(module: str) -> subprocess.CompletedProcess[str]:
    """Run ``python -m <module> --help`` and return the CompletedProcess.

    Captures both stdout and stderr so tests can inspect either; uses a
    short timeout so a hung CLI fails fast on CI. Runs from the project
    root, and decodes leniently so help text with non-ASCII characters
    cannot make the capture itself fail.
    """
    return subprocess.run(
        [sys.executable, "-m", module, "--help"],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        timeout=30, cwd=PROJECT_ROOT,
    )


@pytest.mark.parametrize("cli_module", [
    "src.cli",
    "src.cli_text_clean",
    "src.cli_analyze",
])
def test_cli_help_exits_zero(cli_module: str) -> None:
    """``--help`` on each CLI module exits 0 and prints a usage line."""
    proc = _cli_help(cli_module)
    assert proc.returncode == 0, (
        f"{cli_module} --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    # Help output must include a "Usage:" line (on either stream).
    combined = (proc.stdout + proc.stderr).lower()
    assert "usage" in combined, (
        f"{cli_module} --help did not produce a Usage line"
    )


# ---------------------------------------------------------------------------
# Streamlit GUI entry point
# ---------------------------------------------------------------------------

def test_streamlit_app_module_compiles() -> None:
    """Ensure ``src/gui/app.py`` is at least syntactically valid Python.

    A full Streamlit launch is too heavy for the install layer, so this
    only parses the file; it does not import or run the app.
    """
    app_path = PROJECT_ROOT / "src" / "gui" / "app.py"
    assert app_path.exists(), f"missing {app_path}"
    ast.parse(app_path.read_text(encoding="utf-8"))


# ---------------------------------------------------------------------------
# Test runner sanity
# ---------------------------------------------------------------------------

def test_run_tests_help_works() -> None:
    """``python run_tests.py --help`` should describe the available flags."""
    proc = subprocess.run(
        [sys.executable, "run_tests.py", "--help"],
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        timeout=30, cwd=PROJECT_ROOT,
    )
    assert proc.returncode == 0, proc.stderr
    assert "--tool" in proc.stdout
    assert "--fixtures" in proc.stdout
; Cross-platform test automation for DataTools.
;
; Drives the pytest suite under multiple Python versions on Linux, macOS,
; and Windows. Use:
;
;   tox                # default envlist: py310-py313, install, e2e
;   tox -e py312       # one Python version
;   tox -e e2e         # CLI smoke tests
;   tox -e install     # import / dependency sanity
;   tox -e fixtures    # fixture sweep only
;   tox -e lint        # static checks (ruff + mypy)
;   tox -e coverage    # full suite with coverage report
;
; fixtures, lint and coverage are not part of the default envlist; request
; them explicitly with -e.
;
; Adding a new fixture: drop the CSV/XLSX into test-cases/ and re-run.
; tests/test_fixtures_sweep.py picks new files up automatically.

[tox]
envlist = py310, py311, py312, py313, install, e2e
skip_missing_interpreters = true
isolated_build = false

[testenv]
description = Run the full pytest suite under {envname}.
deps =
    -r requirements.txt
    -r requirements-dev.txt
commands =
    python run_tests.py {posargs}
passenv =
    HOME
    USER
    LANG
    LC_ALL
    PATH
setenv =
    PYTHONIOENCODING = utf-8
    PYTHONUTF8 = 1

[testenv:install]
description = Verify imports and CLI entry points work after a fresh install.
commands =
    python run_tests.py --install -v

[testenv:e2e]
description = End-to-end CLI smoke tests against real fixtures.
commands =
    python run_tests.py --e2e -v

[testenv:fixtures]
description = Sweep test-cases/ for any newly-dropped fixtures.
commands =
    python run_tests.py --fixtures -v

[testenv:coverage]
description = Full suite with coverage report.
commands =
    python run_tests.py --coverage

[testenv:lint]
description = Static checks with ruff and mypy (installed into this env via deps).
deps =
    -r requirements.txt
    ruff>=0.5; python_version >= "3.10"
    mypy>=1.10; python_version >= "3.10"
; The tools are invoked directly. The previous
;   sh -c "command -v X && X ... || echo skipping"
; form printed "skipping" and exited 0 whenever the tool itself reported
; problems, hiding every real lint failure, and it required sh, which is
; not available on a stock Windows install.
commands =
    ruff check src/ tests/
    mypy src/