# datatools-dev/tests/test_cli_analyze.py

"""Tests for src.cli_analyze — Typer CLI."""

from __future__ import annotations

import json
from pathlib import Path

import pytest
from typer.testing import CliRunner

from src.cli_analyze import app


# Module-level runner shared by every test below; each test drives the CLI
# via ``runner.invoke(app, [...])`` and inspects the returned result object.
runner = CliRunner()


def _make_dirty(tmp_path: Path) -> Path:
    """Create ``dirty.csv`` inside *tmp_path* and return the path to it.

    The content is written as raw bytes so the exact byte sequence is under
    the test's control. It is a three-row CSV deliberately seeded with a mix
    of detectable issues: a leading UTF-8 byte-order mark, a header cell
    padded with spaces, a mixed-case e-mail address, a value padded with
    spaces, and an ``N/A`` placeholder.
    """
    payload_parts = (
        b"\xef\xbb\xbf",                 # UTF-8 byte-order mark
        b"  id  ,Name,Email\n",          # header with a space-padded cell
        b"1,Alice,Alice@Example.COM\n",  # mixed-case e-mail
        b"2,  Bob  ,bob@example.com\n",  # space-padded value
        b"3,N/A,carol@example.com\n",    # "N/A" placeholder
    )
    dirty_csv = tmp_path.joinpath("dirty.csv")
    dirty_csv.write_bytes(b"".join(payload_parts))
    return dirty_csv


class TestAnalyzeCli:
    """End-to-end checks of the ``src.cli_analyze`` Typer app.

    Every test writes a CSV under pytest's ``tmp_path``, invokes ``app``
    through the module-level ``runner`` with the file path (plus flags), and
    asserts on the exit code and the captured output.
    """

    def test_clean_file_says_so(self, tmp_path: Path) -> None:
        """A CSV with no issues exits 0 and prints "No issues detected"."""
        f = tmp_path / "clean.csv"
        f.write_text("id,name\n1,Alice\n2,Bob\n", encoding="utf-8")
        result = runner.invoke(app, [str(f)])
        assert result.exit_code == 0
        assert "No issues detected" in result.stdout

    def test_dirty_file_lists_findings(self, tmp_path: Path) -> None:
        """The default (table) output names the suggested tools and a severity."""
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f)])
        assert result.exit_code == 0
        # The Rich table breaks lines; assert on stable substrings instead of
        # full finding ids.
        assert "Clean Text" in result.stdout
        assert "Fix Missing Values" in result.stdout
        # Severity column is rendered.
        assert "warn" in result.stdout

    def test_json_output_round_trips(self, tmp_path: Path) -> None:
        """``--json`` prints a non-empty JSON list of findings with the documented keys."""
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f), "--json"])
        assert result.exit_code == 0
        data = json.loads(result.stdout)
        assert isinstance(data, list)
        assert len(data) > 0
        ids = {item["id"] for item in data}
        assert "dirty_column_headers" in ids or "whitespace_padding" in ids
        # Each finding has the documented shape.
        required_keys = {"id", "severity", "tool", "count", "description", "samples"}
        for finding in data:
            assert required_keys <= set(finding)

    def test_missing_file_exits_2(self, tmp_path: Path) -> None:
        """A path that does not exist exits 2 with a "not found" message."""
        result = runner.invoke(app, [str(tmp_path / "nope.csv")])
        assert result.exit_code == 2
        # ``result.output`` is the combined output the user would see.
        # Reading ``result.stderr`` directly raises ValueError on Click
        # releases that do not capture stderr separately by default, which
        # would mask the real assertion failure.
        assert "not found" in result.output.lower()

    def test_strict_exits_1_on_warnings(self, tmp_path: Path) -> None:
        """``--strict`` turns warn/error findings into exit code 1."""
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f), "--strict", "--json"])
        # JSON output is still printed, but exit code is 1 because warns exist.
        assert result.exit_code == 1
        data = json.loads(result.stdout)
        assert any(item["severity"] in ("warn", "error") for item in data)

    def test_strict_exits_0_on_clean(self, tmp_path: Path) -> None:
        """``--strict`` still exits 0 when the file has no issues."""
        f = tmp_path / "clean.csv"
        f.write_text("id,name\n1,Alice\n2,Bob\n", encoding="utf-8")
        result = runner.invoke(app, [str(f), "--strict"])
        assert result.exit_code == 0

    def test_sample_rows_caps_scan(self, tmp_path: Path) -> None:
        """``--sample-rows`` limits how many rows are inspected."""
        # Build a file where ONLY rows past 100 have NBSP padding; with
        # --sample-rows 50 we should miss it.
        rows = ["id,name"]
        rows += [f"{i},Alice" for i in range(1, 101)]
        # The NBSP is written as an escape so it cannot be mistaken for, or
        # silently normalized to, an ordinary space in the source file.
        rows += [f"{i},Alice\u00a0" for i in range(101, 200)]
        f = tmp_path / "big.csv"
        f.write_text("\n".join(rows) + "\n", encoding="utf-8")

        capped = runner.invoke(app, [str(f), "--sample-rows", "50", "--json"])
        full = runner.invoke(app, [str(f), "--sample-rows", "200", "--json"])
        # Fail with the CLI's own output rather than an opaque JSON decode
        # error if either invocation did not succeed.
        assert capped.exit_code == 0, capped.output
        assert full.exit_code == 0, full.output
        capped_ids = {x["id"] for x in json.loads(capped.stdout)}
        full_ids = {x["id"] for x in json.loads(full.stdout)}
        assert "nbsp_or_unicode_whitespace" not in capped_ids
        assert "nbsp_or_unicode_whitespace" in full_ids