# datatools-dev/src/cli_analyze.py

"""CLI for the DataTools upload-time analyzer.

Usage:
    python -m src.cli_analyze input.csv               # human-readable report
    python -m src.cli_analyze input.csv --json        # JSON to stdout
    python -m src.cli_analyze input.csv --sample-rows 5000

The analyzer is purely advisory; exit code is always 0 on a successful scan
even when findings are present. Use --strict to exit with status 1 when any
'warn' or 'error' finding is reported. A missing input file exits with status 2.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.table import Table

from src.core.analyze import analyze, findings_by_tool, to_dict

# Typer application object. The help text is assembled from paragraphs joined
# by blank lines; each example is a "# comment" line followed by the command.
app = typer.Typer(
    name="analyze",
    add_completion=False,
    no_args_is_help=True,
    help="\n\n".join(
        (
            "Scan a CSV or Excel file and report data quality issues with the "
            "tools that can fix each one. Read-only and advisory.",
            "Examples:",
            "  # Default scan (first 1000 rows, human-readable)\n"
            "  python -m src.cli_analyze customers.csv",
            "  # Machine-readable output for piping\n"
            "  python -m src.cli_analyze customers.csv --json",
            "  # Scan more rows on a large file\n"
            "  python -m src.cli_analyze big.csv --sample-rows 50000",
            "  # Exit non-zero when warnings exist (CI gate)\n"
            "  python -m src.cli_analyze customers.csv --strict",
        )
    )
    + "\n",
)


# Maps each tool id to the name shown to users. The GUI keeps its own copy of
# this table, so any change to the script lineup must be mirrored in both.
_TOOL_DISPLAY: dict[str, str] = {
    "01_deduplicator": "Deduplicator",
    "02_text_cleaner": "Text Cleaner",
    "03_format_standardizer": "Format Standardizer",
    "04_missing_handler": "Missing Value Handler",
    "05_column_mapper": "Column Mapper",
    "06_outlier_detector": "Outlier Detector",
    "07_multi_file_merger": "Multi-File Merger",
    "08_validator_reporter": "Validator & Reporter",
    "09_pipeline_runner": "Pipeline Runner",
}


def _tool_label(tool_id: str) -> str:
    """Return the display name for ``tool_id``.

    A falsy id (empty string, ``None``) yields an em dash placeholder. An id
    that is not in ``_TOOL_DISPLAY`` is returned unchanged.
    """
    if not tool_id:
        return "—"
    return _TOOL_DISPLAY.get(tool_id, tool_id)


# Finding severity -> Rich style name. The value is interpolated into markup
# tags of the form ``[style]...[/style]`` when findings are printed.
_SEVERITY_STYLE = dict(info="cyan", warn="yellow", error="red")


@app.command()
def scan(
    input_file: str = typer.Argument(
        ..., help="Path to the CSV or Excel file to scan.",
    ),
    sample_rows: int = typer.Option(
        1000, "--sample-rows", "-n",
        help="Cap on rows scanned. Default 1000.",
    ),
    json_out: bool = typer.Option(
        False, "--json",
        help="Print findings as a JSON array on stdout.",
    ),
    strict: bool = typer.Option(
        False, "--strict",
        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
    ),
) -> None:
    # Documented with comments rather than a docstring on purpose: Typer shows
    # a command's docstring as its --help text, so adding one would change the
    # CLI's user-facing output.
    #
    # Flow: validate the path, call analyze() with ``sample_rows`` as the row
    # cap, then emit either a JSON array (--json) or Rich tables grouped by
    # the tool that addresses each finding.
    #
    # Exit status: 2 when the path does not exist or is a directory; 1 when
    # --strict is set and a 'warn' or 'error' finding exists; otherwise the
    # command returns normally.
    from rich.markup import escape

    path = Path(input_file)
    if not path.exists():
        typer.echo(f"File not found: {path}", err=True)
        raise typer.Exit(code=2)
    if path.is_dir():
        # A directory passes exists(); reject it here instead of handing the
        # analyzer something that is not a file.
        typer.echo(f"Not a file: {path}", err=True)
        raise typer.Exit(code=2)

    findings = analyze(path, sample_rows=sample_rows)

    if json_out:
        # typer.echo writes plain text, so no markup handling is needed here.
        typer.echo(json.dumps([to_dict(f) for f in findings], indent=2))
        _maybe_strict_exit(findings, strict)
        return

    console = Console()
    # File names are arbitrary text. Without escaping, a name containing a
    # bracketed sequence would be parsed as Rich markup and either vanish
    # from the output or raise a markup error.
    shown_name = escape(path.name)
    if not findings:
        console.print(f"[green]✓[/green] No issues detected in {shown_name}.")
        return

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Top-line summary: counts per severity, listed most severe first.
    severity_counts: dict[str, int] = {}
    for finding in findings:
        severity_counts[finding.severity] = severity_counts.get(finding.severity, 0) + 1
    summary_parts = [
        f"[{_SEVERITY_STYLE[sev]}]{severity_counts[sev]} {sev}[/{_SEVERITY_STYLE[sev]}]"
        for sev in ("error", "warn", "info")
        if severity_counts.get(sev)
    ]
    console.print(
        f"[bold]Scanned[/bold] {shown_name}: "
        f"{len(findings)} finding(s) ({', '.join(summary_parts)})."
    )
    console.print()

    # Per-tool tables — surface what each downstream tool would need to do.
    for tool_id in sorted(grouped):
        _render_tool_table(console, tool_id, grouped[tool_id])

    # Findings with no associated tool get their own trailing table.
    if untargeted:
        _render_tool_table(console, "", untargeted, header="Informational / file-level")

    _maybe_strict_exit(findings, strict)


def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
    """Print one table of findings, followed by a blank line.

    Args:
        console: Rich console the table is printed to.
        tool_id: Tool the findings belong to; used to build the default
            title ``→ <tool label>``.
        items: Iterable of findings. Each one must expose ``severity``,
            ``id``, ``count`` and ``description``.
        header: Optional title that replaces the default tool-based one.

    Finding text can be derived from the scanned data (for example column
    names), so it is escaped before being handed to Rich. Otherwise bracketed
    sequences would be parsed as markup and either disappear from the output
    or raise a markup error. A severity with no entry in ``_SEVERITY_STYLE``
    is shown unstyled instead of raising ``KeyError``.
    """
    from rich.markup import escape

    def _literal(value):
        # Only strings go through the markup parser; pass anything else along
        # untouched so non-string cells behave as before.
        return escape(value) if isinstance(value, str) else value

    label = header or f"→ {_tool_label(tool_id)}"
    table = Table(title=label, title_style="bold", show_lines=False, expand=True)
    table.add_column("Severity", width=8)
    table.add_column("Finding", width=32)
    table.add_column("Count", justify="right", width=7)
    table.add_column("Description")
    for finding in items:
        style = _SEVERITY_STYLE.get(finding.severity)
        severity_text = escape(str(finding.severity))
        severity_cell = f"[{style}]{severity_text}[/{style}]" if style else severity_text
        table.add_row(
            severity_cell,
            _literal(finding.id),
            str(finding.count),
            _literal(finding.description),
        )
    console.print(table)
    console.print()


def _maybe_strict_exit(findings, strict: bool) -> None:
    """Raise ``typer.Exit(code=1)`` in strict mode if a finding is serious.

    Does nothing when ``strict`` is false. Otherwise the findings are walked
    in order and the exit is raised at the first one whose ``severity`` is
    ``"warn"`` or ``"error"``.
    """
    if strict:
        for finding in findings:
            if finding.severity in ("warn", "error"):
                raise typer.Exit(code=1)


def main() -> None:
    """Call the license guard, then run the Typer app.

    The guard is imported inside the function, so importing this module does
    not by itself import ``src.cli_license_guard``.
    """
    from src.cli_license_guard import guard as run_license_guard

    run_license_guard()
    app()


# Script entry point for ``python -m src.cli_analyze``. Delegates to main(),
# which calls the license guard before invoking the Typer app. ``scan`` is the
# only command registered on the app in this module, so the file path is
# passed directly: ``python -m src.cli_analyze input.csv``.
# NOTE(review): the app sets no_args_is_help=True; confirm that a bare
# invocation actually prints help in this single-command layout.
if __name__ == "__main__":
    main()