feat(cli): src.cli_analyze — Typer CLI for the analyzer

    python -m src.cli_analyze input.csv            # rich table per tool
    python -m src.cli_analyze input.csv --json     # array of finding dicts
    python -m src.cli_analyze input.csv --strict   # exit 1 on warn/error
    python -m src.cli_analyze input.csv -n 50000   # cap rows scanned

Findings are grouped by destination tool so the user can see at a glance
which tool to open next. Read-only; exit code 0 on a successful scan unless
--strict is set.

The CLI keeps its own tool-id -> display-name map so it doesn't depend on
the GUI module.

7 tests cover: clean-file passthrough, dirty-file table, --json round-trip,
missing-file (exit 2), --strict exit code, --sample-rows cap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:53:11 +00:00
parent edf6ccf90b
commit 5c62fb6117
2 changed files with 255 additions and 0 deletions
--- a/src/cli_analyze.py
+++ b/src/cli_analyze.py
@@ -0,0 +1,158 @@
+"""CLI for the DataTools upload-time analyzer.
+
+Usage:
+    python -m src.cli_analyze input.csv               # human-readable report
+    python -m src.cli_analyze input.csv --json        # JSON to stdout
+    python -m src.cli_analyze input.csv --sample-rows 5000
+
+The analyzer is purely advisory: by default the exit code is 0 on a successful
+scan even when findings are present. Use --strict to exit 1 when any 'warn' or
+'error' finding is reported. A missing input file exits with code 2.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from src.core.analyze import analyze, findings_by_tool, to_dict
+
+# Help text for the app, kept as separate paragraphs: a one-paragraph summary,
+# an "Examples:" heading, then one entry per example (comment line + command).
+# The paragraphs are joined with blank lines when the app is constructed.
+_HELP_SECTIONS = (
+    "Scan a CSV or Excel file and report data quality issues with the "
+    "tools that can fix each one. Read-only and advisory.",
+    "Examples:",
+    "  # Default scan (first 1000 rows, human-readable)\n"
+    "  python -m src.cli_analyze customers.csv",
+    "  # Machine-readable output for piping\n"
+    "  python -m src.cli_analyze customers.csv --json",
+    "  # Scan more rows on a large file\n"
+    "  python -m src.cli_analyze big.csv --sample-rows 50000",
+    "  # Exit non-zero when warnings exist (CI gate)\n"
+    "  python -m src.cli_analyze customers.csv --strict",
+)
+
+# The Typer application object; the scan command below registers itself on it.
+app = typer.Typer(
+    add_completion=False,
+    no_args_is_help=True,
+    name="analyze",
+    help="\n\n".join(_HELP_SECTIONS) + "\n",
+)
+
+
+# Tool id -> friendly display name, used for the per-tool table titles.
+# The map is duplicated here (the GUI has its own version) so the CLI does not
+# depend on the GUI module; keep both in lockstep with the actual script
+# lineup. Ids missing from this map are displayed as-is by _tool_label().
+_TOOL_DISPLAY = {
+    "01_deduplicator": "Deduplicator",
+    "02_text_cleaner": "Text Cleaner",
+    "03_format_standardizer": "Format Standardizer",
+    "04_missing_handler": "Missing Value Handler",
+    "05_column_mapper": "Column Mapper",
+    "06_outlier_detector": "Outlier Detector",
+    "07_multi_file_merger": "Multi-File Merger",
+    "08_validator_reporter": "Validator & Reporter",
+    "09_pipeline_runner": "Pipeline Runner",
+}
+
+
+def _tool_label(tool_id: str) -> str:
+    """Return the display name for *tool_id*.
+
+    An empty id yields an em dash; an id that is not in ``_TOOL_DISPLAY`` is
+    returned unchanged.
+    """
+    if not tool_id:
+        return "—"
+    return _TOOL_DISPLAY.get(tool_id, tool_id)
+
+
+# Finding severity -> Rich style name. The style is wrapped around the
+# severity text as a markup tag in the summary line and in the table cells.
+_SEVERITY_STYLE = {
+    "info": "cyan",
+    "warn": "yellow",
+    "error": "red",
+}
+
+
+# CLI command: scan one file and report the analyzer's findings.
+#
+# Output is either a JSON array (--json) or a summary line followed by one
+# table per destination tool. Exit codes: 2 when the input path is missing or
+# is a directory, 1 when --strict is set and a 'warn'/'error' finding exists,
+# 0 otherwise.
+#
+# NOTE: this function intentionally has no docstring. Typer uses a command
+# function's docstring as its --help text, so adding one would change what
+# users see.
+# NOTE(review): this is the only registered command — confirm that the help
+# text passed to typer.Typer(help=...) is what `--help` actually displays.
+@app.command()
+def scan(
+    input_file: str = typer.Argument(
+        ..., help="Path to the CSV or Excel file to scan.",
+    ),
+    sample_rows: int = typer.Option(
+        1000, "--sample-rows", "-n",
+        help="Cap on rows scanned. Default 1000.",
+    ),
+    json_out: bool = typer.Option(
+        False, "--json",
+        help="Print findings as a JSON array on stdout.",
+    ),
+    strict: bool = typer.Option(
+        False, "--strict",
+        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
+    ),
+) -> None:
+    # Local import: only the human-readable branch needs it, and rich is
+    # already a dependency of this module.
+    from rich.markup import escape
+
+    path = Path(input_file)
+    if not path.exists():
+        typer.echo(f"File not found: {path}", err=True)
+        raise typer.Exit(code=2)
+    if path.is_dir():
+        # A directory passes exists() but cannot be scanned; reject it here
+        # with the same exit code instead of handing it to the analyzer.
+        typer.echo(f"Not a file: {path}", err=True)
+        raise typer.Exit(code=2)
+
+    findings = analyze(path, sample_rows=sample_rows)
+
+    if json_out:
+        # Machine-readable mode: emit the JSON first, then apply --strict.
+        typer.echo(json.dumps([to_dict(f) for f in findings], indent=2))
+        _maybe_strict_exit(findings, strict)
+        return
+
+    console = Console()
+    # The file name comes from the user; escape it so characters such as
+    # "[" are printed literally rather than parsed as Rich markup tags.
+    safe_name = escape(path.name)
+    if not findings:
+        console.print(f"[green]✓[/green] No issues detected in {safe_name}.")
+        return
+
+    grouped = findings_by_tool(findings)
+    # Findings without a destination tool get their own table at the end.
+    untargeted = [f for f in findings if not f.tool]
+
+    # Top-line summary: per-severity counts, listed most severe first.
+    by_sev: dict[str, int] = {}
+    for f in findings:
+        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
+    summary_parts = [
+        f"[{_SEVERITY_STYLE[s]}]{by_sev[s]} {s}[/{_SEVERITY_STYLE[s]}]"
+        for s in ("error", "warn", "info") if by_sev.get(s)
+    ]
+    console.print(
+        f"[bold]Scanned[/bold] {safe_name}: "
+        f"{len(findings)} finding(s) ({', '.join(summary_parts)})."
+    )
+    console.print()
+
+    # Per-tool tables — surface what each downstream tool would need to do.
+    for tool_id in sorted(grouped):
+        _render_tool_table(console, tool_id, grouped[tool_id])
+
+    if untargeted:
+        _render_tool_table(console, "", untargeted, header="Informational / file-level")
+
+    _maybe_strict_exit(findings, strict)
+
+
+def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
+    """Print one table of findings to *console*, followed by a blank line.
+
+    Args:
+        console: Rich console to print to.
+        tool_id: Tool id used to build the default title via ``_tool_label``.
+        items: Iterable of findings; each must expose ``severity``, ``id``,
+            ``count`` and ``description``.
+        header: Optional title that replaces the default "→ <tool name>".
+    """
+    # Local import: rich is already a dependency of this module.
+    from rich.markup import escape
+
+    def _literal(value):
+        # Finding text may contain "[...]" (e.g. quoted column names); escape
+        # strings so Rich prints them literally instead of parsing markup.
+        # Non-string values are passed through untouched.
+        return escape(value) if isinstance(value, str) else value
+
+    label = header or f"→ {_tool_label(tool_id)}"
+    table = Table(title=label, title_style="bold", show_lines=False, expand=True)
+    table.add_column("Severity", width=8)
+    table.add_column("Finding", width=32)
+    table.add_column("Count", justify="right", width=7)
+    table.add_column("Description")
+    for f in items:
+        style = _SEVERITY_STYLE.get(f.severity)
+        if style:
+            sev = f"[{style}]{f.severity}[/{style}]"
+        else:
+            # Unknown severity: show it unstyled rather than raising KeyError.
+            sev = escape(f"{f.severity}")
+        table.add_row(sev, _literal(f.id), str(f.count), _literal(f.description))
+    console.print(table)
+    console.print()
+
+
+def _maybe_strict_exit(findings, strict: bool) -> None:
+    """Exit with code 1 in strict mode when a 'warn' or 'error' finding exists.
+
+    Returns normally when *strict* is false, or when no finding has one of
+    those severities.
+
+    Raises:
+        typer.Exit: with ``code=1`` on the first 'warn' or 'error' finding.
+    """
+    if strict:
+        for finding in findings:
+            if finding.severity in ("warn", "error"):
+                raise typer.Exit(code=1)
+
+
+# Entry point for `python -m src.cli_analyze`. The app registers a single
+# command, which is exposed at the top level for convenience, so the file path
+# is passed directly: ``python -m src.cli_analyze input.csv``. The app is
+# created with no_args_is_help=True, which is meant to show the help text when
+# the user invokes it without arguments.
+if __name__ == "__main__":
+    app()