feat(cli): src.cli_analyze — Typer CLI for the analyzer

python -m src.cli_analyze input.csv             # rich table per tool
python -m src.cli_analyze input.csv --json      # array of finding dicts
python -m src.cli_analyze input.csv --strict    # exit 1 on warn/error
python -m src.cli_analyze input.csv -n 50000    # cap rows scanned

Findings are grouped by destination tool so the user can see at a glance
which tool to open next. Read-only; exit code is 0 on a successful scan
unless --strict is set and a warn/error finding is reported (a missing
input file exits 2).
The CLI keeps its own tool-id -> display-name map so it doesn't depend on
the GUI module.

7 tests cover: clean-file passthrough, dirty-file table, --json round-trip,
missing-file (exit 2), --strict exit code, --sample-rows cap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:53:11 +00:00
parent edf6ccf90b
commit 5c62fb6117
2 changed files with 255 additions and 0 deletions

158
src/cli_analyze.py Normal file
View File

@@ -0,0 +1,158 @@
"""CLI for the DataTools upload-time analyzer.
Usage:
python -m src.cli_analyze input.csv # human-readable report
python -m src.cli_analyze input.csv --json # JSON to stdout
python -m src.cli_analyze input.csv --sample-rows 5000
The analyzer is purely advisory: by default the exit code is 0 on a
successful scan even when findings are present. Use --strict to exit 1 when
any warn or error finding is reported. A missing input file exits 2.
"""
from __future__ import annotations

import json
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.markup import escape
from rich.table import Table

from src.core.analyze import analyze, findings_by_tool, to_dict
# Root Typer application for the analyzer CLI. Shell-completion options are
# disabled and a bare invocation is configured to print help.
# NOTE(review): only one command is registered on this app in this module;
# confirm whether Typer shows this app-level help text (rather than the
# command's own help) for `--help` in that configuration.
app = typer.Typer(
    no_args_is_help=True,
    add_completion=False,
    name="analyze",
    help=(
        "Scan a CSV or Excel file and report data quality issues with the "
        "tools that can fix each one. Read-only and advisory.\n\n"
        "Examples:\n\n"
        " # Default scan (first 1000 rows, human-readable)\n"
        " python -m src.cli_analyze customers.csv\n\n"
        " # Machine-readable output for piping\n"
        " python -m src.cli_analyze customers.csv --json\n\n"
        " # Scan more rows on a large file\n"
        " python -m src.cli_analyze big.csv --sample-rows 50000\n\n"
        " # Exit non-zero when warnings exist (CI gate)\n"
        " python -m src.cli_analyze customers.csv --strict\n"
    ),
)
# Maps each tool id to the human-friendly name shown as a table title.
# The CLI carries its own copy of this mapping (the GUI keeps a separate
# one), so the two must be updated together when the tool lineup changes.
_TOOL_DISPLAY: dict[str, str] = {
    "01_deduplicator": "Deduplicator",
    "02_text_cleaner": "Text Cleaner",
    "03_format_standardizer": "Format Standardizer",
    "04_missing_handler": "Missing Value Handler",
    "05_column_mapper": "Column Mapper",
    "06_outlier_detector": "Outlier Detector",
    "07_multi_file_merger": "Multi-File Merger",
    "08_validator_reporter": "Validator & Reporter",
    "09_pipeline_runner": "Pipeline Runner",
}
def _tool_label(tool_id: str) -> str:
    """Return the display name for ``tool_id``.

    An empty (falsy) id yields ``""``; an id missing from ``_TOOL_DISPLAY``
    is returned unchanged.
    """
    if not tool_id:
        return ""
    return _TOOL_DISPLAY.get(tool_id, tool_id)
# Rich style name used to colour each finding severity in the report.
_SEVERITY_STYLE: dict[str, str] = {
    "info": "cyan",
    "warn": "yellow",
    "error": "red",
}
@app.command()
def scan(
    input_file: str = typer.Argument(
        ..., help="Path to the CSV or Excel file to scan.",
    ),
    sample_rows: int = typer.Option(
        1000, "--sample-rows", "-n",
        help="Cap on rows scanned. Default 1000.",
    ),
    json_out: bool = typer.Option(
        False, "--json",
        help="Print findings as a JSON array on stdout.",
    ),
    strict: bool = typer.Option(
        False, "--strict",
        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
    ),
) -> None:
    """Scan a CSV or Excel file and report data quality findings per tool.

    Read-only and advisory. Exit codes: 0 on a completed scan, 1 with
    --strict when any 'warn' or 'error' finding is reported, 2 when the
    input path is missing or is a directory.
    """
    # NOTE: Typer uses this docstring as the command's help text, so it is
    # kept short and user-facing.
    path = Path(input_file)
    if not path.exists():
        typer.echo(f"File not found: {path}", err=True)
        raise typer.Exit(code=2)
    # A directory passes exists() but cannot be scanned; reject it up front
    # with a clear message instead of failing inside the analyzer.
    if path.is_dir():
        typer.echo(f"Not a file: {path}", err=True)
        raise typer.Exit(code=2)

    findings = analyze(path, sample_rows=sample_rows)

    if json_out:
        # Machine-readable mode: JSON array on stdout, no Rich output.
        typer.echo(json.dumps([to_dict(f) for f in findings], indent=2))
        _maybe_strict_exit(findings, strict)
        return

    console = Console()
    # File names may contain square brackets, which Rich would otherwise
    # interpret as markup tags.
    safe_name = escape(path.name)
    if not findings:
        console.print(f"[green]✓[/green] No issues detected in {safe_name}.")
        return

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Top-line summary: known severities first (most severe to least), then
    # any severity without a configured style, shown unstyled so the listed
    # counts always add up to the total.
    severity_counts = Counter(f.severity for f in findings)
    summary_parts = [
        f"[{_SEVERITY_STYLE[s]}]{severity_counts[s]} {s}[/{_SEVERITY_STYLE[s]}]"
        for s in ("error", "warn", "info") if severity_counts[s]
    ]
    summary_parts.extend(
        f"{count} " + escape(f"{sev}")
        for sev, count in severity_counts.items()
        if sev not in _SEVERITY_STYLE
    )
    console.print(
        f"[bold]Scanned[/bold] {safe_name}: "
        f"{len(findings)} finding(s) ({', '.join(summary_parts)})."
    )
    console.print()

    # Per-tool tables — surface what each downstream tool would need to do.
    for tool_id in sorted(grouped):
        _render_tool_table(console, tool_id, grouped[tool_id])
    if untargeted:
        _render_tool_table(console, "", untargeted, header="Informational / file-level")

    _maybe_strict_exit(findings, strict)
def _markup_safe(value):
    """Escape Rich markup in ``value`` when it is a string; pass through otherwise."""
    return escape(value) if isinstance(value, str) else value


def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
    """Print one table of findings, followed by a blank line.

    Args:
        console: Rich console to print to.
        tool_id: Tool id used to look up the table title when ``header`` is
            not given.
        items: Iterable of findings; each must expose ``severity``, ``id``,
            ``count`` and ``description``.
        header: Optional explicit table title, used as given (may contain
            Rich markup).
    """
    # Only the looked-up label is escaped: an unknown tool id is echoed back
    # verbatim by _tool_label and could contain brackets.
    label = header or _markup_safe(_tool_label(tool_id))
    table = Table(title=label, title_style="bold", show_lines=False, expand=True)
    table.add_column("Severity", width=8)
    table.add_column("Finding", width=32)
    table.add_column("Count", justify="right", width=7)
    table.add_column("Description")
    for f in items:
        sev_text = escape(f"{f.severity}")
        # Unknown severities are shown unstyled instead of raising KeyError.
        style = _SEVERITY_STYLE.get(f.severity)
        sev = f"[{style}]{sev_text}[/{style}]" if style else sev_text
        # Finding ids and descriptions are data (they may quote column names
        # such as "[email]"); escape them so Rich prints them literally.
        table.add_row(sev, _markup_safe(f.id), str(f.count), _markup_safe(f.description))
    console.print(table)
    console.print()
def _maybe_strict_exit(findings, strict: bool) -> None:
    """Exit with status 1 in strict mode if any finding is 'warn' or 'error'.

    Does nothing (and does not inspect ``findings``) when ``strict`` is false.

    Raises:
        typer.Exit: with ``code=1`` on the first warn/error finding.
    """
    if strict:
        for finding in findings:
            if finding.severity in ("warn", "error"):
                raise typer.Exit(code=1)
# Script entrypoint for `python -m src.cli_analyze`. The single command is
# exposed at the top level for convenience, so the file path is passed
# directly: ``python -m src.cli_analyze input.csv``.
# NOTE(review): the app is created with no_args_is_help=True; confirm that a
# bare invocation actually prints help when only one command is registered.
if __name__ == "__main__":
    app()