"""Command-line front end for the DataTools upload-time analyzer.

Usage:
    python -m src.cli_analyze input.csv                     # human-readable report
    python -m src.cli_analyze input.csv --json              # JSON to stdout
    python -m src.cli_analyze input.csv --sample-rows 5000

The analyzer is purely advisory: a successful scan exits 0 even when findings
are present. Pass --strict to exit non-zero when warn- or error-level
findings are reported. A missing input file exits with code 2.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Optional

import typer
from rich.console import Console
from rich.table import Table

from src.core.analyze import analyze, findings_by_tool, to_dict

# Help text passed to the Typer app: a short description followed by examples.
_APP_HELP = (
    "Scan a CSV or Excel file and report data quality issues with the "
    "tools that can fix each one. Read-only and advisory.\n\n"
    "Examples:\n\n"
    " # Default scan (first 1000 rows, human-readable)\n"
    " python -m src.cli_analyze customers.csv\n\n"
    " # Machine-readable output for piping\n"
    " python -m src.cli_analyze customers.csv --json\n\n"
    " # Scan more rows on a large file\n"
    " python -m src.cli_analyze big.csv --sample-rows 50000\n\n"
    " # Exit non-zero when warnings exist (CI gate)\n"
    " python -m src.cli_analyze customers.csv --strict\n"
)

app = typer.Typer(
    name="analyze",
    help=_APP_HELP,
    add_completion=False,
    no_args_is_help=True,
)


# Tool id -> friendly display name. Kept in the CLI module since the GUI has
# its own version; both stay in lockstep with the actual script lineup.
_TOOL_DISPLAY = {
    "01_deduplicator": "Find Duplicates",
    "02_text_cleaner": "Clean Text",
    "03_format_standardizer": "Standardize Formats",
    "04_missing_handler": "Fix Missing Values",
    "05_column_mapper": "Map Columns",
    "06_outlier_detector": "Find Unusual Values",
    "07_multi_file_merger": "Combine Files",
    "08_validator_reporter": "Quality Check",
    "09_pipeline_runner": "Automated Workflows",
}


def _tool_label(tool_id: str) -> str:
    """Return the friendly display name for *tool_id*.

    Ids missing from ``_TOOL_DISPLAY`` are returned unchanged; a falsy id
    yields an em dash.
    """
    return _TOOL_DISPLAY.get(tool_id, tool_id) if tool_id else "—"


# Rich style name used to colour each known severity level.
_SEVERITY_STYLE = {
    "info": "cyan",
    "warn": "yellow",
    "error": "red",
}

# Style for severities missing from _SEVERITY_STYLE, so an unexpected level
# is still displayed instead of raising KeyError.
_FALLBACK_SEVERITY_STYLE = "white"

# Order in which known severities appear in the summary line.
_SEVERITY_ORDER = ("error", "warn", "info")


def _escape_markup(value: object) -> str:
    """Return *value* as text that Rich prints literally.

    File names, finding ids and descriptions may contain square brackets,
    which Rich would otherwise parse as markup tags. ``None`` becomes an
    empty string.
    """
    # Function-scope import: keeps this fix self-contained without touching
    # the module's top-level import block.
    from rich.markup import escape

    return "" if value is None else escape(str(value))


def _styled_severity(severity: str, text: str | None = None) -> str:
    """Wrap *text* (default: the severity name) in the severity's Rich style."""
    style = _SEVERITY_STYLE.get(severity, _FALLBACK_SEVERITY_STYLE)
    shown = _escape_markup(severity if text is None else text)
    return f"[{style}]{shown}[/{style}]"


def _severity_summary(findings) -> str:
    """Return a comma-separated, colourised count of findings per severity.

    Known severities come first in ``_SEVERITY_ORDER``; any other severity
    follows, so the parts always add up to ``len(findings)``.
    """
    counts: dict[str, int] = {}
    for finding in findings:
        counts[finding.severity] = counts.get(finding.severity, 0) + 1

    known = [sev for sev in _SEVERITY_ORDER if sev in counts]
    unknown = sorted((sev for sev in counts if sev not in _SEVERITY_ORDER), key=str)
    return ", ".join(
        _styled_severity(sev, f"{counts[sev]} {sev}") for sev in (*known, *unknown)
    )


@app.command()
def scan(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to scan.",
    ),
    sample_rows: int = typer.Option(
        1000,
        "--sample-rows",
        "-n",
        help="Cap on rows scanned. Default 1000.",
    ),
    json_out: bool = typer.Option(
        False,
        "--json",
        help="Print findings as a JSON array on stdout.",
    ),
    strict: bool = typer.Option(
        False,
        "--strict",
        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
    ),
) -> None:
    """Scan a CSV or Excel file and report data quality findings.

    Read-only and advisory: exits 0 even when findings are present, unless
    --strict is given, in which case any 'warn' or 'error' finding exits 1.
    Exits 2 when the input path does not exist or is a directory.
    """
    path = Path(input_file)
    if not path.exists():
        typer.echo(f"File not found: {path}", err=True)
        raise typer.Exit(code=2)
    # A directory passes exists() but cannot be scanned; fail with the same
    # exit code instead of letting the analyzer raise.
    if path.is_dir():
        typer.echo(f"Not a file: {path}", err=True)
        raise typer.Exit(code=2)

    findings = analyze(path, sample_rows=sample_rows)

    if json_out:
        typer.echo(json.dumps([to_dict(f) for f in findings], indent=2))
        _maybe_strict_exit(findings, strict)
        return

    console = Console()
    # Escaped once: the name is interpolated into markup strings below.
    file_name = _escape_markup(path.name)

    if not findings:
        console.print(f"[green]✓[/green] No issues detected in {file_name}.")
        return

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Top-line summary
    console.print(
        f"[bold]Scanned[/bold] {file_name}: "
        f"{len(findings)} finding(s) ({_severity_summary(findings)})."
    )
    console.print()

    # Per-tool tables — surface what each downstream tool would need to do.
    for tool_id in sorted(grouped):
        _render_tool_table(console, tool_id, grouped[tool_id])
    if untargeted:
        _render_tool_table(console, "", untargeted, header="Informational / file-level")

    _maybe_strict_exit(findings, strict)


def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
    """Print one table of findings, followed by a blank line.

    Args:
        console: Rich console to print to.
        tool_id: Tool the findings belong to; used for the title when
            *header* is not given.
        items: Findings to list; each is read for ``severity``, ``id``,
            ``count`` and ``description``.
        header: Optional title that replaces the tool-derived one.
    """
    # Only the tool-derived label is escaped; a caller-supplied header is
    # passed through as given.
    label = header or f"→ {_escape_markup(_tool_label(tool_id))}"
    table = Table(title=label, title_style="bold", show_lines=False, expand=True)
    table.add_column("Severity", width=8)
    table.add_column("Finding", width=32)
    table.add_column("Count", justify="right", width=7)
    table.add_column("Description")
    for f in items:
        table.add_row(
            _styled_severity(f.severity),
            _escape_markup(f.id),
            str(f.count),
            _escape_markup(f.description),
        )
    console.print(table)
    console.print()


def _maybe_strict_exit(findings, strict: bool) -> None:
    """Raise ``typer.Exit(code=1)`` when *strict* is set and any finding is 'warn' or 'error'."""
    if not strict:
        return
    if any(f.severity in ("warn", "error") for f in findings):
        raise typer.Exit(code=1)


def main() -> None:
    """Call ``guard()`` from ``src.cli_license_guard``, then run the Typer app."""
    # Imported at call time rather than at module import.
    from src.cli_license_guard import guard

    guard()
    app()


# Entrypoint when run via `python -m src.cli_analyze`. Typer's no_args_is_help
# kicks in when the user invokes without args; we expose the single command at
# the top level for convenience: ``python -m src.cli_analyze input.csv``.
if __name__ == "__main__":
    main()