Files
datatools-dev/src/cli_analyze.py
Michael db5ec084da docs+code: rename tool labels everywhere
Sweep follow-up to 93e43fc. Display labels now consistent across docs,
landing pages, CLI output, code comments, docstrings, and test prose.
Five parallel surfaces touched:

- docs (EN + ES): README, USER-GUIDE, CLI-REFERENCE, and 11 internal
  design/planning docs
- landing pages: index + bookkeeper/revops/shopify-pet
- src: CLI module docstrings, _TOOL_DISPLAY dicts in cli_analyze.py
  and gui/components/_legacy.py, core module headers, every tool
  page's module docstring
- tests: class/method/module docstrings and section-header comments
- test-cases READMEs

Page slugs (1_Deduplicator etc.), tool_id strings (01_deduplicator
etc.), Python class names (TestDeduplicatorWorkflow, FeatureFlag.*),
URL paths, anchor IDs, CSS classes, and asset filenames were left
intact since they're code identifiers / structural references.

All 2033 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 19:50:09 +00:00

165 lines
5.2 KiB
Python

"""CLI for the DataTools upload-time analyzer.
Usage:
python -m src.cli_analyze input.csv # human-readable report
python -m src.cli_analyze input.csv --json # JSON to stdout
python -m src.cli_analyze input.csv --sample-rows 5000
The analyzer is purely advisory; by default the exit code is 0 on a successful
scan even when findings are present. Use --strict to exit non-zero when any
'warn' or 'error' finding is reported.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
from typing import Optional
import typer
from rich.console import Console
from rich.table import Table
from src.core.analyze import analyze, findings_by_tool, to_dict
# Help text handed to the Typer application below. Kept as a separate constant
# so the constructor call stays short.
_APP_HELP = (
    "Scan a CSV or Excel file and report data quality issues with the "
    "tools that can fix each one. Read-only and advisory.\n\n"
    "Examples:\n\n"
    " # Default scan (first 1000 rows, human-readable)\n"
    " python -m src.cli_analyze customers.csv\n\n"
    " # Machine-readable output for piping\n"
    " python -m src.cli_analyze customers.csv --json\n\n"
    " # Scan more rows on a large file\n"
    " python -m src.cli_analyze big.csv --sample-rows 50000\n\n"
    " # Exit non-zero when warnings exist (CI gate)\n"
    " python -m src.cli_analyze customers.csv --strict\n"
)

# The Typer application object; commands register on it via ``@app.command()``.
app = typer.Typer(
    name="analyze",
    help=_APP_HELP,
    add_completion=False,
    no_args_is_help=True,
)
# Maps each tool id to the friendly label printed by this CLI. The GUI keeps
# its own copy of this table, so both must be updated together whenever the
# tool lineup changes.
_TOOL_DISPLAY: dict[str, str] = {
    "01_deduplicator": "Find Duplicates",
    "02_text_cleaner": "Clean Text",
    "03_format_standardizer": "Standardize Formats",
    "04_missing_handler": "Fix Missing Values",
    "05_column_mapper": "Map Columns",
    "06_outlier_detector": "Find Unusual Values",
    "07_multi_file_merger": "Combine Files",
    "08_validator_reporter": "Quality Check",
    "09_pipeline_runner": "Automated Workflows",
}
def _tool_label(tool_id: str) -> str:
    """Return the display label for ``tool_id``.

    An empty (falsy) id yields ``""``; an id missing from ``_TOOL_DISPLAY``
    is returned unchanged.
    """
    if not tool_id:
        return ""
    return _TOOL_DISPLAY.get(tool_id, tool_id)
# Severity level -> Rich style name used to colour that severity in the
# summary line and in the per-tool tables.
_SEVERITY_STYLE: dict[str, str] = {
    "info": "cyan",
    "warn": "yellow",
    "error": "red",
}
@app.command()
def scan(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to scan.",
    ),
    sample_rows: int = typer.Option(
        1000,
        "--sample-rows",
        "-n",
        help="Cap on rows scanned. Default 1000.",
    ),
    json_out: bool = typer.Option(
        False,
        "--json",
        help="Print findings as a JSON array on stdout.",
    ),
    strict: bool = typer.Option(
        False,
        "--strict",
        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
    ),
) -> None:
    # NOTE: documented with comments rather than a docstring on purpose.
    # Typer uses a command function's docstring as its --help text, so adding
    # one here would change what users see.
    #
    # Flow:
    #   1. Reject a missing path or a directory with exit code 2 (stderr).
    #   2. Run the analyzer over at most ``sample_rows`` rows.
    #   3. --json: dump the findings as a JSON array and stop.
    #      Otherwise: print a summary line plus one table per tool.
    #   4. In --strict mode exit with code 1 on any 'warn'/'error' finding.
    from rich.markup import escape

    path = Path(input_file)
    if not path.exists():
        typer.echo(f"File not found: {path}", err=True)
        raise typer.Exit(code=2)
    if path.is_dir():
        # A directory passes exists() but cannot be scanned; fail cleanly
        # here instead of letting the analyzer raise.
        typer.echo(f"Not a file: {path}", err=True)
        raise typer.Exit(code=2)

    findings = analyze(path, sample_rows=sample_rows)

    if json_out:
        typer.echo(json.dumps([to_dict(f) for f in findings], indent=2))
        _maybe_strict_exit(findings, strict)
        return

    console = Console()
    # The file name is user data: escape it so square brackets in the name
    # are printed literally instead of being parsed as Rich markup tags.
    display_name = escape(path.name)

    if not findings:
        console.print(f"[green]✓[/green] No issues detected in {display_name}.")
        return

    grouped = findings_by_tool(findings)
    # Findings with no associated tool are shown in their own table at the end.
    untargeted = [f for f in findings if not f.tool]

    # Top-line summary: count per severity, listed most severe first.
    by_sev: dict[str, int] = {}
    for f in findings:
        by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
    summary_parts = [
        f"[{_SEVERITY_STYLE[s]}]{by_sev[s]} {s}[/{_SEVERITY_STYLE[s]}]"
        for s in ("error", "warn", "info")
        if by_sev.get(s)
    ]
    console.print(
        f"[bold]Scanned[/bold] {display_name}: "
        f"{len(findings)} finding(s) ({', '.join(summary_parts)})."
    )
    console.print()

    # Per-tool tables — surface what each downstream tool would need to do.
    for tool_id in sorted(grouped):
        _render_tool_table(console, tool_id, grouped[tool_id])
    if untargeted:
        _render_tool_table(console, "", untargeted, header="Informational / file-level")

    _maybe_strict_exit(findings, strict)
def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
    """Print one table of findings to ``console``, followed by a blank line.

    Args:
        console: Rich console the table is printed to.
        tool_id: Tool identifier; its display label becomes the table title
            unless ``header`` is given.
        items: Iterable of findings. Each must expose ``severity``, ``id``,
            ``count`` and ``description`` attributes.
        header: Optional title that overrides the tool label.
    """
    from rich.markup import escape

    def _plain(value):
        # Rich parses "[...]" inside cell strings as markup. Finding text is
        # data, so escape strings; pass any non-string value through as-is.
        return escape(value) if isinstance(value, str) else value

    label = header or _tool_label(tool_id)
    table = Table(title=label, title_style="bold", show_lines=False, expand=True)
    table.add_column("Severity", width=8)
    table.add_column("Finding", width=32)
    table.add_column("Count", justify="right", width=7)
    table.add_column("Description")

    for f in items:
        severity_cell = escape(f"{f.severity}")
        # A severity without a configured style is shown unstyled rather than
        # aborting the whole report with a KeyError.
        style = _SEVERITY_STYLE.get(f.severity)
        if style:
            severity_cell = f"[{style}]{severity_cell}[/{style}]"
        table.add_row(severity_cell, _plain(f.id), str(f.count), _plain(f.description))

    console.print(table)
    console.print()
def _maybe_strict_exit(findings, strict: bool) -> None:
    """Exit with code 1 in strict mode if any finding is 'warn' or 'error'.

    Does nothing when ``strict`` is false or when no finding has one of
    those severities.
    """
    if strict:
        for finding in findings:
            if finding.severity in ("warn", "error"):
                raise typer.Exit(code=1)
def main() -> None:
    """Run the license guard, then hand control to the Typer app."""
    # The guard is imported inside the function, not at module import time.
    from src.cli_license_guard import guard as run_license_guard

    run_license_guard()
    app()
# Script entrypoint for ``python -m src.cli_analyze``. ``main()`` runs the
# license guard and then the Typer app. The app is created with
# ``no_args_is_help=True`` (intended to print help when invoked without
# arguments) and registers a single command, so the file argument is given
# directly at the top level: ``python -m src.cli_analyze input.csv``.
if __name__ == "__main__":
    main()