# datatools-dev/src/cli_format.py

"""CLI for the DataTools Standardize Formats tool (script 03).

Usage:
    python -m src.cli_format input.csv \\
        --types 'phone:phone,price:currency,name:name' \\
        --apply

    # 1 GB international file with per-row country column:
    python -m src.cli_format huge.csv \\
        --types 'phone:phone,address:address,price:currency' \\
        --phone-country country --address-country country \\
        --preserve-code --audit-max 50000 --apply

The CLI auto-streams (chunked read/write, bounded RAM) when the input
exceeds ~100 MB. Force or disable with ``--stream`` / ``--no-stream``.
"""

from __future__ import annotations

import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import typer
from loguru import logger

# Typer application object for this module. The help text is written as a
# sequence of paragraphs that are joined with blank lines.
app = typer.Typer(
    name="format",
    help="\n\n".join(
        (
            "Standardize dates, phones, currencies, names, and addresses "
            "in CSV / Excel files.",
            "Default behaviour: preview the changes (no file written). "
            "Add --apply to write output.",
            "For 1 GB+ international files, the CLI auto-streams in 50,000-row "
            "chunks so memory stays bounded. Use --phone-country / "
            "--address-country to point at a per-row ISO-3166 column for "
            "country-aware parsing.",
            "Examples:",
            "  # Preview\n"
            "  python -m src.cli_format data.csv --types 'phone:phone,price:currency'",
            "  # International file with per-row country\n"
            "  python -m src.cli_format leads.csv --types 'phone:phone' "
            "--phone-country country --apply",
            "  # Force streaming with smaller chunks for tight memory\n"
            "  python -m src.cli_format huge.csv --types 'phone:phone' "
            "--stream --chunk-size 10000 --apply\n",
        )
    ),
    add_completion=False,
    no_args_is_help=True,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru sinks and return the path of the new log file.

    Creates ``log_dir`` (including parents) when missing, removes every
    previously registered loguru sink, then registers two sinks:

    * ``sys.stderr`` at WARNING level, message text only;
    * ``<log_dir>/format_<YYYYMMDD_HHMMSS>.log`` at DEBUG level with a
      timestamp / level / message layout.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / f"format_{datetime.now():%Y%m%d_%H%M%S}.log"

    # Drop whatever sinks are currently installed so only the two below remain.
    logger.remove()
    logger.add(sys.stderr, format="{message}", level="WARNING")
    logger.add(
        str(log_path),
        format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        level="DEBUG",
    )
    return log_path


def _parse_types(raw: Optional[str]) -> dict[str, str]:
    """Parse ``col:phone,col:date`` into a dict."""
    if not raw:
        return {}
    out: dict[str, str] = {}
    for piece in raw.split(","):
        piece = piece.strip()
        if not piece:
            continue
        if ":" not in piece:
            raise typer.BadParameter(
                f"Invalid --types piece: {piece!r}. "
                f"Expected 'col:type[,col:type...]' "
                f"where type is one of: date, phone, currency, name, address, email, boolean."
            )
        col, ft = piece.split(":", 1)
        out[col.strip()] = ft.strip()
    return out


_AUTO_STREAM_THRESHOLD = 100 * 1024 * 1024  # 100 MB


# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------

@app.command()
def standardize(
    input_file: str = typer.Argument(..., help="CSV or TSV file path."),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_standardized.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only a preview is shown.",
    ),
    types: Optional[str] = typer.Option(
        None, "--types",
        help="Per-column types: 'col:type[,col:type...]'. "
             "Types: date, phone, currency, name, address, email, boolean.",
    ),
    preset: Optional[str] = typer.Option(
        None, "--preset",
        help="Named preset (e.g. 'us', 'uk', 'eu', 'jp'). Layered before --types.",
    ),
    phone_country: Optional[str] = typer.Option(
        None, "--phone-country",
        help="Column name carrying the per-row ISO-3166 country code for phones.",
    ),
    address_country: Optional[str] = typer.Option(
        None, "--address-country",
        help="Column name carrying the per-row country code for addresses.",
    ),
    phone_region: str = typer.Option(
        "US", "--phone-region",
        help="Default phone region when no per-row column is set. ISO-3166 alpha-2.",
    ),
    phone_format: str = typer.Option(
        "E164", "--phone-format",
        help="Phone output format: E164 | INTERNATIONAL | NATIONAL | RFC3966 | DIGITS.",
    ),
    preserve_code: bool = typer.Option(
        False, "--preserve-code",
        help="Currency: emit ISO-4217 prefix (e.g. 'USD 1500.00').",
    ),
    decimals: int = typer.Option(
        2, "--decimals",
        help="Currency decimal precision.",
    ),
    audit_max: int = typer.Option(
        10_000, "--audit-max",
        help="Cap the change-audit at N rows (0 = no audit, -1 = unbounded).",
    ),
    stream: Optional[bool] = typer.Option(
        None, "--stream/--no-stream",
        help="Force streaming (chunked, bounded RAM). Auto-on for inputs > 100 MB.",
    ),
    chunk_size: int = typer.Option(
        50_000, "--chunk-size",
        help="Rows per chunk in streaming mode.",
    ),
    cache_size: int = typer.Option(
        262_144, "--cache-size",
        help="Per-column LRU-cache size (set 0 to disable).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    delimiter: Optional[str] = typer.Option(
        None, "--delimiter",
        help="Override auto-detected delimiter.",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config.",
    ),
):
    """Standardize formats across a CSV / TSV. Auto-streams for large inputs."""
    # NOTE: the docstring above is deliberately left as a single line because
    # Typer can display a command's docstring as its help text (TODO confirm
    # before expanding it). Maintainer notes therefore live in comments.
    #
    # Flow:
    #   1. Validate the input path and set up logging under ./logs.
    #   2. Build StandardizeOptions from --config, else --preset, else defaults
    #      (--config wins over --preset: they are an if/elif pair), then apply
    #      --types and the remaining CLI options on top.
    #   3. Pick streaming vs in-memory mode (explicit flag, else file size).
    #   4. Streaming: requires --apply; processes the file chunk by chunk and
    #      prints a summary.
    #      In-memory: always prints a summary / examples; writes the output
    #      and the change audit only with --apply.
    #
    # Every handled failure prints "Error: ..." to stderr and exits with
    # status 1 via typer.Exit.
    from src.core.format_standardize import (
        FieldType,
        StandardizeOptions,
        standardize_dataframe,
        standardize_file,
    )
    from src.core.io import read_file, detect_encoding, detect_delimiter
    import pandas as pd

    inp = Path(input_file)
    if not inp.exists():
        typer.echo(f"Error: File not found: {inp}", err=True)
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options
    if config:
        cp = Path(config)
        if not cp.exists():
            typer.echo(f"Error: Config file not found: {cp}", err=True)
            raise typer.Exit(1)
        options = StandardizeOptions.from_file(cp)
    elif preset:
        try:
            options = StandardizeOptions.from_preset(preset)
        except ValueError as e:
            typer.echo(f"Error: {e}", err=True)
            raise typer.Exit(1)
    else:
        options = StandardizeOptions()

    # --types replaces (does not merge with) any column_types that came from
    # the config / preset.
    parsed_types = _parse_types(types)
    if parsed_types:
        try:
            options.column_types = {
                col: FieldType(t) for col, t in parsed_types.items()
            }
        except ValueError as e:
            typer.echo(
                f"Error: {e}. Valid types: "
                + ", ".join(sorted(t.value for t in FieldType)),
                err=True,
            )
            raise typer.Exit(1)

    if not options.column_types:
        typer.echo(
            "Error: no column types declared. Pass --types 'col:type,...' "
            "or --preset / --config with a column_types map.",
            err=True,
        )
        raise typer.Exit(1)

    if phone_country:
        options.phone_country_column = phone_country
    if address_country:
        options.address_country_column = address_country
    # NOTE(review): the assignments below run unconditionally, so the CLI
    # defaults (e.g. --phone-region US, --decimals 2, --audit-max 10000)
    # replace whatever a --config file or --preset stored in these fields even
    # when the user did not pass the flag. Confirm whether that is intended.
    options.phone_region = phone_region
    options.phone_format = phone_format  # type: ignore[assignment]
    options.currency_preserve_code = preserve_code
    options.currency_decimals = decimals
    # A negative --audit-max means "unbounded", stored as None.
    options.audit_max_rows = (
        None if audit_max < 0 else audit_max
    )
    options.cache_size = cache_size

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Decide streaming mode: an explicit --stream/--no-stream wins; otherwise
    # stream only when the file is larger than the auto-stream threshold.
    file_size = inp.stat().st_size
    use_stream = stream if stream is not None else file_size > _AUTO_STREAM_THRESHOLD

    enc = encoding_override or detect_encoding(inp)
    delim = delimiter or detect_delimiter(inp, enc)

    out_path = Path(output) if output else inp.parent / f"{inp.stem}_standardized.csv"

    typer.echo(
        f"Reading {inp.name} ({file_size/1024/1024:.1f} MB; "
        f"{'streaming' if use_stream else 'in-memory'} mode)..."
    )

    if use_stream:
        if not apply:
            # When streaming was auto-selected by file size there is no
            # --stream flag to remove; point at --no-stream instead.
            if stream is None:
                hint = "pass --no-stream to preview in memory"
            else:
                hint = "remove --stream to preview a sample"
            typer.echo(
                "\nStreaming mode does not produce a preview. "
                f"Re-run with --apply to write output, or {hint}."
            )
            raise typer.Exit(0)

        if chunk_size < 1:
            typer.echo("Error: --chunk-size must be a positive integer.", err=True)
            raise typer.Exit(1)

        import time as _time

        # One-element list so the nested callback can update the timestamp.
        last_log = [0.0]

        def _progress(rows, chunks):
            # Throttle progress output to at most one line per second.
            now = _time.perf_counter()
            if now - last_log[0] < 1.0:
                return
            last_log[0] = now
            typer.echo(f"  ... {rows:,} rows ({chunks} chunks)")

        t0 = _time.perf_counter()
        # Same error handling as the in-memory path: report and exit with
        # status 1 instead of surfacing a traceback.
        try:
            res = standardize_file(
                inp, out_path, options,
                chunk_size=chunk_size,
                progress_callback=_progress,
                encoding=enc,
                delimiter=delim,
            )
        except (ValueError, OSError) as e:
            typer.echo(f"Error: {e}", err=True)
            raise typer.Exit(1)
        elapsed = _time.perf_counter() - t0
        typer.echo(f"\n{'─'*60}")
        typer.echo(f"  File:           {inp.name}")
        typer.echo(f"  Rows:           {res.rows_processed:,}")
        typer.echo(f"  Chunks:         {res.chunks_processed}")
        typer.echo(f"  Cells changed:  {res.cells_changed:,}")
        typer.echo(
            f"  Cells unparseable: {res.cells_unparseable:,} / {res.cells_total:,}"
        )
        # max(...) guards against a zero elapsed time.
        typer.echo(
            f"  Throughput:     {res.rows_processed / max(elapsed, 1e-9):,.0f} rows/sec"
        )
        typer.echo(f"  Elapsed:        {elapsed:.2f}s")
        typer.echo(f"{'─'*60}")
        typer.echo(f"\nStandardized:   {res.output_path}")
        if res.audit_path:
            typer.echo(f"Changes audit:  {res.audit_path}")
        typer.echo(f"Log:            {log_path}")
        return

    # In-memory path
    try:
        df = read_file(
            inp, encoding=enc, delimiter=delim, repair=False,
        )
        # read_file may hand back something other than a single DataFrame;
        # in that case it is treated as an iterable of frames and concatenated.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)

    typer.echo(f"  {len(df):,} rows, {len(df.columns)} columns")

    typer.echo("Standardizing...")
    try:
        result = standardize_dataframe(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    pct = (result.cells_changed / result.cells_total * 100) if result.cells_total else 0
    typer.echo(f"\n{'─'*60}")
    typer.echo(f"  File:                {inp.name}")
    typer.echo(f"  Columns processed:   {len(result.columns_processed)}")
    typer.echo(f"  Cells scanned:       {result.cells_total:,}")
    typer.echo(f"  Cells changed:       {result.cells_changed:,} ({pct:.1f}%)")
    typer.echo(f"  Cells unparseable:   {result.cells_unparseable:,}")
    typer.echo(f"{'─'*60}")
    if result.cells_changed and not result.changes.empty:
        typer.echo("\nFirst examples:")
        for _, row in result.changes.head(5).iterrows():
            # Truncate long values so each example stays on one line.
            old = repr(row["old"])[:40]
            new = repr(row["new"])[:40]
            typer.echo(
                f"  Row {row['row'] + 1}, {row['column']} "
                f"({row['field_type']}): {old} → {new}"
            )

    if apply:
        from src.core.io import write_file
        try:
            write_file(result.standardized_df, out_path)
        except OSError as e:
            typer.echo(f"Error writing {out_path}: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"\nStandardized:   {out_path}")
        if not result.changes.empty:
            # The audit file is placed next to the input, not the output.
            audit_path = inp.parent / f"{inp.stem}_changes.csv"
            try:
                write_file(result.changes, audit_path)
            except OSError as e:
                typer.echo(f"Error writing {audit_path}: {e}", err=True)
                raise typer.Exit(1)
            typer.echo(f"Changes audit:  {audit_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output.")

    typer.echo(f"Log: {log_path}")


def main():
    """Call the license guard for the format-standardizer feature, then run the CLI.

    ``guard`` is invoked with the value of ``FeatureFlag.FORMAT_STANDARDIZER``
    before the Typer app starts; what happens when the guard rejects the call
    is defined in ``src.cli_license_guard``.
    """
    from src.cli_license_guard import guard
    from src.license import FeatureFlag

    required_feature = FeatureFlag.FORMAT_STANDARDIZER.value
    guard(feature=required_feature)
    app()


# Run the CLI (via main) only when this module is executed as the main
# program, e.g. ``python -m src.cli_format``; importing it does nothing extra.
if __name__ == "__main__":
    main()