datatools-dev/src/cli_column_map.py

"""CLI for the DataTools Column Mapper (script 05).

Usage:
    python -m src.cli_column_map input.csv                              # auto-mapping preview
    python -m src.cli_column_map input.csv --schema target.json --apply
    python -m src.cli_column_map input.csv --rename "First Name=first_name,Email=email" --apply
    python -m src.cli_column_map input.csv --schema target.json --preset strict-schema --apply
    python -m src.cli_column_map input.csv --coerce-col "age:integer,joined:date" --apply
    python -m src.cli_column_map --help
"""

from __future__ import annotations

import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import typer
from loguru import logger

# Typer application object; the command defined further down registers itself
# on it via ``@app.command()``. The help text is assembled paragraph by
# paragraph; paragraphs are separated by a blank line in the rendered string.
app = typer.Typer(
    name="column-map",
    help="\n\n".join(
        [
            "Rename columns, enforce a target schema, and coerce types in CSV / Excel files.",
            "Default behaviour: preview the mapping (no file written). Add --apply "
            "to write the mapped output and audit log.",
            "Examples:",
            "  # Show what auto-mapping would do (no schema → identity)\n"
            "  python -m src.cli_column_map vendor.csv",
            "  # Map against a target JSON schema with strict drop / coerce / reorder\n"
            "  python -m src.cli_column_map vendor.csv --schema target.json "
            "--preset strict-schema --apply",
            "  # Hand-rolled rename without a schema\n"
            "  python -m src.cli_column_map data.csv "
            "--rename 'First Name=first_name,Last Name=last_name' --apply",
            "  # Coerce specific columns inline\n"
            "  python -m src.cli_column_map data.csv "
            "--coerce-col 'age:integer,joined:date' --apply\n",
        ]
    ),
    add_completion=False,
    no_args_is_help=True,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr and at a fresh, timestamped log file.

    Creates ``log_dir`` (including parents) when it does not exist, removes
    every handler currently registered on the loguru logger, then installs
    two sinks: stderr at WARNING level with the bare message, and a file at
    DEBUG level with a timestamp / level / message layout.

    Args:
        log_dir: Directory that will contain the log file.

    Returns:
        Path of the log file, ``<log_dir>/column_map_<YYYYmmdd_HHMMSS>.log``.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir / ("column_map_" + stamp + ".log")

    # Start from a clean slate so only the two sinks below receive records.
    logger.remove()
    sinks = (
        (sys.stderr, "WARNING", "{message}"),
        (
            str(destination),
            "DEBUG",
            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        ),
    )
    for sink, level, layout in sinks:
        logger.add(sink, level=level, format=layout)
    return destination


def _parse_pairs(raw: Optional[str], separator: str = ",") -> dict[str, str]:
    """Parse ``a=1,b=2`` into a dict."""
    if not raw:
        return {}
    out: dict[str, str] = {}
    for piece in raw.split(separator):
        piece = piece.strip()
        if not piece:
            continue
        if "=" not in piece:
            raise typer.BadParameter(
                f"Invalid pair: {piece!r}. Expected 'key=value[,key=value...]'."
            )
        k, v = piece.split("=", 1)
        out[k.strip()] = v.strip()
    return out


def _parse_coerce(raw: Optional[str]) -> dict[str, str]:
    """Parse ``age:integer,joined:date`` into a dict."""
    if not raw:
        return {}
    out: dict[str, str] = {}
    for piece in raw.split(","):
        piece = piece.strip()
        if not piece:
            continue
        if ":" not in piece:
            raise typer.BadParameter(
                f"Invalid --coerce-col piece: {piece!r}. "
                f"Expected 'col:dtype[,col:dtype...]'."
            )
        col, dtype = piece.split(":", 1)
        out[col.strip()] = dtype.strip()
    return out


# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------

@app.command()
def map_(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_mapped.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the mapping plan is shown.",
    ),
    preset: str = typer.Option(
        "rename-only", "--preset",
        help="Preset: rename-only, strict-schema, or lenient-schema.",
    ),
    schema: Optional[str] = typer.Option(
        None, "--schema",
        help="Path to a target schema JSON file (TargetSchema format).",
    ),
    rename: Optional[str] = typer.Option(
        None, "--rename",
        help="Explicit rename pairs: 'src=tgt[,src=tgt...]' (overrides auto-inference).",
    ),
    coerce_col: Optional[str] = typer.Option(
        None, "--coerce-col",
        help=(
            "Inline type coercion (no schema needed): 'col:dtype[,col:dtype...]'. "
            "Valid dtypes: string, integer, float, boolean, date, datetime, category, auto."
        ),
    ),
    unmapped: Optional[str] = typer.Option(
        None, "--unmapped",
        help="Strategy for unmapped source columns: keep | drop | error.",
    ),
    threshold: Optional[float] = typer.Option(
        None, "--threshold",
        help="Fuzzy-match threshold for auto-inference (0.0..1.0). Default 0.6.",
    ),
    no_auto: bool = typer.Option(
        False, "--no-auto",
        help="Disable auto-inference; honour only explicit --rename pairs.",
    ),
    no_coerce: bool = typer.Option(
        False, "--no-coerce",
        help="Disable type coercion (overrides preset).",
    ),
    no_reorder: bool = typer.Option(
        False, "--no-reorder",
        help="Disable schema-order reorder (overrides preset).",
    ),
    no_required: bool = typer.Option(
        False, "--no-required",
        help="Don't enforce required-target presence (overrides preset).",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Map source columns to a target schema; rename, coerce, drop, reorder."""
    # NOTE: Typer uses the docstring above as the command's help text, so it is
    # deliberately kept to one line; maintainer notes live in comments.
    #
    # Flow:
    #   1. validate arguments that can be checked without touching any file
    #   2. build MapOptions from --config or --preset, then layer CLI overrides
    #   3. read the input, run map_columns(), print the plan
    #   4. with --apply: write the mapped file and a JSON audit of the mapping
    # Failures detected in this function print a message to stderr and exit
    # with status 1. Malformed --rename / --coerce-col values raise
    # typer.BadParameter from the parsing helpers instead.
    #
    # Heavy / project imports are function-local so they only run when the
    # command is actually invoked.
    from src.core.io import read_file, write_file
    from src.core.column_mapper import (
        MapOptions,
        PRESETS,
        TargetField,
        TargetSchema,
        map_columns,
    )
    import pandas as pd

    # --- 1. Argument validation -------------------------------------------
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)

    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    # The accepted values mirror the --unmapped help text above.
    unmapped_choices = ("keep", "drop", "error")
    if unmapped and unmapped not in unmapped_choices:
        typer.echo(
            f"Error: Unknown --unmapped strategy '{unmapped}'. "
            f"Choose from: {', '.join(unmapped_choices)}.",
            err=True,
        )
        raise typer.Exit(1)

    # Range taken from the --threshold help text; the comparison also rejects NaN.
    if threshold is not None and not (0.0 <= threshold <= 1.0):
        typer.echo(
            f"Error: --threshold must be between 0.0 and 1.0 (got {threshold}).",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # --- 2. Build options ---------------------------------------------------
    # A config file, when given, replaces the preset as the starting point.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MapOptions.from_file(cfg_path)
    else:
        options = MapOptions.from_preset(preset)

    if schema:
        sp = Path(schema)
        if not sp.exists():
            typer.echo(f"Error: Schema file not found: {sp}", err=True)
            raise typer.Exit(1)
        options.schema = TargetSchema.from_file(sp)
    if rename:
        # Explicit CLI pairs win over any mapping already present in options.
        options.mapping = {**(options.mapping or {}), **_parse_pairs(rename)}
    if unmapped:
        options.unmapped = unmapped  # type: ignore[assignment]
    if threshold is not None:
        options.fuzzy_threshold = threshold
    if no_auto:
        options.auto_infer = False
    if no_coerce:
        options.coerce_types = False
    if no_reorder:
        options.reorder_to_schema = False
    if no_required:
        options.enforce_required = False

    # Inline coercion (no schema): build a tiny one-field-per-column schema.
    inline_coerce = _parse_coerce(coerce_col)
    if inline_coerce:
        if options.schema is None:
            options.schema = TargetSchema(fields=[
                TargetField(name=col, dtype=dt)  # type: ignore[arg-type]
                for col, dt in inline_coerce.items()
            ])
            options.coerce_types = True
        else:
            # Previously dropped without a word; tell the user it had no effect.
            typer.echo(
                "Warning: --coerce-col is ignored because a target schema is "
                "already set.",
                err=True,
            )

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # --- 3. Read, map, report ----------------------------------------------
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A purely numeric --sheet value is treated as a sheet index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # read_file may hand back an iterable of frames rather than a single
        # DataFrame; stitch the pieces together in that case.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1) from e

    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")

    typer.echo("Mapping columns...")
    try:
        result = map_columns(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e

    _print_results(result, input_path, options)

    # --- 4. Write output and audit -----------------------------------------
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_mapped.csv"
        try:
            write_file(result.mapped_df, out_path)
        except (ValueError, OSError) as e:
            typer.echo(f"Error writing output: {e}", err=True)
            raise typer.Exit(1) from e
        typer.echo(f"\nMapped file:    {out_path}")

        # Audit: write the resolved mapping as JSON next to the output. The
        # file name keeps the input stem, so with the default output location
        # the path is unchanged.
        audit_path = out_path.parent / f"{stem}_mapping.json"
        audit = {
            "mapping": result.mapping,
            "inferred_pairs": result.inferred_pairs,
            "columns_renamed": result.columns_renamed,
            "columns_dropped": result.columns_dropped,
            "columns_added": result.columns_added,
            "coercion_failures": result.coercion_failures,
            "unmapped_kept": result.unmapped_kept,
            "missing_required_targets": result.missing_required_targets,
        }
        try:
            audit_path.write_text(
                json.dumps(audit, indent=2, default=str),
                encoding="utf-8",
            )
        except OSError as e:
            typer.echo(f"Error writing mapping audit: {e}", err=True)
            raise typer.Exit(1) from e
        typer.echo(f"Mapping audit:  {audit_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the mapped output.")

    typer.echo(f"Log: {log_path}")

# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------

def _print_results(result, input_path: Path, options) -> None:
    """Echo a human-readable summary of a mapping result.

    Prints a ruled block of headline figures, followed by detail sections for
    the mapping, dropped columns, added columns, coercion failures and missing
    required targets. Each detail section is printed only when the
    corresponding attribute of ``result`` is non-empty.

    Args:
        result: Object exposing the attributes read below (``mapping``,
            ``inferred_pairs``, ``columns_renamed``, ``columns_dropped``,
            ``columns_added``, ``unmapped_kept``, ``coercion_failures``,
            ``missing_required_targets``).
        input_path: Source file; only its name is shown.
        options: Accepted for signature compatibility; not read here.
    """
    echo = typer.echo
    rule = "─" * 60

    # Headline figures.
    echo(f"\n{rule}")
    echo(f"  File:                 {input_path.name}")
    echo(f"  Columns renamed:      {result.columns_renamed}")
    echo(f"  Columns dropped:      {len(result.columns_dropped)}")
    echo(f"  Columns added:        {len(result.columns_added)}")
    echo(f"  Unmapped kept:        {len(result.unmapped_kept)}")
    failed_cells = sum(result.coercion_failures.values())
    failed_columns = len(result.coercion_failures)
    echo(
        f"  Coercion failures:    {failed_cells} cells across "
        f"{failed_columns} column(s)"
    )
    echo(rule)

    # Per-column mapping; identity mappings get "≡", auto-inferred ones a tag.
    if result.mapping:
        echo("\nMapping:")
        for source, target in result.mapping.items():
            suffix = " (auto)" if source in result.inferred_pairs else ""
            arrow = "≡" if source == target else "→"
            echo(f"  {source!r} {arrow} {target!r}{suffix}")

    if result.columns_dropped:
        echo(f"\nDropped: {result.columns_dropped}")

    if result.columns_added:
        echo(f"\nAdded (defaults): {result.columns_added}")

    if result.coercion_failures:
        echo("\nCoercion failures:")
        for column, count in result.coercion_failures.items():
            echo(f"  {column}: {count} row(s) could not be coerced")

    if result.missing_required_targets:
        echo(f"\nMissing required targets: {result.missing_required_targets}")


# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------

def main():
    """Run the licence guard for the Column Mapper feature, then the Typer app."""
    # Function-local imports: they execute only when main() is called.
    from src.cli_license_guard import guard
    from src.license import FeatureFlag

    feature_name = FeatureFlag.COLUMN_MAPPER.value
    # NOTE(review): presumably guard() stops execution when the feature is not
    # licensed — confirm against src.cli_license_guard.
    guard(feature=feature_name)
    app()


if __name__ == "__main__":
    main()