# datatools-dev/src/cli_missing.py

"""CLI for the DataTools Missing Value Handler (script 04).

Usage:
    python -m src.cli_missing input.csv                              # profile only
    python -m src.cli_missing input.csv --apply                      # detect-only + write
    python -m src.cli_missing input.csv --preset safe-fill --apply
    python -m src.cli_missing input.csv --strategy median --apply
    python -m src.cli_missing input.csv --strategy drop_row --apply
    python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
    python -m src.cli_missing input.csv --strategy median --columns age,score --apply
    python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
    python -m src.cli_missing --help
"""

from __future__ import annotations

import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

import typer
from loguru import logger

# Typer application object; ``main()`` invokes it as the CLI entry point.
# The help text is handed to Click verbatim — Click performs no
# %-interpolation (unlike argparse) — so a literal percent sign is written
# as a single "%".
# NOTE(review): with only one registered command and no callback, Typer may
# build a plain command whose help comes from the command's docstring rather
# than from this text — confirm that this help is actually displayed.
app = typer.Typer(
    name="missing",
    help=(
        "Detect and handle missing values in CSV / Excel files.\n\n"
        "Default behaviour: profile only (no file written). Add --apply to "
        "write the handled output and audit log.\n\n"
        "Strategies:\n"
        "  none, drop_row, drop_col, drop_both,\n"
        "  mean, median, mode, constant,\n"
        "  ffill, bfill, interpolate\n\n"
        "Examples:\n\n"
        "  # Profile missingness without writing anything\n"
        "  python -m src.cli_missing customers.csv\n\n"
        "  # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
        "  python -m src.cli_missing customers.csv --apply\n\n"
        "  # Safe fill: numeric → median, categorical → mode\n"
        "  python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
        "  # Drop rows missing >50% of selected columns\n"
        "  python -m src.cli_missing customers.csv --strategy drop_row "
        "--row-threshold 0.5 --apply\n\n"
        "  # Per-column strategies\n"
        "  python -m src.cli_missing customers.csv "
        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr and at a fresh timestamped log file.

    ``log_dir`` is created if it does not exist. All existing loguru sinks
    are removed first; then a WARNING-level stderr sink (message only) and a
    DEBUG-level file sink (timestamp | level | message) are registered.

    Returns:
        Path of the log file, named ``missing_<YYYYmmdd_HHMMSS>.log``.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = log_dir / f"missing_{stamp}.log"

    # Start from a clean slate so repeated calls do not stack sinks.
    logger.remove()

    # Terse console output: warnings and above, message text only.
    logger.add(sys.stderr, format="{message}", level="WARNING")

    # Full detail goes to the file.
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
    logger.add(str(target), format=file_format, level="DEBUG")

    return target


def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
    if raw is None:
        return None
    return [c.strip() for c in raw.split(",") if c.strip()]


def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
    """Parse ``--col-strategy 'age:median,city:mode'`` into a dict."""
    if not raw:
        return {}
    out: dict[str, str] = {}
    for piece in raw.split(","):
        piece = piece.strip()
        if not piece:
            continue
        if ":" not in piece:
            raise typer.BadParameter(
                f"Invalid --col-strategy piece: '{piece}'. "
                f"Expected 'col:strategy[,col:strategy...]'."
            )
        col, strat = piece.split(":", 1)
        out[col.strip()] = strat.strip()
    return out


# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------

# Single CLI command. Flow: validate the input path and preset, configure
# logging, build MissingOptions (config file or preset, then CLI overrides),
# read the input, run handle_missing, print the report, and — only with
# --apply — write the handled file plus a (possibly capped) change audit.
# Exits with code 1 on a missing input/config file, unknown preset, read
# error, processing error (ValueError/OSError) or write error.
# The docstring is deliberately left as a single line: Typer uses a command's
# docstring as its --help text, so maintainer notes live in comments instead.
@app.command()
def handle(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_missing.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the profile is shown.",
    ),
    preset: str = typer.Option(
        "detect-only", "--preset",
        help="Preset: detect-only, safe-fill, or drop-incomplete.",
    ),
    strategy: Optional[str] = typer.Option(
        None, "--strategy",
        help=(
            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
            "mean, median, mode, constant, ffill, bfill, interpolate."
        ),
    ),
    col_strategy: Optional[str] = typer.Option(
        None, "--col-strategy",
        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
    ),
    fill_value: Optional[str] = typer.Option(
        None, "--fill-value",
        help="Constant fill value (used with --strategy constant).",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to handle (default: all columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip.",
    ),
    sentinels: Optional[str] = typer.Option(
        None, "--sentinels",
        help=(
            "Comma-separated extra sentinels to treat as missing "
            "(merged with the built-in defaults)."
        ),
    ),
    no_sentinels: bool = typer.Option(
        False, "--no-sentinels",
        help="Disable disguised-null standardization entirely.",
    ),
    # Thresholds default to None ("not given") so that an omitted flag never
    # overwrites a value coming from --config or from the preset.
    row_threshold: Optional[float] = typer.Option(
        None, "--row-threshold",
        help=(
            "For drop_row: drop rows whose missing fraction across selected "
            "columns is STRICTLY GREATER than this value (0.0..1.0). "
            "1.0 = never drop. Use 0.0 to drop any row with any "
            "missing; 0.5 to drop rows >50% missing. "
            "If omitted, the preset / config value is kept."
        ),
    ),
    col_threshold: Optional[float] = typer.Option(
        None, "--col-threshold",
        help=(
            "For drop_col: drop columns whose missing fraction is strictly "
            "greater than this value. 1.0 = never drop. "
            "If omitted, the preset / config value is kept."
        ),
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every change to the audit CSV (default caps to first 1000).",
    ),
):
    """Detect and handle missing values."""
    # Heavy / project imports are deferred until the command actually runs.
    from src.core.io import read_file, write_file
    from src.core.missing import MissingOptions, PRESETS, handle_missing
    import pandas as pd

    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)

    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options: a config file, when given, replaces the preset as the
    # base; explicit CLI flags below are then layered on top of either.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MissingOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = MissingOptions.from_preset(preset)

    if strategy:
        options.strategy = strategy  # type: ignore[assignment]
    if col_strategy:
        options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
    if fill_value is not None:
        # An explicit empty string ('') is a legitimate fill value.
        options.fill_value = fill_value
    cols_list = _split_csv_arg(columns)
    if cols_list is not None:
        options.columns = cols_list
    skip_list = _split_csv_arg(skip)
    if skip_list:
        options.skip_columns = skip_list
    extra = _split_csv_arg(sentinels)
    if extra:
        # Merge with the existing sentinels, de-duplicating in order.
        options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
    if no_sentinels:
        options.standardize_sentinels = False
    # Apply thresholds only when supplied on the command line; assigning them
    # unconditionally would clobber the values loaded from config / preset.
    if row_threshold is not None:
        options.row_drop_threshold = row_threshold
    if col_threshold is not None:
        options.col_drop_threshold = col_threshold

    # Saved after all overrides so the file reflects the effective options.
    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        sheet_arg: str | int | None = None
        if sheet is not None:
            # A numeric --sheet is treated as an index, anything else as a name.
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        if not isinstance(df, pd.DataFrame):
            # NOTE(review): presumably read_file can yield an iterable of
            # DataFrame chunks here — confirm against src.core.io.
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        # Top-level boundary: any reader failure becomes a clean CLI error.
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)

    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")

    # Run
    typer.echo("Profiling missingness...")
    try:
        result = handle_missing(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    _print_results(result, input_path, options)

    # Write
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
        # Report write failures the same way as read failures instead of
        # letting a traceback escape.
        try:
            write_file(result.handled_df, out_path)
        except (OSError, ValueError) as e:
            typer.echo(f"Error writing {out_path}: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"\nHandled file:    {out_path}")

        if not result.changes.empty:
            # The audit file is written next to the input file.
            changes_path = input_path.parent / f"{stem}_missing_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            try:
                write_file(audit_df, changes_path)
            except (OSError, ValueError) as e:
                typer.echo(f"Error writing {changes_path}: {e}", err=True)
                raise typer.Exit(1)
            typer.echo(f"Changes audit:   {changes_path}")
    else:
        typer.echo(
            "\nThis was a profile only. Add --apply to write the handled output."
        )

    typer.echo(f"Log: {log_path}")


# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------

def _print_results(result, input_path: Path, options) -> None:
    """Print the missingness report for one run to stdout.

    Sections, in order: overall summary from ``result.profile_before``, one
    line per column of the profile table, action counters, the strategy used
    per column (if any), and up to five example changes.

    Args:
        result: Object returned by ``handle_missing``; the attributes read
            here are ``profile_before``, ``columns_processed``,
            ``sentinels_standardized``, ``cells_filled``, ``rows_dropped``,
            ``columns_dropped``, ``strategy_per_column`` and ``changes``.
        input_path: Input file; only its name is shown.
        options: Accepted for call compatibility; not used by this function.
    """
    before = result.profile_before
    rule = "─" * 60

    typer.echo(f"\n{rule}")
    typer.echo(f"  File:                 {input_path.name}")
    typer.echo(f"  Rows:                 {before.rows_total}")
    typer.echo(f"  Columns processed:    {len(result.columns_processed)}")
    typer.echo(
        f"  Cells missing:        "
        f"{before.cells_missing} / {before.cells_total}"
        f" ({before.cells_missing_pct:.1f}%)"
    )
    typer.echo(
        f"  Rows w/ any missing:  "
        f"{before.rows_with_any_missing} "
        f"(complete: {before.rows_complete})"
    )
    typer.echo(rule)

    typer.echo("\nPer-column profile:")
    profile_df = before.to_dataframe()
    for _, row in profile_df.iterrows():
        # The original chose the line prefix with a conditional whose two
        # branches were the same two-space string; it is a constant here.
        marker = "  "
        # str() first: a width spec such as ``:<24`` raises TypeError on
        # objects without string-style __format__ (e.g. dtype objects or
        # tuple column labels). For str / int values the output is unchanged.
        column_label = str(row["column"])
        dtype_label = str(row["dtype"])
        sentinel_note = (
            f"  top sentinel: {row['top_sentinel']!r} ×{row['top_sentinel_count']}"
            if row["top_sentinel_count"] else ""
        )
        typer.echo(
            f"{marker}{column_label:<24} {dtype_label:<10} "
            f"missing={row['missing']:<6} ({row['missing_pct']:>5.1f}%)"
            + sentinel_note
        )

    typer.echo("\nActions:")
    typer.echo(f"  Sentinels standardized to NaN:  {result.sentinels_standardized}")
    typer.echo(f"  Cells filled:                   {result.cells_filled}")
    typer.echo(f"  Rows dropped:                   {result.rows_dropped}")
    # map(str, ...) keeps the join from failing on non-string column labels.
    dropped_note = (
        f" ({', '.join(map(str, result.columns_dropped))})"
        if result.columns_dropped else ""
    )
    typer.echo(
        f"  Columns dropped:                {len(result.columns_dropped)}"
        + dropped_note
    )

    if result.strategy_per_column:
        typer.echo("\nStrategy per column:")
        for col, strat in result.strategy_per_column.items():
            typer.echo(f"  {col}: {strat}")

    if not result.changes.empty:
        typer.echo("\nFirst examples:")
        for _, row in result.changes.head(5).iterrows():
            # Truncate long values so each example stays on one line.
            old = repr(row["old"])[:40]
            new = repr(row["new"])[:40]
            # Row index -1 is rendered as a dash (presumably a change not tied
            # to a single row — confirm against src.core.missing); other
            # indices are shown 1-based.
            row_label = "—" if row["row"] == -1 else f"Row {row['row'] + 1}"
            typer.echo(
                f"  {row_label}, {row['column']}: {old} → {new} "
                f"[{row['action']}]"
            )


# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------

def main() -> None:
    """Entry point: invoke the Typer application (parses ``sys.argv``)."""
    app()


# Allow running the module directly (``python -m src.cli_missing ...``).
if __name__ == "__main__":
    main()