"""CLI for the DataTools Missing Value Handler (script 04). Usage: python -m src.cli_missing input.csv # profile only python -m src.cli_missing input.csv --apply # detect-only + write python -m src.cli_missing input.csv --preset safe-fill --apply python -m src.cli_missing input.csv --strategy median --apply python -m src.cli_missing input.csv --strategy drop_row --apply python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply python -m src.cli_missing input.csv --strategy median --columns age,score --apply python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply python -m src.cli_missing --help """ from __future__ import annotations import sys from datetime import datetime from pathlib import Path from typing import Optional import typer from loguru import logger app = typer.Typer( name="missing", help=( "Detect and handle missing values in CSV / Excel files.\n\n" "Default behaviour: profile only (no file written). Add --apply to " "write the handled output and audit log.\n\n" "Strategies:\n" " none, drop_row, drop_col, drop_both,\n" " mean, median, mode, constant,\n" " ffill, bfill, interpolate\n\n" "Examples:\n\n" " # Profile missingness without writing anything\n" " python -m src.cli_missing customers.csv\n\n" " # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n" " python -m src.cli_missing customers.csv --apply\n\n" " # Safe fill: numeric → median, categorical → mode\n" " python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n" " # Drop rows missing >50%% of selected columns\n" " python -m src.cli_missing customers.csv --strategy drop_row " "--row-threshold 0.5 --apply\n\n" " # Per-column strategies\n" " python -m src.cli_missing customers.csv " "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n" ), add_completion=False, no_args_is_help=True, ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _setup_logging(log_dir: Path) -> Path: log_dir.mkdir(parents=True, exist_ok=True) ts = datetime.now().strftime("%Y%m%d_%H%M%S") log_path = log_dir / f"missing_{ts}.log" logger.remove() logger.add(sys.stderr, level="WARNING", format="{message}") logger.add( str(log_path), level="DEBUG", format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}", ) return log_path def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]: if raw is None: return None return [c.strip() for c in raw.split(",") if c.strip()] def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]: """Parse ``--col-strategy 'age:median,city:mode'`` into a dict.""" if not raw: return {} out: dict[str, str] = {} for piece in raw.split(","): piece = piece.strip() if not piece: continue if ":" not in piece: raise typer.BadParameter( f"Invalid --col-strategy piece: '{piece}'. " f"Expected 'col:strategy[,col:strategy...]'." ) col, strat = piece.split(":", 1) out[col.strip()] = strat.strip() return out # --------------------------------------------------------------------------- # Main command # --------------------------------------------------------------------------- @app.command() def handle( input_file: str = typer.Argument( ..., help="Path to the CSV or Excel file.", ), output: Optional[str] = typer.Option( None, "--output", "-o", help="Output file path. Default: {input}_missing.csv", ), apply: bool = typer.Option( False, "--apply", help="Write the output. Without this flag, only the profile is shown.", ), preset: str = typer.Option( "detect-only", "--preset", help="Preset: detect-only, safe-fill, or drop-incomplete.", ), strategy: Optional[str] = typer.Option( None, "--strategy", help=( "Override the preset strategy: none, drop_row, drop_col, drop_both, " "mean, median, mode, constant, ffill, bfill, interpolate." ), ), col_strategy: Optional[str] = typer.Option( None, "--col-strategy", help="Per-column strategies: 'col:strategy[,col:strategy...]'.", ), fill_value: Optional[str] = typer.Option( None, "--fill-value", help="Constant fill value (used with --strategy constant).", ), columns: Optional[str] = typer.Option( None, "--columns", help="Comma-separated columns to handle (default: all columns).", ), skip: Optional[str] = typer.Option( None, "--skip", help="Comma-separated columns to skip.", ), sentinels: Optional[str] = typer.Option( None, "--sentinels", help=( "Comma-separated extra sentinels to treat as missing " "(merged with the built-in defaults)." ), ), no_sentinels: bool = typer.Option( False, "--no-sentinels", help="Disable disguised-null standardization entirely.", ), row_threshold: float = typer.Option( 1.0, "--row-threshold", help=( "For drop_row: drop rows whose missing fraction across selected " "columns is STRICTLY GREATER than this value (0.0..1.0). " "Default 1.0 = never drop. Use 0.0 to drop any row with any " "missing; 0.5 to drop rows >50%% missing." ), ), col_threshold: float = typer.Option( 1.0, "--col-threshold", help=( "For drop_col: drop columns whose missing fraction is strictly " "greater than this value. Default 1.0 = never drop." ), ), config: Optional[str] = typer.Option( None, "--config", help="Load options from a saved JSON config file.", ), save_config: Optional[str] = typer.Option( None, "--save-config", help="Save current options to a JSON config file.", ), sheet: Optional[str] = typer.Option( None, "--sheet", help="Excel sheet name or index (default: first sheet).", ), encoding_override: Optional[str] = typer.Option( None, "--encoding", help="Override auto-detected file encoding.", ), header_row: Optional[int] = typer.Option( None, "--header-row", help="0-based row index for the header (default: auto-detect).", ), full_changelog: bool = typer.Option( False, "--full-changelog", help="Write every change to the audit CSV (default caps to first 1000).", ), ): """Detect and handle missing values.""" from src.core.io import read_file, write_file from src.core.missing import MissingOptions, PRESETS, handle_missing import pandas as pd # Validate inputs input_path = Path(input_file) if not input_path.exists(): typer.echo(f"Error: File not found: {input_path}", err=True) raise typer.Exit(1) if preset not in PRESETS: typer.echo( f"Error: Unknown preset '{preset}'. " f"Choose from: {', '.join(sorted(PRESETS))}.", err=True, ) raise typer.Exit(1) log_path = _setup_logging(Path("logs")) # Build options if config: cfg_path = Path(config) if not cfg_path.exists(): typer.echo(f"Error: Config file not found: {cfg_path}", err=True) raise typer.Exit(1) options = MissingOptions.from_file(cfg_path) logger.info("Loaded config from {}", cfg_path) else: options = MissingOptions.from_preset(preset) if strategy: options.strategy = strategy # type: ignore[assignment] if col_strategy: options.column_strategies = _parse_col_strategy(col_strategy) # type: ignore[assignment] if fill_value is not None: options.fill_value = fill_value cols_list = _split_csv_arg(columns) if cols_list is not None: options.columns = cols_list skip_list = _split_csv_arg(skip) if skip_list: options.skip_columns = skip_list extra = _split_csv_arg(sentinels) if extra: options.sentinels = list(dict.fromkeys([*options.sentinels, *extra])) if no_sentinels: options.standardize_sentinels = False options.row_drop_threshold = row_threshold options.col_drop_threshold = col_threshold if save_config: saved = options.to_file(save_config) typer.echo(f"Config saved to {saved}") # Read input typer.echo(f"Reading {input_path.name}...") try: sheet_arg: str | int | None = None if sheet is not None: try: sheet_arg = int(sheet) except ValueError: sheet_arg = sheet df = read_file( input_path, encoding=encoding_override, header_row=header_row, sheet_name=sheet_arg if sheet_arg is not None else 0, repair=False, ) if not isinstance(df, pd.DataFrame): df = pd.concat(list(df), ignore_index=True) except Exception as e: typer.echo(f"Error reading file: {e}", err=True) raise typer.Exit(1) typer.echo(f" {len(df)} rows, {len(df.columns)} columns") # Run typer.echo("Profiling missingness...") try: result = handle_missing(df, options) except (ValueError, OSError) as e: typer.echo(f"Error: {e}", err=True) raise typer.Exit(1) _print_results(result, input_path, options) # Write if apply: stem = input_path.stem out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv" write_file(result.handled_df, out_path) typer.echo(f"\nHandled file: {out_path}") if not result.changes.empty: changes_path = input_path.parent / f"{stem}_missing_changes.csv" audit_df = result.changes cap = 1000 if not full_changelog and len(audit_df) > cap: typer.echo( f"Note: changelog capped at {cap} rows. " f"Use --full-changelog to write all {len(audit_df)} changes." ) audit_df = audit_df.head(cap) write_file(audit_df, changes_path) typer.echo(f"Changes audit: {changes_path}") else: typer.echo( "\nThis was a profile only. Add --apply to write the handled output." ) typer.echo(f"Log: {log_path}") # --------------------------------------------------------------------------- # Output formatting # --------------------------------------------------------------------------- def _print_results(result, input_path: Path, options) -> None: typer.echo(f"\n{'─'*60}") typer.echo(f" File: {input_path.name}") typer.echo(f" Rows: {result.profile_before.rows_total}") typer.echo(f" Columns processed: {len(result.columns_processed)}") typer.echo( f" Cells missing: " f"{result.profile_before.cells_missing} / {result.profile_before.cells_total}" f" ({result.profile_before.cells_missing_pct:.1f}%)" ) typer.echo( f" Rows w/ any missing: " f"{result.profile_before.rows_with_any_missing} " f"(complete: {result.profile_before.rows_complete})" ) typer.echo(f"{'─'*60}") typer.echo("\nPer-column profile:") profile_df = result.profile_before.to_dataframe() for _, row in profile_df.iterrows(): marker = " " if row["missing"] == 0 else " " typer.echo( f"{marker}{row['column']:<24} {row['dtype']:<10} " f"missing={row['missing']:<6} ({row['missing_pct']:>5.1f}%)" + ( f" top sentinel: {row['top_sentinel']!r} ×{row['top_sentinel_count']}" if row["top_sentinel_count"] else "" ) ) typer.echo("\nActions:") typer.echo(f" Sentinels standardized to NaN: {result.sentinels_standardized}") typer.echo(f" Cells filled: {result.cells_filled}") typer.echo(f" Rows dropped: {result.rows_dropped}") typer.echo( f" Columns dropped: {len(result.columns_dropped)}" + (f" ({', '.join(result.columns_dropped)})" if result.columns_dropped else "") ) if result.strategy_per_column: typer.echo("\nStrategy per column:") for col, strat in result.strategy_per_column.items(): typer.echo(f" {col}: {strat}") if not result.changes.empty: typer.echo("\nFirst examples:") for _, row in result.changes.head(5).iterrows(): old = repr(row["old"])[:40] new = repr(row["new"])[:40] row_label = "—" if row["row"] == -1 else f"Row {row['row'] + 1}" typer.echo( f" {row_label}, {row['column']}: {old} → {new} " f"[{row['action']}]" ) # --------------------------------------------------------------------------- # __main__ # --------------------------------------------------------------------------- def main(): from src.cli_license_guard import guard from src.license import FeatureFlag guard(feature=FeatureFlag.MISSING_HANDLER.value) app() if __name__ == "__main__": main()