"""CLI for the DataTools Column Mapper (script 05). Usage: python -m src.cli_column_map input.csv # auto-mapping preview python -m src.cli_column_map input.csv --schema target.json --apply python -m src.cli_column_map input.csv --rename "First Name=first_name,Email=email" --apply python -m src.cli_column_map input.csv --schema target.json --preset strict-schema --apply python -m src.cli_column_map input.csv --schema target.json --coerce --apply python -m src.cli_column_map --help """ from __future__ import annotations import json import sys from datetime import datetime from pathlib import Path from typing import Optional import typer from loguru import logger app = typer.Typer( name="column-map", help=( "Rename columns, enforce a target schema, and coerce types in CSV / Excel files.\n\n" "Default behaviour: preview the mapping (no file written). Add --apply " "to write the mapped output and audit log.\n\n" "Examples:\n\n" " # Show what auto-mapping would do (no schema → identity)\n" " python -m src.cli_column_map vendor.csv\n\n" " # Map against a target JSON schema with strict drop / coerce / reorder\n" " python -m src.cli_column_map vendor.csv --schema target.json " "--preset strict-schema --apply\n\n" " # Hand-rolled rename without a schema\n" " python -m src.cli_column_map data.csv " "--rename 'First Name=first_name,Last Name=last_name' --apply\n\n" " # Coerce specific columns inline\n" " python -m src.cli_column_map data.csv " "--coerce-col 'age:integer,joined:date' --apply\n" ), add_completion=False, no_args_is_help=True, ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _setup_logging(log_dir: Path) -> Path: log_dir.mkdir(parents=True, exist_ok=True) ts = datetime.now().strftime("%Y%m%d_%H%M%S") log_path = log_dir / f"column_map_{ts}.log" logger.remove() logger.add(sys.stderr, level="WARNING", format="{message}") logger.add( str(log_path), level="DEBUG", format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}", ) return log_path def _parse_pairs(raw: Optional[str], separator: str = ",") -> dict[str, str]: """Parse ``a=1,b=2`` into a dict.""" if not raw: return {} out: dict[str, str] = {} for piece in raw.split(separator): piece = piece.strip() if not piece: continue if "=" not in piece: raise typer.BadParameter( f"Invalid pair: {piece!r}. Expected 'key=value[,key=value...]'." ) k, v = piece.split("=", 1) out[k.strip()] = v.strip() return out def _parse_coerce(raw: Optional[str]) -> dict[str, str]: """Parse ``age:integer,joined:date`` into a dict.""" if not raw: return {} out: dict[str, str] = {} for piece in raw.split(","): piece = piece.strip() if not piece: continue if ":" not in piece: raise typer.BadParameter( f"Invalid --coerce-col piece: {piece!r}. " f"Expected 'col:dtype[,col:dtype...]'." ) col, dtype = piece.split(":", 1) out[col.strip()] = dtype.strip() return out # --------------------------------------------------------------------------- # Main command # --------------------------------------------------------------------------- @app.command() def map_( input_file: str = typer.Argument( ..., help="Path to the CSV or Excel file.", ), output: Optional[str] = typer.Option( None, "--output", "-o", help="Output file path. Default: {input}_mapped.csv", ), apply: bool = typer.Option( False, "--apply", help="Write the output. Without this flag, only the mapping plan is shown.", ), preset: str = typer.Option( "rename-only", "--preset", help="Preset: rename-only, strict-schema, or lenient-schema.", ), schema: Optional[str] = typer.Option( None, "--schema", help="Path to a target schema JSON file (TargetSchema format).", ), rename: Optional[str] = typer.Option( None, "--rename", help="Explicit rename pairs: 'src=tgt[,src=tgt...]' (overrides auto-inference).", ), coerce_col: Optional[str] = typer.Option( None, "--coerce-col", help=( "Inline type coercion (no schema needed): 'col:dtype[,col:dtype...]'. " "Valid dtypes: string, integer, float, boolean, date, datetime, category, auto." ), ), unmapped: Optional[str] = typer.Option( None, "--unmapped", help="Strategy for unmapped source columns: keep | drop | error.", ), threshold: Optional[float] = typer.Option( None, "--threshold", help="Fuzzy-match threshold for auto-inference (0.0..1.0). Default 0.6.", ), no_auto: bool = typer.Option( False, "--no-auto", help="Disable auto-inference; honour only explicit --rename pairs.", ), no_coerce: bool = typer.Option( False, "--no-coerce", help="Disable type coercion (overrides preset).", ), no_reorder: bool = typer.Option( False, "--no-reorder", help="Disable schema-order reorder (overrides preset).", ), no_required: bool = typer.Option( False, "--no-required", help="Don't enforce required-target presence (overrides preset).", ), config: Optional[str] = typer.Option( None, "--config", help="Load options from a saved JSON config file.", ), save_config: Optional[str] = typer.Option( None, "--save-config", help="Save current options to a JSON config file.", ), sheet: Optional[str] = typer.Option( None, "--sheet", help="Excel sheet name or index (default: first sheet).", ), encoding_override: Optional[str] = typer.Option( None, "--encoding", help="Override auto-detected file encoding.", ), header_row: Optional[int] = typer.Option( None, "--header-row", help="0-based row index for the header (default: auto-detect).", ), ): """Map source columns to a target schema; rename, coerce, drop, reorder.""" from src.core.io import read_file, write_file from src.core.column_mapper import ( MapOptions, PRESETS, TargetField, TargetSchema, coerce_series, map_columns, ) import pandas as pd input_path = Path(input_file) if not input_path.exists(): typer.echo(f"Error: File not found: {input_path}", err=True) raise typer.Exit(1) if preset not in PRESETS: typer.echo( f"Error: Unknown preset '{preset}'. " f"Choose from: {', '.join(sorted(PRESETS))}.", err=True, ) raise typer.Exit(1) log_path = _setup_logging(Path("logs")) # Build options if config: cfg_path = Path(config) if not cfg_path.exists(): typer.echo(f"Error: Config file not found: {cfg_path}", err=True) raise typer.Exit(1) options = MapOptions.from_file(cfg_path) else: options = MapOptions.from_preset(preset) if schema: sp = Path(schema) if not sp.exists(): typer.echo(f"Error: Schema file not found: {sp}", err=True) raise typer.Exit(1) options.schema = TargetSchema.from_file(sp) if rename: options.mapping = {**options.mapping, **_parse_pairs(rename)} if unmapped: options.unmapped = unmapped # type: ignore[assignment] if threshold is not None: options.fuzzy_threshold = threshold if no_auto: options.auto_infer = False if no_coerce: options.coerce_types = False if no_reorder: options.reorder_to_schema = False if no_required: options.enforce_required = False # Inline coercion (no schema): build a tiny one-field-per-column schema. inline_coerce = _parse_coerce(coerce_col) if inline_coerce and options.schema is None: options.schema = TargetSchema(fields=[ TargetField(name=col, dtype=dt) # type: ignore[arg-type] for col, dt in inline_coerce.items() ]) options.coerce_types = True if save_config: saved = options.to_file(save_config) typer.echo(f"Config saved to {saved}") # Read input typer.echo(f"Reading {input_path.name}...") try: sheet_arg: str | int | None = None if sheet is not None: try: sheet_arg = int(sheet) except ValueError: sheet_arg = sheet df = read_file( input_path, encoding=encoding_override, header_row=header_row, sheet_name=sheet_arg if sheet_arg is not None else 0, repair=False, ) if not isinstance(df, pd.DataFrame): df = pd.concat(list(df), ignore_index=True) except Exception as e: typer.echo(f"Error reading file: {e}", err=True) raise typer.Exit(1) typer.echo(f" {len(df)} rows, {len(df.columns)} columns") typer.echo("Mapping columns...") try: result = map_columns(df, options) except (ValueError, OSError) as e: typer.echo(f"Error: {e}", err=True) raise typer.Exit(1) _print_results(result, input_path, options) if apply: stem = input_path.stem out_path = Path(output) if output else input_path.parent / f"{stem}_mapped.csv" write_file(result.mapped_df, out_path) typer.echo(f"\nMapped file: {out_path}") # Audit: write the resolved mapping as JSON next to the output. audit_path = input_path.parent / f"{stem}_mapping.json" audit_path.write_text(json.dumps({ "mapping": result.mapping, "inferred_pairs": result.inferred_pairs, "columns_renamed": result.columns_renamed, "columns_dropped": result.columns_dropped, "columns_added": result.columns_added, "coercion_failures": result.coercion_failures, "unmapped_kept": result.unmapped_kept, "missing_required_targets": result.missing_required_targets, }, indent=2, default=str)) typer.echo(f"Mapping audit: {audit_path}") else: typer.echo("\nThis was a preview. Add --apply to write the mapped output.") typer.echo(f"Log: {log_path}") # --------------------------------------------------------------------------- # Output formatting # --------------------------------------------------------------------------- def _print_results(result, input_path: Path, options) -> None: typer.echo(f"\n{'─'*60}") typer.echo(f" File: {input_path.name}") typer.echo(f" Columns renamed: {result.columns_renamed}") typer.echo(f" Columns dropped: {len(result.columns_dropped)}") typer.echo(f" Columns added: {len(result.columns_added)}") typer.echo(f" Unmapped kept: {len(result.unmapped_kept)}") typer.echo(f" Coercion failures: " f"{sum(result.coercion_failures.values())} cells across " f"{len(result.coercion_failures)} column(s)") typer.echo(f"{'─'*60}") if result.mapping: typer.echo("\nMapping:") for src, tgt in result.mapping.items(): tag = " (auto)" if src in result.inferred_pairs else "" arrow = "→" if src != tgt else "≡" typer.echo(f" {src!r} {arrow} {tgt!r}{tag}") if result.columns_dropped: typer.echo(f"\nDropped: {result.columns_dropped}") if result.columns_added: typer.echo(f"\nAdded (defaults): {result.columns_added}") if result.coercion_failures: typer.echo("\nCoercion failures:") for col, n in result.coercion_failures.items(): typer.echo(f" {col}: {n} row(s) could not be coerced") if result.missing_required_targets: typer.echo(f"\nMissing required targets: {result.missing_required_targets}") # --------------------------------------------------------------------------- # __main__ # --------------------------------------------------------------------------- def main(): from src.cli_license_guard import guard from src.license import FeatureFlag guard(feature=FeatureFlag.COLUMN_MAPPER.value) app() if __name__ == "__main__": main()