Files
datatools-dev/src/cli_missing.py
Michael db5ec084da docs+code: rename tool labels everywhere
Sweep follow-up to 93e43fc. Display labels now consistent across docs,
landing pages, CLI output, code comments, docstrings, and test prose.
Five parallel surfaces touched:

- docs (EN + ES): README, USER-GUIDE, CLI-REFERENCE, and 11 internal
  design/planning docs
- landing pages: index + bookkeeper/revops/shopify-pet
- src: CLI module docstrings, _TOOL_DISPLAY dicts in cli_analyze.py
  and gui/components/_legacy.py, core module headers, every tool
  page's module docstring
- tests: class/method/module docstrings and section-header comments
- test-cases READMEs

Page slugs (1_Deduplicator etc.), tool_id strings (01_deduplicator
etc.), Python class names (TestDeduplicatorWorkflow, FeatureFlag.*),
URL paths, anchor IDs, CSS classes, and asset filenames were left
intact since they're code identifiers / structural references.

All 2033 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 19:50:09 +00:00

384 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""CLI for the DataTools Fix Missing Values tool (script 04).
Usage:
python -m src.cli_missing input.csv # profile only
python -m src.cli_missing input.csv --apply # detect-only + write
python -m src.cli_missing input.csv --preset safe-fill --apply
python -m src.cli_missing input.csv --strategy median --apply
python -m src.cli_missing input.csv --strategy drop_row --apply
python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
python -m src.cli_missing input.csv --strategy median --columns age,score --apply
python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
python -m src.cli_missing --help
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the "missing" command-line tool.
#
# The ``help`` string is user-facing text. Click/Typer print help strings
# verbatim (there is no printf-style interpolation as in argparse), so a
# percent sign is written as a single ``%`` rather than the ``%%`` escape.
app = typer.Typer(
    name="missing",
    help=(
        "Detect and handle missing values in CSV / Excel files.\n\n"
        "Default behaviour: profile only (no file written). Add --apply to "
        "write the handled output and audit log.\n\n"
        "Strategies:\n"
        " none, drop_row, drop_col, drop_both,\n"
        " mean, median, mode, constant,\n"
        " ffill, bfill, interpolate\n\n"
        "Examples:\n\n"
        " # Profile missingness without writing anything\n"
        " python -m src.cli_missing customers.csv\n\n"
        " # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
        " python -m src.cli_missing customers.csv --apply\n\n"
        " # Safe fill: numeric → median, categorical → mode\n"
        " python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
        " # Drop rows missing >50% of selected columns\n"
        " python -m src.cli_missing customers.csv --strategy drop_row "
        "--row-threshold 0.5 --apply\n\n"
        " # Per-column strategies\n"
        " python -m src.cli_missing customers.csv "
        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
    ),
    # Shell-completion install options are not exposed on this CLI.
    add_completion=False,
    # Running the program with no arguments prints the help text.
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru for one CLI run and return the log file path.

    Creates ``log_dir`` if needed, then registers two sinks: stderr at
    WARNING level (message text only) and a timestamped
    ``missing_<YYYYmmdd_HHMMSS>.log`` file at DEBUG level.

    Args:
        log_dir: Directory that will hold the log file.

    Returns:
        Path of the log file registered for this run.
    """
    log_dir.mkdir(parents=True, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir.joinpath(f"missing_{stamp}.log")
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"

    # Drop whatever handlers are currently registered before adding ours.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(str(destination), level="DEBUG", format=file_format)
    return destination
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
if raw is None:
return None
return [c.strip() for c in raw.split(",") if c.strip()]
def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
"""Parse ``--col-strategy 'age:median,city:mode'`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(","):
piece = piece.strip()
if not piece:
continue
if ":" not in piece:
raise typer.BadParameter(
f"Invalid --col-strategy piece: '{piece}'. "
f"Expected 'col:strategy[,col:strategy...]'."
)
col, strat = piece.split(":", 1)
out[col.strip()] = strat.strip()
return out
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
# CLI entry point: profile missing values in a file and, with --apply, write
# the handled output plus a change audit.
#
# Flow: validate the input path and preset -> configure logging -> build a
# MissingOptions (from --config, or from --preset plus flag overrides) ->
# read the file -> run handle_missing -> print the report -> optionally write
# "<stem>_missing.csv" and "<stem>_missing_changes.csv".
#
# Every failure path prints "Error..." to stderr and exits with status 1.
#
# The function docstring is deliberately kept as a one-liner: Typer/Click use
# a command's docstring as its --help text, so it is user-facing output.
@app.command()
def handle(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_missing.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the profile is shown.",
    ),
    preset: str = typer.Option(
        "detect-only", "--preset",
        help="Preset: detect-only, safe-fill, or drop-incomplete.",
    ),
    strategy: Optional[str] = typer.Option(
        None, "--strategy",
        help=(
            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
            "mean, median, mode, constant, ffill, bfill, interpolate."
        ),
    ),
    col_strategy: Optional[str] = typer.Option(
        None, "--col-strategy",
        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
    ),
    fill_value: Optional[str] = typer.Option(
        None, "--fill-value",
        help="Constant fill value (used with --strategy constant).",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to handle (default: all columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip.",
    ),
    sentinels: Optional[str] = typer.Option(
        None, "--sentinels",
        help=(
            "Comma-separated extra sentinels to treat as missing "
            "(merged with the built-in defaults)."
        ),
    ),
    no_sentinels: bool = typer.Option(
        False, "--no-sentinels",
        help="Disable disguised-null standardization entirely.",
    ),
    row_threshold: float = typer.Option(
        1.0, "--row-threshold",
        help=(
            "For drop_row: drop rows whose missing fraction across selected "
            "columns is STRICTLY GREATER than this value (0.0..1.0). "
            "Default 1.0 = never drop. Use 0.0 to drop any row with any "
            # Single '%': Click/Typer do not %-interpolate help strings.
            "missing; 0.5 to drop rows >50% missing."
        ),
    ),
    col_threshold: float = typer.Option(
        1.0, "--col-threshold",
        help=(
            "For drop_col: drop columns whose missing fraction is strictly "
            "greater than this value. Default 1.0 = never drop."
        ),
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every change to the audit CSV (default caps to first 1000).",
    ),
):
    """Detect and handle missing values."""
    # Project and pandas imports are done inside the command rather than at
    # module import time (presumably to keep CLI start-up light — confirm).
    from src.core.io import read_file, write_file
    from src.core.missing import MissingOptions, PRESETS, handle_missing
    import pandas as pd

    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    # Logs always go to ./logs relative to the current working directory.
    log_path = _setup_logging(Path("logs"))

    # Build options
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MissingOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        # NOTE(review): the flag overrides below are applied on the preset
        # path only (a loaded config is used as-is). Confirm this nesting
        # matches the intended precedence between --config and the flags.
        options = MissingOptions.from_preset(preset)
        if strategy:
            options.strategy = strategy  # type: ignore[assignment]
        if col_strategy:
            options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
        if fill_value is not None:
            options.fill_value = fill_value
        cols_list = _split_csv_arg(columns)
        # "is not None" (not truthiness): an explicitly empty --columns
        # value still replaces the preset's column list.
        if cols_list is not None:
            options.columns = cols_list
        skip_list = _split_csv_arg(skip)
        if skip_list:
            options.skip_columns = skip_list
        extra = _split_csv_arg(sentinels)
        if extra:
            # Append user sentinels to the existing ones, de-duplicated
            # while preserving order.
            options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
        if no_sentinels:
            options.standardize_sentinels = False
        # NOTE(review): thresholds are assigned unconditionally, so any
        # threshold carried by the preset is replaced by the flag value
        # (1.0 when the flag is omitted). Confirm this is intended.
        options.row_drop_threshold = row_threshold
        options.col_drop_threshold = col_threshold

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # --sheet accepts either an index ("0") or a sheet name.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # read_file may return something other than a single DataFrame
        # (presumably an iterable of chunks — confirm); flatten it.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        # CLI boundary: any read failure becomes a one-line error + exit 1.
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    # Run
    typer.echo("Profiling missingness...")
    try:
        result = handle_missing(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    _print_results(result, input_path, options)

    # Write
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
        # Report write failures the same way read/processing failures are
        # reported, instead of surfacing a raw traceback.
        try:
            write_file(result.handled_df, out_path)
        except OSError as e:
            typer.echo(f"Error writing file: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"\nHandled file: {out_path}")
        if not result.changes.empty:
            # The audit file is always written next to the INPUT file,
            # regardless of --output.
            changes_path = input_path.parent / f"{stem}_missing_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            try:
                write_file(audit_df, changes_path)
            except OSError as e:
                typer.echo(f"Error writing file: {e}", err=True)
                raise typer.Exit(1)
            typer.echo(f"Changes audit: {changes_path}")
    else:
        typer.echo(
            "\nThis was a profile only. Add --apply to write the handled output."
        )
    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path, options) -> None:
    """Print the missingness report for one run via ``typer.echo``.

    Sections, in order: a summary header, the per-column profile, the
    action counters, the strategy chosen per column (if any), and up to five
    example changes (if any).

    Args:
        result: Object returned by ``handle_missing``. This function reads
            ``profile_before`` (``rows_total``, ``cells_missing``,
            ``cells_total``, ``cells_missing_pct``, ``rows_with_any_missing``,
            ``rows_complete``, ``to_dataframe()``), ``columns_processed``,
            ``sentinels_standardized``, ``cells_filled``, ``rows_dropped``,
            ``columns_dropped``, ``strategy_per_column`` and ``changes``.
        input_path: Input file; only its ``name`` is displayed.
        options: Accepted for call-site compatibility; not used here.
    """
    # Horizontal rule framing the summary header.
    rule = "-" * 60
    before = result.profile_before

    typer.echo(f"\n{rule}")
    typer.echo(f" File: {input_path.name}")
    typer.echo(f" Rows: {before.rows_total}")
    typer.echo(f" Columns processed: {len(result.columns_processed)}")
    typer.echo(
        f" Cells missing: "
        f"{before.cells_missing} / {before.cells_total}"
        f" ({before.cells_missing_pct:.1f}%)"
    )
    typer.echo(
        f" Rows w/ any missing: "
        f"{before.rows_with_any_missing} "
        f"(complete: {before.rows_complete})"
    )
    typer.echo(rule)

    typer.echo("\nPer-column profile:")
    profile_df = before.to_dataframe()
    for _, col_row in profile_df.iterrows():
        # Flag columns that have at least one missing value so they stand
        # out from fully populated columns.
        marker = "  " if col_row["missing"] == 0 else "! "
        line = (
            f"{marker}{col_row['column']:<24} {col_row['dtype']:<10} "
            f"missing={col_row['missing']:<6} ({col_row['missing_pct']:>5.1f}%)"
        )
        if col_row["top_sentinel_count"]:
            line += (
                f" top sentinel: {col_row['top_sentinel']!r} "
                f"×{col_row['top_sentinel_count']}"
            )
        typer.echo(line)

    typer.echo("\nActions:")
    typer.echo(f" Sentinels standardized to NaN: {result.sentinels_standardized}")
    typer.echo(f" Cells filled: {result.cells_filled}")
    typer.echo(f" Rows dropped: {result.rows_dropped}")
    dropped_line = f" Columns dropped: {len(result.columns_dropped)}"
    if result.columns_dropped:
        dropped_line += f" ({', '.join(result.columns_dropped)})"
    typer.echo(dropped_line)

    if result.strategy_per_column:
        typer.echo("\nStrategy per column:")
        for col, strat in result.strategy_per_column.items():
            typer.echo(f" {col}: {strat}")

    if not result.changes.empty:
        typer.echo("\nFirst examples:")
        for _, change in result.changes.head(5).iterrows():
            # Long values are truncated to keep each example on one line.
            old = repr(change["old"])[:40]
            new = repr(change["new"])[:40]
            # A row of -1 carries no row number (presumably a column-level
            # action — confirm); show only the column in that case.
            # Otherwise show a 1-based row number.
            if change["row"] == -1:
                where = f"{change['column']}"
            else:
                where = f"Row {change['row'] + 1}, {change['column']}"
            typer.echo(f" {where}: {old} → {new} [{change['action']}]")
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
def main():
    """Entry point used when the module is executed as a script.

    Calls ``guard`` with the ``MISSING_HANDLER`` feature flag value
    (presumably a license check — confirm against ``cli_license_guard``)
    and then hands control to the Typer ``app``.
    """
    from src.cli_license_guard import guard
    from src.license import FeatureFlag

    feature_name = FeatureFlag.MISSING_HANDLER.value
    guard(feature=feature_name)
    app()


if __name__ == "__main__":
    main()