Files
datatools-dev/src/cli_missing.py
Michael d32b58e61a feat(license): add Lite SKU; remove user-facing free trial
Two coupled changes:

1. Lite tier
   - New Tier.LITE in src/license/schema.py.
   - FEATURES_BY_TIER[Tier.LITE] = {Deduplicator, Text Cleaner,
     Format Standardizer}. The three universally-useful tools that
     cover the most common bookkeeping / RevOps / Klaviyo prep
     workflows. Other six tools require Core.
   - i18n: license.tier_lite, license.feature_locked_title,
     license.feature_locked_body, license.upgrade_link,
     license.status_locked (en + es).
   - Per-tool feature gate at every GUI tool page
     (require_feature_or_render_upgrade) and every tool CLI
     (guard(feature=...)). A locked tool renders an upgrade
     prompt + Manage-license button (GUI) or exits with code 2
     (CLI).
   - Home grid: tool cards the user's tier doesn't unlock get a
     red 🔒 Locked badge in place of green Ready.

2. Trial removed
   - Activation form's "Start 1-year trial" button removed.
   - license_cli's `trial` subcommand removed.
   - activation.trial_button / activation.trial_help i18n keys
     dropped (pack parity test stays green).
   - Tier.TRIAL stays in the enum (back-compat with any field-
     tested trial licenses); LicenseManager._mint stays internal
     for tests and the seller's key generator.
   - Decision logged in DECISIONS §9b: a 1-year all-features
     trial undercuts paid Lite; paid-only keeps tier economics
     clean.

Tests (+29 net): +17 Lite-tier unit/guard tests + 13 Lite-tier
GUI tests + 1 trial-absent assertion - 2 trial CLI tests - 1
trial GUI button test. Total: 1995 → 2024.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 17:19:30 +00:00

384 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""CLI for the DataTools Missing Value Handler (script 04).
Usage:
python -m src.cli_missing input.csv # profile only
python -m src.cli_missing input.csv --apply # detect-only + write
python -m src.cli_missing input.csv --preset safe-fill --apply
python -m src.cli_missing input.csv --strategy median --apply
python -m src.cli_missing input.csv --strategy drop_row --apply
python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
python -m src.cli_missing input.csv --strategy median --columns age,score --apply
python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
python -m src.cli_missing --help
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application for the missing-value handler CLI.
#   * ``no_args_is_help=True`` prints the help text when the program is run
#     with no arguments.
#   * ``add_completion=False`` leaves out the shell-completion install flags.
# Click/Typer help strings are NOT %-formatted (unlike argparse), so a literal
# percent sign is written as a single "%".
app = typer.Typer(
    name="missing",
    help=(
        "Detect and handle missing values in CSV / Excel files.\n\n"
        "Default behaviour: profile only (no file written). Add --apply to "
        "write the handled output and audit log.\n\n"
        "Strategies:\n"
        " none, drop_row, drop_col, drop_both,\n"
        " mean, median, mode, constant,\n"
        " ffill, bfill, interpolate\n\n"
        "Examples:\n\n"
        " # Profile missingness without writing anything\n"
        " python -m src.cli_missing customers.csv\n\n"
        " # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
        " python -m src.cli_missing customers.csv --apply\n\n"
        " # Safe fill: numeric → median, categorical → mode\n"
        " python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
        " # Drop rows missing >50% of selected columns\n"
        " python -m src.cli_missing customers.csv --strategy drop_row "
        "--row-threshold 0.5 --apply\n\n"
        " # Per-column strategies\n"
        " python -m src.cli_missing customers.csv "
        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru sinks for one CLI run and return the log file path.

    Creates *log_dir* (including parents) when it does not exist, clears the
    handlers currently registered on the loguru ``logger``, then registers two
    sinks: ``sys.stderr`` at WARNING level with a message-only format, and a
    timestamped ``missing_<YYYYmmdd_HHMMSS>.log`` file inside *log_dir* at
    DEBUG level.

    Args:
        log_dir: Directory that receives the log file.

    Returns:
        Path of the log file for this run.
    """
    log_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = log_dir / f"missing_{stamp}.log"

    # Start from a clean slate so repeated calls do not stack handlers.
    logger.remove()
    logger.add(sys.stderr, format="{message}", level="WARNING")
    logger.add(
        str(destination),
        format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        level="DEBUG",
    )
    return destination
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
if raw is None:
return None
return [c.strip() for c in raw.split(",") if c.strip()]
def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
"""Parse ``--col-strategy 'age:median,city:mode'`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(","):
piece = piece.strip()
if not piece:
continue
if ":" not in piece:
raise typer.BadParameter(
f"Invalid --col-strategy piece: '{piece}'. "
f"Expected 'col:strategy[,col:strategy...]'."
)
col, strat = piece.split(":", 1)
out[col.strip()] = strat.strip()
return out
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
@app.command()
def handle(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_missing.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the profile is shown.",
    ),
    preset: str = typer.Option(
        "detect-only", "--preset",
        help="Preset: detect-only, safe-fill, or drop-incomplete.",
    ),
    strategy: Optional[str] = typer.Option(
        None, "--strategy",
        help=(
            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
            "mean, median, mode, constant, ffill, bfill, interpolate."
        ),
    ),
    col_strategy: Optional[str] = typer.Option(
        None, "--col-strategy",
        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
    ),
    fill_value: Optional[str] = typer.Option(
        None, "--fill-value",
        help="Constant fill value (used with --strategy constant).",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to handle (default: all columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip.",
    ),
    sentinels: Optional[str] = typer.Option(
        None, "--sentinels",
        help=(
            "Comma-separated extra sentinels to treat as missing "
            "(merged with the built-in defaults)."
        ),
    ),
    no_sentinels: bool = typer.Option(
        False, "--no-sentinels",
        help="Disable disguised-null standardization entirely.",
    ),
    row_threshold: float = typer.Option(
        1.0, "--row-threshold",
        # The help text documents the range 0.0..1.0; enforce it at parse time.
        min=0.0, max=1.0,
        help=(
            "For drop_row: drop rows whose missing fraction across selected "
            "columns is STRICTLY GREATER than this value (0.0..1.0). "
            "Default 1.0 = never drop. Use 0.0 to drop any row with any "
            "missing; 0.5 to drop rows >50% missing."
        ),
    ),
    col_threshold: float = typer.Option(
        1.0, "--col-threshold",
        min=0.0, max=1.0,
        help=(
            "For drop_col: drop columns whose missing fraction is strictly "
            "greater than this value. Default 1.0 = never drop."
        ),
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every change to the audit CSV (default caps to first 1000).",
    ),
) -> None:
    """Detect and handle missing values."""
    # The docstring above is deliberately left as a single line: Typer shows a
    # command's docstring as help text, so it is user-facing output.
    #
    # Flow:
    #   1. validate the input path and preset name
    #   2. configure logging under ./logs
    #   3. build MissingOptions (from --config, or from the preset + flags)
    #   4. read the input into a DataFrame
    #   5. run handle_missing and print the report
    #   6. with --apply, write the handled file and the changes audit CSV
    #
    # Every failure path reports to stderr and exits with code 1.

    # Project imports are done lazily inside the command, as in the original.
    from src.core.io import read_file, write_file
    from src.core.missing import MissingOptions, PRESETS, handle_missing
    import pandas as pd

    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options
    # NOTE(review): the text this block was reconstructed from had lost its
    # indentation. The per-flag overrides are placed in the ``else`` branch,
    # i.e. they apply only when no --config is given and a loaded config is
    # used as saved. Confirm this nesting against the intended behaviour.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MissingOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = MissingOptions.from_preset(preset)
        if strategy:
            options.strategy = strategy  # type: ignore[assignment]
        if col_strategy:
            options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
        # ``is not None`` so that an explicit empty string (--fill-value '')
        # is still honoured as a fill value.
        if fill_value is not None:
            options.fill_value = fill_value
        cols_list = _split_csv_arg(columns)
        if cols_list is not None:
            options.columns = cols_list
        skip_list = _split_csv_arg(skip)
        if skip_list:
            options.skip_columns = skip_list
        extra = _split_csv_arg(sentinels)
        if extra:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
        if no_sentinels:
            options.standardize_sentinels = False
        # NOTE(review): the thresholds are assigned unconditionally, so any
        # threshold carried by the preset is replaced by the CLI value
        # (default 1.0). Confirm that is intended.
        options.row_drop_threshold = row_threshold
        options.col_drop_threshold = col_threshold

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # --sheet accepts either an index ("0") or a sheet name ("Q1").
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # If read_file hands back something other than a single DataFrame,
        # it is treated as an iterable of frames and concatenated.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        # Top-level CLI boundary: any read failure becomes a clean exit.
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1) from e
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    # Run
    typer.echo("Profiling missingness...")
    try:
        result = handle_missing(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1) from e

    _print_results(result, input_path, options)

    # Write
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
        try:
            write_file(result.handled_df, out_path)
        except (ValueError, OSError) as e:
            # Report write failures the same way as read/processing failures
            # instead of surfacing a traceback.
            typer.echo(f"Error writing output: {e}", err=True)
            raise typer.Exit(1) from e
        typer.echo(f"\nHandled file: {out_path}")

        if not result.changes.empty:
            # The audit file is always placed next to the INPUT file, even
            # when --output points elsewhere.
            changes_path = input_path.parent / f"{stem}_missing_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            try:
                write_file(audit_df, changes_path)
            except (ValueError, OSError) as e:
                typer.echo(f"Error writing output: {e}", err=True)
                raise typer.Exit(1) from e
            typer.echo(f"Changes audit: {changes_path}")
    else:
        typer.echo(
            "\nThis was a profile only. Add --apply to write the handled output."
        )

    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path, options) -> None:
    """Print the missingness report for one run via ``typer.echo``.

    Sections, in order: a summary header between two separator rules, the
    per-column profile, the action counters, the strategy chosen per column
    (when any), and up to five example changes (when any).

    Separators, the "has missing values" marker and the old/new arrow are
    plain ASCII so the report stays printable on consoles with a legacy
    encoding.

    Args:
        result: Object returned by ``handle_missing``. The attributes read
            here are ``profile_before``, ``columns_processed``,
            ``sentinels_standardized``, ``cells_filled``, ``rows_dropped``,
            ``columns_dropped``, ``strategy_per_column`` and ``changes``.
        input_path: Path of the processed file; only its name is printed.
        options: Not read by this function; kept so the call signature stays
            unchanged.
    """
    rule = "-" * 60
    before = result.profile_before

    typer.echo(f"\n{rule}")
    typer.echo(f" File: {input_path.name}")
    typer.echo(f" Rows: {before.rows_total}")
    typer.echo(f" Columns processed: {len(result.columns_processed)}")
    typer.echo(
        f" Cells missing: "
        f"{before.cells_missing} / {before.cells_total}"
        f" ({before.cells_missing_pct:.1f}%)"
    )
    typer.echo(
        f" Rows w/ any missing: "
        f"{before.rows_with_any_missing} "
        f"(complete: {before.rows_complete})"
    )
    typer.echo(rule)

    typer.echo("\nPer-column profile:")
    for _, row in before.to_dataframe().iterrows():
        # Two-character gutter: "! " flags columns that have missing cells.
        marker = "  " if row["missing"] == 0 else "! "
        # str() keeps the width specs valid for non-string labels/dtypes.
        line = (
            f"{marker}{str(row['column']):<24} {str(row['dtype']):<10} "
            f"missing={row['missing']:<6} ({row['missing_pct']:>5.1f}%)"
        )
        if row["top_sentinel_count"]:
            line += (
                f" top sentinel: {row['top_sentinel']!r} ×{row['top_sentinel_count']}"
            )
        typer.echo(line)

    typer.echo("\nActions:")
    typer.echo(f" Sentinels standardized to NaN: {result.sentinels_standardized}")
    typer.echo(f" Cells filled: {result.cells_filled}")
    typer.echo(f" Rows dropped: {result.rows_dropped}")
    dropped_cols = result.columns_dropped
    dropped_suffix = f" ({', '.join(dropped_cols)})" if dropped_cols else ""
    typer.echo(f" Columns dropped: {len(dropped_cols)}{dropped_suffix}")

    if result.strategy_per_column:
        typer.echo("\nStrategy per column:")
        for col, strat in result.strategy_per_column.items():
            typer.echo(f" {col}: {strat}")

    if not result.changes.empty:
        typer.echo("\nFirst examples:")
        for _, row in result.changes.head(5).iterrows():
            # Values are shown as reprs, truncated to 40 characters.
            old = repr(row["old"])[:40]
            new = repr(row["new"])[:40]
            # A row index of -1 carries no row label (presumably a change not
            # tied to a single row, TODO confirm in src.core.missing); other
            # indices are shown 1-based.
            if row["row"] == -1:
                where = f"{row['column']}"
            else:
                where = f"Row {row['row'] + 1}, {row['column']}"
            typer.echo(f" {where}: {old} -> {new} [{row['action']}]")
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
def main():
    """Script entry point: run the license guard, then the Typer app.

    ``guard`` is called with the value of ``FeatureFlag.MISSING_HANDLER``
    before ``app()`` is invoked, so the feature check happens ahead of any
    argument parsing.
    """
    # Imported lazily so that merely importing this module does not pull in
    # the license packages.
    from src.cli_license_guard import guard
    from src.license import FeatureFlag

    required_feature = FeatureFlag.MISSING_HANDLER.value
    guard(feature=required_feature)
    app()


if __name__ == "__main__":
    main()