Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
381 lines · 13 KiB · Python
"""CLI for the DataTools Missing Value Handler (script 04).

Usage:
    python -m src.cli_missing input.csv                      # profile only
    python -m src.cli_missing input.csv --apply              # detect-only + write
    python -m src.cli_missing input.csv --preset safe-fill --apply
    python -m src.cli_missing input.csv --strategy median --apply
    python -m src.cli_missing input.csv --strategy drop_row --apply
    python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
    python -m src.cli_missing input.csv --strategy median --columns age,score --apply
    python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
    python -m src.cli_missing --help
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import sys
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
import typer
|
||
from loguru import logger
|
||
|
||
# Typer application object for the Missing Value Handler CLI.
# ``add_completion=False`` hides the shell-completion options and
# ``no_args_is_help=True`` prints the help screen when run with no arguments.
app = typer.Typer(
    name="missing",
    help=(
        "Detect and handle missing values in CSV / Excel files.\n\n"
        "Default behaviour: profile only (no file written). Add --apply to "
        "write the handled output and audit log.\n\n"
        "Strategies:\n"
        " none, drop_row, drop_col, drop_both,\n"
        " mean, median, mode, constant,\n"
        " ffill, bfill, interpolate\n\n"
        "Examples:\n\n"
        " # Profile missingness without writing anything\n"
        " python -m src.cli_missing customers.csv\n\n"
        " # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
        " python -m src.cli_missing customers.csv --apply\n\n"
        " # Safe fill: numeric → median, categorical → mode\n"
        " python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
        # Click/Typer emit help text verbatim (no %-interpolation, unlike
        # argparse), so a percent sign is written once, not as "%%".
        " # Drop rows missing >50% of selected columns\n"
        " python -m src.cli_missing customers.csv --strategy drop_row "
        "--row-threshold 0.5 --apply\n\n"
        " # Per-column strategies\n"
        " python -m src.cli_missing customers.csv "
        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
|
||
|
||
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru for one CLI run and return the log file path.

    Creates *log_dir* if needed, drops the previously registered loguru
    handlers, then registers two sinks: stderr at WARNING (message only) and
    a timestamped ``missing_<YYYYmmdd_HHMMSS>.log`` file at DEBUG.

    Args:
        log_dir: Directory that will hold the log file.

    Returns:
        Path of the log file registered for this run.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir.joinpath(f"missing_{stamp}.log")

    logger.remove()
    # (sink, minimum level, line format) for each handler, in registration order.
    sinks = (
        (sys.stderr, "WARNING", "{message}"),
        (
            str(destination),
            "DEBUG",
            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        ),
    )
    for sink, level, fmt in sinks:
        logger.add(sink, level=level, format=fmt)
    return destination
|
||
|
||
|
||
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
|
||
if raw is None:
|
||
return None
|
||
return [c.strip() for c in raw.split(",") if c.strip()]
|
||
|
||
|
||
def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
|
||
"""Parse ``--col-strategy 'age:median,city:mode'`` into a dict."""
|
||
if not raw:
|
||
return {}
|
||
out: dict[str, str] = {}
|
||
for piece in raw.split(","):
|
||
piece = piece.strip()
|
||
if not piece:
|
||
continue
|
||
if ":" not in piece:
|
||
raise typer.BadParameter(
|
||
f"Invalid --col-strategy piece: '{piece}'. "
|
||
f"Expected 'col:strategy[,col:strategy...]'."
|
||
)
|
||
col, strat = piece.split(":", 1)
|
||
out[col.strip()] = strat.strip()
|
||
return out
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
|
||
|
||
# CLI entry point. Flow: validate the input path and preset, configure
# logging under ./logs, build MissingOptions from --config or --preset, layer
# the explicit flags on top, optionally save the resulting config, read the
# input, run handle_missing(), print the report and, with --apply, write the
# handled file plus a change audit next to the input. Every failure path
# prints an "Error..." line to stderr and exits with status 1.
@app.command()
def handle(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_missing.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the profile is shown.",
    ),
    preset: str = typer.Option(
        "detect-only", "--preset",
        help="Preset: detect-only, safe-fill, or drop-incomplete.",
    ),
    strategy: Optional[str] = typer.Option(
        None, "--strategy",
        help=(
            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
            "mean, median, mode, constant, ffill, bfill, interpolate."
        ),
    ),
    col_strategy: Optional[str] = typer.Option(
        None, "--col-strategy",
        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
    ),
    fill_value: Optional[str] = typer.Option(
        None, "--fill-value",
        help="Constant fill value (used with --strategy constant).",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to handle (default: all columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip.",
    ),
    sentinels: Optional[str] = typer.Option(
        None, "--sentinels",
        help=(
            "Comma-separated extra sentinels to treat as missing "
            "(merged with the built-in defaults)."
        ),
    ),
    no_sentinels: bool = typer.Option(
        False, "--no-sentinels",
        help="Disable disguised-null standardization entirely.",
    ),
    # The thresholds default to None (flag not given) rather than 1.0 so that a
    # value loaded from --config is not silently overwritten further down.
    row_threshold: Optional[float] = typer.Option(
        None, "--row-threshold",
        help=(
            "For drop_row: drop rows whose missing fraction across selected "
            "columns is STRICTLY GREATER than this value (0.0..1.0). "
            "Default 1.0 = never drop (with --config, the value stored in the "
            "config file is kept instead). Use 0.0 to drop any row with any "
            "missing; 0.5 to drop rows >50% missing."
        ),
    ),
    col_threshold: Optional[float] = typer.Option(
        None, "--col-threshold",
        help=(
            "For drop_col: drop columns whose missing fraction is strictly "
            "greater than this value. Default 1.0 = never drop (with --config, "
            "the value stored in the config file is kept instead)."
        ),
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every change to the audit CSV (default caps to first 1000).",
    ),
):
    # NOTE: Typer uses this docstring as the command's help text, so it is
    # user-facing output; keep it short and put maintainer notes in comments.
    """Detect and handle missing values."""
    # Deferred imports keep `--help` fast and avoid loading pandas until needed.
    from src.core.io import read_file, write_file
    from src.core.missing import MissingOptions, PRESETS, handle_missing
    import pandas as pd

    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)

    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options: a config file, when given, replaces the preset entirely.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MissingOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = MissingOptions.from_preset(preset)

    # Explicit flags override whatever the preset / config provided.
    if strategy:
        options.strategy = strategy  # type: ignore[assignment]
    if col_strategy:
        options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
    if fill_value is not None:
        options.fill_value = fill_value
    cols_list = _split_csv_arg(columns)
    if cols_list is not None:
        options.columns = cols_list
    skip_list = _split_csv_arg(skip)
    if skip_list:
        options.skip_columns = skip_list
    extra = _split_csv_arg(sentinels)
    if extra:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
    if no_sentinels:
        options.standardize_sentinels = False

    # Thresholds: an explicit flag always wins. Without a flag, a loaded
    # --config keeps its stored value (it used to be reset to 1.0 on every
    # run); otherwise the historical CLI default of 1.0 is applied as before.
    # NOTE(review): on the preset path this still replaces any threshold the
    # preset itself defines — confirm that is intended.
    never_drop = 1.0
    if row_threshold is not None:
        options.row_drop_threshold = row_threshold
    elif not config:
        options.row_drop_threshold = never_drop
    if col_threshold is not None:
        options.col_drop_threshold = col_threshold
    elif not config:
        options.col_drop_threshold = never_drop

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A purely numeric --sheet value is treated as a sheet index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # Anything other than a DataFrame is treated as an iterable of frames
        # and concatenated into one.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)

    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    # Run
    typer.echo("Profiling missingness...")
    try:
        result = handle_missing(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    _print_results(result, input_path, options)

    # Write
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
        # Report filesystem failures the same way as read/run failures
        # instead of letting a traceback escape.
        try:
            write_file(result.handled_df, out_path)
        except OSError as e:
            typer.echo(f"Error writing file: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"\nHandled file: {out_path}")

        if not result.changes.empty:
            # The audit always lands next to the input, even when --output
            # points somewhere else.
            changes_path = input_path.parent / f"{stem}_missing_changes.csv"
            audit_df = result.changes
            cap = 1000
            if not full_changelog and len(audit_df) > cap:
                typer.echo(
                    f"Note: changelog capped at {cap} rows. "
                    f"Use --full-changelog to write all {len(audit_df)} changes."
                )
                audit_df = audit_df.head(cap)
            try:
                write_file(audit_df, changes_path)
            except OSError as e:
                typer.echo(f"Error writing file: {e}", err=True)
                raise typer.Exit(1)
            typer.echo(f"Changes audit: {changes_path}")
    else:
        typer.echo(
            "\nThis was a profile only. Add --apply to write the handled output."
        )

    typer.echo(f"Log: {log_path}")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
|
||
|
||
def _print_results(result, input_path: Path, options) -> None:
    """Echo a human-readable report for one run.

    Sections, in order: a summary box built from ``result.profile_before``,
    one line per column from its ``to_dataframe()``, the action counters,
    the per-column strategies (when present) and up to five example changes.

    Args:
        result: Object returned by the missing-value engine; only the
            attributes read below are required.
        input_path: Input file; only its name is displayed.
        options: Accepted for signature compatibility; not read here.
    """
    echo = typer.echo
    rule = "─" * 60

    # --- summary box -------------------------------------------------------
    echo("\n" + rule)
    echo(f" File: {input_path.name}")
    before = result.profile_before
    echo(f" Rows: {before.rows_total}")
    echo(f" Columns processed: {len(result.columns_processed)}")
    cells = f"{before.cells_missing} / {before.cells_total}"
    echo(f" Cells missing: {cells} ({before.cells_missing_pct:.1f}%)")
    echo(
        f" Rows w/ any missing: {before.rows_with_any_missing} "
        f"(complete: {before.rows_complete})"
    )
    echo(rule)

    # --- per-column profile ------------------------------------------------
    echo("\nPer-column profile:")
    for _, rec in before.to_dataframe().iterrows():
        # NOTE(review): both branches currently render the same string;
        # a distinguishing glyph may have been lost — confirm.
        marker = " " if rec["missing"] == 0 else " "
        line = (
            f"{marker}{rec['column']:<24} {rec['dtype']:<10} "
            f"missing={rec['missing']:<6} ({rec['missing_pct']:>5.1f}%)"
        )
        if rec["top_sentinel_count"]:
            line += f" top sentinel: {rec['top_sentinel']!r} ×{rec['top_sentinel_count']}"
        echo(line)

    # --- actions taken -----------------------------------------------------
    echo("\nActions:")
    echo(f" Sentinels standardized to NaN: {result.sentinels_standardized}")
    echo(f" Cells filled: {result.cells_filled}")
    echo(f" Rows dropped: {result.rows_dropped}")
    dropped = result.columns_dropped
    names = f" ({', '.join(dropped)})" if dropped else ""
    echo(f" Columns dropped: {len(dropped)}" + names)

    if result.strategy_per_column:
        echo("\nStrategy per column:")
        for col, strat in result.strategy_per_column.items():
            echo(f" {col}: {strat}")

    # --- sample of individual changes --------------------------------------
    if result.changes.empty:
        return
    echo("\nFirst examples:")
    for _, change in result.changes.head(5).iterrows():
        old = repr(change["old"])[:40]
        new = repr(change["new"])[:40]
        # A row value of -1 is shown as a dash; other rows are shown 1-based.
        if change["row"] == -1:
            row_label = "—"
        else:
            row_label = f"Row {change['row'] + 1}"
        echo(
            f" {row_label}, {change['column']}: {old} → {new} "
            f"[{change['action']}]"
        )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
|
||
|
||
def main() -> None:
    """Invoke the Typer application."""
    app()
|
||
|
||
|
||
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|