feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler  src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper          src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner        src/core/pipeline.py + cli_pipeline.py + GUI
     with soft tool-dependency graph (recommended, not enforced) and
     JSON save/load for repeatable weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing, 100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS, deploy.py URL-substitution script, auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0 xfailed

Tier-1 corpora added:
  • missing-corpus 3 use cases + 16 edge cases
  • column-mapper-corpus 3 use cases + 5 edge cases
  • format-cleaner intl 20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/src/cli_missing.py
+++ b/src/cli_missing.py
@@ -0,0 +1,380 @@
+"""CLI for the DataTools Missing Value Handler (script 04).
+
+Usage:
+    python -m src.cli_missing input.csv                              # profile only
+    python -m src.cli_missing input.csv --apply                      # detect-only + write
+    python -m src.cli_missing input.csv --preset safe-fill --apply
+    python -m src.cli_missing input.csv --strategy median --apply
+    python -m src.cli_missing input.csv --strategy drop_row --apply
+    python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
+    python -m src.cli_missing input.csv --strategy median --columns age,score --apply
+    python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
+    python -m src.cli_missing --help
+"""
+
+from __future__ import annotations
+
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import typer
+from loguru import logger
+
+# Typer application object for the Missing Value Handler CLI.
+#
+# NOTE: Click/Typer print help text verbatim (no %-interpolation, unlike
+# argparse), so a percent sign is written as a single "%" — "%%" would be
+# shown to the user literally.
+app = typer.Typer(
+    name="missing",
+    help=(
+        "Detect and handle missing values in CSV / Excel files.\n\n"
+        "Default behaviour: profile only (no file written). Add --apply to "
+        "write the handled output and audit log.\n\n"
+        "Strategies:\n"
+        "  none, drop_row, drop_col, drop_both,\n"
+        "  mean, median, mode, constant,\n"
+        "  ffill, bfill, interpolate\n\n"
+        "Examples:\n\n"
+        "  # Profile missingness without writing anything\n"
+        "  python -m src.cli_missing customers.csv\n\n"
+        "  # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
+        "  python -m src.cli_missing customers.csv --apply\n\n"
+        "  # Safe fill: numeric → median, categorical → mode\n"
+        "  python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
+        "  # Drop rows missing >50% of selected columns\n"
+        "  python -m src.cli_missing customers.csv --strategy drop_row "
+        "--row-threshold 0.5 --apply\n\n"
+        "  # Per-column strategies\n"
+        "  python -m src.cli_missing customers.csv "
+        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
+    ),
+    add_completion=False,
+    no_args_is_help=True,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _setup_logging(log_dir: Path) -> Path:
+    """Point loguru at stderr and at a fresh timestamped log file.
+
+    Creates ``log_dir`` (including parents) when it does not exist, removes
+    every previously registered loguru sink, then installs two sinks:
+    WARNING and above on stderr (message only) and DEBUG and above in
+    ``missing_<YYYYmmdd_HHMMSS>.log`` inside ``log_dir``.
+
+    Returns:
+        Path of the log file that was registered.
+    """
+    log_dir.mkdir(parents=True, exist_ok=True)
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    target = log_dir / f"missing_{stamp}.log"
+
+    # Start from a clean slate so loguru's default sink does not duplicate output.
+    logger.remove()
+    logger.add(sys.stderr, level="WARNING", format="{message}")
+    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"
+    logger.add(str(target), level="DEBUG", format=file_format)
+    return target
+
+
+def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
+    """Split a comma-separated option value into trimmed, non-empty items.
+
+    ``None`` (option not supplied) is passed through unchanged. Any string,
+    including an empty one, produces a list, which may be empty.
+    """
+    if raw is None:
+        return None
+    trimmed = (part.strip() for part in raw.split(","))
+    return [item for item in trimmed if item]
+
+
+def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
+    """Parse ``--col-strategy 'age:median,city:mode'`` into a dict.
+
+    Each comma-separated piece must look like ``column:strategy``. The piece
+    is split on its LAST colon, so a column name may itself contain ``:``
+    (none of the strategy names listed in the CLI help do). Whitespace
+    around the column and the strategy is stripped, blank pieces such as a
+    trailing comma are skipped, and a column given twice keeps the last
+    strategy.
+
+    Args:
+        raw: Raw option value, or ``None`` / ``""`` when not supplied.
+
+    Returns:
+        Mapping of column name to strategy name; empty when ``raw`` is falsy.
+
+    Raises:
+        typer.BadParameter: A piece has no colon, or its column or strategy
+            part is empty (e.g. ``":median"`` or ``"age:"``).
+    """
+    if not raw:
+        return {}
+    out: dict[str, str] = {}
+    for piece in raw.split(","):
+        piece = piece.strip()
+        if not piece:
+            continue
+        # rpartition yields an empty separator when the piece has no colon.
+        col, sep, strat = piece.rpartition(":")
+        col, strat = col.strip(), strat.strip()
+        if not sep or not col or not strat:
+            raise typer.BadParameter(
+                f"Invalid --col-strategy piece: '{piece}'. "
+                f"Expected 'col:strategy[,col:strategy...]'."
+            )
+        out[col] = strat
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Main command
+# ---------------------------------------------------------------------------
+
+# CLI entry point: profile missing values and, with --apply, write the
+# handled file plus a changes audit CSV.
+#
+# NOTE: the function docstring is left as a single line on purpose — Typer
+# uses it as user-visible help text — so maintainer notes live in comments.
+#
+# Option precedence: options start from --config (when given) or from
+# --preset, and each explicit CLI flag then overrides the matching field.
+# The two thresholds default to None so that an omitted flag leaves the
+# preset / config value untouched, like every other override.
+#
+# Exits with status 1 (typer.Exit) on a missing input/config file, an
+# unknown preset, or a read / processing / write failure.
+@app.command()
+def handle(
+    input_file: str = typer.Argument(
+        ...,
+        help="Path to the CSV or Excel file.",
+    ),
+    output: Optional[str] = typer.Option(
+        None, "--output", "-o",
+        help="Output file path. Default: {input}_missing.csv",
+    ),
+    apply: bool = typer.Option(
+        False, "--apply",
+        help="Write the output. Without this flag, only the profile is shown.",
+    ),
+    preset: str = typer.Option(
+        "detect-only", "--preset",
+        help="Preset: detect-only, safe-fill, or drop-incomplete.",
+    ),
+    strategy: Optional[str] = typer.Option(
+        None, "--strategy",
+        help=(
+            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
+            "mean, median, mode, constant, ffill, bfill, interpolate."
+        ),
+    ),
+    col_strategy: Optional[str] = typer.Option(
+        None, "--col-strategy",
+        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
+    ),
+    fill_value: Optional[str] = typer.Option(
+        None, "--fill-value",
+        help="Constant fill value (used with --strategy constant).",
+    ),
+    columns: Optional[str] = typer.Option(
+        None, "--columns",
+        help="Comma-separated columns to handle (default: all columns).",
+    ),
+    skip: Optional[str] = typer.Option(
+        None, "--skip",
+        help="Comma-separated columns to skip.",
+    ),
+    sentinels: Optional[str] = typer.Option(
+        None, "--sentinels",
+        help=(
+            "Comma-separated extra sentinels to treat as missing "
+            "(merged with the built-in defaults)."
+        ),
+    ),
+    no_sentinels: bool = typer.Option(
+        False, "--no-sentinels",
+        help="Disable disguised-null standardization entirely.",
+    ),
+    row_threshold: Optional[float] = typer.Option(
+        None, "--row-threshold",
+        help=(
+            "For drop_row: drop rows whose missing fraction across selected "
+            "columns is STRICTLY GREATER than this value (0.0..1.0). "
+            "1.0 = never drop; 0.0 = drop any row with any missing; "
+            "0.5 = drop rows >50% missing. If omitted, the value from the "
+            "preset or --config file is kept."
+        ),
+    ),
+    col_threshold: Optional[float] = typer.Option(
+        None, "--col-threshold",
+        help=(
+            "For drop_col: drop columns whose missing fraction is strictly "
+            "greater than this value. 1.0 = never drop. If omitted, the "
+            "value from the preset or --config file is kept."
+        ),
+    ),
+    config: Optional[str] = typer.Option(
+        None, "--config",
+        help="Load options from a saved JSON config file.",
+    ),
+    save_config: Optional[str] = typer.Option(
+        None, "--save-config",
+        help="Save current options to a JSON config file.",
+    ),
+    sheet: Optional[str] = typer.Option(
+        None, "--sheet",
+        help="Excel sheet name or index (default: first sheet).",
+    ),
+    encoding_override: Optional[str] = typer.Option(
+        None, "--encoding",
+        help="Override auto-detected file encoding.",
+    ),
+    header_row: Optional[int] = typer.Option(
+        None, "--header-row",
+        help="0-based row index for the header (default: auto-detect).",
+    ),
+    full_changelog: bool = typer.Option(
+        False, "--full-changelog",
+        help="Write every change to the audit CSV (default caps to first 1000).",
+    ),
+) -> None:
+    """Detect and handle missing values."""
+    # Project and pandas imports are done lazily inside the command
+    # (presumably to keep module import / --help cheap).
+    from src.core.io import read_file, write_file
+    from src.core.missing import MissingOptions, PRESETS, handle_missing
+    import pandas as pd
+
+    # Validate inputs
+    input_path = Path(input_file)
+    if not input_path.exists():
+        typer.echo(f"Error: File not found: {input_path}", err=True)
+        raise typer.Exit(1)
+
+    if preset not in PRESETS:
+        typer.echo(
+            f"Error: Unknown preset '{preset}'. "
+            f"Choose from: {', '.join(sorted(PRESETS))}.",
+            err=True,
+        )
+        raise typer.Exit(1)
+
+    log_path = _setup_logging(Path("logs"))
+
+    # Build options: a config file, when given, replaces the preset as the base.
+    if config:
+        cfg_path = Path(config)
+        if not cfg_path.exists():
+            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
+            raise typer.Exit(1)
+        try:
+            options = MissingOptions.from_file(cfg_path)
+        except (ValueError, OSError) as e:
+            typer.echo(f"Error loading config: {e}", err=True)
+            raise typer.Exit(1) from e
+        logger.info("Loaded config from {}", cfg_path)
+    else:
+        options = MissingOptions.from_preset(preset)
+
+    # Explicit CLI flags override the base options; omitted flags do not.
+    if strategy:
+        options.strategy = strategy  # type: ignore[assignment]
+    if col_strategy:
+        options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
+    if fill_value is not None:
+        options.fill_value = fill_value
+    cols_list = _split_csv_arg(columns)
+    if cols_list is not None:
+        options.columns = cols_list
+    skip_list = _split_csv_arg(skip)
+    if skip_list:
+        options.skip_columns = skip_list
+    extra = _split_csv_arg(sentinels)
+    if extra:
+        # dict.fromkeys de-duplicates while keeping first-seen order.
+        options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
+    if no_sentinels:
+        options.standardize_sentinels = False
+    if row_threshold is not None:
+        options.row_drop_threshold = row_threshold
+    if col_threshold is not None:
+        options.col_drop_threshold = col_threshold
+
+    if save_config:
+        try:
+            saved = options.to_file(save_config)
+        except OSError as e:
+            typer.echo(f"Error saving config: {e}", err=True)
+            raise typer.Exit(1) from e
+        typer.echo(f"Config saved to {saved}")
+
+    # Read input
+    typer.echo(f"Reading {input_path.name}...")
+    try:
+        # A purely numeric --sheet value is treated as a sheet index.
+        sheet_arg: str | int | None = None
+        if sheet is not None:
+            try:
+                sheet_arg = int(sheet)
+            except ValueError:
+                sheet_arg = sheet
+        df = read_file(
+            input_path,
+            encoding=encoding_override,
+            header_row=header_row,
+            sheet_name=sheet_arg if sheet_arg is not None else 0,
+            repair=False,
+        )
+        # read_file may hand back an iterable of frames instead of one
+        # DataFrame; collapse it into a single frame.
+        if not isinstance(df, pd.DataFrame):
+            df = pd.concat(list(df), ignore_index=True)
+    except Exception as e:
+        typer.echo(f"Error reading file: {e}", err=True)
+        raise typer.Exit(1) from e
+
+    typer.echo(f"  {len(df)} rows, {len(df.columns)} columns")
+
+    # Run
+    typer.echo("Profiling missingness...")
+    try:
+        result = handle_missing(df, options)
+    except (ValueError, OSError) as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from e
+
+    _print_results(result, input_path, options)
+
+    # Write
+    if apply:
+        stem = input_path.stem
+        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
+        try:
+            write_file(result.handled_df, out_path)
+        except OSError as e:
+            typer.echo(f"Error writing output: {e}", err=True)
+            raise typer.Exit(1) from e
+        typer.echo(f"\nHandled file:    {out_path}")
+
+        if not result.changes.empty:
+            # The audit CSV is always written next to the input file.
+            changes_path = input_path.parent / f"{stem}_missing_changes.csv"
+            audit_df = result.changes
+            cap = 1000
+            if not full_changelog and len(audit_df) > cap:
+                typer.echo(
+                    f"Note: changelog capped at {cap} rows. "
+                    f"Use --full-changelog to write all {len(audit_df)} changes."
+                )
+                audit_df = audit_df.head(cap)
+            try:
+                write_file(audit_df, changes_path)
+            except OSError as e:
+                typer.echo(f"Error writing changes audit: {e}", err=True)
+                raise typer.Exit(1) from e
+            typer.echo(f"Changes audit:   {changes_path}")
+    else:
+        typer.echo(
+            "\nThis was a profile only. Add --apply to write the handled output."
+        )
+
+    typer.echo(f"Log: {log_path}")
+
+
+# ---------------------------------------------------------------------------
+# Output formatting
+# ---------------------------------------------------------------------------
+
+def _print_results(result, input_path: Path, options) -> None:
+    """Echo the missingness profile and the actions taken.
+
+    Prints, in order: a summary of ``result.profile_before``, one line per
+    column of ``result.profile_before.to_dataframe()``, the action counters,
+    the per-column strategy map (when non-empty) and up to five example rows
+    from ``result.changes``.
+
+    Args:
+        result: Object returned by ``handle_missing``; only the attributes
+            read below are required.
+        input_path: Input file; only its name is shown.
+        options: Accepted for interface compatibility; not used here.
+    """
+    rule = "─" * 60
+    before = result.profile_before
+
+    typer.echo(f"\n{rule}")
+    typer.echo(f"  File:                 {input_path.name}")
+    typer.echo(f"  Rows:                 {before.rows_total}")
+    typer.echo(f"  Columns processed:    {len(result.columns_processed)}")
+    typer.echo(
+        f"  Cells missing:        "
+        f"{before.cells_missing} / {before.cells_total}"
+        f" ({before.cells_missing_pct:.1f}%)"
+    )
+    typer.echo(
+        f"  Rows w/ any missing:  "
+        f"{before.rows_with_any_missing} "
+        f"(complete: {before.rows_complete})"
+    )
+    typer.echo(rule)
+
+    typer.echo("\nPer-column profile:")
+    profile_df = before.to_dataframe()
+    for _, row in profile_df.iterrows():
+        # Column labels and dtypes are coerced to str before padding: width
+        # specs such as "<24" raise TypeError on objects (tuples, dtype
+        # instances, ...) whose __format__ does not accept them.
+        line = (
+            f"  {str(row['column']):<24} {str(row['dtype']):<10} "
+            f"missing={row['missing']:<6} ({row['missing_pct']:>5.1f}%)"
+        )
+        if row["top_sentinel_count"]:
+            line += (
+                f"  top sentinel: {row['top_sentinel']!r} ×{row['top_sentinel_count']}"
+            )
+        typer.echo(line)
+
+    typer.echo("\nActions:")
+    typer.echo(f"  Sentinels standardized to NaN:  {result.sentinels_standardized}")
+    typer.echo(f"  Cells filled:                   {result.cells_filled}")
+    typer.echo(f"  Rows dropped:                   {result.rows_dropped}")
+    dropped_line = f"  Columns dropped:                {len(result.columns_dropped)}"
+    if result.columns_dropped:
+        # map(str, ...) keeps join() working for non-string column labels.
+        dropped_line += f" ({', '.join(map(str, result.columns_dropped))})"
+    typer.echo(dropped_line)
+
+    if result.strategy_per_column:
+        typer.echo("\nStrategy per column:")
+        for col, strat in result.strategy_per_column.items():
+            typer.echo(f"  {col}: {strat}")
+
+    if not result.changes.empty:
+        typer.echo("\nFirst examples:")
+        for _, row in result.changes.head(5).iterrows():
+            # Long values are truncated to 40 characters of their repr.
+            old = repr(row["old"])[:40]
+            new = repr(row["new"])[:40]
+            # A row index of -1 is shown as a dash (presumably a change that
+            # is not tied to one row — confirm against handle_missing);
+            # other rows are displayed 1-based.
+            row_label = "—" if row["row"] == -1 else f"Row {row['row'] + 1}"
+            typer.echo(
+                f"  {row_label}, {row['column']}: {old} → {new} "
+                f"[{row['action']}]"
+            )
+
+
+# ---------------------------------------------------------------------------
+# __main__
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    """Invoke the Typer ``app``; used when the module is run as a script."""
+    app()
+
+
+# Allows ``python -m src.cli_missing ...`` as shown in the module docstring.
+if __name__ == "__main__":
+    main()