feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions

307
src/cli_pipeline.py Normal file
View File

@@ -0,0 +1,307 @@
"""CLI for the DataTools Pipeline Runner (script 09).
Usage:
# Run the recommended default pipeline (text → format → missing → dedup):
python -m src.cli_pipeline input.csv --apply
# Quick custom order via --steps:
python -m src.cli_pipeline input.csv \\
--steps text_clean,format_standardize,missing --apply
# Save the recommended pipeline to a JSON for editing:
python -m src.cli_pipeline --recommend --output pipeline.json
# Run a saved pipeline:
python -m src.cli_pipeline weekly_export.csv --pipeline pipeline.json --apply
# Strict mode: fail if the pipeline contains soft-dependency violations
python -m src.cli_pipeline data.csv --steps dedup,text_clean \\
--strict --apply
"""
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the pipeline CLI.
#
# The help text is user-facing, so it has to describe what the `run` command
# really does: without --apply the command prints the plan (and any ordering
# warnings) and returns before a single step is executed.
app = typer.Typer(
    name="pipeline",
    help=(
        "Chain DataTools cleaning steps into one orchestrated workflow.\n\n"
        "Default behaviour: preview the plan only (no step is executed and "
        "no file is written). Add --apply to run the pipeline and write the "
        "cleaned output and audit log.\n\n"
        "The pipeline RECOMMENDS an order based on tool dependencies "
        "(text-clean before format-standardize, format before dedup, etc.) "
        "and WARNS on out-of-order configs but does not block them. Use "
        "--strict to escalate warnings to errors.\n\n"
        "Tools available: text_clean, format_standardize, missing, "
        "column_map, dedup."
    ),
    add_completion=False,
    no_args_is_help=False,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr and at a timestamped file inside ``log_dir``.

    ``log_dir`` is created (parents included) when it does not exist. All
    handlers currently attached to the loguru logger are removed, then two
    sinks are registered:

    * ``sys.stderr`` at WARNING level, printing the bare message;
    * ``log_dir/pipeline_<YYYYmmdd_HHMMSS>.log`` at DEBUG level, with a
      timestamp and padded level name in front of each message.

    Returns the path of the file sink.
    """
    log_dir.mkdir(parents=True, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = log_dir / f"pipeline_{stamp}.log"

    # Clear existing handlers first so the two sinks below are the only ones.
    logger.remove()
    logger.add(sys.stderr, format="{message}", level="WARNING")
    logger.add(
        str(target),
        format="{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        level="DEBUG",
    )
    return target
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
    """Break a comma-separated option value into trimmed, non-empty items.

    ``None`` is passed straight through as ``None``. Otherwise the string is
    split on commas, each piece is stripped of surrounding whitespace, and
    pieces that end up empty are discarded (so ``""`` yields ``[]``).
    """
    if raw is None:
        return None

    items: list[str] = []
    for piece in raw.split(","):
        trimmed = piece.strip()
        if trimmed:
            items.append(trimmed)
    return items
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
# NOTE: the docstring of `run` is intentionally left as a one-liner because
# Typer surfaces a command's docstring as its --help text; implementation
# notes therefore live in comments.
#
# Flow of the command:
#   1. --recommend: print (or save to --output) the default pipeline JSON and
#      return without touching any input file.
#   2. Validate the input path and configure logging under ./logs.
#   3. Resolve the pipeline from --pipeline, --steps, or the recommended
#      default (--pipeline and --steps are mutually exclusive).
#   4. Print the plan and any soft-dependency warnings; --strict turns those
#      warnings into an abort.
#   5. Without --apply, stop here (plan-only run).
#   6. With --apply: read the input, run the steps, then write the output CSV
#      and a JSON audit file next to the input.
#
# Exit codes: 2 for a missing input argument or a --strict rejection, 1 for
# every other handled failure.
@app.command()
def run(
    input_file: Optional[str] = typer.Argument(
        None,
        help="CSV / TSV / Excel file. Optional with --recommend.",
    ),
    pipeline_path: Optional[str] = typer.Option(
        None, "--pipeline", "-p",
        help="Path to a pipeline JSON file (Pipeline.from_file format).",
    ),
    steps: Optional[str] = typer.Option(
        None, "--steps",
        help=(
            "Quick pipeline: comma-separated tool names in execution order. "
            "Each step uses defaults. Example: 'text_clean,format_standardize,dedup'."
        ),
    ),
    recommend: bool = typer.Option(
        False, "--recommend",
        help="Print (or save) the recommended default pipeline and exit.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help=(
            "When --recommend is set, save the pipeline JSON here. "
            "Otherwise, write the pipeline output to this CSV path "
            "(default: {input}_pipeline.csv)."
        ),
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the plan is shown.",
    ),
    strict: bool = typer.Option(
        False, "--strict",
        help="Treat soft-dependency warnings as errors (refuse to run).",
    ),
    continue_on_error: bool = typer.Option(
        False, "--continue-on-error",
        help="Don't abort if a step fails; carry the previous step's df forward.",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    delimiter: Optional[str] = typer.Option(
        None, "--delimiter",
        help="Override auto-detected delimiter.",
    ),
):
    """Run a DataTools cleaning pipeline."""
    # Project imports stay inside the command, as in the original layout.
    from src.core.errors import format_for_user
    from src.core.pipeline import (
        Pipeline,
        recommended_pipeline,
        run_pipeline,
        validate_pipeline,
    )

    # Visible horizontal rule used around the plan and the summary. Plain
    # ASCII so it renders on any console encoding.
    rule = "-" * 60

    # ------------------------------------------------------------------
    # --recommend: print or save the default pipeline and exit
    # ------------------------------------------------------------------
    if recommend:
        pipe = recommended_pipeline()
        body = json.dumps(pipe.to_dict(), indent=2)
        if output:
            Path(output).write_text(body)
            typer.echo(f"Recommended pipeline saved to {output}")
        else:
            typer.echo(body)
        return

    if not input_file:
        typer.echo(
            "Error: input file is required (or use --recommend to "
            "emit the default pipeline).",
            err=True,
        )
        raise typer.Exit(2)

    inp = Path(input_file)
    if not inp.exists():
        typer.echo(f"Error: File not found: {inp}", err=True)
        raise typer.Exit(1)
    if inp.is_dir():
        # A directory passes exists(); reject it here instead of letting a
        # plan-only run "succeed" and the --apply run fail inside the reader.
        typer.echo(f"Error: Not a file: {inp}", err=True)
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # ------------------------------------------------------------------
    # Resolve pipeline source: --pipeline file, --steps list, or default
    # ------------------------------------------------------------------
    if pipeline_path and steps:
        typer.echo(
            "Error: pass either --pipeline or --steps, not both.",
            err=True,
        )
        raise typer.Exit(1)

    if pipeline_path:
        pp = Path(pipeline_path)
        if not pp.exists():
            typer.echo(f"Error: pipeline file not found: {pp}", err=True)
            raise typer.Exit(1)
        try:
            pipe = Pipeline.from_file(pp)
        except Exception as exc:
            typer.echo(
                f"Error reading pipeline: {format_for_user(exc)}", err=True,
            )
            raise typer.Exit(1) from exc
    elif steps:
        names = _split_csv_arg(steps) or []
        try:
            # NOTE(review): assumes recommended_pipeline(include=...) keeps
            # the order given on the command line - confirm in
            # src.core.pipeline.
            pipe = recommended_pipeline(include=names)
        except Exception as exc:
            typer.echo(f"Error: {format_for_user(exc)}", err=True)
            raise typer.Exit(1) from exc
    else:
        pipe = recommended_pipeline()

    # ------------------------------------------------------------------
    # Plan + warnings
    # ------------------------------------------------------------------
    order_warnings = validate_pipeline(pipe)

    typer.echo(f"\n{rule}")
    typer.echo(" Pipeline plan:")
    for i, step in enumerate(pipe.steps, 1):
        # Explicit marker so disabled steps are recognisable in the plan.
        flag = "[x] " if step.enabled else "[ ] "
        typer.echo(
            f" {i}. {flag}{step.display_name():<22} options={step.options or {}}"
        )
    typer.echo(rule)

    if order_warnings:
        typer.echo("\nSoft-dependency warnings (recommended order violated):")
        for w in order_warnings:
            typer.echo(f" ! {w}")
        if strict:
            typer.echo(
                "\nAborting: --strict was set. Reorder the steps or drop --strict.",
                err=True,
            )
            raise typer.Exit(2)

    if not apply:
        typer.echo(
            "\nThis was a plan-only run. Add --apply to execute the pipeline."
        )
        typer.echo(f"Log: {log_path}")
        return

    # ------------------------------------------------------------------
    # Read input + execute
    # ------------------------------------------------------------------
    from src.core.io import read_file, write_file
    import pandas as pd

    typer.echo(f"\nReading {inp.name}...")
    try:
        df = read_file(
            inp, encoding=encoding_override, delimiter=delimiter, repair=False,
        )
        if not isinstance(df, pd.DataFrame):
            # read_file returned something other than a DataFrame; treat it
            # as an iterable of frames and merge them into one.
            df = pd.concat(list(df), ignore_index=True)
    except Exception as exc:
        # Same user-facing formatting as every other failure in this command.
        typer.echo(f"Error reading file: {format_for_user(exc)}", err=True)
        raise typer.Exit(1) from exc
    typer.echo(f" {len(df):,} rows, {len(df.columns)} columns")

    typer.echo("\nExecuting pipeline:")

    def _on_step(sr) -> None:
        # Progress callback: one status line per step, passed to
        # run_pipeline as on_step_complete.
        name = sr.step.display_name()
        if sr.skipped:
            typer.echo(f" - {name} (skipped)")
            return
        elapsed = f"{sr.elapsed_seconds*1000:.0f} ms"
        if sr.error:
            first_line = sr.error.splitlines()[0]
            typer.echo(f" ! {name} ({elapsed}) — ERROR: {first_line}")
        else:
            typer.echo(f" + {name} ({elapsed}) {sr.summary}")

    try:
        result = run_pipeline(
            df, pipe,
            on_step_complete=_on_step,
            stop_on_error=not continue_on_error,
        )
    except Exception as exc:
        typer.echo(f"\nPipeline halted: {format_for_user(exc)}", err=True)
        raise typer.Exit(1) from exc

    steps_run = sum(1 for s in result.step_results if not s.skipped)
    typer.echo(f"\n{rule}")
    typer.echo(f" Initial rows: {result.initial_rows:,}")
    typer.echo(f" Final rows: {result.final_rows:,}")
    typer.echo(f" Steps run: {steps_run}")
    typer.echo(f" Total elapsed: {result.total_elapsed:.2f} s")
    typer.echo(rule)

    # ------------------------------------------------------------------
    # Write output + audit
    # ------------------------------------------------------------------
    out_path = Path(output) if output else inp.parent / f"{inp.stem}_pipeline.csv"
    # The audit file always sits next to the input, even when --output
    # points somewhere else.
    audit_path = inp.parent / f"{inp.stem}_pipeline.json"
    audit = {
        "pipeline": pipe.to_dict(),
        "warnings": result.warnings,
        "initial_rows": result.initial_rows,
        "final_rows": result.final_rows,
        "total_elapsed_seconds": result.total_elapsed,
        "steps": [
            {
                "tool": sr.step.tool,
                "name": sr.step.display_name(),
                "enabled": sr.step.enabled,
                "skipped": sr.skipped,
                "elapsed_seconds": sr.elapsed_seconds,
                "summary": sr.summary,
                "error": sr.error,
            }
            for sr in result.step_results
        ],
    }

    try:
        write_file(result.final_df, out_path)
        typer.echo(f"\nPipeline output: {out_path}")
        # default=str stringifies any value json cannot serialise natively.
        audit_path.write_text(json.dumps(audit, indent=2, default=str))
    except Exception as exc:
        # Report write failures like the read/run failures above instead of
        # ending a finished run with a raw traceback.
        typer.echo(f"Error writing output: {format_for_user(exc)}", err=True)
        raise typer.Exit(1) from exc
    typer.echo(f"Pipeline audit: {audit_path}")
    typer.echo(f"Log: {log_path}")
def main() -> None:
    """Console entry point: hand control to the Typer application."""
    app()


if __name__ == "__main__":
    # Direct execution, e.g. `python -m src.cli_pipeline ...`.
    main()