feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
307
src/cli_pipeline.py
Normal file
307
src/cli_pipeline.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""CLI for the DataTools Pipeline Runner (script 09).
|
||||
|
||||
Usage:
|
||||
# Run the recommended default pipeline (text → format → missing → dedup):
|
||||
python -m src.cli_pipeline input.csv --apply
|
||||
|
||||
# Quick custom order via --steps:
|
||||
python -m src.cli_pipeline input.csv \\
|
||||
--steps text_clean,format_standardize,missing --apply
|
||||
|
||||
# Save the recommended pipeline to a JSON for editing:
|
||||
python -m src.cli_pipeline --recommend --output pipeline.json
|
||||
|
||||
# Run a saved pipeline:
|
||||
python -m src.cli_pipeline weekly_export.csv --pipeline pipeline.json --apply
|
||||
|
||||
# Strict mode: fail if the pipeline contains soft-dependency violations
|
||||
python -m src.cli_pipeline data.csv --steps dedup,text_clean \\
|
||||
--strict --apply
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
# Typer application object for the Pipeline Runner CLI.
# The help text must match what the `run` command does: without --apply the
# command prints the plan (and any soft-dependency warnings) and exits
# without executing any step or writing any file.
app = typer.Typer(
    name="pipeline",
    help=(
        "Chain DataTools cleaning steps into one orchestrated workflow.\n\n"
        "Default behaviour: preview the plan only (no step is executed and "
        "no file is written). Add --apply to run the pipeline and write the "
        "cleaned output and audit log.\n\n"
        "The pipeline RECOMMENDS an order based on tool dependencies "
        "(text-clean before format-standardize, format before dedup, etc.) "
        "and WARNS on out-of-order configs but does not block them. Use "
        "--strict to escalate warnings to errors.\n\n"
        "Tools available: text_clean, format_standardize, missing, "
        "column_map, dedup."
    ),
    add_completion=False,
    no_args_is_help=False,
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru sinks for a pipeline run and return the log file path.

    Creates *log_dir* (with parents) when it does not exist, then registers
    two sinks: stderr at WARNING level with a bare-message format, and a
    timestamped ``pipeline_<YYYYmmdd_HHMMSS>.log`` file at DEBUG level.

    Args:
        log_dir: Directory that will hold the log file.

    Returns:
        Path of the log file registered as the DEBUG sink.
    """
    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"

    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = log_dir / f"pipeline_{stamp}.log"

    # Drop previously registered sinks before installing ours.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(str(log_path), level="DEBUG", format=file_format)
    return log_path
|
||||
|
||||
|
||||
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
|
||||
if raw is None:
|
||||
return None
|
||||
return [c.strip() for c in raw.split(",") if c.strip()]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main command
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.command()
def run(
    input_file: Optional[str] = typer.Argument(
        None,
        help="CSV / TSV / Excel file. Optional with --recommend.",
    ),
    pipeline_path: Optional[str] = typer.Option(
        None, "--pipeline", "-p",
        help="Path to a pipeline JSON file (Pipeline.from_file format).",
    ),
    steps: Optional[str] = typer.Option(
        None, "--steps",
        help=(
            "Quick pipeline: comma-separated tool names in execution order. "
            "Each step uses defaults. Example: 'text_clean,format_standardize,dedup'."
        ),
    ),
    recommend: bool = typer.Option(
        False, "--recommend",
        help="Print (or save) the recommended default pipeline and exit.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help=(
            "When --recommend is set, save the pipeline JSON here. "
            "Otherwise, write the pipeline output to this CSV path "
            "(default: {input}_pipeline.csv)."
        ),
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the plan is shown.",
    ),
    strict: bool = typer.Option(
        False, "--strict",
        help="Treat soft-dependency warnings as errors (refuse to run).",
    ),
    continue_on_error: bool = typer.Option(
        False, "--continue-on-error",
        help="Don't abort if a step fails; carry the previous step's df forward.",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    delimiter: Optional[str] = typer.Option(
        None, "--delimiter",
        help="Override auto-detected delimiter.",
    ),
) -> None:
    """Run a DataTools cleaning pipeline."""
    # NOTE: the docstring above is deliberately left as a one-liner because
    # Typer uses a command's docstring as its help text; developer-facing
    # documentation lives in these comments instead.
    #
    # Flow:
    #   1. --recommend: print or save the recommended pipeline JSON and return.
    #   2. Validate the input path and the --pipeline / --steps combination.
    #   3. Build the pipeline (from file, from --steps, or the default).
    #   4. Print the plan and any soft-dependency warnings; with --strict a
    #      warning aborts the run.
    #   5. Without --apply, stop here (plan-only run).
    #   6. Read the input, execute the pipeline, print a summary.
    #   7. Write the output file and a JSON audit next to the input.
    #
    # Exit codes: 2 for a missing input argument or a --strict violation,
    # 1 for every other reported failure.
    from src.core.errors import format_for_user
    from src.core.pipeline import (
        Pipeline,
        recommended_pipeline,
        run_pipeline,
        validate_pipeline,
    )

    # ------------------------------------------------------------------
    # --recommend: print or save the default pipeline and exit
    # ------------------------------------------------------------------
    if recommend:
        pipe = recommended_pipeline()
        body = json.dumps(pipe.to_dict(), indent=2)
        if output:
            # Report an unwritable destination as a CLI error rather than a
            # traceback, matching how read failures are handled below.
            try:
                Path(output).write_text(body, encoding="utf-8")
            except OSError as e:
                typer.echo(f"Error writing pipeline: {e}", err=True)
                raise typer.Exit(1)
            typer.echo(f"Recommended pipeline saved to {output}")
        else:
            typer.echo(body)
        return

    if not input_file:
        typer.echo(
            "Error: input file is required (or use --recommend to "
            "emit the default pipeline).",
            err=True,
        )
        raise typer.Exit(2)

    inp = Path(input_file)
    if not inp.exists():
        typer.echo(f"Error: File not found: {inp}", err=True)
        raise typer.Exit(1)

    # Reject contradictory options before touching the filesystem, so an
    # invalid invocation does not leave a logs/ directory and log file behind.
    if pipeline_path and steps:
        typer.echo(
            "Error: pass either --pipeline or --steps, not both.",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # ------------------------------------------------------------------
    # Resolve pipeline source: --pipeline file, --steps list, or default
    # ------------------------------------------------------------------
    if pipeline_path:
        pp = Path(pipeline_path)
        if not pp.exists():
            typer.echo(f"Error: pipeline file not found: {pp}", err=True)
            raise typer.Exit(1)
        try:
            pipe = Pipeline.from_file(pp)
        except Exception as e:
            typer.echo(f"Error reading pipeline: {format_for_user(e)}", err=True)
            raise typer.Exit(1)
    elif steps:
        names = _split_csv_arg(steps) or []
        try:
            pipe = recommended_pipeline(include=names)
        except Exception as e:
            typer.echo(f"Error: {format_for_user(e)}", err=True)
            raise typer.Exit(1)
    else:
        pipe = recommended_pipeline()

    # ------------------------------------------------------------------
    # Plan + warnings
    # ------------------------------------------------------------------
    warnings = validate_pipeline(pipe)
    typer.echo(f"\n{'─'*60}")
    typer.echo(" Pipeline plan:")
    for i, step in enumerate(pipe.steps, 1):
        # Disabled steps are listed too, marked with a cross.
        flag = " " if step.enabled else "✗ "
        typer.echo(f" {i}. {flag}{step.display_name():<22} options={step.options or {}}")
    typer.echo(f"{'─'*60}")

    if warnings:
        typer.echo("\nSoft-dependency warnings (recommended order violated):")
        for w in warnings:
            typer.echo(f" ! {w}")
        # Warnings are advisory unless --strict was requested.
        if strict:
            typer.echo(
                "\nAborting: --strict was set. Reorder the steps or drop --strict.",
                err=True,
            )
            raise typer.Exit(2)

    if not apply:
        typer.echo(
            "\nThis was a plan-only run. Add --apply to execute the pipeline."
        )
        typer.echo(f"Log: {log_path}")
        return

    # ------------------------------------------------------------------
    # Read input + execute
    # ------------------------------------------------------------------
    from src.core.io import read_file, write_file
    import pandas as pd

    typer.echo(f"\nReading {inp.name}...")
    try:
        df = read_file(
            inp, encoding=encoding_override, delimiter=delimiter, repair=False,
        )
        # read_file may return something other than a single DataFrame; in
        # that case it is treated as an iterable of frames and concatenated.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)

    typer.echo(f" {len(df):,} rows, {len(df.columns)} columns")

    typer.echo("\nExecuting pipeline:")

    def _on_step(sr) -> None:
        # Progress callback: one line per finished step.
        if sr.skipped:
            typer.echo(f" - {sr.step.display_name()} (skipped)")
        elif sr.error:
            typer.echo(f" ✗ {sr.step.display_name()} ({sr.elapsed_seconds*1000:.0f} ms) — ERROR: {sr.error.splitlines()[0]}")
        else:
            typer.echo(f" ✓ {sr.step.display_name()} ({sr.elapsed_seconds*1000:.0f} ms) {sr.summary}")

    try:
        result = run_pipeline(
            df, pipe,
            on_step_complete=_on_step,
            stop_on_error=not continue_on_error,
        )
    except Exception as e:
        typer.echo(f"\nPipeline halted: {format_for_user(e)}", err=True)
        raise typer.Exit(1)

    typer.echo(f"\n{'─'*60}")
    typer.echo(f" Initial rows: {result.initial_rows:,}")
    typer.echo(f" Final rows: {result.final_rows:,}")
    typer.echo(f" Steps run: {sum(1 for s in result.step_results if not s.skipped)}")
    typer.echo(f" Total elapsed: {result.total_elapsed:.2f} s")
    typer.echo(f"{'─'*60}")

    # ------------------------------------------------------------------
    # Write output + audit
    # ------------------------------------------------------------------
    out_path = Path(output) if output else inp.parent / f"{inp.stem}_pipeline.csv"
    try:
        write_file(result.final_df, out_path)
    except Exception as e:
        typer.echo(f"Error writing output: {format_for_user(e)}", err=True)
        raise typer.Exit(1)
    typer.echo(f"\nPipeline output: {out_path}")

    # The audit lives next to the input as {stem}_pipeline.json. If that name
    # is also the --pipeline definition or the --output target, writing the
    # audit there would destroy it, so fall back to a distinct file name.
    audit_path = inp.parent / f"{inp.stem}_pipeline.json"
    reserved = {out_path.resolve()}
    if pipeline_path:
        reserved.add(Path(pipeline_path).resolve())
    if audit_path.resolve() in reserved:
        audit_path = inp.parent / f"{inp.stem}_pipeline_audit.json"

    audit = {
        "pipeline": pipe.to_dict(),
        "warnings": result.warnings,
        "initial_rows": result.initial_rows,
        "final_rows": result.final_rows,
        "total_elapsed_seconds": result.total_elapsed,
        "steps": [
            {
                "tool": sr.step.tool,
                "name": sr.step.display_name(),
                "enabled": sr.step.enabled,
                "skipped": sr.skipped,
                "elapsed_seconds": sr.elapsed_seconds,
                "summary": sr.summary,
                "error": sr.error,
            }
            for sr in result.step_results
        ],
    }
    try:
        # default=str stringifies any value json cannot serialise natively.
        audit_path.write_text(
            json.dumps(audit, indent=2, default=str), encoding="utf-8",
        )
    except OSError as e:
        typer.echo(f"Error writing audit: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f"Pipeline audit: {audit_path}")
    typer.echo(f"Log: {log_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Entry point: invoke the Typer application defined in this module."""
    app()
|
||||
|
||||
|
||||
# Run the CLI when the module is executed as a script
# (e.g. `python -m src.cli_pipeline ...`).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user