feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready):
  04 Missing Value Handler   src/core/missing.py + cli_missing.py + GUI
  05 Column Mapper           src/core/column_mapper.py + cli_column_map.py + GUI
  09 Pipeline Runner         src/core/pipeline.py + cli_pipeline.py + GUI
                             with soft tool-dependency graph (recommended,
                             not enforced) and JSON save/load for repeatable
                             weekly cleanups.

Format Standardizer reworked for 1 GB international files:
  • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
  • Per-row country / address columns drive parsing
  • Audit cap (default 10 k rows, ~50 MB RAM)
  • standardize_file(): chunked streaming entry point (~165 k rows/sec)
  • currency_decimal="auto" for EU comma-decimal locales
  • R$ / kr / zł multi-char currency prefixes
  • cli_format.py with auto-stream above 100 MB inputs

Encoding detection arbiter + language-aware probe:
  Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
  via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.

Distribution-readiness assets:
  • streamlit_app.py — Streamlit Community Cloud entry shim
  • src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
    100-row cap + watermark, free-vs-paid boundary enforced at surface
  • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
  • landing/ — 4 static HTML pages (apex chooser + 3 niche),
    shared CSS, deploy.py URL-substitution script,
    auto-generated robots.txt + sitemap.xml + 404.html + favicon
  • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
    — full strategy + measurement + deployment + master checklist

Test counts:
  before: 1,520 passed · 4 skipped · 17 xfailed
  after:  1,729 passed · 0 skipped · 0  xfailed

Tier-1 corpora added:
  • missing-corpus           3 use cases + 16 edge cases
  • column-mapper-corpus     3 use cases + 5 edge cases
  • format-cleaner intl      20-row 13-country stress fixture

Engine hardening flushed out by the corpora:
  • interpolate guards against object-dtype columns
  • mean/median skip all-NaN columns (silences numpy warning)
  • fillna runs under future.no_silent_downcasting (silences pandas warning)
  • mojibake test no longer skips when ftfy installed (monkeypatch path)
  • drop-row threshold semantics: strict-greater (consistent across rows / cols)
  • currency_decimal validator allow-set updated for "auto"

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions

355
src/cli_column_map.py Normal file
View File

@@ -0,0 +1,355 @@
"""CLI for the DataTools Column Mapper (script 05).
Usage:
python -m src.cli_column_map input.csv # auto-mapping preview
python -m src.cli_column_map input.csv --schema target.json --apply
python -m src.cli_column_map input.csv --rename "First Name=first_name,Email=email" --apply
python -m src.cli_column_map input.csv --schema target.json --preset strict-schema --apply
python -m src.cli_column_map input.csv --coerce-col "age:integer,joined:date" --apply
python -m src.cli_column_map --help
"""
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the column-mapper CLI. ``help`` is the
# long-form text shown by ``--help``; the shell-completion install options
# are switched off, and invoking the program with no arguments prints the
# help instead of a "missing argument" error.
app = typer.Typer(
    name="column-map",
    help=(
        "Rename columns, enforce a target schema, and coerce types in CSV / Excel files.\n\n"
        "Default behaviour: preview the mapping (no file written). Add --apply "
        "to write the mapped output and audit log.\n\n"
        "Examples:\n\n"
        " # Show what auto-mapping would do (no schema → identity)\n"
        " python -m src.cli_column_map vendor.csv\n\n"
        " # Map against a target JSON schema with strict drop / coerce / reorder\n"
        " python -m src.cli_column_map vendor.csv --schema target.json "
        "--preset strict-schema --apply\n\n"
        " # Hand-rolled rename without a schema\n"
        " python -m src.cli_column_map data.csv "
        "--rename 'First Name=first_name,Last Name=last_name' --apply\n\n"
        " # Coerce specific columns inline\n"
        " python -m src.cli_column_map data.csv "
        "--coerce-col 'age:integer,joined:date' --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru for one CLI run and return the log file path.

    Creates *log_dir* if needed, removes every previously registered loguru
    sink, then registers two sinks: stderr at WARNING (bare message only) and
    a new ``column_map_<timestamp>.log`` file at DEBUG with a timestamped
    line format.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir / ("column_map_" + stamp + ".log")

    # Start from a clean slate so loguru's default stderr sink does not
    # duplicate the WARNING sink registered below.
    logger.remove()
    sinks = (
        (sys.stderr, "WARNING", "{message}"),
        (
            str(destination),
            "DEBUG",
            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        ),
    )
    for sink, level, line_format in sinks:
        logger.add(sink, level=level, format=line_format)
    return destination
def _parse_pairs(raw: Optional[str], separator: str = ",") -> dict[str, str]:
"""Parse ``a=1,b=2`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(separator):
piece = piece.strip()
if not piece:
continue
if "=" not in piece:
raise typer.BadParameter(
f"Invalid pair: {piece!r}. Expected 'key=value[,key=value...]'."
)
k, v = piece.split("=", 1)
out[k.strip()] = v.strip()
return out
def _parse_coerce(raw: Optional[str]) -> dict[str, str]:
"""Parse ``age:integer,joined:date`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(","):
piece = piece.strip()
if not piece:
continue
if ":" not in piece:
raise typer.BadParameter(
f"Invalid --coerce-col piece: {piece!r}. "
f"Expected 'col:dtype[,col:dtype...]'."
)
col, dtype = piece.split(":", 1)
out[col.strip()] = dtype.strip()
return out
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
@app.command()
def map_(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_mapped.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the mapping plan is shown.",
    ),
    preset: str = typer.Option(
        "rename-only", "--preset",
        help="Preset: rename-only, strict-schema, or lenient-schema.",
    ),
    schema: Optional[str] = typer.Option(
        None, "--schema",
        help="Path to a target schema JSON file (TargetSchema format).",
    ),
    rename: Optional[str] = typer.Option(
        None, "--rename",
        help="Explicit rename pairs: 'src=tgt[,src=tgt...]' (overrides auto-inference).",
    ),
    coerce_col: Optional[str] = typer.Option(
        None, "--coerce-col",
        help=(
            "Inline type coercion (no schema needed): 'col:dtype[,col:dtype...]'. "
            "Valid dtypes: string, integer, float, boolean, date, datetime, category, auto."
        ),
    ),
    unmapped: Optional[str] = typer.Option(
        None, "--unmapped",
        help="Strategy for unmapped source columns: keep | drop | error.",
    ),
    threshold: Optional[float] = typer.Option(
        None, "--threshold",
        help="Fuzzy-match threshold for auto-inference (0.0..1.0). Default 0.6.",
    ),
    no_auto: bool = typer.Option(
        False, "--no-auto",
        help="Disable auto-inference; honour only explicit --rename pairs.",
    ),
    no_coerce: bool = typer.Option(
        False, "--no-coerce",
        help="Disable type coercion (overrides preset).",
    ),
    no_reorder: bool = typer.Option(
        False, "--no-reorder",
        help="Disable schema-order reorder (overrides preset).",
    ),
    no_required: bool = typer.Option(
        False, "--no-required",
        help="Don't enforce required-target presence (overrides preset).",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
):
    """Map source columns to a target schema; rename, coerce, drop, reorder."""
    # NOTE: the docstring above is deliberately kept to one line because
    # Typer shows a command's docstring in ``--help``.
    #
    # Flow: validate arguments -> build MapOptions (from --config, else from
    # --preset) -> layer the explicit flags on top -> read the input -> run
    # map_columns -> print the plan -> with --apply, write the mapped file
    # and a JSON audit of the resolved mapping.
    #
    # Exits with status 1 on a missing input/config/schema file, an unknown
    # preset, an out-of-range --threshold, a read failure, a ValueError or
    # OSError from map_columns, or an OSError while writing.
    from src.core.io import read_file, write_file
    from src.core.column_mapper import (
        MapOptions,
        PRESETS,
        TargetField,
        TargetSchema,
        coerce_series,
        map_columns,
    )
    import pandas as pd

    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)
    # Reject an out-of-range threshold up front; the help text documents the
    # valid range as 0.0..1.0.
    if threshold is not None and not 0.0 <= threshold <= 1.0:
        typer.echo(
            f"Error: --threshold must be between 0.0 and 1.0 (got {threshold}).",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options: a saved config replaces the preset entirely.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MapOptions.from_file(cfg_path)
    else:
        options = MapOptions.from_preset(preset)

    if schema:
        sp = Path(schema)
        if not sp.exists():
            typer.echo(f"Error: Schema file not found: {sp}", err=True)
            raise typer.Exit(1)
        options.schema = TargetSchema.from_file(sp)
    if rename:
        # Explicit pairs win over any mapping already present in the options.
        options.mapping = {**options.mapping, **_parse_pairs(rename)}
    if unmapped:
        options.unmapped = unmapped  # type: ignore[assignment]
    if threshold is not None:
        options.fuzzy_threshold = threshold
    if no_auto:
        options.auto_infer = False
    if no_coerce:
        options.coerce_types = False
    if no_reorder:
        options.reorder_to_schema = False
    if no_required:
        options.enforce_required = False

    # Inline coercion (no schema): build a tiny one-field-per-column schema.
    inline_coerce = _parse_coerce(coerce_col)
    if inline_coerce:
        if options.schema is None:
            options.schema = TargetSchema(fields=[
                TargetField(name=col, dtype=dt)  # type: ignore[arg-type]
                for col, dt in inline_coerce.items()
            ])
            options.coerce_types = True
        else:
            # A schema (from --schema or --config) already defines the
            # dtypes; say so instead of dropping the flag without a trace.
            typer.echo(
                "Warning: --coerce-col is ignored because a target schema is "
                "already set; declare the dtypes in the schema instead.",
                err=True,
            )

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A purely numeric --sheet value is passed on as an index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # read_file may hand back an iterable of frames rather than a single
        # DataFrame; collapse it into one.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    typer.echo("Mapping columns...")
    try:
        result = map_columns(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    _print_results(result, input_path, options)

    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_mapped.csv"
        # Audit: write the resolved mapping as JSON next to the output. With
        # the default output location this is the input's directory.
        audit_path = out_path.parent / f"{stem}_mapping.json"
        audit = {
            "mapping": result.mapping,
            "inferred_pairs": result.inferred_pairs,
            "columns_renamed": result.columns_renamed,
            "columns_dropped": result.columns_dropped,
            "columns_added": result.columns_added,
            "coercion_failures": result.coercion_failures,
            "unmapped_kept": result.unmapped_kept,
            "missing_required_targets": result.missing_required_targets,
        }
        try:
            write_file(result.mapped_df, out_path)
            typer.echo(f"\nMapped file: {out_path}")
            audit_path.write_text(
                json.dumps(audit, indent=2, default=str),
                encoding="utf-8",
            )
        except OSError as e:
            typer.echo(f"Error writing output: {e}", err=True)
            raise typer.Exit(1)
        typer.echo(f"Mapping audit: {audit_path}")
    else:
        typer.echo("\nThis was a preview. Add --apply to write the mapped output.")
    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path, options) -> None:
    """Print a human-readable summary of a column-mapping result.

    Args:
        result: The object returned by ``map_columns``. The attributes read
            here are ``columns_renamed``, ``columns_dropped``,
            ``columns_added``, ``unmapped_kept``, ``coercion_failures``
            (column -> failed-cell count), ``mapping`` (source -> target),
            ``inferred_pairs`` and ``missing_required_targets``.
        input_path: Path of the input file; only its name is shown.
        options: Accepted for signature compatibility; not used here.
    """
    # ASCII rule and arrows so the summary renders on any console encoding.
    rule = "-" * 60
    failures = result.coercion_failures

    typer.echo(f"\n{rule}")
    typer.echo(f" File: {input_path.name}")
    typer.echo(f" Columns renamed: {result.columns_renamed}")
    typer.echo(f" Columns dropped: {len(result.columns_dropped)}")
    typer.echo(f" Columns added: {len(result.columns_added)}")
    typer.echo(f" Unmapped kept: {len(result.unmapped_kept)}")
    typer.echo(f" Coercion failures: "
               f"{sum(failures.values())} cells across "
               f"{len(failures)} column(s)")
    typer.echo(rule)

    if result.mapping:
        typer.echo("\nMapping:")
        for src, tgt in result.mapping.items():
            tag = " (auto)" if src in result.inferred_pairs else ""
            # "->" marks an actual rename, "==" a column mapped to itself.
            arrow = "->" if src != tgt else "=="
            typer.echo(f" {src!r} {arrow} {tgt!r}{tag}")
    if result.columns_dropped:
        typer.echo(f"\nDropped: {result.columns_dropped}")
    if result.columns_added:
        typer.echo(f"\nAdded (defaults): {result.columns_added}")
    if failures:
        typer.echo("\nCoercion failures:")
        for col, n in failures.items():
            typer.echo(f" {col}: {n} row(s) could not be coerced")
    if result.missing_required_targets:
        typer.echo(f"\nMissing required targets: {result.missing_required_targets}")
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
def main() -> None:
    """Console entry point: hand control to the Typer application."""
    app()


if __name__ == "__main__":
    main()

364
src/cli_format.py Normal file
View File

@@ -0,0 +1,364 @@
"""CLI for the DataTools Format Standardizer (script 03).
Usage:
python -m src.cli_format input.csv \\
--types 'phone:phone,price:currency,name:name' \\
--apply
# 1 GB international file with per-row country column:
python -m src.cli_format huge.csv \\
--types 'phone:phone,address:address,price:currency' \\
--phone-country country --address-country country \\
--preserve-code --audit-max 50000 --apply
The CLI auto-streams (chunked read/write, bounded RAM) when the input
exceeds ~100 MB. Force or disable with ``--stream`` / ``--no-stream``.
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the format-standardizer CLI. ``help`` is the
# long-form text shown by ``--help``; the shell-completion install options
# are switched off, and invoking the program with no arguments prints the
# help instead of a "missing argument" error.
app = typer.Typer(
    name="format",
    help=(
        "Standardize dates, phones, currencies, names, and addresses "
        "in CSV / Excel files.\n\n"
        "Default behaviour: preview the changes (no file written). "
        "Add --apply to write output.\n\n"
        "For 1 GB+ international files, the CLI auto-streams in 50,000-row "
        "chunks so memory stays bounded. Use --phone-country / "
        "--address-country to point at a per-row ISO-3166 column for "
        "country-aware parsing.\n\n"
        "Examples:\n\n"
        " # Preview\n"
        " python -m src.cli_format data.csv --types 'phone:phone,price:currency'\n\n"
        " # International file with per-row country\n"
        " python -m src.cli_format leads.csv --types 'phone:phone' "
        "--phone-country country --apply\n\n"
        " # Force streaming with smaller chunks for tight memory\n"
        " python -m src.cli_format huge.csv --types 'phone:phone' "
        "--stream --chunk-size 10000 --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru for one CLI run and return the log file path.

    Creates *log_dir* if needed, removes every previously registered loguru
    sink, then registers two sinks: stderr at WARNING (bare message only) and
    a new ``format_<timestamp>.log`` file at DEBUG with a timestamped line
    format.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir / ("format_" + stamp + ".log")

    # Start from a clean slate so loguru's default stderr sink does not
    # duplicate the WARNING sink registered below.
    logger.remove()
    sinks = (
        (sys.stderr, "WARNING", "{message}"),
        (
            str(destination),
            "DEBUG",
            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        ),
    )
    for sink, level, line_format in sinks:
        logger.add(sink, level=level, format=line_format)
    return destination
def _parse_types(raw: Optional[str]) -> dict[str, str]:
"""Parse ``col:phone,col:date`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(","):
piece = piece.strip()
if not piece:
continue
if ":" not in piece:
raise typer.BadParameter(
f"Invalid --types piece: {piece!r}. "
f"Expected 'col:type[,col:type...]' "
f"where type is one of: date, phone, currency, name, address, email, boolean."
)
col, ft = piece.split(":", 1)
out[col.strip()] = ft.strip()
return out
# Input size in bytes above which the CLI picks streaming mode on its own,
# i.e. when neither --stream nor --no-stream was given.
_AUTO_STREAM_THRESHOLD = 100 * 1024 * 1024 # 100 MB
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
@app.command()
def standardize(
    input_file: str = typer.Argument(..., help="CSV or TSV file path."),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_standardized.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only a preview is shown.",
    ),
    types: Optional[str] = typer.Option(
        None, "--types",
        help="Per-column types: 'col:type[,col:type...]'. "
        "Types: date, phone, currency, name, address, email, boolean.",
    ),
    preset: Optional[str] = typer.Option(
        None, "--preset",
        help="Named preset (e.g. 'us', 'uk', 'eu', 'jp'). Layered before --types.",
    ),
    phone_country: Optional[str] = typer.Option(
        None, "--phone-country",
        help="Column name carrying the per-row ISO-3166 country code for phones.",
    ),
    address_country: Optional[str] = typer.Option(
        None, "--address-country",
        help="Column name carrying the per-row country code for addresses.",
    ),
    phone_region: str = typer.Option(
        "US", "--phone-region",
        help="Default phone region when no per-row column is set. ISO-3166 alpha-2.",
    ),
    phone_format: str = typer.Option(
        "E164", "--phone-format",
        help="Phone output format: E164 | INTERNATIONAL | NATIONAL | RFC3966 | DIGITS.",
    ),
    preserve_code: bool = typer.Option(
        False, "--preserve-code",
        help="Currency: emit ISO-4217 prefix (e.g. 'USD 1500.00').",
    ),
    decimals: int = typer.Option(
        2, "--decimals",
        help="Currency decimal precision.",
    ),
    audit_max: int = typer.Option(
        10_000, "--audit-max",
        help="Cap the change-audit at N rows (0 = no audit, -1 = unbounded).",
    ),
    stream: Optional[bool] = typer.Option(
        None, "--stream/--no-stream",
        help="Force streaming (chunked, bounded RAM). Auto-on for inputs > 100 MB.",
    ),
    chunk_size: int = typer.Option(
        50_000, "--chunk-size",
        help="Rows per chunk in streaming mode.",
    ),
    cache_size: int = typer.Option(
        262_144, "--cache-size",
        help="Per-column LRU-cache size (set 0 to disable).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    delimiter: Optional[str] = typer.Option(
        None, "--delimiter",
        help="Override auto-detected delimiter.",
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config.",
    ),
):
    """Standardize formats across a CSV / TSV. Auto-streams for large inputs."""
    # NOTE: the docstring above is deliberately kept to one line because
    # Typer shows a command's docstring in ``--help``.
    #
    # Flow: validate arguments -> build StandardizeOptions (--config, else
    # --preset, else defaults) -> layer --types and the scalar flags on top
    # -> choose streaming vs in-memory -> run -> print a summary -> with
    # --apply, write the output (and, in memory, a changes audit).
    #
    # Exits with status 1 on a missing input/config file, an invalid preset
    # or field type, no declared column types, a non-positive --chunk-size,
    # a read failure, a ValueError or OSError from the engine, or an OSError
    # while writing. Exits with status 0 when streaming mode is selected
    # without --apply (streaming has no preview).
    from src.core.format_standardize import (
        FieldType,
        StandardizeOptions,
        standardize_dataframe,
        standardize_file,
    )
    from src.core.io import read_file, detect_encoding, detect_delimiter
    import pandas as pd
    import time as _time

    rule = "-" * 60

    inp = Path(input_file)
    if not inp.exists():
        typer.echo(f"Error: File not found: {inp}", err=True)
        raise typer.Exit(1)
    if chunk_size <= 0:
        typer.echo(
            f"Error: --chunk-size must be a positive integer (got {chunk_size}).",
            err=True,
        )
        raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options
    if config:
        cp = Path(config)
        if not cp.exists():
            typer.echo(f"Error: Config file not found: {cp}", err=True)
            raise typer.Exit(1)
        options = StandardizeOptions.from_file(cp)
    elif preset:
        try:
            options = StandardizeOptions.from_preset(preset)
        except ValueError as e:
            typer.echo(f"Error: {e}", err=True)
            raise typer.Exit(1)
    else:
        options = StandardizeOptions()

    parsed_types = _parse_types(types)
    if parsed_types:
        try:
            options.column_types = {
                col: FieldType(t) for col, t in parsed_types.items()
            }
        except ValueError as e:
            typer.echo(
                f"Error: {e}. Valid types: "
                + ", ".join(sorted(t.value for t in FieldType)),
                err=True,
            )
            raise typer.Exit(1)
    if not options.column_types:
        typer.echo(
            "Error: no column types declared. Pass --types 'col:type,...' "
            "or --preset / --config with a column_types map.",
            err=True,
        )
        raise typer.Exit(1)

    if phone_country:
        options.phone_country_column = phone_country
    if address_country:
        options.address_country_column = address_country

    # The flags below always carry a value (their CLI default), so the
    # command cannot tell "not passed" from "passed the default". Assigning
    # them unconditionally would silently overwrite whatever --config or
    # --preset loaded. When options were loaded, a flag therefore only wins
    # if it differs from its CLI default; without --config/--preset the CLI
    # values are applied as before.
    # Limitation: with --config/--preset, explicitly passing a flag's default
    # value is indistinguishable from omitting it and leaves the loaded value.
    # Keep the literals in sync with the typer.Option defaults above.
    layered = bool(config or preset)
    if not layered or phone_region != "US":
        options.phone_region = phone_region
    if not layered or phone_format != "E164":
        options.phone_format = phone_format  # type: ignore[assignment]
    if not layered or preserve_code:
        options.currency_preserve_code = preserve_code
    if not layered or decimals != 2:
        options.currency_decimals = decimals
    if not layered or audit_max != 10_000:
        # Negative means "unbounded", stored as None.
        options.audit_max_rows = None if audit_max < 0 else audit_max
    if not layered or cache_size != 262_144:
        options.cache_size = cache_size

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Decide streaming mode: an explicit --stream/--no-stream wins, otherwise
    # the file size decides.
    file_size = inp.stat().st_size
    use_stream = stream if stream is not None else file_size > _AUTO_STREAM_THRESHOLD
    enc = encoding_override or detect_encoding(inp)
    delim = delimiter or detect_delimiter(inp, enc)
    out_path = Path(output) if output else inp.parent / f"{inp.stem}_standardized.csv"
    typer.echo(
        f"Reading {inp.name} ({file_size/1024/1024:.1f} MB; "
        f"{'streaming' if use_stream else 'in-memory'} mode)..."
    )

    if use_stream:
        if not apply:
            # Streaming may have been auto-selected by file size, so point at
            # --no-stream rather than telling the user to remove a flag they
            # may never have typed.
            typer.echo(
                "\nStreaming mode does not produce a preview. "
                "Re-run with --apply to write output, or pass --no-stream "
                "to preview in memory."
            )
            raise typer.Exit(0)

        last_report = 0.0

        def _progress(rows, chunks):
            # Throttle progress lines to at most one per second.
            nonlocal last_report
            now = _time.perf_counter()
            if now - last_report < 1.0:
                return
            last_report = now
            typer.echo(f" ... {rows:,} rows ({chunks} chunks)")

        t0 = _time.perf_counter()
        try:
            res = standardize_file(
                inp, out_path, options,
                chunk_size=chunk_size,
                progress_callback=_progress,
                encoding=enc,
                delimiter=delim,
            )
        except (ValueError, OSError) as e:
            # Same handling as the in-memory path below.
            typer.echo(f"Error: {e}", err=True)
            raise typer.Exit(1)
        elapsed = _time.perf_counter() - t0

        typer.echo(f"\n{rule}")
        typer.echo(f" File: {inp.name}")
        typer.echo(f" Rows: {res.rows_processed:,}")
        typer.echo(f" Chunks: {res.chunks_processed}")
        typer.echo(f" Cells changed: {res.cells_changed:,}")
        typer.echo(
            f" Cells unparseable: {res.cells_unparseable:,} / {res.cells_total:,}"
        )
        typer.echo(
            # max() guards against a zero elapsed time on tiny inputs.
            f" Throughput: {res.rows_processed / max(elapsed, 1e-9):,.0f} rows/sec"
        )
        typer.echo(f" Elapsed: {elapsed:.2f}s")
        typer.echo(rule)
        typer.echo(f"\nStandardized: {res.output_path}")
        if res.audit_path:
            typer.echo(f"Changes audit: {res.audit_path}")
        typer.echo(f"Log: {log_path}")
        return

    # In-memory path
    try:
        df = read_file(
            inp, encoding=enc, delimiter=delim, repair=False,
        )
        # read_file may hand back an iterable of frames rather than a single
        # DataFrame; collapse it into one.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df):,} rows, {len(df.columns)} columns")

    typer.echo("Standardizing...")
    try:
        result = standardize_dataframe(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    pct = (result.cells_changed / result.cells_total * 100) if result.cells_total else 0
    typer.echo(f"\n{rule}")
    typer.echo(f" File: {inp.name}")
    typer.echo(f" Columns processed: {len(result.columns_processed)}")
    typer.echo(f" Cells scanned: {result.cells_total:,}")
    typer.echo(f" Cells changed: {result.cells_changed:,} ({pct:.1f}%)")
    typer.echo(f" Cells unparseable: {result.cells_unparseable:,}")
    typer.echo(rule)

    if result.cells_changed and not result.changes.empty:
        typer.echo("\nFirst examples:")
        for _, row in result.changes.head(5).iterrows():
            # Truncate long values so each example stays on one line.
            old = repr(row["old"])[:40]
            new = repr(row["new"])[:40]
            typer.echo(
                f" Row {row['row'] + 1}, {row['column']} "
                f"({row['field_type']}): {old} -> {new}"
            )

    if apply:
        from src.core.io import write_file
        try:
            write_file(result.standardized_df, out_path)
            typer.echo(f"\nStandardized: {out_path}")
            if not result.changes.empty:
                audit_path = inp.parent / f"{inp.stem}_changes.csv"
                write_file(result.changes, audit_path)
                typer.echo(f"Changes audit: {audit_path}")
        except OSError as e:
            typer.echo(f"Error writing output: {e}", err=True)
            raise typer.Exit(1)
    else:
        typer.echo("\nThis was a preview. Add --apply to write the output.")
    typer.echo(f"Log: {log_path}")
def main() -> None:
    """Console entry point: hand control to the Typer application."""
    app()


if __name__ == "__main__":
    main()

380
src/cli_missing.py Normal file
View File

@@ -0,0 +1,380 @@
"""CLI for the DataTools Missing Value Handler (script 04).
Usage:
python -m src.cli_missing input.csv # profile only
python -m src.cli_missing input.csv --apply # detect-only + write
python -m src.cli_missing input.csv --preset safe-fill --apply
python -m src.cli_missing input.csv --strategy median --apply
python -m src.cli_missing input.csv --strategy drop_row --apply
python -m src.cli_missing input.csv --strategy constant --fill-value 0 --apply
python -m src.cli_missing input.csv --strategy median --columns age,score --apply
python -m src.cli_missing input.csv --col-strategy "age:median,city:mode" --apply
python -m src.cli_missing --help
"""
from __future__ import annotations
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object for the missing-value CLI. ``help`` is the
# long-form text shown by ``--help``; the shell-completion install options
# are switched off, and invoking the program with no arguments prints the
# help instead of a "missing argument" error.
# The help text is printed verbatim (no %-interpolation), so percent signs
# are written once, not doubled as argparse would require.
app = typer.Typer(
    name="missing",
    help=(
        "Detect and handle missing values in CSV / Excel files.\n\n"
        "Default behaviour: profile only (no file written). Add --apply to "
        "write the handled output and audit log.\n\n"
        "Strategies:\n"
        " none, drop_row, drop_col, drop_both,\n"
        " mean, median, mode, constant,\n"
        " ffill, bfill, interpolate\n\n"
        "Examples:\n\n"
        " # Profile missingness without writing anything\n"
        " python -m src.cli_missing customers.csv\n\n"
        " # Standardize sentinels (\"N/A\", \"-\", \"NULL\", …) to NaN and write\n"
        " python -m src.cli_missing customers.csv --apply\n\n"
        " # Safe fill: numeric → median, categorical → mode\n"
        " python -m src.cli_missing customers.csv --preset safe-fill --apply\n\n"
        " # Drop rows missing >50% of selected columns\n"
        " python -m src.cli_missing customers.csv --strategy drop_row "
        "--row-threshold 0.5 --apply\n\n"
        " # Per-column strategies\n"
        " python -m src.cli_missing customers.csv "
        "--col-strategy 'age:median,city:mode,notes:constant' --fill-value '' --apply\n"
    ),
    add_completion=False,
    no_args_is_help=True,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Configure loguru for one CLI run and return the log file path.

    Creates *log_dir* if needed, removes every previously registered loguru
    sink, then registers two sinks: stderr at WARNING (bare message only) and
    a new ``missing_<timestamp>.log`` file at DEBUG with a timestamped line
    format.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    destination = log_dir / ("missing_" + stamp + ".log")

    # Start from a clean slate so loguru's default stderr sink does not
    # duplicate the WARNING sink registered below.
    logger.remove()
    sinks = (
        (sys.stderr, "WARNING", "{message}"),
        (
            str(destination),
            "DEBUG",
            "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}",
        ),
    )
    for sink, level, line_format in sinks:
        logger.add(sink, level=level, format=line_format)
    return destination
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
if raw is None:
return None
return [c.strip() for c in raw.split(",") if c.strip()]
def _parse_col_strategy(raw: Optional[str]) -> dict[str, str]:
"""Parse ``--col-strategy 'age:median,city:mode'`` into a dict."""
if not raw:
return {}
out: dict[str, str] = {}
for piece in raw.split(","):
piece = piece.strip()
if not piece:
continue
if ":" not in piece:
raise typer.BadParameter(
f"Invalid --col-strategy piece: '{piece}'. "
f"Expected 'col:strategy[,col:strategy...]'."
)
col, strat = piece.split(":", 1)
out[col.strip()] = strat.strip()
return out
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
@app.command()
def handle(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help="Output file path. Default: {input}_missing.csv",
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the profile is shown.",
    ),
    preset: str = typer.Option(
        "detect-only", "--preset",
        help="Preset: detect-only, safe-fill, or drop-incomplete.",
    ),
    strategy: Optional[str] = typer.Option(
        None, "--strategy",
        help=(
            "Override the preset strategy: none, drop_row, drop_col, drop_both, "
            "mean, median, mode, constant, ffill, bfill, interpolate."
        ),
    ),
    col_strategy: Optional[str] = typer.Option(
        None, "--col-strategy",
        help="Per-column strategies: 'col:strategy[,col:strategy...]'.",
    ),
    fill_value: Optional[str] = typer.Option(
        None, "--fill-value",
        help="Constant fill value (used with --strategy constant).",
    ),
    columns: Optional[str] = typer.Option(
        None, "--columns",
        help="Comma-separated columns to handle (default: all columns).",
    ),
    skip: Optional[str] = typer.Option(
        None, "--skip",
        help="Comma-separated columns to skip.",
    ),
    sentinels: Optional[str] = typer.Option(
        None, "--sentinels",
        help=(
            "Comma-separated extra sentinels to treat as missing "
            "(merged with the built-in defaults)."
        ),
    ),
    no_sentinels: bool = typer.Option(
        False, "--no-sentinels",
        help="Disable disguised-null standardization entirely.",
    ),
    row_threshold: float = typer.Option(
        1.0, "--row-threshold",
        help=(
            "For drop_row: drop rows whose missing fraction across selected "
            "columns is STRICTLY GREATER than this value (0.0..1.0). "
            "Default 1.0 = never drop. Use 0.0 to drop any row with any "
            "missing; 0.5 to drop rows >50% missing."
        ),
    ),
    col_threshold: float = typer.Option(
        1.0, "--col-threshold",
        help=(
            "For drop_col: drop columns whose missing fraction is strictly "
            "greater than this value. Default 1.0 = never drop."
        ),
    ),
    config: Optional[str] = typer.Option(
        None, "--config",
        help="Load options from a saved JSON config file.",
    ),
    save_config: Optional[str] = typer.Option(
        None, "--save-config",
        help="Save current options to a JSON config file.",
    ),
    sheet: Optional[str] = typer.Option(
        None, "--sheet",
        help="Excel sheet name or index (default: first sheet).",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    header_row: Optional[int] = typer.Option(
        None, "--header-row",
        help="0-based row index for the header (default: auto-detect).",
    ),
    full_changelog: bool = typer.Option(
        False, "--full-changelog",
        help="Write every change to the audit CSV (default caps to first 1000).",
    ),
):
    """Detect and handle missing values."""
    # NOTE: the docstring above is deliberately kept to one line because
    # Typer shows a command's docstring in ``--help``.
    #
    # Flow: validate arguments -> build MissingOptions (from --config, else
    # from --preset) -> layer the explicit flags on top -> read the input ->
    # run handle_missing -> print the profile -> with --apply, write the
    # handled file and a (capped) changes audit.
    #
    # Exits with status 1 on a missing input/config file, an unknown preset,
    # an out-of-range threshold, a read failure, a ValueError or OSError from
    # handle_missing, or an OSError while writing.
    from src.core.io import read_file, write_file
    from src.core.missing import MissingOptions, PRESETS, handle_missing
    import pandas as pd

    # Validate inputs
    input_path = Path(input_file)
    if not input_path.exists():
        typer.echo(f"Error: File not found: {input_path}", err=True)
        raise typer.Exit(1)
    if preset not in PRESETS:
        typer.echo(
            f"Error: Unknown preset '{preset}'. "
            f"Choose from: {', '.join(sorted(PRESETS))}.",
            err=True,
        )
        raise typer.Exit(1)
    # Thresholds are fractions; the help text documents the range 0.0..1.0.
    for flag, value in (
        ("--row-threshold", row_threshold),
        ("--col-threshold", col_threshold),
    ):
        if not 0.0 <= value <= 1.0:
            typer.echo(
                f"Error: {flag} must be between 0.0 and 1.0 (got {value}).",
                err=True,
            )
            raise typer.Exit(1)

    log_path = _setup_logging(Path("logs"))

    # Build options: a saved config replaces the preset entirely.
    if config:
        cfg_path = Path(config)
        if not cfg_path.exists():
            typer.echo(f"Error: Config file not found: {cfg_path}", err=True)
            raise typer.Exit(1)
        options = MissingOptions.from_file(cfg_path)
        logger.info("Loaded config from {}", cfg_path)
    else:
        options = MissingOptions.from_preset(preset)

    if strategy:
        options.strategy = strategy  # type: ignore[assignment]
    if col_strategy:
        options.column_strategies = _parse_col_strategy(col_strategy)  # type: ignore[assignment]
    if fill_value is not None:
        options.fill_value = fill_value
    cols_list = _split_csv_arg(columns)
    if cols_list is not None:
        options.columns = cols_list
    skip_list = _split_csv_arg(skip)
    if skip_list:
        options.skip_columns = skip_list
    extra = _split_csv_arg(sentinels)
    if extra:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        options.sentinels = list(dict.fromkeys([*options.sentinels, *extra]))
    if no_sentinels:
        options.standardize_sentinels = False

    # The threshold flags always carry a value (CLI default 1.0), so the
    # command cannot tell "not passed" from "passed 1.0". Assigning them
    # unconditionally would silently overwrite a threshold loaded from
    # --config or set by a non-default --preset. In those cases a flag only
    # wins if it differs from its CLI default; for a plain run (default
    # preset, no config) the CLI values are applied as before.
    # Limitation: on top of --config / a non-default preset, an explicit
    # ``1.0`` is indistinguishable from omitting the flag.
    layered = bool(config) or preset != "detect-only"
    if not layered or row_threshold != 1.0:
        options.row_drop_threshold = row_threshold
    if not layered or col_threshold != 1.0:
        options.col_drop_threshold = col_threshold

    if save_config:
        saved = options.to_file(save_config)
        typer.echo(f"Config saved to {saved}")

    # Read input
    typer.echo(f"Reading {input_path.name}...")
    try:
        # A purely numeric --sheet value is passed on as an index.
        sheet_arg: str | int | None = None
        if sheet is not None:
            try:
                sheet_arg = int(sheet)
            except ValueError:
                sheet_arg = sheet
        df = read_file(
            input_path,
            encoding=encoding_override,
            header_row=header_row,
            sheet_name=sheet_arg if sheet_arg is not None else 0,
            repair=False,
        )
        # read_file may hand back an iterable of frames rather than a single
        # DataFrame; collapse it into one.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df)} rows, {len(df.columns)} columns")

    # Run
    typer.echo("Profiling missingness...")
    try:
        result = handle_missing(df, options)
    except (ValueError, OSError) as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    _print_results(result, input_path, options)

    # Write
    if apply:
        stem = input_path.stem
        out_path = Path(output) if output else input_path.parent / f"{stem}_missing.csv"
        try:
            write_file(result.handled_df, out_path)
            typer.echo(f"\nHandled file: {out_path}")
            if not result.changes.empty:
                changes_path = input_path.parent / f"{stem}_missing_changes.csv"
                audit_df = result.changes
                cap = 1000
                if not full_changelog and len(audit_df) > cap:
                    typer.echo(
                        f"Note: changelog capped at {cap} rows. "
                        f"Use --full-changelog to write all {len(audit_df)} changes."
                    )
                    audit_df = audit_df.head(cap)
                write_file(audit_df, changes_path)
                typer.echo(f"Changes audit: {changes_path}")
        except OSError as e:
            typer.echo(f"Error writing output: {e}", err=True)
            raise typer.Exit(1)
    else:
        typer.echo(
            "\nThis was a profile only. Add --apply to write the handled output."
        )
    typer.echo(f"Log: {log_path}")
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
def _print_results(result, input_path: Path, options) -> None:
    """Echo the missing-value report for one run to the terminal.

    Prints, in order: a header with file-level totals taken from
    ``result.profile_before``, one line per column of the profile
    table, the action counters, the per-column strategies (when any),
    and up to five example rows from ``result.changes``.

    ``options`` is accepted for signature compatibility; nothing in
    this function reads it.
    """
    say = typer.echo
    before = result.profile_before
    rule = f"{''*60}"

    # --- header -------------------------------------------------------
    say(f"\n{rule}")
    say(f" File: {input_path.name}")
    say(f" Rows: {before.rows_total}")
    say(f" Columns processed: {len(result.columns_processed)}")
    say(
        f" Cells missing: {before.cells_missing} / {before.cells_total}"
        f" ({before.cells_missing_pct:.1f}%)"
    )
    say(
        f" Rows w/ any missing: {before.rows_with_any_missing} "
        f"(complete: {before.rows_complete})"
    )
    say(rule)

    # --- per-column table ---------------------------------------------
    say("\nPer-column profile:")
    for _, col in result.profile_before.to_dataframe().iterrows():
        if col["missing"] == 0:
            marker = " "
        else:
            marker = " "
        line = (
            f"{marker}{col['column']:<24} {col['dtype']:<10} "
            f"missing={col['missing']:<6} ({col['missing_pct']:>5.1f}%)"
        )
        # Only mention a sentinel when at least one was counted.
        if col["top_sentinel_count"]:
            line += (
                f" top sentinel: {col['top_sentinel']!r} ×{col['top_sentinel_count']}"
            )
        say(line)

    # --- action counters ----------------------------------------------
    say("\nActions:")
    say(f" Sentinels standardized to NaN: {result.sentinels_standardized}")
    say(f" Cells filled: {result.cells_filled}")
    say(f" Rows dropped: {result.rows_dropped}")
    dropped_note = ""
    if result.columns_dropped:
        dropped_note = f" ({', '.join(result.columns_dropped)})"
    say(f" Columns dropped: {len(result.columns_dropped)}{dropped_note}")

    if result.strategy_per_column:
        say("\nStrategy per column:")
        for column_name, strategy in result.strategy_per_column.items():
            say(f" {column_name}: {strategy}")

    # --- sample of the change log -------------------------------------
    if result.changes.empty:
        return
    say("\nFirst examples:")
    for _, change in result.changes.head(5).iterrows():
        old_text = repr(change["old"])[:40]
        new_text = repr(change["new"])[:40]
        # A row value of -1 gets an empty label; otherwise show it 1-based.
        if change["row"] == -1:
            where = ""
        else:
            where = f"Row {change['row'] + 1}"
        say(
            f" {where}, {change['column']}: {old_text}{new_text} "
            f"[{change['action']}]"
        )
# ---------------------------------------------------------------------------
# __main__
# ---------------------------------------------------------------------------
def main():
    """Script entry point: hand control to the Typer ``app``."""
    app()


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

307
src/cli_pipeline.py Normal file
View File

@@ -0,0 +1,307 @@
"""CLI for the DataTools Pipeline Runner (script 09).
Usage:
# Run the recommended default pipeline (text → format → missing → dedup):
python -m src.cli_pipeline input.csv --apply
# Quick custom order via --steps:
python -m src.cli_pipeline input.csv \\
--steps text_clean,format_standardize,missing --apply
# Save the recommended pipeline to a JSON for editing:
python -m src.cli_pipeline --recommend --output pipeline.json
# Run a saved pipeline:
python -m src.cli_pipeline weekly_export.csv --pipeline pipeline.json --apply
# Strict mode: fail if the pipeline contains soft-dependency violations
python -m src.cli_pipeline data.csv --steps dedup,text_clean \\
--strict --apply
"""
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
import typer
from loguru import logger
# Typer application object; the ``run`` command below is registered on it.
# The help text describes what the command actually does: without --apply
# it prints the plan (and any ordering warnings) and returns before the
# input file is read or any step is executed.
app = typer.Typer(
    name="pipeline",
    help=(
        "Chain DataTools cleaning steps into one orchestrated workflow.\n\n"
        "Default behaviour: preview the plan and any ordering warnings "
        "only (the pipeline is not executed and no file is written). Add "
        "--apply to execute the pipeline and write the cleaned output and "
        "audit log.\n\n"
        "The pipeline RECOMMENDS an order based on tool dependencies "
        "(text-clean before format-standardize, format before dedup, etc.) "
        "and WARNS on out-of-order configs but does not block them. Use "
        "--strict to escalate warnings to errors.\n\n"
        "Tools available: text_clean, format_standardize, missing, "
        "column_map, dedup."
    ),
    add_completion=False,
    no_args_is_help=False,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _setup_logging(log_dir: Path) -> Path:
    """Point loguru at stderr plus a fresh timestamped file in *log_dir*.

    Creates *log_dir* if needed, drops every previously registered
    loguru sink, then registers a WARNING-level stderr sink and a
    DEBUG-level file sink. Returns the path of the log file.
    """
    log_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = log_dir / f"pipeline_{stamp}.log"

    file_format = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {message}"

    # Start from a clean slate so repeated calls don't stack sinks.
    logger.remove()
    logger.add(sys.stderr, level="WARNING", format="{message}")
    logger.add(str(destination), level="DEBUG", format=file_format)
    return destination
def _split_csv_arg(raw: Optional[str]) -> Optional[list[str]]:
if raw is None:
return None
return [c.strip() for c in raw.split(",") if c.strip()]
# ---------------------------------------------------------------------------
# Main command
# ---------------------------------------------------------------------------
# NOTE: the function docstring below is left untouched on purpose — Typer
# surfaces a command's docstring as its --help text, so it is runtime output.
#
# Flow of the command, in order:
#   1. --recommend: emit the recommended pipeline as JSON (to --output or
#      stdout) and return.
#   2. Validate the input path and configure logging under ./logs.
#   3. Build the pipeline from --pipeline, --steps, or the recommended default.
#   4. Print the plan and soft-dependency warnings; with --strict, warnings
#      abort the run.
#   5. Without --apply: stop here (nothing is read, executed, or written).
#   6. With --apply: read the input, execute the steps, write the output
#      CSV and a JSON audit file.
#
# Exit codes used below: 2 for a missing input argument and for --strict
# violations; 1 for every other handled error.
@app.command()
def run(
    input_file: Optional[str] = typer.Argument(
        None,
        help="CSV / TSV / Excel file. Optional with --recommend.",
    ),
    pipeline_path: Optional[str] = typer.Option(
        None, "--pipeline", "-p",
        help="Path to a pipeline JSON file (Pipeline.from_file format).",
    ),
    steps: Optional[str] = typer.Option(
        None, "--steps",
        help=(
            "Quick pipeline: comma-separated tool names in execution order. "
            "Each step uses defaults. Example: 'text_clean,format_standardize,dedup'."
        ),
    ),
    recommend: bool = typer.Option(
        False, "--recommend",
        help="Print (or save) the recommended default pipeline and exit.",
    ),
    output: Optional[str] = typer.Option(
        None, "--output", "-o",
        help=(
            "When --recommend is set, save the pipeline JSON here. "
            "Otherwise, write the pipeline output to this CSV path "
            "(default: {input}_pipeline.csv)."
        ),
    ),
    apply: bool = typer.Option(
        False, "--apply",
        help="Write the output. Without this flag, only the plan is shown.",
    ),
    strict: bool = typer.Option(
        False, "--strict",
        help="Treat soft-dependency warnings as errors (refuse to run).",
    ),
    continue_on_error: bool = typer.Option(
        False, "--continue-on-error",
        help="Don't abort if a step fails; carry the previous step's df forward.",
    ),
    encoding_override: Optional[str] = typer.Option(
        None, "--encoding",
        help="Override auto-detected file encoding.",
    ),
    delimiter: Optional[str] = typer.Option(
        None, "--delimiter",
        help="Override auto-detected delimiter.",
    ),
):
    """Run a DataTools cleaning pipeline."""
    # Imported inside the command rather than at module top.
    from src.core.pipeline import (
        Pipeline,
        recommended_pipeline,
        run_pipeline,
        validate_pipeline,
    )
    # ------------------------------------------------------------------
    # --recommend: print or save the default pipeline and exit
    # ------------------------------------------------------------------
    if recommend:
        pipe = recommended_pipeline()
        body = json.dumps(pipe.to_dict(), indent=2)
        if output:
            Path(output).write_text(body)
            typer.echo(f"Recommended pipeline saved to {output}")
        else:
            typer.echo(body)
        # No input file is needed (or looked at) in this mode.
        return
    if not input_file:
        typer.echo(
            "Error: input file is required (or use --recommend to "
            "emit the default pipeline).",
            err=True,
        )
        raise typer.Exit(2)
    inp = Path(input_file)
    if not inp.exists():
        typer.echo(f"Error: File not found: {inp}", err=True)
        raise typer.Exit(1)
    # Logs always go to a "logs" directory relative to the working directory.
    log_path = _setup_logging(Path("logs"))
    # ------------------------------------------------------------------
    # Resolve pipeline source: --pipeline file, --steps list, or default
    # ------------------------------------------------------------------
    if pipeline_path and steps:
        typer.echo(
            "Error: pass either --pipeline or --steps, not both.",
            err=True,
        )
        raise typer.Exit(1)
    if pipeline_path:
        pp = Path(pipeline_path)
        if not pp.exists():
            typer.echo(f"Error: pipeline file not found: {pp}", err=True)
            raise typer.Exit(1)
        try:
            pipe = Pipeline.from_file(pp)
        except Exception as e:
            # Any load/parse failure is reported as a user-facing message.
            from src.core.errors import format_for_user
            typer.echo(f"Error reading pipeline: {format_for_user(e)}", err=True)
            raise typer.Exit(1)
    elif steps:
        # An all-separator value such as "," yields an empty name list.
        names = _split_csv_arg(steps) or []
        try:
            pipe = recommended_pipeline(include=names)
        except Exception as e:
            from src.core.errors import format_for_user
            typer.echo(f"Error: {format_for_user(e)}", err=True)
            raise typer.Exit(1)
    else:
        pipe = recommended_pipeline()
    # ------------------------------------------------------------------
    # Plan + warnings
    # ------------------------------------------------------------------
    warnings = validate_pipeline(pipe)
    typer.echo(f"\n{''*60}")
    typer.echo(" Pipeline plan:")
    for i, step in enumerate(pipe.steps, 1):
        # Prefix shown before the step name; differs for disabled steps.
        flag = " " if step.enabled else ""
        typer.echo(f" {i}. {flag}{step.display_name():<22} options={step.options or {}}")
    typer.echo(f"{''*60}")
    if warnings:
        typer.echo("\nSoft-dependency warnings (recommended order violated):")
        for w in warnings:
            typer.echo(f" ! {w}")
        if strict:
            # --strict turns the advisory warnings into a hard stop.
            typer.echo(
                "\nAborting: --strict was set. Reorder the steps or drop --strict.",
                err=True,
            )
            raise typer.Exit(2)
    if not apply:
        # Plan-only mode: return before the input is read or any step runs.
        typer.echo(
            "\nThis was a plan-only run. Add --apply to execute the pipeline."
        )
        typer.echo(f"Log: {log_path}")
        return
    # ------------------------------------------------------------------
    # Read input + execute
    # ------------------------------------------------------------------
    from src.core.io import read_file, write_file
    import pandas as pd
    typer.echo(f"\nReading {inp.name}...")
    try:
        df = read_file(
            inp, encoding=encoding_override, delimiter=delimiter, repair=False,
        )
        # read_file may hand back something other than a DataFrame
        # (presumably an iterable of chunks — confirm against src.core.io);
        # in that case the pieces are concatenated into one frame.
        if not isinstance(df, pd.DataFrame):
            df = pd.concat(list(df), ignore_index=True)
    except Exception as e:
        typer.echo(f"Error reading file: {e}", err=True)
        raise typer.Exit(1)
    typer.echo(f" {len(df):,} rows, {len(df.columns)} columns")
    typer.echo("\nExecuting pipeline:")

    # Progress callback handed to run_pipeline: one line per step result.
    def _on_step(sr) -> None:
        if sr.skipped:
            typer.echo(f" - {sr.step.display_name()} (skipped)")
        elif sr.error:
            # Only the first line of the error text is shown.
            typer.echo(f"{sr.step.display_name()} ({sr.elapsed_seconds*1000:.0f} ms) — ERROR: {sr.error.splitlines()[0]}")
        else:
            typer.echo(f"{sr.step.display_name()} ({sr.elapsed_seconds*1000:.0f} ms) {sr.summary}")
    try:
        result = run_pipeline(
            df, pipe,
            on_step_complete=_on_step,
            # --continue-on-error inverts into run_pipeline's stop_on_error.
            stop_on_error=not continue_on_error,
        )
    except Exception as e:
        from src.core.errors import format_for_user
        typer.echo(f"\nPipeline halted: {format_for_user(e)}", err=True)
        raise typer.Exit(1)
    typer.echo(f"\n{''*60}")
    typer.echo(f" Initial rows: {result.initial_rows:,}")
    typer.echo(f" Final rows: {result.final_rows:,}")
    typer.echo(f" Steps run: {sum(1 for s in result.step_results if not s.skipped)}")
    typer.echo(f" Total elapsed: {result.total_elapsed:.2f} s")
    typer.echo(f"{''*60}")
    # ------------------------------------------------------------------
    # Write output + audit
    # ------------------------------------------------------------------
    out_path = Path(output) if output else inp.parent / f"{inp.stem}_pipeline.csv"
    write_file(result.final_df, out_path)
    typer.echo(f"\nPipeline output: {out_path}")
    # The audit JSON is always written next to the input file, even when
    # --output points the CSV somewhere else.
    audit_path = inp.parent / f"{inp.stem}_pipeline.json"
    audit_path.write_text(json.dumps({
        "pipeline": pipe.to_dict(),
        "warnings": result.warnings,
        "initial_rows": result.initial_rows,
        "final_rows": result.final_rows,
        "total_elapsed_seconds": result.total_elapsed,
        "steps": [
            {
                "tool": sr.step.tool,
                "name": sr.step.display_name(),
                "enabled": sr.step.enabled,
                "skipped": sr.skipped,
                "elapsed_seconds": sr.elapsed_seconds,
                "summary": sr.summary,
                "error": sr.error,
            }
            for sr in result.step_results
        ],
    # default=str stringifies any value json cannot serialise natively.
    }, indent=2, default=str))
    typer.echo(f"Pipeline audit: {audit_path}")
    typer.echo(f"Log: {log_path}")
def main() -> None:
    """Script entry point: invoke the Typer ``app`` for the pipeline CLI."""
    app()


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -96,15 +96,54 @@ from .format_standardize import (
PRESETS as STANDARDIZE_PRESETS,
StandardizeOptions,
StandardizeResult,
StreamingStandardizeResult,
detect_currency_code,
standardize_address,
standardize_boolean,
standardize_currency,
standardize_dataframe,
standardize_date,
standardize_file,
standardize_name,
standardize_phone,
)
from .missing import (
DEFAULT_SENTINELS,
ColumnReport,
MissingOptions,
MissingProfile,
MissingResult,
PRESETS as MISSING_PRESETS,
Strategy as MissingStrategy,
detect_sentinels,
handle_missing,
is_missing_like,
profile_missing,
)
from .column_mapper import (
ColumnDtype,
MapOptions,
MapResult,
PRESETS as MAP_PRESETS,
TargetField,
TargetSchema,
UnmappedStrategy,
coerce_series,
infer_mapping,
map_columns,
)
from .pipeline import (
Pipeline,
PipelineResult,
SOFT_DEPENDENCIES,
Step,
StepResult,
TOOL_ADAPTERS,
TOOL_NAMES,
recommended_pipeline,
run_pipeline,
validate_pipeline,
)
__all__ = [
# Core
@@ -171,6 +210,7 @@ __all__ = [
"STANDARDIZE_PRESETS",
"StandardizeOptions",
"StandardizeResult",
"StreamingStandardizeResult",
"detect_currency_code",
"standardize_dataframe",
"standardize_date",
@@ -179,4 +219,39 @@ __all__ = [
"standardize_name",
"standardize_address",
"standardize_boolean",
"standardize_file",
# Missing-value handling
"DEFAULT_SENTINELS",
"ColumnReport",
"MissingOptions",
"MissingProfile",
"MissingResult",
"MISSING_PRESETS",
"MissingStrategy",
"detect_sentinels",
"handle_missing",
"is_missing_like",
"profile_missing",
# Column mapping
"ColumnDtype",
"MapOptions",
"MapResult",
"MAP_PRESETS",
"TargetField",
"TargetSchema",
"UnmappedStrategy",
"coerce_series",
"infer_mapping",
"map_columns",
# Pipeline
"Pipeline",
"PipelineResult",
"SOFT_DEPENDENCIES",
"Step",
"StepResult",
"TOOL_ADAPTERS",
"TOOL_NAMES",
"recommended_pipeline",
"run_pipeline",
"validate_pipeline",
]

View File

@@ -593,6 +593,40 @@ def _count_row_terminators(raw: bytes) -> tuple[int, int, int]:
return n_crlf, n_lf, n_cr
def _detect_lying_bom(raw: bytes) -> list[Finding]:
"""Flag files whose UTF-8 BOM disagrees with the body bytes.
The "lying BOM" pattern is a file that starts with the UTF-8 BOM
(``EF BB BF``) but whose body cannot be decoded as UTF-8 — typically
a cp1252 export that someone hand-prepended a BOM to in an attempt to
make Excel happy. The encoding detector recovers transparently
(returns cp1252), but the user should still be told their file is
misrepresenting itself so the next downstream tool doesn't get
surprised.
"""
if not raw[:3] == b"\xef\xbb\xbf":
return []
try:
raw[3:].decode("utf-8")
return [] # honest BOM — body is real UTF-8
except UnicodeDecodeError:
pass
return [Finding(
id="encoding_lying_bom",
severity="warn",
tool="",
count=1,
description=(
"File starts with a UTF-8 BOM, but the body bytes are not "
"valid UTF-8 — the BOM is misleading. The encoding detector "
"recovered by falling back to a single-byte codepage; you "
"may want to re-save the file with a matching encoding."
),
confidence="high",
fix_action=FIX_NONE,
)]
def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
"""Flag files that mix CRLF, LF, and bare CR row terminators.
@@ -875,6 +909,7 @@ def analyze(
findings.extend(_findings_from_repair(repair_result))
if raw_for_byte_scan is not None:
findings.extend(_detect_mixed_line_endings(raw_for_byte_scan))
findings.extend(_detect_lying_bom(raw_for_byte_scan))
findings.extend(_detect_encoding_uncertainty(df))
findings.extend(_detect_smart_punctuation(df))
findings.extend(_detect_invisible_chars(df))
@@ -890,6 +925,7 @@ def analyze(
def _load_for_analysis(
path: Path, *, sample_rows: int, encoding_override: Optional[str] = None,
fold_quotes: bool = True,
) -> tuple[pd.DataFrame, Optional[RepairResult], Optional[bytes]]:
"""Read just enough of *path* to scan, with the same robust pre-parse
repair the tool pages will use.
@@ -903,6 +939,12 @@ def _load_for_analysis(
When *encoding_override* is set, it replaces the detected encoding
entirely — the user has explicitly told us what the file is. The
delimiter is still detected (it's separate from encoding choice).
*fold_quotes* defaults to True so the byte-level smart-quote fold
runs as part of the repair pass (correct for CSV parsing). Pass
False when the caller needs a content-preserving decode for
identity round-trip checks (encoding corpus tests, format-fidelity
audits).
"""
suffix = path.suffix.lower()
if suffix in (".xlsx", ".xls"):
@@ -937,7 +979,7 @@ def _load_for_analysis(
if not head.strip():
return pd.DataFrame(), None, head
repair = repair_bytes(head, encoding=enc, delimiter=delim)
repair = repair_bytes(head, encoding=enc, delimiter=delim, fold_quotes=fold_quotes)
import io as _io
try:
df = pd.read_csv(
@@ -954,7 +996,9 @@ def _load_for_analysis(
# never trips; the 2× row-size multiplier above handles 99% of inputs.
if not head_was_full and len(df) < sample_rows:
full_raw = path.read_bytes()
full_repair = repair_bytes(full_raw, encoding=enc, delimiter=delim)
full_repair = repair_bytes(
full_raw, encoding=enc, delimiter=delim, fold_quotes=fold_quotes,
)
try:
df = pd.read_csv(
_io.BytesIO(full_repair.repaired_bytes),

633
src/core/column_mapper.py Normal file
View File

@@ -0,0 +1,633 @@
"""DataTools Column Mapper.
Rename columns, enforce a target schema, coerce types, drop / add /
reorder columns. Designed for the three buyer profiles the toolkit
already serves:
1. **Schema enforcement** — analyst receives a CSV that has to fit a
known target shape (a CRM import format, a database schema, a
mailing-list contract). Map source columns to target names, coerce
each to the declared type, drop the extras, fail clearly when a
required target field is missing.
2. **Multi-source unification** — operator merges vendor/partner
exports where every file uses different column names ("First Name"
/ "first_name" / "FirstName"). The fuzzy auto-mapper proposes a
mapping; the user reviews and overrides.
3. **Type coercion** — quick conversion of mis-typed columns (string
"123" → int, "true"/"yes" → bool, "2024-01-15" → date) without
leaving the tool, with errors surfaced row-by-row.
Public API
----------
Types:
TargetField, TargetSchema, MapOptions, MapResult, ColumnDtype,
UnmappedStrategy
Functions:
map_columns(df, options) -> MapResult
infer_mapping(df, schema, *, threshold=0.6) -> dict[src, target]
coerce_series(series, dtype) -> (Series, n_failures)
Presets:
PRESETS = {"rename-only", "strict-schema", "lenient-schema"}
"""
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Iterable, Literal, Optional
import numpy as np
import pandas as pd
from loguru import logger
from pandas.api import types as pdtypes
from .errors import ConfigError, InputValidationError, ensure_choice, ensure_dataframe
# ---------------------------------------------------------------------------
# Types
# ---------------------------------------------------------------------------
# Static type for the dtype names a target field may declare.
ColumnDtype = Literal[
    "string",
    "integer",
    "float",
    "boolean",
    "date",
    "datetime",
    "category",
    "auto",  # leave dtype alone
]
# Runtime counterpart of ``ColumnDtype``, used to validate dtype names read
# from schema dicts and to list the valid names in error suggestions.
# The two definitions are maintained by hand and must list the same names.
_VALID_DTYPES: frozenset[str] = frozenset({
    "string", "integer", "float", "boolean", "date", "datetime",
    "category", "auto",
})
@dataclass
class TargetField:
    """One field in a target schema.

    Required fields whose source column is missing produce a
    ``MapResult.missing_required_targets`` entry rather than silently
    creating a NaN column.
    """

    # Target column name.
    name: str
    # Declared type for coercion; "auto" leaves the column's dtype alone.
    dtype: ColumnDtype = "auto"
    # Whether a source column is expected to be found for this field.
    required: bool = False
    # Alternative names; ``infer_mapping`` scores them like ``name``.
    aliases: list[str] = field(default_factory=list)
    # Value for the field when it is added without a source column
    # (see the ``MapOptions.enforce_required`` notes).
    default: Any = None
@dataclass
class TargetSchema:
    """Ordered list of target fields. Ordering survives into the result DataFrame.

    Schemas round-trip through plain dicts / JSON files of the shape
    ``{"fields": [...]}``, where each entry is either a bare field name
    or an object with ``name`` and optional ``dtype``, ``required``,
    ``aliases`` and ``default`` keys.
    """

    fields: list[TargetField]

    def field_names(self) -> list[str]:
        """Return the field names in schema order."""
        return [f.name for f in self.fields]

    def get(self, name: str) -> Optional[TargetField]:
        """Return the first field called *name*, or ``None`` if absent."""
        return next((f for f in self.fields if f.name == name), None)

    def to_dict(self) -> dict:
        """Serialise the schema to a JSON-compatible dict."""
        return {"fields": [asdict(f) for f in self.fields]}

    def to_file(self, path: str | Path) -> Path:
        """Write the schema as JSON to *path* and return the path."""
        out = Path(path)
        # Explicit UTF-8 so the file does not depend on the platform locale.
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_dict(cls, data: dict) -> TargetSchema:
        """Build a schema from a dict, validating each field entry.

        Raises:
            ConfigError: when ``fields`` is missing, an entry is neither
                a name string nor an object with a ``name`` key, or a
                dtype is not one of the valid names.
        """
        if "fields" not in data:
            raise ConfigError(
                "Target schema must contain a 'fields' list",
                operation="TargetSchema.from_dict",
                suggestion='Example: {"fields": [{"name": "email", "dtype": "string", "required": true}, ...]}',
            )
        fields = []
        for entry in data["fields"]:
            if isinstance(entry, str):
                # Shorthand: a bare name means "all defaults".
                fields.append(TargetField(name=entry))
                continue
            # Non-dict entries (numbers, null, ...) get the same clear error
            # as a dict without a name instead of a raw TypeError.
            if not isinstance(entry, dict) or "name" not in entry:
                raise ConfigError(
                    f"Schema field is missing 'name': {entry!r}",
                    operation="TargetSchema.from_dict",
                )
            dtype = entry.get("dtype", "auto")
            if dtype not in _VALID_DTYPES:
                raise ConfigError(
                    f"Schema field {entry['name']!r}: unknown dtype {dtype!r}",
                    operation="TargetSchema.from_dict",
                    suggestion=f"Valid: {sorted(_VALID_DTYPES)}",
                )
            # ``"aliases": null`` means "none"; a single string is one alias,
            # not a sequence of one-character aliases.
            raw_aliases = entry.get("aliases") or []
            if isinstance(raw_aliases, str):
                raw_aliases = [raw_aliases]
            fields.append(TargetField(
                name=entry["name"],
                dtype=dtype,
                required=bool(entry.get("required", False)),
                aliases=list(raw_aliases),
                default=entry.get("default"),
            ))
        return cls(fields=fields)

    @classmethod
    def from_file(cls, path: str | Path) -> TargetSchema:
        """Load a schema from a JSON file.

        The file is read as UTF-8; a leading byte-order mark (as written
        by some Windows editors) is accepted and ignored.
        """
        text = Path(path).read_text(encoding="utf-8-sig")
        return cls.from_dict(json.loads(text))
# ---------------------------------------------------------------------------
# Fuzzy column-name matching
# ---------------------------------------------------------------------------
# Whitespace, punctuation, and case all vary across vendors. We normalise
# both sides to a token list before comparing.
_NORM_RE = re.compile(r"[^a-z0-9]+")
def _normalize_name(name: str) -> str:
"""Lowercase, strip non-alphanumerics — ``First Name`` → ``firstname``."""
if not isinstance(name, str):
return ""
return _NORM_RE.sub("", name.strip().lower())
def _token_set(name: str) -> frozenset[str]:
"""Tokenise a column name on non-alphanumeric boundaries."""
if not isinstance(name, str):
return frozenset()
parts = [p for p in _NORM_RE.split(name.strip().lower()) if p]
return frozenset(parts)
def _name_similarity(a: str, b: str) -> float:
"""Cheap similarity score in [0.0, 1.0].
Combines exact-after-normalisation, token Jaccard, and SequenceMatcher
ratio. A real fuzzy library (rapidfuzz) is already a project
dependency for the deduplicator — we use it when available, fall
back to stdlib ``difflib`` otherwise so the mapper works in trimmed
builds.
"""
if not a or not b:
return 0.0
na, nb = _normalize_name(a), _normalize_name(b)
if na == nb:
return 1.0
ta, tb = _token_set(a), _token_set(b)
jaccard = (len(ta & tb) / len(ta | tb)) if (ta or tb) else 0.0
try:
from rapidfuzz import fuzz
seq = fuzz.ratio(na, nb) / 100.0
except ImportError:
from difflib import SequenceMatcher
seq = SequenceMatcher(None, na, nb).ratio()
return max(jaccard, seq)
def infer_mapping(
    df: pd.DataFrame,
    schema: TargetSchema,
    *,
    threshold: float = 0.6,
) -> dict[str, str]:
    """Best-guess source-column → target-field mapping.

    Returns a dict keyed by source-column name (as ``str``). A source
    column is omitted from the result when no candidate scores at or
    above *threshold*. Each target is matched at most once: the
    highest-scoring source wins, ties broken by source-column order in
    *df*.

    Aliases declared on a :class:`TargetField` are scored as if they
    were target names — useful for vendor-specific synonyms
    (``["customer_id", "cust_id", "client_no"]``).
    """
    ensure_dataframe(df, function="infer_mapping")

    # (score, source position, source name, target name) for every pairing
    # that clears the threshold. The position is recorded while iterating,
    # so the sort below never has to search the column list — which also
    # keeps non-string column labels from breaking the tie-break lookup.
    candidates: list[tuple[float, int, str, str]] = []
    for position, src in enumerate(df.columns):
        for tgt in schema.fields:
            best = max(
                _name_similarity(src, candidate)
                for candidate in (tgt.name, *tgt.aliases)
            )
            if best >= threshold:
                candidates.append((best, position, str(src), tgt.name))

    # Best score first; equal scores fall back to source-column order.
    candidates.sort(key=lambda item: (-item[0], item[1]))

    # Greedy walk: each source and each target is claimed at most once.
    mapping: dict[str, str] = {}
    used_targets: set[str] = set()
    for _score, _position, src_name, tgt_name in candidates:
        if src_name in mapping or tgt_name in used_targets:
            continue
        mapping[src_name] = tgt_name
        used_targets.add(tgt_name)
    return mapping
# ---------------------------------------------------------------------------
# Type coercion
# ---------------------------------------------------------------------------
_TRUTHY = frozenset({"true", "t", "yes", "y", "1"})
_FALSY = frozenset({"false", "f", "no", "n", "0"})
def _coerce_boolean(value: Any) -> Any:
if isinstance(value, bool):
return value
if value is None or (isinstance(value, float) and pd.isna(value)):
return pd.NA
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
v = value.strip().lower()
if v in _TRUTHY:
return True
if v in _FALSY:
return False
raise ValueError(f"cannot coerce to boolean: {value!r}")
def coerce_series(series: pd.Series, dtype: ColumnDtype) -> tuple[pd.Series, int]:
    """Coerce *series* to *dtype*, returning ``(coerced, n_failures)``.

    Failures are counted but never raised — the caller (``map_columns``)
    surfaces them through ``MapResult.coercion_failures`` so the user
    can inspect which rows didn't fit. A failure is a cell that held a
    value before coercion and is missing afterwards.

    Per-dtype behaviour:
      * ``auto`` — the series is returned untouched.
      * ``string`` / ``category`` — plain ``astype``; never counts failures.
      * ``integer`` — numeric parse, round to the nearest whole number,
        nullable ``Int64``. Infinite values and values outside the int64
        range cannot be represented and are counted as failures instead
        of aborting the whole column.
      * ``float`` — numeric parse into nullable ``Float64``.
      * ``boolean`` — cell-by-cell via ``_coerce_boolean``, nullable
        ``boolean`` result.
      * ``date`` / ``datetime`` — ``pd.to_datetime``; ``date`` additionally
        normalises the time component to midnight.

    Raises:
        InputValidationError: when *dtype* is not a known dtype name.
    """
    if dtype == "auto":
        return series, 0
    if dtype == "string":
        return series.astype("string"), 0
    if dtype == "category":
        return series.astype("category"), 0

    def _count_failures(coerced: pd.Series) -> int:
        # Originally-filled cells that ended up missing after coercion.
        return int((coerced.isna() & series.notna()).sum())

    if dtype == "integer":
        numeric = pd.to_numeric(series, errors="coerce")
        if pdtypes.is_float_dtype(numeric.dtype):
            whole = numeric.astype("float64").round()
            # inf and |x| >= 2**63 have no int64 representation; the Int64
            # cast would raise on them, so blank them out first.
            castable = np.isfinite(whole) & (whole.abs() < 2.0 ** 63)
            whole = whole.where(castable)
        else:
            whole = numeric.round()
        # Use nullable Int64 so NaN entries don't get cast to floats.
        as_int = whole.astype("Int64")
        return as_int, _count_failures(as_int)

    if dtype == "float":
        as_float = pd.to_numeric(series, errors="coerce").astype("Float64")
        return as_float, _count_failures(as_float)

    if dtype == "boolean":
        out: list[Any] = []
        failed = 0
        for v in series.tolist():
            try:
                out.append(_coerce_boolean(v))
            except ValueError:
                out.append(pd.NA)
                failed += 1
        return pd.Series(out, index=series.index, dtype="boolean"), failed

    if dtype in {"date", "datetime"}:
        stamps = pd.to_datetime(series, errors="coerce", utc=False)
        failed = _count_failures(stamps)
        if dtype == "date":
            # Drop the time component but keep dtype as datetime64 so
            # downstream operations (delta, sort) still work.
            stamps = stamps.dt.normalize()
        return stamps, failed

    raise InputValidationError(
        f"Unknown dtype {dtype!r}",
        operation="coerce_series",
        suggestion=f"Valid: {sorted(_VALID_DTYPES)}",
    )
# ---------------------------------------------------------------------------
# Options / result dataclasses
# ---------------------------------------------------------------------------
# Strategy for handling source columns that don't appear in the target
# schema. ``keep`` preserves them at the end of the output; ``drop``
# removes them; ``error`` raises an InputValidationError.
UnmappedStrategy = Literal["keep", "drop", "error"]
# Named option bundles. Each value is a dict of ``MapOptions`` keyword
# arguments; ``MapOptions.from_preset`` expands it with ``cls(**PRESETS[name])``.
PRESETS: dict[str, dict[str, Any]] = {
    # Rename columns only: extras kept, no coercion, original order.
    "rename-only": {
        "auto_infer": True,
        "unmapped": "keep",
        "coerce_types": False,
        "reorder_to_schema": False,
    },
    # Project onto the schema: extras dropped, types coerced, schema order.
    "strict-schema": {
        "auto_infer": True,
        "unmapped": "drop",
        "coerce_types": True,
        "reorder_to_schema": True,
    },
    # Same as strict-schema except unmapped source columns are kept.
    "lenient-schema": {
        "auto_infer": True,
        "unmapped": "keep",
        "coerce_types": True,
        "reorder_to_schema": True,
    },
}
@dataclass
class MapOptions:
    """Toggles for column mapping.

    Defaults match the ``rename-only`` preset: best-effort fuzzy match
    against the schema (if provided), keep unmapped source columns
    after the mapped ones, no type coercion, no reorder.

    Options round-trip through dicts and UTF-8 JSON files
    (``to_dict`` / ``from_dict`` / ``to_file`` / ``from_file``).
    """

    # Either pass an explicit ``mapping`` dict or a ``schema`` (and let
    # the engine infer the mapping). Explicit mapping wins when both
    # are set.
    mapping: dict[str, str] = field(default_factory=dict)
    schema: Optional[TargetSchema] = None
    # When True (default), missing entries in ``mapping`` are filled in
    # by ``infer_mapping`` against ``schema``. When False, only the
    # explicit mapping is honoured.
    auto_infer: bool = True
    fuzzy_threshold: float = 0.6
    # What to do with source columns that aren't in the mapping.
    unmapped: UnmappedStrategy = "keep"
    # Apply target-field dtypes from the schema after rename.
    coerce_types: bool = False
    # Reorder output to match schema.fields order. Unmapped survivors
    # (when unmapped="keep") are appended at the end in their original
    # source order.
    reorder_to_schema: bool = False
    # Required-target enforcement. When True (default), a required
    # target field that has no source column raises an InputValidationError.
    # When False, the missing field is added with ``default`` value.
    enforce_required: bool = True

    @classmethod
    def from_preset(cls, name: str) -> MapOptions:
        """Build options from a named entry of ``PRESETS``.

        Raises:
            ConfigError: when *name* is not a known preset.
        """
        if name not in PRESETS:
            raise ConfigError(
                f"Unknown preset '{name}'",
                operation="MapOptions.from_preset",
                suggestion=f"Available: {sorted(PRESETS)}",
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> MapOptions:
        """Build options from a dict, ignoring keys that are not fields.

        A ``schema`` given as a dict is converted with
        ``TargetSchema.from_dict``.
        """
        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        if "schema" in kwargs and isinstance(kwargs["schema"], dict):
            kwargs["schema"] = TargetSchema.from_dict(kwargs["schema"])
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Serialise to a JSON-compatible dict; ``schema`` only when set."""
        out: dict[str, Any] = {
            "mapping": dict(self.mapping),
            "auto_infer": self.auto_infer,
            "fuzzy_threshold": self.fuzzy_threshold,
            "unmapped": self.unmapped,
            "coerce_types": self.coerce_types,
            "reorder_to_schema": self.reorder_to_schema,
            "enforce_required": self.enforce_required,
        }
        if self.schema is not None:
            out["schema"] = self.schema.to_dict()
        return out

    def to_file(self, path: str | Path) -> Path:
        """Write the options as JSON to *path* and return the path."""
        out = Path(path)
        # Explicit UTF-8 so the file does not depend on the platform locale.
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> MapOptions:
        """Load options from a JSON file.

        The file is read as UTF-8, so non-ASCII column names in
        ``mapping`` survive on every platform; a leading byte-order mark
        (as written by some Windows editors) is accepted and ignored.
        """
        text = Path(path).read_text(encoding="utf-8-sig")
        return cls.from_dict(json.loads(text))

    def validate(self) -> None:
        """Check option values.

        Raises:
            ConfigError: when ``fuzzy_threshold`` is outside [0.0, 1.0].
                An invalid ``unmapped`` value is rejected by
                ``ensure_choice``.
        """
        ensure_choice(
            self.unmapped, name="unmapped",
            choices=("keep", "drop", "error"),
            function="MapOptions.validate",
        )
        if not (0.0 <= self.fuzzy_threshold <= 1.0):
            raise ConfigError(
                f"fuzzy_threshold must be in [0.0, 1.0], got {self.fuzzy_threshold!r}",
                operation="MapOptions.validate",
            )
@dataclass
class MapResult:
    """Output of ``map_columns``."""

    # The DataFrame produced by the mapping run.
    mapped_df: pd.DataFrame
    mapping: dict[str, str]  # source → target
    inferred_pairs: dict[str, str]  # subset of mapping that was auto-inferred
    # Count of renamed columns.
    columns_renamed: int
    # Names of columns that were dropped.
    columns_dropped: list[str]
    columns_added: list[str]  # required-defaulted fields added with default value
    coercion_failures: dict[str, int]  # column → n_rows_that_failed_coercion
    # Unmapped source columns that were kept in the output.
    unmapped_kept: list[str]
    # Required target fields for which no source column was found.
    missing_required_targets: list[str]
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def map_columns(
    df: pd.DataFrame,
    options: Optional[MapOptions] = None,
) -> MapResult:
    """Apply *options* to *df* and return a :class:`MapResult`.

    Pipeline placement (recommended, not enforced)
    ----------------------------------------------
    Two natural slots:

    * **Early** — header alignment for multi-vendor unification.
      Each vendor uses different column names; rename to a canonical
      schema before any other tool runs.
    * **Late** — schema enforcement for output. After cleaning, coerce
      types and project to the target shape (CRM import contract,
      database schema). Run after format / missing so the coerced
      data is canonical first.

    The pipeline runner does not enforce a position; place by use case.

    Pipeline:
      1. Compose mapping (explicit ``options.mapping`` + inferred
         pairs from ``options.schema``). Explicit pairs always win.
      2. Reject duplicate target names — two source columns mapped to
         the same target is a user error, not a silent overwrite.
      3. Decide what to do with unmapped source columns
         (``keep`` / ``drop`` / ``error``). A kept column whose name
         equals a mapped target is rejected for the same reason as
         step 2: the rename would create two columns with one label.
      4. Rename, then handle missing required targets, then coerce
         types, then reorder.

    Parameters
    ----------
    df : input DataFrame. It is copied before any change is applied.
    options : mapping configuration; ``MapOptions()`` when omitted.

    Returns
    -------
    MapResult
        The mapped frame plus the applied mapping and per-step counts.

    Raises
    ------
    InputValidationError
        When the mapping names a column missing from *df*, two sources
        share a target, a kept unmapped column collides with a target,
        ``unmapped='error'`` and unmapped columns exist, or required
        fields are missing while ``enforce_required`` is set.
    ConfigError
        From ``options.validate()`` when ``fuzzy_threshold`` is out of
        range.
    """
    ensure_dataframe(df, function="map_columns")
    options = options or MapOptions()
    options.validate()

    # ------------------------------------------------------------------
    # 1. Compose the effective mapping
    # ------------------------------------------------------------------
    explicit = dict(options.mapping)
    inferred: dict[str, str] = {}
    if options.schema is not None and options.auto_infer:
        all_inferred = infer_mapping(df, options.schema, threshold=options.fuzzy_threshold)
        # Explicit user pairings always win: an inferred pair is skipped
        # when its source or its target is already spoken for.
        used_targets = set(explicit.values())
        for src, tgt in all_inferred.items():
            if src in explicit or tgt in used_targets:
                continue
            inferred[src] = tgt
            used_targets.add(tgt)
    mapping: dict[str, str] = {**inferred, **explicit}

    # ------------------------------------------------------------------
    # 2. Validate mapping coherence
    # ------------------------------------------------------------------
    unknown_sources = [s for s in mapping if s not in df.columns]
    if unknown_sources:
        raise InputValidationError(
            f"Mapping references columns not in input: {unknown_sources}",
            operation="map_columns",
            suggestion=f"Available source columns: {list(df.columns)}",
        )
    target_counts: dict[str, int] = {}
    for tgt in mapping.values():
        target_counts[tgt] = target_counts.get(tgt, 0) + 1
    duplicates = [t for t, n in target_counts.items() if n > 1]
    if duplicates:
        raise InputValidationError(
            f"Multiple source columns mapped to the same target(s): {duplicates}",
            operation="map_columns",
            suggestion="Each target name must be unique. Drop or rename the conflicting source columns.",
        )

    # ------------------------------------------------------------------
    # 3. Handle unmapped source columns
    # ------------------------------------------------------------------
    unmapped_sources = [c for c in df.columns if c not in mapping]
    unmapped_kept: list[str] = []
    columns_dropped: list[str] = []
    if unmapped_sources:
        if options.unmapped == "drop":
            columns_dropped = list(unmapped_sources)
        elif options.unmapped == "error":
            raise InputValidationError(
                f"Source columns have no mapping and unmapped='error': {unmapped_sources}",
                operation="map_columns",
                suggestion=(
                    "Either add explicit mapping entries, set unmapped='keep' / 'drop', "
                    "or include the columns in the target schema."
                ),
            )
        else:
            unmapped_kept = list(unmapped_sources)

    # A kept column that already carries a target's name would leave the
    # frame with two identically-labelled columns after the rename; the
    # later ``out[name]`` lookups would then return a DataFrame instead
    # of a Series. Surface it as the user error it is.
    collisions = [c for c in unmapped_kept if c in target_counts]
    if collisions:
        raise InputValidationError(
            f"Mapped target name(s) collide with unmapped source columns kept in the output: {collisions}",
            operation="map_columns",
            suggestion=(
                "Map or drop the conflicting source columns (unmapped='drop'), "
                "or choose a different target name."
            ),
        )

    # ------------------------------------------------------------------
    # 4. Apply rename and drop
    # ------------------------------------------------------------------
    out = df.copy()
    if columns_dropped:
        out = out.drop(columns=columns_dropped)
    if mapping:
        out = out.rename(columns=mapping)
    # Identity pairs (source == target) are part of the mapping but do
    # not count as renames.
    columns_renamed = sum(1 for src, tgt in mapping.items() if src != tgt)

    # ------------------------------------------------------------------
    # 5. Handle the schema's required + default fields
    # ------------------------------------------------------------------
    columns_added: list[str] = []
    missing_required: list[str] = []
    if options.schema is not None:
        present = set(out.columns)
        for tf in options.schema.fields:
            if tf.name in present:
                continue
            if tf.required and tf.default is None:
                missing_required.append(tf.name)
                continue
            # Add with default value (pd.NA if no default).
            out[tf.name] = tf.default if tf.default is not None else pd.NA
            columns_added.append(tf.name)
    if missing_required and options.enforce_required:
        raise InputValidationError(
            f"Required target field(s) missing from input: {missing_required}",
            operation="map_columns",
            suggestion=(
                "Either add explicit mapping entries, lower fuzzy_threshold, "
                "supply a default in the schema, or set enforce_required=False."
            ),
        )

    # ------------------------------------------------------------------
    # 6. Coerce types per the schema
    # ------------------------------------------------------------------
    coercion_failures: dict[str, int] = {}
    if options.coerce_types and options.schema is not None:
        for tf in options.schema.fields:
            if tf.name not in out.columns or tf.dtype == "auto":
                continue
            try:
                series, fails = coerce_series(out[tf.name], tf.dtype)
            except (ValueError, TypeError) as e:
                # Best-effort: a column that cannot be coerced at all is
                # left as-is and the run continues.
                logger.warning(
                    "map_columns: coerce of {!r} to {} failed: {}",
                    tf.name, tf.dtype, e,
                )
                continue
            out[tf.name] = series
            if fails:
                coercion_failures[tf.name] = fails

    # ------------------------------------------------------------------
    # 7. Reorder
    # ------------------------------------------------------------------
    if options.reorder_to_schema and options.schema is not None:
        ordered = [f.name for f in options.schema.fields if f.name in out.columns]
        # Append survivors (kept-unmapped originals) in their current order.
        survivors = [c for c in out.columns if c not in ordered]
        out = out.loc[:, ordered + survivors]

    return MapResult(
        mapped_df=out,
        mapping=mapping,
        inferred_pairs=inferred,
        columns_renamed=columns_renamed,
        columns_dropped=columns_dropped,
        columns_added=columns_added,
        coercion_failures=coercion_failures,
        unmapped_kept=unmapped_kept,
        missing_required_targets=missing_required,
    )

View File

@@ -514,6 +514,19 @@ def deduplicate(
) -> DeduplicationResult:
"""Run the full deduplication pipeline.
Pipeline placement (recommended, not enforced)
----------------------------------------------
Run *last* among the cleaning tools. Fuzzy matching is more
accurate when:
* text has been hygiened (NBSP padding doesn't make
``"Alice "`` look different from ``"Alice"``);
* formats have been canonicalized (``+14155551234`` matches
across rows where the source had ``(415) 555-1234`` and
``415.555.1234``);
* missing values have been standardized (NaN matching is
brittle; sentinel-laundered cells produce false matches).
See ``src.core.pipeline.SOFT_DEPENDENCIES``.
Parameters
----------
df : input DataFrame

View File

@@ -815,7 +815,22 @@ _CURRENCY_TRIM_RE = re.compile(
_PARENS_NEGATIVE_RE = re.compile(r"^\s*\(\s*(.+?)\s*\)\s*$")
CurrencyDecimal = Literal["dot", "comma"]
CurrencyDecimal = Literal["dot", "comma", "auto"]
# Multi-character symbol prefixes that aren't captured by the
# single-codepoint ``_CURRENCY_SYMBOLS`` table. Order matters: the
# detector checks these prefixes BEFORE the single-symbol regex, so
# ``R$`` resolves to BRL even though ``$`` alone would map to USD.
_PREFIX_TO_ISO: dict[str, str] = {
"r$": "BRL", # Brazilian Real
"kr": "SEK", # ambiguous Nordic — picks SEK as most common; see tests
"": "PLN", # Polish Złoty
"лв": "BGN", # Bulgarian Lev
"": "RUB", # already in symbol table; kept for parity
"rs.": "INR", # rupees — covers IN/PK informal usage
"rs": "INR",
}
def detect_currency_code(value: str) -> Optional[str]:
@@ -825,9 +840,21 @@ def detect_currency_code(value: str) -> Optional[str]:
symbol → code mapping (``$1234`` → ``USD``). Symbol mapping is best-
effort: ``$`` is ambiguous between USD/CAD/AUD/MXN — the caller is
expected to constrain that via input data discipline.
Multi-char prefixes (``R$``, ``zł``, ``kr``) are recognised before
the single-symbol regex so Brazilian / Polish / Nordic data isn't
silently bucketed as USD.
"""
if not isinstance(value, str):
return None
head = value.lstrip().lower()
for prefix, code in _PREFIX_TO_ISO.items():
if head.startswith(prefix):
# Make sure the next char (if any) isn't a letter — avoid
# matching ``rsa`` as ``rs``-then-``a``.
tail = head[len(prefix):]
if not tail or not tail[0].isalpha():
return code
m = _CURRENCY_DETECT_RE.search(value)
if m is None:
return None
@@ -852,10 +879,16 @@ def standardize_currency(
``decimal="dot"``: ``$1,234.56`` → ``1234.56`` (US/UK convention).
``decimal="comma"``: ``1.234,56 €`` → ``1234.56`` (EU convention).
Either mode auto-detects the EU shape when both ``.`` and ``,`` are
present and the comma sits after the dot (so ``€1.234,56`` parses
correctly even under the dot-default mode). Space-thousands and
Swiss apostrophe-thousands are also recognized.
``decimal="auto"``: same as ``dot`` but a single trailing comma
whose tail is NOT exactly 3 digits is read as a decimal separator
(``850,50`` → ``850.50``, ``R$ 1,5`` → ``1.5``). Use this for
mixed-locale international files. Length-3 tails (``1,234``) stay
ambiguous regardless of mode.
All three modes auto-detect the EU shape when both ``.`` and ``,``
are present and the comma sits after the dot (so ``€1.234,56``
parses correctly even under the dot-default mode). Space-thousands
and Swiss apostrophe-thousands are also recognized.
The output always uses a dot as the decimal separator since that is
the form pandas/Python parse natively.
@@ -899,6 +932,22 @@ def standardize_currency(
code = detect_currency_code(s) if preserve_code else None
# Strip any multi-char currency prefix (``R$``, ``kr``, ``zł``)
# before the symbol-table regex — these aren't single codepoints
# so the table-driven trim would otherwise leave them in place.
head = s.lstrip().lower()
for prefix in _PREFIX_TO_ISO:
if head.startswith(prefix):
tail_start = len(prefix)
if tail_start < len(head) and head[tail_start].isalpha():
continue
# Strip the matched prefix from the original (preserve case
# of any trailing content).
stripped_lead = s[: len(s) - len(head)]
s = stripped_lead + s.lstrip()[len(prefix):]
s = s.lstrip()
break
negative = False
m = _PARENS_NEGATIVE_RE.match(s)
if m:
@@ -948,6 +997,19 @@ def standardize_currency(
# is unambiguously EU — treat the comma as decimal.
if had_space_thousands:
rest = rest.replace(",", ".")
elif decimal == "auto":
# International auto-detection: a single comma whose
# tail is NOT exactly 3 digits is far more likely to be
# an EU/BRL decimal (``850,50``, ``1,5``) than a
# malformed US thousands group. Length-3 tails stay
# ambiguous and require an explicit locale.
after = rest.rsplit(",", 1)[1]
if rest.count(",") > 1:
rest = rest.replace(",", "")
elif len(after) == 3:
return _err("ambiguous separator, set --currency-locale")
else:
rest = rest.replace(",", ".")
else:
after = rest.rsplit(",", 1)[1]
if len(after) != 3:
@@ -1910,6 +1972,26 @@ class StandardizeOptions:
# verbatim into Title Case rendering.
extra_abbreviations: dict[str, str] = field(default_factory=dict)
# ----- Scale knobs for large international files -----
# Per-row country/region overrides. When set, each phone or address
# row's region is read from the named column (an ISO-3166 alpha-2 code:
# "US", "GB", "JP", "FR", …). Falls back to ``phone_region`` /
# global default when the column is missing or the cell is blank.
phone_country_column: Optional[str] = None
address_country_column: Optional[str] = None
# Audit cap. The change table can grow to tens of millions of rows on
# a 1 GB input — capping protects memory and keeps the audit usable.
# ``cells_changed`` still counts every modification; only the per-row
# ``changes`` DataFrame is truncated. Set to None for unbounded.
audit_max_rows: Optional[int] = 10_000
# Value-level LRU cache size per standardizer. Repeated phone numbers
# (call-list duplicates), repeated currencies, repeated boolean
# tokens — all dominate at scale. A 256k-entry cache absorbs most
# real-world cardinalities without ballooning memory.
cache_size: int = 262_144
@classmethod
def from_preset(cls, name: str, **overrides: Any) -> StandardizeOptions:
"""Build options from a named preset, with optional field overrides.
@@ -1953,7 +2035,7 @@ class StandardizeOptions:
for field_name, valid in (
("date_order", {"MDY", "DMY"}),
("phone_format", set(_PHONE_FORMAT_MAP) | {"DIGITS"}),
("currency_decimal", {"dot", "comma"}),
("currency_decimal", {"dot", "comma", "auto"}),
("name_case", {"title", "upper", "lower"}),
("boolean_style", set(_BOOL_OUTPUT)),
("date_error_policy", {"passthrough", "sentinel"}),
@@ -2213,6 +2295,193 @@ def _resolve_column_types(
return resolved
def _build_cached_dispatcher(
    field_type: FieldType,
    options: StandardizeOptions,
):
    """Return a ``fn(value, region=None)`` standardizer for *field_type*.

    Every returned callable yields the ``(new, changed, parsed)`` triple
    produced by ``_apply_field_type_for`` / ``_apply_field_type``. For
    all types except NAME the callable is wrapped in an LRU cache keyed
    on ``(value, region)``, so repeated cell values are looked up
    instead of re-parsed.

    The cache is created with ``typed=True``. Without it, values that
    compare equal across types (``1``, ``1.0``, ``True``) share one
    slot, and whichever was seen first decides the result for the
    others even though they stringify — and therefore parse —
    differently.

    Notes
    -----
    * ``options.cache_size <= 0`` selects ``maxsize=None``, i.e. an
      unbounded cache, not a disabled one.
    * Cached callables require hashable cell values.
    * Only PHONE uses the ``region`` argument (falling back to
      ``options.phone_region``). For ADDRESS the region is accepted and
      takes part in the cache key but is not forwarded to the
      standardizer; the other types ignore it.
    * The relevant option scalars are captured once here so they are
      not re-read from ``options`` on every call.
    """
    from functools import lru_cache

    cache_size = options.cache_size if options.cache_size > 0 else None
    # One decorator factory; each application below owns its own cache.
    memoize = lru_cache(maxsize=cache_size, typed=True)

    if field_type == FieldType.DATE:
        out_fmt = options.date_output_format
        date_order = options.date_order
        date_err = options.date_error_policy
        # Stored as a tuple (or None) so the captured value is immutable.
        locales = (
            tuple(options.date_month_locales) if options.date_month_locales else None
        )

        @memoize
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.DATE, options,
                _date_args=(out_fmt, date_order, date_err, locales),
            )
        return fn

    if field_type == FieldType.PHONE:
        out_fmt = options.phone_format
        err = options.phone_error_policy
        default_region = options.phone_region

        @memoize
        def fn(value: Any, region: Optional[str] = None):
            # A blank / unrecognized per-row region falls back to the
            # global default.
            r = region or default_region
            return _apply_field_type_for(
                value, FieldType.PHONE, options,
                _phone_args=(out_fmt, r, err),
            )
        return fn

    if field_type == FieldType.CURRENCY:
        decimal = options.currency_decimal
        decimals = options.currency_decimals
        preserve = options.currency_preserve_code
        err = options.currency_error_policy

        @memoize
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.CURRENCY, options,
                _currency_args=(decimal, decimals, preserve, err),
            )
        return fn

    if field_type == FieldType.BOOLEAN:
        style = options.boolean_style

        @memoize
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.BOOLEAN, options,
                _boolean_args=(style,),
            )
        return fn

    if field_type == FieldType.EMAIL:
        gmail = options.email_gmail_canonical
        err = options.email_error_policy

        @memoize
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type_for(
                value, FieldType.EMAIL, options,
                _email_args=(gmail, err),
            )
        return fn

    # Names are usually unique per row, so no cache wraps them; they
    # still go through ``_apply_field_type`` for parity.
    if field_type == FieldType.NAME:
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.NAME, options)
        return fn

    if field_type == FieldType.ADDRESS:
        # Addresses are cached — long lists of repeated office
        # addresses or warehouse locations are common in commerce data.
        @memoize
        def fn(value: Any, _region: Optional[str] = None):
            return _apply_field_type(value, FieldType.ADDRESS, options)
        return fn

    # Fallback (shouldn't happen — every FieldType is covered above).
    return lambda value, _region=None: _apply_field_type(value, field_type, options)
def _apply_field_type_for(
value: Any,
field_type: FieldType,
options: StandardizeOptions,
*,
_date_args=None,
_phone_args=None,
_currency_args=None,
_boolean_args=None,
_email_args=None,
) -> tuple[Any, bool, bool]:
"""Cacheable dispatcher: same shape as :func:`_apply_field_type` but
accepts pre-extracted scalar argument tuples so the LRU cache key is
just ``(value, region)`` instead of the full options object.
"""
if value is None or (isinstance(value, float) and pd.isna(value)):
return value, False, True
if not isinstance(value, str):
if field_type == FieldType.BOOLEAN:
style = (_boolean_args or (options.boolean_style,))[0]
new, changed = standardize_boolean(value, style=style)
return new, changed, True
value = str(value)
if not value.strip():
return value, False, True
if field_type == FieldType.DATE:
out_fmt, date_order, err, locales = _date_args or (
options.date_output_format, options.date_order,
options.date_error_policy,
tuple(options.date_month_locales) if options.date_month_locales else None,
)
new, changed = standardize_date(
value,
output_format=out_fmt,
date_order=date_order,
error_policy=err,
month_locales=list(locales) if locales else None,
)
elif field_type == FieldType.PHONE:
out_fmt, region, err = _phone_args or (
options.phone_format, options.phone_region, options.phone_error_policy,
)
new, changed = standardize_phone(
value, output_format=out_fmt, default_region=region, error_policy=err,
)
elif field_type == FieldType.CURRENCY:
decimal, decimals, preserve, err = _currency_args or (
options.currency_decimal, options.currency_decimals,
options.currency_preserve_code, options.currency_error_policy,
)
new, changed = standardize_currency(
value,
decimal=decimal,
decimals=decimals,
preserve_code=preserve,
error_policy=err,
)
elif field_type == FieldType.BOOLEAN:
style = (_boolean_args or (options.boolean_style,))[0]
new, changed = standardize_boolean(value, style=style)
elif field_type == FieldType.EMAIL:
gmail, err = _email_args or (
options.email_gmail_canonical, options.email_error_policy,
)
new, changed = standardize_email(
value, gmail_canonical=gmail, error_policy=err,
)
else:
return _apply_field_type(value, field_type, options)
parsed = True
if not changed and field_type in {
FieldType.DATE, FieldType.PHONE, FieldType.CURRENCY, FieldType.BOOLEAN,
}:
parsed = _is_already_canonical(value, field_type, options)
return new, changed, parsed
def standardize_dataframe(
df: pd.DataFrame,
options: Optional[StandardizeOptions] = None,
@@ -2221,6 +2490,28 @@ def standardize_dataframe(
Columns absent from ``options.column_types`` pass through unchanged.
The input DataFrame is not mutated.
Pipeline placement (recommended, not enforced)
----------------------------------------------
Run *after* the text cleaner (smart-quote / NBSP / zero-width
pollution breaks phone, currency, and date parsers) and *before*
the missing-value handler (numeric imputation expects canonical
types) and the deduplicator (canonical phone E.164 / lowercase
email enables cross-format duplicate matching). See
``src.core.pipeline.SOFT_DEPENDENCIES``.
Performance characteristics
---------------------------
Per-cell standardizers are wrapped in an LRU cache (size
``options.cache_size``) so repeated values — common in real
international data, where the same office phone or vendor address
appears thousands of times — short-circuit. The dispatch loop uses
``Series.map`` for pandas-native iteration; on a 10-million-row
column this is roughly 4-8× faster than the previous
``for v in series.tolist()`` path.
For inputs larger than will fit comfortably in RAM, prefer
:func:`standardize_file` which streams chunks from disk.
"""
from .errors import ensure_dataframe
ensure_dataframe(df, function="standardize_dataframe")
@@ -2228,33 +2519,74 @@ def standardize_dataframe(
out = df.copy()
column_types = _resolve_column_types(options, out.columns)
change_records: list[dict[str, Any]] = []
cells_changed = 0
cells_unparseable = 0
cells_total = 0
audit_cap = options.audit_max_rows
audit_room = float("inf") if audit_cap is None else audit_cap
audit_records: list[dict[str, Any]] = []
# Per-row region columns must exist in the frame when set.
if options.phone_country_column and options.phone_country_column not in out.columns:
from .errors import InputValidationError
raise InputValidationError(
f"phone_country_column={options.phone_country_column!r} not in input columns",
operation="standardize_dataframe",
suggestion=f"Available: {list(out.columns)}",
)
if options.address_country_column and options.address_country_column not in out.columns:
from .errors import InputValidationError
raise InputValidationError(
f"address_country_column={options.address_country_column!r} not in input columns",
operation="standardize_dataframe",
suggestion=f"Available: {list(out.columns)}",
)
for col, field_type in column_types.items():
series = out[col]
new_values: list[Any] = []
for row_idx, original in enumerate(series.tolist()):
cells_total += 1
new, changed, parsed = _apply_field_type(original, field_type, options)
cells_total += len(series)
dispatcher = _build_cached_dispatcher(field_type, options)
# Per-row region lookup. Phones and addresses are the two types
# that benefit from country context; everything else ignores the
# second argument.
region_series: Optional[pd.Series] = None
if field_type == FieldType.PHONE and options.phone_country_column:
region_series = out[options.phone_country_column]
elif field_type == FieldType.ADDRESS and options.address_country_column:
region_series = out[options.address_country_column]
new_values: list[Any] = [None] * len(series)
if region_series is None:
triples = [dispatcher(v) for v in series.tolist()]
else:
regions = region_series.tolist()
triples = [
dispatcher(v, _normalize_region(r))
for v, r in zip(series.tolist(), regions)
]
for i, (orig, (new, changed, parsed)) in enumerate(
zip(series.tolist(), triples)
):
new_values[i] = new
if changed:
cells_changed += 1
change_records.append({
"row": row_idx,
"column": col,
"field_type": field_type.value,
"old": original,
"new": new,
})
if audit_room > 0:
audit_records.append({
"row": i,
"column": col,
"field_type": field_type.value,
"old": orig,
"new": new,
})
audit_room -= 1
if not parsed:
cells_unparseable += 1
new_values.append(new)
out[col] = new_values
changes_df = pd.DataFrame(
change_records,
audit_records,
columns=["row", "column", "field_type", "old", "new"],
)
@@ -2272,6 +2604,16 @@ def standardize_dataframe(
int(100 * cells_unparseable / cells_total),
)
# Only log the cap message when it would surprise the caller —
# cap=0 is the streaming-path's deliberate "audit budget exhausted"
# signal and shouldn't generate noise per chunk.
if audit_cap and audit_cap > 0 and cells_changed > audit_cap:
logger.info(
"standardize_dataframe: audit capped at {} rows "
"(cells_changed={}); raise audit_max_rows or set to None for full audit.",
audit_cap, cells_changed,
)
return StandardizeResult(
standardized_df=out,
changes=changes_df,
@@ -2280,3 +2622,290 @@ def standardize_dataframe(
cells_total=cells_total,
columns_processed=list(column_types.keys()),
)
# ---------------------------------------------------------------------------
# Per-row region helpers
# ---------------------------------------------------------------------------
# Common country-name → ISO-3166 alpha-2 mappings. The phonenumbers
# library wants the alpha-2 code, but real spreadsheets carry full names
# ("United Kingdom", "Japan", "Brazil"). Add new entries lazily as users
# bring in data — the table is a soft mapping, missing entries fall back
# to the global ``phone_region``.
_COUNTRY_NAME_TO_ISO2: dict[str, str] = {
"united states": "US", "usa": "US", "u.s.": "US", "u.s.a.": "US",
"united kingdom": "GB", "uk": "GB", "great britain": "GB", "england": "GB",
"canada": "CA",
"mexico": "MX",
"france": "FR",
"germany": "DE", "deutschland": "DE",
"italy": "IT", "italia": "IT",
"spain": "ES", "españa": "ES",
"portugal": "PT",
"netherlands": "NL", "holland": "NL",
"belgium": "BE",
"switzerland": "CH", "schweiz": "CH",
"austria": "AT", "österreich": "AT",
"ireland": "IE",
"sweden": "SE", "norway": "NO", "denmark": "DK", "finland": "FI",
"poland": "PL", "czech republic": "CZ", "czechia": "CZ", "hungary": "HU",
"russia": "RU", "ukraine": "UA",
"japan": "JP", "中国": "CN", "china": "CN", "south korea": "KR", "korea": "KR",
"india": "IN", "indonesia": "ID", "thailand": "TH", "vietnam": "VN",
"philippines": "PH", "malaysia": "MY", "singapore": "SG",
"australia": "AU", "new zealand": "NZ",
"brazil": "BR", "brasil": "BR",
"argentina": "AR", "chile": "CL", "colombia": "CO", "peru": "PE",
"south africa": "ZA",
"uae": "AE", "united arab emirates": "AE",
"saudi arabia": "SA",
"egypt": "EG",
"israel": "IL",
"turkey": "TR", "türkiye": "TR",
}
def _normalize_region(value: Any) -> Optional[str]:
"""Normalise a region cell to an ISO-3166 alpha-2 code.
Accepts ISO codes (``US``, ``us``, ``USA``), full names
(``United States``, ``Japan``), and falls back to None when the
value is empty or unrecognized — letting the dispatcher use the
global default region.
"""
if value is None:
return None
if isinstance(value, float) and pd.isna(value):
return None
if not isinstance(value, str):
value = str(value)
s = value.strip()
if not s:
return None
upper = s.upper()
# ISO-3166 alpha-2 (e.g. "US", "JP")
if len(upper) == 2 and upper.isalpha():
return upper
# ISO-3166 alpha-3 (e.g. "USA", "JPN") — strip last letter as a
# cheap heuristic, then validate alpha-2.
if len(upper) == 3 and upper.isalpha():
# phonenumbers accepts alpha-2 only; map a few common alpha-3.
alpha3_map = {
"USA": "US", "GBR": "GB", "CAN": "CA", "MEX": "MX", "DEU": "DE",
"FRA": "FR", "ITA": "IT", "ESP": "ES", "JPN": "JP", "CHN": "CN",
"KOR": "KR", "BRA": "BR", "AUS": "AU", "IND": "IN", "RUS": "RU",
}
if upper in alpha3_map:
return alpha3_map[upper]
# Full country name lookup.
return _COUNTRY_NAME_TO_ISO2.get(s.lower())
# ---------------------------------------------------------------------------
# Streaming entry point — for inputs that don't fit in memory
# ---------------------------------------------------------------------------
@dataclass
class StreamingStandardizeResult:
    """Run summary produced by :func:`standardize_file`.

    The streaming counterpart of :class:`StandardizeResult`: it carries
    no DataFrame because the standardized rows are appended to
    ``output_path`` chunk by chunk. Audit rows go to ``audit_path`` the
    same way, limited to ``options.audit_max_rows`` rows in total over
    the whole run.

    Attributes
    ----------
    output_path : where the standardized CSV was written.
    audit_path : where the change audit was written, or ``None``.
    rows_processed : input rows read across all chunks.
    chunks_processed : number of chunks handled.
    cells_changed : cells whose value was modified.
    cells_unparseable : cells the standardizers could not parse.
    cells_total : cells examined.
    columns_processed : names of the columns that were standardized.
    """

    output_path: Path
    audit_path: Optional[Path]
    rows_processed: int
    chunks_processed: int
    cells_changed: int
    cells_unparseable: int
    cells_total: int
    columns_processed: list[str]
def standardize_file(
    input_path: str | Path,
    output_path: str | Path,
    options: Optional[StandardizeOptions] = None,
    *,
    chunk_size: int = 50_000,
    audit_path: Optional[str | Path] = None,
    progress_callback: Optional[Any] = None,
    encoding: str = "utf-8",
    delimiter: str = ",",
) -> StreamingStandardizeResult:
    """Standardize a CSV/TSV file in chunks, writing output incrementally.

    For inputs too large to materialize in memory, this entry point
    streams ``chunk_size`` rows at a time through
    :func:`standardize_dataframe` and writes each chunk to *output_path*
    as it completes. Memory stays bounded by the chunk size regardless
    of input file size.

    The audit is written to *audit_path* (default
    ``{output_path.stem}_changes.csv`` next to the output). Audit rows
    carry absolute row positions in the input file. The
    ``options.audit_max_rows`` budget is **global**: it is shared by
    all chunks, and once it is used up later chunks are processed with
    the audit switched off. Set ``options.audit_max_rows=None`` for a
    full audit (bounded only by disk).

    Performance for a 1 GB CSV with ~10 M rows on a typical workstation:
      - chunk_size=50_000 → ~50 MB peak DataFrame footprint
      - phone-only standardization: ~3-6 minutes (cache-warm)
      - mixed phone + currency + address: ~8-15 minutes
      - first chunk is the cold-cache slowest; later chunks ride the LRU.

    Parameters
    ----------
    input_path
        CSV or TSV path. Excel inputs aren't streamed — load with
        :func:`read_file` and use :func:`standardize_dataframe`.
        Every column is read as text and empty cells stay empty
        strings (``dtype=str``, ``keep_default_na=False``).
    output_path
        Where to write the standardized CSV. Existing files are
        overwritten. Must not be the input file.
    options
        Standardizer configuration; ``StandardizeOptions()`` when
        omitted.
    chunk_size
        Rows per chunk. Default 50,000 ≈ 50 MB resident for typical
        widths. Higher → less I/O overhead, more peak memory.
    audit_path
        Where to write the change audit. Must not be the input file.
    progress_callback
        Optional ``callable(rows_processed, chunks_processed)``
        called once per chunk. Exceptions it raises are logged at
        debug level and ignored.
    encoding
        Text encoding used to read the input and write both outputs.
    delimiter
        Field separator of the input; also used for the standardized
        output. The audit file is always comma-separated.

    Returns
    -------
    StreamingStandardizeResult
        Totals for the run. ``audit_path`` is ``None`` when no audit
        rows were written.

    Raises
    ------
    FileAccessError
        When the input does not exist, or when *output_path* /
        *audit_path* resolves to the input file.
    """
    from .errors import FileAccessError, wrap_file_read, wrap_file_write

    options = options or StandardizeOptions()
    inp = Path(input_path)
    out = Path(output_path)
    if not inp.exists():
        raise FileAccessError(
            f"Input file not found: {inp}",
            path=inp, operation="standardize_file",
        )
    audit_p = Path(audit_path) if audit_path else out.with_name(
        f"{out.stem}_changes.csv"
    )

    # Both outputs are opened with mode="w" while the input is still
    # being read; pointing either at the input would truncate it
    # mid-stream and destroy the source data.
    resolved_input = inp.resolve()
    for label, target in (("output_path", out), ("audit_path", audit_p)):
        if target.resolve() == resolved_input:
            raise FileAccessError(
                f"{label} must not be the input file: {target}",
                path=target, operation="standardize_file",
            )

    rows_processed = 0
    chunks_processed = 0
    cells_changed = 0
    cells_unparseable = 0
    cells_total = 0
    columns_processed: list[str] = []

    unlimited_audit = options.audit_max_rows is None
    audit_room = float("inf") if unlimited_audit else options.audit_max_rows
    # Built lazily, once, when the audit budget runs out.
    muted_options: Optional[StandardizeOptions] = None

    out.parent.mkdir(parents=True, exist_ok=True)
    audit_p.parent.mkdir(parents=True, exist_ok=True)
    out_writer_open = False
    audit_writer_open = False

    try:
        reader = pd.read_csv(
            inp, chunksize=chunk_size, encoding=encoding,
            sep=delimiter, dtype=str, keep_default_na=False,
        )
    except OSError as e:  # FileNotFoundError is an OSError subclass
        raise wrap_file_read(inp, "standardize_file", e) from e

    try:
        for chunk in reader:
            # Audit rows come back numbered within the chunk; this
            # offset turns them into positions in the full input file.
            chunk_offset = rows_processed

            chunk_options = options
            if not unlimited_audit and audit_room <= 0:
                # Budget exhausted: run this chunk with cap=0 so the
                # standardizer records no audit rows at all.
                if muted_options is None:
                    muted_options = _replace_options(options, audit_max_rows=0)
                chunk_options = muted_options

            result = standardize_dataframe(chunk, chunk_options)
            cells_changed += result.cells_changed
            cells_unparseable += result.cells_unparseable
            cells_total += result.cells_total
            if not columns_processed:
                columns_processed = list(result.columns_processed)

            # Write the standardized chunk: the first write creates the
            # file with a header, later writes append rows only.
            try:
                if not out_writer_open:
                    result.standardized_df.to_csv(
                        out, mode="w", index=False, encoding=encoding,
                        sep=delimiter,
                    )
                    out_writer_open = True
                else:
                    result.standardized_df.to_csv(
                        out, mode="a", index=False, header=False,
                        encoding=encoding, sep=delimiter,
                    )
            except OSError as e:
                raise wrap_file_write(out, "standardize_file", e) from e

            # Write the audit (re-numbering rows to absolute file positions).
            if not result.changes.empty and audit_room > 0:
                # ``iloc[:inf]`` is invalid, so an unbounded audit takes
                # the whole frame instead of slicing.
                if unlimited_audit:
                    cap_changes = result.changes.copy()
                else:
                    cap_changes = result.changes.iloc[: int(audit_room)].copy()
                cap_changes["row"] = cap_changes["row"] + chunk_offset
                try:
                    if not audit_writer_open:
                        cap_changes.to_csv(
                            audit_p, mode="w", index=False, encoding=encoding,
                        )
                        audit_writer_open = True
                    else:
                        cap_changes.to_csv(
                            audit_p, mode="a", index=False, header=False,
                            encoding=encoding,
                        )
                except OSError as e:
                    raise wrap_file_write(audit_p, "standardize_file", e) from e
                audit_room -= len(cap_changes)

            rows_processed += len(chunk)
            chunks_processed += 1
            if progress_callback:
                try:
                    progress_callback(rows_processed, chunks_processed)
                except Exception:
                    # Progress callbacks are advisory — don't kill the run.
                    logger.opt(exception=True).debug(
                        "progress_callback raised; ignoring"
                    )
    finally:
        # Ensure the iterator is closed (closes the underlying file).
        if hasattr(reader, "close"):
            reader.close()

    return StreamingStandardizeResult(
        output_path=out,
        audit_path=audit_p if audit_writer_open else None,
        rows_processed=rows_processed,
        chunks_processed=chunks_processed,
        cells_changed=cells_changed,
        cells_unparseable=cells_unparseable,
        cells_total=cells_total,
        columns_processed=columns_processed,
    )
def _replace_options(options: StandardizeOptions, **kwargs: Any) -> StandardizeOptions:
    """Return a copy of *options* with the given fields overridden.

    The streaming path uses this to shrink the audit budget from one chunk
    to the next while leaving the caller's options object untouched.  The
    copy is shallow: field values that are not overridden are shared with
    the original instance.

    Args:
        options: The dataclass instance to clone.
        **kwargs: Field names mapped to their replacement values.

    Returns:
        A new instance of the same dataclass type.
    """
    import dataclasses

    return dataclasses.replace(options, **kwargs)

View File

@@ -18,6 +18,207 @@ from loguru import logger
# Encoding detection
# ---------------------------------------------------------------------------
# charset-normalizer often picks an Eastern-European code page (cp1250,
# cp1258) for byte-equivalent Western content, mac_iceland over mac_roman
# in the Mac family, and shift_jis_2004 for short Cyrillic samples. The
# arbiter below resolves these specific false positives without
# overruling the detector when its top pick is genuinely the right
# answer.
#
# Mapping is *over-picked encoding* → *more plausible substitutes (in
# priority order)*. We accept either the candidate's primary encoding
# name or any of its ``could_be_from_charset`` aliases.
_ENCODING_FALLBACKS: dict[str, tuple[str, ...]] = {
    "cp1250": ("cp1252", "latin_1", "iso8859_15", "iso8859_2"),
    "cp1258": ("iso8859_2", "cp1250", "cp1252"),
    "mac_iceland": ("mac_roman",),
    "shift_jis_2004": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
    "shift_jisx0213": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
}


def _arbitrate_charset_match(matches) -> Optional[str]:
    """Choose the most plausible encoding from a ranked match list.

    *matches* is iterated best-first; every element must expose
    ``encoding``, ``chaos``, ``coherence`` and ``could_be_from_charset``.

    The top pick is returned (lower-cased) unless it is one of the
    encodings recorded as over-picked in ``_ENCODING_FALLBACKS``.  For
    those, two checks run in order:

    1. **Tie check.**  Candidates directly below the top pick that share
       its exact *chaos* and *coherence* scores are treated as
       byte-equivalent decodings.  The first fallback (in the order
       listed in ``_ENCODING_FALLBACKS``) that appears among the tied
       candidates' names or aliases wins.
    2. **Alias check.**  Otherwise, the first fallback that appears in
       the top pick's own alias list wins, provided the pair is not
       flagged by ``_same_byte_family`` (such pairs differ bytewise, so
       swapping them would be wrong).

    When neither check fires the top pick is kept.

    Returns:
        The chosen encoding name, or ``None`` when *matches* is empty.
    """
    candidates = list(matches)
    if not candidates:
        return None

    leader = candidates[0]
    leader_enc = leader.encoding.lower()
    substitutes = _ENCODING_FALLBACKS.get(leader_enc)
    if not substitutes:
        # Not a known over-pick: trust the detector.
        return leader_enc

    # --- 1. Tie check -------------------------------------------------
    leader_chaos = getattr(leader, "chaos", None)
    leader_coherence = getattr(leader, "coherence", None)
    tied: list = []
    for rival in candidates[1:]:
        if rival.chaos != leader_chaos or rival.coherence != leader_coherence:
            # The list is ranked, so the first non-tie ends the tied run.
            break
        tied.append(rival)

    tied_names = [
        {rival.encoding.lower(), *(alias.lower() for alias in rival.could_be_from_charset)}
        for rival in tied
    ]
    for preferred in substitutes:
        if any(preferred in names for names in tied_names):
            return preferred

    # --- 2. Alias check -----------------------------------------------
    leader_aliases = {alias.lower() for alias in leader.could_be_from_charset}
    for preferred in substitutes:
        if preferred not in leader_aliases:
            continue
        if _same_byte_family(leader_enc, preferred):
            # Listed as a sibling, but byte-distinct: refuse the swap.
            continue
        return preferred

    return leader_enc
# ---------------------------------------------------------------------------
# Language-aware probe: distinguish KOI8-R from Shift_JIS, ISO-8859-2 from
# cp1258 when charset-normalizer cannot.
# ---------------------------------------------------------------------------
# Unicode ranges that uniquely identify each language family. A candidate
# encoding "wins" the probe when its decoding of the raw bytes produces
# the highest *coverage ratio* (non-ASCII letters in the target range
# divided by total non-ASCII letters).
_CYRILLIC_RANGE = (0x0400, 0x04FF)
_EE_LATIN_LETTERS = frozenset(
    "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"  # Polish
    "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"  # Czech
    "áéíóöőúüűÁÉÍÓÖŐÚÜŰ"  # Hungarian
    "äčďéíĺľňóôŕšťúýžÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ"  # Slovak
)

# Encodings to probe when charset-normalizer fingerprints the file as
# Japanese (a frequent misfire on short Cyrillic samples whose byte
# patterns happen to coincide with shift_jis lead bytes).
_CYRILLIC_PROBES: tuple[str, ...] = ("koi8_r", "cp1251", "iso8859_5")
_EE_LATIN_PROBES: tuple[str, ...] = ("iso8859_2", "cp1250")


def _cyrillic_coverage(text: str) -> float:
    """Return the share of non-ASCII characters in *text* that are Cyrillic letters.

    The denominator is every character at or above U+0080, not just
    letters, so a decoding that yields mostly symbols with a few stray
    Cyrillic glyphs scores low while genuine Cyrillic text scores high.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when *text* has no non-ASCII
        characters at all.
    """
    first, last = _CYRILLIC_RANGE
    total = 0
    cyrillic = 0
    for ch in text:
        code = ord(ch)
        if code < 0x80:
            continue
        total += 1
        if first <= code <= last and ch.isalpha():
            cyrillic += 1
    return cyrillic / total if total else 0.0
def _ee_latin_coverage(text: str) -> float:
    """Return the share of non-ASCII characters in *text* found in ``_EE_LATIN_LETTERS``.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when *text* has no non-ASCII
        characters at all.
    """
    high_chars = [ch for ch in text if ord(ch) >= 0x80]
    if not high_chars:
        return 0.0
    hits = sum(ch in _EE_LATIN_LETTERS for ch in high_chars)
    return hits / len(high_chars)
def _probe_language(raw: bytes, top_enc: str) -> Optional[str]:
    """Look for a language-coherent decoding when the detector's pick is suspect.

    Only two families of top pick trigger a probe:

    * Shift-JIS variants: *raw* is re-decoded with the Cyrillic probe
      encodings and scored by Cyrillic coverage (threshold 0.70).
    * ``cp1258`` / ``iso8859_16``: *raw* is re-decoded with the
      Eastern-European Latin probe encodings and scored by EE-Latin
      coverage (threshold 0.50).

    A probe wins only if its coverage reaches the threshold **and** beats
    the coverage of the top pick's own decoding by at least 0.30.  The
    margin keeps genuine Japanese / Vietnamese content from being
    overridden because a few target-script glyphs appear by coincidence.

    Args:
        raw: The sampled file bytes.
        top_enc: The encoding chosen so far (lower-case codec name).

    Returns:
        The winning probe encoding, or ``None`` to keep *top_enc*.
    """
    if top_enc in {"shift_jis_2004", "shift_jisx0213", "shift_jis", "cp932"}:
        probes, scorer, threshold = _CYRILLIC_PROBES, _cyrillic_coverage, 0.70
    elif top_enc in {"cp1258", "iso8859_16"}:
        probes, scorer, threshold = _EE_LATIN_PROBES, _ee_latin_coverage, 0.50
    else:
        return None

    # Baseline: how well does the top pick itself cover the target script?
    # An unknown codec name counts as zero coverage.
    try:
        baseline = scorer(raw.decode(top_enc, errors="replace"))
    except LookupError:
        baseline = 0.0

    winner: Optional[str] = None
    winner_score = 0.0
    for name in probes:
        try:
            # Strict decode: a probe that cannot decode every byte is out.
            text = raw.decode(name)
        except (UnicodeDecodeError, LookupError):
            continue
        coverage = scorer(text)
        if coverage > winner_score:
            winner, winner_score = name, coverage

    if not winner:
        return None
    if winner_score < threshold or winner_score < baseline + 0.30:
        return None
    return winner
# Encoding pairs that must never be swapped for one another by the
# in-alias substitution in ``_arbitrate_charset_match``.  The members of
# each pair differ bytewise for accented letters (e.g. cp1250 vs
# iso8859_2), even when charset-normalizer lists them as siblings.
_SAME_FAMILY: set[frozenset[str]] = {
    frozenset({"cp1250", "iso8859_2"}),
    frozenset({"mac_iceland", "mac_turkish"}),
    frozenset({"shift_jis_2004", "shift_jisx0213"}),
}


def _same_byte_family(a: str, b: str) -> bool:
    """Return True when the unordered pair ``{a, b}`` is listed in ``_SAME_FAMILY``.

    The lookup is order-insensitive.  Passing the same name twice yields
    a one-element set, which is never listed, so the result is False.
    """
    pair = frozenset((a, b))
    return pair in _SAME_FAMILY
def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
"""Detect file encoding by reading the first *sample_bytes*.
@@ -34,8 +235,21 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
# Check BOM first
if raw[:3] == b"\xef\xbb\xbf":
return "utf-8-sig"
if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
# A "lying" BOM: file claims utf-8 but the body bytes don't decode
# as utf-8. Fall through to charset detection on the BOM-stripped
# body so we don't hand back utf-8-sig that will then fail to read.
body = raw[3:]
try:
body.decode("utf-8")
return "utf-8-sig"
except UnicodeDecodeError:
logger.debug(
"detect_encoding({}): file has UTF-8 BOM but body is not "
"valid UTF-8 — falling through to charset detection",
Path(path).name,
)
raw = body
elif raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
return "utf-16"
# Strict UTF-8 wins. charset_normalizer fingerprints small files
@@ -48,11 +262,21 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
except UnicodeDecodeError:
pass
result = from_bytes(raw).best()
if result is None:
matches = from_bytes(raw)
enc = _arbitrate_charset_match(matches)
if enc is None:
return "utf-8"
enc = result.encoding.lower()
# Normalise common aliases
# Language-aware probe runs after the arbiter so we only spend cycles
# on the cases where charset-normalizer fingerprinted the bytes as a
# codepage that doesn't match the apparent script. Returns a better
# encoding only when the probe finds a high-coverage match.
probed = _probe_language(raw, enc)
if probed:
logger.debug(
"detect_encoding({}): language probe overrode {}{}",
Path(path).name, enc, probed,
)
enc = probed
if enc in ("ascii", "us-ascii"):
enc = "utf-8"
return enc

780
src/core/missing.py Normal file
View File

@@ -0,0 +1,780 @@
"""DataTools Missing Value Handler.
Detects disguised nulls, profiles missingness per column, and applies
imputation or drop strategies with a full audit trail.
Public API
----------
Per-column helpers:
is_missing_like(value, sentinels) -> bool
detect_sentinels(series, sentinels) -> dict[str, int]
DataFrame entry points:
profile_missing(df, options) -> MissingProfile
handle_missing(df, options) -> MissingResult
Types:
MissingOptions, MissingProfile, MissingResult, ColumnReport, Strategy
Presets (PRESETS):
"detect-only" — only standardize sentinels to NaN, no fill / drop.
"safe-fill" — sentinels → NaN, then numeric=median, categorical=mode.
"drop-incomplete" — sentinels → NaN, then drop rows with any missing.
Use cases covered
-----------------
1. Disguised nulls in survey / CRM exports ("N/A", "n/a", "-", "(blank)",
"TBD", whitespace-only, "?", "null", "NaN").
2. Per-column profile for QA reports (counts, %, top sentinel hit).
3. Row-drop with threshold (e.g., drop rows missing >50% of columns).
4. Column-drop with threshold (e.g., drop columns missing >80%).
5. Numeric imputation (mean / median / constant), categorical (mode /
constant), time-series (ffill / bfill).
6. Per-column overrides — different strategy per column in the same run.
Non-goals
---------
- ML-based imputation (KNN / iterative) — out of scope for v1.
- Group-wise imputation by another column — deferred until a real use case.
"""
from __future__ import annotations
import json
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Iterable, Literal, Optional
import numpy as np
import pandas as pd
from loguru import logger
from pandas.api import types as pdtypes
from .errors import ConfigError, InputValidationError, ensure_choice, ensure_dataframe
# ---------------------------------------------------------------------------
# Sentinel detection
# ---------------------------------------------------------------------------
# Default disguised-null sentinels. Matched case-insensitively after a
# strip(). Whitespace-only strings ("", " ") are always treated as
# missing regardless of this list.
DEFAULT_SENTINELS: tuple[str, ...] = (
    "n/a", "na", "n.a.", "n.a",
    "null", "none", "nil",
    "nan",
    "-", "--", "---",
    "?", "??",
    ".",
    "tbd", "tba",
    "unknown", "unk",
    "(blank)", "(none)", "(empty)", "(null)",
    "#n/a", "#na", "#null!", "#value!",
    "missing",
)

_WHITESPACE_ONLY_RE = re.compile(r"^\s*$")

# Case-folded view of DEFAULT_SENTINELS, built once so the common
# default-argument call does not rebuild a set for every cell.
_DEFAULT_SENTINEL_KEYS: frozenset[str] = frozenset(
    s.casefold() for s in DEFAULT_SENTINELS
)


def is_missing_like(value: Any, sentinels: Iterable[str] = DEFAULT_SENTINELS) -> bool:
    """Return True when *value* should be treated as missing.

    Three kinds of value count as missing:

    * any scalar that :func:`pandas.isna` recognises as null — ``None``,
      float NaN (including NumPy floats of any width), ``pd.NA``,
      ``pd.NaT`` and NumPy ``NaT`` values;
    * a string that is empty or whitespace-only;
    * a string equal to one of *sentinels* after ``strip()`` and
      ``casefold()``.

    Non-string, non-null values (numbers, containers, ...) are never
    missing.

    Args:
        value: The cell value to test.
        sentinels: Disguised-null strings to match, case-insensitively.

    Returns:
        ``True`` if *value* is missing-like, else ``False``.
    """
    if not isinstance(value, str):
        # ``is_scalar`` guards against containers, for which ``pd.isna``
        # returns an element-wise array instead of a single bool.
        return bool(pdtypes.is_scalar(value) and pd.isna(value))

    if _WHITESPACE_ONLY_RE.match(value):
        return True

    if sentinels is DEFAULT_SENTINELS:
        keys = _DEFAULT_SENTINEL_KEYS
    else:
        keys = {s.casefold() for s in sentinels}
    return value.strip().casefold() in keys
def detect_sentinels(
    series: pd.Series,
    sentinels: Iterable[str] = DEFAULT_SENTINELS,
) -> dict[str, int]:
    """Count the disguised-null strings present in *series*.

    Only string cells are inspected; ``None``, NaN and every other
    non-string value is skipped (real nulls are already missing).
    Whitespace-only strings are grouped under the literal key
    ``"(whitespace)"``.  All other hits are keyed by the sentinel as it
    is spelled in *sentinels*, matched after ``strip()`` and
    ``casefold()``.

    Returns:
        ``{label: count}`` in order of first appearance; empty when
        nothing matched.
    """
    label_by_key = {s.casefold(): s for s in sentinels}
    tally: dict[str, int] = {}
    for cell in series:
        if not isinstance(cell, str):
            continue
        if _WHITESPACE_ONLY_RE.match(cell):
            label = "(whitespace)"
        else:
            folded = cell.strip().casefold()
            if folded not in label_by_key:
                continue
            label = label_by_key[folded]
        tally[label] = tally.get(label, 0) + 1
    return tally
# ---------------------------------------------------------------------------
# Strategies / options / results
# ---------------------------------------------------------------------------
# Every strategy name accepted by ``MissingOptions``.  Drop strategies are
# threshold-based (see ``row_drop_threshold`` / ``col_drop_threshold``).
Strategy = Literal[
    "none",  # detect-only; do not fill or drop.
    "drop_row",  # drop rows whose missing fraction exceeds row_drop_threshold.
    "drop_col",  # drop columns whose missing fraction exceeds col_drop_threshold.
    "drop_both",  # apply drop_col first, then drop_row on what remains.
    "mean",  # numeric only.
    "median",  # numeric only.
    "mode",  # any dtype.
    "constant",  # fill with options.fill_value.
    "ffill",
    "bfill",
    "interpolate",  # linear interpolation, numeric only.
]

# Strategies that only make sense for numeric columns; non-numeric columns
# are rerouted to ``categorical_strategy`` by ``_resolve_strategy``.
_NUMERIC_STRATEGIES: frozenset[str] = frozenset(
    {"mean", "median", "interpolate"},
)

# Strategies that fill cells in place (handled by ``_apply_fill``).
_FILL_STRATEGIES: frozenset[str] = frozenset(
    {"mean", "median", "mode", "constant", "ffill", "bfill", "interpolate"},
)

# Strategies that remove rows and/or columns (handled by ``_apply_drops``).
_DROP_STRATEGIES: frozenset[str] = frozenset(
    {"drop_row", "drop_col", "drop_both"},
)
# Named option bundles.  Each value is a keyword-argument dict that
# ``MissingOptions.from_preset`` passes straight to the constructor, so
# every key must be a ``MissingOptions`` field name.
PRESETS: dict[str, dict[str, Any]] = {
    "detect-only": {
        "standardize_sentinels": True,
        "strategy": "none",
    },
    "safe-fill": {
        "standardize_sentinels": True,
        "strategy": "median",
        "categorical_strategy": "mode",
    },
    "drop-incomplete": {
        "standardize_sentinels": True,
        "strategy": "drop_row",
        # Strict-greater semantics: 0.0 → drop a row as soon as any
        # selected column is missing.
        "row_drop_threshold": 0.0,
    },
}
@dataclass
class MissingOptions:
    """Toggles for missing-value detection and handling.

    Defaults match the ``detect-only`` preset: sentinels are standardized
    to NaN, but no rows are dropped and no values are filled.
    """

    # Detection
    sentinels: list[str] = field(default_factory=lambda: list(DEFAULT_SENTINELS))
    standardize_sentinels: bool = True

    # Strategy applied to all selected columns.  ``categorical_strategy``
    # is a fallback used by numeric-only strategies (mean/median/interpolate)
    # when a selected column is non-numeric — rather than crash, fall back
    # to a reasonable categorical strategy.
    strategy: Strategy = "none"
    categorical_strategy: Strategy = "mode"

    # Per-column overrides take precedence over ``strategy`` / preset.
    column_strategies: dict[str, Strategy] = field(default_factory=dict)

    # Constant-fill payload.  Either a scalar (applied to every selected
    # column) or a per-column dict for differentiated fills.
    fill_value: Any = None
    column_fill_values: dict[str, Any] = field(default_factory=dict)

    # Drop thresholds (0.0 .. 1.0).  A row/column is dropped when its
    # missing fraction is *strictly greater than* the threshold.  So:
    #   1.0 (default) — never drop (no fraction exceeds 100%)
    #   0.5           — drop when more than half is missing
    #   0.0           — drop on any missing at all
    row_drop_threshold: float = 1.0
    col_drop_threshold: float = 1.0

    # Scope control
    columns: Optional[list[str]] = None
    skip_columns: list[str] = field(default_factory=list)

    @classmethod
    def from_preset(cls, name: str) -> MissingOptions:
        """Build options from a named entry of ``PRESETS``.

        Raises:
            ConfigError: If *name* is not a known preset.
        """
        if name not in PRESETS:
            raise ConfigError(
                f"Unknown preset '{name}'",
                operation="MissingOptions.from_preset",
                suggestion=f"Available: {sorted(PRESETS)}",
            )
        return cls(**PRESETS[name])

    @classmethod
    def from_dict(cls, data: dict) -> MissingOptions:
        """Build options from a plain dict, silently ignoring unknown keys."""
        known = set(cls.__dataclass_fields__)
        kwargs = {k: v for k, v in data.items() if k in known}
        return cls(**kwargs)

    def to_dict(self) -> dict:
        """Return the options as a plain (deep-copied) dict."""
        return asdict(self)

    def to_file(self, path: str | Path) -> Path:
        """Write the options to *path* as indented JSON and return the path.

        Values that JSON cannot encode natively are stringified
        (``default=str``).  The file is always written as UTF-8 so the
        result does not depend on the platform's default encoding.
        """
        out = Path(path)
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_file(cls, path: str | Path) -> MissingOptions:
        """Load options from a JSON file written by :meth:`to_file` or by hand.

        The file is read as UTF-8 (the JSON interchange encoding) rather
        than the platform default, so hand-edited configs containing
        non-ASCII sentinels load identically on every OS.
        """
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))

    def validate(self) -> None:
        """Fail fast on incoherent option combinations.

        Checks that every strategy name is a recognised choice and that
        both drop thresholds are numbers within ``[0.0, 1.0]``.

        Raises:
            ConfigError: If a drop threshold is out of range or is not
                comparable to a number (e.g. ``None`` or a string loaded
                from a config file).  Invalid strategy names are
                reported by ``ensure_choice``.
        """
        choices = (
            "none", "drop_row", "drop_col", "drop_both",
            "mean", "median", "mode", "constant",
            "ffill", "bfill", "interpolate",
        )
        where = "MissingOptions.validate"
        ensure_choice(self.strategy, name="strategy", choices=choices,
                      function=where)
        ensure_choice(self.categorical_strategy, name="categorical_strategy",
                      choices=choices, function=where)
        for col, strat in self.column_strategies.items():
            ensure_choice(strat, name=f"column_strategies[{col!r}]",
                          choices=choices, function=where)

        for name in ("row_drop_threshold", "col_drop_threshold"):
            value = getattr(self, name)
            try:
                in_range = bool(0.0 <= value <= 1.0)
            except TypeError:
                # Non-numeric threshold: report it as a config problem
                # instead of leaking a bare TypeError to the caller.
                in_range = False
            if not in_range:
                raise ConfigError(
                    f"{name} must be in [0.0, 1.0], got {value!r}",
                    operation=where,
                )
@dataclass
class ColumnReport:
    """Per-column missingness snapshot."""

    column: str  # column label, stringified
    dtype: str  # pandas dtype of the column, stringified
    total: int  # number of cells in the column
    missing: int  # NaN cells (after sentinel standardization if enabled)
    missing_pct: float  # 0.0 .. 100.0
    sentinels_found: dict[str, int]  # disguised nulls hit, pre-standardization

    @property
    def has_missing(self) -> bool:
        """True when at least one cell in the column counts as missing."""
        return self.missing > 0
@dataclass
class MissingProfile:
    """Whole-DataFrame missingness profile."""

    columns: list[ColumnReport]
    rows_total: int
    cells_total: int
    cells_missing: int
    rows_with_any_missing: int
    rows_complete: int

    @property
    def cells_missing_pct(self) -> float:
        """Missing cells as a percentage of all profiled cells (0.0 if none)."""
        if not self.cells_total:
            return 0.0
        return self.cells_missing / self.cells_total * 100.0

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per profiled column, suitable for the GUI / CLI.

        Columns: ``column``, ``dtype``, ``missing``, ``missing_pct``
        (rounded to two decimals), ``top_sentinel`` / ``top_sentinel_count``
        (the most frequent disguised null, ``""`` / ``0`` when there is
        none) and ``sentinel_total``.

        The column schema is fixed even when the profile holds no
        columns, so callers can select by name without first checking
        for an empty result.
        """
        schema = [
            "column", "dtype", "missing", "missing_pct",
            "top_sentinel", "top_sentinel_count", "sentinel_total",
        ]
        rows = []
        for report in self.columns:
            top_label, top_count = max(
                report.sentinels_found.items(),
                key=lambda kv: kv[1],
                default=("", 0),
            )
            rows.append({
                "column": report.column,
                "dtype": report.dtype,
                "missing": report.missing,
                "missing_pct": round(report.missing_pct, 2),
                "top_sentinel": top_label,
                "top_sentinel_count": top_count,
                "sentinel_total": sum(report.sentinels_found.values()),
            })
        return pd.DataFrame(rows, columns=schema)
@dataclass
class MissingResult:
    """Output of ``handle_missing``."""

    handled_df: pd.DataFrame  # the processed copy; the input frame is untouched
    profile_before: MissingProfile  # profile of the input frame
    profile_after: MissingProfile  # profile of ``handled_df``
    changes: pd.DataFrame  # cols: row, column, old, new, action
    rows_dropped: int
    columns_dropped: list[str]
    cells_filled: int
    sentinels_standardized: int  # sentinel cells replaced with NaN
    columns_processed: list[str]  # selected columns that survived column drops
    strategy_per_column: dict[str, Strategy]  # effective strategy per processed column
# ---------------------------------------------------------------------------
# Profiling
# ---------------------------------------------------------------------------
def _select_columns(df: pd.DataFrame, options: MissingOptions) -> list[str]:
    """Resolve the list of columns the run should operate on.

    With ``options.columns`` unset every column of *df* is a candidate
    (missing-value handling applies to any dtype).  Otherwise the
    explicit list is used, in the order given.  ``options.skip_columns``
    is then removed from the candidates.

    Raises:
        InputValidationError: If ``options.columns`` names a column that
            is not present in *df*.
    """
    requested = options.columns
    if requested is None:
        candidates = list(df.columns)
    else:
        not_found = [name for name in requested if name not in df.columns]
        if not_found:
            raise InputValidationError(
                f"Columns not found in input: {not_found}",
                operation="handle_missing",
                suggestion=f"Available: {list(df.columns)}",
            )
        candidates = requested

    excluded = set(options.skip_columns)
    return [name for name in candidates if name not in excluded]
def _standardize_sentinels(
    df: pd.DataFrame,
    columns: list[str],
    sentinels: Iterable[str],
) -> tuple[pd.DataFrame, list[dict[str, Any]], int]:
    """Replace disguised-null strings with NaN in the selected columns.

    Works on a copy of *df*.  Only object / string columns are scanned,
    and within them only string cells can match: whitespace-only strings
    always do, other strings match when they equal a sentinel after
    ``strip()`` and ``casefold()``.

    Returns:
        ``(new_df, change_records, total_replacements)``.  Each record
        has the keys ``row`` (0-based position within the column),
        ``column``, ``old``, ``new`` (always NaN) and ``action``
        (``"standardize:whitespace"`` or ``"standardize:<sentinel>"``).
    """
    result = df.copy()
    label_by_key = {s.casefold(): s for s in sentinels}
    audit: list[dict[str, Any]] = []

    for col in columns:
        cells = result[col]
        # Numeric / datetime columns cannot hold string sentinels.
        if not (pdtypes.is_object_dtype(cells) or pdtypes.is_string_dtype(cells)):
            continue

        audited_before = len(audit)
        rebuilt: list[Any] = []
        for position, cell in enumerate(cells.tolist()):
            action = None
            if isinstance(cell, str):
                if _WHITESPACE_ONLY_RE.match(cell):
                    action = "standardize:whitespace"
                else:
                    folded = cell.strip().casefold()
                    if folded in label_by_key:
                        action = f"standardize:{label_by_key[folded]}"

            if action is None:
                # Real nulls, non-strings and ordinary text pass through.
                rebuilt.append(cell)
                continue

            audit.append({
                "row": position,
                "column": col,
                "old": cell,
                "new": np.nan,
                "action": action,
            })
            rebuilt.append(np.nan)

        # Reassign only when something changed, so untouched columns
        # keep their original values and dtype.
        if len(audit) > audited_before:
            result[col] = rebuilt

    return result, audit, len(audit)
def profile_missing(
    df: pd.DataFrame,
    options: Optional[MissingOptions] = None,
) -> MissingProfile:
    """Compute a missingness profile for the selected columns of *df*.

    This is a read-only inspection: *df* is never modified.  When
    ``options.standardize_sentinels`` is on, disguised-null strings are
    counted as missing alongside real NaN cells, so the profile matches
    what the data will look like after standardization.  Each column
    report also records which sentinel strings were hit and how often.

    Args:
        df: The frame to inspect.
        options: Scope and sentinel settings; defaults to
            ``MissingOptions()``.

    Returns:
        A ``MissingProfile`` with per-column reports and row / cell totals.
    """
    ensure_dataframe(df, function="profile_missing")
    options = options or MissingOptions()
    scope = _select_columns(df, options)
    active_sentinels = options.sentinels if options.standardize_sentinels else []

    # ---- per-column reports ------------------------------------------
    reports: list[ColumnReport] = []
    for col in scope:
        cells = df[col]
        hits = detect_sentinels(cells, active_sentinels) if active_sentinels else {}
        n_cells = len(cells)
        # Effective missing = real NaN + sentinel hits.
        n_missing = int(cells.isna().sum()) + sum(hits.values())
        reports.append(ColumnReport(
            column=str(col),
            dtype=str(cells.dtype),
            total=n_cells,
            missing=n_missing,
            missing_pct=(n_missing / n_cells * 100.0) if n_cells else 0.0,
            sentinels_found=hits,
        ))

    # ---- row / cell totals -------------------------------------------
    if scope and len(df):
        if active_sentinels:
            keys = {s.casefold() for s in active_sentinels}

            def looks_like_sentinel(cell: Any) -> bool:
                if not isinstance(cell, str):
                    return False
                return (
                    bool(_WHITESPACE_ONLY_RE.match(cell))
                    or cell.strip().casefold() in keys
                )

            mask = pd.DataFrame(index=df.index)
            for col in scope:
                cells = df[col]
                flags = cells.isna()
                if pdtypes.is_object_dtype(cells) or pdtypes.is_string_dtype(cells):
                    flags = flags | cells.apply(looks_like_sentinel)
                mask[col] = flags
        else:
            mask = df[scope].isna()

        row_has_missing = mask.any(axis=1)
        rows_with_any = int(row_has_missing.sum())
        rows_complete = int((~row_has_missing).sum())
        cells_missing = int(mask.values.sum())
        cells_total = int(mask.size)
    else:
        # No selected columns or no rows: nothing can be missing.
        rows_with_any = 0
        rows_complete = len(df)
        cells_missing = 0
        cells_total = len(df) * len(scope)

    return MissingProfile(
        columns=reports,
        rows_total=len(df),
        cells_total=cells_total,
        cells_missing=cells_missing,
        rows_with_any_missing=rows_with_any,
        rows_complete=rows_complete,
    )
# ---------------------------------------------------------------------------
# Imputation
# ---------------------------------------------------------------------------
def _resolve_strategy(
    col: str,
    series: pd.Series,
    options: MissingOptions,
) -> Strategy:
    """Return the effective strategy for *col*.

    Resolution order: per-column override in
    ``options.column_strategies``, else the global ``options.strategy``.

    If the result is numeric-only (mean / median / interpolate) but
    *series* is not numeric, ``options.categorical_strategy`` is used
    instead so the run does not crash halfway through.  Should that
    fallback itself be numeric-only, applying it to a non-numeric column
    would fail just the same, so the column is left unfilled
    (``"none"``) and a warning is logged.

    Args:
        col: Column label, used for the override lookup and log messages.
        series: The column's data; only its dtype is inspected.
        options: Strategy configuration.

    Returns:
        The strategy name to apply to this column.
    """
    strat: Strategy = options.column_strategies.get(col, options.strategy)
    if strat not in _NUMERIC_STRATEGIES or pdtypes.is_numeric_dtype(series):
        return strat

    fallback: Strategy = options.categorical_strategy
    if fallback in _NUMERIC_STRATEGIES:
        logger.warning(
            "Column {!r}: strategy {!r} and categorical fallback {!r} both "
            "require numeric dtype (got {}); leaving column unfilled",
            col, strat, fallback, series.dtype,
        )
        return "none"

    logger.debug(
        "Column {!r}: strategy {!r} requires numeric dtype "
        "(got {}); falling back to {!r}",
        col, strat, series.dtype, fallback,
    )
    return fallback
# Marker returned by ``_fill_value_for`` for strategies that have no single
# scalar fill.  Compared by identity, so it can never collide with a
# legitimate fill value (including ``None``).
_NO_SCALAR = object()


def _fill_value_for(
    col: str,
    series: pd.Series,
    strategy: Strategy,
    options: MissingOptions,
) -> Any:
    """Compute the single value used to fill *series* under *strategy*.

    * ``mean`` / ``median`` — the corresponding statistic of *series*.
    * ``mode`` — the first mode ignoring nulls, or ``None`` if the series
      has no non-null values.
    * ``constant`` — ``options.column_fill_values[col]`` when present,
      otherwise ``options.fill_value``.

    Returns:
        The fill scalar, or the ``_NO_SCALAR`` marker for any other
        strategy (ffill / bfill / interpolate fill by themselves).
    """
    if strategy == "constant":
        return options.column_fill_values.get(col, options.fill_value)

    if strategy == "mode":
        candidates = series.mode(dropna=True)
        return candidates.iloc[0] if len(candidates) else None

    # Bound methods are looked up lazily so nothing is computed unless
    # the strategy actually asks for it.
    reducers = {"mean": series.mean, "median": series.median}
    reducer = reducers.get(strategy)
    if reducer is None:
        return _NO_SCALAR
    return reducer()
def _apply_fill(
    df: pd.DataFrame,
    col: str,
    strategy: Strategy,
    options: MissingOptions,
    records: list[dict[str, Any]],
) -> int:
    """Fill the missing cells of one column in place.

    *df* is mutated: ``df[col]`` is replaced by the filled series.  One
    audit record (``row``, ``column``, ``old``, ``new``, ``action``) is
    appended to *records* for every cell that actually received a value.
    Cells that remain NaN after the fill — e.g. a leading NaN under
    ``ffill`` — are neither audited nor counted.

    Nothing is changed (and 0 is returned) when the column has no missing
    cells, when ``interpolate`` is requested for a non-numeric column, or
    when the strategy yields no usable fill value.

    Returns:
        The number of cells filled.
    """
    original = df[col]
    gaps = original.isna()
    if not gaps.any():
        return 0

    if strategy in ("ffill", "bfill"):
        filled = original.ffill() if strategy == "ffill" else original.bfill()
    elif strategy == "interpolate":
        # Interpolation is numeric-only; quietly skip anything else
        # rather than surface a confusing TypeError.
        if not pdtypes.is_numeric_dtype(original):
            return 0
        filled = original.interpolate(method="linear", limit_direction="both")
    else:
        # Avoid numpy's "Mean of empty slice" RuntimeWarning by not
        # computing mean/median on an all-NaN numeric column.
        wants_average = strategy in {"mean", "median"}
        if (
            wants_average
            and pdtypes.is_numeric_dtype(original)
            and original.dropna().empty
        ):
            return 0

        scalar = _fill_value_for(col, original, strategy, options)
        if scalar is _NO_SCALAR:
            return 0
        if scalar is None or (isinstance(scalar, float) and pd.isna(scalar)):
            logger.debug(
                "Column {!r}: strategy {!r} produced no fill value (all-NaN?)",
                col, strategy,
            )
            return 0

        # Opt into pandas' future no-silent-downcast behaviour to avoid
        # the FutureWarning fillna emits for object columns, then infer
        # the resulting dtype explicitly.
        with pd.option_context("future.no_silent_downcasting", True):
            filled = original.fillna(scalar)
            if pdtypes.is_object_dtype(original):
                filled = filled.infer_objects(copy=False)

    filled_count = 0
    for position in np.flatnonzero(gaps.values):
        replacement = filled.iloc[position]
        if pd.isna(replacement):
            # Still missing after the fill: not a change.
            continue
        records.append({
            "row": int(position),
            "column": col,
            "old": original.iloc[position],
            "new": replacement,
            "action": f"fill:{strategy}",
        })
        filled_count += 1

    df[col] = filled
    return filled_count
def _apply_drops(
    df: pd.DataFrame,
    columns: list[str],
    strategy: Strategy,
    options: MissingOptions,
    records: list[dict[str, Any]],
) -> tuple[pd.DataFrame, int, list[str]]:
    """Drop rows / columns according to *strategy*.

    Drop semantics are the same for rows and columns: an item is dropped
    when its missing fraction (over the selected *columns*) is *strictly
    greater* than the threshold.  A threshold of 1.0 therefore never
    drops, 0.0 drops on any missing value.

    ``drop_col`` / ``drop_both`` run the column pass first; ``drop_row``
    / ``drop_both`` then run the row pass over the surviving selected
    columns.  After a row drop the index is reset to ``0..n-1``.

    One audit record is appended to *records* per dropped column
    (``row`` is ``-1``) and per dropped row (``row`` is the 0-based
    position before the drop; ``column`` is a comma-joined list of the
    selected columns that were missing in that row).  Column labels are
    stringified for the join, so non-string labels are supported.

    Args:
        df: Frame to filter; it is not modified in place.
        columns: Selected columns the missing fractions are computed over.
        strategy: One of the drop strategies; anything else is a no-op.
        options: Supplies ``col_drop_threshold`` and ``row_drop_threshold``.
        records: Audit list, extended in place.

    Returns:
        ``(new_df, rows_dropped, columns_dropped)``.
    """
    out = df
    rows_dropped = 0
    cols_dropped: list[str] = []

    if strategy in {"drop_col", "drop_both"} and columns:
        pct = out[columns].isna().mean()
        to_drop = [c for c, frac in pct.items() if frac > options.col_drop_threshold]
        if to_drop:
            for c in to_drop:
                records.append({
                    "row": -1,
                    "column": c,
                    "old": f"{int(out[c].isna().sum())} missing / {len(out)}",
                    "new": "",
                    "action": "drop_column",
                })
            out = out.drop(columns=to_drop)
            cols_dropped = to_drop
            columns = [c for c in columns if c not in to_drop]

    if strategy in {"drop_row", "drop_both"} and columns:
        # Compute the null mask once and reuse it for both the per-row
        # fraction and the audit, instead of re-reading cells one by one.
        na_mask = out[columns].isna()
        drop_mask = na_mask.mean(axis=1) > options.row_drop_threshold
        rows_dropped = int(drop_mask.sum())
        if rows_dropped:
            na_values = na_mask.values
            labels = [str(c) for c in na_mask.columns]
            for row_idx in np.flatnonzero(drop_mask.values):
                miss_cols = [
                    label
                    for label, is_na in zip(labels, na_values[row_idx])
                    if is_na
                ]
                records.append({
                    "row": int(row_idx),
                    "column": ",".join(miss_cols),
                    "old": "",
                    "new": "",
                    "action": "drop_row",
                })
            out = out.loc[~drop_mask].reset_index(drop=True)

    return out, rows_dropped, cols_dropped
def handle_missing(
    df: pd.DataFrame,
    options: Optional[MissingOptions] = None,
) -> MissingResult:
    """Detect and handle missing values in *df*.

    Pipeline placement (recommended, not enforced)
    ----------------------------------------------
    Run *after* the text cleaner (so NBSP-padded / zero-width-only
    cells are correctly detected as missing) and the format
    standardizer (so numeric imputation has numeric dtypes).  Run
    *before* the deduplicator (so dedup doesn't merge a row with a
    missing email into a row that has one).  See
    ``src.core.pipeline.SOFT_DEPENDENCIES``.

    Steps
    -----
    1. Standardize disguised-null sentinels to ``NaN`` (audit-logged).
    2. Apply column drops (if the global strategy includes ``drop_col``).
    3. Apply row drops (if the global strategy includes ``drop_row``).
    4. Apply per-column fills (mean/median/mode/constant/ffill/bfill/
       interpolate).  Per-column overrides win over the global strategy.

    The input DataFrame is not mutated.

    Note:
        ``row`` values in the audit table are 0-based positions in the
        frame as it stood when the record was written.  Because a row
        drop resets the index, fill records written afterwards refer to
        positions in the post-drop frame.

    Args:
        df: The frame to clean.
        options: Handling configuration; defaults to ``MissingOptions()``.
            It is validated before any work is done.

    Returns:
        A ``MissingResult`` with the cleaned frame, before / after
        profiles, the audit table and summary counters.
    """
    ensure_dataframe(df, function="handle_missing")
    options = options or MissingOptions()
    options.validate()

    profile_before = profile_missing(df, options)
    scope = _select_columns(df, options)
    logger.debug(
        "handle_missing: rows={}, cols={}, strategy={}, scope_cols={}",
        len(df), len(df.columns), options.strategy, len(scope),
    )

    audit: list[dict[str, Any]] = []

    # -- 1. Sentinel standardization ----------------------------------
    sentinel_hits = 0
    if options.standardize_sentinels and options.sentinels and scope:
        working, sentinel_audit, sentinel_hits = _standardize_sentinels(
            df, scope, options.sentinels,
        )
        audit.extend(sentinel_audit)
    else:
        working = df.copy()

    # -- 2 + 3. Drops (column pass first, then rows) ------------------
    dropped_rows = 0
    dropped_cols: list[str] = []
    if options.strategy in _DROP_STRATEGIES:
        working, dropped_rows, dropped_cols = _apply_drops(
            working, scope, options.strategy, options, audit,
        )
    # Dropped columns are no longer in scope for the fill pass.
    scope = [c for c in scope if c not in dropped_cols]

    # -- 4. Fills (per column) ----------------------------------------
    filled_total = 0
    effective: dict[str, Strategy] = {}
    for col in scope:
        chosen = _resolve_strategy(col, working[col], options)
        effective[col] = chosen
        if chosen in _FILL_STRATEGIES:
            filled_total += _apply_fill(working, col, chosen, options, audit)

    # -- Audit table + after-profile ----------------------------------
    changes = pd.DataFrame(
        audit, columns=["row", "column", "old", "new", "action"],
    )
    return MissingResult(
        handled_df=working,
        profile_before=profile_before,
        profile_after=profile_missing(working, options),
        changes=changes,
        rows_dropped=dropped_rows,
        columns_dropped=dropped_cols,
        cells_filled=filled_total,
        sentinels_standardized=sentinel_hits,
        columns_processed=scope,
        strategy_per_column=effective,
    )

501
src/core/pipeline.py Normal file
View File

@@ -0,0 +1,501 @@
"""DataTools Pipeline Runner.
Chain the cleaning tools (text-clean, format-standardize, missing,
column-map, dedup) into a single orchestrated workflow. The pipeline
threads the DataFrame from one step to the next; each step's options
are JSON-serializable so the entire pipeline can be saved, shared, and
re-run on next week's export.
Design tenets
-------------
* **Recommended, not forced.** The recommended order
(text → format → missing → dedup, with column-map fitting either
end depending on use case) is encoded in
:data:`SOFT_DEPENDENCIES`. The runner WARNS on out-of-order
pipelines but never refuses to execute them — the user owns their
workflow.
* **Each step is opt-in / opt-out.** ``Step.enabled = False`` skips
the step without removing it from the saved configuration.
* **Adapters are tiny.** Each tool is wrapped by a small adapter that
bridges its native ``Options`` / ``Result`` shape to the pipeline's
uniform ``(df, options_dict) → (new_df, summary)`` contract.
Public API
----------
Types:
Step, Pipeline, StepResult, PipelineResult
Functions:
run_pipeline(df, pipeline) -> PipelineResult
validate_pipeline(pipeline) -> list[str]
recommended_pipeline(*, include=None, options=None) -> Pipeline
Constants:
TOOL_ADAPTERS — name → adapter callable
TOOL_NAMES — sorted list of recognised tool names
SOFT_DEPENDENCIES — list of (earlier, later, reason) tuples
"""
from __future__ import annotations
import json
import time
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable, Iterable, Optional
import pandas as pd
from loguru import logger
from .errors import (
ConfigError,
DataToolsError,
InputValidationError,
ensure_choice,
ensure_dataframe,
)
# ---------------------------------------------------------------------------
# Tool adapters — bridge each tool's native API to the pipeline contract
# ---------------------------------------------------------------------------
def _adapter_text_clean(
    df: pd.DataFrame, options: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Run the text cleaner on *df* and reshape its result for the pipeline.

    A falsy *options* (empty dict / ``None``) selects ``CleanOptions()``
    defaults; anything else is parsed with ``CleanOptions.from_dict``.

    Returns ``(cleaned_df, summary)`` where *summary* carries the
    ``cells_total``, ``cells_changed`` and ``columns_processed`` counters
    reported by ``clean_dataframe``.
    """
    # Imported lazily so the tool module is only loaded when the step runs.
    from .text_clean import CleanOptions, clean_dataframe

    if options:
        clean_opts = CleanOptions.from_dict(options)
    else:
        clean_opts = CleanOptions()
    outcome = clean_dataframe(df, clean_opts)
    summary: dict[str, Any] = {
        "cells_total": outcome.cells_total,
        "cells_changed": outcome.cells_changed,
        "columns_processed": list(outcome.columns_processed),
    }
    return outcome.cleaned_df, summary
def _adapter_format_standardize(
    df: pd.DataFrame, options: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Run the format standardizer on *df* and reshape its result.

    A falsy *options* selects ``StandardizeOptions()`` defaults; otherwise
    the dict goes through ``StandardizeOptions.from_dict``.

    Returns ``(standardized_df, summary)``; *summary* holds
    ``cells_total``, ``cells_changed``, ``cells_unparseable`` and
    ``columns_processed`` as reported by ``standardize_dataframe``.
    """
    from .format_standardize import StandardizeOptions, standardize_dataframe

    std_opts = (
        StandardizeOptions() if not options
        else StandardizeOptions.from_dict(options)
    )
    outcome = standardize_dataframe(df, std_opts)
    summary: dict[str, Any] = {
        "cells_total": outcome.cells_total,
        "cells_changed": outcome.cells_changed,
        "cells_unparseable": outcome.cells_unparseable,
        "columns_processed": list(outcome.columns_processed),
    }
    return outcome.standardized_df, summary
def _adapter_missing(
    df: pd.DataFrame, options: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Run the missing-value handler on *df* and reshape its result.

    A falsy *options* selects ``MissingOptions()`` defaults; otherwise the
    dict goes through ``MissingOptions.from_dict``.

    Returns ``(handled_df, summary)``; *summary* holds
    ``sentinels_standardized``, ``cells_filled``, ``rows_dropped``,
    ``columns_dropped`` and ``columns_processed``.
    """
    from .missing import MissingOptions, handle_missing

    if not options:
        missing_opts = MissingOptions()
    else:
        missing_opts = MissingOptions.from_dict(options)
    outcome = handle_missing(df, missing_opts)
    summary: dict[str, Any] = {
        "sentinels_standardized": outcome.sentinels_standardized,
        "cells_filled": outcome.cells_filled,
        "rows_dropped": outcome.rows_dropped,
        "columns_dropped": list(outcome.columns_dropped),
        "columns_processed": list(outcome.columns_processed),
    }
    return outcome.handled_df, summary
def _adapter_column_map(
    df: pd.DataFrame, options: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Run the column mapper on *df* and reshape its result.

    A falsy *options* selects ``MapOptions()`` defaults; otherwise the dict
    goes through ``MapOptions.from_dict``.

    Returns ``(mapped_df, summary)``; *summary* holds ``columns_renamed``,
    ``columns_dropped``, ``columns_added``, ``coercion_failures`` and
    ``missing_required_targets``, with containers copied into plain
    ``list`` / ``dict`` objects.
    """
    from .column_mapper import MapOptions, map_columns

    map_opts = MapOptions.from_dict(options) if options else MapOptions()
    outcome = map_columns(df, map_opts)
    summary: dict[str, Any] = {
        "columns_renamed": outcome.columns_renamed,
        "columns_dropped": list(outcome.columns_dropped),
        "columns_added": list(outcome.columns_added),
        "coercion_failures": dict(outcome.coercion_failures),
        "missing_required_targets": list(outcome.missing_required_targets),
    }
    return outcome.mapped_df, summary
def _adapter_dedup(
    df: pd.DataFrame, options: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Run the deduplicator on *df* and reshape its result.

    Recognised option keys: ``survivor_rule`` (string or ``SurvivorRule``,
    default ``"first"``), ``strategies`` (list in the
    ``DeduplicationConfig`` JSON shape), ``merge`` (default ``False``) and
    ``date_column``.

    Returns ``(final_df, summary)``; *summary* holds ``input_rows``,
    ``output_rows``, ``duplicates_removed`` and ``groups``. When the
    deduplicator returns no DataFrame the input is passed through.

    Raises
    ------
    ConfigError
        If ``survivor_rule`` is a string that is not a valid
        ``SurvivorRule`` value.
    """
    from .dedup import deduplicate, SurvivorRule
    from .config import DeduplicationConfig

    settings = options or {}

    rule = settings.get("survivor_rule", "first")
    if isinstance(rule, str):
        try:
            rule = SurvivorRule(rule)
        except ValueError as e:
            raise ConfigError(
                f"Unknown survivor_rule {rule!r}",
                operation="pipeline.dedup",
                cause=e,
                suggestion=f"Valid: {[r.value for r in SurvivorRule]}",
            ) from e

    # Optional explicit strategies, using the same JSON shape as
    # DeduplicationConfig: ``[{"columns": [{"column": "phone",
    # "algorithm": "exact", "threshold": 100}, ...]}, ...]``.
    strategy_specs = settings.get("strategies")
    strategies = None
    if strategy_specs:
        strategies = DeduplicationConfig.from_dict(
            {"strategies": strategy_specs},
        ).to_strategies()

    outcome = deduplicate(
        df,
        strategies=strategies,
        survivor_rule=rule,
        merge=settings.get("merge", False),
        preview=False,  # pipeline always commits the dedup output
        date_column=settings.get("date_column"),
    )

    final = df if outcome.deduplicated_df is None else outcome.deduplicated_df
    rows_in, rows_out = len(df), len(final)
    groups = outcome.match_groups
    summary: dict[str, Any] = {
        "input_rows": rows_in,
        "output_rows": rows_out,
        "duplicates_removed": rows_in - rows_out,
        "groups": len(groups) if groups else 0,
    }
    return final, summary
# Registry of pipeline-capable tools: maps the name stored in ``Step.tool``
# to the adapter function that runs that tool. ``Step.__post_init__``
# rejects any tool name that is not a key here, and ``run_pipeline`` looks
# the adapter up by that name.
TOOL_ADAPTERS: dict[str, Callable[..., tuple[pd.DataFrame, dict[str, Any]]]] = {
    "text_clean": _adapter_text_clean,
    "format_standardize": _adapter_format_standardize,
    "missing": _adapter_missing,
    "column_map": _adapter_column_map,
    "dedup": _adapter_dedup,
}
# Sorted tool names, used for validation choices and error suggestions.
TOOL_NAMES: list[str] = sorted(TOOL_ADAPTERS)
# ---------------------------------------------------------------------------
# Soft dependencies
# ---------------------------------------------------------------------------
# Triples of (earlier, later, reason): running *earlier* before *later*
# is recommended, and *reason* is the human-readable explanation that
# ``validate_pipeline`` embeds in its warning text. A reversal triggers
# a WARNING — never a block. The user owns their workflow.
SOFT_DEPENDENCIES: list[tuple[str, str, str]] = [
    (
        "text_clean", "format_standardize",
        "format parsers (phone / currency / date) fail on smart-quote-"
        "contaminated or NBSP-padded input — clean text first",
    ),
    (
        "text_clean", "missing",
        "sentinel detection misses cells padded with NBSP / zero-width "
        "characters — clean text first",
    ),
    (
        "text_clean", "dedup",
        "fuzzy matching treats NBSP-padded values as different — "
        "clean text first",
    ),
    (
        "format_standardize", "missing",
        "numeric imputation needs numeric dtypes; canonical phones / "
        "currencies improve sentinel detection",
    ),
    (
        "format_standardize", "dedup",
        "canonical phones / lowercase emails enable cross-format "
        "duplicate matching",
    ),
    (
        "missing", "dedup",
        "deduping rows with mixed NaN sentinels produces brittle merges "
        "— resolve missing values first",
    ),
]
# ---------------------------------------------------------------------------
# Step / Pipeline / Result dataclasses
# ---------------------------------------------------------------------------
@dataclass
class Step:
    """A single tool invocation inside a :class:`Pipeline`.

    Attributes
    ----------
    tool : Tool to run; must be one of the keys of :data:`TOOL_ADAPTERS`
        (checked at construction time).
    options : JSON-serializable, tool-specific options handed to the
        tool's adapter.
    enabled : When False the runner records the step as skipped instead
        of executing it; the step stays in the saved configuration.
    name : Optional friendly label for logs / GUI rendering; falls back
        to the tool name.

    Raises
    ------
    ConfigError
        On construction, if ``tool`` is not a registered tool name.
    """

    tool: str
    options: dict[str, Any] = field(default_factory=dict)
    enabled: bool = True
    name: Optional[str] = None

    def __post_init__(self) -> None:
        # Fail fast so a typo in a saved pipeline surfaces at load time.
        if self.tool in TOOL_ADAPTERS:
            return
        raise ConfigError(
            f"Unknown tool {self.tool!r}",
            operation="Step.__post_init__",
            suggestion=f"Valid tools: {TOOL_NAMES}",
        )

    def display_name(self) -> str:
        """Return the friendly label if one is set, else the tool name."""
        if self.name:
            return self.name
        return self.tool
@dataclass
class Pipeline:
    """An ordered sequence of :class:`Step` records.

    The pipeline round-trips through plain dicts / JSON files so a saved
    configuration can be re-run later. Files are always read and written
    as UTF-8, independent of the platform's default encoding.
    """

    steps: list[Step] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return ``{"steps": [...]}`` with each step as a plain dict."""
        return {"steps": [asdict(s) for s in self.steps]}

    def to_file(self, path: str | Path) -> Path:
        """Write the pipeline as indented JSON to *path* and return the path.

        Values that ``json`` cannot encode natively are stringified
        (``default=str``).
        """
        out = Path(path)
        out.write_text(
            json.dumps(self.to_dict(), indent=2, default=str),
            encoding="utf-8",
        )
        return out

    @classmethod
    def from_dict(cls, data: dict) -> Pipeline:
        """Build a pipeline from the dict shape produced by :meth:`to_dict`.

        Each step entry needs a ``"tool"`` key; ``"options"``,
        ``"enabled"`` (default True) and ``"name"`` are optional.

        Raises
        ------
        ConfigError
            If *data* is not a dict with a ``"steps"`` key, or a step
            entry is not a dict containing ``"tool"``. ``Step`` itself
            raises ``ConfigError`` for an unknown tool name.
        """
        if not isinstance(data, dict) or "steps" not in data:
            raise ConfigError(
                "Pipeline file must contain a 'steps' list",
                operation="Pipeline.from_dict",
                suggestion='Example: {"steps": [{"tool": "text_clean"}, ...]}',
            )
        steps: list[Step] = []
        for raw in data["steps"]:
            # Guard the type first: ``"tool" in raw`` on a str is a
            # substring test and on an int raises TypeError.
            if not isinstance(raw, dict) or "tool" not in raw:
                raise ConfigError(
                    f"Step is missing 'tool': {raw!r}",
                    operation="Pipeline.from_dict",
                )
            steps.append(Step(
                tool=raw["tool"],
                options=dict(raw.get("options") or {}),
                enabled=bool(raw.get("enabled", True)),
                name=raw.get("name"),
            ))
        return cls(steps=steps)

    @classmethod
    def from_file(cls, path: str | Path) -> Pipeline:
        """Load a pipeline from the UTF-8 JSON file at *path*."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
@dataclass
class StepResult:
    """One step's outcome."""

    # The step this result belongs to.
    step: Step
    # Summary dict returned by the tool adapter; ``run_pipeline`` stores an
    # empty dict for skipped and failed steps.
    summary: dict[str, Any]
    # Wall-clock seconds spent in the adapter (0.0 for skipped steps).
    elapsed_seconds: float
    # True when the step was disabled and therefore not executed.
    skipped: bool = False
    error: Optional[str] = None  # rendered exception, not the live one
@dataclass
class PipelineResult:
    """Whole-run outcome."""

    # Output of the last step that completed successfully (the input
    # DataFrame itself if no step produced output).
    final_df: pd.DataFrame
    # One entry per step in pipeline order, including skipped / failed ones.
    step_results: list[StepResult]
    # Wall-clock seconds for the whole step loop.
    total_elapsed: float
    # Row count of the input DataFrame.
    initial_rows: int
    # Row count of ``final_df``.
    final_rows: int
    # Soft-dependency warnings produced by ``validate_pipeline``.
    warnings: list[str]
# ---------------------------------------------------------------------------
# Recommended pipeline + validation
# ---------------------------------------------------------------------------
# The single canonical default, used by ``recommended_pipeline`` when the
# caller passes no ``include``. Column-map is omitted: include it only
# when the caller needs header alignment (early) or schema enforcement
# (late). Adding it as an "auto" middle step would override the user's
# downstream column lookups without their having asked.
_DEFAULT_ORDER: list[str] = [
    "text_clean",
    "format_standardize",
    "missing",
    "dedup",
]
def recommended_pipeline(
    *,
    include: Optional[Iterable[str]] = None,
    options: Optional[dict[str, dict[str, Any]]] = None,
) -> Pipeline:
    """Build the recommended pipeline.

    With no arguments the result is
    ``[text_clean, format_standardize, missing, dedup]`` — the order
    encoded in :data:`_DEFAULT_ORDER`.

    Parameters
    ----------
    include
        Tool names to include, in the order they should run. ``None``
        means :data:`_DEFAULT_ORDER`. Put ``"column_map"`` first for the
        header-alignment use case or last for schema enforcement.
    options
        Optional ``{tool_name: {option_dict}}`` seeding each step. Tools
        without an entry get an empty options dict, i.e. tool defaults.

    Every name is validated with ``ensure_choice`` against
    :data:`TOOL_NAMES` before any step is built.
    """
    tool_order = list(_DEFAULT_ORDER) if include is None else list(include)
    per_tool = options or {}

    # Validate the whole list up front so nothing is built on bad input.
    for tool in tool_order:
        ensure_choice(
            tool, name="tool", choices=TOOL_NAMES,
            function="recommended_pipeline",
        )

    steps: list[Step] = []
    for tool in tool_order:
        # Copy the seed dict so steps never share the caller's mapping.
        steps.append(Step(tool=tool, options=dict(per_tool.get(tool) or {})))
    return Pipeline(steps=steps)
def validate_pipeline(pipeline: Pipeline) -> list[str]:
    """Return a list of WARNING strings for soft-dependency violations.

    An empty list means the pipeline is in recommended order. Each
    warning is a single human-readable sentence of the form
    ``step '<later>' runs BEFORE '<earlier>' — <reason>`` that the
    CLI / GUI can surface verbatim.

    Disabled steps are ignored. When a tool appears more than once (a
    user might text-clean twice with different scopes) only its first
    enabled occurrence is used for the order check, so repeated tools do
    not multiply warnings.
    """
    first_position: dict[str, int] = {}
    enabled_steps = (s for s in pipeline.steps if s.enabled)
    for index, step in enumerate(enabled_steps):
        # setdefault keeps the first occurrence and ignores later repeats.
        first_position.setdefault(step.tool, index)

    warnings: list[str] = []
    for earlier, later, why in SOFT_DEPENDENCIES:
        if earlier not in first_position or later not in first_position:
            continue
        if first_position[earlier] > first_position[later]:
            warnings.append(
                f"step {later!r} runs BEFORE {earlier!r} — {why}"
            )
    return warnings
# ---------------------------------------------------------------------------
# Execution
# ---------------------------------------------------------------------------
def run_pipeline(
    df: pd.DataFrame,
    pipeline: Pipeline,
    *,
    on_step_complete: Optional[Callable[[StepResult], None]] = None,
    stop_on_error: bool = True,
) -> PipelineResult:
    """Execute *pipeline* against *df*.

    Each enabled step's adapter receives the DataFrame returned by the
    previous successful step. Soft-dependency warnings are computed
    before the first step, logged, and returned via
    ``PipelineResult.warnings`` — the run proceeds regardless.

    Parameters
    ----------
    on_step_complete
        Optional ``callable(StepResult)`` fired after every step,
        including skipped and failed ones. Exceptions raised by the
        callback are logged and ignored.
    stop_on_error
        When True (default), the first failing step's exception
        propagates after its ``StepResult`` has been recorded and
        reported. When False, execution continues with the previous
        step's output and the failed step's ``StepResult.error`` holds
        the rendered exception.

    Raises
    ------
    InputValidationError
        If *pipeline* is not a :class:`Pipeline`.
    """
    ensure_dataframe(df, function="run_pipeline")
    if not isinstance(pipeline, Pipeline):
        raise InputValidationError(
            f"Expected Pipeline, got {type(pipeline).__name__}",
            operation="run_pipeline",
        )

    order_warnings = validate_pipeline(pipeline)
    for message in order_warnings:
        logger.warning("pipeline order: {}", message)

    outcomes: list[StepResult] = []

    def _record(outcome: StepResult) -> None:
        # Store the result, then notify the (optional) progress callback.
        outcomes.append(outcome)
        if on_step_complete:
            _safe_call(on_step_complete, outcome)

    rows_in = len(df)
    working = df
    run_started = time.perf_counter()

    for step in pipeline.steps:
        if not step.enabled:
            _record(StepResult(
                step=step, summary={}, elapsed_seconds=0.0, skipped=True,
            ))
            continue

        run_tool = TOOL_ADAPTERS[step.tool]
        step_started = time.perf_counter()
        try:
            produced, summary = run_tool(working, step.options)
        except Exception as exc:  # noqa: BLE001 — pipeline owns the error contract
            duration = time.perf_counter() - step_started
            if isinstance(exc, DataToolsError):
                rendered = exc.format()
            else:
                rendered = f"{type(exc).__name__}: {exc}"
            _record(StepResult(
                step=step, summary={}, elapsed_seconds=duration,
                error=rendered,
            ))
            if stop_on_error:
                raise
            logger.warning(
                "pipeline step {!r} failed; continuing with previous output",
                step.display_name(),
            )
            continue

        # Success: thread the new frame into the next step.
        working = produced
        _record(StepResult(
            step=step, summary=summary,
            elapsed_seconds=time.perf_counter() - step_started,
        ))

    return PipelineResult(
        final_df=working,
        step_results=outcomes,
        total_elapsed=time.perf_counter() - run_started,
        initial_rows=rows_in,
        final_rows=len(working),
        warnings=order_warnings,
    )
def _safe_call(callback: Callable, *args: Any) -> None:
    """Invoke a user-supplied callback without letting it break the run.

    Any ``Exception`` raised by *callback* is logged at DEBUG level with
    its traceback and then discarded.
    """
    try:
        callback(*args)
    except Exception:  # noqa: BLE001 — progress callbacks are advisory
        traced = logger.opt(exception=True)
        traced.debug("pipeline callback raised; ignoring")
    return None

View File

@@ -535,6 +535,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
Numeric, datetime, and boolean columns are skipped by default. The input
DataFrame is not mutated; a copy is returned in ``CleanResult.cleaned_df``.
Pipeline placement (recommended, not enforced)
----------------------------------------------
*Best run early.* Smart-quote, NBSP, and zero-width pollution
silently breaks downstream parsers — phone numbers fail on
smart-quote contamination, sentinel detection misses NBSP-padded
cells, and fuzzy dedup treats whitespace-padded values as
different. Running this tool before format / missing / dedup is
the standard order. See ``src.core.pipeline.SOFT_DEPENDENCIES``.
"""
from .errors import ensure_dataframe
ensure_dataframe(df, function="clean_dataframe")

468
src/gui/app_demo.py Normal file
View File

@@ -0,0 +1,468 @@
"""DataTools — public demo app (deploys to Streamlit Community Cloud).
This is a SEPARATE entry point from the main GUI (``src/gui/app.py``).
The full GUI is the paid product surface; this demo is the marketing
surface — a single page that runs one of three persona-specific
pipelines on a preloaded sample file, shows the BEFORE / AFTER
side-by-side, and converts the visitor to a Gumroad purchase.
Launch:
streamlit run src/gui/app_demo.py
URL routing:
https://demo.datatools.app/?p=shopify-pet (Shopify operator)
https://demo.datatools.app/?p=bookkeeper (Bookkeeper)
https://demo.datatools.app/?p=revops (RevOps agency)
Free / paid boundary (per docs/DEMO-PLAN.md §6):
- input rows capped at ``DEMO_ROW_CAP``
- input file size capped at ``DEMO_FILE_CAP_MB``
- download CSV gets a single trailing watermark row
- the pipeline editor is read-only — visitor sees it but can't change it
- no audit-log download (paid feature)
- no save-pipeline-JSON (paid feature)
The demo runs the *same engine* as the paid product. Caps are applied
at the surface layer only — when the buyer downloads and runs the paid
build, every cap disappears.
"""
from __future__ import annotations
import io
import json
import sys
import time
from pathlib import Path
from typing import Any
import pandas as pd
import streamlit as st
# Ensure project root is on sys.path so `src.core` imports work
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.core.pipeline import Pipeline, run_pipeline
# ---------------------------------------------------------------------------
# Free / paid boundary constants
# ---------------------------------------------------------------------------
# Maximum number of input rows the demo processes; longer uploads are
# truncated to this many rows.
DEMO_ROW_CAP: int = 100
# Maximum upload size in MB; larger uploads are not read at all.
DEMO_FILE_CAP_MB: int = 5
# Base purchase URL; the CTA links append ``?from=<persona>``.
GUMROAD_BASE: str = "https://gumroad.com/l/datatools"
# ---------------------------------------------------------------------------
# Persona registry — single source of truth
# ---------------------------------------------------------------------------
# Directory holding the sample CSVs and the pipeline JSON files.
DEMO_DIR = _project_root / "samples" / "demo"
# Keyed by the ``?p=<persona>`` query value. Each entry supplies the copy
# shown on the page (label, icon, h1, sub, cta), the sample data and
# pipeline file names inside DEMO_DIR, and the landing URL used in the
# download watermark.
PERSONAS: dict[str, dict[str, Any]] = {
    "shopify-pet": {
        "label": "Shopify pet operator",
        "icon": "🛍️",
        "h1": "Klaviyo-import-ready customer lists. **In 30 seconds. Locally.**",
        "sub": (
            "Your Shopify customer export has duplicates Excel can't catch, "
            "international phones Excel can't parse, and disguised nulls "
            "(`N/A`, `(blank)`, `?`) that break Klaviyo's import. "
            "DataTools fixes all of it in one pass — and your data never "
            "leaves your computer."
        ),
        "data_file": "shopify_pet_customers.csv",
        "pipeline_file": "shopify_pet_pipeline.json",
        "cta": "Get DataTools for Shopify — $49 →",
        "landing": "https://datatools.app/shopify/",
    },
    "bookkeeper": {
        "label": "Bookkeeper / freelance accountant",
        "icon": "📒",
        "h1": "Reconcile messy bank exports. **Hand your client an audit trail.**",
        "sub": (
            "The Jan and Feb exports overlap; the same transaction posts twice. "
            "Vendor names are *Amazon* / *amazon.com* / *AMAZON.COM*4F2X9* in "
            "three rows. DataTools dedups on Date + Amount + fuzzy Vendor, "
            "produces ISO dates and numeric amounts, and gives you a row-level "
            "audit log to hand the client."
        ),
        "data_file": "bookkeeper_bank_reconcile.csv",
        "pipeline_file": "bookkeeper_bank_pipeline.json",
        "cta": "Get DataTools for Bookkeepers — $49 →",
        "landing": "https://datatools.app/bookkeeper/",
    },
    "revops": {
        "label": "Marketing / RevOps agency",
        "icon": "🪢",
        "h1": "Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — **locally.**",
        "sub": (
            "The same prospect shows up in HubSpot as `alice@acme.com`, in "
            "LinkedIn as `Alice.Johnson@acme.com`, and in your VA's manual "
            "scrape as `alice@acme.com` again. Country is `USA` / `US` / "
            "`United States`. DataTools fuzzy-matches across sources, "
            "normalizes phones for 50+ countries, and merges survivors "
            "with their most-complete fields — without uploading anything."
        ),
        "data_file": "agency_combined_leads.csv",
        "pipeline_file": "agency_leads_pipeline.json",
        "cta": "Get DataTools for RevOps — $49 →",
        "landing": "https://datatools.app/revops/",
    },
}
# Persona used when ``?p=`` is absent or not a key of PERSONAS.
DEFAULT_PERSONA = "shopify-pet"
# ---------------------------------------------------------------------------
# Page config + routing
# ---------------------------------------------------------------------------
# Wide layout with the sidebar collapsed: the demo is a single page.
st.set_page_config(
    page_title="DataTools — try it live",
    page_icon="🧹",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# Strip Streamlit chrome that breaks the iframe-embed look on the
# landing pages. The same stylesheet defines the dark theme and the
# ``demo-card`` / ``cta-block`` / ``metric-pill`` classes used by the raw
# HTML snippets rendered further down the page.
st.markdown("""
<style>
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1.2rem; padding-bottom: 1rem; max-width: 1200px; }
[data-testid="stSidebarNav"] { display: none; }
section[data-testid="stSidebar"] { display: none; }
.stApp { background: #0f1115; color: #e8eaed; }
h1, h2, h3 { color: #e8eaed; letter-spacing: -0.01em; }
hr { border-color: #252a36; }
.demo-card {
background: #161922;
border: 1px solid #252a36;
border-radius: 12px;
padding: 18px;
}
.cta-block {
background: linear-gradient(135deg, #161922 0%, #1d212b 100%);
border: 1px solid #6ee7b7;
border-radius: 12px;
padding: 24px;
text-align: center;
}
.cta-block a {
display: inline-block;
background: #6ee7b7; color: #052e1a;
font-weight: 600; padding: 12px 22px;
border-radius: 8px; text-decoration: none;
font-size: 17px; margin-top: 12px;
}
.metric-pill {
display: inline-block;
background: #1d212b; border: 1px solid #252a36;
padding: 4px 10px; border-radius: 999px;
font-family: ui-monospace, monospace; font-size: 13px;
color: #6ee7b7; margin-right: 6px; margin-bottom: 4px;
}
</style>
""", unsafe_allow_html=True)
def _resolve_persona() -> str:
    """Return the persona key requested via ``?p=<persona>``.

    Falls back to ``DEFAULT_PERSONA`` when the parameter is absent or
    names a persona that is not registered in ``PERSONAS``.
    """
    try:
        requested = st.query_params.get("p", DEFAULT_PERSONA)
    except AttributeError:
        # Older Streamlit versions expose only the experimental getter.
        legacy_params = st.experimental_get_query_params()
        requested = legacy_params.get("p", [DEFAULT_PERSONA])
    # Unwrap a list value down to its first element.
    if isinstance(requested, list):
        requested = requested[0]
    return requested if requested in PERSONAS else DEFAULT_PERSONA
# Resolve the active persona once; everything below reads its copy,
# sample file names and CTA from this entry.
persona_key = _resolve_persona()
persona = PERSONAS[persona_key]
# ---------------------------------------------------------------------------
# Header + persona switch
# ---------------------------------------------------------------------------
col_brand, col_switch = st.columns([3, 2])
with col_brand:
    st.markdown(f"### 🧹 DataTools / for {persona['label']}")
with col_switch:
    # Quick-switch dropdown for visitors landing on the wrong persona
    new_choice = st.selectbox(
        "Try a different demo",
        options=list(PERSONAS),
        format_func=lambda k: f"{PERSONAS[k]['icon']} {PERSONAS[k]['label']}",
        index=list(PERSONAS).index(persona_key),
        key="persona_switch",
        label_visibility="collapsed",
    )
# A different selection is written to the query string and the script is
# re-run, so the persona is always resolved from ``?p=``.
if new_choice != persona_key:
    st.query_params["p"] = new_choice
    st.rerun()
# Persona-specific headline and sub-copy.
st.markdown(f"## {persona['h1']}")
st.markdown(persona["sub"])
st.markdown("---")
# ---------------------------------------------------------------------------
# Load preloaded sample data + pipeline
# ---------------------------------------------------------------------------
@st.cache_data(show_spinner=False)
def _load_demo(data_file: str, pipeline_file: str) -> tuple[pd.DataFrame, Pipeline]:
    """Load a persona's sample CSV and its saved pipeline from ``DEMO_DIR``.

    The CSV is read with every column as ``str`` and without pandas'
    default NA parsing, so the raw cell text is preserved. Results are
    cached by Streamlit per ``(data_file, pipeline_file)`` pair.
    """
    sample_path = DEMO_DIR / data_file
    pipeline_path = DEMO_DIR / pipeline_file
    frame = pd.read_csv(sample_path, dtype=str, keep_default_na=False)
    return frame, Pipeline.from_file(pipeline_path)


sample_df, sample_pipeline = _load_demo(
    persona["data_file"], persona["pipeline_file"],
)
def _read_uploaded(uploaded_file) -> tuple[pd.DataFrame, list[str]]:
    """Decode an uploaded file. Returns ``(df, warnings)``.

    *uploaded_file* is the object returned by ``st.file_uploader``; only
    its ``getvalue()`` and ``name`` are used.

    Behaviour:
    * Files larger than ``DEMO_FILE_CAP_MB`` are not parsed — a copy of
      the sample dataset is returned with an explanatory warning.
    * ``.xlsx`` / ``.xls`` go through ``pd.read_excel``; everything else
      is read as delimited text (tab for ``.tsv``, comma otherwise),
      trying UTF-8 first and falling back to latin-1.
    * Any read failure returns a copy of the sample dataset plus a
      warning instead of raising.
    * Frames longer than ``DEMO_ROW_CAP`` are truncated to the first
      ``DEMO_ROW_CAP`` rows, with a warning.

    All cells are read as ``str`` with pandas' default NA parsing off.
    """
    warnings: list[str] = []
    raw = uploaded_file.getvalue()
    size_mb = len(raw) / 1024 / 1024
    if size_mb > DEMO_FILE_CAP_MB:
        warnings.append(
            f"Uploaded file is {size_mb:.1f} MB — demo capped at "
            f"{DEMO_FILE_CAP_MB} MB. The paid product has no size limit."
        )
        return sample_df.copy(), warnings

    suffix = Path(uploaded_file.name).suffix.lower()
    bio = io.BytesIO(raw)
    try:
        if suffix in (".xlsx", ".xls"):
            df = pd.read_excel(bio, dtype=str, keep_default_na=False)
        else:
            sep = "\t" if suffix == ".tsv" else ","
            for enc in ("utf-8", "utf-8-sig"):
                try:
                    bio.seek(0)
                    df = pd.read_csv(
                        bio, dtype=str, keep_default_na=False,
                        encoding=enc, sep=sep, on_bad_lines="warn",
                    )
                    break
                except UnicodeDecodeError:
                    continue
            else:
                # Last resort: latin-1 maps every byte to a code point, so
                # it cannot raise UnicodeDecodeError. It keeps the same
                # separator and bad-line handling as the UTF-8 attempts.
                bio.seek(0)
                df = pd.read_csv(
                    bio, dtype=str, keep_default_na=False,
                    encoding="latin-1", sep=sep, on_bad_lines="warn",
                )
    except Exception as e:  # noqa: BLE001 — demo degrades to the sample
        warnings.append(f"Could not read your file ({type(e).__name__}). "
                        "Demo will run on the sample dataset.")
        return sample_df.copy(), warnings

    if len(df) > DEMO_ROW_CAP:
        warnings.append(
            f"Demo capped at {DEMO_ROW_CAP} rows — your file has {len(df):,}. "
            f"Running on the first {DEMO_ROW_CAP} rows. The paid product has no row limit."
        )
        df = df.head(DEMO_ROW_CAP)
    return df, warnings
# ---------------------------------------------------------------------------
# File source: preloaded sample (default) or user upload
# ---------------------------------------------------------------------------
st.markdown(f"#### Sample dataset preloaded · `{persona['data_file']}`")
with st.expander(
    "Or replace with your own file (capped at "
    f"{DEMO_ROW_CAP} rows / {DEMO_FILE_CAP_MB} MB for the demo)",
    expanded=False,
):
    uploaded = st.file_uploader(
        "Your file",
        type=["csv", "tsv", "xlsx", "xls"],
        key="demo_user_file",
        label_visibility="collapsed",
        # The help text mirrors what _read_uploaded actually does: the MB
        # cap rejects the file, the row cap only truncates it.
        help=(
            f"Files over {DEMO_FILE_CAP_MB} MB are not read — the demo "
            "falls back to the sample dataset. Files with more than "
            f"{DEMO_ROW_CAP} rows are accepted but only the first "
            f"{DEMO_ROW_CAP} rows are processed. The paid build runs on "
            "1 GB+ files via streaming."
        ),
    )
# df_in is the frame the pipeline will run on: the visitor's upload
# (possibly truncated, or replaced by the sample when it could not be
# used) or a copy of the preloaded sample.
if uploaded is not None:
    df_in, upload_warnings = _read_uploaded(uploaded)
    for w in upload_warnings:
        st.info(w)
    using_sample = False
else:
    df_in = sample_df.copy()
    using_sample = True
# ---------------------------------------------------------------------------
# BEFORE preview
# ---------------------------------------------------------------------------
# BEFORE preview: first ten rows of the frame the pipeline will run on.
st.markdown(f"#### BEFORE — {len(df_in)} rows, {len(df_in.columns)} columns")
st.dataframe(df_in.head(10), use_container_width=True, hide_index=True)
st.markdown("---")
# ---------------------------------------------------------------------------
# Pipeline (read-only)
# ---------------------------------------------------------------------------
st.markdown("#### Pipeline (saved — paid version is editable)")
# Render the steps as "1. tool → 2. tool → …". The separator keeps one
# step's tool name from running into the next step's bold number.
pipe_summary = " → ".join(
    f"**{i + 1}.** {step.tool}"
    for i, step in enumerate(sample_pipeline.steps)
)
st.markdown(pipe_summary)
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
run_clicked = st.button(
    "▶ Run pipeline",
    type="primary",
    use_container_width=True,
    key="demo_run_button",
)
if run_clicked:
    with st.spinner("Running…"):
        t0 = time.perf_counter()
        try:
            # stop_on_error=False: a failing step is reported in its
            # StepResult and the run continues with the previous output.
            result = run_pipeline(df_in, sample_pipeline, stop_on_error=False)
        except Exception as e:
            # Anything that still escapes halts this script run with a
            # user-facing message.
            from src.core.errors import format_for_user
            st.error(f"Demo halted: {format_for_user(e)}")
            st.stop()
        elapsed = time.perf_counter() - t0
    # Persist the run so the AFTER section survives Streamlit re-runs.
    st.session_state["demo_result"] = result
    st.session_state["demo_elapsed"] = elapsed
    st.session_state["demo_persona"] = persona_key
# Re-read from session state on every run, clicked or not.
result = st.session_state.get("demo_result")
elapsed = st.session_state.get("demo_elapsed", 0.0)
result_persona = st.session_state.get("demo_persona")
# Reset cached result when persona switches
if result is not None and result_persona != persona_key:
    result = None
    st.session_state.pop("demo_result", None)
# ---------------------------------------------------------------------------
# AFTER + metrics + CTA
# ---------------------------------------------------------------------------
if result is not None:
    st.markdown("---")
    # Row counts come from the stored run itself (not from the current
    # df_in, which may have changed since the run). The arrow separates
    # the before / after counts so they do not read as one number.
    st.markdown(
        f"#### AFTER — {result.initial_rows} → {len(result.final_df)} rows · "
        f"finished in {elapsed*1000:.0f} ms"
    )
    # Per-step metric pills: (summary key, label suffix) pairs, shown only
    # when the step reported a non-zero value for that key.
    pill_fields = (
        ("cells_changed", "cells"),
        ("sentinels_standardized", "sentinels"),
        ("duplicates_removed", "dupes merged"),
        ("columns_renamed", "renamed"),
    )
    pills_html: list[str] = []
    for sr in result.step_results:
        if sr.skipped:
            continue
        if sr.error:
            pills_html.append(
                f'<span class="metric-pill" style="color:#fbbf24">'
                f'{sr.step.tool}: error</span>'
            )
            continue
        s = sr.summary
        bits = [f"{s[key]} {suffix}" for key, suffix in pill_fields if s.get(key)]
        label = ", ".join(bits) if bits else "no-op"
        pills_html.append(
            f'<span class="metric-pill">{sr.step.tool}: {label}</span>'
        )
    st.markdown("".join(pills_html), unsafe_allow_html=True)
    st.dataframe(result.final_df.head(10), use_container_width=True, hide_index=True)
    # ----- Download with watermark row -----
    # One trailing row: the watermark text in the first column, empty
    # strings in every other column.
    watermark_row = pd.DataFrame([{
        col: f"DataTools demo — buy at {persona['landing']}"
        if i == 0 else ""
        for i, col in enumerate(result.final_df.columns)
    }])
    out_df = pd.concat([result.final_df, watermark_row], ignore_index=True)
    csv_bytes = out_df.to_csv(index=False).encode("utf-8-sig")
    col_dl, col_cta = st.columns([1, 2])
    with col_dl:
        st.download_button(
            "Download cleaned CSV (sample · watermarked)",
            data=csv_bytes,
            file_name=Path(persona["data_file"]).stem + "_cleaned_demo.csv",
            mime="text/csv",
            use_container_width=True,
        )
    with col_cta:
        st.markdown(
            f"""
<div class="cta-block">
<strong style="font-size: 18px;">Like what you see?</strong><br/>
Run this on YOUR full file — locally. No upload. No row limit. No watermark.<br/>
<a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
</div>
""",
            unsafe_allow_html=True,
        )
else:
    # Pre-run state — show the buy block at the bottom anyway so a CTA is
    # present on the page even before the visitor runs the demo.
    st.markdown(
        f"""
<div class="cta-block" style="margin-top: 24px;">
<strong style="font-size: 18px;">Already convinced?</strong><br/>
Skip the demo and grab the full version. One-time payment, no subscription.<br/>
<a href="{GUMROAD_BASE}?from={persona_key}" rel="noopener">{persona['cta']}</a>
</div>
""",
        unsafe_allow_html=True,
    )
# ---------------------------------------------------------------------------
# Footer trust block
# ---------------------------------------------------------------------------
st.markdown("---")
# Three equal columns of trust statements shown on every render,
# whether or not the pipeline has been run.
col_t1, col_t2, col_t3 = st.columns(3)
with col_t1:
    st.markdown("**🔒 Runs locally**\n\nThe paid product is desktop-only. Your data never leaves your computer.")
with col_t2:
    st.markdown("**📋 Audit trail**\n\nEvery cell change row-logged with old / new / which rule fired.")
with col_t3:
    st.markdown("**💰 One-time $49**\n\nNo subscription. Mac · Windows · Linux. Free updates for v1.x.")
# Restates the demo limits next to the trust block.
st.caption(
    f"Demo capped at {DEMO_ROW_CAP} rows · output watermarked with one trailing row · "
    "running on free hosting. The paid product is uncapped and runs offline."
)

View File

@@ -1,111 +1,368 @@
"""DataTools Missing Value Handler — stub page."""
"""DataTools Missing Value Handler — Streamlit page."""
from __future__ import annotations
import io
import json
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_normalization_gate,
)
from src.core.missing import (
DEFAULT_SENTINELS,
MissingOptions,
PRESETS,
handle_missing,
profile_missing,
)
hide_streamlit_chrome()
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("🕳️ Missing Value Handler")
st.caption("Detect, analyze, and handle missing values in your data.")
st.caption(
"Detect disguised nulls, profile missingness, and apply imputation or "
"drop strategies. Runs locally — your data never leaves this computer."
)
st.info("This tool is under development.")
# ---------------------------------------------------------------------------
# What this tool will do
# File upload
# ---------------------------------------------------------------------------
st.markdown("""
**Features:**
- Detect disguised nulls (empty strings, "N/A", "n/a", "-", "NULL", "None", etc.)
- Missingness analysis: per-column counts, percentages, and patterns
- Visualize missing data heatmap
- Imputation strategies: drop rows/columns, fill with mean/median/mode, forward-fill, backward-fill
- Custom sentinel value replacement
- Before/after comparison
""")
uploaded = pickup_or_upload(
label="Upload CSV or Excel file",
key="missing_file_upload",
types=["csv", "tsv", "xlsx", "xls"],
)
if uploaded is None:
st.info("Upload a CSV, TSV, or Excel file to begin.")
st.stop()
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame.

    Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing-
    value handling is more useful when numeric columns are typed correctly
    (so mean / median / interpolate work without manual coercion).
    Sentinel strings are still detected because they survive in object
    columns where any cell is non-numeric.

    Args:
        name: Original file name. Only its suffix is used: ``.xlsx`` / ``.xls``
            select the Excel reader, ``.tsv`` selects a tab delimiter, and
            anything else is parsed as comma-delimited text.
        data: Raw file contents.

    Returns:
        The parsed DataFrame.

    Raises:
        Any error raised by ``pd.read_excel`` / ``pd.read_csv`` other than the
        UTF-8 decode failure that triggers the latin-1 fallback.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    sep = "\t" if suffix == ".tsv" else ","
    try:
        # utf-8-sig accepts plain UTF-8 as well and strips a leading BOM in
        # the codec, so a separate "utf-8" attempt adds nothing.
        return pd.read_csv(bio, encoding="utf-8-sig", sep=sep, on_bad_lines="warn")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this attempt cannot fail
        # on decoding. It keeps the same delimiter and bad-line policy.
        bio.seek(0)
        return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
try:
df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
from src.core.errors import format_for_user
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
)
st.stop()
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider()
# ---------------------------------------------------------------------------
# File upload (functional)
# Initial profile (read-only)
# ---------------------------------------------------------------------------
uploaded = st.file_uploader(
"Upload CSV or Excel file",
type=["csv", "tsv", "xlsx", "xls"],
help="Upload a file to preview. Processing is not yet available.",
key="missing_file_upload",
)
st.subheader("Missingness profile")
if uploaded is not None:
import pandas as pd
try:
if uploaded.name.endswith((".xlsx", ".xls")):
df = pd.read_excel(uploaded)
else:
df = pd.read_csv(uploaded)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
except Exception as e:
from src.core.errors import format_for_user
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
initial_profile = profile_missing(df, MissingOptions())
prof_df = initial_profile.to_dataframe()
m1, m2, m3, m4 = st.columns(4)
m1.metric("Rows", initial_profile.rows_total)
m2.metric("Cells missing", initial_profile.cells_missing)
m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
m4.metric("Complete rows", initial_profile.rows_complete)
st.dataframe(prof_df, use_container_width=True, hide_index=True)
if initial_profile.cells_missing == 0:
st.success("No missing values or disguised nulls detected. Nothing to handle.")
st.divider()
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------
st.subheader("Strategy")
preset_label = st.radio(
"Preset",
[
"detect-only (standardize sentinels to NaN, no fill or drop)",
"safe-fill (numeric → median, categorical → mode)",
"drop-incomplete (drop any row with missing)",
],
index=0,
help=(
"detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
"safe-fill: also fill — numeric columns with median, others with mode. "
"drop-incomplete: also drop every row that has any missing cell."
),
)
preset_key = preset_label.split(" ", 1)[0]
options = MissingOptions.from_preset(preset_key)
with st.expander("Advanced options"):
col_a, col_b = st.columns(2)
with col_a:
st.markdown("**Detection**")
options.standardize_sentinels = st.checkbox(
"Standardize disguised nulls to NaN",
value=options.standardize_sentinels,
help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
)
sentinels_text = st.text_input(
"Sentinel values (comma-separated)",
value=", ".join(options.sentinels),
disabled=not options.standardize_sentinels,
help="Matched case-insensitively after stripping whitespace.",
)
options.sentinels = [
s.strip() for s in sentinels_text.split(",") if s.strip()
]
with col_b:
st.markdown("**Strategy override**")
strat_options = [
"(use preset)",
"none", "drop_row", "drop_col", "drop_both",
"mean", "median", "mode", "constant",
"ffill", "bfill", "interpolate",
]
strat_choice = st.selectbox(
"Global strategy",
strat_options,
index=0,
help=(
"drop_row / drop_col use the thresholds below. "
"mean / median / interpolate are numeric only — non-numeric "
"columns fall back to the categorical strategy."
),
)
if strat_choice != "(use preset)":
options.strategy = strat_choice # type: ignore[assignment]
cat_strat = st.selectbox(
"Categorical fallback (for non-numeric columns)",
["mode", "constant", "ffill", "bfill", "none"],
index=0,
)
options.categorical_strategy = cat_strat # type: ignore[assignment]
if options.strategy == "constant" or cat_strat == "constant":
fill_val = st.text_input(
"Constant fill value",
value="",
help="Used when strategy = constant. Leave blank to fill with empty string.",
)
options.fill_value = fill_val
st.markdown("**Drop thresholds**")
col_c, col_d = st.columns(2)
with col_c:
options.row_drop_threshold = st.slider(
"Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
0.0, 1.0, options.row_drop_threshold, 0.05,
)
with col_d:
options.col_drop_threshold = st.slider(
"Column drop threshold (drop columns with ≥ this fraction missing)",
0.0, 1.0, options.col_drop_threshold, 0.05,
)
# ---------------------------------------------------------------------------
# Placeholder options
# ---------------------------------------------------------------------------
st.markdown("**Scope**")
selected_cols = st.multiselect(
"Columns to handle (default: all)",
options=list(df.columns),
default=list(df.columns),
)
skip_cols = st.multiselect(
"Columns to skip",
options=list(df.columns),
default=[],
)
options.columns = selected_cols if selected_cols else None
options.skip_columns = list(skip_cols)
st.subheader("Detection Settings")
st.text_input(
"Null patterns (comma-separated)",
value="N/A, n/a, NA, -, NULL, None, empty, .",
disabled=True,
help="Values to treat as missing.",
)
st.subheader("Handling Strategy")
st.selectbox("Strategy", [
"Drop rows with any missing",
"Drop rows above threshold",
"Fill with mean (numeric)",
"Fill with median (numeric)",
"Fill with mode (categorical)",
"Forward-fill",
"Backward-fill",
"Custom value",
], disabled=True)
st.slider("Drop threshold (%)", 0, 100, 50, disabled=True, help="Drop rows missing more than this % of columns.")
st.divider()
st.button("Handle Missing Values", type="primary", use_container_width=True, disabled=True)
st.markdown("**Per-column strategy overrides** (optional)")
st.caption(
"Set a different strategy for specific columns. Leave any row blank to "
"use the global strategy."
)
# Only columns that currently have gaps are offered in the override table.
only_missing_cols = [r.column for r in initial_profile.columns if r.has_missing]
per_col_overrides: dict[str, str] = {}
if only_missing_cols:
    override_choices = [
        "", "drop_row", "drop_col",
        "mean", "median", "mode", "constant",
        "ffill", "bfill", "interpolate",
    ]
    edit_df = pd.DataFrame({
        "column": only_missing_cols,
        "strategy": [""] * len(only_missing_cols),
    })
    edited = st.data_editor(
        edit_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "column": st.column_config.TextColumn("Column", disabled=True),
            "strategy": st.column_config.SelectboxColumn(
                "Override",
                options=override_choices,
            ),
        },
        key="missing_per_col_editor",
    )
    # Keep only the rows where the user actually picked an override.
    per_col_overrides = {
        col_name: chosen
        for col_name, chosen in zip(edited["column"], edited["strategy"])
        if chosen
    }
options.column_strategies = per_col_overrides  # type: ignore[assignment]
# ---------------------------------------------------------------------------
# Footer
# Run
# ---------------------------------------------------------------------------
st.divider()
st.caption(
"Runs locally. Your data never leaves this computer. "
"| DataTools v3.0"
)
if st.button("Handle Missing Values", type="primary", use_container_width=True):
with st.spinner("Handling..."):
try:
result = handle_missing(df, options)
except (ValueError, OSError) as e:
from src.core.errors import format_for_user
st.error(format_for_user(e))
st.stop()
st.session_state["missing_result"] = result
st.session_state["missing_input_name"] = uploaded.name
st.session_state["missing_options"] = options.to_dict()
result = st.session_state.get("missing_result")
if result is None:
st.info("Choose a strategy and click **Handle Missing Values** to run.")
st.stop()
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
st.subheader("Results")
m1, m2, m3, m4 = st.columns(4)
m1.metric("Sentinels → NaN", result.sentinels_standardized)
m2.metric("Cells filled", result.cells_filled)
m3.metric("Rows dropped", result.rows_dropped)
m4.metric("Columns dropped", len(result.columns_dropped))
if result.columns_dropped:
st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
st.markdown("**Missingness — before vs. after**")
before = result.profile_before.to_dataframe().set_index("column")[
["missing", "missing_pct"]
].rename(columns={"missing": "before_missing", "missing_pct": "before_pct"})
after = result.profile_after.to_dataframe().set_index("column")[
["missing", "missing_pct"]
].rename(columns={"missing": "after_missing", "missing_pct": "after_pct"})
combined = before.join(after, how="outer").fillna(0)
st.dataframe(combined, use_container_width=True)
if result.strategy_per_column:
st.markdown("**Strategy applied per column**")
strat_df = pd.DataFrame(
[{"column": c, "strategy": s} for c, s in result.strategy_per_column.items()]
)
st.dataframe(strat_df, use_container_width=True, hide_index=True)
# Show a capped preview of the change log; the caption at the end points to
# the full download when more than 50 entries exist.
if not result.changes.empty:
st.markdown("**Audit (first 50 changes)**")
# Copy so the display-only tweak below does not write into result.changes.
audit_view = result.changes.head(50).copy()
# Display transform: -1 is rendered as a blank cell and every other value is
# shifted up by one (presumably 0-based row positions shown 1-based, with -1
# marking changes not tied to a single row — TODO confirm in src.core.missing).
audit_view["row"] = audit_view["row"].apply(lambda x: "" if x == -1 else x + 1)
st.dataframe(audit_view, use_container_width=True, hide_index=True)
if len(result.changes) > 50:
st.caption(f"… and {len(result.changes) - 50} more (download the full audit below).")
st.markdown("**Handled preview (first 10 rows)**")
st.dataframe(result.handled_df.head(10), use_container_width=True)
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
st.divider()
stem = Path(st.session_state.get("missing_input_name", "input")).stem
dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download handled CSV",
data=handled_bytes,
file_name=f"{stem}_missing.csv",
mime="text/csv",
)
with dl_b:
if not result.changes.empty:
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download changes audit",
data=changes_bytes,
file_name=f"{stem}_missing_changes.csv",
mime="text/csv",
)
with dl_c:
config_bytes = json.dumps(
st.session_state.get("missing_options", {}), indent=2, default=str,
).encode("utf-8")
st.download_button(
"Download config JSON",
data=config_bytes,
file_name="missing_config.json",
mime="application/json",
)
st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")

View File

@@ -1,102 +1,413 @@
"""DataTools Column Mapper — stub page."""
"""DataTools Column Mapper — Streamlit page."""
from __future__ import annotations
import io
import json
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_normalization_gate,
)
from src.core.column_mapper import (
MapOptions,
PRESETS,
TargetField,
TargetSchema,
infer_mapping,
map_columns,
)
hide_streamlit_chrome()
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("🗂️ Column Mapper")
st.caption("Rename columns, enforce a target schema, and coerce types.")
st.caption(
"Rename columns, enforce a target schema, and coerce types. Runs locally — "
"your data never leaves this computer."
)
st.info("This tool is under development.")
# ---------------------------------------------------------------------------
# What this tool will do
# File upload
# ---------------------------------------------------------------------------
st.markdown("""
**Features:**
- Rename columns via interactive mapping table
- Load a target schema (JSON/CSV) to auto-map columns
- Fuzzy column name matching for automatic suggestions
- Type coercion (string → int, string → date, etc.)
- Drop unmapped columns or keep as-is
- Reorder columns to match target schema
""")
uploaded = pickup_or_upload(
label="Upload CSV or Excel file",
key="colmap_file_upload",
types=["csv", "tsv", "xlsx", "xls"],
)
if uploaded is None:
st.info("Upload a CSV, TSV, or Excel file to begin.")
st.stop()
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse uploaded file bytes into a DataFrame.

    Args:
        name: Original file name. Only its suffix is used: ``.xlsx`` / ``.xls``
            select the Excel reader, ``.tsv`` selects a tab delimiter, and
            anything else is parsed as comma-delimited text.
        data: Raw file contents.

    Returns:
        The parsed DataFrame.

    Raises:
        Any error raised by ``pd.read_excel`` / ``pd.read_csv`` other than the
        UTF-8 decode failure that triggers the latin-1 fallback.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    sep = "\t" if suffix == ".tsv" else ","
    try:
        # utf-8-sig accepts plain UTF-8 as well and strips a leading BOM in
        # the codec, so a separate "utf-8" attempt adds nothing.
        return pd.read_csv(bio, encoding="utf-8-sig", sep=sep, on_bad_lines="warn")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this attempt cannot fail
        # on decoding. It keeps the same delimiter and bad-line policy.
        bio.seek(0)
        return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
try:
df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
from src.core.errors import format_for_user
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
)
st.stop()
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider()
# ---------------------------------------------------------------------------
# Schema input
# ---------------------------------------------------------------------------
st.subheader("Target schema")
schema_mode = st.radio(
"How would you like to define the target schema?",
[
"Build interactively (start from current columns)",
"Upload schema JSON",
"Skip (rename / coerce only — no schema)",
],
index=0,
help=(
"An interactive build is fastest for one-off cleanup. Upload a JSON "
"when you have a fixed contract (a CRM import format, db schema). "
"Skip when you only want to rename or coerce specific columns."
),
)
schema: TargetSchema | None = None
if schema_mode.startswith("Upload"):
schema_file = st.file_uploader(
"Schema JSON",
type=["json"],
key="colmap_schema_upload",
help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
)
if schema_file is not None:
try:
schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
st.success(f"Loaded {len(schema.fields)} target field(s).")
except Exception as e:
from src.core.errors import format_for_user
st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
elif schema_mode.startswith("Build"):
st.caption(
"Edit the table to define your target schema. Add rows for fields the "
"input doesn't have yet (with a default), or remove rows for columns "
"you want to drop."
)
initial = pd.DataFrame({
"name": list(df.columns),
"dtype": ["auto"] * len(df.columns),
"required": [False] * len(df.columns),
"default": [""] * len(df.columns),
"aliases": [""] * len(df.columns),
})
edited = st.data_editor(
initial,
use_container_width=True,
num_rows="dynamic",
column_config={
"name": st.column_config.TextColumn("Target name"),
"dtype": st.column_config.SelectboxColumn(
"Type",
options=[
"auto", "string", "integer", "float",
"boolean", "date", "datetime", "category",
],
),
"required": st.column_config.CheckboxColumn("Required"),
"default": st.column_config.TextColumn("Default (for added cols)"),
"aliases": st.column_config.TextColumn(
"Aliases (comma-sep, helps fuzzy-match)",
),
},
key="colmap_schema_editor",
)
fields: list[TargetField] = []
for _, row in edited.iterrows():
    # Rows added through the dynamic editor carry None/NaN in every cell the
    # user has not filled in. Collapse those to None once, up front, so that
    # str(None) / str(nan) never leak in as a field named "None" or "nan" and
    # bool(nan) never marks a field as required.
    cells = {key: (None if pd.isna(val) else val) for key, val in row.items()}

    name = str(cells.get("name") or "").strip()
    if not name:
        # A row without a target name cannot describe a field; skip it.
        continue

    aliases = [
        a.strip() for a in str(cells.get("aliases") or "").split(",")
        if a.strip()
    ]

    # A blank default means "no default". Any other value (including 0 or
    # False) is passed through unchanged.
    default_val = cells.get("default")
    if isinstance(default_val, str) and default_val == "":
        default_val = None

    fields.append(TargetField(
        name=name,
        dtype=str(cells.get("dtype") or "auto"),  # type: ignore[arg-type]
        required=bool(cells.get("required")),
        aliases=aliases,
        default=default_val,
    ))
if fields:
    schema = TargetSchema(fields=fields)
st.divider()
# ---------------------------------------------------------------------------
# File upload (functional)
# Strategy
# ---------------------------------------------------------------------------
uploaded = st.file_uploader(
"Upload CSV or Excel file",
type=["csv", "tsv", "xlsx", "xls"],
help="Upload a file to preview. Processing is not yet available.",
key="colmap_file_upload",
st.subheader("Strategy")
preset_label = st.radio(
"Preset",
[
"rename-only (just rename, leave types alone, keep extras)",
"lenient-schema (rename + coerce + reorder, keep extras)",
"strict-schema (rename + coerce + reorder, drop extras)",
],
index=0,
)
preset_key = preset_label.split(" ", 1)[0]
options = MapOptions.from_preset(preset_key)
options.schema = schema
if uploaded is not None:
import pandas as pd
try:
if uploaded.name.endswith((".xlsx", ".xls")):
df = pd.read_excel(uploaded)
else:
df = pd.read_csv(uploaded)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.subheader("Column Mapping")
st.caption("Map source columns to target names. (Interactive mapping coming soon.)")
mapping_data = pd.DataFrame({
"Source Column": df.columns.tolist(),
"Target Column": df.columns.tolist(),
"Type": ["auto"] * len(df.columns),
})
st.dataframe(mapping_data, use_container_width=True, hide_index=True)
except Exception as e:
from src.core.errors import format_for_user
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
with st.expander("Advanced options"):
col_a, col_b = st.columns(2)
with col_a:
options.unmapped = st.selectbox( # type: ignore[assignment]
"Unmapped source columns",
["keep", "drop", "error"],
index=["keep", "drop", "error"].index(options.unmapped),
)
options.coerce_types = st.checkbox(
"Coerce types per schema", value=options.coerce_types,
)
options.reorder_to_schema = st.checkbox(
"Reorder to schema order", value=options.reorder_to_schema,
)
with col_b:
options.auto_infer = st.checkbox(
"Auto-infer mapping (fuzzy match)", value=options.auto_infer,
)
options.fuzzy_threshold = st.slider(
"Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
)
options.enforce_required = st.checkbox(
"Enforce required fields", value=options.enforce_required,
)
# ---------------------------------------------------------------------------
# Placeholder options
# Mapping editor — show inferred and let user override
# ---------------------------------------------------------------------------
st.subheader("Schema Options")
st.subheader("Mapping")
st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema")
st.checkbox("Drop unmapped columns", value=False, disabled=True)
st.checkbox("Reorder to match schema", value=True, disabled=True)
st.divider()
st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True)
if schema is None:
st.caption(
"No schema — define explicit renames below (left blank means keep "
"the source name)."
)
rename_initial = pd.DataFrame({
"source": list(df.columns),
"target": list(df.columns),
})
rename_edited = st.data_editor(
rename_initial,
use_container_width=True,
column_config={
"source": st.column_config.TextColumn("Source", disabled=True),
"target": st.column_config.TextColumn("Target"),
},
hide_index=True,
key="colmap_rename_only_editor",
)
explicit_mapping: dict[str, str] = {}
for _, row in rename_edited.iterrows():
src = str(row["source"])
tgt = str(row["target"]).strip()
if tgt and tgt != src:
explicit_mapping[src] = tgt
options.mapping = explicit_mapping
else:
inferred = (
infer_mapping(df, schema, threshold=options.fuzzy_threshold)
if options.auto_infer else {}
)
target_options = ["(unmapped)"] + schema.field_names()
map_initial = pd.DataFrame({
"source": list(df.columns),
"target": [inferred.get(c, "(unmapped)") for c in df.columns],
"auto": [c in inferred for c in df.columns],
})
map_edited = st.data_editor(
map_initial,
use_container_width=True,
column_config={
"source": st.column_config.TextColumn("Source", disabled=True),
"target": st.column_config.SelectboxColumn(
"Target", options=target_options,
),
"auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
},
hide_index=True,
key="colmap_schema_mapping_editor",
)
explicit_mapping = {}
for _, row in map_edited.iterrows():
src = str(row["source"])
tgt = str(row["target"])
if tgt and tgt != "(unmapped)":
explicit_mapping[src] = tgt
options.mapping = explicit_mapping
# Disable auto-infer for the actual run since the editor already shows
# the user's resolved choices (they can manually re-select to add).
options.auto_infer = False
# ---------------------------------------------------------------------------
# Footer
# Run
# ---------------------------------------------------------------------------
st.divider()
st.caption(
"Runs locally. Your data never leaves this computer. "
"| DataTools v3.0"
if st.button("Apply Column Mapping", type="primary", use_container_width=True):
with st.spinner("Mapping..."):
try:
result = map_columns(df, options)
except (ValueError, OSError) as e:
from src.core.errors import format_for_user
st.error(format_for_user(e))
st.stop()
st.session_state["colmap_result"] = result
st.session_state["colmap_input_name"] = uploaded.name
st.session_state["colmap_options"] = options.to_dict()
result = st.session_state.get("colmap_result")
if result is None:
st.info("Configure a mapping and click **Apply Column Mapping** to run.")
st.stop()
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
st.subheader("Results")
m1, m2, m3, m4 = st.columns(4)
m1.metric("Renamed", result.columns_renamed)
m2.metric("Dropped", len(result.columns_dropped))
m3.metric("Added", len(result.columns_added))
m4.metric(
"Coerce fails",
sum(result.coercion_failures.values()) if result.coercion_failures else 0,
)
if result.columns_dropped:
st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
if result.columns_added:
st.info(f"Added (with defaults): {', '.join(result.columns_added)}")
if result.coercion_failures:
st.warning(
"Some cells could not be coerced and were left as NaN: "
+ ", ".join(f"{c} ({n})" for c, n in result.coercion_failures.items())
)
if result.mapping:
st.markdown("**Resolved mapping**")
map_df = pd.DataFrame(
[
{"source": s, "target": t, "auto": s in result.inferred_pairs}
for s, t in result.mapping.items()
],
)
st.dataframe(map_df, use_container_width=True, hide_index=True)
st.markdown("**Mapped preview (first 10 rows)**")
st.dataframe(result.mapped_df.head(10), use_container_width=True)
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
st.divider()
stem = Path(st.session_state.get("colmap_input_name", "input")).stem
dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download mapped CSV",
data=mapped_bytes,
file_name=f"{stem}_mapped.csv",
mime="text/csv",
)
with dl_b:
audit_bytes = json.dumps({
"mapping": result.mapping,
"inferred_pairs": result.inferred_pairs,
"columns_renamed": result.columns_renamed,
"columns_dropped": result.columns_dropped,
"columns_added": result.columns_added,
"coercion_failures": result.coercion_failures,
"unmapped_kept": result.unmapped_kept,
"missing_required_targets": result.missing_required_targets,
}, indent=2, default=str).encode("utf-8")
st.download_button(
"Download mapping audit",
data=audit_bytes,
file_name=f"{stem}_mapping.json",
mime="application/json",
)
with dl_c:
config_bytes = json.dumps(
st.session_state.get("colmap_options", {}), indent=2, default=str,
).encode("utf-8")
st.download_button(
"Download config JSON",
data=config_bytes,
file_name="column_map_config.json",
mime="application/json",
)
st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")

View File

@@ -1,104 +1,370 @@
"""DataTools Pipeline Runner — stub page."""
"""DataTools Pipeline Runner — Streamlit page."""
from __future__ import annotations
import io
import json
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_normalization_gate,
)
from src.core.pipeline import (
Pipeline,
SOFT_DEPENDENCIES,
Step,
TOOL_NAMES,
recommended_pipeline,
run_pipeline,
validate_pipeline,
)
hide_streamlit_chrome()
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("⚙️ Pipeline Runner")
st.caption("Chain tools in sequence and pass output between steps automatically.")
st.info("This tool is under development.")
# ---------------------------------------------------------------------------
# What this tool will do
# ---------------------------------------------------------------------------
st.markdown("""
**Features:**
- Select tools to run in sequence
- Recommended order: Text Cleaner → Format Standardizer → Missing Values → Deduplicator → Validator
- Each step's output feeds into the next step's input
- Per-step configuration overrides
- Progress tracking across all steps
- Final combined report
""")
st.divider()
# ---------------------------------------------------------------------------
# File upload (functional)
# ---------------------------------------------------------------------------
uploaded = st.file_uploader(
"Upload CSV or Excel file",
type=["csv", "tsv", "xlsx", "xls"],
help="Upload a file to preview. Processing is not yet available.",
key="pipeline_file_upload",
st.caption(
"Chain DataTools cleaning steps into one repeatable workflow. The "
"pipeline recommends an order; you stay in control."
)
if uploaded is not None:
import pandas as pd
# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------
uploaded = pickup_or_upload(
label="Upload CSV or Excel file",
key="pipeline_file_upload",
types=["csv", "tsv", "xlsx", "xls"],
)
if uploaded is None:
st.info("Upload a CSV, TSV, or Excel file to begin.")
st.stop()
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse uploaded file bytes into a DataFrame.

    Args:
        name: Original file name. Only its suffix is used: ``.xlsx`` / ``.xls``
            select the Excel reader, ``.tsv`` selects a tab delimiter, and
            anything else is parsed as comma-delimited text.
        data: Raw file contents.

    Returns:
        The parsed DataFrame.

    Raises:
        Any error raised by ``pd.read_excel`` / ``pd.read_csv`` other than the
        UTF-8 decode failure that triggers the latin-1 fallback.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    sep = "\t" if suffix == ".tsv" else ","
    try:
        # utf-8-sig accepts plain UTF-8 as well and strips a leading BOM in
        # the codec, so a separate "utf-8" attempt adds nothing.
        return pd.read_csv(bio, encoding="utf-8-sig", sep=sep, on_bad_lines="warn")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a code point, so this attempt cannot fail
        # on decoding. It keeps the same delimiter and bad-line policy.
        bio.seek(0)
        return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
try:
df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
from src.core.errors import format_for_user
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
)
st.stop()
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider()
# ---------------------------------------------------------------------------
# Pipeline builder
# ---------------------------------------------------------------------------
st.subheader("Pipeline")
mode = st.radio(
"How would you like to define the pipeline?",
[
"Use the recommended default (text-clean → format → missing → dedup)",
"Build interactively",
"Upload a saved pipeline JSON",
],
index=0,
)
def _steps_to_rows(pipeline: Pipeline) -> pd.DataFrame:
    """Flatten a pipeline's steps into the editor table, one row per step.

    Each row holds the step's tool name, its enabled flag, and its options
    serialised as a JSON string so they can be edited in a text cell.
    """
    return pd.DataFrame([
        {
            "tool": s.tool, "enabled": s.enabled,
            "options_json": json.dumps(s.options),
        }
        for s in pipeline.steps
    ])


# Seed the editor table with the recommended pipeline on first load, and
# re-seed it on every rerun while the "recommended default" mode is selected.
if mode.startswith("Use the recommended") or "pipeline_rows" not in st.session_state:
    st.session_state["pipeline_rows"] = _steps_to_rows(recommended_pipeline())

if mode.startswith("Upload"):
    pipeline_file = st.file_uploader(
        "Pipeline JSON", type=["json"], key="pipeline_upload",
    )
    if pipeline_file is not None:
        try:
            data = json.loads(pipeline_file.getvalue())
            uploaded_pipe = Pipeline.from_dict(data)
            st.session_state["pipeline_rows"] = _steps_to_rows(uploaded_pipe)
            st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
        except Exception as e:
            # Page-level boundary: any parse/validation failure is shown to the
            # user and the previously stored table is left untouched.
            from src.core.errors import format_for_user
            st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
st.caption(
"Edit the table to add, remove, reorder (drag the row index), enable, "
"or configure each step. Tool order is recommended, not enforced — "
"violations surface as warnings below the table."
)
edited = st.data_editor(
st.session_state["pipeline_rows"],
use_container_width=True,
num_rows="dynamic",
column_config={
"tool": st.column_config.SelectboxColumn(
"Tool", options=TOOL_NAMES, required=True,
),
"enabled": st.column_config.CheckboxColumn("Enabled"),
"options_json": st.column_config.TextColumn(
"Options (JSON)",
help='e.g. {"column_types": {"phone": "phone"}}',
),
},
key="pipeline_editor",
)
st.session_state["pipeline_rows"] = edited
# Build a Pipeline object from the editor state.
# Rows without a tool are ignored. A row whose options cannot be decoded, or
# whose Step cannot be constructed, is recorded in parse_errors and left out.
steps_list: list[Step] = []
parse_errors: list[str] = []
# NOTE(review): `i` is the row's index label as yielded by iterrows(), and it is
# used as `i + 1` in the messages below — confirm the editor always hands back
# a consecutive integer index when rows are added or removed.
for i, row in edited.iterrows():
tool = row.get("tool")
if not tool or pd.isna(tool):
# No tool chosen for this row — skip it.
continue
# A missing, empty or NaN options cell is treated as an empty JSON object.
raw_opts = row.get("options_json") or "{}"
if pd.isna(raw_opts):
raw_opts = "{}"
try:
# NOTE(review): the input-file loading and preview statements that follow
# (down to st.dataframe) sit inside the per-row options `try` and read like
# lines from a different revision of this page — confirm they belong here.
if uploaded.name.endswith((".xlsx", ".xls")):
df = pd.read_excel(uploaded)
else:
df = pd.read_csv(uploaded)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
# Options are decoded from JSON text when the cell is a string; any other
# value is passed through dict(). The result must be a dict.
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
if not isinstance(opts, dict):
raise ValueError("options must be a JSON object")
except Exception as e:
from src.core.errors import format_for_user
# NOTE(review): this st.error( call has no closing parenthesis before the
# parse_errors.append line — likely residue of the earlier file-read error
# handler; confirm which of the two reports is intended.
st.error(
f"**Could not read `{uploaded.name}`**\n\n"
f"```\n{format_for_user(e)}\n```"
parse_errors.append(f"Step {i + 1}: {e}")
continue
try:
# Step construction may reject the row; that is reported the same way as an
# options failure.
steps_list.append(Step(
tool=str(tool),
options=opts,
enabled=bool(row.get("enabled", True)),
))
except Exception as e:
parse_errors.append(f"Step {i + 1}: {e}")
# Show every collected row error.
if parse_errors:
for err in parse_errors:
st.error(err)
# No Pipeline object is created when no row produced a usable step.
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
if current_pipeline is not None:
# Messages returned by validate_pipeline are displayed as a warning only.
warnings = validate_pipeline(current_pipeline)
if warnings:
st.warning(
"Pipeline is out of recommended order:\n\n"
+ "\n".join(f"- {w}" for w in warnings)
+ "\n\nThe pipeline will still run — these are recommendations only."
)
# ---------------------------------------------------------------------------
# Pipeline steps (checklist)
# ---------------------------------------------------------------------------
# Every widget in this checklist and in the configuration group below is created
# with disabled=True, so none of them accepts input; only their preset values
# are displayed.
st.subheader("Pipeline Steps")
st.caption("Select tools to include in the pipeline (recommended order):")
st.checkbox("1. Text Cleaner", value=True, disabled=True)
st.checkbox("2. Format Standardizer", value=True, disabled=True)
st.checkbox("3. Missing Value Handler", value=True, disabled=True)
st.checkbox("4. Column Mapper", value=False, disabled=True)
st.checkbox("5. Outlier Detector", value=False, disabled=True)
st.checkbox("6. Deduplicator", value=True, disabled=True)
st.checkbox("7. Multi-File Merger", value=False, disabled=True)
st.checkbox("8. Validator & Reporter", value=True, disabled=True)
st.subheader("Pipeline Configuration")
st.selectbox("On error", ["Stop pipeline", "Skip step and continue", "Prompt for decision"], disabled=True)
st.checkbox("Generate combined report at end", value=True, disabled=True)
# Collapsible explanation of the recommended ordering: one Markdown bullet per
# SOFT_DEPENDENCIES entry, each unpacked as (earlier tool, later tool, reason).
with st.expander("Recommended tool order — why each step belongs where it does"):
st.markdown(
"\n".join(
f"- **{e}** before **{l}** — {why}"
for e, l, why in SOFT_DEPENDENCIES
)
)
st.divider()
# NOTE(review): this permanently disabled button carries the same "Run Pipeline"
# label as the working button created in the Run section below — confirm the
# placeholder is still wanted alongside it.
st.button("Run Pipeline", type="primary", use_container_width=True, disabled=True)
# ---------------------------------------------------------------------------
# Footer
# Run
# ---------------------------------------------------------------------------
# The button stays disabled until the editor has produced at least one step.
run_disabled = current_pipeline is None or not current_pipeline.steps
if st.button(
    "Run Pipeline",
    type="primary",
    use_container_width=True,
    disabled=run_disabled,
):
    progress = st.progress(0.0, text="Starting...")
    log_box = st.empty()
    log_lines: list[str] = []
    total_enabled = sum(1 for s in current_pipeline.steps if s.enabled)
    # One-element list so the callback can bump the counter in place without
    # rebinding a name from the enclosing scope.
    completed = [0]

    def _on_step(sr) -> None:
        """Log one finished step and advance the progress bar.

        Args:
            sr: Step result handed over by ``run_pipeline``. Its
                ``step.display_name()``, ``skipped``, ``error`` and
                ``elapsed_seconds`` members are read here.
        """
        completed[0] += 1
        name = sr.step.display_name()
        if sr.skipped:
            log_lines.append(f"{name} (skipped)")
        elif sr.error:
            # Only the first line of the error keeps the live log compact.
            log_lines.append(f"{name} — {sr.error.splitlines()[0]}")
        else:
            log_lines.append(f"{name} — {sr.elapsed_seconds*1000:.0f} ms")
        # Two trailing spaces before the newline are a Markdown hard line
        # break; a bare "\n" would flow every entry into one paragraph.
        log_box.markdown("  \n".join(log_lines))
        # The callback count can exceed the number of enabled steps if it is
        # also invoked for steps not counted in total_enabled; st.progress
        # rejects floats above 1.0, so the ratio is clamped.
        progress.progress(
            min(completed[0] / max(total_enabled, 1), 1.0),
            text=f"Step {completed[0]}/{total_enabled}",
        )

    try:
        result = run_pipeline(
            df, current_pipeline,
            on_step_complete=_on_step,
            stop_on_error=False,
        )
    except Exception as e:
        # A failure of the run as a whole is shown in the page and ends this
        # script run; the statements below therefore only execute on success.
        from src.core.errors import format_for_user
        st.error(f"**Pipeline halted**\n\n```\n{format_for_user(e)}\n```")
        st.stop()
    progress.progress(1.0, text="Done")
    # Persist the outcome so the results and downloads survive later reruns.
    st.session_state["pipeline_result"] = result
    st.session_state["pipeline_input_name"] = uploaded.name
# Everything below needs a stored run; without one, show a hint and end the
# script run here.
result = st.session_state.get("pipeline_result")
if result is None:
    st.info(
        "Configure the pipeline above and click **Run Pipeline** to "
        "execute it on your file."
    )
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------
st.subheader("Results")

# Four headline figures, laid out left to right.
metric_slots = st.columns(4)
steps_run = len([sr for sr in result.step_results if not sr.skipped])
headline = (
    ("Initial rows", result.initial_rows),
    ("Final rows", result.final_rows),
    ("Steps run", steps_run),
    ("Elapsed", f"{result.total_elapsed:.2f} s"),
)
for slot, (label, value) in zip(metric_slots, headline):
    slot.metric(label, value)

# One table row per step result; "skipped" takes precedence over "error".
st.markdown("**Per-step summary**")
summary_rows = []
for sr in result.step_results:
    step_name = sr.step.display_name()
    if sr.skipped:
        status = "skipped"
    elif sr.error:
        status = "error"
    else:
        status = "ok"
    summary_rows.append({
        "step": step_name,
        "status": status,
        "elapsed_ms": int(sr.elapsed_seconds * 1000),
        # The serialised summary is cut to 200 characters for display.
        "summary": json.dumps(sr.summary, default=str)[:200],
        "error": sr.error or "",
    })
step_df = pd.DataFrame(summary_rows)
st.dataframe(step_df, use_container_width=True, hide_index=True)

st.markdown("**Output preview (first 10 rows)**")
preview = result.final_df.head(10)
st.dataframe(preview, use_container_width=True)
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
st.divider()
# NOTE(review): the two string pieces below concatenate to the same text as the
# footer caption emitted after the download buttons, so the notice is rendered
# twice on the page — confirm whether this earlier copy is still wanted.
st.caption(
"Runs locally. Your data never leaves this computer. "
"| DataTools v3.0"
)
# Download file names are derived from the stored input file name, falling
# back to "input" when none was recorded.
input_name = st.session_state.get("pipeline_input_name", "input")
stem = Path(input_name).stem
csv_col, pipeline_col, audit_col = st.columns(3)

with csv_col:
    # "utf-8-sig" writes a UTF-8 byte-order mark ahead of the CSV text.
    csv_text = result.final_df.to_csv(index=False)
    st.download_button(
        "Download cleaned CSV",
        data=csv_text.encode("utf-8-sig"),
        file_name=f"{stem}_pipeline.csv",
        mime="text/csv",
    )

with pipeline_col:
    # Export the pipeline currently built from the editor, or an empty step
    # list when there is none.
    if current_pipeline:
        pipeline_doc = current_pipeline.to_dict()
    else:
        pipeline_doc = {"steps": []}
    pipeline_json = json.dumps(pipeline_doc, indent=2, default=str)
    st.download_button(
        "Download pipeline JSON",
        data=pipeline_json.encode("utf-8"),
        file_name="pipeline.json",
        mime="application/json",
        help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
    )

with audit_col:
    # Run audit: overall figures followed by one entry per step result.
    step_entries = []
    for sr in result.step_results:
        step_entries.append({
            "tool": sr.step.tool,
            "name": sr.step.display_name(),
            "enabled": sr.step.enabled,
            "skipped": sr.skipped,
            "elapsed_seconds": sr.elapsed_seconds,
            "summary": sr.summary,
            "error": sr.error,
        })
    audit_doc = {
        "warnings": result.warnings,
        "initial_rows": result.initial_rows,
        "final_rows": result.final_rows,
        "total_elapsed_seconds": result.total_elapsed,
        "steps": step_entries,
    }
    audit_json = json.dumps(audit_doc, indent=2, default=str)
    st.download_button(
        "Download run audit",
        data=audit_json.encode("utf-8"),
        file_name=f"{stem}_pipeline_audit.json",
        mime="application/json",
    )

# Page footer.
st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")

View File

@@ -78,7 +78,7 @@ TOOLS: list[Tool] = [
"Detect disguised nulls, missingness analysis, and imputation strategies."
),
page_slug="4_Missing_Values",
status="Coming Soon",
status="Ready",
),
Tool(
tool_id="05_column_mapper",
@@ -86,7 +86,7 @@ TOOLS: list[Tool] = [
name="Column Mapper",
description="Rename columns, enforce a target schema, and coerce types.",
page_slug="5_Column_Mapper",
status="Coming Soon",
status="Ready",
),
Tool(
tool_id="06_outlier_detector",
@@ -125,7 +125,7 @@ TOOLS: list[Tool] = [
"Chain tools in recommended order and pass output between steps."
),
page_slug="9_Pipeline_Runner",
status="Coming Soon",
status="Ready",
),
]