feat: 3 new tools, format streaming, distribution-ready demo + landing pages
Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy is installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,102 +1,413 @@
|
||||
"""DataTools Column Mapper — stub page."""
|
||||
"""DataTools Column Mapper — Streamlit page."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
||||
from src.gui.components import (
|
||||
hide_streamlit_chrome,
|
||||
pickup_or_upload,
|
||||
require_normalization_gate,
|
||||
)
|
||||
from src.core.column_mapper import (
|
||||
MapOptions,
|
||||
PRESETS,
|
||||
TargetField,
|
||||
TargetSchema,
|
||||
infer_mapping,
|
||||
map_columns,
|
||||
)
|
||||
|
||||
hide_streamlit_chrome()
|
||||
require_normalization_gate()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("🗂️ Column Mapper")
|
||||
st.caption("Rename columns, enforce a target schema, and coerce types.")
|
||||
st.caption(
|
||||
"Rename columns, enforce a target schema, and coerce types. Runs locally — "
|
||||
"your data never leaves this computer."
|
||||
)
|
||||
|
||||
st.info("This tool is under development.")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# What this tool will do
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.markdown("""
|
||||
**Features:**
|
||||
- Rename columns via interactive mapping table
|
||||
- Load a target schema (JSON/CSV) to auto-map columns
|
||||
- Fuzzy column name matching for automatic suggestions
|
||||
- Type coercion (string → int, string → date, etc.)
|
||||
- Drop unmapped columns or keep as-is
|
||||
- Reorder columns to match target schema
|
||||
""")
|
||||
uploaded = pickup_or_upload(
|
||||
label="Upload CSV or Excel file",
|
||||
key="colmap_file_upload",
|
||||
types=["csv", "tsv", "xlsx", "xls"],
|
||||
)
|
||||
|
||||
if uploaded is None:
|
||||
st.info("Upload a CSV, TSV, or Excel file to begin.")
|
||||
st.stop()
|
||||
|
||||
|
||||
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse the bytes of an uploaded file into a DataFrame.

    The reader is chosen from the extension of ``name``: ``.xlsx`` / ``.xls``
    are read with ``pd.read_excel``; anything else is read as delimited text,
    tab-separated for ``.tsv`` and comma-separated otherwise.

    Text is decoded as ``utf-8-sig`` first, which accepts plain UTF-8 as well
    as UTF-8 with a leading BOM. If that raises ``UnicodeDecodeError`` the
    data is re-read as Latin-1, which maps every byte value and therefore
    cannot fail to decode.

    Args:
        name: Original file name; only its suffix is inspected.
        data: Raw file content.

    Returns:
        The parsed DataFrame.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data))

    # The separator depends only on the extension, so compute it once.
    sep = "\t" if suffix == ".tsv" else ","
    try:
        # A fresh buffer per attempt avoids relying on the state a failed
        # read leaves the previous buffer in.
        return pd.read_csv(
            io.BytesIO(data), encoding="utf-8-sig", sep=sep, on_bad_lines="warn",
        )
    except UnicodeDecodeError:
        # Last resort: keep the same separator and bad-line policy as the
        # first attempt so TSV files are not silently parsed as CSV.
        return pd.read_csv(
            io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn",
        )
|
||||
|
||||
|
||||
try:
|
||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||
except Exception as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(
|
||||
f"**Could not read `{uploaded.name}`**\n\n"
|
||||
f"```\n{format_for_user(e)}\n```"
|
||||
)
|
||||
st.stop()
|
||||
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schema input
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Target schema")
|
||||
|
||||
schema_mode = st.radio(
|
||||
"How would you like to define the target schema?",
|
||||
[
|
||||
"Build interactively (start from current columns)",
|
||||
"Upload schema JSON",
|
||||
"Skip (rename / coerce only — no schema)",
|
||||
],
|
||||
index=0,
|
||||
help=(
|
||||
"An interactive build is fastest for one-off cleanup. Upload a JSON "
|
||||
"when you have a fixed contract (a CRM import format, db schema). "
|
||||
"Skip when you only want to rename or coerce specific columns."
|
||||
),
|
||||
)
|
||||
|
||||
schema: TargetSchema | None = None
|
||||
|
||||
def _is_missing_cell(value: object) -> bool:
    """Return True when an editor cell holds no value (None, NaN or pd.NA)."""
    if value is None or value is pd.NA:
        return True
    return isinstance(value, float) and pd.isna(value)


# Build ``schema`` from whichever input mode the user picked. In "Skip" mode
# neither branch runs and ``schema`` keeps the value assigned before this
# statement.
if schema_mode.startswith("Upload"):
    schema_file = st.file_uploader(
        "Schema JSON",
        type=["json"],
        key="colmap_schema_upload",
        help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
    )
    if schema_file is not None:
        try:
            schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
            st.success(f"Loaded {len(schema.fields)} target field(s).")
        except Exception as e:
            # Page-level boundary: report any parse/validation failure to the
            # user instead of crashing the page.
            from src.core.errors import format_for_user
            st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")

elif schema_mode.startswith("Build"):
    st.caption(
        "Edit the table to define your target schema. Add rows for fields the "
        "input doesn't have yet (with a default), or remove rows for columns "
        "you want to drop."
    )
    # Seed the editor with one row per input column.
    column_count = len(df.columns)
    initial = pd.DataFrame({
        "name": list(df.columns),
        "dtype": ["auto"] * column_count,
        "required": [False] * column_count,
        "default": [""] * column_count,
        "aliases": [""] * column_count,
    })
    edited = st.data_editor(
        initial,
        use_container_width=True,
        num_rows="dynamic",
        column_config={
            "name": st.column_config.TextColumn("Target name"),
            "dtype": st.column_config.SelectboxColumn(
                "Type",
                options=[
                    "auto", "string", "integer", "float",
                    "boolean", "date", "datetime", "category",
                ],
            ),
            "required": st.column_config.CheckboxColumn("Required"),
            "default": st.column_config.TextColumn("Default (for added cols)"),
            "aliases": st.column_config.TextColumn(
                "Aliases (comma-sep, helps fuzzy-match)",
            ),
        },
        key="colmap_schema_editor",
    )

    fields: list[TargetField] = []
    for _, row in edited.iterrows():
        # Rows added in the editor can carry empty cells; check for missing
        # values before any str()/bool() conversion so they do not turn into
        # the literal text "None"/"nan" or a truthy NaN.
        name_raw = row.get("name", "")
        name = "" if _is_missing_cell(name_raw) else str(name_raw).strip()
        if not name:
            continue

        aliases_raw = row.get("aliases", "")
        aliases_text = "" if _is_missing_cell(aliases_raw) else str(aliases_raw)
        aliases = [
            alias.strip() for alias in aliases_text.split(",") if alias.strip()
        ]

        default_raw = row.get("default")
        if _is_missing_cell(default_raw) or default_raw == "":
            default_val = None
        else:
            default_val = default_raw

        dtype_raw = row.get("dtype", "auto")
        dtype = "auto" if _is_missing_cell(dtype_raw) else str(dtype_raw)

        required_raw = row.get("required", False)
        required = False if _is_missing_cell(required_raw) else bool(required_raw)

        fields.append(TargetField(
            name=name,
            dtype=dtype,  # type: ignore[arg-type]
            required=required,
            aliases=aliases,
            default=default_val,
        ))

    # An editor with no named rows leaves ``schema`` unset.
    if fields:
        schema = TargetSchema(fields=fields)
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload (functional)
|
||||
# Strategy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Upload a file to preview. Processing is not yet available.",
|
||||
key="colmap_file_upload",
|
||||
st.subheader("Strategy")
|
||||
|
||||
preset_label = st.radio(
|
||||
"Preset",
|
||||
[
|
||||
"rename-only (just rename, leave types alone, keep extras)",
|
||||
"lenient-schema (rename + coerce + reorder, keep extras)",
|
||||
"strict-schema (rename + coerce + reorder, drop extras)",
|
||||
],
|
||||
index=0,
|
||||
)
|
||||
preset_key = preset_label.split(" ", 1)[0]
|
||||
options = MapOptions.from_preset(preset_key)
|
||||
options.schema = schema
|
||||
|
||||
if uploaded is not None:
|
||||
import pandas as pd
|
||||
try:
|
||||
if uploaded.name.endswith((".xlsx", ".xls")):
|
||||
df = pd.read_excel(uploaded)
|
||||
else:
|
||||
df = pd.read_csv(uploaded)
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
|
||||
st.subheader("Column Mapping")
|
||||
st.caption("Map source columns to target names. (Interactive mapping coming soon.)")
|
||||
mapping_data = pd.DataFrame({
|
||||
"Source Column": df.columns.tolist(),
|
||||
"Target Column": df.columns.tolist(),
|
||||
"Type": ["auto"] * len(df.columns),
|
||||
})
|
||||
st.dataframe(mapping_data, use_container_width=True, hide_index=True)
|
||||
except Exception as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(
|
||||
f"**Could not read `{uploaded.name}`**\n\n"
|
||||
f"```\n{format_for_user(e)}\n```"
|
||||
with st.expander("Advanced options"):
|
||||
col_a, col_b = st.columns(2)
|
||||
with col_a:
|
||||
options.unmapped = st.selectbox( # type: ignore[assignment]
|
||||
"Unmapped source columns",
|
||||
["keep", "drop", "error"],
|
||||
index=["keep", "drop", "error"].index(options.unmapped),
|
||||
)
|
||||
options.coerce_types = st.checkbox(
|
||||
"Coerce types per schema", value=options.coerce_types,
|
||||
)
|
||||
options.reorder_to_schema = st.checkbox(
|
||||
"Reorder to schema order", value=options.reorder_to_schema,
|
||||
)
|
||||
with col_b:
|
||||
options.auto_infer = st.checkbox(
|
||||
"Auto-infer mapping (fuzzy match)", value=options.auto_infer,
|
||||
)
|
||||
options.fuzzy_threshold = st.slider(
|
||||
"Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
|
||||
)
|
||||
options.enforce_required = st.checkbox(
|
||||
"Enforce required fields", value=options.enforce_required,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Placeholder options
|
||||
# Mapping editor — show inferred and let user override
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Schema Options")
|
||||
st.subheader("Mapping")
|
||||
|
||||
st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema")
|
||||
st.checkbox("Drop unmapped columns", value=False, disabled=True)
|
||||
st.checkbox("Reorder to match schema", value=True, disabled=True)
|
||||
|
||||
st.divider()
|
||||
st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True)
|
||||
def _mapping_cell_text(value: object) -> str:
    """Return a mapping-editor cell as text; None, NaN and pd.NA become ""."""
    if value is None or value is pd.NA:
        return ""
    if isinstance(value, float) and pd.isna(value):
        return ""
    return str(value)


# Collect the explicit source -> target mapping from the editor that matches
# the current mode, and store it on ``options.mapping``.
if schema is None:
    st.caption(
        "No schema — define explicit renames below (left blank means keep "
        "the source name)."
    )
    rename_initial = pd.DataFrame({
        "source": list(df.columns),
        "target": list(df.columns),
    })
    rename_edited = st.data_editor(
        rename_initial,
        use_container_width=True,
        column_config={
            "source": st.column_config.TextColumn("Source", disabled=True),
            "target": st.column_config.TextColumn("Target"),
        },
        hide_index=True,
        key="colmap_rename_only_editor",
    )
    explicit_mapping: dict[str, str] = {}
    for _, row in rename_edited.iterrows():
        src = str(row["source"])
        # A cleared cell must count as blank ("keep the source name"), not as
        # a rename to the literal text "None".
        tgt = _mapping_cell_text(row["target"]).strip()
        if tgt and tgt != src:
            explicit_mapping[src] = tgt
    options.mapping = explicit_mapping
else:
    if options.auto_infer:
        inferred = infer_mapping(df, schema, threshold=options.fuzzy_threshold)
    else:
        inferred = {}
    target_options = ["(unmapped)"] + schema.field_names()
    map_initial = pd.DataFrame({
        "source": list(df.columns),
        "target": [inferred.get(c, "(unmapped)") for c in df.columns],
        "auto": [c in inferred for c in df.columns],
    })
    map_edited = st.data_editor(
        map_initial,
        use_container_width=True,
        column_config={
            "source": st.column_config.TextColumn("Source", disabled=True),
            "target": st.column_config.SelectboxColumn(
                "Target", options=target_options,
            ),
            "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
        },
        hide_index=True,
        key="colmap_schema_mapping_editor",
    )
    explicit_mapping = {}
    for _, row in map_edited.iterrows():
        src = str(row["source"])
        # An emptied selection is treated the same as "(unmapped)".
        tgt = _mapping_cell_text(row["target"])
        if tgt and tgt != "(unmapped)":
            explicit_mapping[src] = tgt
    options.mapping = explicit_mapping
    # Disable auto-infer for the actual run since the editor already shows
    # the user's resolved choices (they can manually re-select to add).
    options.auto_infer = False
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Footer
|
||||
# Run
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
st.caption(
|
||||
"Runs locally. Your data never leaves this computer. "
|
||||
"| DataTools v3.0"
|
||||
|
||||
if st.button("Apply Column Mapping", type="primary", use_container_width=True):
|
||||
with st.spinner("Mapping..."):
|
||||
try:
|
||||
result = map_columns(df, options)
|
||||
except (ValueError, OSError) as e:
|
||||
from src.core.errors import format_for_user
|
||||
st.error(format_for_user(e))
|
||||
st.stop()
|
||||
st.session_state["colmap_result"] = result
|
||||
st.session_state["colmap_input_name"] = uploaded.name
|
||||
st.session_state["colmap_options"] = options.to_dict()
|
||||
|
||||
result = st.session_state.get("colmap_result")
|
||||
if result is None:
|
||||
st.info("Configure a mapping and click **Apply Column Mapping** to run.")
|
||||
st.stop()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.subheader("Results")
|
||||
|
||||
m1, m2, m3, m4 = st.columns(4)
|
||||
m1.metric("Renamed", result.columns_renamed)
|
||||
m2.metric("Dropped", len(result.columns_dropped))
|
||||
m3.metric("Added", len(result.columns_added))
|
||||
m4.metric(
|
||||
"Coerce fails",
|
||||
sum(result.coercion_failures.values()) if result.coercion_failures else 0,
|
||||
)
|
||||
|
||||
if result.columns_dropped:
|
||||
st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
|
||||
if result.columns_added:
|
||||
st.info(f"Added (with defaults): {', '.join(result.columns_added)}")
|
||||
if result.coercion_failures:
|
||||
st.warning(
|
||||
"Some cells could not be coerced and were left as NaN: "
|
||||
+ ", ".join(f"{c} ({n})" for c, n in result.coercion_failures.items())
|
||||
)
|
||||
|
||||
if result.mapping:
|
||||
st.markdown("**Resolved mapping**")
|
||||
map_df = pd.DataFrame(
|
||||
[
|
||||
{"source": s, "target": t, "auto": s in result.inferred_pairs}
|
||||
for s, t in result.mapping.items()
|
||||
],
|
||||
)
|
||||
st.dataframe(map_df, use_container_width=True, hide_index=True)
|
||||
|
||||
st.markdown("**Mapped preview (first 10 rows)**")
|
||||
st.dataframe(result.mapped_df.head(10), use_container_width=True)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
stem = Path(st.session_state.get("colmap_input_name", "input")).stem
|
||||
|
||||
dl_a, dl_b, dl_c = st.columns(3)
|
||||
with dl_a:
|
||||
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download mapped CSV",
|
||||
data=mapped_bytes,
|
||||
file_name=f"{stem}_mapped.csv",
|
||||
mime="text/csv",
|
||||
)
|
||||
with dl_b:
|
||||
audit_bytes = json.dumps({
|
||||
"mapping": result.mapping,
|
||||
"inferred_pairs": result.inferred_pairs,
|
||||
"columns_renamed": result.columns_renamed,
|
||||
"columns_dropped": result.columns_dropped,
|
||||
"columns_added": result.columns_added,
|
||||
"coercion_failures": result.coercion_failures,
|
||||
"unmapped_kept": result.unmapped_kept,
|
||||
"missing_required_targets": result.missing_required_targets,
|
||||
}, indent=2, default=str).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download mapping audit",
|
||||
data=audit_bytes,
|
||||
file_name=f"{stem}_mapping.json",
|
||||
mime="application/json",
|
||||
)
|
||||
with dl_c:
|
||||
config_bytes = json.dumps(
|
||||
st.session_state.get("colmap_options", {}), indent=2, default=str,
|
||||
).encode("utf-8")
|
||||
st.download_button(
|
||||
"Download config JSON",
|
||||
data=config_bytes,
|
||||
file_name="column_map_config.json",
|
||||
mime="application/json",
|
||||
)
|
||||
|
||||
st.divider()
|
||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||
|
||||
Reference in New Issue
Block a user