feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready): 04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI 05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI 09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI with soft tool-dependency graph (recommended, not enforced) and JSON save/load for repeatable weekly cleanups. Format Standardizer reworked for 1 GB international files: • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email • Per-row country / address columns drive parsing • Audit cap (default 10 k rows, ~50 MB RAM) • standardize_file(): chunked streaming entry point (~165 k rows/sec) • currency_decimal="auto" for EU comma-decimal locales • R$ / kr / zł multi-char currency prefixes • cli_format.py with auto-stream above 100 MB inputs Encoding detection arbiter + language-aware probe: Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM) via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes. Distribution-readiness assets: • streamlit_app.py — Streamlit Community Cloud entry shim • src/gui/app_demo.py — single-page demo, ?p=<persona> routing, 100-row cap + watermark, free-vs-paid boundary enforced at surface • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS, deploy.py URL-substitution script, auto-generated robots.txt + sitemap.xml + 404.html + favicon • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md — full strategy + measurement + deployment + master checklist Test counts: before: 1,520 passed · 4 skipped · 17 xfailed after: 1,729 passed · 0 skipped · 0 xfailed Tier-1 corpora added: • missing-corpus 3 use cases + 16 edge cases • column-mapper-corpus 3 use cases + 5 edge cases • format-cleaner intl 20-row 13-country stress fixture Engine hardening flushed out by the corpora: • interpolate guards against object-dtype columns • mean/median skip all-NaN columns 
(silences numpy warning) • fillna runs under future.no_silent_downcasting (silences pandas warning) • mojibake test no longer skips when ftfy installed (monkeypatch path) • drop-row threshold semantics: strict-greater (consistent across rows / cols) • currency_decimal validator allow-set updated for "auto" Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/src/gui/pages/5_Column_Mapper.py
+++ b/src/gui/pages/5_Column_Mapper.py
@@ -1,102 +1,413 @@
-"""DataTools Column Mapper — stub page."""
+"""DataTools Column Mapper — Streamlit page."""

 from __future__ import annotations

+import io
+import json
 import sys
 from pathlib import Path

+import pandas as pd
 import streamlit as st

 _project_root = Path(__file__).resolve().parent.parent.parent.parent
 if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

-from src.gui.components import hide_streamlit_chrome, require_normalization_gate
+from src.gui.components import (
+    hide_streamlit_chrome,
+    pickup_or_upload,
+    require_normalization_gate,
+)
+from src.core.column_mapper import (
+    MapOptions,
+    PRESETS,
+    TargetField,
+    TargetSchema,
+    infer_mapping,
+    map_columns,
+)

 hide_streamlit_chrome()
 require_normalization_gate()

+
 # ---------------------------------------------------------------------------
 # Header
 # ---------------------------------------------------------------------------

 st.title("🗂️ Column Mapper")
-st.caption("Rename columns, enforce a target schema, and coerce types.")
+st.caption(
+    "Rename columns, enforce a target schema, and coerce types. Runs locally — "
+    "your data never leaves this computer."
+)

-st.info("This tool is under development.")

 # ---------------------------------------------------------------------------
-# What this tool will do
+# File upload
 # ---------------------------------------------------------------------------

-st.markdown("""
-**Features:**
- Rename columns via interactive mapping table
- Load a target schema (JSON/CSV) to auto-map columns
- Fuzzy column name matching for automatic suggestions
- Type coercion (string → int, string → date, etc.)
- Drop unmapped columns or keep as-is
- Reorder columns to match target schema
-""")
+# Obtain the input file through the shared component; a None result means
+# there is nothing to process yet.
+_accepted_types = ["csv", "tsv", "xlsx", "xls"]
+uploaded = pickup_or_upload(
+    types=_accepted_types,
+    key="colmap_file_upload",
+    label="Upload CSV or Excel file",
+)
+
+# Guard clause: without a file the rest of the page cannot render, so show
+# a prompt and halt this script run.
+if uploaded is None:
+    st.info("Upload a CSV, TSV, or Excel file to begin.")
+    st.stop()
+
+
+@st.cache_data(show_spinner=False)
+def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
+    """Parse an uploaded file's raw bytes into a DataFrame.
+
+    Args:
+        name: Original file name. Only its suffix is used, to pick the
+            parser: ``.xlsx`` / ``.xls`` go to the Excel reader, ``.tsv`` is
+            read tab-separated, anything else comma-separated.
+        data: Raw file content.
+
+    Returns:
+        The parsed DataFrame.
+
+    Raises:
+        Exception: Whatever pandas raises for content it cannot parse. Only
+            ``UnicodeDecodeError`` is handled here, by moving on to the next
+            candidate encoding.
+    """
+    suffix = Path(name).suffix.lower()
+    bio = io.BytesIO(data)
+    if suffix in (".xlsx", ".xls"):
+        return pd.read_excel(bio)
+    # The delimiter depends only on the suffix, so resolve it once rather
+    # than on every encoding attempt.
+    sep = "\t" if suffix == ".tsv" else ","
+    for enc in ("utf-8", "utf-8-sig", "latin-1"):
+        # Rewind before each attempt: a failed parse leaves the buffer
+        # position wherever the reader stopped.
+        bio.seek(0)
+        try:
+            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
+        except UnicodeDecodeError:
+            continue
+    # Last resort. latin-1 maps every byte value, so the loop above normally
+    # returns or raises before getting here; the same separator and bad-line
+    # policy are used so a TSV can never be silently parsed as comma-separated.
+    bio.seek(0)
+    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
+
+
+# Parse the upload; any failure is reported in-page and ends this run.
+try:
+    df = _read_uploaded(uploaded.name, uploaded.getvalue())
+except Exception as read_error:
+    from src.core.errors import format_for_user
+
+    failure_detail = format_for_user(read_error)
+    st.error(f"**Could not read `{uploaded.name}`**\n\n```\n{failure_detail}\n```")
+    st.stop()
+
+# Quick look at what was loaded: a size summary plus the first ten rows.
+row_count, column_count = df.shape
+st.subheader(f"Preview: {uploaded.name}")
+st.caption(f"{row_count} rows, {column_count} columns")
+st.dataframe(df.head(10), use_container_width=True)
+st.divider()
+
+# ---------------------------------------------------------------------------
+# Schema input
+# ---------------------------------------------------------------------------
+
+st.subheader("Target schema")
+
+# The three ways of supplying a schema. The label text itself is the value
+# returned by the radio widget.
+_schema_mode_choices = [
+    "Build interactively (start from current columns)",
+    "Upload schema JSON",
+    "Skip (rename / coerce only — no schema)",
+]
+_schema_mode_help = (
+    "An interactive build is fastest for one-off cleanup. Upload a JSON "
+    "when you have a fixed contract (a CRM import format, db schema). "
+    "Skip when you only want to rename or coerce specific columns."
+)
+schema_mode = st.radio(
+    "How would you like to define the target schema?",
+    _schema_mode_choices,
+    index=0,
+    help=_schema_mode_help,
+)
+
+def _schema_cell_or_none(value):
+    """Normalise one schema-editor cell: None, NaN/NA and "" all become None.
+
+    Any other value is returned unchanged.
+    """
+    if value is None:
+        return None
+    try:
+        if pd.isna(value):
+            return None
+    except (TypeError, ValueError):
+        # Not a scalar pandas can test for missingness; treat it as a value.
+        pass
+    if isinstance(value, str) and value == "":
+        return None
+    return value
+
+
+# Stays None unless one of the branches below produces a usable schema
+# (the "Skip" mode deliberately leaves it as None).
+schema: TargetSchema | None = None
+
+if schema_mode.startswith("Upload"):
+    schema_file = st.file_uploader(
+        "Schema JSON",
+        type=["json"],
+        key="colmap_schema_upload",
+        help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
+    )
+    if schema_file is not None:
+        # Malformed JSON and schema-validation failures are both reported
+        # in-page; the schema then simply stays None.
+        try:
+            schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
+            st.success(f"Loaded {len(schema.fields)} target field(s).")
+        except Exception as e:
+            from src.core.errors import format_for_user
+            st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
+
+elif schema_mode.startswith("Build"):
+    st.caption(
+        "Edit the table to define your target schema. Add rows for fields the "
+        "input doesn't have yet (with a default), or remove rows for columns "
+        "you want to drop."
+    )
+    # Seed the editor with one row per input column.
+    initial = pd.DataFrame({
+        "name": list(df.columns),
+        "dtype": ["auto"] * len(df.columns),
+        "required": [False] * len(df.columns),
+        "default": [""] * len(df.columns),
+        "aliases": [""] * len(df.columns),
+    })
+    edited = st.data_editor(
+        initial,
+        use_container_width=True,
+        num_rows="dynamic",
+        column_config={
+            "name": st.column_config.TextColumn("Target name"),
+            "dtype": st.column_config.SelectboxColumn(
+                "Type",
+                options=[
+                    "auto", "string", "integer", "float",
+                    "boolean", "date", "datetime", "category",
+                ],
+            ),
+            "required": st.column_config.CheckboxColumn("Required"),
+            "default": st.column_config.TextColumn("Default (for added cols)"),
+            "aliases": st.column_config.TextColumn(
+                "Aliases (comma-sep, helps fuzzy-match)",
+            ),
+        },
+        key="colmap_schema_editor",
+    )
+    fields: list[TargetField] = []
+    for _, row in edited.iterrows():
+        # Rows added in the editor carry empty (None) cells until filled in;
+        # passing those through str() would create a field literally named
+        # "None", so every cell is normalised first.
+        name_raw = _schema_cell_or_none(row.get("name"))
+        name = "" if name_raw is None else str(name_raw).strip()
+        if not name:
+            continue
+        aliases_raw = _schema_cell_or_none(row.get("aliases"))
+        aliases_text = "" if aliases_raw is None else str(aliases_raw)
+        aliases = [a.strip() for a in aliases_text.split(",") if a.strip()]
+        dtype_raw = _schema_cell_or_none(row.get("dtype"))
+        # bool(None) is False, so an untouched checkbox cell means "not required".
+        required_raw = _schema_cell_or_none(row.get("required"))
+        fields.append(TargetField(
+            name=name,
+            dtype="auto" if dtype_raw is None else str(dtype_raw),  # type: ignore[arg-type]
+            required=bool(required_raw),
+            aliases=aliases,
+            default=_schema_cell_or_none(row.get("default")),
+        ))
+    if fields:
+        schema = TargetSchema(fields=fields)

 st.divider()

 # ---------------------------------------------------------------------------
-# File upload (functional)
+# Strategy
 # ---------------------------------------------------------------------------

-uploaded = st.file_uploader(
-    "Upload CSV or Excel file",
-    type=["csv", "tsv", "xlsx", "xls"],
-    help="Upload a file to preview. Processing is not yet available.",
-    key="colmap_file_upload",
+st.subheader("Strategy")
+
+# Each label begins with the preset key followed by a space and a
+# parenthesised description shown only for the user's benefit.
+preset_label = st.radio(
+    "Preset",
+    [
+        "rename-only (just rename, leave types alone, keep extras)",
+        "lenient-schema (rename + coerce + reorder, keep extras)",
+        "strict-schema (rename + coerce + reorder, drop extras)",
+    ],
+    index=0,
 )
+# Keep only the text before the first space, e.g. "rename-only".
+preset_key = preset_label.split(" ", 1)[0]
+options = MapOptions.from_preset(preset_key)
+# Attach the schema chosen above (None when no schema was defined).
+options.schema = schema

-if uploaded is not None:
-    import pandas as pd
-    try:
-        if uploaded.name.endswith((".xlsx", ".xls")):
-            df = pd.read_excel(uploaded)
-        else:
-            df = pd.read_csv(uploaded)
-        st.subheader(f"Preview: {uploaded.name}")
-        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
-        st.dataframe(df.head(10), use_container_width=True)
-
-        st.subheader("Column Mapping")
-        st.caption("Map source columns to target names. (Interactive mapping coming soon.)")
-        mapping_data = pd.DataFrame({
-            "Source Column": df.columns.tolist(),
-            "Target Column": df.columns.tolist(),
-            "Type": ["auto"] * len(df.columns),
-        })
-        st.dataframe(mapping_data, use_container_width=True, hide_index=True)
-    except Exception as e:
-        from src.core.errors import format_for_user
-        st.error(
-            f"**Could not read `{uploaded.name}`**\n\n"
-            f"```\n{format_for_user(e)}\n```"
+# Every widget below is seeded from the preset's value and writes the
+# user's choice straight back onto `options`.
+with st.expander("Advanced options"):
+    col_a, col_b = st.columns(2)
+    with col_a:
+        # The initial selection is the position of the preset's value in the
+        # choice list; .index() raises ValueError for any other value.
+        options.unmapped = st.selectbox(  # type: ignore[assignment]
+            "Unmapped source columns",
+            ["keep", "drop", "error"],
+            index=["keep", "drop", "error"].index(options.unmapped),
+        )
+        options.coerce_types = st.checkbox(
+            "Coerce types per schema", value=options.coerce_types,
+        )
+        options.reorder_to_schema = st.checkbox(
+            "Reorder to schema order", value=options.reorder_to_schema,
+        )
+    with col_b:
+        options.auto_infer = st.checkbox(
+            "Auto-infer mapping (fuzzy match)", value=options.auto_infer,
+        )
+        # Slider arguments: min 0.0, max 1.0, initial value from the preset,
+        # step 0.05.
+        options.fuzzy_threshold = st.slider(
+            "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
+        )
+        options.enforce_required = st.checkbox(
+            "Enforce required fields", value=options.enforce_required,
         )

 # ---------------------------------------------------------------------------
-# Placeholder options
+# Mapping editor — show inferred and let user override
 # ---------------------------------------------------------------------------

-st.subheader("Schema Options")
+st.subheader("Mapping")

-st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema")
-st.checkbox("Drop unmapped columns", value=False, disabled=True)
-st.checkbox("Reorder to match schema", value=True, disabled=True)
-
-st.divider()
-st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True)
+def _mapping_cell_text(value) -> str:
+    """Return a mapping-editor cell as text; empty cells (None, NaN/NA) give ""."""
+    if value is None:
+        return ""
+    try:
+        if pd.isna(value):
+            return ""
+    except (TypeError, ValueError):
+        # Not something pandas can test for missingness; fall through to str().
+        pass
+    return str(value)
+
+
+if schema is None:
+    # No schema: the user types free-form target names per source column.
+    st.caption(
+        "No schema — define explicit renames below (left blank means keep "
+        "the source name)."
+    )
+    rename_initial = pd.DataFrame({
+        "source": list(df.columns),
+        "target": list(df.columns),
+    })
+    rename_edited = st.data_editor(
+        rename_initial,
+        use_container_width=True,
+        column_config={
+            "source": st.column_config.TextColumn("Source", disabled=True),
+            "target": st.column_config.TextColumn("Target"),
+        },
+        hide_index=True,
+        key="colmap_rename_only_editor",
+    )
+    explicit_mapping: dict[str, str] = {}
+    for _, row in rename_edited.iterrows():
+        src = str(row["source"])
+        # A cleared cell comes back empty (None); per the caption that must
+        # mean "keep the source name", not a rename to the text "None".
+        tgt = _mapping_cell_text(row["target"]).strip()
+        if tgt and tgt != src:
+            explicit_mapping[src] = tgt
+    options.mapping = explicit_mapping
+else:
+    # Schema present: pre-fill the editor with inferred pairs (when enabled)
+    # and let the user pick each target from the schema's field names.
+    inferred = (
+        infer_mapping(df, schema, threshold=options.fuzzy_threshold)
+        if options.auto_infer else {}
+    )
+    target_options = ["(unmapped)"] + schema.field_names()
+    map_initial = pd.DataFrame({
+        "source": list(df.columns),
+        "target": [inferred.get(c, "(unmapped)") for c in df.columns],
+        "auto": [c in inferred for c in df.columns],
+    })
+    map_edited = st.data_editor(
+        map_initial,
+        use_container_width=True,
+        column_config={
+            "source": st.column_config.TextColumn("Source", disabled=True),
+            "target": st.column_config.SelectboxColumn(
+                "Target", options=target_options,
+            ),
+            "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
+        },
+        hide_index=True,
+        key="colmap_schema_mapping_editor",
+    )
+    explicit_mapping = {}
+    for _, row in map_edited.iterrows():
+        src = str(row["source"])
+        # An emptied selection is treated like "(unmapped)". The text is not
+        # stripped here so it still matches the schema field name exactly.
+        tgt = _mapping_cell_text(row["target"])
+        if tgt and tgt != "(unmapped)":
+            explicit_mapping[src] = tgt
+    options.mapping = explicit_mapping
+    # Disable auto-infer for the actual run since the editor already shows
+    # the user's resolved choices (they can manually re-select to add).
+    options.auto_infer = False

 # ---------------------------------------------------------------------------
-# Footer
+# Run
 # ---------------------------------------------------------------------------

 st.divider()
-st.caption(
-    "Runs locally. Your data never leaves this computer. "
-    "| DataTools v3.0"
+
+# Run the mapping only on an explicit click. The outcome is parked in
+# session state and read back from there just below.
+apply_clicked = st.button(
+    "Apply Column Mapping", type="primary", use_container_width=True,
+)
+if apply_clicked:
+    with st.spinner("Mapping..."):
+        try:
+            fresh_result = map_columns(df, options)
+        except (ValueError, OSError) as map_error:
+            from src.core.errors import format_for_user
+
+            st.error(format_for_user(map_error))
+            st.stop()
+    st.session_state["colmap_result"] = fresh_result
+    st.session_state["colmap_input_name"] = uploaded.name
+    st.session_state["colmap_options"] = options.to_dict()
+
+# Nothing stored yet means the mapping has not been applied: prompt and halt.
+result = st.session_state.get("colmap_result")
+if result is None:
+    st.info("Configure a mapping and click **Apply Column Mapping** to run.")
+    st.stop()
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+st.subheader("Results")
+
+# Headline counters for the run, one per column.
+m1, m2, m3, m4 = st.columns(4)
+m1.metric("Renamed", result.columns_renamed)
+m2.metric("Dropped", len(result.columns_dropped))
+m3.metric("Added", len(result.columns_added))
+# coercion_failures is read as a column -> count mapping; the metric shows
+# the total across columns, or 0 when it is empty.
+m4.metric(
+    "Coerce fails",
+    sum(result.coercion_failures.values()) if result.coercion_failures else 0,
 )
+
+# Detail notices, each shown only when there is something to report.
+if result.columns_dropped:
+    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
+if result.columns_added:
+    st.info(f"Added (with defaults): {', '.join(result.columns_added)}")
+if result.coercion_failures:
+    st.warning(
+        "Some cells could not be coerced and were left as NaN: "
+        + ", ".join(f"{c} ({n})" for c, n in result.coercion_failures.items())
+    )
+
+# Source -> target table; "auto" flags sources present in inferred_pairs.
+if result.mapping:
+    st.markdown("**Resolved mapping**")
+    map_df = pd.DataFrame(
+        [
+            {"source": s, "target": t, "auto": s in result.inferred_pairs}
+            for s, t in result.mapping.items()
+        ],
+    )
+    st.dataframe(map_df, use_container_width=True, hide_index=True)
+
+st.markdown("**Mapped preview (first 10 rows)**")
+st.dataframe(result.mapped_df.head(10), use_container_width=True)
+
+# ---------------------------------------------------------------------------
+# Downloads
+# ---------------------------------------------------------------------------
+
+st.divider()
+# Output file names are derived from the input name stored alongside the
+# result, falling back to "input" when none is stored.
+stored_input_name = st.session_state.get("colmap_input_name", "input")
+stem = Path(stored_input_name).stem
+
+csv_col, audit_col, config_col = st.columns(3)
+
+# Mapped data as CSV, encoded as utf-8-sig.
+with csv_col:
+    mapped_csv_text = result.mapped_df.to_csv(index=False)
+    st.download_button(
+        "Download mapped CSV",
+        data=mapped_csv_text.encode("utf-8-sig"),
+        file_name=f"{stem}_mapped.csv",
+        mime="text/csv",
+    )
+
+# Audit trail: selected result attributes serialised as JSON, in this order.
+with audit_col:
+    audit_attributes = (
+        "mapping",
+        "inferred_pairs",
+        "columns_renamed",
+        "columns_dropped",
+        "columns_added",
+        "coercion_failures",
+        "unmapped_kept",
+        "missing_required_targets",
+    )
+    audit_payload = {attr: getattr(result, attr) for attr in audit_attributes}
+    audit_json = json.dumps(audit_payload, indent=2, default=str)
+    st.download_button(
+        "Download mapping audit",
+        data=audit_json.encode("utf-8"),
+        file_name=f"{stem}_mapping.json",
+        mime="application/json",
+    )
+
+# The options dict saved when the mapping was applied (empty if none saved).
+with config_col:
+    saved_options = st.session_state.get("colmap_options", {})
+    config_json = json.dumps(saved_options, indent=2, default=str)
+    st.download_button(
+        "Download config JSON",
+        data=config_json.encode("utf-8"),
+        file_name="column_map_config.json",
+        mime="application/json",
+    )
+
+st.divider()
+st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")