"""DataTools Map Columns — Streamlit page.""" from __future__ import annotations import io import json import sys from pathlib import Path import pandas as pd import streamlit as st _project_root = Path(__file__).resolve().parent.parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.gui.components import ( back_to_home_link, render_sticky_footer, render_tool_header, hide_streamlit_chrome, html_download_button, pickup_or_upload, require_feature_or_render_upgrade, ) from src.core.column_mapper import ( MapOptions, PRESETS, TargetField, TargetSchema, infer_mapping, map_columns, ) from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() back_to_home_link() from src.audit import log_page_open log_page_open("5_Column_Mapper") require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER) # --------------------------------------------------------------------------- # Header # --------------------------------------------------------------------------- render_tool_header("05_column_mapper") # --------------------------------------------------------------------------- # File upload # --------------------------------------------------------------------------- uploaded = pickup_or_upload( label="Import CSV or Excel file", key="colmap_file_upload", types=["csv", "tsv", "xlsx", "xls"], ) if uploaded is None: st.info("Import a CSV, TSV, or Excel file to begin.") st.stop() @st.cache_data(show_spinner=False) def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: suffix = Path(name).suffix.lower() bio = io.BytesIO(data) if suffix in (".xlsx", ".xls"): return pd.read_excel(bio) for enc in ("utf-8", "utf-8-sig", "latin-1"): try: bio.seek(0) sep = "\t" if suffix == ".tsv" else "," return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn") except UnicodeDecodeError: continue bio.seek(0) return pd.read_csv(bio, encoding="latin-1") try: df = _read_uploaded(uploaded.name, uploaded.getvalue()) except Exception as e: from src.core.errors import format_for_user st.error( f"**Could not read `{uploaded.name}`**\n\n" f"```\n{format_for_user(e)}\n```" ) st.stop() # Collapse the input preview once the user has clicked Apply Column # Mapping so the Results section below is the primary visual focus. # The user can re-expand the expander to re-inspect the source rows. _has_result = st.session_state.get("colmap_result") is not None with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), width="stretch") st.divider() # --------------------------------------------------------------------------- # Options (Target schema + Strategy + Mapping) # --------------------------------------------------------------------------- # # Wrapped in an outer expander whose default state mirrors the preview # expander above: open before a result exists, folded once the user has # clicked Apply Column Mapping. The Mapping editor is the heart of the # tool, but per the Text Cleaner pattern we still collapse everything # post-run — the user can re-expand to tweak any of the three sections. with st.expander("Options", expanded=not _has_result): # ----------------------------------------------------------------------- # Schema input # ----------------------------------------------------------------------- st.subheader("Target schema") schema_mode = st.radio( "How would you like to define the target schema?", [ "Build interactively (start from current columns)", "Import schema JSON", "Skip (rename / coerce only — no schema)", ], index=0, help=( "An interactive build is fastest for one-off cleanup. Import a JSON " "when you have a fixed contract (a CRM import format, db schema). " "Skip when you only want to rename or coerce specific columns." ), ) schema: TargetSchema | None = None if schema_mode.startswith("Import"): schema_file = st.file_uploader( "Schema JSON", type=["json"], key="colmap_schema_upload", help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}', ) if schema_file is not None: try: schema = TargetSchema.from_dict(json.loads(schema_file.getvalue())) st.success(f"Loaded {len(schema.fields)} target field(s).") except Exception as e: from src.core.errors import format_for_user st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```") elif schema_mode.startswith("Build"): st.caption( "Edit the table to define your target schema. Add rows for fields the " "input doesn't have yet (with a default), or remove rows for columns " "you want to drop." ) initial = pd.DataFrame({ "name": list(df.columns), "dtype": ["auto"] * len(df.columns), "required": [False] * len(df.columns), "default": [""] * len(df.columns), "aliases": [""] * len(df.columns), }) edited = st.data_editor( initial, width="stretch", num_rows="dynamic", column_config={ "name": st.column_config.TextColumn("Target name"), "dtype": st.column_config.SelectboxColumn( "Type", options=[ "auto", "string", "integer", "float", "boolean", "date", "datetime", "category", ], ), "required": st.column_config.CheckboxColumn("Required"), "default": st.column_config.TextColumn("Default (for added cols)"), "aliases": st.column_config.TextColumn( "Aliases (comma-sep, helps fuzzy-match)", ), }, key="colmap_schema_editor", ) fields: list[TargetField] = [] for _, row in edited.iterrows(): name = str(row.get("name", "")).strip() if not name: continue aliases = [ a.strip() for a in str(row.get("aliases", "") or "").split(",") if a.strip() ] default_raw = row.get("default") default_val = ( default_raw if (default_raw not in (None, "", float("nan"))) else None ) try: if isinstance(default_val, float) and pd.isna(default_val): default_val = None except TypeError: pass fields.append(TargetField( name=name, dtype=str(row.get("dtype", "auto")), # type: ignore[arg-type] required=bool(row.get("required", False)), aliases=aliases, default=default_val, )) if fields: schema = TargetSchema(fields=fields) st.divider() # ----------------------------------------------------------------------- # Strategy # ----------------------------------------------------------------------- st.subheader("Strategy") preset_label = st.radio( "Preset", [ "rename-only (just rename, leave types alone, keep extras)", "lenient-schema (rename + coerce + reorder, keep extras)", "strict-schema (rename + coerce + reorder, drop extras)", ], index=0, ) preset_key = preset_label.split(" ", 1)[0] options = MapOptions.from_preset(preset_key) options.schema = schema with st.expander("Advanced options"): col_a, col_b = st.columns(2) with col_a: options.unmapped = st.selectbox( # type: ignore[assignment] "Unmapped source columns", ["keep", "drop", "error"], index=["keep", "drop", "error"].index(options.unmapped), ) options.coerce_types = st.checkbox( "Coerce types per schema", value=options.coerce_types, ) options.reorder_to_schema = st.checkbox( "Reorder to schema order", value=options.reorder_to_schema, ) with col_b: options.auto_infer = st.checkbox( "Auto-infer mapping (fuzzy match)", value=options.auto_infer, ) options.fuzzy_threshold = st.slider( "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05, ) options.enforce_required = st.checkbox( "Enforce required fields", value=options.enforce_required, ) # ----------------------------------------------------------------------- # Mapping editor — show inferred and let user override # ----------------------------------------------------------------------- st.subheader("Mapping") if schema is None: st.caption( "No schema — define explicit renames below (left blank means keep " "the source name)." ) rename_initial = pd.DataFrame({ "source": list(df.columns), "target": list(df.columns), }) rename_edited = st.data_editor( rename_initial, width="stretch", column_config={ "source": st.column_config.TextColumn("Source", disabled=True), "target": st.column_config.TextColumn("Target"), }, hide_index=True, key="colmap_rename_only_editor", ) explicit_mapping: dict[str, str] = {} for _, row in rename_edited.iterrows(): src = str(row["source"]) tgt = str(row["target"]).strip() if tgt and tgt != src: explicit_mapping[src] = tgt options.mapping = explicit_mapping else: inferred = ( infer_mapping(df, schema, threshold=options.fuzzy_threshold) if options.auto_infer else {} ) target_options = ["(unmapped)"] + schema.field_names() map_initial = pd.DataFrame({ "source": list(df.columns), "target": [inferred.get(c, "(unmapped)") for c in df.columns], "auto": [c in inferred for c in df.columns], }) map_edited = st.data_editor( map_initial, width="stretch", column_config={ "source": st.column_config.TextColumn("Source", disabled=True), "target": st.column_config.SelectboxColumn( "Target", options=target_options, ), "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True), }, hide_index=True, key="colmap_schema_mapping_editor", ) explicit_mapping = {} for _, row in map_edited.iterrows(): src = str(row["source"]) tgt = str(row["target"]) if tgt and tgt != "(unmapped)": explicit_mapping[src] = tgt options.mapping = explicit_mapping # Disable auto-infer for the actual run since the editor already shows # the user's resolved choices (they can manually re-select to add). options.auto_infer = False # --------------------------------------------------------------------------- # Run # --------------------------------------------------------------------------- st.divider() if st.button("Apply Column Mapping", type="primary", width="stretch"): with st.spinner("Mapping..."): try: result = map_columns(df, options) except (ValueError, OSError) as e: from src.core.errors import format_for_user st.error(format_for_user(e)) st.stop() st.session_state["colmap_result"] = result from src.audit import log_event log_event("tool_run", "Map Columns run", page="5_Column_Mapper") st.session_state["colmap_input_name"] = uploaded.name st.session_state["colmap_options"] = options.to_dict() # One-shot flag picked up on the next pass to scroll the parent # document to the Results anchor (see scroll snippet below). st.session_state["_colmap_scroll_to_results"] = True # Force a second rerun so the preview and options expanders see # the new result on the NEXT script pass and collapse themselves. st.rerun() result = st.session_state.get("colmap_result") if result is None: st.info("Configure a mapping and click **Apply Column Mapping** to run.") st.stop() # --------------------------------------------------------------------------- # Results # --------------------------------------------------------------------------- # Anchor target for the auto-scroll snippet at the end of this block. # A bare ``