datatools-dev/src/gui/pages/5_Column_Mapper.py

"""DataTools Map Columns — Streamlit page."""

from __future__ import annotations

import io
import json
import sys
from pathlib import Path

import pandas as pd
import streamlit as st

_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.gui.components import (
    back_to_home_link,
    render_sticky_footer,
    render_tool_header,
    hide_streamlit_chrome,
    html_download_button,
    pickup_or_upload,
    require_feature_or_render_upgrade,
)
from src.core.column_mapper import (
    MapOptions,
    PRESETS,
    TargetField,
    TargetSchema,
    infer_mapping,
    map_columns,
)
from src.license import FeatureFlag

# Page bootstrap. These helpers are project-local (src.gui.components);
# they are called in this order on every script run, before any tool UI.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()
# Imported here rather than with the top-of-file imports; the page-open
# event is recorded before the feature gate below runs.
from src.audit import log_page_open
log_page_open("5_Column_Mapper")
# NOTE(review): presumably stops the page or renders an upgrade prompt when
# the COLUMN_MAPPER feature is not licensed — confirm in src.gui.components.
require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

render_tool_header("05_column_mapper")


# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

# The accepted extensions match the suffixes ``_read_uploaded`` branches on
# (.xlsx/.xls -> Excel, .tsv -> tab-separated, anything else -> comma).
uploaded = pickup_or_upload(
    label="Import CSV or Excel file",
    key="colmap_file_upload",
    types=["csv", "tsv", "xlsx", "xls"],
)

# Nothing to map yet: show a hint and end this script run here so none of
# the sections below execute without a file.
if uploaded is None:
    st.info("Import a CSV, TSV, or Excel file to begin.")
    st.stop()


@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse an uploaded file's bytes into a DataFrame.

    The format is chosen from the suffix of *name*: ``.xlsx`` / ``.xls`` are
    read with ``pd.read_excel``; everything else is read as delimited text,
    tab-separated for ``.tsv`` and comma-separated otherwise.

    Text input is decoded by trying ``utf-8``, then ``utf-8-sig``, and
    finally ``latin-1``. ``latin-1`` maps every byte value to a character,
    so it cannot raise ``UnicodeDecodeError`` and serves as the terminal
    fallback; it uses the same separator and bad-line policy as the UTF-8
    attempts.

    Args:
        name: Original file name; only its suffix is inspected.
        data: Raw file contents.

    Returns:
        The parsed DataFrame.

    Raises:
        Any pandas/reader error other than a ``UnicodeDecodeError`` from
        the UTF-8 attempts propagates to the caller.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data))

    # Loop-invariant: the delimiter depends only on the file suffix.
    sep = "\t" if suffix == ".tsv" else ","
    for encoding in ("utf-8", "utf-8-sig"):
        try:
            # A fresh buffer per attempt means a retry never depends on the
            # state a failed parse left the previous handle in.
            return pd.read_csv(
                io.BytesIO(data), encoding=encoding, sep=sep, on_bad_lines="warn",
            )
        except UnicodeDecodeError:
            continue
    return pd.read_csv(
        io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn",
    )


# Parse the upload; any failure is reported inline and ends this script run.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as exc:
    from src.core.errors import format_for_user

    _read_error = (
        f"**Could not read `{uploaded.name}`**\n\n"
        f"```\n{format_for_user(exc)}\n```"
    )
    st.error(_read_error)
    st.stop()

# Once a mapping result is stored in session state, the input preview is
# rendered collapsed so the Results section further down takes visual
# priority. The expander can still be re-opened to inspect the source rows.
_has_result = st.session_state.get("colmap_result") is not None

_preview_title = f"Preview: {uploaded.name}"
with st.expander(_preview_title, expanded=not _has_result):
    _n_rows = len(df)
    _n_cols = len(df.columns)
    st.caption(f"{_n_rows} rows, {_n_cols} columns")
    st.dataframe(df.head(10), width="stretch")
st.divider()

# ---------------------------------------------------------------------------
# Editor-cell helpers
# ---------------------------------------------------------------------------


def _is_blank_cell(value: object) -> bool:
    """Return True when a data-editor cell holds no value (None / NaN / NA).

    Rows added in a dynamic editor and cells the user cleared come back as
    missing values rather than empty strings, so they must be detected
    before any ``str(...)`` conversion turns them into the text "None" or
    "nan".
    """
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar contents (lists, arrays) have no single truth value
        # under ``pd.isna``; they are not a blank cell.
        return False


def _cell_text(value: object, fallback: str = "") -> str:
    """Return *value* as text, or *fallback* when the cell is blank."""
    return fallback if _is_blank_cell(value) else str(value)


# ---------------------------------------------------------------------------
# Options (Target schema + Strategy + Mapping)
# ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Apply Column Mapping. The Mapping editor is the heart of the
# tool, but per the Text Cleaner pattern we still collapse everything
# post-run — the user can re-expand to tweak any of the three sections.

with st.expander("Options", expanded=not _has_result):
    # -----------------------------------------------------------------------
    # Schema input
    # -----------------------------------------------------------------------

    st.subheader("Target schema")

    schema_mode = st.radio(
        "How would you like to define the target schema?",
        [
            "Build interactively (start from current columns)",
            "Import schema JSON",
            "Skip (rename / coerce only — no schema)",
        ],
        index=0,
        help=(
            "An interactive build is fastest for one-off cleanup. Import a JSON "
            "when you have a fixed contract (a CRM import format, db schema). "
            "Skip when you only want to rename or coerce specific columns."
        ),
    )

    # Stays None for "Skip", for "Import" until a valid file is parsed, and
    # for "Build" when the editor contains no named rows.
    schema: TargetSchema | None = None

    if schema_mode.startswith("Import"):
        schema_file = st.file_uploader(
            "Schema JSON",
            type=["json"],
            key="colmap_schema_upload",
            help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
        )
        if schema_file is not None:
            try:
                schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
                st.success(f"Loaded {len(schema.fields)} target field(s).")
            except Exception as e:
                from src.core.errors import format_for_user
                st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")

    elif schema_mode.startswith("Build"):
        st.caption(
            "Edit the table to define your target schema. Add rows for fields the "
            "input doesn't have yet (with a default), or remove rows for columns "
            "you want to drop."
        )
        # Seed the editor with one row per input column.
        n_source_cols = len(df.columns)
        initial = pd.DataFrame({
            "name": list(df.columns),
            "dtype": ["auto"] * n_source_cols,
            "required": [False] * n_source_cols,
            "default": [""] * n_source_cols,
            "aliases": [""] * n_source_cols,
        })
        edited = st.data_editor(
            initial,
            width="stretch",
            num_rows="dynamic",
            column_config={
                "name": st.column_config.TextColumn("Target name"),
                "dtype": st.column_config.SelectboxColumn(
                    "Type",
                    options=[
                        "auto", "string", "integer", "float",
                        "boolean", "date", "datetime", "category",
                    ],
                ),
                "required": st.column_config.CheckboxColumn("Required"),
                "default": st.column_config.TextColumn("Default (for added cols)"),
                "aliases": st.column_config.TextColumn(
                    "Aliases (comma-sep, helps fuzzy-match)",
                ),
            },
            key="colmap_schema_editor",
        )
        fields: list[TargetField] = []
        for _, row in edited.iterrows():
            # Blank names (including the missing value of a freshly added
            # row) mean "no field here"; they must not become a field
            # literally called "None".
            name = _cell_text(row.get("name")).strip()
            if not name:
                continue
            aliases = [
                a.strip() for a in _cell_text(row.get("aliases")).split(",")
                if a.strip()
            ]
            # Missing or empty default -> no default.
            default_raw = row.get("default")
            default_is_empty = _is_blank_cell(default_raw) or (
                isinstance(default_raw, str) and default_raw == ""
            )
            default_val = None if default_is_empty else default_raw
            # A blank Type cell falls back to "auto" (the value every seeded
            # row starts with) rather than the text "None".
            dtype = _cell_text(row.get("dtype"), "auto") or "auto"
            # bool(NaN) is True, so a missing checkbox cell must be mapped
            # to False explicitly.
            required_raw = row.get("required", False)
            required = False if _is_blank_cell(required_raw) else bool(required_raw)
            fields.append(TargetField(
                name=name,
                dtype=dtype,  # type: ignore[arg-type]
                required=required,
                aliases=aliases,
                default=default_val,
            ))
        if fields:
            schema = TargetSchema(fields=fields)

    st.divider()

    # -----------------------------------------------------------------------
    # Strategy
    # -----------------------------------------------------------------------

    st.subheader("Strategy")

    preset_label = st.radio(
        "Preset",
        [
            "rename-only (just rename, leave types alone, keep extras)",
            "lenient-schema (rename + coerce + reorder, keep extras)",
            "strict-schema (rename + coerce + reorder, drop extras)",
        ],
        index=0,
    )
    # The preset key is the first space-delimited token of the radio label.
    preset_key = preset_label.split(" ", 1)[0]
    options = MapOptions.from_preset(preset_key)
    options.schema = schema

    # Each widget is seeded from the preset's value and writes the user's
    # choice straight back onto ``options``.
    with st.expander("Advanced options"):
        col_a, col_b = st.columns(2)
        with col_a:
            unmapped_choices = ["keep", "drop", "error"]
            options.unmapped = st.selectbox(  # type: ignore[assignment]
                "Unmapped source columns",
                unmapped_choices,
                index=unmapped_choices.index(options.unmapped),
            )
            options.coerce_types = st.checkbox(
                "Coerce types per schema", value=options.coerce_types,
            )
            options.reorder_to_schema = st.checkbox(
                "Reorder to schema order", value=options.reorder_to_schema,
            )
        with col_b:
            options.auto_infer = st.checkbox(
                "Auto-infer mapping (fuzzy match)", value=options.auto_infer,
            )
            options.fuzzy_threshold = st.slider(
                "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
            )
            options.enforce_required = st.checkbox(
                "Enforce required fields", value=options.enforce_required,
            )

    # -----------------------------------------------------------------------
    # Mapping editor — show inferred and let user override
    # -----------------------------------------------------------------------

    st.subheader("Mapping")

    if schema is None:
        st.caption(
            "No schema — define explicit renames below (left blank means keep "
            "the source name)."
        )
        rename_initial = pd.DataFrame({
            "source": list(df.columns),
            "target": list(df.columns),
        })
        rename_edited = st.data_editor(
            rename_initial,
            width="stretch",
            column_config={
                "source": st.column_config.TextColumn("Source", disabled=True),
                "target": st.column_config.TextColumn("Target"),
            },
            hide_index=True,
            key="colmap_rename_only_editor",
        )
        explicit_mapping: dict[str, str] = {}
        for _, row in rename_edited.iterrows():
            src = str(row["source"])
            # A cleared target cell is a missing value, not "": treat it as
            # blank so the source name is kept, as the caption promises.
            tgt = _cell_text(row["target"]).strip()
            if tgt and tgt != src:
                explicit_mapping[src] = tgt
        options.mapping = explicit_mapping
    else:
        unmapped_label = "(unmapped)"
        inferred = (
            infer_mapping(df, schema, threshold=options.fuzzy_threshold)
            if options.auto_infer else {}
        )
        target_options = [unmapped_label] + schema.field_names()
        map_initial = pd.DataFrame({
            "source": list(df.columns),
            "target": [inferred.get(c, unmapped_label) for c in df.columns],
            "auto": [c in inferred for c in df.columns],
        })
        map_edited = st.data_editor(
            map_initial,
            width="stretch",
            column_config={
                "source": st.column_config.TextColumn("Source", disabled=True),
                "target": st.column_config.SelectboxColumn(
                    "Target", options=target_options,
                ),
                "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
            },
            hide_index=True,
            key="colmap_schema_mapping_editor",
        )
        explicit_mapping = {}
        for _, row in map_edited.iterrows():
            src = str(row["source"])
            # A cleared selection counts as "(unmapped)" instead of being
            # mapped to a target named "None". Not stripped: the value must
            # match a schema field name exactly.
            tgt = _cell_text(row["target"], unmapped_label)
            if tgt and tgt != unmapped_label:
                explicit_mapping[src] = tgt
        options.mapping = explicit_mapping
        # Disable auto-infer for the actual run since the editor already shows
        # the user's resolved choices (they can manually re-select to add).
        options.auto_infer = False

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

st.divider()

# The button is True only on the script pass triggered by the click itself.
if st.button("Apply Column Mapping", type="primary", width="stretch"):
    with st.spinner("Mapping..."):
        try:
            result = map_columns(df, options)
        except (ValueError, OSError) as e:
            # Only ValueError / OSError are reported inline; any other
            # exception type propagates out of this block.
            from src.core.errors import format_for_user
            st.error(format_for_user(e))
            st.stop()
    # Persist the result plus the inputs that produced it. The download
    # section reads ``colmap_input_name`` and ``colmap_options`` back from
    # session state.
    st.session_state["colmap_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Map Columns run", page="5_Column_Mapper")
    st.session_state["colmap_input_name"] = uploaded.name
    st.session_state["colmap_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
    st.session_state["_colmap_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    st.rerun()

# Always read the result back from session state, so it is still shown on
# script passes where the button was not clicked.
result = st.session_state.get("colmap_result")
if result is None:
    st.info("Configure a mapping and click **Apply Column Mapping** to run.")
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

# Anchor target for the auto-scroll snippet at the end of this block.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. Placed before the subheader so the scrolled-to viewport
# starts a few pixels above the section heading rather than below it.
st.markdown(
    '<div id="colmap-results-anchor" style="height:1px"></div>',
    unsafe_allow_html=True,
)

st.subheader("Results")

# Four headline counters, laid out left to right in the listed order.
_coercion_failures = result.coercion_failures
_summary_metrics = (
    ("Renamed", result.columns_renamed),
    ("Dropped", len(result.columns_dropped)),
    ("Added", len(result.columns_added)),
    (
        "Coerce fails",
        sum(_coercion_failures.values()) if _coercion_failures else 0,
    ),
)
for _metric_col, (_metric_label, _metric_value) in zip(
    st.columns(4), _summary_metrics
):
    _metric_col.metric(_metric_label, _metric_value)

# Detail banners: each appears only when its list / dict is non-empty.
if result.columns_dropped:
    _dropped_names = ", ".join(result.columns_dropped)
    st.warning(f"Dropped columns: {_dropped_names}")
if result.columns_added:
    _added_names = ", ".join(result.columns_added)
    st.info(f"Added (with defaults): {_added_names}")
if _coercion_failures:
    _failure_parts = [
        f"{col} ({count})" for col, count in _coercion_failures.items()
    ]
    st.warning(
        "Some cells could not be coerced and were left as NaN: "
        + ", ".join(_failure_parts)
    )

# Source -> target table; "auto" marks sources present in inferred_pairs.
if result.mapping:
    st.markdown("**Resolved mapping**")
    _mapping_rows = []
    for _src, _tgt in result.mapping.items():
        _mapping_rows.append(
            {"source": _src, "target": _tgt, "auto": _src in result.inferred_pairs}
        )
    st.dataframe(pd.DataFrame(_mapping_rows), width="stretch", hide_index=True)

st.markdown("**Mapped preview (first 10 rows)**")
st.dataframe(result.mapped_df.head(10), width="stretch")

# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
#
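# All three byte buffers are prepared up front (outside the columns) so
# each download button sees stable ``data`` across reruns. This note was
# written for ``st.download_button`` with an explicit ``key`` — without
# those, Streamlit auto-derived widget IDs can collide for multiple
# download_buttons in adjacent columns and only the first one actually
# fires on click. NOTE(review): the buttons below are rendered with
# ``html_download_button`` and pass no ``key``; confirm whether the key
# rationale still applies.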

st.divider()
_input_name = st.session_state.get("colmap_input_name", "input")
stem = Path(_input_name).stem

# Mapped data as CSV.
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")

# Audit JSON: each key is the name of the result attribute it is read from,
# emitted in this order.
_audit_keys = (
    "mapping",
    "inferred_pairs",
    "columns_renamed",
    "columns_dropped",
    "columns_added",
    "coercion_failures",
    "unmapped_kept",
    "missing_required_targets",
)
_audit_payload = {key: getattr(result, key) for key in _audit_keys}
audit_bytes = json.dumps(_audit_payload, indent=2, default=str).encode("utf-8")

# Options snapshot stored in session state when the mapping was applied.
_saved_options = st.session_state.get("colmap_options", {})
config_bytes = json.dumps(_saved_options, indent=2, default=str).encode("utf-8")

# The audit download is disabled (with an explanatory tooltip) when the
# result carries no mapping.
_no_mapping = not result.mapping
_audit_help = "No mapping was applied." if _no_mapping else None

dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
    html_download_button(
        "Download mapped CSV",
        mapped_bytes,
        file_name=f"{stem}_mapped.csv",
        mime="text/csv",
    )
with dl_b:
    html_download_button(
        "Download mapping audit",
        audit_bytes,
        file_name=f"{stem}_mapping.json",
        mime="application/json",
        disabled=_no_mapping,
        help=_audit_help,
    )
with dl_c:
    html_download_button(
        "Download config JSON",
        config_bytes,
        file_name="column_map_config.json",
        mime="application/json",
    )


# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Apply Column Mapping, the preview + options
# collapse but Streamlit by itself doesn't scroll — the Results section
# is at the bottom of a tall script so the user has to find it. Inject
# a tiny component-html iframe that calls ``scrollIntoView`` on the
# parent's Results anchor. Streamlit's main page is same-origin with
# component iframes so ``window.parent.document`` access is allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport back
# to the top of Results.
# ``pop`` with a False default reads the flag and clears it in one step, so
# the snippet below is emitted only on a pass where the flag was set.
if st.session_state.pop("_colmap_scroll_to_results", False):
    # The markup is passed as-is. The ``if (target)`` guard inside the
    # script makes it a no-op when the anchor div is not found in the
    # parent document.
    # NOTE(review): confirm ``st.iframe`` accepts an inline HTML string in
    # the Streamlit version this app is pinned to.
    st.iframe(
        """
        <script>
          const doc = window.parent.document;
          const target = doc.getElementById('colmap-results-anchor');
          if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
        </script>
        """,
        height=1,
    )