- Drop unused 'from src.i18n import t' from pages 1-9 (the swap to render_tool_header(tool_id) means no page calls t() directly anymore). Pages 10, 11 and the underscore-prefixed pages were already clean or legitimately use t().
- Rewrite PDF Extractor help_md (en + es). The original prose described features the tool does NOT have — template drawing, per-source saved templates, automatic reuse. The actual tool is a heuristic batch scanner (per its own docstring: "No templates, no per-bank configuration"). New copy: scan → uncheck → pick date format → enable OCR if needed → download. Spanish version tagged with '<!-- TODO: review Spanish -->' since the prose is best-effort.
- Document why both stSidebarNavSectionHeader (legacy, streamlit~=1.35) and stNavSectionHeader (current, 1.57) testids appear in the chrome CSS — requirements floor is streamlit>=1.35,<2 so dropping the legacy selector would silently break the lower bound.
- Pin the t()-returns-key-on-miss contract that render_tool_header's fallback path depends on, with a comment at the call site.
- Pin the demo's intentional skip of hide_streamlit_chrome (so the +/- sidebar swap JS doesn't ever try to load there) with a load-bearing comment in app_demo.py.
- Confirmed i18n parity: every tool id has page_title / page_caption / description / name / help_md in BOTH packs; help.button_label and help.missing_body in both.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
488 lines
17 KiB
Python
488 lines
17 KiB
Python
"""DataTools Map Columns — Streamlit page."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import streamlit as st
|
|
|
|
# Put the directory four levels above this file at the front of sys.path
# (presumably the repository root, so the ``src.*`` imports that follow
# resolve — confirm against the project layout).
_project_root = Path(__file__).resolve().parent.parent.parent.parent
_project_root_str = str(_project_root)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)
|
|
|
|
from src.gui.components import (
|
|
back_to_home_link,
|
|
render_sticky_footer,
|
|
render_tool_header,
|
|
hide_streamlit_chrome,
|
|
html_download_button,
|
|
pickup_or_upload,
|
|
require_feature_or_render_upgrade,
|
|
)
|
|
from src.core.column_mapper import (
|
|
MapOptions,
|
|
PRESETS,
|
|
TargetField,
|
|
TargetSchema,
|
|
infer_mapping,
|
|
map_columns,
|
|
)
|
|
from src.license import FeatureFlag
|
|
|
|
# Shared page furniture. These calls render in script order, so the sequence
# is kept exactly as written.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()

from src.audit import log_page_open

log_page_open("5_Column_Mapper")
require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

render_tool_header("05_column_mapper")


# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------

# Extensions offered by the upload widget.
accepted_types = ["csv", "tsv", "xlsx", "xls"]

uploaded = pickup_or_upload(
    label="Import CSV or Excel file",
    key="colmap_file_upload",
    types=accepted_types,
)

# Everything below needs an input file; halt this script pass until one exists.
if uploaded is None:
    st.info("Import a CSV, TSV, or Excel file to begin.")
    st.stop()
|
|
|
|
|
|
@st.cache_data(show_spinner=False)
|
|
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Parse the raw bytes of an uploaded file into a DataFrame.

    Args:
        name: Original file name. Only its extension is inspected
            (case-insensitively) to pick the parser and delimiter.
        data: Complete contents of the file.

    Returns:
        The parsed table. ``.xlsx`` / ``.xls`` are read with
        ``pd.read_excel``; every other extension is read as delimited text,
        tab-separated for ``.tsv`` and comma-separated otherwise.

    Raises:
        Any exception pandas raises for content it cannot parse, other than
        the ``UnicodeDecodeError`` from the first decoding attempt, which
        triggers the Latin-1 retry.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data))

    # The delimiter depends only on the extension, so resolve it once.
    sep = "\t" if suffix == ".tsv" else ","

    # "utf-8-sig" decodes plain UTF-8 as well as BOM-prefixed UTF-8, so a
    # separate "utf-8" attempt adds nothing. Each attempt gets a fresh buffer
    # rather than rewinding one a failed parse has already touched.
    try:
        return pd.read_csv(
            io.BytesIO(data), encoding="utf-8-sig", sep=sep, on_bad_lines="warn",
        )
    except UnicodeDecodeError:
        pass

    # Latin-1 maps every byte to a code point, so this attempt cannot fail on
    # decoding and no further fallback is needed. It uses the same delimiter
    # and bad-line policy as the first attempt.
    return pd.read_csv(
        io.BytesIO(data), encoding="latin-1", sep=sep, on_bad_lines="warn",
    )
|
|
|
|
|
|
# Parse the upload; on any failure show the formatted error and halt the page.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as exc:
    from src.core.errors import format_for_user

    error_detail = format_for_user(exc)
    st.error(
        f"**Could not read `{uploaded.name}`**\n\n"
        f"```\n{error_detail}\n```"
    )
    st.stop()

# Once a mapping result is stored in session state, the input preview starts
# collapsed so the Results section is the primary visual focus. The user can
# still re-open the expander to inspect the source rows.
_has_result = st.session_state.get("colmap_result") is not None

row_count, column_count = len(df), len(df.columns)
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
    st.caption(f"{row_count} rows, {column_count} columns")
    st.dataframe(df.head(10), width="stretch")

st.divider()
|
|
|
|
# ---------------------------------------------------------------------------
# Options (Target schema + Strategy + Mapping)
# ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Apply Column Mapping. The Mapping editor is the heart of the
# tool, but per the Text Cleaner pattern we still collapse everything
# post-run — the user can re-expand to tweak any of the three sections.


def _is_missing(value: object) -> bool:
    """Return True for the placeholders an empty editor cell can hold.

    Covers ``None`` and anything ``pd.isna`` reports as missing (NaN, NA,
    NaT). Values ``pd.isna`` cannot reduce to a single bool are treated as
    present.
    """
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        return False


def _cell_text(value: object) -> str:
    """Return a cell as stripped text, with missing cells mapped to ``""``.

    Calling ``str()`` directly on an empty cell would yield the literal
    text ``"None"`` or ``"nan"``, which then passes every "is it blank?"
    check further down.
    """
    return "" if _is_missing(value) else str(value).strip()


with st.expander("Options", expanded=not _has_result):
    # -----------------------------------------------------------------------
    # Schema input
    # -----------------------------------------------------------------------

    st.subheader("Target schema")

    schema_mode = st.radio(
        "How would you like to define the target schema?",
        [
            "Build interactively (start from current columns)",
            "Import schema JSON",
            "Skip (rename / coerce only — no schema)",
        ],
        index=0,
        help=(
            "An interactive build is fastest for one-off cleanup. Import a JSON "
            "when you have a fixed contract (a CRM import format, db schema). "
            "Skip when you only want to rename or coerce specific columns."
        ),
    )

    # Stays None in "Skip" mode, or when import/build yields no usable fields.
    schema: TargetSchema | None = None

    if schema_mode.startswith("Import"):
        schema_file = st.file_uploader(
            "Schema JSON",
            type=["json"],
            key="colmap_schema_upload",
            help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
        )
        if schema_file is not None:
            try:
                schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
                st.success(f"Loaded {len(schema.fields)} target field(s).")
            except Exception as e:
                from src.core.errors import format_for_user

                st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")

    elif schema_mode.startswith("Build"):
        st.caption(
            "Edit the table to define your target schema. Add rows for fields the "
            "input doesn't have yet (with a default), or remove rows for columns "
            "you want to drop."
        )
        # Seed the editor with one row per input column.
        initial = pd.DataFrame({
            "name": list(df.columns),
            "dtype": ["auto"] * len(df.columns),
            "required": [False] * len(df.columns),
            "default": [""] * len(df.columns),
            "aliases": [""] * len(df.columns),
        })
        edited = st.data_editor(
            initial,
            width="stretch",
            num_rows="dynamic",
            column_config={
                "name": st.column_config.TextColumn("Target name"),
                "dtype": st.column_config.SelectboxColumn(
                    "Type",
                    options=[
                        "auto", "string", "integer", "float",
                        "boolean", "date", "datetime", "category",
                    ],
                ),
                "required": st.column_config.CheckboxColumn("Required"),
                "default": st.column_config.TextColumn("Default (for added cols)"),
                "aliases": st.column_config.TextColumn(
                    "Aliases (comma-sep, helps fuzzy-match)",
                ),
            },
            key="colmap_schema_editor",
        )
        fields: list[TargetField] = []
        for _, row in edited.iterrows():
            # Rows whose name cell is empty (including unfilled added rows)
            # are skipped instead of becoming a field called "None" / "nan".
            name = _cell_text(row.get("name"))
            if not name:
                continue
            aliases = [
                a.strip() for a in _cell_text(row.get("aliases")).split(",")
                if a.strip()
            ]
            # A missing or empty-string default means "no default"; any other
            # value is passed through untouched.
            default_raw = row.get("default")
            default_val = (
                None if (_is_missing(default_raw) or default_raw == "")
                else default_raw
            )
            # NaN is truthy, so a missing "required" cell is checked
            # explicitly before bool().
            required_raw = row.get("required", False)
            fields.append(TargetField(
                name=name,
                dtype=_cell_text(row.get("dtype")) or "auto",  # type: ignore[arg-type]
                required=False if _is_missing(required_raw) else bool(required_raw),
                aliases=aliases,
                default=default_val,
            ))
        if fields:
            schema = TargetSchema(fields=fields)

    st.divider()

    # -----------------------------------------------------------------------
    # Strategy
    # -----------------------------------------------------------------------

    st.subheader("Strategy")

    preset_label = st.radio(
        "Preset",
        [
            "rename-only (just rename, leave types alone, keep extras)",
            "lenient-schema (rename + coerce + reorder, keep extras)",
            "strict-schema (rename + coerce + reorder, drop extras)",
        ],
        index=0,
    )
    # The preset key is the first word of the label, before the description.
    preset_key = preset_label.split(" ", 1)[0]
    options = MapOptions.from_preset(preset_key)
    options.schema = schema

    with st.expander("Advanced options"):
        # Each widget starts from the preset's value and writes the user's
        # choice back onto ``options``.
        unmapped_choices = ["keep", "drop", "error"]
        col_a, col_b = st.columns(2)
        with col_a:
            options.unmapped = st.selectbox(  # type: ignore[assignment]
                "Unmapped source columns",
                unmapped_choices,
                index=unmapped_choices.index(options.unmapped),
            )
            options.coerce_types = st.checkbox(
                "Coerce types per schema", value=options.coerce_types,
            )
            options.reorder_to_schema = st.checkbox(
                "Reorder to schema order", value=options.reorder_to_schema,
            )
        with col_b:
            options.auto_infer = st.checkbox(
                "Auto-infer mapping (fuzzy match)", value=options.auto_infer,
            )
            options.fuzzy_threshold = st.slider(
                "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
            )
            options.enforce_required = st.checkbox(
                "Enforce required fields", value=options.enforce_required,
            )

    # -----------------------------------------------------------------------
    # Mapping editor — show inferred and let user override
    # -----------------------------------------------------------------------

    st.subheader("Mapping")

    if schema is None:
        st.caption(
            "No schema — define explicit renames below (left blank means keep "
            "the source name)."
        )
        rename_initial = pd.DataFrame({
            "source": list(df.columns),
            "target": list(df.columns),
        })
        rename_edited = st.data_editor(
            rename_initial,
            width="stretch",
            column_config={
                "source": st.column_config.TextColumn("Source", disabled=True),
                "target": st.column_config.TextColumn("Target"),
            },
            hide_index=True,
            key="colmap_rename_only_editor",
        )
        explicit_mapping: dict[str, str] = {}
        for _, row in rename_edited.iterrows():
            src = str(row["source"])
            # A cleared cell is treated as blank ("keep the source name"),
            # not as a rename to the text "None".
            tgt = _cell_text(row["target"])
            if tgt and tgt != src:
                explicit_mapping[src] = tgt
        options.mapping = explicit_mapping
    else:
        inferred = (
            infer_mapping(df, schema, threshold=options.fuzzy_threshold)
            if options.auto_infer else {}
        )
        target_options = ["(unmapped)"] + schema.field_names()
        map_initial = pd.DataFrame({
            "source": list(df.columns),
            "target": [inferred.get(c, "(unmapped)") for c in df.columns],
            "auto": [c in inferred for c in df.columns],
        })
        map_edited = st.data_editor(
            map_initial,
            width="stretch",
            column_config={
                "source": st.column_config.TextColumn("Source", disabled=True),
                "target": st.column_config.SelectboxColumn(
                    "Target", options=target_options,
                ),
                "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
            },
            hide_index=True,
            key="colmap_schema_mapping_editor",
        )
        explicit_mapping = {}
        for _, row in map_edited.iterrows():
            src = str(row["source"])
            # A cleared selection counts as unmapped. The chosen value is not
            # stripped, so it still matches the schema field name exactly.
            target_raw = row["target"]
            tgt = "" if _is_missing(target_raw) else str(target_raw)
            if tgt and tgt != "(unmapped)":
                explicit_mapping[src] = tgt
        options.mapping = explicit_mapping
        # Disable auto-infer for the actual run since the editor already shows
        # the user's resolved choices (they can manually re-select to add).
        options.auto_infer = False
|
|
|
|
# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

st.divider()

if st.button("Apply Column Mapping", type="primary", width="stretch"):
    with st.spinner("Mapping..."):
        try:
            run_output = map_columns(df, options)
        except (ValueError, OSError) as exc:
            from src.core.errors import format_for_user

            st.error(format_for_user(exc))
            st.stop()

    st.session_state["colmap_result"] = run_output

    from src.audit import log_event

    log_event("tool_run", "Map Columns run", page="5_Column_Mapper")
    st.session_state["colmap_input_name"] = uploaded.name
    st.session_state["colmap_options"] = options.to_dict()
    # One-shot flag consumed at the end of the next script pass, where the
    # auto-scroll snippet pops it.
    st.session_state["_colmap_scroll_to_results"] = True
    # Rerun immediately: the preview and options expanders read the stored
    # result when they are created, so they only collapse on the NEXT pass.
    st.rerun()

# Results are always read from session state, not from the click above, so
# they stay available on later script passes.
result = st.session_state.get("colmap_result")
if result is None:
    st.info("Configure a mapping and click **Apply Column Mapping** to run.")
    st.stop()
|
|
|
|
# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

# Scroll target for the auto-scroll snippet at the end of the script. Per the
# original note, a bare ``<div id="...">`` survives Streamlit's HTML
# sanitizer, and at 1px tall it does not visibly shift the layout. It sits
# before the subheader so the viewport lands just above the section heading.
st.markdown(
    '<div id="colmap-results-anchor" style="height:1px"></div>',
    unsafe_allow_html=True,
)

st.subheader("Results")

# Headline counters, one per column, in display order.
failures = result.coercion_failures
headline_metrics = (
    ("Renamed", result.columns_renamed),
    ("Dropped", len(result.columns_dropped)),
    ("Added", len(result.columns_added)),
    ("Coerce fails", sum(failures.values()) if failures else 0),
)
for slot, (metric_label, metric_value) in zip(st.columns(4), headline_metrics):
    slot.metric(metric_label, metric_value)

# Detail notices, shown only for the categories that actually occurred.
if result.columns_dropped:
    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")
if result.columns_added:
    st.info(f"Added (with defaults): {', '.join(result.columns_added)}")
if failures:
    failure_summary = ", ".join(
        f"{column} ({count})" for column, count in failures.items()
    )
    st.warning(
        "Some cells could not be coerced and were left as NaN: "
        + failure_summary
    )

if result.mapping:
    st.markdown("**Resolved mapping**")
    # One row per source -> target pair; "auto" marks pairs listed in
    # ``result.inferred_pairs``.
    mapping_rows = [
        {
            "source": source,
            "target": target,
            "auto": source in result.inferred_pairs,
        }
        for source, target in result.mapping.items()
    ]
    st.dataframe(pd.DataFrame(mapping_rows), width="stretch", hide_index=True)

st.markdown("**Mapped preview (first 10 rows)**")
st.dataframe(result.mapped_df.head(10), width="stretch")
|
|
|
|
# ---------------------------------------------------------------------------
# Downloads
# ---------------------------------------------------------------------------
#
# All three byte payloads are built up front, outside the column contexts,
# so each button is handed ready-made data.

st.divider()
stem = Path(st.session_state.get("colmap_input_name", "input")).stem

# Mapped table as CSV, encoded with a UTF-8 byte-order mark.
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")

# Audit document: each key is the name of the result attribute it holds.
audit_fields = (
    "mapping",
    "inferred_pairs",
    "columns_renamed",
    "columns_dropped",
    "columns_added",
    "coercion_failures",
    "unmapped_kept",
    "missing_required_targets",
)
audit_payload = {field: getattr(result, field) for field in audit_fields}
audit_bytes = json.dumps(audit_payload, indent=2, default=str).encode("utf-8")

# Options snapshot stored in session state when the mapping was run.
saved_options = st.session_state.get("colmap_options", {})
config_bytes = json.dumps(saved_options, indent=2, default=str).encode("utf-8")

# With no resolved mapping, the audit download is disabled and says why.
_no_mapping = not result.mapping
audit_help = "No mapping was applied." if _no_mapping else None

dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
    html_download_button(
        "Download mapped CSV",
        mapped_bytes,
        file_name=f"{stem}_mapped.csv",
        mime="text/csv",
    )
with dl_b:
    html_download_button(
        "Download mapping audit",
        audit_bytes,
        file_name=f"{stem}_mapping.json",
        mime="application/json",
        disabled=_no_mapping,
        help=audit_help,
    )
with dl_c:
    html_download_button(
        "Download config JSON",
        config_bytes,
        file_name="column_map_config.json",
        mime="application/json",
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# After Apply Column Mapping the preview and options collapse, but the page
# does not scroll by itself and the Results section sits at the bottom of a
# tall script. This injects a tiny HTML snippet that calls ``scrollIntoView``
# on the Results anchor in the parent document. Per the original note, this
# relies on the main page being same-origin with the embedded frame, so
# ``window.parent.document`` is reachable.
#
# The flag is popped, so it fires once: reruns triggered by other widgets do
# not pull the viewport back to the top of Results.
should_scroll = st.session_state.pop("_colmap_scroll_to_results", False)
if should_scroll:
    st.iframe(
        """
        <script>
        const doc = window.parent.document;
        const target = doc.getElementById('colmap-results-anchor');
        if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
        </script>
        """,
        height=1,
    )
|