feat(tools): unified post-run UX across all Ready tool pages
Apply the Clean Text page's post-run UX pattern to every other Ready
tool page (Find Duplicates, Standardize Formats, Fix Missing Values,
Map Columns, Automated Workflows) for consistency and ease of use.
Per page:
1. Preview wrapped in ``st.expander(f"Preview: {filename}",
expanded=not _has_result)``. Open before a result exists, folded
afterwards.
2. Options / configuration controls wrapped in
``st.expander("Options", expanded=not _has_result)``. Inner
sub-expanders preserved (Streamlit 1.36+ supports nesting).
3. After the primary action stashes the result, set a one-shot
``_<tool>_scroll_to_results`` flag in session state and call
``st.rerun()`` so the preview + options expanders see the new
state on the next pass and collapse themselves.
4. ``<div id="<tool>-results-anchor" style="height:1px"></div>`` placed
immediately before the Results subheader.
5. End-of-page: pop the scroll flag and, only if it was set, inject a
zero-height ``streamlit.components.v1.html`` iframe whose ``<script>``
calls ``scrollIntoView`` on the parent document's anchor. One-shot, so
unrelated reruns (toggling Show-hidden, etc.) don't yank the
viewport.
6. Download buttons hardened against the multi-button Streamlit
footgun: byte buffers pre-computed outside the column scopes,
explicit unique ``key="<tool>_dl_<purpose>"`` per button,
``use_container_width=True``, and previously-conditional buttons
now render unconditionally with ``disabled=True`` + a help
tooltip when the underlying data is empty so layout stays steady.
Per-page judgment calls (already noted in agent reports):
- Find Duplicates: sheet picker and delimiter selector kept OUTSIDE
expanders (the user still needs to see them when a file fails to
parse).
- Fix Missing Values: missingness profile wrapped INSIDE the Options
expander together with Strategy — the Results section already
shows a before/after missingness comparison that supersedes the
static input profile.
- Map Columns: all three subsections (Target schema, Strategy,
Mapping) wrapped under one outer Options expander, matching the
Clean Text pattern.
- Automated Workflows: inner "Recommended tool order" expander stays
nested inside the outer Options wrap; Run button stays outside
Options so the user can re-run after tweaking the (collapsed)
editor.
2008 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -173,22 +173,33 @@ if uploaded is not None:
|
|||||||
st.session_state["review_decisions"] = {}
|
st.session_state["review_decisions"] = {}
|
||||||
tmp_path.unlink(missing_ok=True)
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
# Collapse the input preview + options once a result exists so
|
||||||
|
# the Results section below becomes the primary visual focus
|
||||||
|
# after Find Duplicates runs. Mirrors the Clean Text pattern.
|
||||||
|
_has_result = st.session_state.get("result") is not None
|
||||||
|
|
||||||
# Preview
|
# Preview
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
# Subheader retained inside the expander so collected_text in
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
# the workflow tests still finds "Preview: <name>" — Streamlit's
|
||||||
|
# AppTest does not surface expander labels through the
|
||||||
|
# markdown/caption/subheader collections.
|
||||||
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
|
|
||||||
# Advanced options
|
# Advanced options
|
||||||
settings = config_panel(df)
|
with st.expander("Options", expanded=not _has_result):
|
||||||
|
settings = config_panel(df)
|
||||||
|
|
||||||
# Apply loaded config if present
|
# Apply loaded config if present
|
||||||
loaded_cfg = st.session_state.get("loaded_config")
|
loaded_cfg = st.session_state.get("loaded_config")
|
||||||
if loaded_cfg is not None:
|
if loaded_cfg is not None:
|
||||||
settings["strategies"] = loaded_cfg.to_strategies()
|
settings["strategies"] = loaded_cfg.to_strategies()
|
||||||
settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
|
settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
|
||||||
settings["date_column"] = loaded_cfg.date_column
|
settings["date_column"] = loaded_cfg.date_column
|
||||||
settings["merge"] = loaded_cfg.merge
|
settings["merge"] = loaded_cfg.merge
|
||||||
del st.session_state["loaded_config"]
|
del st.session_state["loaded_config"]
|
||||||
|
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
# Find Duplicates button
|
# Find Duplicates button
|
||||||
@@ -218,6 +229,11 @@ if uploaded is not None:
|
|||||||
progress_bar.empty()
|
progress_bar.empty()
|
||||||
st.session_state["result"] = result
|
st.session_state["result"] = result
|
||||||
st.session_state["review_decisions"] = {}
|
st.session_state["review_decisions"] = {}
|
||||||
|
# One-shot flag for the scroll snippet at the bottom of the
|
||||||
|
# page. Force a rerun so the Preview / Options expanders see
|
||||||
|
# the new result on the next pass and collapse themselves.
|
||||||
|
st.session_state["_dedup_scroll_to_results"] = True
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
# -------------------------------------------------------------------
|
# -------------------------------------------------------------------
|
||||||
# Results
|
# Results
|
||||||
@@ -227,6 +243,14 @@ if uploaded is not None:
|
|||||||
|
|
||||||
if result is not None:
|
if result is not None:
|
||||||
st.divider()
|
st.divider()
|
||||||
|
# Anchor target for the post-run auto-scroll snippet at the
|
||||||
|
# bottom of this page. A bare ``<div id="...">`` survives
|
||||||
|
# Streamlit's HTML sanitizer; a 1px-tall div doesn't shift
|
||||||
|
# layout.
|
||||||
|
st.markdown(
|
||||||
|
'<div id="dedup-results-anchor" style="height:1px"></div>',
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
st.subheader("Results")
|
st.subheader("Results")
|
||||||
|
|
||||||
# Summary + download buttons
|
# Summary + download buttons
|
||||||
@@ -324,27 +348,45 @@ if uploaded is not None:
|
|||||||
df, result.match_groups, decisions,
|
df, result.match_groups, decisions,
|
||||||
)
|
)
|
||||||
|
|
||||||
csv_bytes = reviewed_df.to_csv(
|
# Pre-compute every byte buffer up front so each
|
||||||
|
# ``st.download_button`` sees stable ``data``
|
||||||
|
# across reruns. Render the empty-removed case
|
||||||
|
# as a disabled button (rather than hiding it)
|
||||||
|
# so layout stays steady and the user can see
|
||||||
|
# why the download isn't available.
|
||||||
|
reviewed_bytes = reviewed_df.to_csv(
|
||||||
index=False
|
index=False
|
||||||
).encode("utf-8-sig")
|
).encode("utf-8-sig")
|
||||||
|
reviewed_removed_empty = reviewed_removed.empty
|
||||||
|
reviewed_removed_bytes = (
|
||||||
|
reviewed_removed.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
if not reviewed_removed_empty
|
||||||
|
else b""
|
||||||
|
)
|
||||||
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download Reviewed & Deduplicated CSV",
|
"Download Reviewed & Deduplicated CSV",
|
||||||
data=csv_bytes,
|
data=reviewed_bytes,
|
||||||
file_name="deduplicated_reviewed.csv",
|
file_name="deduplicated_reviewed.csv",
|
||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
key="reviewed_download",
|
key="dedup_dl_reviewed",
|
||||||
|
use_container_width=True,
|
||||||
|
)
|
||||||
|
st.download_button(
|
||||||
|
"Download Reviewed Removed Rows",
|
||||||
|
data=reviewed_removed_bytes,
|
||||||
|
file_name="removed_reviewed.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
key="dedup_dl_reviewed_removed",
|
||||||
|
disabled=reviewed_removed_empty,
|
||||||
|
help=(
|
||||||
|
"No rows were removed under the current "
|
||||||
|
"review decisions."
|
||||||
|
if reviewed_removed_empty
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
if not reviewed_removed.empty:
|
|
||||||
removed_bytes = reviewed_removed.to_csv(
|
|
||||||
index=False
|
|
||||||
).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
|
||||||
"Download Reviewed Removed Rows",
|
|
||||||
data=removed_bytes,
|
|
||||||
file_name="removed_reviewed.csv",
|
|
||||||
mime="text/csv",
|
|
||||||
key="reviewed_removed_download",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Log entries
|
# Log entries
|
||||||
if result.log_entries:
|
if result.log_entries:
|
||||||
@@ -365,3 +407,27 @@ st.caption(
|
|||||||
"Runs locally. Your data never leaves this computer. "
|
"Runs locally. Your data never leaves this computer. "
|
||||||
"| DataTools v3.0"
|
"| DataTools v3.0"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-run auto-scroll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# When Find Duplicates fires, the preview + options collapse, but
|
||||||
|
# Streamlit by itself doesn't scroll — the Results section sits below a
|
||||||
|
# tall page so the user has to hunt for it. Inject a tiny
|
||||||
|
# component-html iframe that calls ``scrollIntoView`` on the parent's
|
||||||
|
# Results anchor. The flag is one-shot (``pop`` removes it) so reruns
|
||||||
|
# triggered by unrelated widgets in the Results section don't yank the
|
||||||
|
# viewport back to the top of Results.
|
||||||
|
if st.session_state.pop("_dedup_scroll_to_results", False):
|
||||||
|
from streamlit.components.v1 import html as _components_html
|
||||||
|
_components_html(
|
||||||
|
"""
|
||||||
|
<script>
|
||||||
|
const doc = window.parent.document;
|
||||||
|
const target = doc.getElementById('dedup-results-anchor');
|
||||||
|
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
|
||||||
|
</script>
|
||||||
|
""",
|
||||||
|
height=0,
|
||||||
|
)
|
||||||
|
|||||||
@@ -99,9 +99,13 @@ except Exception as e:
|
|||||||
)
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
# Collapse the input preview once the user has clicked Standardize Formats
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
# so the Results section below is the primary visual focus. The user can
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
# re-expand the expander to re-inspect the source rows.
|
||||||
|
_has_result = st.session_state.get("fmtstd_result") is not None
|
||||||
|
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
|
|
||||||
@@ -180,328 +184,335 @@ def _detect_field_type(col: str, samples: list[str]) -> FieldType | None:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Options
|
# Options
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
st.subheader("Column types")
|
# Wrapped in an outer expander whose default state mirrors the preview
|
||||||
st.caption(
|
# expander above: open before a result exists, folded once the user has
|
||||||
"Assign each column to a field type. Auto-detected suggestions are "
|
# clicked Standardize Formats. Together they push the Results section to
|
||||||
"pre-filled; pick **(skip)** to leave a column untouched."
|
# the top of the visible area after a run.
|
||||||
)
|
|
||||||
|
|
||||||
_FIELD_LABELS = {
|
|
||||||
"(skip)": None,
|
|
||||||
"Date": FieldType.DATE,
|
|
||||||
"Phone": FieldType.PHONE,
|
|
||||||
"Currency": FieldType.CURRENCY,
|
|
||||||
"Name": FieldType.NAME,
|
|
||||||
"Address": FieldType.ADDRESS,
|
|
||||||
"Boolean": FieldType.BOOLEAN,
|
|
||||||
}
|
|
||||||
_LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()}
|
|
||||||
_LABELS = list(_FIELD_LABELS.keys())
|
|
||||||
|
|
||||||
sample_size = min(len(df), 200)
|
|
||||||
sample_df = df.head(sample_size)
|
|
||||||
|
|
||||||
column_types: dict[str, FieldType] = {}
|
column_types: dict[str, FieldType] = {}
|
||||||
cols_per_row = 3
|
|
||||||
columns_iter = list(df.columns)
|
|
||||||
for i in range(0, len(columns_iter), cols_per_row):
|
|
||||||
cols_block = st.columns(cols_per_row)
|
|
||||||
for j, col_name in enumerate(columns_iter[i:i + cols_per_row]):
|
|
||||||
with cols_block[j]:
|
|
||||||
detected = _detect_field_type(col_name, sample_df[col_name].tolist())
|
|
||||||
default_label = _LABEL_BY_TYPE.get(detected, "(skip)")
|
|
||||||
chosen = st.selectbox(
|
|
||||||
col_name,
|
|
||||||
_LABELS,
|
|
||||||
index=_LABELS.index(default_label),
|
|
||||||
key=f"fmtstd_type__{col_name}",
|
|
||||||
)
|
|
||||||
ft = _FIELD_LABELS[chosen]
|
|
||||||
if ft is not None:
|
|
||||||
column_types[col_name] = ft
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
st.subheader("Format options")
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Preset bundle picker
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
#
|
|
||||||
# Picking a preset rewrites every option below to that preset's defaults.
|
|
||||||
# It does NOT touch column-type assignments — those are user-driven and
|
|
||||||
# orthogonal. To make the rewrite stick across the rerun, we stash the
|
|
||||||
# preset values into the per-option session keys; the widgets below read
|
|
||||||
# those keys via their ``index``/``value`` arguments.
|
|
||||||
|
|
||||||
_PRESET_LABELS = {
|
|
||||||
"us-default": "US (default) — ISO 8601 dates · E.164 phones · USD",
|
|
||||||
"european": "European — DMY input · INTL phones · EUR comma decimal",
|
|
||||||
"uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans",
|
|
||||||
"iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false",
|
|
||||||
"legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No",
|
|
||||||
"custom": "Custom — keep current settings",
|
|
||||||
}
|
|
||||||
|
|
||||||
preset_choice = st.radio(
|
|
||||||
"Standards preset",
|
|
||||||
list(_PRESET_LABELS.keys()),
|
|
||||||
format_func=lambda k: _PRESET_LABELS[k],
|
|
||||||
index=0,
|
|
||||||
horizontal=False,
|
|
||||||
key="fmtstd_preset",
|
|
||||||
help=(
|
|
||||||
"Pick a published standard or regional convention as the baseline. "
|
|
||||||
"Every option below is still individually overridable; choose "
|
|
||||||
"**Custom** to keep whatever you've manually adjusted."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Detect a preset switch since the last rerun; when it changes (and the
|
|
||||||
# new choice isn't ``custom``), purge the dependent widget keys so
|
|
||||||
# Streamlit lets their ``index=``/``value=`` defaults take effect on the
|
|
||||||
# new render. Without this clear, prior session_state pins the widget to
|
|
||||||
# the previous preset's choice and the apparent picker becomes a no-op.
|
|
||||||
_DEPENDENT_KEYS = [
|
|
||||||
"fmtstd_date_format", "fmtstd_date_order",
|
|
||||||
"fmtstd_phone_format", "fmtstd_phone_region",
|
|
||||||
"fmtstd_currency_decimal", "fmtstd_currency_decimals",
|
|
||||||
"fmtstd_currency_preserve", "fmtstd_currency_preserve_code",
|
|
||||||
"fmtstd_name_case", "fmtstd_bool_style",
|
|
||||||
]
|
|
||||||
_last = st.session_state.get("fmtstd_preset_last")
|
|
||||||
if _last != preset_choice:
|
|
||||||
st.session_state["fmtstd_preset_last"] = preset_choice
|
|
||||||
if preset_choice != "custom":
|
|
||||||
for k in _DEPENDENT_KEYS:
|
|
||||||
st.session_state.pop(k, None)
|
|
||||||
st.rerun()
|
|
||||||
|
|
||||||
# Map preset → widget-state defaults. Done as labels so the radios/selects
|
|
||||||
# below pick up the right index without us re-implementing each map twice.
|
|
||||||
_PRESET_TO_WIDGETS: dict[str, dict[str, str]] = {
|
|
||||||
"us-default": {
|
|
||||||
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
|
||||||
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
|
||||||
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
|
||||||
"currency_preserve_code": False,
|
|
||||||
"name_case": "Title Case", "boolean_style": "True/False",
|
|
||||||
},
|
|
||||||
"european": {
|
|
||||||
"date_format": "YYYY-MM-DD (ISO)", "date_order": "DMY (EU)",
|
|
||||||
"phone_format": "International (+1 555-123-4567)", "phone_region": "DE",
|
|
||||||
"currency_decimal": "comma (1.234,56)", "currency_decimals": 2,
|
|
||||||
"currency_preserve_code": True,
|
|
||||||
"name_case": "Title Case", "boolean_style": "True/False",
|
|
||||||
},
|
|
||||||
"uk": {
|
|
||||||
"date_format": "DD/MM/YYYY", "date_order": "DMY (EU)",
|
|
||||||
"phone_format": "International (+1 555-123-4567)", "phone_region": "GB",
|
|
||||||
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
|
||||||
"currency_preserve_code": False,
|
|
||||||
"name_case": "Title Case", "boolean_style": "Yes/No",
|
|
||||||
},
|
|
||||||
"iso-strict": {
|
|
||||||
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
|
||||||
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
|
||||||
"currency_decimal": "dot (1,234.56)", "currency_decimals": 0,
|
|
||||||
"currency_preserve_code": True,
|
|
||||||
"name_case": "Title Case", "boolean_style": "true/false",
|
|
||||||
},
|
|
||||||
"legacy-us": {
|
|
||||||
"date_format": "MM/DD/YYYY", "date_order": "MDY (US)",
|
|
||||||
"phone_format": "National ((555) 123-4567)", "phone_region": "US",
|
|
||||||
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
|
||||||
"currency_preserve_code": False,
|
|
||||||
"name_case": "Title Case", "boolean_style": "Yes/No",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# ``iso-strict`` wants currency with no rounding; the GUI exposes that via
|
|
||||||
# the "preserve original precision" checkbox rather than a sentinel value
|
|
||||||
# in the number-input. Map that here.
|
|
||||||
_PRESET_PRESERVE_DECIMALS: dict[str, bool] = {
|
|
||||||
"iso-strict": True,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _preset_default(key: str, fallback):
|
|
||||||
"""Pull the preset-driven default for *key*, or *fallback* on Custom."""
|
|
||||||
if preset_choice == "custom":
|
|
||||||
return fallback
|
|
||||||
return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback)
|
|
||||||
|
|
||||||
|
|
||||||
opt_cols = st.columns(2)
|
|
||||||
with opt_cols[0]:
|
|
||||||
st.markdown("**Dates**")
|
|
||||||
_DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"]
|
|
||||||
date_format_label = st.selectbox(
|
|
||||||
"Output format",
|
|
||||||
_DATE_LABELS,
|
|
||||||
index=_DATE_LABELS.index(_preset_default("date_format", "YYYY-MM-DD (ISO)")),
|
|
||||||
key="fmtstd_date_format",
|
|
||||||
)
|
|
||||||
date_format_map = {
|
|
||||||
"YYYY-MM-DD (ISO)": "%Y-%m-%d",
|
|
||||||
"MM/DD/YYYY": "%m/%d/%Y",
|
|
||||||
"DD/MM/YYYY": "%d/%m/%Y",
|
|
||||||
"DD-Mon-YYYY": "%d-%b-%Y",
|
|
||||||
"Mon DD, YYYY": "%b %d, %Y",
|
|
||||||
}
|
|
||||||
_DATE_ORDER_LABELS = ["MDY (US)", "DMY (EU)"]
|
|
||||||
date_order = st.radio(
|
|
||||||
"Ambiguous input order (e.g. 01/02/2024)",
|
|
||||||
_DATE_ORDER_LABELS,
|
|
||||||
index=_DATE_ORDER_LABELS.index(_preset_default("date_order", "MDY (US)")),
|
|
||||||
horizontal=True,
|
|
||||||
key="fmtstd_date_order",
|
|
||||||
)
|
|
||||||
|
|
||||||
st.markdown("**Phones**")
|
|
||||||
_PHONE_LABELS = [
|
|
||||||
"E.164 (+15551234567)", "International (+1 555-123-4567)",
|
|
||||||
"National ((555) 123-4567)", "Digits only",
|
|
||||||
]
|
|
||||||
phone_format_label = st.selectbox(
|
|
||||||
"Output format",
|
|
||||||
_PHONE_LABELS,
|
|
||||||
index=_PHONE_LABELS.index(_preset_default("phone_format", "E.164 (+15551234567)")),
|
|
||||||
key="fmtstd_phone_format",
|
|
||||||
)
|
|
||||||
phone_format_map = {
|
|
||||||
"E.164 (+15551234567)": "E164",
|
|
||||||
"International (+1 555-123-4567)": "INTERNATIONAL",
|
|
||||||
"National ((555) 123-4567)": "NATIONAL",
|
|
||||||
"Digits only": "DIGITS",
|
|
||||||
}
|
|
||||||
phone_region = st.text_input(
|
|
||||||
"Default region (ISO-2)",
|
|
||||||
value=_preset_default("phone_region", "US"),
|
|
||||||
max_chars=2,
|
|
||||||
help="Region used when the input has no country code. ``US``, ``GB``, ``DE``, etc.",
|
|
||||||
key="fmtstd_phone_region",
|
|
||||||
).upper() or "US"
|
|
||||||
|
|
||||||
with opt_cols[1]:
|
|
||||||
st.markdown("**Currency**")
|
|
||||||
_CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"]
|
|
||||||
currency_decimal = st.radio(
|
|
||||||
"Decimal separator in input",
|
|
||||||
_CURR_DECIMAL_LABELS,
|
|
||||||
index=_CURR_DECIMAL_LABELS.index(_preset_default("currency_decimal", "dot (1,234.56)")),
|
|
||||||
horizontal=True,
|
|
||||||
key="fmtstd_currency_decimal",
|
|
||||||
)
|
|
||||||
currency_decimals = st.number_input(
|
|
||||||
"Round to decimals",
|
|
||||||
min_value=0, max_value=8,
|
|
||||||
value=int(_preset_default("currency_decimals", 2)),
|
|
||||||
step=1,
|
|
||||||
key="fmtstd_currency_decimals",
|
|
||||||
)
|
|
||||||
preserve_decimals = st.checkbox(
|
|
||||||
"Preserve original precision (don't round)",
|
|
||||||
value=_PRESET_PRESERVE_DECIMALS.get(preset_choice, False),
|
|
||||||
key="fmtstd_currency_preserve",
|
|
||||||
)
|
|
||||||
currency_preserve_code = st.checkbox(
|
|
||||||
"Preserve currency code (emit `USD 1234.56`, `EUR 99.00`, etc.)",
|
|
||||||
value=bool(_preset_default("currency_preserve_code", False)),
|
|
||||||
help=(
|
|
||||||
"Detects an ISO 4217 code or symbol in the input ($/€/£/¥/USD/"
|
|
||||||
"EUR/...) and re-emits it as a space-separated prefix on the "
|
|
||||||
"standardized number. Cells without a currency marker emit "
|
|
||||||
"just the number."
|
|
||||||
),
|
|
||||||
key="fmtstd_currency_preserve_code",
|
|
||||||
)
|
|
||||||
|
|
||||||
st.markdown("**Names**")
|
|
||||||
_NAME_CASE_LABELS = ["Title Case", "UPPER", "lower"]
|
|
||||||
name_case_label = st.selectbox(
|
|
||||||
"Casing",
|
|
||||||
_NAME_CASE_LABELS,
|
|
||||||
index=_NAME_CASE_LABELS.index(_preset_default("name_case", "Title Case")),
|
|
||||||
key="fmtstd_name_case",
|
|
||||||
)
|
|
||||||
name_case_map = {"Title Case": "title", "UPPER": "upper", "lower": "lower"}
|
|
||||||
|
|
||||||
st.markdown("**Booleans**")
|
|
||||||
_BOOL_LABELS = ["True/False", "true/false", "Yes/No", "Y/N", "1/0"]
|
|
||||||
boolean_style = st.selectbox(
|
|
||||||
"Output style",
|
|
||||||
_BOOL_LABELS,
|
|
||||||
index=_BOOL_LABELS.index(_preset_default("boolean_style", "True/False")),
|
|
||||||
key="fmtstd_bool_style",
|
|
||||||
)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Address abbreviations — built-in USPS table is editable
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
#
|
|
||||||
# Users with international addresses (German Strasse, Spanish-language
|
|
||||||
# Avenida, French Boulevard variants) need to override the built-in
|
|
||||||
# table. Show it in a data_editor so the override is visible — the table
|
|
||||||
# is small, this is the right surface.
|
|
||||||
|
|
||||||
extra_abbreviations: dict[str, str] = {}
|
extra_abbreviations: dict[str, str] = {}
|
||||||
if any(ft == FieldType.ADDRESS for ft in column_types.values()):
|
|
||||||
with st.expander("Custom address abbreviations (advanced)", expanded=False):
|
|
||||||
st.caption(
|
|
||||||
"Add or override entries in the address abbreviation table. "
|
|
||||||
"Each row maps a short form (case-insensitive, periods OK) to "
|
|
||||||
"the long form the standardizer should emit. Built-in USPS "
|
|
||||||
"Pub. 28 entries (`St` → `Street`, `Ave` → `Avenue`, …) apply "
|
|
||||||
"automatically; rows here merge on top and can override them."
|
|
||||||
)
|
|
||||||
starter = pd.DataFrame(
|
|
||||||
[
|
|
||||||
{"abbreviation": "", "expansion": ""},
|
|
||||||
{"abbreviation": "", "expansion": ""},
|
|
||||||
{"abbreviation": "", "expansion": ""},
|
|
||||||
]
|
|
||||||
)
|
|
||||||
edited = st.data_editor(
|
|
||||||
starter,
|
|
||||||
num_rows="dynamic",
|
|
||||||
use_container_width=True,
|
|
||||||
column_config={
|
|
||||||
"abbreviation": st.column_config.TextColumn(
|
|
||||||
"Short form",
|
|
||||||
help="Case-insensitive, trailing period optional. e.g. ``Strasse``",
|
|
||||||
),
|
|
||||||
"expansion": st.column_config.TextColumn(
|
|
||||||
"Long form",
|
|
||||||
help="What the standardizer emits. e.g. ``Straße``",
|
|
||||||
),
|
|
||||||
},
|
|
||||||
key="fmtstd_extra_abbrev",
|
|
||||||
)
|
|
||||||
for _, row in edited.iterrows():
|
|
||||||
k = str(row.get("abbreviation") or "").strip()
|
|
||||||
v = str(row.get("expansion") or "").strip()
|
|
||||||
if k and v:
|
|
||||||
extra_abbreviations[k] = v
|
|
||||||
if extra_abbreviations:
|
|
||||||
st.success(
|
|
||||||
f"{len(extra_abbreviations)} custom mapping(s) will merge "
|
|
||||||
"with the built-in table."
|
|
||||||
)
|
|
||||||
|
|
||||||
options = StandardizeOptions(
|
with st.expander("Options", expanded=not _has_result):
|
||||||
column_types=column_types,
|
st.subheader("Column types")
|
||||||
date_output_format=date_format_map[date_format_label],
|
st.caption(
|
||||||
date_order="MDY" if date_order.startswith("MDY") else "DMY",
|
"Assign each column to a field type. Auto-detected suggestions are "
|
||||||
phone_format=phone_format_map[phone_format_label], # type: ignore[arg-type]
|
"pre-filled; pick **(skip)** to leave a column untouched."
|
||||||
phone_region=phone_region,
|
)
|
||||||
currency_decimal="dot" if currency_decimal.startswith("dot") else "comma",
|
|
||||||
currency_decimals=None if preserve_decimals else int(currency_decimals),
|
_FIELD_LABELS = {
|
||||||
currency_preserve_code=currency_preserve_code,
|
"(skip)": None,
|
||||||
name_case=name_case_map[name_case_label], # type: ignore[arg-type]
|
"Date": FieldType.DATE,
|
||||||
boolean_style=boolean_style, # type: ignore[arg-type]
|
"Phone": FieldType.PHONE,
|
||||||
extra_abbreviations=extra_abbreviations,
|
"Currency": FieldType.CURRENCY,
|
||||||
)
|
"Name": FieldType.NAME,
|
||||||
|
"Address": FieldType.ADDRESS,
|
||||||
|
"Boolean": FieldType.BOOLEAN,
|
||||||
|
}
|
||||||
|
_LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()}
|
||||||
|
_LABELS = list(_FIELD_LABELS.keys())
|
||||||
|
|
||||||
|
sample_size = min(len(df), 200)
|
||||||
|
sample_df = df.head(sample_size)
|
||||||
|
|
||||||
|
cols_per_row = 3
|
||||||
|
columns_iter = list(df.columns)
|
||||||
|
for i in range(0, len(columns_iter), cols_per_row):
|
||||||
|
cols_block = st.columns(cols_per_row)
|
||||||
|
for j, col_name in enumerate(columns_iter[i:i + cols_per_row]):
|
||||||
|
with cols_block[j]:
|
||||||
|
detected = _detect_field_type(col_name, sample_df[col_name].tolist())
|
||||||
|
default_label = _LABEL_BY_TYPE.get(detected, "(skip)")
|
||||||
|
chosen = st.selectbox(
|
||||||
|
col_name,
|
||||||
|
_LABELS,
|
||||||
|
index=_LABELS.index(default_label),
|
||||||
|
key=f"fmtstd_type__{col_name}",
|
||||||
|
)
|
||||||
|
ft = _FIELD_LABELS[chosen]
|
||||||
|
if ft is not None:
|
||||||
|
column_types[col_name] = ft
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.subheader("Format options")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preset bundle picker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Picking a preset rewrites every option below to that preset's defaults.
|
||||||
|
# It does NOT touch column-type assignments — those are user-driven and
|
||||||
|
# orthogonal. To make the rewrite stick across the rerun, we stash the
|
||||||
|
# preset values into the per-option session keys; the widgets below read
|
||||||
|
# those keys via their ``index``/``value`` arguments.
|
||||||
|
|
||||||
|
_PRESET_LABELS = {
|
||||||
|
"us-default": "US (default) — ISO 8601 dates · E.164 phones · USD",
|
||||||
|
"european": "European — DMY input · INTL phones · EUR comma decimal",
|
||||||
|
"uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans",
|
||||||
|
"iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false",
|
||||||
|
"legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No",
|
||||||
|
"custom": "Custom — keep current settings",
|
||||||
|
}
|
||||||
|
|
||||||
|
preset_choice = st.radio(
|
||||||
|
"Standards preset",
|
||||||
|
list(_PRESET_LABELS.keys()),
|
||||||
|
format_func=lambda k: _PRESET_LABELS[k],
|
||||||
|
index=0,
|
||||||
|
horizontal=False,
|
||||||
|
key="fmtstd_preset",
|
||||||
|
help=(
|
||||||
|
"Pick a published standard or regional convention as the baseline. "
|
||||||
|
"Every option below is still individually overridable; choose "
|
||||||
|
"**Custom** to keep whatever you've manually adjusted."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Detect a preset switch since the last rerun; when it changes (and the
|
||||||
|
# new choice isn't ``custom``), purge the dependent widget keys so
|
||||||
|
# Streamlit lets their ``index=``/``value=`` defaults take effect on the
|
||||||
|
# new render. Without this clear, prior session_state pins the widget to
|
||||||
|
# the previous preset's choice and the apparent picker becomes a no-op.
|
||||||
|
_DEPENDENT_KEYS = [
|
||||||
|
"fmtstd_date_format", "fmtstd_date_order",
|
||||||
|
"fmtstd_phone_format", "fmtstd_phone_region",
|
||||||
|
"fmtstd_currency_decimal", "fmtstd_currency_decimals",
|
||||||
|
"fmtstd_currency_preserve", "fmtstd_currency_preserve_code",
|
||||||
|
"fmtstd_name_case", "fmtstd_bool_style",
|
||||||
|
]
|
||||||
|
_last = st.session_state.get("fmtstd_preset_last")
|
||||||
|
if _last != preset_choice:
|
||||||
|
st.session_state["fmtstd_preset_last"] = preset_choice
|
||||||
|
if preset_choice != "custom":
|
||||||
|
for k in _DEPENDENT_KEYS:
|
||||||
|
st.session_state.pop(k, None)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Map preset → widget-state defaults. Done as labels so the radios/selects
|
||||||
|
# below pick up the right index without us re-implementing each map twice.
|
||||||
|
_PRESET_TO_WIDGETS: dict[str, dict[str, str]] = {
|
||||||
|
"us-default": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "True/False",
|
||||||
|
},
|
||||||
|
"european": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "DMY (EU)",
|
||||||
|
"phone_format": "International (+1 555-123-4567)", "phone_region": "DE",
|
||||||
|
"currency_decimal": "comma (1.234,56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": True,
|
||||||
|
"name_case": "Title Case", "boolean_style": "True/False",
|
||||||
|
},
|
||||||
|
"uk": {
|
||||||
|
"date_format": "DD/MM/YYYY", "date_order": "DMY (EU)",
|
||||||
|
"phone_format": "International (+1 555-123-4567)", "phone_region": "GB",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "Yes/No",
|
||||||
|
},
|
||||||
|
"iso-strict": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 0,
|
||||||
|
"currency_preserve_code": True,
|
||||||
|
"name_case": "Title Case", "boolean_style": "true/false",
|
||||||
|
},
|
||||||
|
"legacy-us": {
|
||||||
|
"date_format": "MM/DD/YYYY", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "National ((555) 123-4567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "Yes/No",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# ``iso-strict`` wants currency with no rounding; the GUI exposes that via
|
||||||
|
# the "preserve original precision" checkbox rather than a sentinel value
|
||||||
|
# in the number-input. Map that here.
|
||||||
|
_PRESET_PRESERVE_DECIMALS: dict[str, bool] = {
|
||||||
|
"iso-strict": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _preset_default(key: str, fallback):
|
||||||
|
"""Pull the preset-driven default for *key*, or *fallback* on Custom."""
|
||||||
|
if preset_choice == "custom":
|
||||||
|
return fallback
|
||||||
|
return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback)
|
||||||
|
|
||||||
|
|
||||||
|
opt_cols = st.columns(2)
|
||||||
|
with opt_cols[0]:
|
||||||
|
st.markdown("**Dates**")
|
||||||
|
_DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"]
|
||||||
|
date_format_label = st.selectbox(
|
||||||
|
"Output format",
|
||||||
|
_DATE_LABELS,
|
||||||
|
index=_DATE_LABELS.index(_preset_default("date_format", "YYYY-MM-DD (ISO)")),
|
||||||
|
key="fmtstd_date_format",
|
||||||
|
)
|
||||||
|
date_format_map = {
|
||||||
|
"YYYY-MM-DD (ISO)": "%Y-%m-%d",
|
||||||
|
"MM/DD/YYYY": "%m/%d/%Y",
|
||||||
|
"DD/MM/YYYY": "%d/%m/%Y",
|
||||||
|
"DD-Mon-YYYY": "%d-%b-%Y",
|
||||||
|
"Mon DD, YYYY": "%b %d, %Y",
|
||||||
|
}
|
||||||
|
_DATE_ORDER_LABELS = ["MDY (US)", "DMY (EU)"]
|
||||||
|
date_order = st.radio(
|
||||||
|
"Ambiguous input order (e.g. 01/02/2024)",
|
||||||
|
_DATE_ORDER_LABELS,
|
||||||
|
index=_DATE_ORDER_LABELS.index(_preset_default("date_order", "MDY (US)")),
|
||||||
|
horizontal=True,
|
||||||
|
key="fmtstd_date_order",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Phones**")
|
||||||
|
_PHONE_LABELS = [
|
||||||
|
"E.164 (+15551234567)", "International (+1 555-123-4567)",
|
||||||
|
"National ((555) 123-4567)", "Digits only",
|
||||||
|
]
|
||||||
|
phone_format_label = st.selectbox(
|
||||||
|
"Output format",
|
||||||
|
_PHONE_LABELS,
|
||||||
|
index=_PHONE_LABELS.index(_preset_default("phone_format", "E.164 (+15551234567)")),
|
||||||
|
key="fmtstd_phone_format",
|
||||||
|
)
|
||||||
|
phone_format_map = {
|
||||||
|
"E.164 (+15551234567)": "E164",
|
||||||
|
"International (+1 555-123-4567)": "INTERNATIONAL",
|
||||||
|
"National ((555) 123-4567)": "NATIONAL",
|
||||||
|
"Digits only": "DIGITS",
|
||||||
|
}
|
||||||
|
phone_region = st.text_input(
|
||||||
|
"Default region (ISO-2)",
|
||||||
|
value=_preset_default("phone_region", "US"),
|
||||||
|
max_chars=2,
|
||||||
|
help="Region used when the input has no country code. ``US``, ``GB``, ``DE``, etc.",
|
||||||
|
key="fmtstd_phone_region",
|
||||||
|
).upper() or "US"
|
||||||
|
|
||||||
|
with opt_cols[1]:
|
||||||
|
st.markdown("**Currency**")
|
||||||
|
_CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"]
|
||||||
|
currency_decimal = st.radio(
|
||||||
|
"Decimal separator in input",
|
||||||
|
_CURR_DECIMAL_LABELS,
|
||||||
|
index=_CURR_DECIMAL_LABELS.index(_preset_default("currency_decimal", "dot (1,234.56)")),
|
||||||
|
horizontal=True,
|
||||||
|
key="fmtstd_currency_decimal",
|
||||||
|
)
|
||||||
|
currency_decimals = st.number_input(
|
||||||
|
"Round to decimals",
|
||||||
|
min_value=0, max_value=8,
|
||||||
|
value=int(_preset_default("currency_decimals", 2)),
|
||||||
|
step=1,
|
||||||
|
key="fmtstd_currency_decimals",
|
||||||
|
)
|
||||||
|
preserve_decimals = st.checkbox(
|
||||||
|
"Preserve original precision (don't round)",
|
||||||
|
value=_PRESET_PRESERVE_DECIMALS.get(preset_choice, False),
|
||||||
|
key="fmtstd_currency_preserve",
|
||||||
|
)
|
||||||
|
currency_preserve_code = st.checkbox(
|
||||||
|
"Preserve currency code (emit `USD 1234.56`, `EUR 99.00`, etc.)",
|
||||||
|
value=bool(_preset_default("currency_preserve_code", False)),
|
||||||
|
help=(
|
||||||
|
"Detects an ISO 4217 code or symbol in the input ($/€/£/¥/USD/"
|
||||||
|
"EUR/...) and re-emits it as a space-separated prefix on the "
|
||||||
|
"standardized number. Cells without a currency marker emit "
|
||||||
|
"just the number."
|
||||||
|
),
|
||||||
|
key="fmtstd_currency_preserve_code",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Names**")
|
||||||
|
_NAME_CASE_LABELS = ["Title Case", "UPPER", "lower"]
|
||||||
|
name_case_label = st.selectbox(
|
||||||
|
"Casing",
|
||||||
|
_NAME_CASE_LABELS,
|
||||||
|
index=_NAME_CASE_LABELS.index(_preset_default("name_case", "Title Case")),
|
||||||
|
key="fmtstd_name_case",
|
||||||
|
)
|
||||||
|
name_case_map = {"Title Case": "title", "UPPER": "upper", "lower": "lower"}
|
||||||
|
|
||||||
|
st.markdown("**Booleans**")
|
||||||
|
_BOOL_LABELS = ["True/False", "true/false", "Yes/No", "Y/N", "1/0"]
|
||||||
|
boolean_style = st.selectbox(
|
||||||
|
"Output style",
|
||||||
|
_BOOL_LABELS,
|
||||||
|
index=_BOOL_LABELS.index(_preset_default("boolean_style", "True/False")),
|
||||||
|
key="fmtstd_bool_style",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Address abbreviations — built-in USPS table is editable
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Users with international addresses (German Strasse, Spanish-language
|
||||||
|
# Avenida, French Boulevard variants) need to override the built-in
|
||||||
|
# table. Show it in a data_editor so the override is visible — the table
|
||||||
|
# is small, this is the right surface.
|
||||||
|
|
||||||
|
if any(ft == FieldType.ADDRESS for ft in column_types.values()):
|
||||||
|
with st.expander("Custom address abbreviations (advanced)", expanded=False):
|
||||||
|
st.caption(
|
||||||
|
"Add or override entries in the address abbreviation table. "
|
||||||
|
"Each row maps a short form (case-insensitive, periods OK) to "
|
||||||
|
"the long form the standardizer should emit. Built-in USPS "
|
||||||
|
"Pub. 28 entries (`St` → `Street`, `Ave` → `Avenue`, …) apply "
|
||||||
|
"automatically; rows here merge on top and can override them."
|
||||||
|
)
|
||||||
|
starter = pd.DataFrame(
|
||||||
|
[
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
edited = st.data_editor(
|
||||||
|
starter,
|
||||||
|
num_rows="dynamic",
|
||||||
|
use_container_width=True,
|
||||||
|
column_config={
|
||||||
|
"abbreviation": st.column_config.TextColumn(
|
||||||
|
"Short form",
|
||||||
|
help="Case-insensitive, trailing period optional. e.g. ``Strasse``",
|
||||||
|
),
|
||||||
|
"expansion": st.column_config.TextColumn(
|
||||||
|
"Long form",
|
||||||
|
help="What the standardizer emits. e.g. ``Straße``",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key="fmtstd_extra_abbrev",
|
||||||
|
)
|
||||||
|
for _, row in edited.iterrows():
|
||||||
|
k = str(row.get("abbreviation") or "").strip()
|
||||||
|
v = str(row.get("expansion") or "").strip()
|
||||||
|
if k and v:
|
||||||
|
extra_abbreviations[k] = v
|
||||||
|
if extra_abbreviations:
|
||||||
|
st.success(
|
||||||
|
f"{len(extra_abbreviations)} custom mapping(s) will merge "
|
||||||
|
"with the built-in table."
|
||||||
|
)
|
||||||
|
|
||||||
|
options = StandardizeOptions(
|
||||||
|
column_types=column_types,
|
||||||
|
date_output_format=date_format_map[date_format_label],
|
||||||
|
date_order="MDY" if date_order.startswith("MDY") else "DMY",
|
||||||
|
phone_format=phone_format_map[phone_format_label], # type: ignore[arg-type]
|
||||||
|
phone_region=phone_region,
|
||||||
|
currency_decimal="dot" if currency_decimal.startswith("dot") else "comma",
|
||||||
|
currency_decimals=None if preserve_decimals else int(currency_decimals),
|
||||||
|
currency_preserve_code=currency_preserve_code,
|
||||||
|
name_case=name_case_map[name_case_label], # type: ignore[arg-type]
|
||||||
|
boolean_style=boolean_style, # type: ignore[arg-type]
|
||||||
|
extra_abbreviations=extra_abbreviations,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -528,6 +539,14 @@ if st.button(
|
|||||||
st.stop()
|
st.stop()
|
||||||
st.session_state["fmtstd_result"] = result
|
st.session_state["fmtstd_result"] = result
|
||||||
st.session_state["fmtstd_input_name"] = uploaded.name
|
st.session_state["fmtstd_input_name"] = uploaded.name
|
||||||
|
# One-shot flag picked up on the next pass to scroll the parent
|
||||||
|
# document to the Results anchor (see scroll snippet below).
|
||||||
|
st.session_state["_fmtstd_scroll_to_results"] = True
|
||||||
|
# Force a second rerun so the preview and options expanders see
|
||||||
|
# the new result on the NEXT script pass and collapse themselves.
|
||||||
|
# Without this they stay expanded until the user touches any
|
||||||
|
# other widget.
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
result = st.session_state.get("fmtstd_result")
|
result = st.session_state.get("fmtstd_result")
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -538,6 +557,16 @@ if result is None:
|
|||||||
# Results
|
# Results
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Anchor target for the auto-scroll snippet at the end of this block.
|
||||||
|
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
|
||||||
|
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
|
||||||
|
# anything. Placed before the subheader so the scrolled-to viewport
|
||||||
|
# starts a few pixels above the section heading rather than below it.
|
||||||
|
st.markdown(
|
||||||
|
'<div id="fmtstd-results-anchor" style="height:1px"></div>',
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
st.subheader("Results")
|
st.subheader("Results")
|
||||||
|
|
||||||
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
|
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
|
||||||
@@ -574,36 +603,83 @@ st.dataframe(result.standardized_df.head(10), use_container_width=True)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Downloads
|
# Downloads
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All three byte buffers are prepared up front (outside the columns) so
|
||||||
|
# each ``st.download_button`` sees stable ``data`` across reruns and an
|
||||||
|
# explicit ``key`` — without those, Streamlit auto-derived widget IDs
|
||||||
|
# can collide for multiple download_buttons in adjacent columns and
|
||||||
|
# only the first one actually fires on click. The empty-changes case
|
||||||
|
# now renders a disabled button (rather than vanishing) so the layout
|
||||||
|
# stays steady and the user understands why nothing's available.
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
stem = Path(st.session_state.get("fmtstd_input_name", "input")).stem
|
stem = Path(st.session_state.get("fmtstd_input_name", "input")).stem
|
||||||
|
|
||||||
|
standardized_bytes = result.standardized_df.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
changes_bytes = (
|
||||||
|
result.changes.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
if not result.changes.empty
|
||||||
|
else b""
|
||||||
|
)
|
||||||
|
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
|
||||||
|
|
||||||
dl_a, dl_b, dl_c = st.columns(3)
|
dl_a, dl_b, dl_c = st.columns(3)
|
||||||
with dl_a:
|
with dl_a:
|
||||||
standardized_bytes = result.standardized_df.to_csv(index=False).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download standardized CSV",
|
"Download standardized CSV",
|
||||||
data=standardized_bytes,
|
data=standardized_bytes,
|
||||||
file_name=f"{stem}_standardized.csv",
|
file_name=f"{stem}_standardized.csv",
|
||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
|
key="fmtstd_dl_standardized",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_b:
|
with dl_b:
|
||||||
if not result.changes.empty:
|
st.download_button(
|
||||||
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
|
"Download changes audit",
|
||||||
st.download_button(
|
data=changes_bytes,
|
||||||
"Download changes audit",
|
file_name=f"{stem}_changes.csv",
|
||||||
data=changes_bytes,
|
mime="text/csv",
|
||||||
file_name=f"{stem}_changes.csv",
|
key="fmtstd_dl_changes",
|
||||||
mime="text/csv",
|
disabled=result.changes.empty,
|
||||||
)
|
help="No changes to audit." if result.changes.empty else None,
|
||||||
|
use_container_width=True,
|
||||||
|
)
|
||||||
with dl_c:
|
with dl_c:
|
||||||
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download config JSON",
|
"Download config JSON",
|
||||||
data=config_bytes,
|
data=config_bytes,
|
||||||
file_name="format_standardize_config.json",
|
file_name="format_standardize_config.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
|
key="fmtstd_dl_config",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-run auto-scroll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# When the user clicks Standardize Formats, the preview + options collapse
|
||||||
|
# but Streamlit by itself doesn't scroll — the Results section is at the
|
||||||
|
# bottom of a tall script so the user has to find it. Inject a tiny
|
||||||
|
# component-html iframe that calls ``scrollIntoView`` on the parent's
|
||||||
|
# Results anchor. Streamlit's main page is same-origin with component
|
||||||
|
# iframes so ``window.parent.document`` access is allowed.
|
||||||
|
#
|
||||||
|
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
|
||||||
|
# unrelated widgets in the Results section don't yank the viewport back
|
||||||
|
# to the top of Results.
|
||||||
|
if st.session_state.pop("_fmtstd_scroll_to_results", False):
|
||||||
|
from streamlit.components.v1 import html as _components_html
|
||||||
|
_components_html(
|
||||||
|
"""
|
||||||
|
<script>
|
||||||
|
const doc = window.parent.document;
|
||||||
|
const target = doc.getElementById('fmtstd-results-anchor');
|
||||||
|
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
|
||||||
|
</script>
|
||||||
|
""",
|
||||||
|
height=0,
|
||||||
|
)
|
||||||
|
|||||||
@@ -95,175 +95,186 @@ except Exception as e:
|
|||||||
)
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
# Collapse the input preview + options once the user has clicked
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
# Handle Missing Values so the Results section below is the primary
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
# visual focus. The user can re-expand to re-inspect the source rows
|
||||||
|
# or tweak strategy and rerun.
|
||||||
|
_has_result = st.session_state.get("missing_result") is not None
|
||||||
|
|
||||||
|
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Initial profile (read-only)
|
# Options (Missingness profile + Strategy)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Wrapped in an outer expander whose default state mirrors the preview
|
||||||
|
# expander above: open before a result exists, folded once the user has
|
||||||
|
# clicked Handle Missing Values. The Missingness profile lives inside
|
||||||
|
# this expander too — after a run the Results section shows a richer
|
||||||
|
# before-vs-after comparison that supersedes the static input profile,
|
||||||
|
# so keeping it tucked away with the controls cleanly pushes Results
|
||||||
|
# to the top of the visible area.
|
||||||
|
|
||||||
st.subheader("Missingness profile")
|
with st.expander("Options", expanded=not _has_result):
|
||||||
|
st.subheader("Missingness profile")
|
||||||
|
|
||||||
initial_profile = profile_missing(df, MissingOptions())
|
initial_profile = profile_missing(df, MissingOptions())
|
||||||
prof_df = initial_profile.to_dataframe()
|
prof_df = initial_profile.to_dataframe()
|
||||||
|
|
||||||
m1, m2, m3, m4 = st.columns(4)
|
m1, m2, m3, m4 = st.columns(4)
|
||||||
m1.metric("Rows", initial_profile.rows_total)
|
m1.metric("Rows", initial_profile.rows_total)
|
||||||
m2.metric("Cells missing", initial_profile.cells_missing)
|
m2.metric("Cells missing", initial_profile.cells_missing)
|
||||||
m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
|
m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
|
||||||
m4.metric("Complete rows", initial_profile.rows_complete)
|
m4.metric("Complete rows", initial_profile.rows_complete)
|
||||||
|
|
||||||
st.dataframe(prof_df, use_container_width=True, hide_index=True)
|
st.dataframe(prof_df, use_container_width=True, hide_index=True)
|
||||||
|
|
||||||
if initial_profile.cells_missing == 0:
|
if initial_profile.cells_missing == 0:
|
||||||
st.success("No missing values or disguised nulls detected. Nothing to handle.")
|
st.success("No missing values or disguised nulls detected. Nothing to handle.")
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
st.subheader("Strategy")
|
||||||
# Options
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.subheader("Strategy")
|
preset_label = st.radio(
|
||||||
|
"Preset",
|
||||||
|
[
|
||||||
|
"detect-only (standardize sentinels to NaN, no fill or drop)",
|
||||||
|
"safe-fill (numeric → median, categorical → mode)",
|
||||||
|
"drop-incomplete (drop any row with missing)",
|
||||||
|
],
|
||||||
|
index=0,
|
||||||
|
help=(
|
||||||
|
"detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
|
||||||
|
"safe-fill: also fill — numeric columns with median, others with mode. "
|
||||||
|
"drop-incomplete: also drop every row that has any missing cell."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
preset_key = preset_label.split(" ", 1)[0]
|
||||||
|
options = MissingOptions.from_preset(preset_key)
|
||||||
|
|
||||||
preset_label = st.radio(
|
with st.expander("Advanced options"):
|
||||||
"Preset",
|
col_a, col_b = st.columns(2)
|
||||||
[
|
|
||||||
"detect-only (standardize sentinels to NaN, no fill or drop)",
|
|
||||||
"safe-fill (numeric → median, categorical → mode)",
|
|
||||||
"drop-incomplete (drop any row with missing)",
|
|
||||||
],
|
|
||||||
index=0,
|
|
||||||
help=(
|
|
||||||
"detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
|
|
||||||
"safe-fill: also fill — numeric columns with median, others with mode. "
|
|
||||||
"drop-incomplete: also drop every row that has any missing cell."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
preset_key = preset_label.split(" ", 1)[0]
|
|
||||||
options = MissingOptions.from_preset(preset_key)
|
|
||||||
|
|
||||||
with st.expander("Advanced options"):
|
with col_a:
|
||||||
col_a, col_b = st.columns(2)
|
st.markdown("**Detection**")
|
||||||
|
options.standardize_sentinels = st.checkbox(
|
||||||
with col_a:
|
"Standardize disguised nulls to NaN",
|
||||||
st.markdown("**Detection**")
|
value=options.standardize_sentinels,
|
||||||
options.standardize_sentinels = st.checkbox(
|
help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
|
||||||
"Standardize disguised nulls to NaN",
|
|
||||||
value=options.standardize_sentinels,
|
|
||||||
help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
|
|
||||||
)
|
|
||||||
sentinels_text = st.text_input(
|
|
||||||
"Sentinel values (comma-separated)",
|
|
||||||
value=", ".join(options.sentinels),
|
|
||||||
disabled=not options.standardize_sentinels,
|
|
||||||
help="Matched case-insensitively after stripping whitespace.",
|
|
||||||
)
|
|
||||||
options.sentinels = [
|
|
||||||
s.strip() for s in sentinels_text.split(",") if s.strip()
|
|
||||||
]
|
|
||||||
|
|
||||||
with col_b:
|
|
||||||
st.markdown("**Strategy override**")
|
|
||||||
strat_options = [
|
|
||||||
"(use preset)",
|
|
||||||
"none", "drop_row", "drop_col", "drop_both",
|
|
||||||
"mean", "median", "mode", "constant",
|
|
||||||
"ffill", "bfill", "interpolate",
|
|
||||||
]
|
|
||||||
strat_choice = st.selectbox(
|
|
||||||
"Global strategy",
|
|
||||||
strat_options,
|
|
||||||
index=0,
|
|
||||||
help=(
|
|
||||||
"drop_row / drop_col use the thresholds below. "
|
|
||||||
"mean / median / interpolate are numeric only — non-numeric "
|
|
||||||
"columns fall back to the categorical strategy."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
if strat_choice != "(use preset)":
|
|
||||||
options.strategy = strat_choice # type: ignore[assignment]
|
|
||||||
|
|
||||||
cat_strat = st.selectbox(
|
|
||||||
"Categorical fallback (for non-numeric columns)",
|
|
||||||
["mode", "constant", "ffill", "bfill", "none"],
|
|
||||||
index=0,
|
|
||||||
)
|
|
||||||
options.categorical_strategy = cat_strat # type: ignore[assignment]
|
|
||||||
|
|
||||||
if options.strategy == "constant" or cat_strat == "constant":
|
|
||||||
fill_val = st.text_input(
|
|
||||||
"Constant fill value",
|
|
||||||
value="",
|
|
||||||
help="Used when strategy = constant. Leave blank to fill with empty string.",
|
|
||||||
)
|
)
|
||||||
options.fill_value = fill_val
|
sentinels_text = st.text_input(
|
||||||
|
"Sentinel values (comma-separated)",
|
||||||
|
value=", ".join(options.sentinels),
|
||||||
|
disabled=not options.standardize_sentinels,
|
||||||
|
help="Matched case-insensitively after stripping whitespace.",
|
||||||
|
)
|
||||||
|
options.sentinels = [
|
||||||
|
s.strip() for s in sentinels_text.split(",") if s.strip()
|
||||||
|
]
|
||||||
|
|
||||||
st.markdown("**Drop thresholds**")
|
with col_b:
|
||||||
col_c, col_d = st.columns(2)
|
st.markdown("**Strategy override**")
|
||||||
with col_c:
|
strat_options = [
|
||||||
options.row_drop_threshold = st.slider(
|
"(use preset)",
|
||||||
"Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
|
"none", "drop_row", "drop_col", "drop_both",
|
||||||
0.0, 1.0, options.row_drop_threshold, 0.05,
|
"mean", "median", "mode", "constant",
|
||||||
)
|
"ffill", "bfill", "interpolate",
|
||||||
with col_d:
|
]
|
||||||
options.col_drop_threshold = st.slider(
|
strat_choice = st.selectbox(
|
||||||
"Column drop threshold (drop columns with ≥ this fraction missing)",
|
"Global strategy",
|
||||||
0.0, 1.0, options.col_drop_threshold, 0.05,
|
strat_options,
|
||||||
)
|
index=0,
|
||||||
|
help=(
|
||||||
st.markdown("**Scope**")
|
"drop_row / drop_col use the thresholds below. "
|
||||||
selected_cols = st.multiselect(
|
"mean / median / interpolate are numeric only — non-numeric "
|
||||||
"Columns to handle (default: all)",
|
"columns fall back to the categorical strategy."
|
||||||
options=list(df.columns),
|
|
||||||
default=list(df.columns),
|
|
||||||
)
|
|
||||||
skip_cols = st.multiselect(
|
|
||||||
"Columns to skip",
|
|
||||||
options=list(df.columns),
|
|
||||||
default=[],
|
|
||||||
)
|
|
||||||
options.columns = selected_cols if selected_cols else None
|
|
||||||
options.skip_columns = list(skip_cols)
|
|
||||||
|
|
||||||
st.markdown("**Per-column strategy overrides** (optional)")
|
|
||||||
st.caption(
|
|
||||||
"Set a different strategy for specific columns. Leave any row blank to "
|
|
||||||
"use the global strategy."
|
|
||||||
)
|
|
||||||
per_col_overrides: dict[str, str] = {}
|
|
||||||
only_missing_cols = [
|
|
||||||
r.column for r in initial_profile.columns if r.has_missing
|
|
||||||
]
|
|
||||||
if only_missing_cols:
|
|
||||||
edit_df = pd.DataFrame({
|
|
||||||
"column": only_missing_cols,
|
|
||||||
"strategy": ["" for _ in only_missing_cols],
|
|
||||||
})
|
|
||||||
edited = st.data_editor(
|
|
||||||
edit_df,
|
|
||||||
use_container_width=True,
|
|
||||||
hide_index=True,
|
|
||||||
column_config={
|
|
||||||
"column": st.column_config.TextColumn("Column", disabled=True),
|
|
||||||
"strategy": st.column_config.SelectboxColumn(
|
|
||||||
"Override",
|
|
||||||
options=[
|
|
||||||
"", "drop_row", "drop_col",
|
|
||||||
"mean", "median", "mode", "constant",
|
|
||||||
"ffill", "bfill", "interpolate",
|
|
||||||
],
|
|
||||||
),
|
),
|
||||||
},
|
)
|
||||||
key="missing_per_col_editor",
|
if strat_choice != "(use preset)":
|
||||||
|
options.strategy = strat_choice # type: ignore[assignment]
|
||||||
|
|
||||||
|
cat_strat = st.selectbox(
|
||||||
|
"Categorical fallback (for non-numeric columns)",
|
||||||
|
["mode", "constant", "ffill", "bfill", "none"],
|
||||||
|
index=0,
|
||||||
|
)
|
||||||
|
options.categorical_strategy = cat_strat # type: ignore[assignment]
|
||||||
|
|
||||||
|
if options.strategy == "constant" or cat_strat == "constant":
|
||||||
|
fill_val = st.text_input(
|
||||||
|
"Constant fill value",
|
||||||
|
value="",
|
||||||
|
help="Used when strategy = constant. Leave blank to fill with empty string.",
|
||||||
|
)
|
||||||
|
options.fill_value = fill_val
|
||||||
|
|
||||||
|
st.markdown("**Drop thresholds**")
|
||||||
|
col_c, col_d = st.columns(2)
|
||||||
|
with col_c:
|
||||||
|
options.row_drop_threshold = st.slider(
|
||||||
|
"Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
|
||||||
|
0.0, 1.0, options.row_drop_threshold, 0.05,
|
||||||
|
)
|
||||||
|
with col_d:
|
||||||
|
options.col_drop_threshold = st.slider(
|
||||||
|
"Column drop threshold (drop columns with ≥ this fraction missing)",
|
||||||
|
0.0, 1.0, options.col_drop_threshold, 0.05,
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Scope**")
|
||||||
|
selected_cols = st.multiselect(
|
||||||
|
"Columns to handle (default: all)",
|
||||||
|
options=list(df.columns),
|
||||||
|
default=list(df.columns),
|
||||||
)
|
)
|
||||||
for _, row in edited.iterrows():
|
skip_cols = st.multiselect(
|
||||||
if row["strategy"]:
|
"Columns to skip",
|
||||||
per_col_overrides[row["column"]] = row["strategy"]
|
options=list(df.columns),
|
||||||
options.column_strategies = per_col_overrides # type: ignore[assignment]
|
default=[],
|
||||||
|
)
|
||||||
|
options.columns = selected_cols if selected_cols else None
|
||||||
|
options.skip_columns = list(skip_cols)
|
||||||
|
|
||||||
|
st.markdown("**Per-column strategy overrides** (optional)")
|
||||||
|
st.caption(
|
||||||
|
"Set a different strategy for specific columns. Leave any row blank to "
|
||||||
|
"use the global strategy."
|
||||||
|
)
|
||||||
|
per_col_overrides: dict[str, str] = {}
|
||||||
|
only_missing_cols = [
|
||||||
|
r.column for r in initial_profile.columns if r.has_missing
|
||||||
|
]
|
||||||
|
if only_missing_cols:
|
||||||
|
edit_df = pd.DataFrame({
|
||||||
|
"column": only_missing_cols,
|
||||||
|
"strategy": ["" for _ in only_missing_cols],
|
||||||
|
})
|
||||||
|
edited = st.data_editor(
|
||||||
|
edit_df,
|
||||||
|
use_container_width=True,
|
||||||
|
hide_index=True,
|
||||||
|
column_config={
|
||||||
|
"column": st.column_config.TextColumn("Column", disabled=True),
|
||||||
|
"strategy": st.column_config.SelectboxColumn(
|
||||||
|
"Override",
|
||||||
|
options=[
|
||||||
|
"", "drop_row", "drop_col",
|
||||||
|
"mean", "median", "mode", "constant",
|
||||||
|
"ffill", "bfill", "interpolate",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key="missing_per_col_editor",
|
||||||
|
)
|
||||||
|
for _, row in edited.iterrows():
|
||||||
|
if row["strategy"]:
|
||||||
|
per_col_overrides[row["column"]] = row["strategy"]
|
||||||
|
options.column_strategies = per_col_overrides # type: ignore[assignment]
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Run
|
# Run
|
||||||
@@ -282,6 +293,14 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True):
|
|||||||
st.session_state["missing_result"] = result
|
st.session_state["missing_result"] = result
|
||||||
st.session_state["missing_input_name"] = uploaded.name
|
st.session_state["missing_input_name"] = uploaded.name
|
||||||
st.session_state["missing_options"] = options.to_dict()
|
st.session_state["missing_options"] = options.to_dict()
|
||||||
|
# One-shot flag picked up on the next pass to scroll the parent
|
||||||
|
# document to the Results anchor (see scroll snippet below).
|
||||||
|
st.session_state["_missing_scroll_to_results"] = True
|
||||||
|
# Force a second rerun so the preview and options expanders see
|
||||||
|
# the new result on the NEXT script pass and collapse themselves.
|
||||||
|
# Without this they stay expanded until the user touches any
|
||||||
|
# other widget.
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
result = st.session_state.get("missing_result")
|
result = st.session_state.get("missing_result")
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -292,6 +311,16 @@ if result is None:
|
|||||||
# Results
|
# Results
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Anchor target for the auto-scroll snippet at the end of this block.
|
||||||
|
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
|
||||||
|
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
|
||||||
|
# anything. Placed before the subheader so the scrolled-to viewport
|
||||||
|
# starts a few pixels above the section heading rather than below it.
|
||||||
|
st.markdown(
|
||||||
|
'<div id="missing-results-anchor" style="height:1px"></div>',
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
st.subheader("Results")
|
st.subheader("Results")
|
||||||
|
|
||||||
m1, m2, m3, m4 = st.columns(4)
|
m1, m2, m3, m4 = st.columns(4)
|
||||||
@@ -334,38 +363,85 @@ st.dataframe(result.handled_df.head(10), use_container_width=True)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Downloads
|
# Downloads
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All three byte buffers are prepared up front (outside the columns) so
|
||||||
|
# each ``st.download_button`` sees stable ``data`` across reruns and an
|
||||||
|
# explicit ``key`` — without those, Streamlit auto-derived widget IDs
|
||||||
|
# can collide for multiple download_buttons in adjacent columns and
|
||||||
|
# only the first one actually fires on click. The empty-changes case
|
||||||
|
# now renders a disabled button (rather than vanishing) so the layout
|
||||||
|
# stays steady and the user understands why nothing's available.
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
stem = Path(st.session_state.get("missing_input_name", "input")).stem
|
stem = Path(st.session_state.get("missing_input_name", "input")).stem
|
||||||
|
|
||||||
|
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
changes_bytes = (
|
||||||
|
result.changes.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
if not result.changes.empty
|
||||||
|
else b""
|
||||||
|
)
|
||||||
|
config_bytes = json.dumps(
|
||||||
|
st.session_state.get("missing_options", {}), indent=2, default=str,
|
||||||
|
).encode("utf-8")
|
||||||
|
|
||||||
dl_a, dl_b, dl_c = st.columns(3)
|
dl_a, dl_b, dl_c = st.columns(3)
|
||||||
with dl_a:
|
with dl_a:
|
||||||
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download handled CSV",
|
"Download handled CSV",
|
||||||
data=handled_bytes,
|
data=handled_bytes,
|
||||||
file_name=f"{stem}_missing.csv",
|
file_name=f"{stem}_missing.csv",
|
||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
|
key="missing_dl_handled",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_b:
|
with dl_b:
|
||||||
if not result.changes.empty:
|
st.download_button(
|
||||||
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
|
"Download changes audit",
|
||||||
st.download_button(
|
data=changes_bytes,
|
||||||
"Download changes audit",
|
file_name=f"{stem}_missing_changes.csv",
|
||||||
data=changes_bytes,
|
mime="text/csv",
|
||||||
file_name=f"{stem}_missing_changes.csv",
|
key="missing_dl_changes",
|
||||||
mime="text/csv",
|
disabled=result.changes.empty,
|
||||||
)
|
help="No changes to audit." if result.changes.empty else None,
|
||||||
|
use_container_width=True,
|
||||||
|
)
|
||||||
with dl_c:
|
with dl_c:
|
||||||
config_bytes = json.dumps(
|
|
||||||
st.session_state.get("missing_options", {}), indent=2, default=str,
|
|
||||||
).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download config JSON",
|
"Download config JSON",
|
||||||
data=config_bytes,
|
data=config_bytes,
|
||||||
file_name="missing_config.json",
|
file_name="missing_config.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
|
key="missing_dl_config",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-run auto-scroll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# When the user clicks Handle Missing Values, the preview + options
|
||||||
|
# collapse but Streamlit by itself doesn't scroll — the Results section
|
||||||
|
# is at the bottom of a tall script so the user has to find it. Inject
|
||||||
|
# a tiny component-html iframe that calls ``scrollIntoView`` on the
|
||||||
|
# parent's Results anchor. Streamlit's main page is same-origin with
|
||||||
|
# component iframes so ``window.parent.document`` access is allowed.
|
||||||
|
#
|
||||||
|
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
|
||||||
|
# unrelated widgets in the Results section don't yank the viewport
|
||||||
|
# back to the top of Results.
|
||||||
|
if st.session_state.pop("_missing_scroll_to_results", False):
|
||||||
|
from streamlit.components.v1 import html as _components_html
|
||||||
|
_components_html(
|
||||||
|
"""
|
||||||
|
<script>
|
||||||
|
const doc = window.parent.document;
|
||||||
|
const target = doc.getElementById('missing-results-anchor');
|
||||||
|
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
|
||||||
|
</script>
|
||||||
|
""",
|
||||||
|
height=0,
|
||||||
|
)
|
||||||
|
|||||||
@@ -88,224 +88,240 @@ except Exception as e:
|
|||||||
)
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
# Collapse the input preview once the user has clicked Apply Column
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
# Mapping so the Results section below is the primary visual focus.
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
# The user can re-expand the expander to re-inspect the source rows.
|
||||||
|
_has_result = st.session_state.get("colmap_result") is not None
|
||||||
|
|
||||||
|
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Schema input
|
# Options (Target schema + Strategy + Mapping)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Wrapped in an outer expander whose default state mirrors the preview
|
||||||
|
# expander above: open before a result exists, folded once the user has
|
||||||
|
# clicked Apply Column Mapping. The Mapping editor is the heart of the
|
||||||
|
# tool, but per the Text Cleaner pattern we still collapse everything
|
||||||
|
# post-run — the user can re-expand to tweak any of the three sections.
|
||||||
|
|
||||||
st.subheader("Target schema")
|
with st.expander("Options", expanded=not _has_result):
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
# Schema input
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
|
||||||
schema_mode = st.radio(
|
st.subheader("Target schema")
|
||||||
"How would you like to define the target schema?",
|
|
||||||
[
|
|
||||||
"Build interactively (start from current columns)",
|
|
||||||
"Upload schema JSON",
|
|
||||||
"Skip (rename / coerce only — no schema)",
|
|
||||||
],
|
|
||||||
index=0,
|
|
||||||
help=(
|
|
||||||
"An interactive build is fastest for one-off cleanup. Upload a JSON "
|
|
||||||
"when you have a fixed contract (a CRM import format, db schema). "
|
|
||||||
"Skip when you only want to rename or coerce specific columns."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
schema: TargetSchema | None = None
|
schema_mode = st.radio(
|
||||||
|
"How would you like to define the target schema?",
|
||||||
if schema_mode.startswith("Upload"):
|
[
|
||||||
schema_file = st.file_uploader(
|
"Build interactively (start from current columns)",
|
||||||
"Schema JSON",
|
"Upload schema JSON",
|
||||||
type=["json"],
|
"Skip (rename / coerce only — no schema)",
|
||||||
key="colmap_schema_upload",
|
],
|
||||||
help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
|
index=0,
|
||||||
|
help=(
|
||||||
|
"An interactive build is fastest for one-off cleanup. Upload a JSON "
|
||||||
|
"when you have a fixed contract (a CRM import format, db schema). "
|
||||||
|
"Skip when you only want to rename or coerce specific columns."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
if schema_file is not None:
|
|
||||||
try:
|
|
||||||
schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
|
|
||||||
st.success(f"Loaded {len(schema.fields)} target field(s).")
|
|
||||||
except Exception as e:
|
|
||||||
from src.core.errors import format_for_user
|
|
||||||
st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
|
|
||||||
|
|
||||||
elif schema_mode.startswith("Build"):
|
schema: TargetSchema | None = None
|
||||||
st.caption(
|
|
||||||
"Edit the table to define your target schema. Add rows for fields the "
|
if schema_mode.startswith("Upload"):
|
||||||
"input doesn't have yet (with a default), or remove rows for columns "
|
schema_file = st.file_uploader(
|
||||||
"you want to drop."
|
"Schema JSON",
|
||||||
|
type=["json"],
|
||||||
|
key="colmap_schema_upload",
|
||||||
|
help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}',
|
||||||
|
)
|
||||||
|
if schema_file is not None:
|
||||||
|
try:
|
||||||
|
schema = TargetSchema.from_dict(json.loads(schema_file.getvalue()))
|
||||||
|
st.success(f"Loaded {len(schema.fields)} target field(s).")
|
||||||
|
except Exception as e:
|
||||||
|
from src.core.errors import format_for_user
|
||||||
|
st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
|
||||||
|
|
||||||
|
elif schema_mode.startswith("Build"):
|
||||||
|
st.caption(
|
||||||
|
"Edit the table to define your target schema. Add rows for fields the "
|
||||||
|
"input doesn't have yet (with a default), or remove rows for columns "
|
||||||
|
"you want to drop."
|
||||||
|
)
|
||||||
|
initial = pd.DataFrame({
|
||||||
|
"name": list(df.columns),
|
||||||
|
"dtype": ["auto"] * len(df.columns),
|
||||||
|
"required": [False] * len(df.columns),
|
||||||
|
"default": [""] * len(df.columns),
|
||||||
|
"aliases": [""] * len(df.columns),
|
||||||
|
})
|
||||||
|
edited = st.data_editor(
|
||||||
|
initial,
|
||||||
|
use_container_width=True,
|
||||||
|
num_rows="dynamic",
|
||||||
|
column_config={
|
||||||
|
"name": st.column_config.TextColumn("Target name"),
|
||||||
|
"dtype": st.column_config.SelectboxColumn(
|
||||||
|
"Type",
|
||||||
|
options=[
|
||||||
|
"auto", "string", "integer", "float",
|
||||||
|
"boolean", "date", "datetime", "category",
|
||||||
|
],
|
||||||
|
),
|
||||||
|
"required": st.column_config.CheckboxColumn("Required"),
|
||||||
|
"default": st.column_config.TextColumn("Default (for added cols)"),
|
||||||
|
"aliases": st.column_config.TextColumn(
|
||||||
|
"Aliases (comma-sep, helps fuzzy-match)",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key="colmap_schema_editor",
|
||||||
|
)
|
||||||
|
fields: list[TargetField] = []
|
||||||
|
for _, row in edited.iterrows():
|
||||||
|
name = str(row.get("name", "")).strip()
|
||||||
|
if not name:
|
||||||
|
continue
|
||||||
|
aliases = [
|
||||||
|
a.strip() for a in str(row.get("aliases", "") or "").split(",")
|
||||||
|
if a.strip()
|
||||||
|
]
|
||||||
|
default_raw = row.get("default")
|
||||||
|
default_val = (
|
||||||
|
default_raw if (default_raw not in (None, "", float("nan")))
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
if isinstance(default_val, float) and pd.isna(default_val):
|
||||||
|
default_val = None
|
||||||
|
except TypeError:
|
||||||
|
pass
|
||||||
|
fields.append(TargetField(
|
||||||
|
name=name,
|
||||||
|
dtype=str(row.get("dtype", "auto")), # type: ignore[arg-type]
|
||||||
|
required=bool(row.get("required", False)),
|
||||||
|
aliases=aliases,
|
||||||
|
default=default_val,
|
||||||
|
))
|
||||||
|
if fields:
|
||||||
|
schema = TargetSchema(fields=fields)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
# Strategy
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Strategy")
|
||||||
|
|
||||||
|
preset_label = st.radio(
|
||||||
|
"Preset",
|
||||||
|
[
|
||||||
|
"rename-only (just rename, leave types alone, keep extras)",
|
||||||
|
"lenient-schema (rename + coerce + reorder, keep extras)",
|
||||||
|
"strict-schema (rename + coerce + reorder, drop extras)",
|
||||||
|
],
|
||||||
|
index=0,
|
||||||
)
|
)
|
||||||
initial = pd.DataFrame({
|
preset_key = preset_label.split(" ", 1)[0]
|
||||||
"name": list(df.columns),
|
options = MapOptions.from_preset(preset_key)
|
||||||
"dtype": ["auto"] * len(df.columns),
|
options.schema = schema
|
||||||
"required": [False] * len(df.columns),
|
|
||||||
"default": [""] * len(df.columns),
|
with st.expander("Advanced options"):
|
||||||
"aliases": [""] * len(df.columns),
|
col_a, col_b = st.columns(2)
|
||||||
})
|
with col_a:
|
||||||
edited = st.data_editor(
|
options.unmapped = st.selectbox( # type: ignore[assignment]
|
||||||
initial,
|
"Unmapped source columns",
|
||||||
use_container_width=True,
|
["keep", "drop", "error"],
|
||||||
num_rows="dynamic",
|
index=["keep", "drop", "error"].index(options.unmapped),
|
||||||
column_config={
|
)
|
||||||
"name": st.column_config.TextColumn("Target name"),
|
options.coerce_types = st.checkbox(
|
||||||
"dtype": st.column_config.SelectboxColumn(
|
"Coerce types per schema", value=options.coerce_types,
|
||||||
"Type",
|
)
|
||||||
options=[
|
options.reorder_to_schema = st.checkbox(
|
||||||
"auto", "string", "integer", "float",
|
"Reorder to schema order", value=options.reorder_to_schema,
|
||||||
"boolean", "date", "datetime", "category",
|
)
|
||||||
],
|
with col_b:
|
||||||
),
|
options.auto_infer = st.checkbox(
|
||||||
"required": st.column_config.CheckboxColumn("Required"),
|
"Auto-infer mapping (fuzzy match)", value=options.auto_infer,
|
||||||
"default": st.column_config.TextColumn("Default (for added cols)"),
|
)
|
||||||
"aliases": st.column_config.TextColumn(
|
options.fuzzy_threshold = st.slider(
|
||||||
"Aliases (comma-sep, helps fuzzy-match)",
|
"Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
|
||||||
),
|
)
|
||||||
},
|
options.enforce_required = st.checkbox(
|
||||||
key="colmap_schema_editor",
|
"Enforce required fields", value=options.enforce_required,
|
||||||
)
|
)
|
||||||
fields: list[TargetField] = []
|
|
||||||
for _, row in edited.iterrows():
|
# -----------------------------------------------------------------------
|
||||||
name = str(row.get("name", "")).strip()
|
# Mapping editor — show inferred and let user override
|
||||||
if not name:
|
# -----------------------------------------------------------------------
|
||||||
continue
|
|
||||||
aliases = [
|
st.subheader("Mapping")
|
||||||
a.strip() for a in str(row.get("aliases", "") or "").split(",")
|
|
||||||
if a.strip()
|
if schema is None:
|
||||||
]
|
st.caption(
|
||||||
default_raw = row.get("default")
|
"No schema — define explicit renames below (left blank means keep "
|
||||||
default_val = (
|
"the source name)."
|
||||||
default_raw if (default_raw not in (None, "", float("nan")))
|
|
||||||
else None
|
|
||||||
)
|
)
|
||||||
try:
|
rename_initial = pd.DataFrame({
|
||||||
if isinstance(default_val, float) and pd.isna(default_val):
|
"source": list(df.columns),
|
||||||
default_val = None
|
"target": list(df.columns),
|
||||||
except TypeError:
|
})
|
||||||
pass
|
rename_edited = st.data_editor(
|
||||||
fields.append(TargetField(
|
rename_initial,
|
||||||
name=name,
|
use_container_width=True,
|
||||||
dtype=str(row.get("dtype", "auto")), # type: ignore[arg-type]
|
column_config={
|
||||||
required=bool(row.get("required", False)),
|
"source": st.column_config.TextColumn("Source", disabled=True),
|
||||||
aliases=aliases,
|
"target": st.column_config.TextColumn("Target"),
|
||||||
default=default_val,
|
},
|
||||||
))
|
hide_index=True,
|
||||||
if fields:
|
key="colmap_rename_only_editor",
|
||||||
schema = TargetSchema(fields=fields)
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Strategy
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.subheader("Strategy")
|
|
||||||
|
|
||||||
preset_label = st.radio(
|
|
||||||
"Preset",
|
|
||||||
[
|
|
||||||
"rename-only (just rename, leave types alone, keep extras)",
|
|
||||||
"lenient-schema (rename + coerce + reorder, keep extras)",
|
|
||||||
"strict-schema (rename + coerce + reorder, drop extras)",
|
|
||||||
],
|
|
||||||
index=0,
|
|
||||||
)
|
|
||||||
preset_key = preset_label.split(" ", 1)[0]
|
|
||||||
options = MapOptions.from_preset(preset_key)
|
|
||||||
options.schema = schema
|
|
||||||
|
|
||||||
with st.expander("Advanced options"):
|
|
||||||
col_a, col_b = st.columns(2)
|
|
||||||
with col_a:
|
|
||||||
options.unmapped = st.selectbox( # type: ignore[assignment]
|
|
||||||
"Unmapped source columns",
|
|
||||||
["keep", "drop", "error"],
|
|
||||||
index=["keep", "drop", "error"].index(options.unmapped),
|
|
||||||
)
|
)
|
||||||
options.coerce_types = st.checkbox(
|
explicit_mapping: dict[str, str] = {}
|
||||||
"Coerce types per schema", value=options.coerce_types,
|
for _, row in rename_edited.iterrows():
|
||||||
|
src = str(row["source"])
|
||||||
|
tgt = str(row["target"]).strip()
|
||||||
|
if tgt and tgt != src:
|
||||||
|
explicit_mapping[src] = tgt
|
||||||
|
options.mapping = explicit_mapping
|
||||||
|
else:
|
||||||
|
inferred = (
|
||||||
|
infer_mapping(df, schema, threshold=options.fuzzy_threshold)
|
||||||
|
if options.auto_infer else {}
|
||||||
)
|
)
|
||||||
options.reorder_to_schema = st.checkbox(
|
target_options = ["(unmapped)"] + schema.field_names()
|
||||||
"Reorder to schema order", value=options.reorder_to_schema,
|
map_initial = pd.DataFrame({
|
||||||
|
"source": list(df.columns),
|
||||||
|
"target": [inferred.get(c, "(unmapped)") for c in df.columns],
|
||||||
|
"auto": [c in inferred for c in df.columns],
|
||||||
|
})
|
||||||
|
map_edited = st.data_editor(
|
||||||
|
map_initial,
|
||||||
|
use_container_width=True,
|
||||||
|
column_config={
|
||||||
|
"source": st.column_config.TextColumn("Source", disabled=True),
|
||||||
|
"target": st.column_config.SelectboxColumn(
|
||||||
|
"Target", options=target_options,
|
||||||
|
),
|
||||||
|
"auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
|
||||||
|
},
|
||||||
|
hide_index=True,
|
||||||
|
key="colmap_schema_mapping_editor",
|
||||||
)
|
)
|
||||||
with col_b:
|
explicit_mapping = {}
|
||||||
options.auto_infer = st.checkbox(
|
for _, row in map_edited.iterrows():
|
||||||
"Auto-infer mapping (fuzzy match)", value=options.auto_infer,
|
src = str(row["source"])
|
||||||
)
|
tgt = str(row["target"])
|
||||||
options.fuzzy_threshold = st.slider(
|
if tgt and tgt != "(unmapped)":
|
||||||
"Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05,
|
explicit_mapping[src] = tgt
|
||||||
)
|
options.mapping = explicit_mapping
|
||||||
options.enforce_required = st.checkbox(
|
# Disable auto-infer for the actual run since the editor already shows
|
||||||
"Enforce required fields", value=options.enforce_required,
|
# the user's resolved choices (they can manually re-select to add).
|
||||||
)
|
options.auto_infer = False
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Mapping editor — show inferred and let user override
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.subheader("Mapping")
|
|
||||||
|
|
||||||
if schema is None:
|
|
||||||
st.caption(
|
|
||||||
"No schema — define explicit renames below (left blank means keep "
|
|
||||||
"the source name)."
|
|
||||||
)
|
|
||||||
rename_initial = pd.DataFrame({
|
|
||||||
"source": list(df.columns),
|
|
||||||
"target": list(df.columns),
|
|
||||||
})
|
|
||||||
rename_edited = st.data_editor(
|
|
||||||
rename_initial,
|
|
||||||
use_container_width=True,
|
|
||||||
column_config={
|
|
||||||
"source": st.column_config.TextColumn("Source", disabled=True),
|
|
||||||
"target": st.column_config.TextColumn("Target"),
|
|
||||||
},
|
|
||||||
hide_index=True,
|
|
||||||
key="colmap_rename_only_editor",
|
|
||||||
)
|
|
||||||
explicit_mapping: dict[str, str] = {}
|
|
||||||
for _, row in rename_edited.iterrows():
|
|
||||||
src = str(row["source"])
|
|
||||||
tgt = str(row["target"]).strip()
|
|
||||||
if tgt and tgt != src:
|
|
||||||
explicit_mapping[src] = tgt
|
|
||||||
options.mapping = explicit_mapping
|
|
||||||
else:
|
|
||||||
inferred = (
|
|
||||||
infer_mapping(df, schema, threshold=options.fuzzy_threshold)
|
|
||||||
if options.auto_infer else {}
|
|
||||||
)
|
|
||||||
target_options = ["(unmapped)"] + schema.field_names()
|
|
||||||
map_initial = pd.DataFrame({
|
|
||||||
"source": list(df.columns),
|
|
||||||
"target": [inferred.get(c, "(unmapped)") for c in df.columns],
|
|
||||||
"auto": [c in inferred for c in df.columns],
|
|
||||||
})
|
|
||||||
map_edited = st.data_editor(
|
|
||||||
map_initial,
|
|
||||||
use_container_width=True,
|
|
||||||
column_config={
|
|
||||||
"source": st.column_config.TextColumn("Source", disabled=True),
|
|
||||||
"target": st.column_config.SelectboxColumn(
|
|
||||||
"Target", options=target_options,
|
|
||||||
),
|
|
||||||
"auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True),
|
|
||||||
},
|
|
||||||
hide_index=True,
|
|
||||||
key="colmap_schema_mapping_editor",
|
|
||||||
)
|
|
||||||
explicit_mapping = {}
|
|
||||||
for _, row in map_edited.iterrows():
|
|
||||||
src = str(row["source"])
|
|
||||||
tgt = str(row["target"])
|
|
||||||
if tgt and tgt != "(unmapped)":
|
|
||||||
explicit_mapping[src] = tgt
|
|
||||||
options.mapping = explicit_mapping
|
|
||||||
# Disable auto-infer for the actual run since the editor already shows
|
|
||||||
# the user's resolved choices (they can manually re-select to add).
|
|
||||||
options.auto_infer = False
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Run
|
# Run
|
||||||
@@ -324,6 +340,12 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True):
|
|||||||
st.session_state["colmap_result"] = result
|
st.session_state["colmap_result"] = result
|
||||||
st.session_state["colmap_input_name"] = uploaded.name
|
st.session_state["colmap_input_name"] = uploaded.name
|
||||||
st.session_state["colmap_options"] = options.to_dict()
|
st.session_state["colmap_options"] = options.to_dict()
|
||||||
|
# One-shot flag picked up on the next pass to scroll the parent
|
||||||
|
# document to the Results anchor (see scroll snippet below).
|
||||||
|
st.session_state["_colmap_scroll_to_results"] = True
|
||||||
|
# Force a second rerun so the preview and options expanders see
|
||||||
|
# the new result on the NEXT script pass and collapse themselves.
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
result = st.session_state.get("colmap_result")
|
result = st.session_state.get("colmap_result")
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -334,6 +356,16 @@ if result is None:
|
|||||||
# Results
|
# Results
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Anchor target for the auto-scroll snippet at the end of this block.
|
||||||
|
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
|
||||||
|
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
|
||||||
|
# anything. Placed before the subheader so the scrolled-to viewport
|
||||||
|
# starts a few pixels above the section heading rather than below it.
|
||||||
|
st.markdown(
|
||||||
|
'<div id="colmap-results-anchor" style="height:1px"></div>',
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
st.subheader("Results")
|
st.subheader("Results")
|
||||||
|
|
||||||
m1, m2, m3, m4 = st.columns(4)
|
m1, m2, m3, m4 = st.columns(4)
|
||||||
@@ -371,46 +403,90 @@ st.dataframe(result.mapped_df.head(10), use_container_width=True)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Downloads
|
# Downloads
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All three byte buffers are prepared up front (outside the columns) so
|
||||||
|
# each ``st.download_button`` sees stable ``data`` across reruns and an
|
||||||
|
# explicit ``key`` — without those, Streamlit auto-derived widget IDs
|
||||||
|
# can collide for multiple download_buttons in adjacent columns and
|
||||||
|
# only the first one actually fires on click.
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
stem = Path(st.session_state.get("colmap_input_name", "input")).stem
|
stem = Path(st.session_state.get("colmap_input_name", "input")).stem
|
||||||
|
|
||||||
|
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
audit_bytes = json.dumps({
|
||||||
|
"mapping": result.mapping,
|
||||||
|
"inferred_pairs": result.inferred_pairs,
|
||||||
|
"columns_renamed": result.columns_renamed,
|
||||||
|
"columns_dropped": result.columns_dropped,
|
||||||
|
"columns_added": result.columns_added,
|
||||||
|
"coercion_failures": result.coercion_failures,
|
||||||
|
"unmapped_kept": result.unmapped_kept,
|
||||||
|
"missing_required_targets": result.missing_required_targets,
|
||||||
|
}, indent=2, default=str).encode("utf-8")
|
||||||
|
config_bytes = json.dumps(
|
||||||
|
st.session_state.get("colmap_options", {}), indent=2, default=str,
|
||||||
|
).encode("utf-8")
|
||||||
|
|
||||||
|
_no_mapping = not result.mapping
|
||||||
|
|
||||||
dl_a, dl_b, dl_c = st.columns(3)
|
dl_a, dl_b, dl_c = st.columns(3)
|
||||||
with dl_a:
|
with dl_a:
|
||||||
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download mapped CSV",
|
"Download mapped CSV",
|
||||||
data=mapped_bytes,
|
data=mapped_bytes,
|
||||||
file_name=f"{stem}_mapped.csv",
|
file_name=f"{stem}_mapped.csv",
|
||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
|
key="colmap_dl_mapped",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_b:
|
with dl_b:
|
||||||
audit_bytes = json.dumps({
|
|
||||||
"mapping": result.mapping,
|
|
||||||
"inferred_pairs": result.inferred_pairs,
|
|
||||||
"columns_renamed": result.columns_renamed,
|
|
||||||
"columns_dropped": result.columns_dropped,
|
|
||||||
"columns_added": result.columns_added,
|
|
||||||
"coercion_failures": result.coercion_failures,
|
|
||||||
"unmapped_kept": result.unmapped_kept,
|
|
||||||
"missing_required_targets": result.missing_required_targets,
|
|
||||||
}, indent=2, default=str).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download mapping audit",
|
"Download mapping audit",
|
||||||
data=audit_bytes,
|
data=audit_bytes,
|
||||||
file_name=f"{stem}_mapping.json",
|
file_name=f"{stem}_mapping.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
|
key="colmap_dl_audit",
|
||||||
|
disabled=_no_mapping,
|
||||||
|
help="No mapping was applied." if _no_mapping else None,
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_c:
|
with dl_c:
|
||||||
config_bytes = json.dumps(
|
|
||||||
st.session_state.get("colmap_options", {}), indent=2, default=str,
|
|
||||||
).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download config JSON",
|
"Download config JSON",
|
||||||
data=config_bytes,
|
data=config_bytes,
|
||||||
file_name="column_map_config.json",
|
file_name="column_map_config.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
|
key="colmap_dl_config",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-run auto-scroll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# When the user clicks Apply Column Mapping, the preview + options
|
||||||
|
# collapse but Streamlit by itself doesn't scroll — the Results section
|
||||||
|
# is at the bottom of a tall script so the user has to find it. Inject
|
||||||
|
# a tiny component-html iframe that calls ``scrollIntoView`` on the
|
||||||
|
# parent's Results anchor. Streamlit's main page is same-origin with
|
||||||
|
# component iframes so ``window.parent.document`` access is allowed.
|
||||||
|
#
|
||||||
|
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
|
||||||
|
# unrelated widgets in the Results section don't yank the viewport back
|
||||||
|
# to the top of Results.
|
||||||
|
if st.session_state.pop("_colmap_scroll_to_results", False):
|
||||||
|
from streamlit.components.v1 import html as _components_html
|
||||||
|
_components_html(
|
||||||
|
"""
|
||||||
|
<script>
|
||||||
|
const doc = window.parent.document;
|
||||||
|
const target = doc.getElementById('colmap-results-anchor');
|
||||||
|
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
|
||||||
|
</script>
|
||||||
|
""",
|
||||||
|
height=0,
|
||||||
|
)
|
||||||
|
|||||||
@@ -89,139 +89,149 @@ except Exception as e:
|
|||||||
)
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
# Collapse the input preview and pipeline editor once the user has clicked
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
# Run Pipeline so the Results section below is the primary visual focus.
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
# The user can re-expand either expander to re-inspect or adjust.
|
||||||
|
_has_result = st.session_state.get("pipeline_result") is not None
|
||||||
|
|
||||||
|
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pipeline builder
|
# Pipeline builder
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Wrapped in an outer expander whose default state mirrors the preview
|
||||||
|
# expander above: open before a result exists, folded once the user has
|
||||||
|
# clicked Run Pipeline. The pipeline editor is this page's "Options"
|
||||||
|
# section — structurally analogous to Text Cleaner's options block.
|
||||||
|
|
||||||
st.subheader("Pipeline")
|
with st.expander("Options", expanded=not _has_result):
|
||||||
|
mode = st.radio(
|
||||||
mode = st.radio(
|
"How would you like to define the pipeline?",
|
||||||
"How would you like to define the pipeline?",
|
[
|
||||||
[
|
"Use the recommended default (text-clean → format → missing → dedup)",
|
||||||
"Use the recommended default (text-clean → format → missing → dedup)",
|
"Build interactively",
|
||||||
"Build interactively",
|
"Upload a saved pipeline JSON",
|
||||||
"Upload a saved pipeline JSON",
|
],
|
||||||
],
|
index=0,
|
||||||
index=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
if "pipeline_rows" not in st.session_state:
|
|
||||||
default = recommended_pipeline()
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
|
|
||||||
if mode.startswith("Use the recommended"):
|
|
||||||
default = recommended_pipeline()
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
elif mode.startswith("Upload"):
|
|
||||||
pipeline_file = st.file_uploader(
|
|
||||||
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
|
||||||
)
|
)
|
||||||
if pipeline_file is not None:
|
|
||||||
|
if "pipeline_rows" not in st.session_state:
|
||||||
|
default = recommended_pipeline()
|
||||||
|
st.session_state["pipeline_rows"] = pd.DataFrame([
|
||||||
|
{
|
||||||
|
"tool": s.tool, "enabled": s.enabled,
|
||||||
|
"options_json": json.dumps(s.options),
|
||||||
|
}
|
||||||
|
for s in default.steps
|
||||||
|
])
|
||||||
|
|
||||||
|
if mode.startswith("Use the recommended"):
|
||||||
|
default = recommended_pipeline()
|
||||||
|
st.session_state["pipeline_rows"] = pd.DataFrame([
|
||||||
|
{
|
||||||
|
"tool": s.tool, "enabled": s.enabled,
|
||||||
|
"options_json": json.dumps(s.options),
|
||||||
|
}
|
||||||
|
for s in default.steps
|
||||||
|
])
|
||||||
|
elif mode.startswith("Upload"):
|
||||||
|
pipeline_file = st.file_uploader(
|
||||||
|
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
||||||
|
)
|
||||||
|
if pipeline_file is not None:
|
||||||
|
try:
|
||||||
|
data = json.loads(pipeline_file.getvalue())
|
||||||
|
uploaded_pipe = Pipeline.from_dict(data)
|
||||||
|
st.session_state["pipeline_rows"] = pd.DataFrame([
|
||||||
|
{
|
||||||
|
"tool": s.tool, "enabled": s.enabled,
|
||||||
|
"options_json": json.dumps(s.options),
|
||||||
|
}
|
||||||
|
for s in uploaded_pipe.steps
|
||||||
|
])
|
||||||
|
st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
|
||||||
|
except Exception as e:
|
||||||
|
from src.core.errors import format_for_user
|
||||||
|
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
||||||
|
|
||||||
|
st.caption(
|
||||||
|
"Edit the table to add, remove, reorder (drag the row index), enable, "
|
||||||
|
"or configure each step. Tool order is recommended, not enforced — "
|
||||||
|
"violations surface as warnings below the table."
|
||||||
|
)
|
||||||
|
edited = st.data_editor(
|
||||||
|
st.session_state["pipeline_rows"],
|
||||||
|
use_container_width=True,
|
||||||
|
num_rows="dynamic",
|
||||||
|
column_config={
|
||||||
|
"tool": st.column_config.SelectboxColumn(
|
||||||
|
"Tool", options=TOOL_NAMES, required=True,
|
||||||
|
),
|
||||||
|
"enabled": st.column_config.CheckboxColumn("Enabled"),
|
||||||
|
"options_json": st.column_config.TextColumn(
|
||||||
|
"Options (JSON)",
|
||||||
|
help='e.g. {"column_types": {"phone": "phone"}}',
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key="pipeline_editor",
|
||||||
|
)
|
||||||
|
st.session_state["pipeline_rows"] = edited
|
||||||
|
|
||||||
|
# Build a Pipeline object from the editor state.
|
||||||
|
steps_list: list[Step] = []
|
||||||
|
parse_errors: list[str] = []
|
||||||
|
for i, row in edited.iterrows():
|
||||||
|
tool = row.get("tool")
|
||||||
|
if not tool or pd.isna(tool):
|
||||||
|
continue
|
||||||
|
raw_opts = row.get("options_json") or "{}"
|
||||||
|
if pd.isna(raw_opts):
|
||||||
|
raw_opts = "{}"
|
||||||
try:
|
try:
|
||||||
data = json.loads(pipeline_file.getvalue())
|
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
|
||||||
uploaded_pipe = Pipeline.from_dict(data)
|
if not isinstance(opts, dict):
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
raise ValueError("options must be a JSON object")
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in uploaded_pipe.steps
|
|
||||||
])
|
|
||||||
st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
from src.core.errors import format_for_user
|
parse_errors.append(f"Step {i + 1}: {e}")
|
||||||
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
continue
|
||||||
|
try:
|
||||||
|
steps_list.append(Step(
|
||||||
|
tool=str(tool),
|
||||||
|
options=opts,
|
||||||
|
enabled=bool(row.get("enabled", True)),
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
parse_errors.append(f"Step {i + 1}: {e}")
|
||||||
|
|
||||||
st.caption(
|
if parse_errors:
|
||||||
"Edit the table to add, remove, reorder (drag the row index), enable, "
|
for err in parse_errors:
|
||||||
"or configure each step. Tool order is recommended, not enforced — "
|
st.error(err)
|
||||||
"violations surface as warnings below the table."
|
|
||||||
)
|
|
||||||
edited = st.data_editor(
|
|
||||||
st.session_state["pipeline_rows"],
|
|
||||||
use_container_width=True,
|
|
||||||
num_rows="dynamic",
|
|
||||||
column_config={
|
|
||||||
"tool": st.column_config.SelectboxColumn(
|
|
||||||
"Tool", options=TOOL_NAMES, required=True,
|
|
||||||
),
|
|
||||||
"enabled": st.column_config.CheckboxColumn("Enabled"),
|
|
||||||
"options_json": st.column_config.TextColumn(
|
|
||||||
"Options (JSON)",
|
|
||||||
help='e.g. {"column_types": {"phone": "phone"}}',
|
|
||||||
),
|
|
||||||
},
|
|
||||||
key="pipeline_editor",
|
|
||||||
)
|
|
||||||
st.session_state["pipeline_rows"] = edited
|
|
||||||
|
|
||||||
# Build a Pipeline object from the editor state.
|
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
||||||
steps_list: list[Step] = []
|
|
||||||
parse_errors: list[str] = []
|
|
||||||
for i, row in edited.iterrows():
|
|
||||||
tool = row.get("tool")
|
|
||||||
if not tool or pd.isna(tool):
|
|
||||||
continue
|
|
||||||
raw_opts = row.get("options_json") or "{}"
|
|
||||||
if pd.isna(raw_opts):
|
|
||||||
raw_opts = "{}"
|
|
||||||
try:
|
|
||||||
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
|
|
||||||
if not isinstance(opts, dict):
|
|
||||||
raise ValueError("options must be a JSON object")
|
|
||||||
except Exception as e:
|
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
steps_list.append(Step(
|
|
||||||
tool=str(tool),
|
|
||||||
options=opts,
|
|
||||||
enabled=bool(row.get("enabled", True)),
|
|
||||||
))
|
|
||||||
except Exception as e:
|
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
|
||||||
|
|
||||||
if parse_errors:
|
if current_pipeline is not None:
|
||||||
for err in parse_errors:
|
warnings = validate_pipeline(current_pipeline)
|
||||||
st.error(err)
|
if warnings:
|
||||||
|
st.warning(
|
||||||
|
"Pipeline is out of recommended order:\n\n"
|
||||||
|
+ "\n".join(f"- {w}" for w in warnings)
|
||||||
|
+ "\n\nThe pipeline will still run — these are recommendations only."
|
||||||
|
)
|
||||||
|
|
||||||
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
with st.expander("Recommended tool order — why each step belongs where it does"):
|
||||||
|
st.markdown(
|
||||||
if current_pipeline is not None:
|
"\n".join(
|
||||||
warnings = validate_pipeline(current_pipeline)
|
f"- **{e}** before **{l}** — {why}"
|
||||||
if warnings:
|
for e, l, why in SOFT_DEPENDENCIES
|
||||||
st.warning(
|
)
|
||||||
"Pipeline is out of recommended order:\n\n"
|
|
||||||
+ "\n".join(f"- {w}" for w in warnings)
|
|
||||||
+ "\n\nThe pipeline will still run — these are recommendations only."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
with st.expander("Recommended tool order — why each step belongs where it does"):
|
|
||||||
st.markdown(
|
|
||||||
"\n".join(
|
|
||||||
f"- **{e}** before **{l}** — {why}"
|
|
||||||
for e, l, why in SOFT_DEPENDENCIES
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -274,6 +284,14 @@ if st.button(
|
|||||||
progress.progress(1.0, text="Done")
|
progress.progress(1.0, text="Done")
|
||||||
st.session_state["pipeline_result"] = result
|
st.session_state["pipeline_result"] = result
|
||||||
st.session_state["pipeline_input_name"] = uploaded.name
|
st.session_state["pipeline_input_name"] = uploaded.name
|
||||||
|
# One-shot flag picked up on the next pass to scroll the parent
|
||||||
|
# document to the Results anchor (see scroll snippet at end of file).
|
||||||
|
st.session_state["_pipeline_scroll_to_results"] = True
|
||||||
|
# Force a second rerun so the preview and options expanders see
|
||||||
|
# the new result on the NEXT script pass and collapse themselves.
|
||||||
|
# Without this they stay expanded until the user touches any
|
||||||
|
# other widget.
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
result = st.session_state.get("pipeline_result")
|
result = st.session_state.get("pipeline_result")
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -287,6 +305,16 @@ if result is None:
|
|||||||
# Results
|
# Results
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Anchor target for the auto-scroll snippet at the end of this block.
|
||||||
|
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
|
||||||
|
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
|
||||||
|
# anything. Placed before the subheader so the scrolled-to viewport
|
||||||
|
# starts a few pixels above the section heading rather than below it.
|
||||||
|
st.markdown(
|
||||||
|
'<div id="pipeline-results-anchor" style="height:1px"></div>',
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
|
||||||
st.subheader("Results")
|
st.subheader("Results")
|
||||||
|
|
||||||
m1, m2, m3, m4 = st.columns(4)
|
m1, m2, m3, m4 = st.columns(4)
|
||||||
@@ -318,56 +346,105 @@ st.dataframe(result.final_df.head(10), use_container_width=True)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Downloads
|
# Downloads
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All three byte buffers are prepared up front (outside the columns) so
|
||||||
|
# each ``st.download_button`` sees stable ``data`` across reruns and an
|
||||||
|
# explicit ``key`` — without those, Streamlit auto-derived widget IDs
|
||||||
|
# can collide for multiple download_buttons in adjacent columns and
|
||||||
|
# only the first one actually fires on click. The pipeline-JSON button
|
||||||
|
# now renders unconditionally (disabled when no pipeline is defined)
|
||||||
|
# so the layout stays steady.
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
stem = Path(st.session_state.get("pipeline_input_name", "input")).stem
|
stem = Path(st.session_state.get("pipeline_input_name", "input")).stem
|
||||||
|
|
||||||
|
cleaned_bytes = result.final_df.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
pipeline_bytes = json.dumps(
|
||||||
|
current_pipeline.to_dict() if current_pipeline else {"steps": []},
|
||||||
|
indent=2, default=str,
|
||||||
|
).encode("utf-8")
|
||||||
|
audit_bytes = json.dumps({
|
||||||
|
"warnings": result.warnings,
|
||||||
|
"initial_rows": result.initial_rows,
|
||||||
|
"final_rows": result.final_rows,
|
||||||
|
"total_elapsed_seconds": result.total_elapsed,
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"tool": sr.step.tool,
|
||||||
|
"name": sr.step.display_name(),
|
||||||
|
"enabled": sr.step.enabled,
|
||||||
|
"skipped": sr.skipped,
|
||||||
|
"elapsed_seconds": sr.elapsed_seconds,
|
||||||
|
"summary": sr.summary,
|
||||||
|
"error": sr.error,
|
||||||
|
}
|
||||||
|
for sr in result.step_results
|
||||||
|
],
|
||||||
|
}, indent=2, default=str).encode("utf-8")
|
||||||
|
|
||||||
|
_pipeline_empty = current_pipeline is None or not current_pipeline.steps
|
||||||
|
|
||||||
dl_a, dl_b, dl_c = st.columns(3)
|
dl_a, dl_b, dl_c = st.columns(3)
|
||||||
with dl_a:
|
with dl_a:
|
||||||
bytes_csv = result.final_df.to_csv(index=False).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download cleaned CSV",
|
"Download cleaned CSV",
|
||||||
data=bytes_csv,
|
data=cleaned_bytes,
|
||||||
file_name=f"{stem}_pipeline.csv",
|
file_name=f"{stem}_pipeline.csv",
|
||||||
mime="text/csv",
|
mime="text/csv",
|
||||||
|
key="pipeline_dl_cleaned",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_b:
|
with dl_b:
|
||||||
pipeline_bytes = json.dumps(
|
|
||||||
current_pipeline.to_dict() if current_pipeline else {"steps": []},
|
|
||||||
indent=2, default=str,
|
|
||||||
).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download pipeline JSON",
|
"Download pipeline JSON",
|
||||||
data=pipeline_bytes,
|
data=pipeline_bytes,
|
||||||
file_name="pipeline.json",
|
file_name="pipeline.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
|
key="pipeline_dl_pipeline",
|
||||||
|
disabled=_pipeline_empty,
|
||||||
|
help=(
|
||||||
|
"No pipeline defined."
|
||||||
|
if _pipeline_empty
|
||||||
|
else "Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file."
|
||||||
|
),
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
with dl_c:
|
with dl_c:
|
||||||
audit_bytes = json.dumps({
|
|
||||||
"warnings": result.warnings,
|
|
||||||
"initial_rows": result.initial_rows,
|
|
||||||
"final_rows": result.final_rows,
|
|
||||||
"total_elapsed_seconds": result.total_elapsed,
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"tool": sr.step.tool,
|
|
||||||
"name": sr.step.display_name(),
|
|
||||||
"enabled": sr.step.enabled,
|
|
||||||
"skipped": sr.skipped,
|
|
||||||
"elapsed_seconds": sr.elapsed_seconds,
|
|
||||||
"summary": sr.summary,
|
|
||||||
"error": sr.error,
|
|
||||||
}
|
|
||||||
for sr in result.step_results
|
|
||||||
],
|
|
||||||
}, indent=2, default=str).encode("utf-8")
|
|
||||||
st.download_button(
|
st.download_button(
|
||||||
"Download run audit",
|
"Download run audit",
|
||||||
data=audit_bytes,
|
data=audit_bytes,
|
||||||
file_name=f"{stem}_pipeline_audit.json",
|
file_name=f"{stem}_pipeline_audit.json",
|
||||||
mime="application/json",
|
mime="application/json",
|
||||||
|
key="pipeline_dl_audit",
|
||||||
|
use_container_width=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-run auto-scroll
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# When the user clicks Run Pipeline, the preview + options collapse but
|
||||||
|
# Streamlit by itself doesn't scroll — the Results section is at the
|
||||||
|
# bottom of a tall script so the user has to find it. Inject a tiny
|
||||||
|
# component-html iframe that calls ``scrollIntoView`` on the parent's
|
||||||
|
# Results anchor. Streamlit's main page is same-origin with component
|
||||||
|
# iframes so ``window.parent.document`` access is allowed.
|
||||||
|
#
|
||||||
|
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
|
||||||
|
# unrelated widgets in the Results section don't yank the viewport
|
||||||
|
# back to the top of Results.
|
||||||
|
if st.session_state.pop("_pipeline_scroll_to_results", False):
|
||||||
|
from streamlit.components.v1 import html as _components_html
|
||||||
|
_components_html(
|
||||||
|
"""
|
||||||
|
<script>
|
||||||
|
const doc = window.parent.document;
|
||||||
|
const target = doc.getElementById('pipeline-results-anchor');
|
||||||
|
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
|
||||||
|
</script>
|
||||||
|
""",
|
||||||
|
height=0,
|
||||||
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user