feat(tools): unified post-run UX across all Ready tool pages

Apply the Clean Text page's post-run UX pattern to every other Ready
tool page (Find Duplicates, Standardize Formats, Fix Missing Values,
Map Columns, Automated Workflows) for consistency and ease of use.

Per page:

1. Preview wrapped in ``st.expander(f"Preview: {filename}",
   expanded=not _has_result)``. Open before a result exists, folded
   afterwards.

2. Options / configuration controls wrapped in
   ``st.expander("Options", expanded=not _has_result)``. Inner
   sub-expanders preserved (Streamlit 1.36+ supports nesting).

3. After the primary action stashes the result, set a one-shot
   ``_<tool>_scroll_to_results`` flag in session state and call
   ``st.rerun()`` so the preview + options expanders see the new
   state on the next pass and collapse themselves.

4. ``<div id="<tool>-results-anchor" style="height:1px">`` emitted via
   ``st.markdown(..., unsafe_allow_html=True)`` immediately before the
   Results subheader, as the scroll target for step 5.

5. End-of-page: pop the scroll flag and inject a tiny
   ``streamlit.components.v1.html`` iframe whose ``<script>`` calls
   ``scrollIntoView`` on the parent document's anchor. One-shot, so
   unrelated reruns (toggling Show-hidden, etc.) don't yank the
   viewport.

6. Download buttons hardened against the multi-button Streamlit
   footgun (auto-derived widget IDs can collide when several
   ``st.download_button`` calls sit in adjacent columns, so only the
   first one fires on click): byte buffers pre-computed outside the
   column scopes, explicit unique ``key="<tool>_dl_<purpose>"`` per
   button, and ``use_container_width=True``. Previously-conditional
   buttons now always render, with ``disabled=True`` plus a help
   tooltip when the underlying data is empty, so layout stays steady.

Per-page judgment calls (already noted in agent reports):

- Find Duplicates: sheet picker and delimiter selector kept OUTSIDE
  expanders (the user still needs to see them when a file fails to
  parse).
- Fix Missing Values: missingness profile wrapped INSIDE the Options
  expander together with Strategy — the Results section already
  shows a before/after missingness comparison that supersedes the
  static input profile.
- Map Columns: all three subsections (Target schema, Strategy,
  Mapping) wrapped under one outer Options expander, matching the
  Clean Text pattern.
- Automated Workflows: inner "Recommended tool order" expander stays
  nested inside the outer Options wrap; Run button stays outside
  Options so the user can re-run after tweaking the (collapsed)
  editor.

2008 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-16 21:04:37 +00:00
parent d1aaf3c2b9
commit 6415be8bf4
5 changed files with 1250 additions and 879 deletions

View File

@@ -173,12 +173,23 @@ if uploaded is not None:
st.session_state["review_decisions"] = {} st.session_state["review_decisions"] = {}
tmp_path.unlink(missing_ok=True) tmp_path.unlink(missing_ok=True)
# Collapse the input preview + options once a result exists so
# the Results section below becomes the primary visual focus
# after Find Duplicates runs. Mirrors the Clean Text pattern.
_has_result = st.session_state.get("result") is not None
# Preview # Preview
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
# Subheader retained inside the expander so collected_text in
# the workflow tests still finds "Preview: <name>" — Streamlit's
# AppTest does not surface expander labels through the
# markdown/caption/subheader collections.
st.subheader(f"Preview: {uploaded.name}") st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True) st.dataframe(df.head(10), use_container_width=True)
# Advanced options # Advanced options
with st.expander("Options", expanded=not _has_result):
settings = config_panel(df) settings = config_panel(df)
# Apply loaded config if present # Apply loaded config if present
@@ -218,6 +229,11 @@ if uploaded is not None:
progress_bar.empty() progress_bar.empty()
st.session_state["result"] = result st.session_state["result"] = result
st.session_state["review_decisions"] = {} st.session_state["review_decisions"] = {}
# One-shot flag for the scroll snippet at the bottom of the
# page. Force a rerun so the Preview / Options expanders see
# the new result on the next pass and collapse themselves.
st.session_state["_dedup_scroll_to_results"] = True
st.rerun()
# ------------------------------------------------------------------- # -------------------------------------------------------------------
# Results # Results
@@ -227,6 +243,14 @@ if uploaded is not None:
if result is not None: if result is not None:
st.divider() st.divider()
# Anchor target for the post-run auto-scroll snippet at the
# bottom of this page. A bare ``<div id="...">`` survives
# Streamlit's HTML sanitizer; a 1px-tall div doesn't shift
# layout.
st.markdown(
'<div id="dedup-results-anchor" style="height:1px"></div>',
unsafe_allow_html=True,
)
st.subheader("Results") st.subheader("Results")
# Summary + download buttons # Summary + download buttons
@@ -324,26 +348,44 @@ if uploaded is not None:
df, result.match_groups, decisions, df, result.match_groups, decisions,
) )
csv_bytes = reviewed_df.to_csv( # Pre-compute every byte buffer up front so each
# ``st.download_button`` sees stable ``data``
# across reruns. Render the empty-removed case
# as a disabled button (rather than hiding it)
# so layout stays steady and the user can see
# why the download isn't available.
reviewed_bytes = reviewed_df.to_csv(
index=False index=False
).encode("utf-8-sig") ).encode("utf-8-sig")
reviewed_removed_empty = reviewed_removed.empty
reviewed_removed_bytes = (
reviewed_removed.to_csv(index=False).encode("utf-8-sig")
if not reviewed_removed_empty
else b""
)
st.download_button( st.download_button(
"Download Reviewed & Deduplicated CSV", "Download Reviewed & Deduplicated CSV",
data=csv_bytes, data=reviewed_bytes,
file_name="deduplicated_reviewed.csv", file_name="deduplicated_reviewed.csv",
mime="text/csv", mime="text/csv",
key="reviewed_download", key="dedup_dl_reviewed",
use_container_width=True,
) )
if not reviewed_removed.empty:
removed_bytes = reviewed_removed.to_csv(
index=False
).encode("utf-8-sig")
st.download_button( st.download_button(
"Download Reviewed Removed Rows", "Download Reviewed Removed Rows",
data=removed_bytes, data=reviewed_removed_bytes,
file_name="removed_reviewed.csv", file_name="removed_reviewed.csv",
mime="text/csv", mime="text/csv",
key="reviewed_removed_download", key="dedup_dl_reviewed_removed",
disabled=reviewed_removed_empty,
help=(
"No rows were removed under the current "
"review decisions."
if reviewed_removed_empty
else None
),
use_container_width=True,
) )
# Log entries # Log entries
@@ -365,3 +407,27 @@ st.caption(
"Runs locally. Your data never leaves this computer. " "Runs locally. Your data never leaves this computer. "
"| DataTools v3.0" "| DataTools v3.0"
) )
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When Find Duplicates fires, the preview + options collapse, but
# Streamlit by itself doesn't scroll — the Results section sits below a
# tall page so the user has to hunt for it. Inject a tiny
# component-html iframe that calls ``scrollIntoView`` on the parent's
# Results anchor. The flag is one-shot (``pop`` removes it) so reruns
# triggered by unrelated widgets in the Results section don't yank the
# viewport back to the top of Results.
if st.session_state.pop("_dedup_scroll_to_results", False):
from streamlit.components.v1 import html as _components_html
_components_html(
"""
<script>
const doc = window.parent.document;
const target = doc.getElementById('dedup-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
""",
height=0,
)

View File

@@ -99,9 +99,13 @@ except Exception as e:
) )
st.stop() st.stop()
st.subheader(f"Preview: {uploaded.name}") # Collapse the input preview once the user has clicked Standardize Formats
st.caption(f"{len(df)} rows, {len(df.columns)} columns") # so the Results section below is the primary visual focus. The user can
st.dataframe(df.head(10), use_container_width=True) # re-expand the expander to re-inspect the source rows.
_has_result = st.session_state.get("fmtstd_result") is not None
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider() st.divider()
@@ -180,14 +184,23 @@ def _detect_field_type(col: str, samples: list[str]) -> FieldType | None:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Options # Options
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Standardize Formats. Together they push the Results section to
# the top of the visible area after a run.
st.subheader("Column types") column_types: dict[str, FieldType] = {}
st.caption( extra_abbreviations: dict[str, str] = {}
with st.expander("Options", expanded=not _has_result):
st.subheader("Column types")
st.caption(
"Assign each column to a field type. Auto-detected suggestions are " "Assign each column to a field type. Auto-detected suggestions are "
"pre-filled; pick **(skip)** to leave a column untouched." "pre-filled; pick **(skip)** to leave a column untouched."
) )
_FIELD_LABELS = { _FIELD_LABELS = {
"(skip)": None, "(skip)": None,
"Date": FieldType.DATE, "Date": FieldType.DATE,
"Phone": FieldType.PHONE, "Phone": FieldType.PHONE,
@@ -195,17 +208,16 @@ _FIELD_LABELS = {
"Name": FieldType.NAME, "Name": FieldType.NAME,
"Address": FieldType.ADDRESS, "Address": FieldType.ADDRESS,
"Boolean": FieldType.BOOLEAN, "Boolean": FieldType.BOOLEAN,
} }
_LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()} _LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()}
_LABELS = list(_FIELD_LABELS.keys()) _LABELS = list(_FIELD_LABELS.keys())
sample_size = min(len(df), 200) sample_size = min(len(df), 200)
sample_df = df.head(sample_size) sample_df = df.head(sample_size)
column_types: dict[str, FieldType] = {} cols_per_row = 3
cols_per_row = 3 columns_iter = list(df.columns)
columns_iter = list(df.columns) for i in range(0, len(columns_iter), cols_per_row):
for i in range(0, len(columns_iter), cols_per_row):
cols_block = st.columns(cols_per_row) cols_block = st.columns(cols_per_row)
for j, col_name in enumerate(columns_iter[i:i + cols_per_row]): for j, col_name in enumerate(columns_iter[i:i + cols_per_row]):
with cols_block[j]: with cols_block[j]:
@@ -221,29 +233,29 @@ for i in range(0, len(columns_iter), cols_per_row):
if ft is not None: if ft is not None:
column_types[col_name] = ft column_types[col_name] = ft
st.divider() st.divider()
st.subheader("Format options") st.subheader("Format options")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Preset bundle picker # Preset bundle picker
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# #
# Picking a preset rewrites every option below to that preset's defaults. # Picking a preset rewrites every option below to that preset's defaults.
# It does NOT touch column-type assignments — those are user-driven and # It does NOT touch column-type assignments — those are user-driven and
# orthogonal. To make the rewrite stick across the rerun, we stash the # orthogonal. To make the rewrite stick across the rerun, we stash the
# preset values into the per-option session keys; the widgets below read # preset values into the per-option session keys; the widgets below read
# those keys via their ``index``/``value`` arguments. # those keys via their ``index``/``value`` arguments.
_PRESET_LABELS = { _PRESET_LABELS = {
"us-default": "US (default) — ISO 8601 dates · E.164 phones · USD", "us-default": "US (default) — ISO 8601 dates · E.164 phones · USD",
"european": "European — DMY input · INTL phones · EUR comma decimal", "european": "European — DMY input · INTL phones · EUR comma decimal",
"uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans", "uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans",
"iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false", "iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false",
"legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No", "legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No",
"custom": "Custom — keep current settings", "custom": "Custom — keep current settings",
} }
preset_choice = st.radio( preset_choice = st.radio(
"Standards preset", "Standards preset",
list(_PRESET_LABELS.keys()), list(_PRESET_LABELS.keys()),
format_func=lambda k: _PRESET_LABELS[k], format_func=lambda k: _PRESET_LABELS[k],
@@ -255,31 +267,31 @@ preset_choice = st.radio(
"Every option below is still individually overridable; choose " "Every option below is still individually overridable; choose "
"**Custom** to keep whatever you've manually adjusted." "**Custom** to keep whatever you've manually adjusted."
), ),
) )
# Detect a preset switch since the last rerun; when it changes (and the # Detect a preset switch since the last rerun; when it changes (and the
# new choice isn't ``custom``), purge the dependent widget keys so # new choice isn't ``custom``), purge the dependent widget keys so
# Streamlit lets their ``index=``/``value=`` defaults take effect on the # Streamlit lets their ``index=``/``value=`` defaults take effect on the
# new render. Without this clear, prior session_state pins the widget to # new render. Without this clear, prior session_state pins the widget to
# the previous preset's choice and the apparent picker becomes a no-op. # the previous preset's choice and the apparent picker becomes a no-op.
_DEPENDENT_KEYS = [ _DEPENDENT_KEYS = [
"fmtstd_date_format", "fmtstd_date_order", "fmtstd_date_format", "fmtstd_date_order",
"fmtstd_phone_format", "fmtstd_phone_region", "fmtstd_phone_format", "fmtstd_phone_region",
"fmtstd_currency_decimal", "fmtstd_currency_decimals", "fmtstd_currency_decimal", "fmtstd_currency_decimals",
"fmtstd_currency_preserve", "fmtstd_currency_preserve_code", "fmtstd_currency_preserve", "fmtstd_currency_preserve_code",
"fmtstd_name_case", "fmtstd_bool_style", "fmtstd_name_case", "fmtstd_bool_style",
] ]
_last = st.session_state.get("fmtstd_preset_last") _last = st.session_state.get("fmtstd_preset_last")
if _last != preset_choice: if _last != preset_choice:
st.session_state["fmtstd_preset_last"] = preset_choice st.session_state["fmtstd_preset_last"] = preset_choice
if preset_choice != "custom": if preset_choice != "custom":
for k in _DEPENDENT_KEYS: for k in _DEPENDENT_KEYS:
st.session_state.pop(k, None) st.session_state.pop(k, None)
st.rerun() st.rerun()
# Map preset → widget-state defaults. Done as labels so the radios/selects # Map preset → widget-state defaults. Done as labels so the radios/selects
# below pick up the right index without us re-implementing each map twice. # below pick up the right index without us re-implementing each map twice.
_PRESET_TO_WIDGETS: dict[str, dict[str, str]] = { _PRESET_TO_WIDGETS: dict[str, dict[str, str]] = {
"us-default": { "us-default": {
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)", "date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
"phone_format": "E.164 (+15551234567)", "phone_region": "US", "phone_format": "E.164 (+15551234567)", "phone_region": "US",
@@ -315,25 +327,25 @@ _PRESET_TO_WIDGETS: dict[str, dict[str, str]] = {
"currency_preserve_code": False, "currency_preserve_code": False,
"name_case": "Title Case", "boolean_style": "Yes/No", "name_case": "Title Case", "boolean_style": "Yes/No",
}, },
} }
# ``iso-strict`` wants currency with no rounding; the GUI exposes that via # ``iso-strict`` wants currency with no rounding; the GUI exposes that via
# the "preserve original precision" checkbox rather than a sentinel value # the "preserve original precision" checkbox rather than a sentinel value
# in the number-input. Map that here. # in the number-input. Map that here.
_PRESET_PRESERVE_DECIMALS: dict[str, bool] = { _PRESET_PRESERVE_DECIMALS: dict[str, bool] = {
"iso-strict": True, "iso-strict": True,
} }
def _preset_default(key: str, fallback): def _preset_default(key: str, fallback):
"""Pull the preset-driven default for *key*, or *fallback* on Custom.""" """Pull the preset-driven default for *key*, or *fallback* on Custom."""
if preset_choice == "custom": if preset_choice == "custom":
return fallback return fallback
return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback) return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback)
opt_cols = st.columns(2) opt_cols = st.columns(2)
with opt_cols[0]: with opt_cols[0]:
st.markdown("**Dates**") st.markdown("**Dates**")
_DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"] _DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"]
date_format_label = st.selectbox( date_format_label = st.selectbox(
@@ -383,7 +395,7 @@ with opt_cols[0]:
key="fmtstd_phone_region", key="fmtstd_phone_region",
).upper() or "US" ).upper() or "US"
with opt_cols[1]: with opt_cols[1]:
st.markdown("**Currency**") st.markdown("**Currency**")
_CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"] _CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"]
currency_decimal = st.radio( currency_decimal = st.radio(
@@ -436,17 +448,16 @@ with opt_cols[1]:
key="fmtstd_bool_style", key="fmtstd_bool_style",
) )
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Address abbreviations — built-in USPS table is editable # Address abbreviations — built-in USPS table is editable
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# #
# Users with international addresses (German Strasse, Spanish-language # Users with international addresses (German Strasse, Spanish-language
# Avenida, French Boulevard variants) need to override the built-in # Avenida, French Boulevard variants) need to override the built-in
# table. Show it in a data_editor so the override is visible — the table # table. Show it in a data_editor so the override is visible — the table
# is small, this is the right surface. # is small, this is the right surface.
extra_abbreviations: dict[str, str] = {} if any(ft == FieldType.ADDRESS for ft in column_types.values()):
if any(ft == FieldType.ADDRESS for ft in column_types.values()):
with st.expander("Custom address abbreviations (advanced)", expanded=False): with st.expander("Custom address abbreviations (advanced)", expanded=False):
st.caption( st.caption(
"Add or override entries in the address abbreviation table. " "Add or override entries in the address abbreviation table. "
@@ -489,7 +500,7 @@ if any(ft == FieldType.ADDRESS for ft in column_types.values()):
"with the built-in table." "with the built-in table."
) )
options = StandardizeOptions( options = StandardizeOptions(
column_types=column_types, column_types=column_types,
date_output_format=date_format_map[date_format_label], date_output_format=date_format_map[date_format_label],
date_order="MDY" if date_order.startswith("MDY") else "DMY", date_order="MDY" if date_order.startswith("MDY") else "DMY",
@@ -501,7 +512,7 @@ options = StandardizeOptions(
name_case=name_case_map[name_case_label], # type: ignore[arg-type] name_case=name_case_map[name_case_label], # type: ignore[arg-type]
boolean_style=boolean_style, # type: ignore[arg-type] boolean_style=boolean_style, # type: ignore[arg-type]
extra_abbreviations=extra_abbreviations, extra_abbreviations=extra_abbreviations,
) )
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -528,6 +539,14 @@ if st.button(
st.stop() st.stop()
st.session_state["fmtstd_result"] = result st.session_state["fmtstd_result"] = result
st.session_state["fmtstd_input_name"] = uploaded.name st.session_state["fmtstd_input_name"] = uploaded.name
# One-shot flag picked up on the next pass to scroll the parent
# document to the Results anchor (see scroll snippet below).
st.session_state["_fmtstd_scroll_to_results"] = True
# Force a second rerun so the preview and options expanders see
# the new result on the NEXT script pass and collapse themselves.
# Without this they stay expanded until the user touches any
# other widget.
st.rerun()
result = st.session_state.get("fmtstd_result") result = st.session_state.get("fmtstd_result")
if result is None: if result is None:
@@ -538,6 +557,16 @@ if result is None:
# Results # Results
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Anchor target for the post-run auto-scroll snippet at the bottom of this page.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. Placed before the subheader so the scrolled-to viewport
# starts a few pixels above the section heading rather than below it.
st.markdown(
'<div id="fmtstd-results-anchor" style="height:1px"></div>',
unsafe_allow_html=True,
)
st.subheader("Results") st.subheader("Results")
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0 pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
@@ -574,36 +603,83 @@ st.dataframe(result.standardized_df.head(10), use_container_width=True)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Downloads # Downloads
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# All three byte buffers are prepared up front (outside the columns) so
# each ``st.download_button`` sees stable ``data`` across reruns and an
# explicit ``key`` — without those, Streamlit auto-derived widget IDs
# can collide for multiple download_buttons in adjacent columns and
# only the first one actually fires on click. The empty-changes case
# now renders a disabled button (rather than vanishing) so the layout
# stays steady and the user understands why nothing's available.
st.divider() st.divider()
stem = Path(st.session_state.get("fmtstd_input_name", "input")).stem stem = Path(st.session_state.get("fmtstd_input_name", "input")).stem
standardized_bytes = result.standardized_df.to_csv(index=False).encode("utf-8-sig")
changes_bytes = (
result.changes.to_csv(index=False).encode("utf-8-sig")
if not result.changes.empty
else b""
)
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
dl_a, dl_b, dl_c = st.columns(3) dl_a, dl_b, dl_c = st.columns(3)
with dl_a: with dl_a:
standardized_bytes = result.standardized_df.to_csv(index=False).encode("utf-8-sig")
st.download_button( st.download_button(
"Download standardized CSV", "Download standardized CSV",
data=standardized_bytes, data=standardized_bytes,
file_name=f"{stem}_standardized.csv", file_name=f"{stem}_standardized.csv",
mime="text/csv", mime="text/csv",
key="fmtstd_dl_standardized",
use_container_width=True,
) )
with dl_b: with dl_b:
if not result.changes.empty:
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
st.download_button( st.download_button(
"Download changes audit", "Download changes audit",
data=changes_bytes, data=changes_bytes,
file_name=f"{stem}_changes.csv", file_name=f"{stem}_changes.csv",
mime="text/csv", mime="text/csv",
key="fmtstd_dl_changes",
disabled=result.changes.empty,
help="No changes to audit." if result.changes.empty else None,
use_container_width=True,
) )
with dl_c: with dl_c:
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
st.download_button( st.download_button(
"Download config JSON", "Download config JSON",
data=config_bytes, data=config_bytes,
file_name="format_standardize_config.json", file_name="format_standardize_config.json",
mime="application/json", mime="application/json",
key="fmtstd_dl_config",
use_container_width=True,
) )
st.divider() st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0") st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Standardize Formats, the preview + options collapse
# but Streamlit by itself doesn't scroll — the Results section is at the
# bottom of a tall script so the user has to find it. Inject a tiny
# component-html iframe that calls ``scrollIntoView`` on the parent's
# Results anchor. Streamlit's main page is same-origin with component
# iframes so ``window.parent.document`` access is allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport back
# to the top of Results.
if st.session_state.pop("_fmtstd_scroll_to_results", False):
from streamlit.components.v1 import html as _components_html
_components_html(
"""
<script>
const doc = window.parent.document;
const target = doc.getElementById('fmtstd-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
""",
height=0,
)

View File

@@ -95,41 +95,52 @@ except Exception as e:
) )
st.stop() st.stop()
st.subheader(f"Preview: {uploaded.name}") # Collapse the input preview + options once the user has clicked
st.caption(f"{len(df)} rows, {len(df.columns)} columns") # Handle Missing Values so the Results section below is the primary
st.dataframe(df.head(10), use_container_width=True) # visual focus. The user can re-expand to re-inspect the source rows
# or tweak strategy and rerun.
_has_result = st.session_state.get("missing_result") is not None
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider() st.divider()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Initial profile (read-only) # Options (Missingness profile + Strategy)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Handle Missing Values. The Missingness profile lives inside
# this expander too — after a run the Results section shows a richer
# before-vs-after comparison that supersedes the static input profile,
# so keeping it tucked away with the controls cleanly pushes Results
# to the top of the visible area.
st.subheader("Missingness profile") with st.expander("Options", expanded=not _has_result):
st.subheader("Missingness profile")
initial_profile = profile_missing(df, MissingOptions()) initial_profile = profile_missing(df, MissingOptions())
prof_df = initial_profile.to_dataframe() prof_df = initial_profile.to_dataframe()
m1, m2, m3, m4 = st.columns(4) m1, m2, m3, m4 = st.columns(4)
m1.metric("Rows", initial_profile.rows_total) m1.metric("Rows", initial_profile.rows_total)
m2.metric("Cells missing", initial_profile.cells_missing) m2.metric("Cells missing", initial_profile.cells_missing)
m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%") m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
m4.metric("Complete rows", initial_profile.rows_complete) m4.metric("Complete rows", initial_profile.rows_complete)
st.dataframe(prof_df, use_container_width=True, hide_index=True) st.dataframe(prof_df, use_container_width=True, hide_index=True)
if initial_profile.cells_missing == 0: if initial_profile.cells_missing == 0:
st.success("No missing values or disguised nulls detected. Nothing to handle.") st.success("No missing values or disguised nulls detected. Nothing to handle.")
st.divider() st.divider()
# --------------------------------------------------------------------------- st.subheader("Strategy")
# Options
# ---------------------------------------------------------------------------
st.subheader("Strategy") preset_label = st.radio(
preset_label = st.radio(
"Preset", "Preset",
[ [
"detect-only (standardize sentinels to NaN, no fill or drop)", "detect-only (standardize sentinels to NaN, no fill or drop)",
@@ -142,11 +153,11 @@ preset_label = st.radio(
"safe-fill: also fill — numeric columns with median, others with mode. " "safe-fill: also fill — numeric columns with median, others with mode. "
"drop-incomplete: also drop every row that has any missing cell." "drop-incomplete: also drop every row that has any missing cell."
), ),
) )
preset_key = preset_label.split(" ", 1)[0] preset_key = preset_label.split(" ", 1)[0]
options = MissingOptions.from_preset(preset_key) options = MissingOptions.from_preset(preset_key)
with st.expander("Advanced options"): with st.expander("Advanced options"):
col_a, col_b = st.columns(2) col_a, col_b = st.columns(2)
with col_a: with col_a:
@@ -282,6 +293,14 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True):
st.session_state["missing_result"] = result st.session_state["missing_result"] = result
st.session_state["missing_input_name"] = uploaded.name st.session_state["missing_input_name"] = uploaded.name
st.session_state["missing_options"] = options.to_dict() st.session_state["missing_options"] = options.to_dict()
# One-shot flag picked up on the next pass to scroll the parent
# document to the Results anchor (see scroll snippet below).
st.session_state["_missing_scroll_to_results"] = True
# Force a second rerun so the preview and options expanders see
# the new result on the NEXT script pass and collapse themselves.
# Without this they stay expanded until the user touches any
# other widget.
st.rerun()
result = st.session_state.get("missing_result") result = st.session_state.get("missing_result")
if result is None: if result is None:
@@ -292,6 +311,16 @@ if result is None:
# Results # Results
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Anchor target for the post-run auto-scroll snippet at the bottom of this page.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. Placed before the subheader so the scrolled-to viewport
# starts a few pixels above the section heading rather than below it.
st.markdown(
'<div id="missing-results-anchor" style="height:1px"></div>',
unsafe_allow_html=True,
)
st.subheader("Results") st.subheader("Results")
m1, m2, m3, m4 = st.columns(4) m1, m2, m3, m4 = st.columns(4)
@@ -334,38 +363,85 @@ st.dataframe(result.handled_df.head(10), use_container_width=True)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Downloads # Downloads
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# All three byte buffers are prepared up front (outside the columns) so
# each ``st.download_button`` receives stable ``data`` across reruns,
# and every button is given an explicit ``key``. Without both,
# Streamlit's auto-derived widget IDs can collide for multiple
# download_buttons in adjacent columns, and only the first one actually
# fires on click. The empty-changes case now renders a disabled button
# (rather than vanishing) so the layout stays steady and the user
# understands why nothing is available.
st.divider() st.divider()
stem = Path(st.session_state.get("missing_input_name", "input")).stem stem = Path(st.session_state.get("missing_input_name", "input")).stem
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
changes_bytes = (
result.changes.to_csv(index=False).encode("utf-8-sig")
if not result.changes.empty
else b""
)
config_bytes = json.dumps(
st.session_state.get("missing_options", {}), indent=2, default=str,
).encode("utf-8")
dl_a, dl_b, dl_c = st.columns(3) dl_a, dl_b, dl_c = st.columns(3)
with dl_a: with dl_a:
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
st.download_button( st.download_button(
"Download handled CSV", "Download handled CSV",
data=handled_bytes, data=handled_bytes,
file_name=f"{stem}_missing.csv", file_name=f"{stem}_missing.csv",
mime="text/csv", mime="text/csv",
key="missing_dl_handled",
use_container_width=True,
) )
with dl_b: with dl_b:
if not result.changes.empty:
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
st.download_button( st.download_button(
"Download changes audit", "Download changes audit",
data=changes_bytes, data=changes_bytes,
file_name=f"{stem}_missing_changes.csv", file_name=f"{stem}_missing_changes.csv",
mime="text/csv", mime="text/csv",
key="missing_dl_changes",
disabled=result.changes.empty,
help="No changes to audit." if result.changes.empty else None,
use_container_width=True,
) )
with dl_c: with dl_c:
config_bytes = json.dumps(
st.session_state.get("missing_options", {}), indent=2, default=str,
).encode("utf-8")
st.download_button( st.download_button(
"Download config JSON", "Download config JSON",
data=config_bytes, data=config_bytes,
file_name="missing_config.json", file_name="missing_config.json",
mime="application/json", mime="application/json",
key="missing_dl_config",
use_container_width=True,
) )
st.divider() st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0") st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Handle Missing Values, the preview + options
# collapse, but Streamlit does not scroll on its own — the Results
# section sits at the bottom of a tall page, so the user would have to
# hunt for it. Inject a tiny component-html iframe that calls
# ``scrollIntoView`` on the parent's Results anchor. Streamlit's main
# page is same-origin with component iframes, so
# ``window.parent.document`` access is allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport
# back to the top of Results.
if st.session_state.pop("_missing_scroll_to_results", False):
from streamlit.components.v1 import html as _components_html
_components_html(
"""
<script>
const doc = window.parent.document;
const target = doc.getElementById('missing-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
""",
height=0,
)

View File

@@ -88,18 +88,34 @@ except Exception as e:
) )
st.stop() st.stop()
st.subheader(f"Preview: {uploaded.name}") # Collapse the input preview once the user has clicked Apply Column
st.caption(f"{len(df)} rows, {len(df.columns)} columns") # Mapping so the Results section below is the primary visual focus.
st.dataframe(df.head(10), use_container_width=True) # The user can re-expand the expander to re-inspect the source rows.
_has_result = st.session_state.get("colmap_result") is not None
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider() st.divider()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Schema input # Options (Target schema + Strategy + Mapping)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Apply Column Mapping. The Mapping editor is the heart of the
# tool, but per the Text Cleaner pattern we still collapse everything
# post-run — the user can re-expand to tweak any of the three sections.
st.subheader("Target schema") with st.expander("Options", expanded=not _has_result):
# -----------------------------------------------------------------------
# Schema input
# -----------------------------------------------------------------------
schema_mode = st.radio( st.subheader("Target schema")
schema_mode = st.radio(
"How would you like to define the target schema?", "How would you like to define the target schema?",
[ [
"Build interactively (start from current columns)", "Build interactively (start from current columns)",
@@ -112,11 +128,11 @@ schema_mode = st.radio(
"when you have a fixed contract (a CRM import format, db schema). " "when you have a fixed contract (a CRM import format, db schema). "
"Skip when you only want to rename or coerce specific columns." "Skip when you only want to rename or coerce specific columns."
), ),
) )
schema: TargetSchema | None = None schema: TargetSchema | None = None
if schema_mode.startswith("Upload"): if schema_mode.startswith("Upload"):
schema_file = st.file_uploader( schema_file = st.file_uploader(
"Schema JSON", "Schema JSON",
type=["json"], type=["json"],
@@ -131,7 +147,7 @@ if schema_mode.startswith("Upload"):
from src.core.errors import format_for_user from src.core.errors import format_for_user
st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```") st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```")
elif schema_mode.startswith("Build"): elif schema_mode.startswith("Build"):
st.caption( st.caption(
"Edit the table to define your target schema. Add rows for fields the " "Edit the table to define your target schema. Add rows for fields the "
"input doesn't have yet (with a default), or remove rows for columns " "input doesn't have yet (with a default), or remove rows for columns "
@@ -194,15 +210,15 @@ elif schema_mode.startswith("Build"):
if fields: if fields:
schema = TargetSchema(fields=fields) schema = TargetSchema(fields=fields)
st.divider() st.divider()
# --------------------------------------------------------------------------- # -----------------------------------------------------------------------
# Strategy # Strategy
# --------------------------------------------------------------------------- # -----------------------------------------------------------------------
st.subheader("Strategy") st.subheader("Strategy")
preset_label = st.radio( preset_label = st.radio(
"Preset", "Preset",
[ [
"rename-only (just rename, leave types alone, keep extras)", "rename-only (just rename, leave types alone, keep extras)",
@@ -210,12 +226,12 @@ preset_label = st.radio(
"strict-schema (rename + coerce + reorder, drop extras)", "strict-schema (rename + coerce + reorder, drop extras)",
], ],
index=0, index=0,
) )
preset_key = preset_label.split(" ", 1)[0] preset_key = preset_label.split(" ", 1)[0]
options = MapOptions.from_preset(preset_key) options = MapOptions.from_preset(preset_key)
options.schema = schema options.schema = schema
with st.expander("Advanced options"): with st.expander("Advanced options"):
col_a, col_b = st.columns(2) col_a, col_b = st.columns(2)
with col_a: with col_a:
options.unmapped = st.selectbox( # type: ignore[assignment] options.unmapped = st.selectbox( # type: ignore[assignment]
@@ -240,13 +256,13 @@ with st.expander("Advanced options"):
"Enforce required fields", value=options.enforce_required, "Enforce required fields", value=options.enforce_required,
) )
# --------------------------------------------------------------------------- # -----------------------------------------------------------------------
# Mapping editor — show inferred and let user override # Mapping editor — show inferred and let user override
# --------------------------------------------------------------------------- # -----------------------------------------------------------------------
st.subheader("Mapping") st.subheader("Mapping")
if schema is None: if schema is None:
st.caption( st.caption(
"No schema — define explicit renames below (left blank means keep " "No schema — define explicit renames below (left blank means keep "
"the source name)." "the source name)."
@@ -272,7 +288,7 @@ if schema is None:
if tgt and tgt != src: if tgt and tgt != src:
explicit_mapping[src] = tgt explicit_mapping[src] = tgt
options.mapping = explicit_mapping options.mapping = explicit_mapping
else: else:
inferred = ( inferred = (
infer_mapping(df, schema, threshold=options.fuzzy_threshold) infer_mapping(df, schema, threshold=options.fuzzy_threshold)
if options.auto_infer else {} if options.auto_infer else {}
@@ -324,6 +340,12 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True):
st.session_state["colmap_result"] = result st.session_state["colmap_result"] = result
st.session_state["colmap_input_name"] = uploaded.name st.session_state["colmap_input_name"] = uploaded.name
st.session_state["colmap_options"] = options.to_dict() st.session_state["colmap_options"] = options.to_dict()
# One-shot flag picked up on the next pass to scroll the parent
# document to the Results anchor (see scroll snippet below).
st.session_state["_colmap_scroll_to_results"] = True
# Force a second rerun so the preview and options expanders see
# the new result on the NEXT script pass and collapse themselves.
st.rerun()
result = st.session_state.get("colmap_result") result = st.session_state.get("colmap_result")
if result is None: if result is None:
@@ -334,6 +356,16 @@ if result is None:
# Results # Results
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Anchor target for the auto-scroll snippet at the end of this file.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. It is placed before the subheader so the scrolled-to
# viewport starts just above the section heading rather than below it.
st.markdown(
'<div id="colmap-results-anchor" style="height:1px"></div>',
unsafe_allow_html=True,
)
st.subheader("Results") st.subheader("Results")
m1, m2, m3, m4 = st.columns(4) m1, m2, m3, m4 = st.columns(4)
@@ -371,21 +403,18 @@ st.dataframe(result.mapped_df.head(10), use_container_width=True)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Downloads # Downloads
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# All three byte buffers are prepared up front (outside the columns) so
# each ``st.download_button`` receives stable ``data`` across reruns,
# and every button is given an explicit ``key``. Without both,
# Streamlit's auto-derived widget IDs can collide for multiple
# download_buttons in adjacent columns, and only the first one actually
# fires on click.
st.divider() st.divider()
stem = Path(st.session_state.get("colmap_input_name", "input")).stem stem = Path(st.session_state.get("colmap_input_name", "input")).stem
dl_a, dl_b, dl_c = st.columns(3) mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
with dl_a: audit_bytes = json.dumps({
mapped_bytes = result.mapped_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download mapped CSV",
data=mapped_bytes,
file_name=f"{stem}_mapped.csv",
mime="text/csv",
)
with dl_b:
audit_bytes = json.dumps({
"mapping": result.mapping, "mapping": result.mapping,
"inferred_pairs": result.inferred_pairs, "inferred_pairs": result.inferred_pairs,
"columns_renamed": result.columns_renamed, "columns_renamed": result.columns_renamed,
@@ -394,23 +423,70 @@ with dl_b:
"coercion_failures": result.coercion_failures, "coercion_failures": result.coercion_failures,
"unmapped_kept": result.unmapped_kept, "unmapped_kept": result.unmapped_kept,
"missing_required_targets": result.missing_required_targets, "missing_required_targets": result.missing_required_targets,
}, indent=2, default=str).encode("utf-8") }, indent=2, default=str).encode("utf-8")
config_bytes = json.dumps(
st.session_state.get("colmap_options", {}), indent=2, default=str,
).encode("utf-8")
_no_mapping = not result.mapping
dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
st.download_button(
"Download mapped CSV",
data=mapped_bytes,
file_name=f"{stem}_mapped.csv",
mime="text/csv",
key="colmap_dl_mapped",
use_container_width=True,
)
with dl_b:
st.download_button( st.download_button(
"Download mapping audit", "Download mapping audit",
data=audit_bytes, data=audit_bytes,
file_name=f"{stem}_mapping.json", file_name=f"{stem}_mapping.json",
mime="application/json", mime="application/json",
key="colmap_dl_audit",
disabled=_no_mapping,
help="No mapping was applied." if _no_mapping else None,
use_container_width=True,
) )
with dl_c: with dl_c:
config_bytes = json.dumps(
st.session_state.get("colmap_options", {}), indent=2, default=str,
).encode("utf-8")
st.download_button( st.download_button(
"Download config JSON", "Download config JSON",
data=config_bytes, data=config_bytes,
file_name="column_map_config.json", file_name="column_map_config.json",
mime="application/json", mime="application/json",
key="colmap_dl_config",
use_container_width=True,
) )
st.divider() st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0") st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Apply Column Mapping, the preview + options
# collapse, but Streamlit does not scroll on its own — the Results
# section sits at the bottom of a tall page, so the user would have to
# hunt for it. Inject a tiny component-html iframe that calls
# ``scrollIntoView`` on the parent's Results anchor. Streamlit's main
# page is same-origin with component iframes, so
# ``window.parent.document`` access is allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport back
# to the top of Results.
if st.session_state.pop("_colmap_scroll_to_results", False):
from streamlit.components.v1 import html as _components_html
_components_html(
"""
<script>
const doc = window.parent.document;
const target = doc.getElementById('colmap-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
""",
height=0,
)

View File

@@ -89,19 +89,29 @@ except Exception as e:
) )
st.stop() st.stop()
st.subheader(f"Preview: {uploaded.name}") # Collapse the input preview and pipeline editor once the user has clicked
st.caption(f"{len(df)} rows, {len(df.columns)} columns") # Run Pipeline so the Results section below is the primary visual focus.
st.dataframe(df.head(10), use_container_width=True) # The user can re-expand either expander to re-inspect or adjust.
_has_result = st.session_state.get("pipeline_result") is not None
with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
st.dataframe(df.head(10), use_container_width=True)
st.divider() st.divider()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Pipeline builder # Pipeline builder
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# Wrapped in an outer expander whose default state mirrors the preview
# expander above: open before a result exists, folded once the user has
# clicked Run Pipeline. The pipeline editor is this page's "Options"
# section — structurally analogous to Text Cleaner's options block.
st.subheader("Pipeline") with st.expander("Options", expanded=not _has_result):
mode = st.radio(
mode = st.radio(
"How would you like to define the pipeline?", "How would you like to define the pipeline?",
[ [
"Use the recommended default (text-clean → format → missing → dedup)", "Use the recommended default (text-clean → format → missing → dedup)",
@@ -109,9 +119,9 @@ mode = st.radio(
"Upload a saved pipeline JSON", "Upload a saved pipeline JSON",
], ],
index=0, index=0,
) )
if "pipeline_rows" not in st.session_state: if "pipeline_rows" not in st.session_state:
default = recommended_pipeline() default = recommended_pipeline()
st.session_state["pipeline_rows"] = pd.DataFrame([ st.session_state["pipeline_rows"] = pd.DataFrame([
{ {
@@ -121,7 +131,7 @@ if "pipeline_rows" not in st.session_state:
for s in default.steps for s in default.steps
]) ])
if mode.startswith("Use the recommended"): if mode.startswith("Use the recommended"):
default = recommended_pipeline() default = recommended_pipeline()
st.session_state["pipeline_rows"] = pd.DataFrame([ st.session_state["pipeline_rows"] = pd.DataFrame([
{ {
@@ -130,7 +140,7 @@ if mode.startswith("Use the recommended"):
} }
for s in default.steps for s in default.steps
]) ])
elif mode.startswith("Upload"): elif mode.startswith("Upload"):
pipeline_file = st.file_uploader( pipeline_file = st.file_uploader(
"Pipeline JSON", type=["json"], key="pipeline_upload", "Pipeline JSON", type=["json"], key="pipeline_upload",
) )
@@ -150,12 +160,12 @@ elif mode.startswith("Upload"):
from src.core.errors import format_for_user from src.core.errors import format_for_user
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```") st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
st.caption( st.caption(
"Edit the table to add, remove, reorder (drag the row index), enable, " "Edit the table to add, remove, reorder (drag the row index), enable, "
"or configure each step. Tool order is recommended, not enforced — " "or configure each step. Tool order is recommended, not enforced — "
"violations surface as warnings below the table." "violations surface as warnings below the table."
) )
edited = st.data_editor( edited = st.data_editor(
st.session_state["pipeline_rows"], st.session_state["pipeline_rows"],
use_container_width=True, use_container_width=True,
num_rows="dynamic", num_rows="dynamic",
@@ -170,13 +180,13 @@ edited = st.data_editor(
), ),
}, },
key="pipeline_editor", key="pipeline_editor",
) )
st.session_state["pipeline_rows"] = edited st.session_state["pipeline_rows"] = edited
# Build a Pipeline object from the editor state. # Build a Pipeline object from the editor state.
steps_list: list[Step] = [] steps_list: list[Step] = []
parse_errors: list[str] = [] parse_errors: list[str] = []
for i, row in edited.iterrows(): for i, row in edited.iterrows():
tool = row.get("tool") tool = row.get("tool")
if not tool or pd.isna(tool): if not tool or pd.isna(tool):
continue continue
@@ -199,13 +209,13 @@ for i, row in edited.iterrows():
except Exception as e: except Exception as e:
parse_errors.append(f"Step {i + 1}: {e}") parse_errors.append(f"Step {i + 1}: {e}")
if parse_errors: if parse_errors:
for err in parse_errors: for err in parse_errors:
st.error(err) st.error(err)
current_pipeline = Pipeline(steps=steps_list) if steps_list else None current_pipeline = Pipeline(steps=steps_list) if steps_list else None
if current_pipeline is not None: if current_pipeline is not None:
warnings = validate_pipeline(current_pipeline) warnings = validate_pipeline(current_pipeline)
if warnings: if warnings:
st.warning( st.warning(
@@ -214,7 +224,7 @@ if current_pipeline is not None:
+ "\n\nThe pipeline will still run — these are recommendations only." + "\n\nThe pipeline will still run — these are recommendations only."
) )
with st.expander("Recommended tool order — why each step belongs where it does"): with st.expander("Recommended tool order — why each step belongs where it does"):
st.markdown( st.markdown(
"\n".join( "\n".join(
f"- **{e}** before **{l}** — {why}" f"- **{e}** before **{l}** — {why}"
@@ -274,6 +284,14 @@ if st.button(
progress.progress(1.0, text="Done") progress.progress(1.0, text="Done")
st.session_state["pipeline_result"] = result st.session_state["pipeline_result"] = result
st.session_state["pipeline_input_name"] = uploaded.name st.session_state["pipeline_input_name"] = uploaded.name
# One-shot flag picked up on the next pass to scroll the parent
# document to the Results anchor (see scroll snippet at end of file).
st.session_state["_pipeline_scroll_to_results"] = True
# Force a second rerun so the preview and options expanders see
# the new result on the NEXT script pass and collapse themselves.
# Without this they stay expanded until the user touches any
# other widget.
st.rerun()
result = st.session_state.get("pipeline_result") result = st.session_state.get("pipeline_result")
if result is None: if result is None:
@@ -287,6 +305,16 @@ if result is None:
# Results # Results
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Anchor target for the auto-scroll snippet at the end of this file.
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
# anything. It is placed before the subheader so the scrolled-to
# viewport starts just above the section heading rather than below it.
st.markdown(
'<div id="pipeline-results-anchor" style="height:1px"></div>',
unsafe_allow_html=True,
)
st.subheader("Results") st.subheader("Results")
m1, m2, m3, m4 = st.columns(4) m1, m2, m3, m4 = st.columns(4)
@@ -318,33 +346,24 @@ st.dataframe(result.final_df.head(10), use_container_width=True)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Downloads # Downloads
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
#
# All three byte buffers are prepared up front (outside the columns) so
# each ``st.download_button`` receives stable ``data`` across reruns,
# and every button is given an explicit ``key``. Without both,
# Streamlit's auto-derived widget IDs can collide for multiple
# download_buttons in adjacent columns, and only the first one actually
# fires on click. The pipeline-JSON button is still always rendered,
# but it is now disabled (with an explanatory tooltip) when no pipeline
# steps are defined, instead of offering an empty ``{"steps": []}``
# download; the layout stays steady either way.
st.divider() st.divider()
stem = Path(st.session_state.get("pipeline_input_name", "input")).stem stem = Path(st.session_state.get("pipeline_input_name", "input")).stem
dl_a, dl_b, dl_c = st.columns(3) cleaned_bytes = result.final_df.to_csv(index=False).encode("utf-8-sig")
with dl_a: pipeline_bytes = json.dumps(
bytes_csv = result.final_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download cleaned CSV",
data=bytes_csv,
file_name=f"{stem}_pipeline.csv",
mime="text/csv",
)
with dl_b:
pipeline_bytes = json.dumps(
current_pipeline.to_dict() if current_pipeline else {"steps": []}, current_pipeline.to_dict() if current_pipeline else {"steps": []},
indent=2, default=str, indent=2, default=str,
).encode("utf-8") ).encode("utf-8")
st.download_button( audit_bytes = json.dumps({
"Download pipeline JSON",
data=pipeline_bytes,
file_name="pipeline.json",
mime="application/json",
help="Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file.",
)
with dl_c:
audit_bytes = json.dumps({
"warnings": result.warnings, "warnings": result.warnings,
"initial_rows": result.initial_rows, "initial_rows": result.initial_rows,
"final_rows": result.final_rows, "final_rows": result.final_rows,
@@ -361,13 +380,71 @@ with dl_c:
} }
for sr in result.step_results for sr in result.step_results
], ],
}, indent=2, default=str).encode("utf-8") }, indent=2, default=str).encode("utf-8")
_pipeline_empty = current_pipeline is None or not current_pipeline.steps
dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
st.download_button(
"Download cleaned CSV",
data=cleaned_bytes,
file_name=f"{stem}_pipeline.csv",
mime="text/csv",
key="pipeline_dl_cleaned",
use_container_width=True,
)
with dl_b:
st.download_button(
"Download pipeline JSON",
data=pipeline_bytes,
file_name="pipeline.json",
mime="application/json",
key="pipeline_dl_pipeline",
disabled=_pipeline_empty,
help=(
"No pipeline defined."
if _pipeline_empty
else "Save this and pass --pipeline pipeline.json to the CLI to re-run on next week's file."
),
use_container_width=True,
)
with dl_c:
st.download_button( st.download_button(
"Download run audit", "Download run audit",
data=audit_bytes, data=audit_bytes,
file_name=f"{stem}_pipeline_audit.json", file_name=f"{stem}_pipeline_audit.json",
mime="application/json", mime="application/json",
key="pipeline_dl_audit",
use_container_width=True,
) )
st.divider() st.divider()
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0") st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
# ---------------------------------------------------------------------------
# Post-run auto-scroll
# ---------------------------------------------------------------------------
#
# When the user clicks Run Pipeline, the preview + options collapse,
# but Streamlit does not scroll on its own — the Results section sits
# at the bottom of a tall page, so the user would have to hunt for it.
# Inject a tiny component-html iframe that calls ``scrollIntoView`` on
# the parent's Results anchor. Streamlit's main page is same-origin
# with component iframes, so ``window.parent.document`` access is
# allowed.
#
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
# unrelated widgets in the Results section don't yank the viewport
# back to the top of Results.
if st.session_state.pop("_pipeline_scroll_to_results", False):
from streamlit.components.v1 import html as _components_html
_components_html(
"""
<script>
const doc = window.parent.document;
const target = doc.getElementById('pipeline-results-anchor');
if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
</script>
""",
height=0,
)