diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py index bbadd60..8d5ca24 100644 --- a/src/gui/pages/1_Deduplicator.py +++ b/src/gui/pages/1_Deduplicator.py @@ -173,22 +173,33 @@ if uploaded is not None: st.session_state["review_decisions"] = {} tmp_path.unlink(missing_ok=True) + # Collapse the input preview + options once a result exists so + # the Results section below becomes the primary visual focus + # after Find Duplicates runs. Mirrors the Clean Text pattern. + _has_result = st.session_state.get("result") is not None + # Preview - st.subheader(f"Preview: {uploaded.name}") - st.caption(f"{len(df)} rows, {len(df.columns)} columns") - st.dataframe(df.head(10), use_container_width=True) + with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): + # Subheader retained inside the expander so collected_text in + # the workflow tests still finds "Preview: " — Streamlit's + # AppTest does not surface expander labels through the + # markdown/caption/subheader collections. 
+ st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) # Advanced options - settings = config_panel(df) + with st.expander("Options", expanded=not _has_result): + settings = config_panel(df) - # Apply loaded config if present - loaded_cfg = st.session_state.get("loaded_config") - if loaded_cfg is not None: - settings["strategies"] = loaded_cfg.to_strategies() - settings["survivor_rule"] = loaded_cfg.to_survivor_rule() - settings["date_column"] = loaded_cfg.date_column - settings["merge"] = loaded_cfg.merge - del st.session_state["loaded_config"] + # Apply loaded config if present + loaded_cfg = st.session_state.get("loaded_config") + if loaded_cfg is not None: + settings["strategies"] = loaded_cfg.to_strategies() + settings["survivor_rule"] = loaded_cfg.to_survivor_rule() + settings["date_column"] = loaded_cfg.date_column + settings["merge"] = loaded_cfg.merge + del st.session_state["loaded_config"] # ------------------------------------------------------------------- # Find Duplicates button @@ -218,6 +229,11 @@ if uploaded is not None: progress_bar.empty() st.session_state["result"] = result st.session_state["review_decisions"] = {} + # One-shot flag for the scroll snippet at the bottom of the + # page. Force a rerun so the Preview / Options expanders see + # the new result on the next pass and collapse themselves. + st.session_state["_dedup_scroll_to_results"] = True + st.rerun() # ------------------------------------------------------------------- # Results @@ -227,6 +243,14 @@ if uploaded is not None: if result is not None: st.divider() + # Anchor target for the post-run auto-scroll snippet at the + # bottom of this page. A bare ``
`` survives + # Streamlit's HTML sanitizer; a 1px-tall div doesn't shift + # layout. + st.markdown( + '
', + unsafe_allow_html=True, + ) st.subheader("Results") # Summary + download buttons @@ -324,27 +348,45 @@ if uploaded is not None: df, result.match_groups, decisions, ) - csv_bytes = reviewed_df.to_csv( + # Pre-compute every byte buffer up front so each + # ``st.download_button`` sees stable ``data`` + # across reruns. Render the empty-removed case + # as a disabled button (rather than hiding it) + # so layout stays steady and the user can see + # why the download isn't available. + reviewed_bytes = reviewed_df.to_csv( index=False ).encode("utf-8-sig") + reviewed_removed_empty = reviewed_removed.empty + reviewed_removed_bytes = ( + reviewed_removed.to_csv(index=False).encode("utf-8-sig") + if not reviewed_removed_empty + else b"" + ) + st.download_button( "Download Reviewed & Deduplicated CSV", - data=csv_bytes, + data=reviewed_bytes, file_name="deduplicated_reviewed.csv", mime="text/csv", - key="reviewed_download", + key="dedup_dl_reviewed", + use_container_width=True, + ) + st.download_button( + "Download Reviewed Removed Rows", + data=reviewed_removed_bytes, + file_name="removed_reviewed.csv", + mime="text/csv", + key="dedup_dl_reviewed_removed", + disabled=reviewed_removed_empty, + help=( + "No rows were removed under the current " + "review decisions." + if reviewed_removed_empty + else None + ), + use_container_width=True, ) - if not reviewed_removed.empty: - removed_bytes = reviewed_removed.to_csv( - index=False - ).encode("utf-8-sig") - st.download_button( - "Download Reviewed Removed Rows", - data=removed_bytes, - file_name="removed_reviewed.csv", - mime="text/csv", - key="reviewed_removed_download", - ) # Log entries if result.log_entries: @@ -365,3 +407,27 @@ st.caption( "Runs locally. Your data never leaves this computer. 
" "| DataTools v3.0" ) + +# --------------------------------------------------------------------------- +# Post-run auto-scroll +# --------------------------------------------------------------------------- +# +# When Find Duplicates fires, the preview + options collapse, but +# Streamlit by itself doesn't scroll — the Results section sits below a +# tall page so the user has to hunt for it. Inject a tiny +# component-html iframe that calls ``scrollIntoView`` on the parent's +# Results anchor. The flag is one-shot (``pop`` removes it) so reruns +# triggered by unrelated widgets in the Results section don't yank the +# viewport back to the top of Results. +if st.session_state.pop("_dedup_scroll_to_results", False): + from streamlit.components.v1 import html as _components_html + _components_html( + """ + + """, + height=0, + ) diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py index bad6fb9..75d4d05 100644 --- a/src/gui/pages/3_Format_Standardizer.py +++ b/src/gui/pages/3_Format_Standardizer.py @@ -99,9 +99,13 @@ except Exception as e: ) st.stop() -st.subheader(f"Preview: {uploaded.name}") -st.caption(f"{len(df)} rows, {len(df.columns)} columns") -st.dataframe(df.head(10), use_container_width=True) +# Collapse the input preview once the user has clicked Standardize Formats +# so the Results section below is the primary visual focus. The user can +# re-expand the expander to re-inspect the source rows. 
+_has_result = st.session_state.get("fmtstd_result") is not None +with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) st.divider() @@ -180,328 +184,335 @@ def _detect_field_type(col: str, samples: list[str]) -> FieldType | None: # --------------------------------------------------------------------------- # Options # --------------------------------------------------------------------------- - -st.subheader("Column types") -st.caption( - "Assign each column to a field type. Auto-detected suggestions are " - "pre-filled; pick **(skip)** to leave a column untouched." -) - -_FIELD_LABELS = { - "(skip)": None, - "Date": FieldType.DATE, - "Phone": FieldType.PHONE, - "Currency": FieldType.CURRENCY, - "Name": FieldType.NAME, - "Address": FieldType.ADDRESS, - "Boolean": FieldType.BOOLEAN, -} -_LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()} -_LABELS = list(_FIELD_LABELS.keys()) - -sample_size = min(len(df), 200) -sample_df = df.head(sample_size) +# +# Wrapped in an outer expander whose default state mirrors the preview +# expander above: open before a result exists, folded once the user has +# clicked Standardize Formats. Together they push the Results section to +# the top of the visible area after a run. 
column_types: dict[str, FieldType] = {} -cols_per_row = 3 -columns_iter = list(df.columns) -for i in range(0, len(columns_iter), cols_per_row): - cols_block = st.columns(cols_per_row) - for j, col_name in enumerate(columns_iter[i:i + cols_per_row]): - with cols_block[j]: - detected = _detect_field_type(col_name, sample_df[col_name].tolist()) - default_label = _LABEL_BY_TYPE.get(detected, "(skip)") - chosen = st.selectbox( - col_name, - _LABELS, - index=_LABELS.index(default_label), - key=f"fmtstd_type__{col_name}", - ) - ft = _FIELD_LABELS[chosen] - if ft is not None: - column_types[col_name] = ft - -st.divider() -st.subheader("Format options") - -# --------------------------------------------------------------------------- -# Preset bundle picker -# --------------------------------------------------------------------------- -# -# Picking a preset rewrites every option below to that preset's defaults. -# It does NOT touch column-type assignments — those are user-driven and -# orthogonal. To make the rewrite stick across the rerun, we stash the -# preset values into the per-option session keys; the widgets below read -# those keys via their ``index``/``value`` arguments. - -_PRESET_LABELS = { - "us-default": "US (default) — ISO 8601 dates · E.164 phones · USD", - "european": "European — DMY input · INTL phones · EUR comma decimal", - "uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans", - "iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false", - "legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No", - "custom": "Custom — keep current settings", -} - -preset_choice = st.radio( - "Standards preset", - list(_PRESET_LABELS.keys()), - format_func=lambda k: _PRESET_LABELS[k], - index=0, - horizontal=False, - key="fmtstd_preset", - help=( - "Pick a published standard or regional convention as the baseline. " - "Every option below is still individually overridable; choose " - "**Custom** to keep whatever you've manually adjusted." 
- ), -) - -# Detect a preset switch since the last rerun; when it changes (and the -# new choice isn't ``custom``), purge the dependent widget keys so -# Streamlit lets their ``index=``/``value=`` defaults take effect on the -# new render. Without this clear, prior session_state pins the widget to -# the previous preset's choice and the apparent picker becomes a no-op. -_DEPENDENT_KEYS = [ - "fmtstd_date_format", "fmtstd_date_order", - "fmtstd_phone_format", "fmtstd_phone_region", - "fmtstd_currency_decimal", "fmtstd_currency_decimals", - "fmtstd_currency_preserve", "fmtstd_currency_preserve_code", - "fmtstd_name_case", "fmtstd_bool_style", -] -_last = st.session_state.get("fmtstd_preset_last") -if _last != preset_choice: - st.session_state["fmtstd_preset_last"] = preset_choice - if preset_choice != "custom": - for k in _DEPENDENT_KEYS: - st.session_state.pop(k, None) - st.rerun() - -# Map preset → widget-state defaults. Done as labels so the radios/selects -# below pick up the right index without us re-implementing each map twice. 
-_PRESET_TO_WIDGETS: dict[str, dict[str, str]] = { - "us-default": { - "date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)", - "phone_format": "E.164 (+15551234567)", "phone_region": "US", - "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, - "currency_preserve_code": False, - "name_case": "Title Case", "boolean_style": "True/False", - }, - "european": { - "date_format": "YYYY-MM-DD (ISO)", "date_order": "DMY (EU)", - "phone_format": "International (+1 555-123-4567)", "phone_region": "DE", - "currency_decimal": "comma (1.234,56)", "currency_decimals": 2, - "currency_preserve_code": True, - "name_case": "Title Case", "boolean_style": "True/False", - }, - "uk": { - "date_format": "DD/MM/YYYY", "date_order": "DMY (EU)", - "phone_format": "International (+1 555-123-4567)", "phone_region": "GB", - "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, - "currency_preserve_code": False, - "name_case": "Title Case", "boolean_style": "Yes/No", - }, - "iso-strict": { - "date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)", - "phone_format": "E.164 (+15551234567)", "phone_region": "US", - "currency_decimal": "dot (1,234.56)", "currency_decimals": 0, - "currency_preserve_code": True, - "name_case": "Title Case", "boolean_style": "true/false", - }, - "legacy-us": { - "date_format": "MM/DD/YYYY", "date_order": "MDY (US)", - "phone_format": "National ((555) 123-4567)", "phone_region": "US", - "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, - "currency_preserve_code": False, - "name_case": "Title Case", "boolean_style": "Yes/No", - }, -} - -# ``iso-strict`` wants currency with no rounding; the GUI exposes that via -# the "preserve original precision" checkbox rather than a sentinel value -# in the number-input. Map that here. 
-_PRESET_PRESERVE_DECIMALS: dict[str, bool] = { - "iso-strict": True, -} - - -def _preset_default(key: str, fallback): - """Pull the preset-driven default for *key*, or *fallback* on Custom.""" - if preset_choice == "custom": - return fallback - return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback) - - -opt_cols = st.columns(2) -with opt_cols[0]: - st.markdown("**Dates**") - _DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"] - date_format_label = st.selectbox( - "Output format", - _DATE_LABELS, - index=_DATE_LABELS.index(_preset_default("date_format", "YYYY-MM-DD (ISO)")), - key="fmtstd_date_format", - ) - date_format_map = { - "YYYY-MM-DD (ISO)": "%Y-%m-%d", - "MM/DD/YYYY": "%m/%d/%Y", - "DD/MM/YYYY": "%d/%m/%Y", - "DD-Mon-YYYY": "%d-%b-%Y", - "Mon DD, YYYY": "%b %d, %Y", - } - _DATE_ORDER_LABELS = ["MDY (US)", "DMY (EU)"] - date_order = st.radio( - "Ambiguous input order (e.g. 01/02/2024)", - _DATE_ORDER_LABELS, - index=_DATE_ORDER_LABELS.index(_preset_default("date_order", "MDY (US)")), - horizontal=True, - key="fmtstd_date_order", - ) - - st.markdown("**Phones**") - _PHONE_LABELS = [ - "E.164 (+15551234567)", "International (+1 555-123-4567)", - "National ((555) 123-4567)", "Digits only", - ] - phone_format_label = st.selectbox( - "Output format", - _PHONE_LABELS, - index=_PHONE_LABELS.index(_preset_default("phone_format", "E.164 (+15551234567)")), - key="fmtstd_phone_format", - ) - phone_format_map = { - "E.164 (+15551234567)": "E164", - "International (+1 555-123-4567)": "INTERNATIONAL", - "National ((555) 123-4567)": "NATIONAL", - "Digits only": "DIGITS", - } - phone_region = st.text_input( - "Default region (ISO-2)", - value=_preset_default("phone_region", "US"), - max_chars=2, - help="Region used when the input has no country code. 
``US``, ``GB``, ``DE``, etc.", - key="fmtstd_phone_region", - ).upper() or "US" - -with opt_cols[1]: - st.markdown("**Currency**") - _CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"] - currency_decimal = st.radio( - "Decimal separator in input", - _CURR_DECIMAL_LABELS, - index=_CURR_DECIMAL_LABELS.index(_preset_default("currency_decimal", "dot (1,234.56)")), - horizontal=True, - key="fmtstd_currency_decimal", - ) - currency_decimals = st.number_input( - "Round to decimals", - min_value=0, max_value=8, - value=int(_preset_default("currency_decimals", 2)), - step=1, - key="fmtstd_currency_decimals", - ) - preserve_decimals = st.checkbox( - "Preserve original precision (don't round)", - value=_PRESET_PRESERVE_DECIMALS.get(preset_choice, False), - key="fmtstd_currency_preserve", - ) - currency_preserve_code = st.checkbox( - "Preserve currency code (emit `USD 1234.56`, `EUR 99.00`, etc.)", - value=bool(_preset_default("currency_preserve_code", False)), - help=( - "Detects an ISO 4217 code or symbol in the input ($/€/£/¥/USD/" - "EUR/...) and re-emits it as a space-separated prefix on the " - "standardized number. Cells without a currency marker emit " - "just the number." 
- ), - key="fmtstd_currency_preserve_code", - ) - - st.markdown("**Names**") - _NAME_CASE_LABELS = ["Title Case", "UPPER", "lower"] - name_case_label = st.selectbox( - "Casing", - _NAME_CASE_LABELS, - index=_NAME_CASE_LABELS.index(_preset_default("name_case", "Title Case")), - key="fmtstd_name_case", - ) - name_case_map = {"Title Case": "title", "UPPER": "upper", "lower": "lower"} - - st.markdown("**Booleans**") - _BOOL_LABELS = ["True/False", "true/false", "Yes/No", "Y/N", "1/0"] - boolean_style = st.selectbox( - "Output style", - _BOOL_LABELS, - index=_BOOL_LABELS.index(_preset_default("boolean_style", "True/False")), - key="fmtstd_bool_style", - ) - -# --------------------------------------------------------------------------- -# Address abbreviations — built-in USPS table is editable -# --------------------------------------------------------------------------- -# -# Users with international addresses (German Strasse, Spanish-language -# Avenida, French Boulevard variants) need to override the built-in -# table. Show it in a data_editor so the override is visible — the table -# is small, this is the right surface. - extra_abbreviations: dict[str, str] = {} -if any(ft == FieldType.ADDRESS for ft in column_types.values()): - with st.expander("Custom address abbreviations (advanced)", expanded=False): - st.caption( - "Add or override entries in the address abbreviation table. " - "Each row maps a short form (case-insensitive, periods OK) to " - "the long form the standardizer should emit. Built-in USPS " - "Pub. 28 entries (`St` → `Street`, `Ave` → `Avenue`, …) apply " - "automatically; rows here merge on top and can override them." 
- ) - starter = pd.DataFrame( - [ - {"abbreviation": "", "expansion": ""}, - {"abbreviation": "", "expansion": ""}, - {"abbreviation": "", "expansion": ""}, - ] - ) - edited = st.data_editor( - starter, - num_rows="dynamic", - use_container_width=True, - column_config={ - "abbreviation": st.column_config.TextColumn( - "Short form", - help="Case-insensitive, trailing period optional. e.g. ``Strasse``", - ), - "expansion": st.column_config.TextColumn( - "Long form", - help="What the standardizer emits. e.g. ``Straße``", - ), - }, - key="fmtstd_extra_abbrev", - ) - for _, row in edited.iterrows(): - k = str(row.get("abbreviation") or "").strip() - v = str(row.get("expansion") or "").strip() - if k and v: - extra_abbreviations[k] = v - if extra_abbreviations: - st.success( - f"{len(extra_abbreviations)} custom mapping(s) will merge " - "with the built-in table." - ) -options = StandardizeOptions( - column_types=column_types, - date_output_format=date_format_map[date_format_label], - date_order="MDY" if date_order.startswith("MDY") else "DMY", - phone_format=phone_format_map[phone_format_label], # type: ignore[arg-type] - phone_region=phone_region, - currency_decimal="dot" if currency_decimal.startswith("dot") else "comma", - currency_decimals=None if preserve_decimals else int(currency_decimals), - currency_preserve_code=currency_preserve_code, - name_case=name_case_map[name_case_label], # type: ignore[arg-type] - boolean_style=boolean_style, # type: ignore[arg-type] - extra_abbreviations=extra_abbreviations, -) +with st.expander("Options", expanded=not _has_result): + st.subheader("Column types") + st.caption( + "Assign each column to a field type. Auto-detected suggestions are " + "pre-filled; pick **(skip)** to leave a column untouched." 
+ ) + + _FIELD_LABELS = { + "(skip)": None, + "Date": FieldType.DATE, + "Phone": FieldType.PHONE, + "Currency": FieldType.CURRENCY, + "Name": FieldType.NAME, + "Address": FieldType.ADDRESS, + "Boolean": FieldType.BOOLEAN, + } + _LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()} + _LABELS = list(_FIELD_LABELS.keys()) + + sample_size = min(len(df), 200) + sample_df = df.head(sample_size) + + cols_per_row = 3 + columns_iter = list(df.columns) + for i in range(0, len(columns_iter), cols_per_row): + cols_block = st.columns(cols_per_row) + for j, col_name in enumerate(columns_iter[i:i + cols_per_row]): + with cols_block[j]: + detected = _detect_field_type(col_name, sample_df[col_name].tolist()) + default_label = _LABEL_BY_TYPE.get(detected, "(skip)") + chosen = st.selectbox( + col_name, + _LABELS, + index=_LABELS.index(default_label), + key=f"fmtstd_type__{col_name}", + ) + ft = _FIELD_LABELS[chosen] + if ft is not None: + column_types[col_name] = ft + + st.divider() + st.subheader("Format options") + + # --------------------------------------------------------------------------- + # Preset bundle picker + # --------------------------------------------------------------------------- + # + # Picking a preset rewrites every option below to that preset's defaults. + # It does NOT touch column-type assignments — those are user-driven and + # orthogonal. To make the rewrite stick across the rerun, we clear the + # per-option session keys on a preset switch; the widgets below then take + # the preset's values through their ``index``/``value`` defaults. 
+ + _PRESET_LABELS = { + "us-default": "US (default) — ISO 8601 dates · E.164 phones · USD", + "european": "European — DMY input · INTL phones · EUR comma decimal", + "uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans", + "iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false", + "legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No", + "custom": "Custom — keep current settings", + } + + preset_choice = st.radio( + "Standards preset", + list(_PRESET_LABELS.keys()), + format_func=lambda k: _PRESET_LABELS[k], + index=0, + horizontal=False, + key="fmtstd_preset", + help=( + "Pick a published standard or regional convention as the baseline. " + "Every option below is still individually overridable; choose " + "**Custom** to keep whatever you've manually adjusted." + ), + ) + + # Detect a preset switch since the last rerun; when it changes (and the + # new choice isn't ``custom``), purge the dependent widget keys so + # Streamlit lets their ``index=``/``value=`` defaults take effect on the + # new render. Without this clear, prior session_state pins the widget to + # the previous preset's choice and the apparent picker becomes a no-op. + _DEPENDENT_KEYS = [ + "fmtstd_date_format", "fmtstd_date_order", + "fmtstd_phone_format", "fmtstd_phone_region", + "fmtstd_currency_decimal", "fmtstd_currency_decimals", + "fmtstd_currency_preserve", "fmtstd_currency_preserve_code", + "fmtstd_name_case", "fmtstd_bool_style", + ] + _last = st.session_state.get("fmtstd_preset_last") + if _last != preset_choice: + st.session_state["fmtstd_preset_last"] = preset_choice + if preset_choice != "custom": + for k in _DEPENDENT_KEYS: + st.session_state.pop(k, None) + st.rerun() + + # Map preset → widget-state defaults. Done as labels so the radios/selects + # below pick up the right index without us re-implementing each map twice. 
+ _PRESET_TO_WIDGETS: dict[str, dict[str, str]] = { + "us-default": { + "date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)", + "phone_format": "E.164 (+15551234567)", "phone_region": "US", + "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, + "currency_preserve_code": False, + "name_case": "Title Case", "boolean_style": "True/False", + }, + "european": { + "date_format": "YYYY-MM-DD (ISO)", "date_order": "DMY (EU)", + "phone_format": "International (+1 555-123-4567)", "phone_region": "DE", + "currency_decimal": "comma (1.234,56)", "currency_decimals": 2, + "currency_preserve_code": True, + "name_case": "Title Case", "boolean_style": "True/False", + }, + "uk": { + "date_format": "DD/MM/YYYY", "date_order": "DMY (EU)", + "phone_format": "International (+1 555-123-4567)", "phone_region": "GB", + "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, + "currency_preserve_code": False, + "name_case": "Title Case", "boolean_style": "Yes/No", + }, + "iso-strict": { + "date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)", + "phone_format": "E.164 (+15551234567)", "phone_region": "US", + "currency_decimal": "dot (1,234.56)", "currency_decimals": 0, + "currency_preserve_code": True, + "name_case": "Title Case", "boolean_style": "true/false", + }, + "legacy-us": { + "date_format": "MM/DD/YYYY", "date_order": "MDY (US)", + "phone_format": "National ((555) 123-4567)", "phone_region": "US", + "currency_decimal": "dot (1,234.56)", "currency_decimals": 2, + "currency_preserve_code": False, + "name_case": "Title Case", "boolean_style": "Yes/No", + }, + } + + # ``iso-strict`` wants currency with no rounding; the GUI exposes that via + # the "preserve original precision" checkbox rather than a sentinel value + # in the number-input. Map that here. 
+ _PRESET_PRESERVE_DECIMALS: dict[str, bool] = { + "iso-strict": True, + } + + + def _preset_default(key: str, fallback): + """Pull the preset-driven default for *key*, or *fallback* on Custom.""" + if preset_choice == "custom": + return fallback + return _PRESET_TO_WIDGETS[preset_choice].get(key, fallback) + + + opt_cols = st.columns(2) + with opt_cols[0]: + st.markdown("**Dates**") + _DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"] + date_format_label = st.selectbox( + "Output format", + _DATE_LABELS, + index=_DATE_LABELS.index(_preset_default("date_format", "YYYY-MM-DD (ISO)")), + key="fmtstd_date_format", + ) + date_format_map = { + "YYYY-MM-DD (ISO)": "%Y-%m-%d", + "MM/DD/YYYY": "%m/%d/%Y", + "DD/MM/YYYY": "%d/%m/%Y", + "DD-Mon-YYYY": "%d-%b-%Y", + "Mon DD, YYYY": "%b %d, %Y", + } + _DATE_ORDER_LABELS = ["MDY (US)", "DMY (EU)"] + date_order = st.radio( + "Ambiguous input order (e.g. 01/02/2024)", + _DATE_ORDER_LABELS, + index=_DATE_ORDER_LABELS.index(_preset_default("date_order", "MDY (US)")), + horizontal=True, + key="fmtstd_date_order", + ) + + st.markdown("**Phones**") + _PHONE_LABELS = [ + "E.164 (+15551234567)", "International (+1 555-123-4567)", + "National ((555) 123-4567)", "Digits only", + ] + phone_format_label = st.selectbox( + "Output format", + _PHONE_LABELS, + index=_PHONE_LABELS.index(_preset_default("phone_format", "E.164 (+15551234567)")), + key="fmtstd_phone_format", + ) + phone_format_map = { + "E.164 (+15551234567)": "E164", + "International (+1 555-123-4567)": "INTERNATIONAL", + "National ((555) 123-4567)": "NATIONAL", + "Digits only": "DIGITS", + } + phone_region = st.text_input( + "Default region (ISO-2)", + value=_preset_default("phone_region", "US"), + max_chars=2, + help="Region used when the input has no country code. 
``US``, ``GB``, ``DE``, etc.", + key="fmtstd_phone_region", + ).upper() or "US" + + with opt_cols[1]: + st.markdown("**Currency**") + _CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"] + currency_decimal = st.radio( + "Decimal separator in input", + _CURR_DECIMAL_LABELS, + index=_CURR_DECIMAL_LABELS.index(_preset_default("currency_decimal", "dot (1,234.56)")), + horizontal=True, + key="fmtstd_currency_decimal", + ) + currency_decimals = st.number_input( + "Round to decimals", + min_value=0, max_value=8, + value=int(_preset_default("currency_decimals", 2)), + step=1, + key="fmtstd_currency_decimals", + ) + preserve_decimals = st.checkbox( + "Preserve original precision (don't round)", + value=_PRESET_PRESERVE_DECIMALS.get(preset_choice, False), + key="fmtstd_currency_preserve", + ) + currency_preserve_code = st.checkbox( + "Preserve currency code (emit `USD 1234.56`, `EUR 99.00`, etc.)", + value=bool(_preset_default("currency_preserve_code", False)), + help=( + "Detects an ISO 4217 code or symbol in the input ($/€/£/¥/USD/" + "EUR/...) and re-emits it as a space-separated prefix on the " + "standardized number. Cells without a currency marker emit " + "just the number." 
+ ), + key="fmtstd_currency_preserve_code", + ) + + st.markdown("**Names**") + _NAME_CASE_LABELS = ["Title Case", "UPPER", "lower"] + name_case_label = st.selectbox( + "Casing", + _NAME_CASE_LABELS, + index=_NAME_CASE_LABELS.index(_preset_default("name_case", "Title Case")), + key="fmtstd_name_case", + ) + name_case_map = {"Title Case": "title", "UPPER": "upper", "lower": "lower"} + + st.markdown("**Booleans**") + _BOOL_LABELS = ["True/False", "true/false", "Yes/No", "Y/N", "1/0"] + boolean_style = st.selectbox( + "Output style", + _BOOL_LABELS, + index=_BOOL_LABELS.index(_preset_default("boolean_style", "True/False")), + key="fmtstd_bool_style", + ) + + # --------------------------------------------------------------------------- + # Address abbreviations — built-in USPS table is editable + # --------------------------------------------------------------------------- + # + # Users with international addresses (German Strasse, Spanish-language + # Avenida, French Boulevard variants) need to override the built-in + # table. Show it in a data_editor so the override is visible — the table + # is small, this is the right surface. + + if any(ft == FieldType.ADDRESS for ft in column_types.values()): + with st.expander("Custom address abbreviations (advanced)", expanded=False): + st.caption( + "Add or override entries in the address abbreviation table. " + "Each row maps a short form (case-insensitive, periods OK) to " + "the long form the standardizer should emit. Built-in USPS " + "Pub. 28 entries (`St` → `Street`, `Ave` → `Avenue`, …) apply " + "automatically; rows here merge on top and can override them." 
+ ) + starter = pd.DataFrame( + [ + {"abbreviation": "", "expansion": ""}, + {"abbreviation": "", "expansion": ""}, + {"abbreviation": "", "expansion": ""}, + ] + ) + edited = st.data_editor( + starter, + num_rows="dynamic", + use_container_width=True, + column_config={ + "abbreviation": st.column_config.TextColumn( + "Short form", + help="Case-insensitive, trailing period optional. e.g. ``Strasse``", + ), + "expansion": st.column_config.TextColumn( + "Long form", + help="What the standardizer emits. e.g. ``Straße``", + ), + }, + key="fmtstd_extra_abbrev", + ) + for _, row in edited.iterrows(): + k = str(row.get("abbreviation") or "").strip() + v = str(row.get("expansion") or "").strip() + if k and v: + extra_abbreviations[k] = v + if extra_abbreviations: + st.success( + f"{len(extra_abbreviations)} custom mapping(s) will merge " + "with the built-in table." + ) + + options = StandardizeOptions( + column_types=column_types, + date_output_format=date_format_map[date_format_label], + date_order="MDY" if date_order.startswith("MDY") else "DMY", + phone_format=phone_format_map[phone_format_label], # type: ignore[arg-type] + phone_region=phone_region, + currency_decimal="dot" if currency_decimal.startswith("dot") else "comma", + currency_decimals=None if preserve_decimals else int(currency_decimals), + currency_preserve_code=currency_preserve_code, + name_case=name_case_map[name_case_label], # type: ignore[arg-type] + boolean_style=boolean_style, # type: ignore[arg-type] + extra_abbreviations=extra_abbreviations, + ) # --------------------------------------------------------------------------- @@ -528,6 +539,14 @@ if st.button( st.stop() st.session_state["fmtstd_result"] = result st.session_state["fmtstd_input_name"] = uploaded.name + # One-shot flag picked up on the next pass to scroll the parent + # document to the Results anchor (see scroll snippet below). 
+ st.session_state["_fmtstd_scroll_to_results"] = True + # Force a second rerun so the preview and options expanders see + # the new result on the NEXT script pass and collapse themselves. + # Without this they stay expanded until the user touches any + # other widget. + st.rerun() result = st.session_state.get("fmtstd_result") if result is None: @@ -538,6 +557,16 @@ if result is None: # Results # --------------------------------------------------------------------------- +# Anchor target for the auto-scroll snippet at the end of this block. +# A bare ``
`` survives Streamlit's HTML sanitizer (only +# `` + """, + height=0, + ) diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py index c49ed7a..5701219 100644 --- a/src/gui/pages/4_Missing_Values.py +++ b/src/gui/pages/4_Missing_Values.py @@ -95,175 +95,186 @@ except Exception as e: ) st.stop() -st.subheader(f"Preview: {uploaded.name}") -st.caption(f"{len(df)} rows, {len(df.columns)} columns") -st.dataframe(df.head(10), use_container_width=True) +# Collapse the input preview + options once the user has clicked +# Handle Missing Values so the Results section below is the primary +# visual focus. The user can re-expand to re-inspect the source rows +# or tweak strategy and rerun. +_has_result = st.session_state.get("missing_result") is not None + +with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) st.divider() # --------------------------------------------------------------------------- -# Initial profile (read-only) +# Options (Missingness profile + Strategy) # --------------------------------------------------------------------------- +# +# Wrapped in an outer expander whose default state mirrors the preview +# expander above: open before a result exists, folded once the user has +# clicked Handle Missing Values. The Missingness profile lives inside +# this expander too — after a run the Results section shows a richer +# before-vs-after comparison that supersedes the static input profile, +# so keeping it tucked away with the controls cleanly pushes Results +# to the top of the visible area. 
-st.subheader("Missingness profile") +with st.expander("Options", expanded=not _has_result): + st.subheader("Missingness profile") -initial_profile = profile_missing(df, MissingOptions()) -prof_df = initial_profile.to_dataframe() + initial_profile = profile_missing(df, MissingOptions()) + prof_df = initial_profile.to_dataframe() -m1, m2, m3, m4 = st.columns(4) -m1.metric("Rows", initial_profile.rows_total) -m2.metric("Cells missing", initial_profile.cells_missing) -m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%") -m4.metric("Complete rows", initial_profile.rows_complete) + m1, m2, m3, m4 = st.columns(4) + m1.metric("Rows", initial_profile.rows_total) + m2.metric("Cells missing", initial_profile.cells_missing) + m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%") + m4.metric("Complete rows", initial_profile.rows_complete) -st.dataframe(prof_df, use_container_width=True, hide_index=True) + st.dataframe(prof_df, use_container_width=True, hide_index=True) -if initial_profile.cells_missing == 0: - st.success("No missing values or disguised nulls detected. Nothing to handle.") + if initial_profile.cells_missing == 0: + st.success("No missing values or disguised nulls detected. Nothing to handle.") -st.divider() + st.divider() -# --------------------------------------------------------------------------- -# Options -# --------------------------------------------------------------------------- + st.subheader("Strategy") -st.subheader("Strategy") + preset_label = st.radio( + "Preset", + [ + "detect-only (standardize sentinels to NaN, no fill or drop)", + "safe-fill (numeric → median, categorical → mode)", + "drop-incomplete (drop any row with missing)", + ], + index=0, + help=( + "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. " + "safe-fill: also fill — numeric columns with median, others with mode. " + "drop-incomplete: also drop every row that has any missing cell." 
+ ), + ) + preset_key = preset_label.split(" ", 1)[0] + options = MissingOptions.from_preset(preset_key) -preset_label = st.radio( - "Preset", - [ - "detect-only (standardize sentinels to NaN, no fill or drop)", - "safe-fill (numeric → median, categorical → mode)", - "drop-incomplete (drop any row with missing)", - ], - index=0, - help=( - "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. " - "safe-fill: also fill — numeric columns with median, others with mode. " - "drop-incomplete: also drop every row that has any missing cell." - ), -) -preset_key = preset_label.split(" ", 1)[0] -options = MissingOptions.from_preset(preset_key) + with st.expander("Advanced options"): + col_a, col_b = st.columns(2) -with st.expander("Advanced options"): - col_a, col_b = st.columns(2) - - with col_a: - st.markdown("**Detection**") - options.standardize_sentinels = st.checkbox( - "Standardize disguised nulls to NaN", - value=options.standardize_sentinels, - help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.", - ) - sentinels_text = st.text_input( - "Sentinel values (comma-separated)", - value=", ".join(options.sentinels), - disabled=not options.standardize_sentinels, - help="Matched case-insensitively after stripping whitespace.", - ) - options.sentinels = [ - s.strip() for s in sentinels_text.split(",") if s.strip() - ] - - with col_b: - st.markdown("**Strategy override**") - strat_options = [ - "(use preset)", - "none", "drop_row", "drop_col", "drop_both", - "mean", "median", "mode", "constant", - "ffill", "bfill", "interpolate", - ] - strat_choice = st.selectbox( - "Global strategy", - strat_options, - index=0, - help=( - "drop_row / drop_col use the thresholds below. " - "mean / median / interpolate are numeric only — non-numeric " - "columns fall back to the categorical strategy." 
- ), - ) - if strat_choice != "(use preset)": - options.strategy = strat_choice # type: ignore[assignment] - - cat_strat = st.selectbox( - "Categorical fallback (for non-numeric columns)", - ["mode", "constant", "ffill", "bfill", "none"], - index=0, - ) - options.categorical_strategy = cat_strat # type: ignore[assignment] - - if options.strategy == "constant" or cat_strat == "constant": - fill_val = st.text_input( - "Constant fill value", - value="", - help="Used when strategy = constant. Leave blank to fill with empty string.", + with col_a: + st.markdown("**Detection**") + options.standardize_sentinels = st.checkbox( + "Standardize disguised nulls to NaN", + value=options.standardize_sentinels, + help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.", ) - options.fill_value = fill_val + sentinels_text = st.text_input( + "Sentinel values (comma-separated)", + value=", ".join(options.sentinels), + disabled=not options.standardize_sentinels, + help="Matched case-insensitively after stripping whitespace.", + ) + options.sentinels = [ + s.strip() for s in sentinels_text.split(",") if s.strip() + ] - st.markdown("**Drop thresholds**") - col_c, col_d = st.columns(2) - with col_c: - options.row_drop_threshold = st.slider( - "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)", - 0.0, 1.0, options.row_drop_threshold, 0.05, - ) - with col_d: - options.col_drop_threshold = st.slider( - "Column drop threshold (drop columns with ≥ this fraction missing)", - 0.0, 1.0, options.col_drop_threshold, 0.05, - ) - - st.markdown("**Scope**") - selected_cols = st.multiselect( - "Columns to handle (default: all)", - options=list(df.columns), - default=list(df.columns), - ) - skip_cols = st.multiselect( - "Columns to skip", - options=list(df.columns), - default=[], - ) - options.columns = selected_cols if selected_cols else None - options.skip_columns = list(skip_cols) - - st.markdown("**Per-column strategy overrides** (optional)") - 
st.caption( - "Set a different strategy for specific columns. Leave any row blank to " - "use the global strategy." - ) - per_col_overrides: dict[str, str] = {} - only_missing_cols = [ - r.column for r in initial_profile.columns if r.has_missing - ] - if only_missing_cols: - edit_df = pd.DataFrame({ - "column": only_missing_cols, - "strategy": ["" for _ in only_missing_cols], - }) - edited = st.data_editor( - edit_df, - use_container_width=True, - hide_index=True, - column_config={ - "column": st.column_config.TextColumn("Column", disabled=True), - "strategy": st.column_config.SelectboxColumn( - "Override", - options=[ - "", "drop_row", "drop_col", - "mean", "median", "mode", "constant", - "ffill", "bfill", "interpolate", - ], + with col_b: + st.markdown("**Strategy override**") + strat_options = [ + "(use preset)", + "none", "drop_row", "drop_col", "drop_both", + "mean", "median", "mode", "constant", + "ffill", "bfill", "interpolate", + ] + strat_choice = st.selectbox( + "Global strategy", + strat_options, + index=0, + help=( + "drop_row / drop_col use the thresholds below. " + "mean / median / interpolate are numeric only — non-numeric " + "columns fall back to the categorical strategy." ), - }, - key="missing_per_col_editor", + ) + if strat_choice != "(use preset)": + options.strategy = strat_choice # type: ignore[assignment] + + cat_strat = st.selectbox( + "Categorical fallback (for non-numeric columns)", + ["mode", "constant", "ffill", "bfill", "none"], + index=0, + ) + options.categorical_strategy = cat_strat # type: ignore[assignment] + + if options.strategy == "constant" or cat_strat == "constant": + fill_val = st.text_input( + "Constant fill value", + value="", + help="Used when strategy = constant. 
Leave blank to fill with empty string.", + ) + options.fill_value = fill_val + + st.markdown("**Drop thresholds**") + col_c, col_d = st.columns(2) + with col_c: + options.row_drop_threshold = st.slider( + "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)", + 0.0, 1.0, options.row_drop_threshold, 0.05, + ) + with col_d: + options.col_drop_threshold = st.slider( + "Column drop threshold (drop columns with ≥ this fraction missing)", + 0.0, 1.0, options.col_drop_threshold, 0.05, + ) + + st.markdown("**Scope**") + selected_cols = st.multiselect( + "Columns to handle (default: all)", + options=list(df.columns), + default=list(df.columns), ) - for _, row in edited.iterrows(): - if row["strategy"]: - per_col_overrides[row["column"]] = row["strategy"] - options.column_strategies = per_col_overrides # type: ignore[assignment] + skip_cols = st.multiselect( + "Columns to skip", + options=list(df.columns), + default=[], + ) + options.columns = selected_cols if selected_cols else None + options.skip_columns = list(skip_cols) + + st.markdown("**Per-column strategy overrides** (optional)") + st.caption( + "Set a different strategy for specific columns. Leave any row blank to " + "use the global strategy." 
+ ) + per_col_overrides: dict[str, str] = {} + only_missing_cols = [ + r.column for r in initial_profile.columns if r.has_missing + ] + if only_missing_cols: + edit_df = pd.DataFrame({ + "column": only_missing_cols, + "strategy": ["" for _ in only_missing_cols], + }) + edited = st.data_editor( + edit_df, + use_container_width=True, + hide_index=True, + column_config={ + "column": st.column_config.TextColumn("Column", disabled=True), + "strategy": st.column_config.SelectboxColumn( + "Override", + options=[ + "", "drop_row", "drop_col", + "mean", "median", "mode", "constant", + "ffill", "bfill", "interpolate", + ], + ), + }, + key="missing_per_col_editor", + ) + for _, row in edited.iterrows(): + if row["strategy"]: + per_col_overrides[row["column"]] = row["strategy"] + options.column_strategies = per_col_overrides # type: ignore[assignment] # --------------------------------------------------------------------------- # Run @@ -282,6 +293,14 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True): st.session_state["missing_result"] = result st.session_state["missing_input_name"] = uploaded.name st.session_state["missing_options"] = options.to_dict() + # One-shot flag picked up on the next pass to scroll the parent + # document to the Results anchor (see scroll snippet below). + st.session_state["_missing_scroll_to_results"] = True + # Force a second rerun so the preview and options expanders see + # the new result on the NEXT script pass and collapse themselves. + # Without this they stay expanded until the user touches any + # other widget. + st.rerun() result = st.session_state.get("missing_result") if result is None: @@ -292,6 +311,16 @@ if result is None: # Results # --------------------------------------------------------------------------- +# Anchor target for the auto-scroll snippet at the end of this block. +# A bare ``
`` survives Streamlit's HTML sanitizer (only +# `` + """, + height=0, + ) diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py index 84ca1e4..b6edda5 100644 --- a/src/gui/pages/5_Column_Mapper.py +++ b/src/gui/pages/5_Column_Mapper.py @@ -88,224 +88,240 @@ except Exception as e: ) st.stop() -st.subheader(f"Preview: {uploaded.name}") -st.caption(f"{len(df)} rows, {len(df.columns)} columns") -st.dataframe(df.head(10), use_container_width=True) +# Collapse the input preview once the user has clicked Apply Column +# Mapping so the Results section below is the primary visual focus. +# The user can re-expand the expander to re-inspect the source rows. +_has_result = st.session_state.get("colmap_result") is not None + +with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) st.divider() # --------------------------------------------------------------------------- -# Schema input +# Options (Target schema + Strategy + Mapping) # --------------------------------------------------------------------------- +# +# Wrapped in an outer expander whose default state mirrors the preview +# expander above: open before a result exists, folded once the user has +# clicked Apply Column Mapping. The Mapping editor is the heart of the +# tool, but per the Text Cleaner pattern we still collapse everything +# post-run — the user can re-expand to tweak any of the three sections. 
-st.subheader("Target schema") +with st.expander("Options", expanded=not _has_result): + # ----------------------------------------------------------------------- + # Schema input + # ----------------------------------------------------------------------- -schema_mode = st.radio( - "How would you like to define the target schema?", - [ - "Build interactively (start from current columns)", - "Upload schema JSON", - "Skip (rename / coerce only — no schema)", - ], - index=0, - help=( - "An interactive build is fastest for one-off cleanup. Upload a JSON " - "when you have a fixed contract (a CRM import format, db schema). " - "Skip when you only want to rename or coerce specific columns." - ), -) + st.subheader("Target schema") -schema: TargetSchema | None = None - -if schema_mode.startswith("Upload"): - schema_file = st.file_uploader( - "Schema JSON", - type=["json"], - key="colmap_schema_upload", - help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}', + schema_mode = st.radio( + "How would you like to define the target schema?", + [ + "Build interactively (start from current columns)", + "Upload schema JSON", + "Skip (rename / coerce only — no schema)", + ], + index=0, + help=( + "An interactive build is fastest for one-off cleanup. Upload a JSON " + "when you have a fixed contract (a CRM import format, db schema). " + "Skip when you only want to rename or coerce specific columns." + ), ) - if schema_file is not None: - try: - schema = TargetSchema.from_dict(json.loads(schema_file.getvalue())) - st.success(f"Loaded {len(schema.fields)} target field(s).") - except Exception as e: - from src.core.errors import format_for_user - st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```") -elif schema_mode.startswith("Build"): - st.caption( - "Edit the table to define your target schema. 
Add rows for fields the " - "input doesn't have yet (with a default), or remove rows for columns " - "you want to drop." + schema: TargetSchema | None = None + + if schema_mode.startswith("Upload"): + schema_file = st.file_uploader( + "Schema JSON", + type=["json"], + key="colmap_schema_upload", + help='Format: {"fields": [{"name": "email", "dtype": "string", "required": true, "aliases": ["EmailAddr"]}, ...]}', + ) + if schema_file is not None: + try: + schema = TargetSchema.from_dict(json.loads(schema_file.getvalue())) + st.success(f"Loaded {len(schema.fields)} target field(s).") + except Exception as e: + from src.core.errors import format_for_user + st.error(f"**Could not parse schema**\n\n```\n{format_for_user(e)}\n```") + + elif schema_mode.startswith("Build"): + st.caption( + "Edit the table to define your target schema. Add rows for fields the " + "input doesn't have yet (with a default), or remove rows for columns " + "you want to drop." + ) + initial = pd.DataFrame({ + "name": list(df.columns), + "dtype": ["auto"] * len(df.columns), + "required": [False] * len(df.columns), + "default": [""] * len(df.columns), + "aliases": [""] * len(df.columns), + }) + edited = st.data_editor( + initial, + use_container_width=True, + num_rows="dynamic", + column_config={ + "name": st.column_config.TextColumn("Target name"), + "dtype": st.column_config.SelectboxColumn( + "Type", + options=[ + "auto", "string", "integer", "float", + "boolean", "date", "datetime", "category", + ], + ), + "required": st.column_config.CheckboxColumn("Required"), + "default": st.column_config.TextColumn("Default (for added cols)"), + "aliases": st.column_config.TextColumn( + "Aliases (comma-sep, helps fuzzy-match)", + ), + }, + key="colmap_schema_editor", + ) + fields: list[TargetField] = [] + for _, row in edited.iterrows(): + name = str(row.get("name", "")).strip() + if not name: + continue + aliases = [ + a.strip() for a in str(row.get("aliases", "") or "").split(",") + if a.strip() + ] + 
default_raw = row.get("default") + default_val = ( + default_raw if (default_raw not in (None, "", float("nan"))) + else None + ) + try: + if isinstance(default_val, float) and pd.isna(default_val): + default_val = None + except TypeError: + pass + fields.append(TargetField( + name=name, + dtype=str(row.get("dtype", "auto")), # type: ignore[arg-type] + required=bool(row.get("required", False)), + aliases=aliases, + default=default_val, + )) + if fields: + schema = TargetSchema(fields=fields) + + st.divider() + + # ----------------------------------------------------------------------- + # Strategy + # ----------------------------------------------------------------------- + + st.subheader("Strategy") + + preset_label = st.radio( + "Preset", + [ + "rename-only (just rename, leave types alone, keep extras)", + "lenient-schema (rename + coerce + reorder, keep extras)", + "strict-schema (rename + coerce + reorder, drop extras)", + ], + index=0, ) - initial = pd.DataFrame({ - "name": list(df.columns), - "dtype": ["auto"] * len(df.columns), - "required": [False] * len(df.columns), - "default": [""] * len(df.columns), - "aliases": [""] * len(df.columns), - }) - edited = st.data_editor( - initial, - use_container_width=True, - num_rows="dynamic", - column_config={ - "name": st.column_config.TextColumn("Target name"), - "dtype": st.column_config.SelectboxColumn( - "Type", - options=[ - "auto", "string", "integer", "float", - "boolean", "date", "datetime", "category", - ], - ), - "required": st.column_config.CheckboxColumn("Required"), - "default": st.column_config.TextColumn("Default (for added cols)"), - "aliases": st.column_config.TextColumn( - "Aliases (comma-sep, helps fuzzy-match)", - ), - }, - key="colmap_schema_editor", - ) - fields: list[TargetField] = [] - for _, row in edited.iterrows(): - name = str(row.get("name", "")).strip() - if not name: - continue - aliases = [ - a.strip() for a in str(row.get("aliases", "") or "").split(",") - if a.strip() - ] - 
default_raw = row.get("default") - default_val = ( - default_raw if (default_raw not in (None, "", float("nan"))) - else None + preset_key = preset_label.split(" ", 1)[0] + options = MapOptions.from_preset(preset_key) + options.schema = schema + + with st.expander("Advanced options"): + col_a, col_b = st.columns(2) + with col_a: + options.unmapped = st.selectbox( # type: ignore[assignment] + "Unmapped source columns", + ["keep", "drop", "error"], + index=["keep", "drop", "error"].index(options.unmapped), + ) + options.coerce_types = st.checkbox( + "Coerce types per schema", value=options.coerce_types, + ) + options.reorder_to_schema = st.checkbox( + "Reorder to schema order", value=options.reorder_to_schema, + ) + with col_b: + options.auto_infer = st.checkbox( + "Auto-infer mapping (fuzzy match)", value=options.auto_infer, + ) + options.fuzzy_threshold = st.slider( + "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05, + ) + options.enforce_required = st.checkbox( + "Enforce required fields", value=options.enforce_required, + ) + + # ----------------------------------------------------------------------- + # Mapping editor — show inferred and let user override + # ----------------------------------------------------------------------- + + st.subheader("Mapping") + + if schema is None: + st.caption( + "No schema — define explicit renames below (left blank means keep " + "the source name)." 
) - try: - if isinstance(default_val, float) and pd.isna(default_val): - default_val = None - except TypeError: - pass - fields.append(TargetField( - name=name, - dtype=str(row.get("dtype", "auto")), # type: ignore[arg-type] - required=bool(row.get("required", False)), - aliases=aliases, - default=default_val, - )) - if fields: - schema = TargetSchema(fields=fields) - -st.divider() - -# --------------------------------------------------------------------------- -# Strategy -# --------------------------------------------------------------------------- - -st.subheader("Strategy") - -preset_label = st.radio( - "Preset", - [ - "rename-only (just rename, leave types alone, keep extras)", - "lenient-schema (rename + coerce + reorder, keep extras)", - "strict-schema (rename + coerce + reorder, drop extras)", - ], - index=0, -) -preset_key = preset_label.split(" ", 1)[0] -options = MapOptions.from_preset(preset_key) -options.schema = schema - -with st.expander("Advanced options"): - col_a, col_b = st.columns(2) - with col_a: - options.unmapped = st.selectbox( # type: ignore[assignment] - "Unmapped source columns", - ["keep", "drop", "error"], - index=["keep", "drop", "error"].index(options.unmapped), + rename_initial = pd.DataFrame({ + "source": list(df.columns), + "target": list(df.columns), + }) + rename_edited = st.data_editor( + rename_initial, + use_container_width=True, + column_config={ + "source": st.column_config.TextColumn("Source", disabled=True), + "target": st.column_config.TextColumn("Target"), + }, + hide_index=True, + key="colmap_rename_only_editor", ) - options.coerce_types = st.checkbox( - "Coerce types per schema", value=options.coerce_types, + explicit_mapping: dict[str, str] = {} + for _, row in rename_edited.iterrows(): + src = str(row["source"]) + tgt = str(row["target"]).strip() + if tgt and tgt != src: + explicit_mapping[src] = tgt + options.mapping = explicit_mapping + else: + inferred = ( + infer_mapping(df, schema, 
threshold=options.fuzzy_threshold) + if options.auto_infer else {} ) - options.reorder_to_schema = st.checkbox( - "Reorder to schema order", value=options.reorder_to_schema, + target_options = ["(unmapped)"] + schema.field_names() + map_initial = pd.DataFrame({ + "source": list(df.columns), + "target": [inferred.get(c, "(unmapped)") for c in df.columns], + "auto": [c in inferred for c in df.columns], + }) + map_edited = st.data_editor( + map_initial, + use_container_width=True, + column_config={ + "source": st.column_config.TextColumn("Source", disabled=True), + "target": st.column_config.SelectboxColumn( + "Target", options=target_options, + ), + "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True), + }, + hide_index=True, + key="colmap_schema_mapping_editor", ) - with col_b: - options.auto_infer = st.checkbox( - "Auto-infer mapping (fuzzy match)", value=options.auto_infer, - ) - options.fuzzy_threshold = st.slider( - "Fuzzy match threshold", 0.0, 1.0, options.fuzzy_threshold, 0.05, - ) - options.enforce_required = st.checkbox( - "Enforce required fields", value=options.enforce_required, - ) - -# --------------------------------------------------------------------------- -# Mapping editor — show inferred and let user override -# --------------------------------------------------------------------------- - -st.subheader("Mapping") - -if schema is None: - st.caption( - "No schema — define explicit renames below (left blank means keep " - "the source name)." 
- ) - rename_initial = pd.DataFrame({ - "source": list(df.columns), - "target": list(df.columns), - }) - rename_edited = st.data_editor( - rename_initial, - use_container_width=True, - column_config={ - "source": st.column_config.TextColumn("Source", disabled=True), - "target": st.column_config.TextColumn("Target"), - }, - hide_index=True, - key="colmap_rename_only_editor", - ) - explicit_mapping: dict[str, str] = {} - for _, row in rename_edited.iterrows(): - src = str(row["source"]) - tgt = str(row["target"]).strip() - if tgt and tgt != src: - explicit_mapping[src] = tgt - options.mapping = explicit_mapping -else: - inferred = ( - infer_mapping(df, schema, threshold=options.fuzzy_threshold) - if options.auto_infer else {} - ) - target_options = ["(unmapped)"] + schema.field_names() - map_initial = pd.DataFrame({ - "source": list(df.columns), - "target": [inferred.get(c, "(unmapped)") for c in df.columns], - "auto": [c in inferred for c in df.columns], - }) - map_edited = st.data_editor( - map_initial, - use_container_width=True, - column_config={ - "source": st.column_config.TextColumn("Source", disabled=True), - "target": st.column_config.SelectboxColumn( - "Target", options=target_options, - ), - "auto": st.column_config.CheckboxColumn("Auto-suggested", disabled=True), - }, - hide_index=True, - key="colmap_schema_mapping_editor", - ) - explicit_mapping = {} - for _, row in map_edited.iterrows(): - src = str(row["source"]) - tgt = str(row["target"]) - if tgt and tgt != "(unmapped)": - explicit_mapping[src] = tgt - options.mapping = explicit_mapping - # Disable auto-infer for the actual run since the editor already shows - # the user's resolved choices (they can manually re-select to add). 
- options.auto_infer = False + explicit_mapping = {} + for _, row in map_edited.iterrows(): + src = str(row["source"]) + tgt = str(row["target"]) + if tgt and tgt != "(unmapped)": + explicit_mapping[src] = tgt + options.mapping = explicit_mapping + # Disable auto-infer for the actual run since the editor already shows + # the user's resolved choices (they can manually re-select to add). + options.auto_infer = False # --------------------------------------------------------------------------- # Run @@ -324,6 +340,12 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True): st.session_state["colmap_result"] = result st.session_state["colmap_input_name"] = uploaded.name st.session_state["colmap_options"] = options.to_dict() + # One-shot flag picked up on the next pass to scroll the parent + # document to the Results anchor (see scroll snippet below). + st.session_state["_colmap_scroll_to_results"] = True + # Force a second rerun so the preview and options expanders see + # the new result on the NEXT script pass and collapse themselves. + st.rerun() result = st.session_state.get("colmap_result") if result is None: @@ -334,6 +356,16 @@ if result is None: # Results # --------------------------------------------------------------------------- +# Anchor target for the auto-scroll snippet at the end of this block. +# A bare ``
`` survives Streamlit's HTML sanitizer (only +# `` + """, + height=0, + ) diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py index c260d0c..9bafbfa 100644 --- a/src/gui/pages/9_Pipeline_Runner.py +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -89,139 +89,149 @@ except Exception as e: ) st.stop() -st.subheader(f"Preview: {uploaded.name}") -st.caption(f"{len(df)} rows, {len(df.columns)} columns") -st.dataframe(df.head(10), use_container_width=True) +# Collapse the input preview and pipeline editor once the user has clicked +# Run Pipeline so the Results section below is the primary visual focus. +# The user can re-expand either expander to re-inspect or adjust. +_has_result = st.session_state.get("pipeline_result") is not None + +with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + st.divider() # --------------------------------------------------------------------------- # Pipeline builder # --------------------------------------------------------------------------- +# +# Wrapped in an outer expander whose default state mirrors the preview +# expander above: open before a result exists, folded once the user has +# clicked Run Pipeline. The pipeline editor is this page's "Options" +# section — structurally analogous to Text Cleaner's options block. 
-st.subheader("Pipeline") - -mode = st.radio( - "How would you like to define the pipeline?", - [ - "Use the recommended default (text-clean → format → missing → dedup)", - "Build interactively", - "Upload a saved pipeline JSON", - ], - index=0, -) - -if "pipeline_rows" not in st.session_state: - default = recommended_pipeline() - st.session_state["pipeline_rows"] = pd.DataFrame([ - { - "tool": s.tool, "enabled": s.enabled, - "options_json": json.dumps(s.options), - } - for s in default.steps - ]) - -if mode.startswith("Use the recommended"): - default = recommended_pipeline() - st.session_state["pipeline_rows"] = pd.DataFrame([ - { - "tool": s.tool, "enabled": s.enabled, - "options_json": json.dumps(s.options), - } - for s in default.steps - ]) -elif mode.startswith("Upload"): - pipeline_file = st.file_uploader( - "Pipeline JSON", type=["json"], key="pipeline_upload", +with st.expander("Options", expanded=not _has_result): + mode = st.radio( + "How would you like to define the pipeline?", + [ + "Use the recommended default (text-clean → format → missing → dedup)", + "Build interactively", + "Upload a saved pipeline JSON", + ], + index=0, ) - if pipeline_file is not None: + + if "pipeline_rows" not in st.session_state: + default = recommended_pipeline() + st.session_state["pipeline_rows"] = pd.DataFrame([ + { + "tool": s.tool, "enabled": s.enabled, + "options_json": json.dumps(s.options), + } + for s in default.steps + ]) + + if mode.startswith("Use the recommended"): + default = recommended_pipeline() + st.session_state["pipeline_rows"] = pd.DataFrame([ + { + "tool": s.tool, "enabled": s.enabled, + "options_json": json.dumps(s.options), + } + for s in default.steps + ]) + elif mode.startswith("Upload"): + pipeline_file = st.file_uploader( + "Pipeline JSON", type=["json"], key="pipeline_upload", + ) + if pipeline_file is not None: + try: + data = json.loads(pipeline_file.getvalue()) + uploaded_pipe = Pipeline.from_dict(data) + st.session_state["pipeline_rows"] = 
pd.DataFrame([ + { + "tool": s.tool, "enabled": s.enabled, + "options_json": json.dumps(s.options), + } + for s in uploaded_pipe.steps + ]) + st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).") + except Exception as e: + from src.core.errors import format_for_user + st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```") + + st.caption( + "Edit the table to add, remove, reorder (drag the row index), enable, " + "or configure each step. Tool order is recommended, not enforced — " + "violations surface as warnings below the table." + ) + edited = st.data_editor( + st.session_state["pipeline_rows"], + use_container_width=True, + num_rows="dynamic", + column_config={ + "tool": st.column_config.SelectboxColumn( + "Tool", options=TOOL_NAMES, required=True, + ), + "enabled": st.column_config.CheckboxColumn("Enabled"), + "options_json": st.column_config.TextColumn( + "Options (JSON)", + help='e.g. {"column_types": {"phone": "phone"}}', + ), + }, + key="pipeline_editor", + ) + st.session_state["pipeline_rows"] = edited + + # Build a Pipeline object from the editor state. 
+ steps_list: list[Step] = [] + parse_errors: list[str] = [] + for i, row in edited.iterrows(): + tool = row.get("tool") + if not tool or pd.isna(tool): + continue + raw_opts = row.get("options_json") or "{}" + if pd.isna(raw_opts): + raw_opts = "{}" try: - data = json.loads(pipeline_file.getvalue()) - uploaded_pipe = Pipeline.from_dict(data) - st.session_state["pipeline_rows"] = pd.DataFrame([ - { - "tool": s.tool, "enabled": s.enabled, - "options_json": json.dumps(s.options), - } - for s in uploaded_pipe.steps - ]) - st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).") + opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts) + if not isinstance(opts, dict): + raise ValueError("options must be a JSON object") except Exception as e: - from src.core.errors import format_for_user - st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```") + parse_errors.append(f"Step {i + 1}: {e}") + continue + try: + steps_list.append(Step( + tool=str(tool), + options=opts, + enabled=bool(row.get("enabled", True)), + )) + except Exception as e: + parse_errors.append(f"Step {i + 1}: {e}") -st.caption( - "Edit the table to add, remove, reorder (drag the row index), enable, " - "or configure each step. Tool order is recommended, not enforced — " - "violations surface as warnings below the table." -) -edited = st.data_editor( - st.session_state["pipeline_rows"], - use_container_width=True, - num_rows="dynamic", - column_config={ - "tool": st.column_config.SelectboxColumn( - "Tool", options=TOOL_NAMES, required=True, - ), - "enabled": st.column_config.CheckboxColumn("Enabled"), - "options_json": st.column_config.TextColumn( - "Options (JSON)", - help='e.g. {"column_types": {"phone": "phone"}}', - ), - }, - key="pipeline_editor", -) -st.session_state["pipeline_rows"] = edited + if parse_errors: + for err in parse_errors: + st.error(err) -# Build a Pipeline object from the editor state. 
-steps_list: list[Step] = [] -parse_errors: list[str] = [] -for i, row in edited.iterrows(): - tool = row.get("tool") - if not tool or pd.isna(tool): - continue - raw_opts = row.get("options_json") or "{}" - if pd.isna(raw_opts): - raw_opts = "{}" - try: - opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts) - if not isinstance(opts, dict): - raise ValueError("options must be a JSON object") - except Exception as e: - parse_errors.append(f"Step {i + 1}: {e}") - continue - try: - steps_list.append(Step( - tool=str(tool), - options=opts, - enabled=bool(row.get("enabled", True)), - )) - except Exception as e: - parse_errors.append(f"Step {i + 1}: {e}") + current_pipeline = Pipeline(steps=steps_list) if steps_list else None -if parse_errors: - for err in parse_errors: - st.error(err) + if current_pipeline is not None: + warnings = validate_pipeline(current_pipeline) + if warnings: + st.warning( + "Pipeline is out of recommended order:\n\n" + + "\n".join(f"- {w}" for w in warnings) + + "\n\nThe pipeline will still run — these are recommendations only." + ) -current_pipeline = Pipeline(steps=steps_list) if steps_list else None - -if current_pipeline is not None: - warnings = validate_pipeline(current_pipeline) - if warnings: - st.warning( - "Pipeline is out of recommended order:\n\n" - + "\n".join(f"- {w}" for w in warnings) - + "\n\nThe pipeline will still run — these are recommendations only." 
+ with st.expander("Recommended tool order — why each step belongs where it does"): + st.markdown( + "\n".join( + f"- **{e}** before **{l}** — {why}" + for e, l, why in SOFT_DEPENDENCIES + ) ) -with st.expander("Recommended tool order — why each step belongs where it does"): - st.markdown( - "\n".join( - f"- **{e}** before **{l}** — {why}" - for e, l, why in SOFT_DEPENDENCIES - ) - ) - st.divider() # --------------------------------------------------------------------------- @@ -274,6 +284,14 @@ if st.button( progress.progress(1.0, text="Done") st.session_state["pipeline_result"] = result st.session_state["pipeline_input_name"] = uploaded.name + # One-shot flag picked up on the next pass to scroll the parent + # document to the Results anchor (see scroll snippet at end of file). + st.session_state["_pipeline_scroll_to_results"] = True + # Force a second rerun so the preview and options expanders see + # the new result on the NEXT script pass and collapse themselves. + # Without this they stay expanded until the user touches any + # other widget. + st.rerun() result = st.session_state.get("pipeline_result") if result is None: @@ -287,6 +305,16 @@ if result is None: # Results # --------------------------------------------------------------------------- +# Anchor target for the auto-scroll snippet at the end of this block. +# A bare ``
`` survives Streamlit's HTML sanitizer (only +# `` + """, + height=0, + )