"""DataTools Fix Missing Values — Streamlit page.""" from __future__ import annotations import io import json import sys from pathlib import Path import pandas as pd import streamlit as st _project_root = Path(__file__).resolve().parent.parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.gui.components import ( back_to_home_link, render_sticky_footer, render_tool_header, hide_streamlit_chrome, html_download_button, pickup_or_upload, require_feature_or_render_upgrade, ) from src.i18n import t from src.core.missing import ( DEFAULT_SENTINELS, MissingOptions, PRESETS, handle_missing, profile_missing, ) from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() back_to_home_link() from src.audit import log_page_open log_page_open("4_Missing_Values") require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER) # --------------------------------------------------------------------------- # Header # --------------------------------------------------------------------------- render_tool_header("04_missing_handler") # --------------------------------------------------------------------------- # File upload # --------------------------------------------------------------------------- uploaded = pickup_or_upload( label="Import CSV or Excel file", key="missing_file_upload", types=["csv", "tsv", "xlsx", "xls"], ) if uploaded is None: st.info("Import a CSV, TSV, or Excel file to begin.") st.stop() @st.cache_data(show_spinner=False) def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: """Read the uploaded bytes into a DataFrame. Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing- value handling is more useful when numeric columns are typed correctly (so mean / median / interpolate work without manual coercion). Sentinel strings are still detected because they survive in object columns where any cell is non-numeric. 
""" suffix = Path(name).suffix.lower() bio = io.BytesIO(data) if suffix in (".xlsx", ".xls"): return pd.read_excel(bio) for enc in ("utf-8", "utf-8-sig", "latin-1"): try: bio.seek(0) sep = "\t" if suffix == ".tsv" else "," return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn") except UnicodeDecodeError: continue bio.seek(0) return pd.read_csv(bio, encoding="latin-1") try: df = _read_uploaded(uploaded.name, uploaded.getvalue()) except Exception as e: from src.core.errors import format_for_user st.error( f"**Could not read `{uploaded.name}`**\n\n" f"```\n{format_for_user(e)}\n```" ) st.stop() # Collapse the input preview + options once the user has clicked # Handle Missing Values so the Results section below is the primary # visual focus. The user can re-expand to re-inspect the source rows # or tweak strategy and rerun. _has_result = st.session_state.get("missing_result") is not None with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result): st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), width="stretch") st.divider() # --------------------------------------------------------------------------- # Options (Missingness profile + Strategy) # --------------------------------------------------------------------------- # # Wrapped in an outer expander whose default state mirrors the preview # expander above: open before a result exists, folded once the user has # clicked Handle Missing Values. The Missingness profile lives inside # this expander too — after a run the Results section shows a richer # before-vs-after comparison that supersedes the static input profile, # so keeping it tucked away with the controls cleanly pushes Results # to the top of the visible area. 
with st.expander("Options", expanded=not _has_result):
    st.subheader("Missingness profile")

    # Profile the input with default options, before any user choice applies.
    initial_profile = profile_missing(df, MissingOptions())
    prof_df = initial_profile.to_dataframe()

    m1, m2, m3, m4 = st.columns(4)
    m1.metric("Rows", initial_profile.rows_total)
    m2.metric("Cells missing", initial_profile.cells_missing)
    m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
    m4.metric("Complete rows", initial_profile.rows_complete)

    st.dataframe(prof_df, width="stretch", hide_index=True)

    if initial_profile.cells_missing == 0:
        st.success("No missing values or disguised nulls detected. Nothing to handle.")

    st.divider()

    st.subheader("Strategy")

    preset_label = st.radio(
        "Preset",
        [
            "detect-only (standardize sentinels to NaN, no fill or drop)",
            "safe-fill (numeric → median, categorical → mode)",
            "drop-incomplete (drop any row with missing)",
        ],
        index=0,
        help=(
            "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
            "safe-fill: also fill — numeric columns with median, others with mode. "
            "drop-incomplete: also drop every row that has any missing cell."
        ),
    )
    # The preset key is the first word of the label, e.g. "safe-fill".
    preset_key = preset_label.split(" ", 1)[0]
    options = MissingOptions.from_preset(preset_key)

    with st.expander("Advanced options"):
        col_a, col_b = st.columns(2)

        with col_a:
            st.markdown("**Detection**")
            options.standardize_sentinels = st.checkbox(
                "Standardize disguised nulls to NaN",
                value=options.standardize_sentinels,
                help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
            )
            sentinels_text = st.text_input(
                "Sentinel values (comma-separated)",
                value=", ".join(options.sentinels),
                disabled=not options.standardize_sentinels,
                help="Matched case-insensitively after stripping whitespace.",
            )
            # Split on commas and drop entries that are empty after stripping.
            options.sentinels = [
                s.strip() for s in sentinels_text.split(",") if s.strip()
            ]

        with col_b:
            st.markdown("**Strategy override**")
            strat_options = [
                "(use preset)",
                "none",
                "drop_row",
                "drop_col",
                "drop_both",
                "mean",
                "median",
                "mode",
                "constant",
                "ffill",
                "bfill",
                "interpolate",
            ]
            strat_choice = st.selectbox(
                "Global strategy",
                strat_options,
                index=0,
                help=(
                    "drop_row / drop_col use the thresholds below. "
                    "mean / median / interpolate are numeric only — non-numeric "
                    "columns fall back to the categorical strategy."
                ),
            )
            # "(use preset)" keeps whatever strategy the preset selected.
            if strat_choice != "(use preset)":
                options.strategy = strat_choice  # type: ignore[assignment]

            cat_strat = st.selectbox(
                "Categorical fallback (for non-numeric columns)",
                ["mode", "constant", "ffill", "bfill", "none"],
                index=0,
            )
            options.categorical_strategy = cat_strat  # type: ignore[assignment]

            # Ask for the constant only when a "constant" strategy is in play.
            if options.strategy == "constant" or cat_strat == "constant":
                fill_val = st.text_input(
                    "Constant fill value",
                    value="",
                    help="Used when strategy = constant. Leave blank to fill with empty string.",
                )
                options.fill_value = fill_val

        st.markdown("**Drop thresholds**")
        col_c, col_d = st.columns(2)
        with col_c:
            options.row_drop_threshold = st.slider(
                "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
                0.0,
                1.0,
                options.row_drop_threshold,
                0.05,
            )
        with col_d:
            options.col_drop_threshold = st.slider(
                "Column drop threshold (drop columns with ≥ this fraction missing)",
                0.0,
                1.0,
                options.col_drop_threshold,
                0.05,
            )

        st.markdown("**Scope**")
        selected_cols = st.multiselect(
            "Columns to handle (default: all)",
            options=list(df.columns),
            default=list(df.columns),
        )
        skip_cols = st.multiselect(
            "Columns to skip",
            options=list(df.columns),
            default=[],
        )
        # An empty selection is passed on as None rather than an empty list.
        options.columns = selected_cols if selected_cols else None
        options.skip_columns = list(skip_cols)

        st.markdown("**Per-column strategy overrides** (optional)")
        st.caption(
            "Set a different strategy for specific columns. Leave any row blank to "
            "use the global strategy."
        )
        per_col_overrides: dict[str, str] = {}
        # Only columns the initial profile flagged as having missing values
        # are offered for an override.
        only_missing_cols = [
            r.column for r in initial_profile.columns if r.has_missing
        ]
        if only_missing_cols:
            edit_df = pd.DataFrame({
                "column": only_missing_cols,
                "strategy": ["" for _ in only_missing_cols],
            })
            edited = st.data_editor(
                edit_df,
                width="stretch",
                hide_index=True,
                column_config={
                    "column": st.column_config.TextColumn("Column", disabled=True),
                    "strategy": st.column_config.SelectboxColumn(
                        "Override",
                        options=[
                            "",
                            "drop_row",
                            "drop_col",
                            "mean",
                            "median",
                            "mode",
                            "constant",
                            "ffill",
                            "bfill",
                            "interpolate",
                        ],
                    ),
                },
                key="missing_per_col_editor",
            )
            for col_name, strategy in zip(edited["column"], edited["strategy"]):
                # Accept only a non-empty string. A plain truthiness test
                # would let a NaN cell through (NaN is truthy) and store it
                # as a strategy.
                # NOTE(review): whether a cleared cell comes back as None or
                # NaN presumably depends on the pandas/Streamlit versions in
                # use — confirm.
                if isinstance(strategy, str) and strategy:
                    per_col_overrides[col_name] = strategy
        options.column_strategies = per_col_overrides  # type: ignore[assignment]

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------

st.divider()

if st.button("Handle Missing Values", type="primary", width="stretch"):
    with st.spinner("Handling..."):
        try:
            result = handle_missing(df, options)
        except (ValueError, OSError) as e:
            from src.core.errors import format_for_user
            st.error(format_for_user(e))
            st.stop()
    st.session_state["missing_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values")
    st.session_state["missing_input_name"] = uploaded.name
    st.session_state["missing_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
    st.session_state["_missing_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    # Without this they stay expanded until the user touches any
    # other widget.
    st.rerun()

result = st.session_state.get("missing_result")
if result is None:
    st.info("Choose a strategy and click **Handle Missing Values** to run.")
    st.stop()

# ---------------------------------------------------------------------------
# Results
# ---------------------------------------------------------------------------

# Anchor target for the auto-scroll snippet at the end of this block.
# A bare ``
`` survives Streamlit's HTML sanitizer (only # `` """, height=1, )