"""DataTools Missing Value Handler — Streamlit page.""" from __future__ import annotations import io import json import sys from pathlib import Path import pandas as pd import streamlit as st _project_root = Path(__file__).resolve().parent.parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) from src.gui.components import ( hide_streamlit_chrome, pickup_or_upload, require_normalization_gate, ) from src.core.missing import ( DEFAULT_SENTINELS, MissingOptions, PRESETS, handle_missing, profile_missing, ) hide_streamlit_chrome() require_normalization_gate() # --------------------------------------------------------------------------- # Header # --------------------------------------------------------------------------- st.title("🕳️ Missing Value Handler") st.caption( "Detect disguised nulls, profile missingness, and apply imputation or " "drop strategies. Runs locally — your data never leaves this computer." ) # --------------------------------------------------------------------------- # File upload # --------------------------------------------------------------------------- uploaded = pickup_or_upload( label="Upload CSV or Excel file", key="missing_file_upload", types=["csv", "tsv", "xlsx", "xls"], ) if uploaded is None: st.info("Upload a CSV, TSV, or Excel file to begin.") st.stop() @st.cache_data(show_spinner=False) def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: """Read the uploaded bytes into a DataFrame. Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing- value handling is more useful when numeric columns are typed correctly (so mean / median / interpolate work without manual coercion). Sentinel strings are still detected because they survive in object columns where any cell is non-numeric. """ suffix = Path(name).suffix.lower() bio = io.BytesIO(data) if suffix in (".xlsx", ".xls"): return pd.read_excel(bio) for enc in ("utf-8", "utf-8-sig", "latin-1"): try: bio.seek(0) sep = "\t" if suffix == ".tsv" else "," return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn") except UnicodeDecodeError: continue bio.seek(0) return pd.read_csv(bio, encoding="latin-1") try: df = _read_uploaded(uploaded.name, uploaded.getvalue()) except Exception as e: from src.core.errors import format_for_user st.error( f"**Could not read `{uploaded.name}`**\n\n" f"```\n{format_for_user(e)}\n```" ) st.stop() st.subheader(f"Preview: {uploaded.name}") st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), use_container_width=True) st.divider() # --------------------------------------------------------------------------- # Initial profile (read-only) # --------------------------------------------------------------------------- st.subheader("Missingness profile") initial_profile = profile_missing(df, MissingOptions()) prof_df = initial_profile.to_dataframe() m1, m2, m3, m4 = st.columns(4) m1.metric("Rows", initial_profile.rows_total) m2.metric("Cells missing", initial_profile.cells_missing) m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%") m4.metric("Complete rows", initial_profile.rows_complete) st.dataframe(prof_df, use_container_width=True, hide_index=True) if initial_profile.cells_missing == 0: st.success("No missing values or disguised nulls detected. Nothing to handle.") st.divider() # --------------------------------------------------------------------------- # Options # --------------------------------------------------------------------------- st.subheader("Strategy") preset_label = st.radio( "Preset", [ "detect-only (standardize sentinels to NaN, no fill or drop)", "safe-fill (numeric → median, categorical → mode)", "drop-incomplete (drop any row with missing)", ], index=0, help=( "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. " "safe-fill: also fill — numeric columns with median, others with mode. " "drop-incomplete: also drop every row that has any missing cell." ), ) preset_key = preset_label.split(" ", 1)[0] options = MissingOptions.from_preset(preset_key) with st.expander("Advanced options"): col_a, col_b = st.columns(2) with col_a: st.markdown("**Detection**") options.standardize_sentinels = st.checkbox( "Standardize disguised nulls to NaN", value=options.standardize_sentinels, help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.", ) sentinels_text = st.text_input( "Sentinel values (comma-separated)", value=", ".join(options.sentinels), disabled=not options.standardize_sentinels, help="Matched case-insensitively after stripping whitespace.", ) options.sentinels = [ s.strip() for s in sentinels_text.split(",") if s.strip() ] with col_b: st.markdown("**Strategy override**") strat_options = [ "(use preset)", "none", "drop_row", "drop_col", "drop_both", "mean", "median", "mode", "constant", "ffill", "bfill", "interpolate", ] strat_choice = st.selectbox( "Global strategy", strat_options, index=0, help=( "drop_row / drop_col use the thresholds below. " "mean / median / interpolate are numeric only — non-numeric " "columns fall back to the categorical strategy." ), ) if strat_choice != "(use preset)": options.strategy = strat_choice # type: ignore[assignment] cat_strat = st.selectbox( "Categorical fallback (for non-numeric columns)", ["mode", "constant", "ffill", "bfill", "none"], index=0, ) options.categorical_strategy = cat_strat # type: ignore[assignment] if options.strategy == "constant" or cat_strat == "constant": fill_val = st.text_input( "Constant fill value", value="", help="Used when strategy = constant. Leave blank to fill with empty string.", ) options.fill_value = fill_val st.markdown("**Drop thresholds**") col_c, col_d = st.columns(2) with col_c: options.row_drop_threshold = st.slider( "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)", 0.0, 1.0, options.row_drop_threshold, 0.05, ) with col_d: options.col_drop_threshold = st.slider( "Column drop threshold (drop columns with ≥ this fraction missing)", 0.0, 1.0, options.col_drop_threshold, 0.05, ) st.markdown("**Scope**") selected_cols = st.multiselect( "Columns to handle (default: all)", options=list(df.columns), default=list(df.columns), ) skip_cols = st.multiselect( "Columns to skip", options=list(df.columns), default=[], ) options.columns = selected_cols if selected_cols else None options.skip_columns = list(skip_cols) st.markdown("**Per-column strategy overrides** (optional)") st.caption( "Set a different strategy for specific columns. Leave any row blank to " "use the global strategy." ) per_col_overrides: dict[str, str] = {} only_missing_cols = [ r.column for r in initial_profile.columns if r.has_missing ] if only_missing_cols: edit_df = pd.DataFrame({ "column": only_missing_cols, "strategy": ["" for _ in only_missing_cols], }) edited = st.data_editor( edit_df, use_container_width=True, hide_index=True, column_config={ "column": st.column_config.TextColumn("Column", disabled=True), "strategy": st.column_config.SelectboxColumn( "Override", options=[ "", "drop_row", "drop_col", "mean", "median", "mode", "constant", "ffill", "bfill", "interpolate", ], ), }, key="missing_per_col_editor", ) for _, row in edited.iterrows(): if row["strategy"]: per_col_overrides[row["column"]] = row["strategy"] options.column_strategies = per_col_overrides # type: ignore[assignment] # --------------------------------------------------------------------------- # Run # --------------------------------------------------------------------------- st.divider() if st.button("Handle Missing Values", type="primary", use_container_width=True): with st.spinner("Handling..."): try: result = handle_missing(df, options) except (ValueError, OSError) as e: from src.core.errors import format_for_user st.error(format_for_user(e)) st.stop() st.session_state["missing_result"] = result st.session_state["missing_input_name"] = uploaded.name st.session_state["missing_options"] = options.to_dict() result = st.session_state.get("missing_result") if result is None: st.info("Choose a strategy and click **Handle Missing Values** to run.") st.stop() # --------------------------------------------------------------------------- # Results # --------------------------------------------------------------------------- st.subheader("Results") m1, m2, m3, m4 = st.columns(4) m1.metric("Sentinels → NaN", result.sentinels_standardized) m2.metric("Cells filled", result.cells_filled) m3.metric("Rows dropped", result.rows_dropped) m4.metric("Columns dropped", len(result.columns_dropped)) if result.columns_dropped: st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}") st.markdown("**Missingness — before vs. after**") before = result.profile_before.to_dataframe().set_index("column")[ ["missing", "missing_pct"] ].rename(columns={"missing": "before_missing", "missing_pct": "before_pct"}) after = result.profile_after.to_dataframe().set_index("column")[ ["missing", "missing_pct"] ].rename(columns={"missing": "after_missing", "missing_pct": "after_pct"}) combined = before.join(after, how="outer").fillna(0) st.dataframe(combined, use_container_width=True) if result.strategy_per_column: st.markdown("**Strategy applied per column**") strat_df = pd.DataFrame( [{"column": c, "strategy": s} for c, s in result.strategy_per_column.items()] ) st.dataframe(strat_df, use_container_width=True, hide_index=True) if not result.changes.empty: st.markdown("**Audit (first 50 changes)**") audit_view = result.changes.head(50).copy() audit_view["row"] = audit_view["row"].apply(lambda x: "—" if x == -1 else x + 1) st.dataframe(audit_view, use_container_width=True, hide_index=True) if len(result.changes) > 50: st.caption(f"… and {len(result.changes) - 50} more (download the full audit below).") st.markdown("**Handled preview (first 10 rows)**") st.dataframe(result.handled_df.head(10), use_container_width=True) # --------------------------------------------------------------------------- # Downloads # --------------------------------------------------------------------------- st.divider() stem = Path(st.session_state.get("missing_input_name", "input")).stem dl_a, dl_b, dl_c = st.columns(3) with dl_a: handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig") st.download_button( "Download handled CSV", data=handled_bytes, file_name=f"{stem}_missing.csv", mime="text/csv", ) with dl_b: if not result.changes.empty: changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig") st.download_button( "Download changes audit", data=changes_bytes, file_name=f"{stem}_missing_changes.csv", mime="text/csv", ) with dl_c: config_bytes = json.dumps( st.session_state.get("missing_options", {}), indent=2, default=str, ).encode("utf-8") st.download_button( "Download config JSON", data=config_bytes, file_name="missing_config.json", mime="application/json", ) st.divider() st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")