Adds a contextual Help button on each detail page, right of the title. Clicking it opens a Streamlit popover with a one-shot how-to: when to use, numbered steps, before→after examples, and an optional one-line tip. Designed to be scannable — no paragraph prose. Implementation: - New ``render_tool_header(tool_id)`` helper in components replaces the bare ``st.title(...) + st.caption(...)`` block on each of the 11 tool pages. Title in the wide column, popover in a narrow right column; caption sits on its own line beneath. - Help content is one markdown blob per tool stored in i18n under ``tools.<id>.help_md`` (en + es). Editors can tweak copy without touching Python. - ``help.button_label`` and ``help.missing_body`` keys added to both packs for the popover trigger and the empty-tool fallback. All 11 tool pages now use the same header pattern — including the PDF Extractor and Reconciler, which previously had hardcoded title/caption pairs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
446 lines
16 KiB
Python
446 lines
16 KiB
Python
"""DataTools Fix Missing Values — Streamlit page."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import streamlit as st
|
|
|
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
|
if str(_project_root) not in sys.path:
|
|
sys.path.insert(0, str(_project_root))
|
|
|
|
from src.gui.components import (
|
|
back_to_home_link,
|
|
render_sticky_footer,
|
|
render_tool_header,
|
|
hide_streamlit_chrome,
|
|
html_download_button,
|
|
pickup_or_upload,
|
|
require_feature_or_render_upgrade,
|
|
)
|
|
from src.i18n import t
|
|
from src.core.missing import (
|
|
DEFAULT_SENTINELS,
|
|
MissingOptions,
|
|
PRESETS,
|
|
handle_missing,
|
|
profile_missing,
|
|
)
|
|
from src.license import FeatureFlag
|
|
|
|
# Page bootstrap. These calls run top-to-bottom on every script pass, before
# any tool-specific widget is drawn: shared chrome helpers first, then the
# page-open audit entry, then the feature check.
hide_streamlit_chrome()
render_sticky_footer()
back_to_home_link()
# Imported at the point of use rather than with the other imports above.
from src.audit import log_page_open
log_page_open("4_Missing_Values")
# NOTE(review): presumably halts the page (or renders an upgrade prompt) when
# the MISSING_HANDLER feature is not licensed — confirm in src.gui.components.
require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Header
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Shared page header for this tool, looked up by its tool id.
render_tool_header("04_missing_handler")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File upload
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Obtain the input file. ``pickup_or_upload`` returns None when no file is
# available yet.
# NOTE(review): the name suggests it can also pick up a file handed over from
# another page — confirm in src.gui.components.
uploaded = pickup_or_upload(
    label="Import CSV or Excel file",
    key="missing_file_upload",
    types=["csv", "tsv", "xlsx", "xls"],
)

# Nothing to work on yet: show a hint and end this script pass so none of the
# sections below are rendered.
if uploaded is None:
    st.info("Import a CSV, TSV, or Excel file to begin.")
    st.stop()
|
|
|
|
|
|
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame.

    Unlike the text cleaner, we do *not* force ``dtype=str`` here: missing-
    value handling is more useful when numeric columns are typed correctly
    (so mean / median / interpolate work without manual coercion).
    Sentinel strings are still detected because they survive in object
    columns where any cell is non-numeric.

    Args:
        name: Original file name; only its suffix is used, to pick the parser
            (Excel vs. delimited text) and the delimiter (tab for ``.tsv``,
            comma otherwise).
        data: Raw file contents.

    Returns:
        The parsed DataFrame.

    Raises:
        Whatever ``pd.read_excel`` / ``pd.read_csv`` raise for unreadable
        input, other than the ``UnicodeDecodeError`` cases retried below.
    """
    suffix = Path(name).suffix.lower()
    bio = io.BytesIO(data)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(bio)

    # The delimiter depends only on the suffix, so compute it once instead of
    # on every encoding attempt.
    sep = "\t" if suffix == ".tsv" else ","

    # Try encodings from strictest to most permissive; rewind before each
    # attempt because a failed parse may have consumed part of the buffer.
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        bio.seek(0)
        try:
            return pd.read_csv(bio, encoding=enc, sep=sep, on_bad_lines="warn")
        except UnicodeDecodeError:
            continue

    # Defensive fallback: latin-1 maps every byte, so the loop is expected to
    # have returned already. Kept consistent with the loop (same delimiter and
    # bad-line policy) so a TSV is never parsed as comma-separated here.
    bio.seek(0)
    return pd.read_csv(bio, encoding="latin-1", sep=sep, on_bad_lines="warn")
|
|
|
|
|
|
# Parse the upload into ``df``. Any exception from the reader is reported in
# the page and the script pass ends, so the sections below only run with a
# DataFrame in hand.
try:
    df = _read_uploaded(uploaded.name, uploaded.getvalue())
except Exception as e:
    # Imported only on the failure path.
    from src.core.errors import format_for_user
    # The formatted error text is shown inside a fenced code block.
    st.error(
        f"**Could not read `{uploaded.name}`**\n\n"
        f"```\n{format_for_user(e)}\n```"
    )
    st.stop()
|
|
|
|
# Collapse the input preview + options once the user has clicked
|
|
# Handle Missing Values so the Results section below is the primary
|
|
# visual focus. The user can re-expand to re-inspect the source rows
|
|
# or tweak strategy and rerun.
|
|
# True once a run has stored ``missing_result`` in session state. It sets the
# default open/closed state of this preview and of the Options expander.
_has_result = st.session_state.get("missing_result") is not None

with st.expander(f"Preview: {uploaded.name}", expanded=not _has_result):
    st.caption(f"{len(df)} rows, {len(df.columns)} columns")
    # Only the first 10 rows are rendered.
    st.dataframe(df.head(10), width="stretch")

st.divider()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Options (Missingness profile + Strategy)
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# Wrapped in an outer expander whose default state mirrors the preview
|
|
# expander above: open before a result exists, folded once the user has
|
|
# clicked Handle Missing Values. The Missingness profile lives inside
|
|
# this expander too — after a run the Results section shows a richer
|
|
# before-vs-after comparison that supersedes the static input profile,
|
|
# so keeping it tucked away with the controls cleanly pushes Results
|
|
# to the top of the visible area.
|
|
|
|
# Builds ``options`` (a MissingOptions) from the widgets below. The object is
# rebuilt from the chosen preset on every script pass and then overwritten
# field by field with whatever the advanced controls currently hold.
with st.expander("Options", expanded=not _has_result):
    st.subheader("Missingness profile")

    # Profile the untouched input with default options.
    initial_profile = profile_missing(df, MissingOptions())
    prof_df = initial_profile.to_dataframe()

    m1, m2, m3, m4 = st.columns(4)
    m1.metric("Rows", initial_profile.rows_total)
    m2.metric("Cells missing", initial_profile.cells_missing)
    m3.metric("% cells missing", f"{initial_profile.cells_missing_pct:.1f}%")
    m4.metric("Complete rows", initial_profile.rows_complete)

    st.dataframe(prof_df, width="stretch", hide_index=True)

    if initial_profile.cells_missing == 0:
        st.success("No missing values or disguised nulls detected. Nothing to handle.")

    st.divider()

    st.subheader("Strategy")

    preset_label = st.radio(
        "Preset",
        [
            "detect-only (standardize sentinels to NaN, no fill or drop)",
            "safe-fill (numeric → median, categorical → mode)",
            "drop-incomplete (drop any row with missing)",
        ],
        index=0,
        help=(
            "detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. "
            "safe-fill: also fill — numeric columns with median, others with mode. "
            "drop-incomplete: also drop every row that has any missing cell."
        ),
    )
    # The preset key is the first word of the label (text before the first
    # space), e.g. "safe-fill".
    preset_key = preset_label.split(" ", 1)[0]
    options = MissingOptions.from_preset(preset_key)

    with st.expander("Advanced options"):
        col_a, col_b = st.columns(2)

        with col_a:
            st.markdown("**Detection**")
            options.standardize_sentinels = st.checkbox(
                "Standardize disguised nulls to NaN",
                value=options.standardize_sentinels,
                help="Replace 'N/A', '-', 'NULL', whitespace-only cells, etc. with real NaN.",
            )
            sentinels_text = st.text_input(
                "Sentinel values (comma-separated)",
                value=", ".join(options.sentinels),
                disabled=not options.standardize_sentinels,
                help="Matched case-insensitively after stripping whitespace.",
            )
            # Split on commas, trim each entry, and drop empty entries.
            options.sentinels = [
                s.strip() for s in sentinels_text.split(",") if s.strip()
            ]

        with col_b:
            st.markdown("**Strategy override**")
            strat_options = [
                "(use preset)",
                "none", "drop_row", "drop_col", "drop_both",
                "mean", "median", "mode", "constant",
                "ffill", "bfill", "interpolate",
            ]
            strat_choice = st.selectbox(
                "Global strategy",
                strat_options,
                index=0,
                help=(
                    "drop_row / drop_col use the thresholds below. "
                    "mean / median / interpolate are numeric only — non-numeric "
                    "columns fall back to the categorical strategy."
                ),
            )
            # The placeholder entry leaves the preset's strategy untouched.
            if strat_choice != "(use preset)":
                options.strategy = strat_choice  # type: ignore[assignment]

            cat_strat = st.selectbox(
                "Categorical fallback (for non-numeric columns)",
                ["mode", "constant", "ffill", "bfill", "none"],
                index=0,
            )
            options.categorical_strategy = cat_strat  # type: ignore[assignment]

            # NOTE(review): the fill-value input is shown only for a global or
            # categorical "constant"; a per-column "constant" override chosen
            # in the editor further down does not reveal it.
            if options.strategy == "constant" or cat_strat == "constant":
                fill_val = st.text_input(
                    "Constant fill value",
                    value="",
                    help="Used when strategy = constant. Leave blank to fill with empty string.",
                )
                options.fill_value = fill_val

        st.markdown("**Drop thresholds**")
        col_c, col_d = st.columns(2)
        with col_c:
            options.row_drop_threshold = st.slider(
                "Row drop threshold (drop rows with ≥ this fraction missing across selected cols)",
                0.0, 1.0, options.row_drop_threshold, 0.05,
            )
        with col_d:
            options.col_drop_threshold = st.slider(
                "Column drop threshold (drop columns with ≥ this fraction missing)",
                0.0, 1.0, options.col_drop_threshold, 0.05,
            )

        st.markdown("**Scope**")
        selected_cols = st.multiselect(
            "Columns to handle (default: all)",
            options=list(df.columns),
            default=list(df.columns),
        )
        skip_cols = st.multiselect(
            "Columns to skip",
            options=list(df.columns),
            default=[],
        )
        # An empty selection is passed on as None rather than as an empty list.
        options.columns = selected_cols if selected_cols else None
        options.skip_columns = list(skip_cols)

        st.markdown("**Per-column strategy overrides** (optional)")
        st.caption(
            "Set a different strategy for specific columns. Leave any row blank to "
            "use the global strategy."
        )
        per_col_overrides: dict[str, str] = {}
        # Only columns the initial profile flagged as having missing values
        # are offered for an override.
        only_missing_cols = [
            r.column for r in initial_profile.columns if r.has_missing
        ]
        if only_missing_cols:
            edit_df = pd.DataFrame({
                "column": only_missing_cols,
                "strategy": ["" for _ in only_missing_cols],
            })
            edited = st.data_editor(
                edit_df,
                width="stretch",
                hide_index=True,
                column_config={
                    "column": st.column_config.TextColumn("Column", disabled=True),
                    "strategy": st.column_config.SelectboxColumn(
                        "Override",
                        options=[
                            "", "drop_row", "drop_col",
                            "mean", "median", "mode", "constant",
                            "ffill", "bfill", "interpolate",
                        ],
                    ),
                },
                key="missing_per_col_editor",
            )
            for col_name, choice in zip(edited["column"], edited["strategy"]):
                # Only a non-empty string counts as an override. A cleared
                # cell may come back as None or NaN depending on the column
                # dtype, and NaN is truthy, so a bare truthiness test would
                # store it as a strategy.
                if isinstance(choice, str) and choice:
                    per_col_overrides[col_name] = choice
        options.column_strategies = per_col_overrides  # type: ignore[assignment]
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Run
|
|
# ---------------------------------------------------------------------------
|
|
|
|
st.divider()

# Run on click. The outcome is parked in session state and the script is
# rerun, so the results are rendered by the *next* pass, not this one.
if st.button("Handle Missing Values", type="primary", width="stretch"):
    with st.spinner("Handling..."):
        try:
            result = handle_missing(df, options)
        except (ValueError, OSError) as e:
            # Imported only on the failure path.
            from src.core.errors import format_for_user
            st.error(format_for_user(e))
            # End the pass without touching any previously stored result.
            st.stop()
    st.session_state["missing_result"] = result
    from src.audit import log_event
    log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values")
    # Remember the input name and the options used; the Downloads section
    # reads both back from session state.
    st.session_state["missing_input_name"] = uploaded.name
    st.session_state["missing_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
    st.session_state["_missing_scroll_to_results"] = True
    # Force a second rerun so the preview and options expanders see
    # the new result on the NEXT script pass and collapse themselves.
    # Without this they stay expanded until the user touches any
    # other widget.
    st.rerun()

# Reached on every pass that did not just rerun: show the stored result if
# there is one, otherwise prompt the user and stop before the Results section.
result = st.session_state.get("missing_result")
if result is None:
    st.info("Choose a strategy and click **Handle Missing Values** to run.")
    st.stop()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Results
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Anchor target for the auto-scroll snippet at the end of this block.
|
|
# A bare ``<div id="...">`` survives Streamlit's HTML sanitizer (only
|
|
# ``<script>`` is stripped), and a 1px-tall div doesn't visually shift
|
|
# anything. Placed before the subheader so the scrolled-to viewport
|
|
# starts a few pixels above the section heading rather than below it.
|
|
# Invisible 1px anchor that the post-run scroll snippet looks up by id.
st.markdown(
    '<div id="missing-results-anchor" style="height:1px"></div>',
    unsafe_allow_html=True,
)

st.subheader("Results")

# Headline counters for the run.
m1, m2, m3, m4 = st.columns(4)
m1.metric("Sentinels → NaN", result.sentinels_standardized)
m2.metric("Cells filled", result.cells_filled)
m3.metric("Rows dropped", result.rows_dropped)
m4.metric("Columns dropped", len(result.columns_dropped))

if result.columns_dropped:
    st.warning(f"Dropped columns: {', '.join(result.columns_dropped)}")

# Per-column missing count and percentage, before and after, side by side.
st.markdown("**Missingness — before vs. after**")
before = result.profile_before.to_dataframe().set_index("column")[
    ["missing", "missing_pct"]
].rename(columns={"missing": "before_missing", "missing_pct": "before_pct"})
after = result.profile_after.to_dataframe().set_index("column")[
    ["missing", "missing_pct"]
].rename(columns={"missing": "after_missing", "missing_pct": "after_pct"})
# Outer join keeps columns present on only one side (for example a column
# dropped by the run); their absent figures are shown as 0.
combined = before.join(after, how="outer").fillna(0)
st.dataframe(combined, width="stretch")

if result.strategy_per_column:
    st.markdown("**Strategy applied per column**")
    strat_df = pd.DataFrame(
        [{"column": c, "strategy": s} for c, s in result.strategy_per_column.items()]
    )
    st.dataframe(strat_df, width="stretch", hide_index=True)

if not result.changes.empty:
    st.markdown("**Audit (first 50 changes)**")
    # Work on a copy so the stored result is not modified by the display tweak.
    audit_view = result.changes.head(50).copy()
    # A row value of -1 is displayed as a dash; every other value is shown
    # plus one.
    # NOTE(review): presumably -1 marks changes not tied to a single row and
    # the +1 converts 0-based indices for display — confirm in src.core.missing.
    audit_view["row"] = audit_view["row"].apply(lambda x: "—" if x == -1 else x + 1)
    st.dataframe(audit_view, width="stretch", hide_index=True)
    if len(result.changes) > 50:
        st.caption(f"… and {len(result.changes) - 50} more (download the full audit below).")

st.markdown("**Handled preview (first 10 rows)**")
st.dataframe(result.handled_df.head(10), width="stretch")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Downloads
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# Rendered via ``html_download_button`` (raw <a download> anchor) rather
|
|
# than ``st.download_button``. The latter has a long-standing bug where
|
|
# the second and third download_buttons rendered in the same script pass
|
|
# fail to fire — only the first one's click reaches the browser save
|
|
# dialog. The HTML helper bypasses the widget system entirely and works
|
|
# uniformly across all browsers. The empty-changes case still renders a
|
|
# disabled button (rather than vanishing) so the layout stays steady and
|
|
# the user understands why nothing's available.
|
|
|
|
st.divider()
# Output file names derive from the input name recorded at run time, falling
# back to "input" when none was stored.
stem = Path(st.session_state.get("missing_input_name", "input")).stem

# All three payloads are built up front as bytes. The CSVs are encoded as
# utf-8-sig, i.e. with a leading BOM.
handled_bytes = result.handled_df.to_csv(index=False).encode("utf-8-sig")
changes_bytes = (
    result.changes.to_csv(index=False).encode("utf-8-sig")
    if not result.changes.empty
    else b""
)
# ``default=str`` stringifies any option value json cannot serialize natively.
config_bytes = json.dumps(
    st.session_state.get("missing_options", {}), indent=2, default=str,
).encode("utf-8")

dl_a, dl_b, dl_c = st.columns(3)
with dl_a:
    html_download_button(
        "Download handled CSV",
        handled_bytes,
        file_name=f"{stem}_missing.csv",
        mime="text/csv",
    )
with dl_b:
    # Still rendered, but disabled with an explanatory tooltip, when the run
    # recorded no changes.
    html_download_button(
        "Download changes audit",
        changes_bytes,
        file_name=f"{stem}_missing_changes.csv",
        mime="text/csv",
        disabled=result.changes.empty,
        help="No changes to audit." if result.changes.empty else None,
    )
with dl_c:
    # Fixed file name: unlike the two CSVs, it does not include the input stem.
    html_download_button(
        "Download config JSON",
        config_bytes,
        file_name="missing_config.json",
        mime="application/json",
    )
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Post-run auto-scroll
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# When the user clicks Handle Missing Values, the preview + options
|
|
# collapse but Streamlit by itself doesn't scroll — the Results section
|
|
# is at the bottom of a tall script so the user has to find it. Inject
|
|
# a tiny component-html iframe that calls ``scrollIntoView`` on the
|
|
# parent's Results anchor. Streamlit's main page is same-origin with
|
|
# component iframes so ``window.parent.document`` access is allowed.
|
|
#
|
|
# The flag is one-shot (``pop`` removes it) so re-renders triggered by
|
|
# unrelated widgets in the Results section don't yank the viewport
|
|
# back to the top of Results.
|
|
# ``pop`` both reads and clears the flag, so the scroll snippet is emitted on
# only the first pass after a run.
if st.session_state.pop("_missing_scroll_to_results", False):
    # The script looks up the Results anchor in the parent document and, if it
    # exists, scrolls it smoothly to the top of the viewport.
    st.iframe(
        """
        <script>
          const doc = window.parent.document;
          const target = doc.getElementById('missing-results-anchor');
          if (target) target.scrollIntoView({behavior: 'smooth', block: 'start'});
        </script>
        """,
        height=1,
    )
|