diff --git a/src/gui/app.py b/src/gui/app.py index 873ddc4..8166dc1 100644 --- a/src/gui/app.py +++ b/src/gui/app.py @@ -75,7 +75,7 @@ def _home_page() -> None: # Group tool cards by sidebar section so the home grid mirrors the # left-nav layout — same vocabulary, same ordering. sections: list[tuple[str, list]] = [] - for section in ("review", "cleaners", "transformations", "automations"): + for section in ("cleaners", "transformations", "automations"): tools = [tool for tool in TOOLS if tool.section == section] if not tools: continue @@ -151,7 +151,6 @@ def _page_for(tool_id: str, *, page_slug: str, icon: str, title: str) -> "st.Pag def _build_navigation() -> dict[str, list]: by_section: dict[str, list] = { - "review": [], "cleaners": [], "transformations": [], "automations": [], @@ -166,16 +165,6 @@ def _build_navigation() -> dict[str, list]: ) ) - # The Review gate has no entry in the registry (it isn't a "tool") - # so register it by hand at the top of its section. - review_page = st.Page( - "pages/0_Review.py", - title=_t("nav.review_page_title") or "Review", - icon="🛡️", - url_path="review", - ) - by_section["review"].insert(0, review_page) - home = st.Page( _home_page, title=_t("nav.home_page_title") or "Home", @@ -199,7 +188,6 @@ def _build_navigation() -> dict[str, list]: account_header = _t("nav.section_account") or "Account" return { "": [home], - section_label("review"): by_section["review"], section_label("cleaners"): by_section["cleaners"], section_label("transformations"): by_section["transformations"], section_label("automations"): by_section["automations"], diff --git a/src/gui/components/__init__.py b/src/gui/components/__init__.py index c239ccf..52edd24 100644 --- a/src/gui/components/__init__.py +++ b/src/gui/components/__init__.py @@ -11,14 +11,13 @@ they need without dragging the entire kitchen-sink module: components/ __init__.py ← compatibility shim (this file) _legacy.py ← original components.py, unchanged - gate.py ← gate-only seam (require_normalization_gate) findings.py ← analyzer-finding rendering seam dedup_review.py ← dedup match-group cards + review pipeline shared.py ← chrome / file-pickup helpers used by every tool A standalone Find Duplicates build, for example, can ship without -``findings.py`` and ``gate.py`` — those modules import the analyzer / -gate code that the Lite SKU does not include. +``findings.py`` — that module imports the analyzer code that the +Lite SKU does not include. Adding new tooling: drop new helpers into the appropriate seam module. Add their names to its ``__all__`` and to this file's ``__all__`` if @@ -46,11 +45,10 @@ from .activation import ( # noqa: F401 re-exported ) __all__ = [ - # Shared chrome / pickup / gate + # Shared chrome / pickup "hide_streamlit_chrome", "quit_button", "pickup_or_upload", - "require_normalization_gate", # License gate + activation form "render_activation_form", "render_license_status_sidebar", diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py index 0c59965..14740f5 100644 --- a/src/gui/components/_legacy.py +++ b/src/gui/components/_legacy.py @@ -1264,45 +1264,6 @@ class _StashedUpload: return self._data -def require_normalization_gate() -> None: - """Block the calling tool page until the upload has passed the gate. - - Tool pages should call this immediately after their imports. When the - current session upload has not been normalized — no - ``normalization_result``, the result is for a different upload, or the - result didn't pass — the user is shown a banner and a button to jump - to the Review page; the rest of the page is short-circuited via - ``st.stop()``. - - Pages that genuinely don't need a clean dataframe (rare) can opt out - by simply not calling this. - """ - import hashlib - has_upload = st.session_state.get("home_uploaded_bytes") is not None - if not has_upload: - # No upload yet — let the page's own uploader handle it; the gate - # will kick in once a file is present. - return - - upload_hash = hashlib.sha256( - st.session_state["home_uploaded_bytes"] - ).hexdigest() - result = st.session_state.get("normalization_result") - matched = ( - result is not None - and st.session_state.get("normalization_for") == upload_hash - and getattr(result, "passed", False) - ) - if matched: - return - - name = st.session_state.get("home_uploaded_name") or _t("gate.default_name") - st.warning(_t("gate.warning", name=name)) - if st.button(_t("gate.open_review"), type="primary"): - st.switch_page("pages/0_Review.py") - st.stop() - - def pickup_or_upload( *, label: str, diff --git a/src/gui/components/gate.py b/src/gui/components/gate.py deleted file mode 100644 index 5b710f5..0000000 --- a/src/gui/components/gate.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Normalization-gate guard for tool pages. - -``require_normalization_gate`` short-circuits a tool page when the -current upload has not yet passed the gate, redirecting the user to the -Review & Normalize page. Pulled into its own seam module so: - -* A build that includes the gate (Pro / Suite SKUs) imports this. -* A standalone single-tool build that bypasses the gate can omit this - module entirely without removing the helper from a shared file. -""" - -from __future__ import annotations - -from ._legacy import require_normalization_gate - -__all__ = ["require_normalization_gate"] diff --git a/src/gui/pages/0_Review.py b/src/gui/pages/0_Review.py deleted file mode 100644 index 6fcec47..0000000 --- a/src/gui/pages/0_Review.py +++ /dev/null @@ -1,711 +0,0 @@ -"""Review & normalize gate page. - -Sits between the home-page upload and every tool page. Walks the user -through every analyzer finding, lets them auto-fix, preview, customize, -or skip each one, and produces a :class:`NormalizationResult` stashed in -session state. Tool pages refuse to load until this gate has passed. - -State contract --------------- -Session state read: -* ``home_uploaded_bytes`` / ``home_uploaded_name`` — current upload. -* ``home_findings`` — list of :class:`Finding` from the home-page scan. -* ``review_decisions`` — dict[finding_id, Decision]; user's choices so far. - -Session state written: -* ``review_decisions`` — updated as the user flips controls. -* ``normalization_result`` — :class:`NormalizationResult` after Apply. -* ``normalization_for`` — content hash of the upload the result is for. -""" - -from __future__ import annotations - -import hashlib -import io -import sys -from pathlib import Path -from typing import Optional - -import pandas as pd -import streamlit as st - -# Project root on sys.path (mirrors app.py). -_project_root = Path(__file__).resolve().parent.parent.parent.parent -if str(_project_root) not in sys.path: - sys.path.insert(0, str(_project_root)) - -from src.core.analyze import Finding, analyze -from src.core.fixes import get_fix -from src.core.io import detect_encoding, repair_bytes -from src.core.normalize import ( - Decision, - NormalizationResult, - apply_decisions, - auto_fix, - gate_summary, - is_normalized, -) -from src.gui.components import hide_streamlit_chrome - - -# Common single-byte and multi-byte encodings the user might pick to -# correct a misdetection. Ordered by frequency in real-world Western / -# multilingual data; keep the list short — too many options just adds -# noise. The user can type a custom encoding via the "Other" entry. -_OVERRIDE_ENCODINGS = [ - "(detected)", - "utf-8", - "utf-8-sig", - "cp1252", - "iso-8859-1", - "iso-8859-15", - "cp1250", - "iso-8859-2", - "cp1251", - "koi8-r", - "mac-roman", - "shift_jis", - "cp932", - "gb18030", - "big5", - "euc-kr", - "cp949", - "utf-16", - "utf-16-le", - "utf-16-be", - "Other…", -] - - -st.set_page_config(page_title="Review & Normalize", page_icon="🛡️", layout="wide") -hide_streamlit_chrome() - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _upload_hash() -> Optional[str]: - data = st.session_state.get("home_uploaded_bytes") - if not data: - return None - return hashlib.sha256(data).hexdigest() - - -def _detected_encoding_for_session() -> Optional[str]: - """Run charset detection on the session bytes via a tmp file.""" - data = st.session_state.get("home_uploaded_bytes") - name = st.session_state.get("home_uploaded_name") or "tmp.csv" - if not data: - return None - import tempfile - suffix = "." + name.rsplit(".", 1)[-1] if "." in name else ".csv" - with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as fh: - fh.write(data) - tmp_path = Path(fh.name) - try: - return detect_encoding(tmp_path) - finally: - tmp_path.unlink(missing_ok=True) - - -def _load_df_from_session(encoding_override: Optional[str] = None) -> Optional[pd.DataFrame]: - """Re-parse the session upload through the same pipeline the home page - uses, so the review page operates on identical bytes. - - When *encoding_override* is set, decode with that encoding instead of - UTF-8. The override flows into ``repair_bytes`` so the wide-encoding - transcode and decode_replaced fallback both honor the user's choice. - """ - data = st.session_state.get("home_uploaded_bytes") - name = st.session_state.get("home_uploaded_name") or "" - if not data: - return None - suffix = name.rsplit(".", 1)[-1].lower() if "." in name else "" - if suffix in ("xlsx", "xls"): - return pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False) - delim = "\t" if suffix == "tsv" else "," - if delim == ",": - head = data[:4096].decode("utf-8", errors="replace") - for cand in ("\t", ";", "|"): - if head.count(cand) > head.count(",") * 1.5: - delim = cand - break - enc = encoding_override or "utf-8" - repair = repair_bytes(data, encoding=enc, delimiter=delim) - return pd.read_csv( - io.BytesIO(repair.repaired_bytes), - encoding="utf-8", delimiter=delim, - dtype=str, keep_default_na=False, on_bad_lines="warn", - ) - - -def _run_analysis_with_override(encoding_override: Optional[str]) -> list[Finding]: - """Re-run analyze() on the session upload with an encoding override. - - Mirrors components._run_analysis_on_upload but writes the bytes to a - tempfile so analyze() goes through the path-based loader (which is - where the encoding_override hook lives — DataFrame-mode analysis has - nothing to override). - """ - data = st.session_state.get("home_uploaded_bytes") - name = st.session_state.get("home_uploaded_name") or "tmp.csv" - if not data: - return [] - import tempfile - suffix = "." + name.rsplit(".", 1)[-1] if "." in name else ".csv" - with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as fh: - fh.write(data) - tmp_path = Path(fh.name) - try: - return analyze(tmp_path, encoding_override=encoding_override) - finally: - tmp_path.unlink(missing_ok=True) - - -def _confidence_pill(c: str) -> str: - """Streamlit-markdown pill for the confidence tier.""" - palette = {"high": "green", "medium": "orange", "low": "red"} - return f":{palette.get(c, 'gray')}-background[**{c.upper()}**]" - - -def _severity_pill(s: str) -> str: - palette = {"info": "blue", "warn": "orange", "error": "red"} - return f":{palette.get(s, 'gray')}-background[**{s}**]" - - -# --------------------------------------------------------------------------- -# Output options (Advanced — re-encode the cleaned DataFrame for download) -# --------------------------------------------------------------------------- - -# (label_shown_to_user, codec_passed_to_pandas) -_OUTPUT_ENCODINGS = [ - ("UTF-8 (recommended)", "utf-8"), - ("UTF-8 with BOM (Excel)", "utf-8-sig"), - ("Windows-1252 (Western Europe)", "cp1252"), - ("ISO-8859-1 / Latin-1", "iso-8859-1"), - ("ISO-8859-15 / Latin-9", "iso-8859-15"), - ("Windows-1250 (Central Europe)", "cp1250"), - ("ISO-8859-2 / Latin-2", "iso-8859-2"), - ("Windows-1251 (Cyrillic)", "cp1251"), - ("Shift_JIS (Japanese)", "shift_jis"), - ("GB18030 (Chinese)", "gb18030"), - ("Big5 (Traditional Chinese)", "big5"), - ("EUC-KR (Korean)", "euc-kr"), - ("UTF-16 LE with BOM", "utf-16"), -] - -_OUTPUT_DELIMITERS = [ - ("Comma ,", ","), - ("Tab \\t", "\t"), - ("Semicolon ;", ";"), - ("Pipe |", "|"), -] - -_OUTPUT_LINE_TERMINATORS = [ - ("LF — \\n (Unix / web / git default)", "\n"), - ("CRLF — \\r\\n (Windows / classic Excel)", "\r\n"), - ("CR — \\r (classic Mac, very rare)", "\r"), -] - - -def _build_output_bytes( - df: pd.DataFrame, - *, - encoding: str, - delimiter: str, - line_terminator: str, -) -> tuple[bytes, Optional[str]]: - """Serialize *df* with the user's output options. - - Returns ``(bytes, error_message)``. ``error_message`` is non-None when - the chosen encoding cannot represent at least one cell — characters - that don't exist in the target codepage are replaced with ``?`` so - the user still gets a download, plus a warning telling them which - target was lossy. - """ - buf = io.StringIO() - df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator) - text = buf.getvalue() - try: - return text.encode(encoding), None - except UnicodeEncodeError: - # Find the first character that fails so the message is useful. - bad: Optional[str] = None - for ch in text: - try: - ch.encode(encoding) - except UnicodeEncodeError: - bad = ch - break - msg = ( - f"Some characters cannot be represented in {encoding}" - + (f" (first offender: {bad!r})" if bad else "") - + ". Falling back to '?' replacement; non-Latin content will be lost." - ) - return text.encode(encoding, errors="replace"), msg - - -def _preview_table(f: Finding, decision_action: str, payload: Optional[dict]) -> Optional[pd.DataFrame]: - """Build a before/after preview from finding samples. - - Runs the registered fix function on each sample value individually so - the user sees exactly what would change. Returns None when no preview - is meaningful (no samples, or no fix registered). - """ - if not f.samples: - return None - fix_fn = get_fix(f.fix_action) - if fix_fn is None: - # No fix to preview; show samples as-is. - return pd.DataFrame( - [{"row": r, "column": c, "value": v} for r, c, v in f.samples] - ) - rows = [] - for r, col, val in f.samples: - # Run the fix on a tiny single-cell DataFrame so payload semantics - # (e.g. lowercase_email's column targeting) are honored. - mini = pd.DataFrame({col: [val]}) - try: - new_df, _ = fix_fn(mini, payload) - new_val = new_df[col].iloc[0] - except Exception as e: - new_val = f"" - rows.append({"row": r, "column": col, "before": val, "after": new_val}) - return pd.DataFrame(rows) - - -# --------------------------------------------------------------------------- -# Page body -# --------------------------------------------------------------------------- - -st.title("🛡️ Review & Normalize") -st.caption( - "Every finding is shown below with the algorithm that would fix it. " - "Auto-fix the high-confidence ones in one click; preview or customize " - "the rest before applying." -) - -# Pre-flight: if nothing has been uploaded yet, let the user upload -# directly from this page instead of bouncing them back to the home -# screen. Once a file is picked, we auto-run the analyzer (the user is -# already on the Review page — they've implicitly committed to a scan), -# stash the result, and rerun so the rest of the page picks it up. -findings: list[Finding] = st.session_state.get("home_findings") or [] -upload_name = st.session_state.get("home_uploaded_name") - -if not upload_name: - st.info( - "Upload a CSV or Excel file to begin reviewing. The analyzer runs " - "locally and your data never leaves this computer." - ) - review_upload = st.file_uploader( - "Choose a file", - type=["csv", "tsv", "xlsx", "xls"], - key="review_upload", - help="Drag-and-drop or browse for a CSV, TSV, or Excel file.", - ) - if review_upload is None: - st.stop() - - # New file → stash bytes + size + name, drop any stale state, then - # run the analyzer. The rerun at the bottom lets the rest of this - # page render with the upload in place. - same_file = ( - st.session_state.get("home_uploaded_name") == review_upload.name - and st.session_state.get("home_uploaded_size") == review_upload.size - ) - if not same_file: - st.session_state["home_uploaded_name"] = review_upload.name - st.session_state["home_uploaded_size"] = review_upload.size - st.session_state["home_uploaded_bytes"] = review_upload.getvalue() - st.session_state.pop("home_findings", None) - st.session_state.pop("home_skipped", None) - st.session_state.pop("review_decisions", None) - st.session_state.pop("normalization_result", None) - st.session_state.pop("normalization_for", None) - st.session_state.pop("encoding_override", None) - - if st.session_state.get("home_findings") is None: - with st.spinner("Analyzing…"): - st.session_state["home_findings"] = _run_analysis_with_override(None) - st.session_state["home_skipped"] = False - st.rerun() - -# ---- Encoding picker -------------------------------------------------------- -# -# Charset detection misfires on small files, byte-equivalent codepages -# (cp1252 vs Latin-1 vs cp1250), and content where every byte happens to -# decode under the wrong encoding (KOI8-R bytes that look like Shift_JIS). -# When the user spots mojibake or U+FFFD chars in the findings list, this -# picker is the escape hatch — pick the right encoding, re-run the analyzer. - -with st.container(border=True): - detected_enc = _detected_encoding_for_session() - current_override = st.session_state.get("encoding_override") - suffix = (st.session_state.get("home_uploaded_name") or "") - suffix = suffix.rsplit(".", 1)[-1].lower() if "." in suffix else "" - is_excel = suffix in ("xlsx", "xls") - - st.markdown("**File encoding**") - if is_excel: - st.caption( - "Excel files store text as Unicode internally — encoding override " - "doesn't apply. Skip this section." - ) - else: - cap_parts = [f"Detected: `{detected_enc or 'unknown'}`"] - if current_override: - cap_parts.append(f"Currently using: `{current_override}`") - st.caption( - " · ".join(cap_parts) - + " · Override only if you see mojibake (e.g. `é` for `é`) or U+FFFD" - " (`�`) in the findings below." - ) - - col_pick, col_custom, col_apply = st.columns([2, 2, 1]) - - with col_pick: - current_label = current_override or "(detected)" - try: - idx = _OVERRIDE_ENCODINGS.index(current_label) - except ValueError: - idx = _OVERRIDE_ENCODINGS.index("Other…") - chosen = st.selectbox( - "Encoding", - options=_OVERRIDE_ENCODINGS, - index=idx, - key="encoding_override_select", - label_visibility="collapsed", - ) - - custom_value: Optional[str] = None - with col_custom: - if chosen == "Other…": - custom_value = st.text_input( - "Custom encoding (e.g. `cp1257`, `iso-8859-9`)", - value=current_override if current_override and current_override not in _OVERRIDE_ENCODINGS else "", - key="encoding_override_custom", - label_visibility="collapsed", - placeholder="cp1257", - ) - - with col_apply: - if st.button("Re-analyze", use_container_width=True): - if chosen == "(detected)": - new_override = None - elif chosen == "Other…": - new_override = (custom_value or "").strip() or None - else: - new_override = chosen - - # Sanity-check the override actually decodes the bytes. - data = st.session_state.get("home_uploaded_bytes") or b"" - if new_override is not None: - try: - data.decode(new_override, errors="strict") - decode_ok = True - decode_err = None - except (UnicodeDecodeError, LookupError) as e: - decode_ok = False - decode_err = str(e) - else: - decode_ok = True - decode_err = None - - if not decode_ok: - st.warning( - f"`{new_override}` cannot decode this file: {decode_err}. " - f"Re-running anyway with replacement-character fallback so " - f"you can see where the failures are." - ) - - # Re-run analysis with the override and refresh session state. - st.session_state["encoding_override"] = new_override - st.session_state["home_findings"] = _run_analysis_with_override(new_override) - # Drop any prior gate result; the user must re-apply. - st.session_state.pop("normalization_result", None) - st.session_state.pop("normalization_for", None) - st.session_state.pop("review_decisions", None) - st.rerun() - -# Reload findings — the picker above may have just rewritten them. -findings = st.session_state.get("home_findings") or [] - -if not findings: - st.success("✓ No findings to review. The file is already clean — open any tool to begin.") - st.stop() - - -# ---- Top-line counters ------------------------------------------------------- - -n_high = sum(1 for f in findings if f.confidence == "high" and not f.pre_applied and f.fix_action) -n_medium = sum(1 for f in findings if f.confidence == "medium" and not f.pre_applied) -n_low = sum(1 for f in findings if f.confidence == "low" and not f.pre_applied) -n_pre = sum(1 for f in findings if f.pre_applied) -n_block = sum(1 for f in findings if f.severity == "error") - -c1, c2, c3, c4, c5 = st.columns(5) -c1.metric("High confidence", n_high, help="Round-trip safe — eligible for auto-fix.") -c2.metric("Medium", n_medium, help="Right call in the common case; preview before applying.") -c3.metric("Low", n_low, help="Heuristic — opt in only.") -c4.metric("Already applied", n_pre, help="Fixed during the read pass (BOM, NUL, line endings).") -c5.metric("Blocking", n_block, help="Severity = error; must be resolved or waived.") - -st.divider() - - -# ---- Top-level controls ------------------------------------------------------ - -decisions_state: dict = st.session_state.setdefault("review_decisions", {}) - -bar_left, bar_mid, bar_right = st.columns([1.2, 1.2, 3]) - -with bar_left: - if st.button("✨ Auto-fix high-confidence", type="primary", use_container_width=True): - for f in findings: - if ( - not f.pre_applied - and f.confidence == "high" - and f.fix_action - and get_fix(f.fix_action) is not None - ): - decisions_state[f.id] = Decision(finding_id=f.id, action="auto") - st.rerun() - -with bar_mid: - if st.button("Skip everything (not recommended)", use_container_width=True): - for f in findings: - if not f.pre_applied: - decisions_state[f.id] = Decision(finding_id=f.id, action="skip") - st.rerun() - - -# ---- Per-finding cards ------------------------------------------------------- - -# Sort: blocking first, then high (unfixed), medium, low, pre-applied. -def _sort_key(f: Finding) -> tuple: - severity_rank = {"error": 0, "warn": 1, "info": 2}[f.severity] - confidence_rank = {"high": 0, "medium": 1, "low": 2}[f.confidence] - return (int(f.pre_applied), severity_rank, confidence_rank, f.id) - - -for f in sorted(findings, key=_sort_key): - decision = decisions_state.get(f.id) - decision_action = decision.action if decision else ( - "auto" if (f.pre_applied or (f.confidence == "high" and f.fix_action)) else "skip" - ) - - title_bits = [ - _severity_pill(f.severity), - _confidence_pill(f.confidence), - f"**{f.id}**", - f"({f.count})", - ] - if f.pre_applied: - title_bits.append(":gray-background[applied during read]") - - with st.expander(" ".join(title_bits), expanded=(f.severity == "error")): - st.caption(f.description) - if f.tool: - st.caption(f"Owned by: `{f.tool}`") - - if f.pre_applied: - st.info("This was already applied during the file read pass — no decision needed.") - continue - - if not f.fix_action: - if f.severity == "error": - st.error( - "Blocking finding with no auto-fix. Choose **Skip / waive** to " - "acknowledge and proceed (not recommended), or fix the file outside " - "DataTools and re-upload." - ) - else: - st.info("Informational only — no fix to apply.") - - # Decision radio - choice_labels = { - "auto": "Auto-fix with our algorithm", - "skip": "Skip / waive (no change)", - } - # Customize is offered for fixes that take a meaningful payload. - if f.fix_action in ("replace_null_sentinels",): - choice_labels["modified"] = "Customize" - - chosen = st.radio( - "Decision", - options=list(choice_labels.keys()), - index=list(choice_labels.keys()).index(decision_action) - if decision_action in choice_labels else 0, - format_func=lambda k: choice_labels[k], - key=f"decision_{f.id}", - horizontal=True, - ) - - # Customize payload editor (only for the modified action) - payload: Optional[dict] = None - if chosen == "modified" and f.fix_action == "replace_null_sentinels": - default_sentinels = ", ".join(sorted([ - "n/a", "na", "nan", "null", "none", "-", "--", "tbd", "unknown", - ])) - text = st.text_area( - "Sentinels (comma-separated, case-insensitive):", - value=(decision.payload or {}).get( - "sentinels_raw", default_sentinels, - ) if decision else default_sentinels, - key=f"sentinels_{f.id}", - ) - sentinels = [s.strip() for s in text.split(",") if s.strip()] - payload = {"sentinels": sentinels, "sentinels_raw": text} - - # Persist - decisions_state[f.id] = Decision( - finding_id=f.id, action=chosen, payload=payload, - ) - - # Preview - if chosen != "skip" and f.samples: - preview = _preview_table(f, chosen, payload) - if preview is not None and not preview.empty: - st.markdown("**Preview** (showing up to 5 affected cells)") - st.dataframe(preview, use_container_width=True, hide_index=True) - -st.divider() - - -# ---- Apply ------------------------------------------------------------------ - -bottom_left, bottom_mid, bottom_right = st.columns([1, 1, 3]) - -with bottom_left: - apply_clicked = st.button( - "✅ Apply & enter tools", type="primary", use_container_width=True, - disabled=not decisions_state, - ) - -with bottom_mid: - reset_clicked = st.button("Reset all decisions", use_container_width=True) - -if reset_clicked: - st.session_state.pop("review_decisions", None) - st.session_state.pop("normalization_result", None) - st.session_state.pop("normalization_for", None) - st.rerun() - -if apply_clicked: - df = _load_df_from_session( - encoding_override=st.session_state.get("encoding_override") - ) - if df is None: - st.error("Could not re-read the uploaded file. Try re-uploading.") - st.stop() - decisions_list = [d for d in decisions_state.values() if isinstance(d, Decision)] - result = apply_decisions(df, findings, decisions_list) - st.session_state["normalization_result"] = result - st.session_state["normalization_for"] = _upload_hash() - - summary = gate_summary(result) - if result.passed and is_normalized(findings, result): - st.success( - f"✓ Gate passed — {summary['fixes_applied']} fix(es) applied, " - f"{summary['cells_changed']} cell(s) changed. You can now open any tool." - ) - elif result.blocking_findings: - st.error( - f"Gate blocked by error-level findings: " - f"{', '.join(b.id for b in result.blocking_findings)}. " - f"Resolve or waive them above before continuing." - ) - elif result.pending_findings: - st.warning( - f"Pending decisions remain on: " - f"{', '.join(f.id for f in result.pending_findings)}. " - f"Choose Auto-fix or Skip for each before continuing." - ) - -# Persisted summary (re-render on reload) -result: Optional[NormalizationResult] = st.session_state.get("normalization_result") -if result is not None and st.session_state.get("normalization_for") == _upload_hash(): - with st.expander("Audit log"): - if result.applied: - st.markdown("**Applied fixes**") - st.dataframe( - pd.DataFrame([ - { - "finding": a.finding_id, - "fix_action": a.fix_action, - "decision": a.decision, - "cells_changed": a.cells_changed, - } - for a in result.applied - ]), - use_container_width=True, hide_index=True, - ) - if result.skipped_findings: - st.markdown("**Skipped (waived by user)**") - st.write([f.id for f in result.skipped_findings]) - if result.passed: - st.markdown("---") - st.markdown("**Download normalized file**") - with st.expander("⚙️ Advanced output options"): - st.caption( - "Defaults match what the analyzer normalized to: UTF-8, " - "comma-separated, LF line endings. Override only if your " - "destination tool requires a specific format." - ) - - col_enc, col_delim, col_le = st.columns(3) - with col_enc: - enc_choice = st.selectbox( - "Encoding (code page)", - options=[label for label, _ in _OUTPUT_ENCODINGS], - index=0, - key="output_encoding_select", - ) - out_encoding = next( - codec for label, codec in _OUTPUT_ENCODINGS if label == enc_choice - ) - - with col_delim: - delim_choice = st.selectbox( - "Delimiter", - options=[label for label, _ in _OUTPUT_DELIMITERS], - index=0, - key="output_delim_select", - ) - out_delim = next( - ch for label, ch in _OUTPUT_DELIMITERS if label == delim_choice - ) - - with col_le: - le_choice = st.selectbox( - "Line terminator", - options=[label for label, _ in _OUTPUT_LINE_TERMINATORS], - index=0, - key="output_le_select", - ) - out_le = next( - ch for label, ch in _OUTPUT_LINE_TERMINATORS if label == le_choice - ) - - data, encode_warn = _build_output_bytes( - result.cleaned_df, - encoding=out_encoding, - delimiter=out_delim, - line_terminator=out_le, - ) - if encode_warn: - st.warning(encode_warn) - - ext = "tsv" if out_delim == "\t" else "csv" - mime = "text/tab-separated-values" if out_delim == "\t" else "text/csv" - file_name = f"{Path(upload_name).stem}.normalized.{ext}" - - st.download_button( - f"⬇️ Download {file_name}", - data=data, - file_name=file_name, - mime=mime, - type="primary", - ) diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py index dd0a1f9..2198e75 100644 --- a/src/gui/pages/1_Deduplicator.py +++ b/src/gui/pages/1_Deduplicator.py @@ -23,14 +23,12 @@ from src.gui.components import ( match_group_card, pickup_or_upload, require_feature_or_render_upgrade, - require_normalization_gate, results_summary, ) from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR) -require_normalization_gate() # --------------------------------------------------------------------------- # Session state defaults diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index 60c34ce..bcf7704 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -19,7 +19,6 @@ from src.gui.components import ( pickup_or_upload, render_hidden_aware_preview, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.license import FeatureFlag from src.core.text_clean import ( @@ -32,7 +31,6 @@ from src.core.text_clean import ( hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER) -require_normalization_gate() # --------------------------------------------------------------------------- diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py index 76f9ae0..1406cd6 100644 --- a/src/gui/pages/3_Format_Standardizer.py +++ b/src/gui/pages/3_Format_Standardizer.py @@ -18,7 +18,6 @@ from src.gui.components import ( hide_streamlit_chrome, pickup_or_upload, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.core.format_standardize import ( PRESETS, @@ -30,7 +29,6 @@ from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER) -require_normalization_gate() # --------------------------------------------------------------------------- diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py index 08baf0d..62d07f6 100644 --- a/src/gui/pages/4_Missing_Values.py +++ b/src/gui/pages/4_Missing_Values.py @@ -18,7 +18,6 @@ from src.gui.components import ( hide_streamlit_chrome, pickup_or_upload, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.core.missing import ( DEFAULT_SENTINELS, @@ -31,7 +30,6 @@ from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER) -require_normalization_gate() # --------------------------------------------------------------------------- diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py index 9ba1e68..6146623 100644 --- a/src/gui/pages/5_Column_Mapper.py +++ b/src/gui/pages/5_Column_Mapper.py @@ -18,7 +18,6 @@ from src.gui.components import ( hide_streamlit_chrome, pickup_or_upload, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.core.column_mapper import ( MapOptions, @@ -32,7 +31,6 @@ from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER) -require_normalization_gate() # --------------------------------------------------------------------------- diff --git a/src/gui/pages/6_Outlier_Detector.py b/src/gui/pages/6_Outlier_Detector.py index 57b01f7..19455d3 100644 --- a/src/gui/pages/6_Outlier_Detector.py +++ b/src/gui/pages/6_Outlier_Detector.py @@ -14,13 +14,11 @@ if str(_project_root) not in sys.path: from src.gui.components import ( hide_streamlit_chrome, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR) -require_normalization_gate() # --------------------------------------------------------------------------- # Header diff --git a/src/gui/pages/7_Multi_File_Merger.py b/src/gui/pages/7_Multi_File_Merger.py index 50f9fff..0c7ce2f 100644 --- a/src/gui/pages/7_Multi_File_Merger.py +++ b/src/gui/pages/7_Multi_File_Merger.py @@ -14,13 +14,11 @@ if str(_project_root) not in sys.path: from src.gui.components import ( hide_streamlit_chrome, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER) -require_normalization_gate() # --------------------------------------------------------------------------- # Header diff --git a/src/gui/pages/8_Validator_Reporter.py b/src/gui/pages/8_Validator_Reporter.py index f94ce3d..7372b7a 100644 --- a/src/gui/pages/8_Validator_Reporter.py +++ b/src/gui/pages/8_Validator_Reporter.py @@ -14,13 +14,11 @@ if str(_project_root) not in sys.path: from src.gui.components import ( hide_streamlit_chrome, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER) -require_normalization_gate() # --------------------------------------------------------------------------- # Header diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py index 9f58e04..1ca6b20 100644 --- a/src/gui/pages/9_Pipeline_Runner.py +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -18,7 +18,6 @@ from src.gui.components import ( hide_streamlit_chrome, pickup_or_upload, require_feature_or_render_upgrade, - require_normalization_gate, ) from src.core.pipeline import ( Pipeline, @@ -33,7 +32,6 @@ from src.license import FeatureFlag hide_streamlit_chrome() require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER) -require_normalization_gate() # --------------------------------------------------------------------------- diff --git a/src/gui/tools_registry.py b/src/gui/tools_registry.py index 5e7dc89..50a9db9 100644 --- a/src/gui/tools_registry.py +++ b/src/gui/tools_registry.py @@ -22,10 +22,9 @@ from typing import Literal Tier = Literal["core", "pro", "enterprise"] Status = Literal["Ready", "Coming Soon"] -# Sidebar grouping. The Review gate is its own section; cleaners, -# transformations, and automations group the tools by what the user is -# trying to accomplish rather than by implementation detail. -Section = Literal["review", "cleaners", "transformations", "automations"] +# Sidebar grouping. Tools are bucketed by what the user is trying to +# accomplish rather than by implementation detail. +Section = Literal["cleaners", "transformations", "automations"] @dataclass(frozen=True) @@ -152,7 +151,6 @@ TOOLS: list[Tool] = [ # Display labels for each sidebar section. Kept here so i18n falls back # to a sensible English string if a translation pack is missing the key. SECTION_LABELS: dict[Section, str] = { - "review": "Data Review", "cleaners": "Data Cleaners", "transformations": "Transformations", "automations": "Automations", diff --git a/tests/gui/conftest.py b/tests/gui/conftest.py index 5e98ef9..75f0e7c 100644 --- a/tests/gui/conftest.py +++ b/tests/gui/conftest.py @@ -109,49 +109,19 @@ def app_factory(): # --------------------------------------------------------------------------- def stash_upload(app: AppTest, *, name: str, data: bytes) -> str: - """Pre-populate the home-screen upload stash + the gate's normalisation - result so a tool page renders past ``require_normalization_gate()``. + """Pre-populate the home-screen upload stash so a tool page renders + as if the user had uploaded *name* / *data* on the home screen. - Returns the SHA-256 hex of *data* (used as the gate key) in case the - test wants to assert against it. - - The gate checks:: - - - ``home_uploaded_bytes`` is set - - ``normalization_for == sha256(home_uploaded_bytes)`` - - ``normalization_result.passed is True`` - - We synthesise a passing result via a tiny stub object that satisfies - the gate's only attribute access (``.passed``). Tests that want to - exercise gate-blocking behaviour should NOT call this helper — they - should stash bytes without the normalisation result. + Returns the SHA-256 hex of *data* in case the test wants to assert + against it. """ sha = hashlib.sha256(data).hexdigest() app.session_state["home_uploaded_bytes"] = data app.session_state["home_uploaded_name"] = name app.session_state["home_uploaded_size"] = len(data) - app.session_state["normalization_for"] = sha - app.session_state["normalization_result"] = _PassedGateResult() return sha -class _PassedGateResult: - """Minimal stand-in for the real NormalizationResult shape — the gate - only reads ``.passed``. Using a real NormalizationResult here would - pull in core.normalize and tie GUI tests to its constructor surface. - """ - - passed: bool = True - - -def stash_upload_without_gate(app: AppTest, *, name: str, data: bytes) -> None: - """Stash the upload bytes but do NOT pre-pass the gate. Used by gate - tests that want the warning + Go-to-Review button to appear.""" - app.session_state["home_uploaded_bytes"] = data - app.session_state["home_uploaded_name"] = name - app.session_state["home_uploaded_size"] = len(data) - - # --------------------------------------------------------------------------- # i18n helpers # --------------------------------------------------------------------------- diff --git a/tests/gui/test_gate.py b/tests/gui/test_gate.py deleted file mode 100644 index d1a9339..0000000 --- a/tests/gui/test_gate.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Gate tests — ``require_normalization_gate()`` behaviour. - -The gate sits between every tool page and the user's data. Three states -exist, each pinned here: - -1. **No upload** — gate is a no-op; the page proceeds and its own - uploader handles the file. -2. **Upload but no normalization result** — gate shows a warning and a - "Go to Review & Normalize" button, then ``st.stop()`` short-circuits - the rest of the page. -3. **Upload + matching passed normalization** — gate is a no-op; the - page proceeds. - -We exercise the gate via the Find Duplicates page (any tool page would -work; dedup is the smallest one that doesn't depend on heavy widgets). -""" - -from __future__ import annotations - -import pytest - -from .conftest import ( - collected_text, - stash_upload, - stash_upload_without_gate, - with_language, -) - - -# Find Duplicates is our canary — it calls ``require_normalization_gate`` -# on the second line of the module. If the gate blocks, the dedup- -# specific title shouldn't even render. -GATED_PAGE = "1_Deduplicator" - - -class TestGateNoUpload: - """No upload → the gate exits early and the page renders normally, - showing its own file uploader. (This is the "user opened the dedup - page first instead of coming from home" path.)""" - - def test_no_upload_lets_page_render(self, app_factory): - app = app_factory(GATED_PAGE) - app.run() - assert not app.exception - text = collected_text(app) - # The dedup page title is the unambiguous signal that the gate - # didn't short-circuit. - assert "Find Duplicates" in text - - def test_no_upload_no_gate_warning(self, app_factory): - app = app_factory(GATED_PAGE) - app.run() - # The gate's warning string starts with the upload filename. No - # warning should be present when there's no upload. - for w in app.warning: - assert "normalization gate" not in (w.body or "") - - -class TestGateBlocksWithoutNormalization: - """Upload present but no passing normalization → gate fires: - warning + Go-to-Review button + page short-circuit.""" - - def test_gate_warning_renders(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - stash_upload_without_gate(app, name="messy.csv", data=small_csv_bytes) - app.run() - warnings = [w.body for w in app.warning if w.body] - joined = " ".join(warnings) - assert "normalization gate" in joined, ( - f"expected gate warning; got warnings: {warnings}" - ) - assert "messy.csv" in joined, ( - "gate warning should name the offending file" - ) - - def test_gate_renders_go_to_review_button(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - stash_upload_without_gate(app, name="messy.csv", data=small_csv_bytes) - app.run() - labels = [b.label for b in app.button] - assert any("Review & Normalize" in lbl for lbl in labels), ( - f"missing 'Go to Review & Normalize' button; got: {labels}" - ) - - def test_gate_short_circuits_page(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - stash_upload_without_gate(app, name="messy.csv", data=small_csv_bytes) - app.run() - # When the gate fires it calls ``st.stop()`` after the warning. - # The page-body widgets (e.g., the advanced-options expander, the - # dedup-strategy widgets) must NOT be present. - labels = [b.label for b in app.button] - # The Run-Dedup primary action lives below the gate — make sure - # the gate killed the render before it. - assert not any("Run Deduplication" in lbl for lbl in labels), ( - f"gate failed to short-circuit; saw button: {labels}" - ) - - def test_gate_warning_localizes_to_spanish(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - with_language(app, "es") - stash_upload_without_gate(app, name="messy.csv", data=small_csv_bytes) - app.run() - warnings = " ".join(w.body for w in app.warning if w.body) - # Spanish pack: ``debe pasar la verificación de normalización CSV``. - assert "normalización" in warnings - - def test_gate_button_localizes_to_spanish(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - with_language(app, "es") - stash_upload_without_gate(app, name="messy.csv", data=small_csv_bytes) - app.run() - labels = [b.label for b in app.button] - assert any("Revisar y Normalizar" in lbl for lbl in labels), ( - f"Spanish gate button missing; got: {labels}" - ) - - -class TestGateAllowsWithPassedNormalization: - """Upload + passed normalization → gate is a no-op and the page - renders past the gate.""" - - def test_passed_gate_lets_page_render(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - stash_upload(app, name="messy.csv", data=small_csv_bytes) - app.run() - assert not app.exception, f"page raised past gate: {app.exception}" - # The pickup banner uses the upload name — that's our signal - # that the gate let us through AND the pickup helper engaged. - text = collected_text(app) - assert "messy.csv" in text - - -class TestGateMismatchedHash: - """Upload changes (different bytes) but normalization_for still - points at the old hash → gate fires again because the result is - stale. Pins the security-relevant "stale fix doesn't carry over to - a new file" invariant.""" - - def test_stale_normalization_blocks_new_upload(self, app_factory, small_csv_bytes): - app = app_factory(GATED_PAGE) - # Stash bytes A but a normalization_for hash that points at B. - app.session_state["home_uploaded_bytes"] = small_csv_bytes - app.session_state["home_uploaded_name"] = "new.csv" - app.session_state["home_uploaded_size"] = len(small_csv_bytes) - app.session_state["normalization_for"] = "different-hash-from-an-old-upload" - - # A passed-result object exists but is keyed to a different file. - class _Passed: - passed = True - app.session_state["normalization_result"] = _Passed() - - app.run() - warnings = " ".join(w.body for w in app.warning if w.body) - assert "normalization gate" in warnings, ( - "stale gate result should not unlock a new upload" - ) diff --git a/tests/gui/test_smoke.py b/tests/gui/test_smoke.py index 26458dc..7fa8c10 100644 --- a/tests/gui/test_smoke.py +++ b/tests/gui/test_smoke.py @@ -25,7 +25,6 @@ from .conftest import collected_text, with_language # Every page that ships in the sidebar nav. Slugs match the filenames # under ``src/gui/pages/`` so failures point at a real file. PAGE_SLUGS = [ - "0_Review", "1_Deduplicator", "2_Text_Cleaner", "3_Format_Standardizer", @@ -53,7 +52,6 @@ PAGE_SLUGS = [ # When a page gains real Spanish translation, flip its 'es' entry to # the localized substring — the test surface stays the same. EXPECTED_SUBSTRINGS: dict[str, dict[str, str]] = { - "0_Review": {"en": "Review", "es": "Review"}, "1_Deduplicator": {"en": "Find Duplicates", "es": "Find Duplicates"}, "2_Text_Cleaner": {"en": "Clean Text", "es": "Clean Text"}, "3_Format_Standardizer": {"en": "Standardize", "es": "Standardize"}, diff --git a/tests/gui/test_workflows.py b/tests/gui/test_workflows.py index 5a0f280..8c4a2e5 100644 --- a/tests/gui/test_workflows.py +++ b/tests/gui/test_workflows.py @@ -151,50 +151,6 @@ class TestPipelineRunnerWorkflow: assert "Automated Workflows" in text -# --------------------------------------------------------------------------- -# Review page — special: doesn't gate on upload, has its own analyzer flow -# --------------------------------------------------------------------------- - -class TestReviewWorkflow: - """The Review page is the gate-fixer. Without an upload it shows - its own file uploader so the user can start the flow from this - page directly. With an upload it runs the analyzer and shows - findings.""" - - def test_no_upload_shows_inline_uploader(self, app_factory): - app = app_factory("0_Review") - app.run() - text = collected_text(app) - # Page should invite the user to upload, not redirect home. - assert "Upload" in text or "Choose a file" in text, ( - f"Review page should expose an inline uploader; got:\n{text[:400]}" - ) - # The 'Back to home' button is gone — the page is self-contained now. - labels = [b.label for b in app.button] - assert not any("Back to home" in lbl for lbl in labels), ( - f"Back-to-home button should be removed; got buttons: {labels}" - ) - - def test_with_upload_shows_review_content( - self, app_factory, small_csv_bytes, - ): - app = app_factory("0_Review") - # Review page only needs the upload bytes, not a pre-passed gate. - app.session_state["home_uploaded_bytes"] = small_csv_bytes - app.session_state["home_uploaded_name"] = "messy.csv" - app.session_state["home_uploaded_size"] = len(small_csv_bytes) - app.run() - assert not app.exception - text = collected_text(app) - # Page ran the analyzer — either we get findings or the - # "already clean" success message. Either way confirms the - # analyzer pipeline ran end-to-end with the stashed bytes. - clean_msg = "No findings to review" in text - encoding_section = "File encoding" in text - assert clean_msg or encoding_section, ( - f"Review page didn't surface analyzer output; got:\n{text[:400]}" - ) - # --------------------------------------------------------------------------- # Coming-Soon pages still render (just a stub) — pinned so we know if a