refactor(gui): drop Review page + normalization gate

Home is now the only entry point: the "Run analysis" button on the
upload section IS the review step (findings render inline via
render_findings_panel). Tool pages no longer gate on a passed
normalization — running the analyzer is sufficient context.

Removed:
- src/gui/pages/0_Review.py
- src/gui/components/gate.py (re-export seam)
- require_normalization_gate() in src/gui/components/_legacy.py
- "review" section enum in tools_registry.py
- Data Review entry in app.py navigation
- require_normalization_gate() calls + imports in all nine tool pages
- tests/gui/test_gate.py (whole file)
- TestReviewWorkflow in tests/gui/test_workflows.py
- 0_Review entry in tests/gui/test_smoke.py PAGE_SLUGS
- stash_upload's normalization_result+normalization_for stashing
- stash_upload_without_gate (was the gate's negative-path helper)

2017 tests pass (16 retired with the gate flow).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-16 20:04:33 +00:00
parent fc6c22c6a7
commit dad744f17f
19 changed files with 11 additions and 1044 deletions

View File

@@ -75,7 +75,7 @@ def _home_page() -> None:
# Group tool cards by sidebar section so the home grid mirrors the
# left-nav layout — same vocabulary, same ordering.
sections: list[tuple[str, list]] = []
for section in ("review", "cleaners", "transformations", "automations"):
for section in ("cleaners", "transformations", "automations"):
tools = [tool for tool in TOOLS if tool.section == section]
if not tools:
continue
@@ -151,7 +151,6 @@ def _page_for(tool_id: str, *, page_slug: str, icon: str, title: str) -> "st.Pag
def _build_navigation() -> dict[str, list]:
by_section: dict[str, list] = {
"review": [],
"cleaners": [],
"transformations": [],
"automations": [],
@@ -166,16 +165,6 @@ def _build_navigation() -> dict[str, list]:
)
)
# The Review gate has no entry in the registry (it isn't a "tool")
# so register it by hand at the top of its section.
review_page = st.Page(
"pages/0_Review.py",
title=_t("nav.review_page_title") or "Review",
icon="🛡️",
url_path="review",
)
by_section["review"].insert(0, review_page)
home = st.Page(
_home_page,
title=_t("nav.home_page_title") or "Home",
@@ -199,7 +188,6 @@ def _build_navigation() -> dict[str, list]:
account_header = _t("nav.section_account") or "Account"
return {
"": [home],
section_label("review"): by_section["review"],
section_label("cleaners"): by_section["cleaners"],
section_label("transformations"): by_section["transformations"],
section_label("automations"): by_section["automations"],

View File

@@ -11,14 +11,13 @@ they need without dragging the entire kitchen-sink module:
components/
__init__.py ← compatibility shim (this file)
_legacy.py ← original components.py, unchanged
gate.py ← gate-only seam (require_normalization_gate)
findings.py ← analyzer-finding rendering seam
dedup_review.py ← dedup match-group cards + review pipeline
shared.py ← chrome / file-pickup helpers used by every tool
A standalone Find Duplicates build, for example, can ship without
``findings.py`` and ``gate.py`` — those modules import the analyzer /
gate code that the Lite SKU does not include.
``findings.py`` — that module imports the analyzer code that the
Lite SKU does not include.
Adding new tooling: drop new helpers into the appropriate seam module.
Add their names to its ``__all__`` and to this file's ``__all__`` if
@@ -46,11 +45,10 @@ from .activation import ( # noqa: F401 re-exported
)
__all__ = [
# Shared chrome / pickup / gate
# Shared chrome / pickup
"hide_streamlit_chrome",
"quit_button",
"pickup_or_upload",
"require_normalization_gate",
# License gate + activation form
"render_activation_form",
"render_license_status_sidebar",

View File

@@ -1264,45 +1264,6 @@ class _StashedUpload:
return self._data
def require_normalization_gate() -> None:
    """Halt the calling tool page unless the current upload passed the gate.

    Meant to be invoked by a tool page right after its imports. The page is
    allowed to continue only when all of the following hold:

    * no upload is stashed in the session yet (the page's own uploader
      takes over), or
    * a ``normalization_result`` exists, ``normalization_for`` equals the
      SHA-256 of the stashed upload bytes, and the result's ``passed``
      attribute is truthy.

    Otherwise a warning banner and a button that switches to the Review
    page are rendered, and the rest of the page is cut off with
    ``st.stop()``. Pages that do not need the gate simply do not call this.
    """
    import hashlib

    uploaded = st.session_state.get("home_uploaded_bytes")
    if uploaded is None:
        # Nothing uploaded yet, so there is nothing to gate on.
        return

    digest = hashlib.sha256(uploaded).hexdigest()
    outcome = st.session_state.get("normalization_result")
    if (
        outcome is not None
        and st.session_state.get("normalization_for") == digest
        and getattr(outcome, "passed", False)
    ):
        # The stored result belongs to this exact upload and it passed.
        return

    display_name = (
        st.session_state.get("home_uploaded_name") or _t("gate.default_name")
    )
    st.warning(_t("gate.warning", name=display_name))
    if st.button(_t("gate.open_review"), type="primary"):
        st.switch_page("pages/0_Review.py")
    # Short-circuit everything below the call site on the tool page.
    st.stop()
def pickup_or_upload(
*,
label: str,

View File

@@ -1,16 +0,0 @@
"""Normalization-gate guard for tool pages.
``require_normalization_gate`` short-circuits a tool page when the
current upload has not yet passed the gate, redirecting the user to the
Review & Normalize page. Pulled into its own seam module so:
* A build that includes the gate (Pro / Suite SKUs) imports this.
* A standalone single-tool build that bypasses the gate can omit this
module entirely without removing the helper from a shared file.
"""
from __future__ import annotations
from ._legacy import require_normalization_gate
__all__ = ["require_normalization_gate"]

View File

@@ -1,711 +0,0 @@
"""Review & normalize gate page.
Sits between the home-page upload and every tool page. Walks the user
through every analyzer finding, lets them auto-fix, preview, customize,
or skip each one, and produces a :class:`NormalizationResult` stashed in
session state. Tool pages refuse to load until this gate has passed.
State contract
--------------
Session state read:
* ``home_uploaded_bytes`` / ``home_uploaded_name`` — current upload.
* ``home_findings`` — list of :class:`Finding` from the home-page scan.
* ``review_decisions`` — dict[finding_id, Decision]; user's choices so far.
Session state written:
* ``review_decisions`` — updated as the user flips controls.
* ``normalization_result`` — :class:`NormalizationResult` after Apply.
* ``normalization_for`` — content hash of the upload the result is for.
"""
from __future__ import annotations
import hashlib
import io
import sys
from pathlib import Path
from typing import Optional
import pandas as pd
import streamlit as st
# Project root on sys.path (mirrors app.py).
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.core.analyze import Finding, analyze
from src.core.fixes import get_fix
from src.core.io import detect_encoding, repair_bytes
from src.core.normalize import (
Decision,
NormalizationResult,
apply_decisions,
auto_fix,
gate_summary,
is_normalized,
)
from src.gui.components import hide_streamlit_chrome
# Options shown in the "File encoding" picker.
#
# Common single-byte and multi-byte encodings the user might pick to
# correct a misdetection. Ordered by frequency in real-world Western /
# multilingual data; keep the list short — too many options just adds
# noise. The user can type a custom encoding via the "Other" entry.
#
# The first and last entries are sentinels, not codec names: the picker
# maps "(detected)" to "no override" and "Other…" to the free-text field.
_OVERRIDE_ENCODINGS = [
"(detected)",
"utf-8",
"utf-8-sig",
"cp1252",
"iso-8859-1",
"iso-8859-15",
"cp1250",
"iso-8859-2",
"cp1251",
"koi8-r",
"mac-roman",
"shift_jis",
"cp932",
"gb18030",
"big5",
"euc-kr",
"cp949",
"utf-16",
"utf-16-le",
"utf-16-be",
"Other…",
]
st.set_page_config(page_title="Review & Normalize", page_icon="🛡️", layout="wide")
hide_streamlit_chrome()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _upload_hash() -> Optional[str]:
    """Return the SHA-256 hex digest of the session upload.

    Yields ``None`` when ``home_uploaded_bytes`` is missing or empty.
    """
    payload = st.session_state.get("home_uploaded_bytes")
    return hashlib.sha256(payload).hexdigest() if payload else None
def _detected_encoding_for_session() -> Optional[str]:
    """Run charset detection on the session upload via a temporary file.

    The bytes stored under ``home_uploaded_bytes`` are written to a
    temporary file whose suffix mirrors the uploaded file's extension
    (``.csv`` when the name has none), and that path is handed to
    ``detect_encoding``.

    Returns:
        Whatever ``detect_encoding`` returns for the temp file, or ``None``
        when no (or empty) upload bytes are present in the session.
    """
    data = st.session_state.get("home_uploaded_bytes")
    name = st.session_state.get("home_uploaded_name") or "tmp.csv"
    if not data:
        return None

    import tempfile

    suffix = "." + name.rsplit(".", 1)[-1] if "." in name else ".csv"
    fh = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = Path(fh.name)
    try:
        # The write lives inside the try so a failed write (e.g. disk full)
        # still removes the delete=False temp file instead of leaking it.
        with fh:
            fh.write(data)
        # The handle is closed before the path is re-opened by the detector.
        return detect_encoding(tmp_path)
    finally:
        tmp_path.unlink(missing_ok=True)
def _load_df_from_session(encoding_override: Optional[str] = None) -> Optional[pd.DataFrame]:
    """Parse the session upload into an all-string DataFrame.

    Excel uploads (``xlsx`` / ``xls``) are read directly. Everything else
    is treated as delimited text: ``.tsv`` files use a tab, other files
    start from a comma and switch to the first of tab / semicolon / pipe
    whose count in the first 4096 bytes exceeds 1.5x the comma count. The
    bytes are passed through ``repair_bytes`` with *encoding_override*
    (UTF-8 when unset) and the repaired bytes are then read as UTF-8.

    Returns ``None`` when the session holds no upload bytes.
    """
    raw = st.session_state.get("home_uploaded_bytes")
    filename = st.session_state.get("home_uploaded_name") or ""
    if not raw:
        return None

    # Text after the last dot, lower-cased; empty when the name has no dot.
    _, dot, tail = filename.rpartition(".")
    ext = tail.lower() if dot else ""

    if ext in ("xlsx", "xls"):
        return pd.read_excel(io.BytesIO(raw), dtype=str, keep_default_na=False)

    if ext == "tsv":
        sep = "\t"
    else:
        sample = raw[:4096].decode("utf-8", errors="replace")
        comma_threshold = sample.count(",") * 1.5
        # First candidate that clearly outnumbers commas wins; else comma.
        sep = next(
            (
                cand
                for cand in ("\t", ";", "|")
                if sample.count(cand) > comma_threshold
            ),
            ",",
        )

    source_encoding = encoding_override or "utf-8"
    fixed = repair_bytes(raw, encoding=source_encoding, delimiter=sep)
    return pd.read_csv(
        io.BytesIO(fixed.repaired_bytes),
        encoding="utf-8",
        delimiter=sep,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
def _run_analysis_with_override(encoding_override: Optional[str]) -> list[Finding]:
    """Re-run ``analyze()`` on the session upload with an encoding override.

    The upload bytes are written to a temporary file (suffix taken from the
    uploaded name, ``.csv`` when it has no extension) so that ``analyze()``
    is called with a path and receives *encoding_override* as a keyword.

    Args:
        encoding_override: Encoding name forwarded to ``analyze``; ``None``
            forwards no override.

    Returns:
        The findings returned by ``analyze``, or an empty list when the
        session holds no (or empty) upload bytes.
    """
    data = st.session_state.get("home_uploaded_bytes")
    name = st.session_state.get("home_uploaded_name") or "tmp.csv"
    if not data:
        return []

    import tempfile

    suffix = "." + name.rsplit(".", 1)[-1] if "." in name else ".csv"
    fh = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = Path(fh.name)
    try:
        # The write lives inside the try so a failed write (e.g. disk full)
        # still removes the delete=False temp file instead of leaking it.
        with fh:
            fh.write(data)
        # The handle is closed before analyze() re-opens the path.
        return analyze(tmp_path, encoding_override=encoding_override)
    finally:
        tmp_path.unlink(missing_ok=True)
def _confidence_pill(c: str) -> str:
"""Streamlit-markdown pill for the confidence tier."""
palette = {"high": "green", "medium": "orange", "low": "red"}
return f":{palette.get(c, 'gray')}-background[**{c.upper()}**]"
def _severity_pill(s: str) -> str:
palette = {"info": "blue", "warn": "orange", "error": "red"}
return f":{palette.get(s, 'gray')}-background[**{s}**]"
# ---------------------------------------------------------------------------
# Output options (Advanced — re-encode the cleaned DataFrame for download)
# ---------------------------------------------------------------------------
# Choices for the "Advanced output options" selectboxes. Each table is a
# list of pairs; the first entry of each table is the selectbox default
# (the selectboxes are created with index=0).
#
# (label_shown_to_user, codec_passed_to_pandas)
# NOTE(review): the codec is actually applied via ``str.encode`` in
# ``_build_output_bytes`` rather than handed to pandas — confirm and
# reword if so.
_OUTPUT_ENCODINGS = [
("UTF-8 (recommended)", "utf-8"),
("UTF-8 with BOM (Excel)", "utf-8-sig"),
("Windows-1252 (Western Europe)", "cp1252"),
("ISO-8859-1 / Latin-1", "iso-8859-1"),
("ISO-8859-15 / Latin-9", "iso-8859-15"),
("Windows-1250 (Central Europe)", "cp1250"),
("ISO-8859-2 / Latin-2", "iso-8859-2"),
("Windows-1251 (Cyrillic)", "cp1251"),
("Shift_JIS (Japanese)", "shift_jis"),
("GB18030 (Chinese)", "gb18030"),
("Big5 (Traditional Chinese)", "big5"),
("EUC-KR (Korean)", "euc-kr"),
# NOTE(review): Python's "utf-16" codec writes a BOM in the platform's
# native byte order; the "LE" in the label presumably assumes a
# little-endian host — confirm.
("UTF-16 LE with BOM", "utf-16"),
]
# (label_shown_to_user, separator_character)
_OUTPUT_DELIMITERS = [
("Comma ,", ","),
("Tab \\t", "\t"),
("Semicolon ;", ";"),
("Pipe |", "|"),
]
# (label_shown_to_user, line_terminator_string)
_OUTPUT_LINE_TERMINATORS = [
("LF — \\n (Unix / web / git default)", "\n"),
("CRLF — \\r\\n (Windows / classic Excel)", "\r\n"),
("CR — \\r (classic Mac, very rare)", "\r"),
]
def _build_output_bytes(
df: pd.DataFrame,
*,
encoding: str,
delimiter: str,
line_terminator: str,
) -> tuple[bytes, Optional[str]]:
"""Serialize *df* with the user's output options.
Returns ``(bytes, error_message)``. ``error_message`` is non-None when
the chosen encoding cannot represent at least one cell — characters
that don't exist in the target codepage are replaced with ``?`` so
the user still gets a download, plus a warning telling them which
target was lossy.
"""
buf = io.StringIO()
df.to_csv(buf, index=False, sep=delimiter, lineterminator=line_terminator)
text = buf.getvalue()
try:
return text.encode(encoding), None
except UnicodeEncodeError:
# Find the first character that fails so the message is useful.
bad: Optional[str] = None
for ch in text:
try:
ch.encode(encoding)
except UnicodeEncodeError:
bad = ch
break
msg = (
f"Some characters cannot be represented in {encoding}"
+ (f" (first offender: {bad!r})" if bad else "")
+ ". Falling back to '?' replacement; non-Latin content will be lost."
)
return text.encode(encoding, errors="replace"), msg
def _preview_table(f: Finding, decision_action: str, payload: Optional[dict]) -> Optional[pd.DataFrame]:
"""Build a before/after preview from finding samples.
Runs the registered fix function on each sample value individually so
the user sees exactly what would change. Returns None when no preview
is meaningful (no samples, or no fix registered).
"""
if not f.samples:
return None
fix_fn = get_fix(f.fix_action)
if fix_fn is None:
# No fix to preview; show samples as-is.
return pd.DataFrame(
[{"row": r, "column": c, "value": v} for r, c, v in f.samples]
)
rows = []
for r, col, val in f.samples:
# Run the fix on a tiny single-cell DataFrame so payload semantics
# (e.g. lowercase_email's column targeting) are honored.
mini = pd.DataFrame({col: [val]})
try:
new_df, _ = fix_fn(mini, payload)
new_val = new_df[col].iloc[0]
except Exception as e:
new_val = f"<preview error: {e}>"
rows.append({"row": r, "column": col, "before": val, "after": new_val})
return pd.DataFrame(rows)
# ---------------------------------------------------------------------------
# Page body
# ---------------------------------------------------------------------------
st.title("🛡️ Review & Normalize")
st.caption(
"Every finding is shown below with the algorithm that would fix it. "
"Auto-fix the high-confidence ones in one click; preview or customize "
"the rest before applying."
)
# Pre-flight: if nothing has been uploaded yet, let the user upload
# directly from this page instead of bouncing them back to the home
# screen. Once a file is picked, we auto-run the analyzer (the user is
# already on the Review page — they've implicitly committed to a scan),
# stash the result, and rerun so the rest of the page picks it up.
findings: list[Finding] = st.session_state.get("home_findings") or []
upload_name = st.session_state.get("home_uploaded_name")
if not upload_name:
st.info(
"Upload a CSV or Excel file to begin reviewing. The analyzer runs "
"locally and your data never leaves this computer."
)
review_upload = st.file_uploader(
"Choose a file",
type=["csv", "tsv", "xlsx", "xls"],
key="review_upload",
help="Drag-and-drop or browse for a CSV, TSV, or Excel file.",
)
if review_upload is None:
st.stop()
# New file → stash bytes + size + name, drop any stale state, then
# run the analyzer. The rerun at the bottom lets the rest of this
# page render with the upload in place.
same_file = (
st.session_state.get("home_uploaded_name") == review_upload.name
and st.session_state.get("home_uploaded_size") == review_upload.size
)
if not same_file:
st.session_state["home_uploaded_name"] = review_upload.name
st.session_state["home_uploaded_size"] = review_upload.size
st.session_state["home_uploaded_bytes"] = review_upload.getvalue()
st.session_state.pop("home_findings", None)
st.session_state.pop("home_skipped", None)
st.session_state.pop("review_decisions", None)
st.session_state.pop("normalization_result", None)
st.session_state.pop("normalization_for", None)
st.session_state.pop("encoding_override", None)
if st.session_state.get("home_findings") is None:
with st.spinner("Analyzing…"):
st.session_state["home_findings"] = _run_analysis_with_override(None)
st.session_state["home_skipped"] = False
st.rerun()
# ---- Encoding picker --------------------------------------------------------
#
# Charset detection misfires on small files, byte-equivalent codepages
# (cp1252 vs Latin-1 vs cp1250), and content where every byte happens to
# decode under the wrong encoding (KOI8-R bytes that look like Shift_JIS).
# When the user spots mojibake or U+FFFD chars in the findings list, this
# picker is the escape hatch — pick the right encoding, re-run the analyzer.
with st.container(border=True):
detected_enc = _detected_encoding_for_session()
current_override = st.session_state.get("encoding_override")
suffix = (st.session_state.get("home_uploaded_name") or "")
suffix = suffix.rsplit(".", 1)[-1].lower() if "." in suffix else ""
is_excel = suffix in ("xlsx", "xls")
st.markdown("**File encoding**")
if is_excel:
st.caption(
"Excel files store text as Unicode internally — encoding override "
"doesn't apply. Skip this section."
)
else:
cap_parts = [f"Detected: `{detected_enc or 'unknown'}`"]
if current_override:
cap_parts.append(f"Currently using: `{current_override}`")
st.caption(
" · ".join(cap_parts)
+ " · Override only if you see mojibake (e.g. `é` for `é`) or U+FFFD"
" (`<60>`) in the findings below."
)
col_pick, col_custom, col_apply = st.columns([2, 2, 1])
with col_pick:
current_label = current_override or "(detected)"
try:
idx = _OVERRIDE_ENCODINGS.index(current_label)
except ValueError:
idx = _OVERRIDE_ENCODINGS.index("Other…")
chosen = st.selectbox(
"Encoding",
options=_OVERRIDE_ENCODINGS,
index=idx,
key="encoding_override_select",
label_visibility="collapsed",
)
custom_value: Optional[str] = None
with col_custom:
if chosen == "Other…":
custom_value = st.text_input(
"Custom encoding (e.g. `cp1257`, `iso-8859-9`)",
value=current_override if current_override and current_override not in _OVERRIDE_ENCODINGS else "",
key="encoding_override_custom",
label_visibility="collapsed",
placeholder="cp1257",
)
with col_apply:
if st.button("Re-analyze", use_container_width=True):
if chosen == "(detected)":
new_override = None
elif chosen == "Other…":
new_override = (custom_value or "").strip() or None
else:
new_override = chosen
# Sanity-check the override actually decodes the bytes.
data = st.session_state.get("home_uploaded_bytes") or b""
if new_override is not None:
try:
data.decode(new_override, errors="strict")
decode_ok = True
decode_err = None
except (UnicodeDecodeError, LookupError) as e:
decode_ok = False
decode_err = str(e)
else:
decode_ok = True
decode_err = None
if not decode_ok:
st.warning(
f"`{new_override}` cannot decode this file: {decode_err}. "
f"Re-running anyway with replacement-character fallback so "
f"you can see where the failures are."
)
# Re-run analysis with the override and refresh session state.
st.session_state["encoding_override"] = new_override
st.session_state["home_findings"] = _run_analysis_with_override(new_override)
# Drop any prior gate result; the user must re-apply.
st.session_state.pop("normalization_result", None)
st.session_state.pop("normalization_for", None)
st.session_state.pop("review_decisions", None)
st.rerun()
# Reload findings — the picker above may have just rewritten them.
findings = st.session_state.get("home_findings") or []
if not findings:
st.success("✓ No findings to review. The file is already clean — open any tool to begin.")
st.stop()
# ---- Top-line counters -------------------------------------------------------
n_high = sum(1 for f in findings if f.confidence == "high" and not f.pre_applied and f.fix_action)
n_medium = sum(1 for f in findings if f.confidence == "medium" and not f.pre_applied)
n_low = sum(1 for f in findings if f.confidence == "low" and not f.pre_applied)
n_pre = sum(1 for f in findings if f.pre_applied)
n_block = sum(1 for f in findings if f.severity == "error")
c1, c2, c3, c4, c5 = st.columns(5)
c1.metric("High confidence", n_high, help="Round-trip safe — eligible for auto-fix.")
c2.metric("Medium", n_medium, help="Right call in the common case; preview before applying.")
c3.metric("Low", n_low, help="Heuristic — opt in only.")
c4.metric("Already applied", n_pre, help="Fixed during the read pass (BOM, NUL, line endings).")
c5.metric("Blocking", n_block, help="Severity = error; must be resolved or waived.")
st.divider()
# ---- Top-level controls ------------------------------------------------------
decisions_state: dict = st.session_state.setdefault("review_decisions", {})
bar_left, bar_mid, bar_right = st.columns([1.2, 1.2, 3])
with bar_left:
if st.button("✨ Auto-fix high-confidence", type="primary", use_container_width=True):
for f in findings:
if (
not f.pre_applied
and f.confidence == "high"
and f.fix_action
and get_fix(f.fix_action) is not None
):
decisions_state[f.id] = Decision(finding_id=f.id, action="auto")
st.rerun()
with bar_mid:
if st.button("Skip everything (not recommended)", use_container_width=True):
for f in findings:
if not f.pre_applied:
decisions_state[f.id] = Decision(finding_id=f.id, action="skip")
st.rerun()
# ---- Per-finding cards -------------------------------------------------------
# Sort: blocking first, then high (unfixed), medium, low, pre-applied.
def _sort_key(f: Finding) -> tuple:
severity_rank = {"error": 0, "warn": 1, "info": 2}[f.severity]
confidence_rank = {"high": 0, "medium": 1, "low": 2}[f.confidence]
return (int(f.pre_applied), severity_rank, confidence_rank, f.id)
for f in sorted(findings, key=_sort_key):
decision = decisions_state.get(f.id)
decision_action = decision.action if decision else (
"auto" if (f.pre_applied or (f.confidence == "high" and f.fix_action)) else "skip"
)
title_bits = [
_severity_pill(f.severity),
_confidence_pill(f.confidence),
f"**{f.id}**",
f"({f.count})",
]
if f.pre_applied:
title_bits.append(":gray-background[applied during read]")
with st.expander(" ".join(title_bits), expanded=(f.severity == "error")):
st.caption(f.description)
if f.tool:
st.caption(f"Owned by: `{f.tool}`")
if f.pre_applied:
st.info("This was already applied during the file read pass — no decision needed.")
continue
if not f.fix_action:
if f.severity == "error":
st.error(
"Blocking finding with no auto-fix. Choose **Skip / waive** to "
"acknowledge and proceed (not recommended), or fix the file outside "
"DataTools and re-upload."
)
else:
st.info("Informational only — no fix to apply.")
# Decision radio
choice_labels = {
"auto": "Auto-fix with our algorithm",
"skip": "Skip / waive (no change)",
}
# Customize is offered for fixes that take a meaningful payload.
if f.fix_action in ("replace_null_sentinels",):
choice_labels["modified"] = "Customize"
chosen = st.radio(
"Decision",
options=list(choice_labels.keys()),
index=list(choice_labels.keys()).index(decision_action)
if decision_action in choice_labels else 0,
format_func=lambda k: choice_labels[k],
key=f"decision_{f.id}",
horizontal=True,
)
# Customize payload editor (only for the modified action)
payload: Optional[dict] = None
if chosen == "modified" and f.fix_action == "replace_null_sentinels":
default_sentinels = ", ".join(sorted([
"n/a", "na", "nan", "null", "none", "-", "--", "tbd", "unknown",
]))
text = st.text_area(
"Sentinels (comma-separated, case-insensitive):",
value=(decision.payload or {}).get(
"sentinels_raw", default_sentinels,
) if decision else default_sentinels,
key=f"sentinels_{f.id}",
)
sentinels = [s.strip() for s in text.split(",") if s.strip()]
payload = {"sentinels": sentinels, "sentinels_raw": text}
# Persist
decisions_state[f.id] = Decision(
finding_id=f.id, action=chosen, payload=payload,
)
# Preview
if chosen != "skip" and f.samples:
preview = _preview_table(f, chosen, payload)
if preview is not None and not preview.empty:
st.markdown("**Preview** (showing up to 5 affected cells)")
st.dataframe(preview, use_container_width=True, hide_index=True)
st.divider()
# ---- Apply ------------------------------------------------------------------
bottom_left, bottom_mid, bottom_right = st.columns([1, 1, 3])
with bottom_left:
apply_clicked = st.button(
"✅ Apply & enter tools", type="primary", use_container_width=True,
disabled=not decisions_state,
)
with bottom_mid:
reset_clicked = st.button("Reset all decisions", use_container_width=True)
if reset_clicked:
st.session_state.pop("review_decisions", None)
st.session_state.pop("normalization_result", None)
st.session_state.pop("normalization_for", None)
st.rerun()
if apply_clicked:
df = _load_df_from_session(
encoding_override=st.session_state.get("encoding_override")
)
if df is None:
st.error("Could not re-read the uploaded file. Try re-uploading.")
st.stop()
decisions_list = [d for d in decisions_state.values() if isinstance(d, Decision)]
result = apply_decisions(df, findings, decisions_list)
st.session_state["normalization_result"] = result
st.session_state["normalization_for"] = _upload_hash()
summary = gate_summary(result)
if result.passed and is_normalized(findings, result):
st.success(
f"✓ Gate passed — {summary['fixes_applied']} fix(es) applied, "
f"{summary['cells_changed']} cell(s) changed. You can now open any tool."
)
elif result.blocking_findings:
st.error(
f"Gate blocked by error-level findings: "
f"{', '.join(b.id for b in result.blocking_findings)}. "
f"Resolve or waive them above before continuing."
)
elif result.pending_findings:
st.warning(
f"Pending decisions remain on: "
f"{', '.join(f.id for f in result.pending_findings)}. "
f"Choose Auto-fix or Skip for each before continuing."
)
# Persisted summary (re-render on reload)
result: Optional[NormalizationResult] = st.session_state.get("normalization_result")
if result is not None and st.session_state.get("normalization_for") == _upload_hash():
with st.expander("Audit log"):
if result.applied:
st.markdown("**Applied fixes**")
st.dataframe(
pd.DataFrame([
{
"finding": a.finding_id,
"fix_action": a.fix_action,
"decision": a.decision,
"cells_changed": a.cells_changed,
}
for a in result.applied
]),
use_container_width=True, hide_index=True,
)
if result.skipped_findings:
st.markdown("**Skipped (waived by user)**")
st.write([f.id for f in result.skipped_findings])
if result.passed:
st.markdown("---")
st.markdown("**Download normalized file**")
with st.expander("⚙️ Advanced output options"):
st.caption(
"Defaults match what the analyzer normalized to: UTF-8, "
"comma-separated, LF line endings. Override only if your "
"destination tool requires a specific format."
)
col_enc, col_delim, col_le = st.columns(3)
with col_enc:
enc_choice = st.selectbox(
"Encoding (code page)",
options=[label for label, _ in _OUTPUT_ENCODINGS],
index=0,
key="output_encoding_select",
)
out_encoding = next(
codec for label, codec in _OUTPUT_ENCODINGS if label == enc_choice
)
with col_delim:
delim_choice = st.selectbox(
"Delimiter",
options=[label for label, _ in _OUTPUT_DELIMITERS],
index=0,
key="output_delim_select",
)
out_delim = next(
ch for label, ch in _OUTPUT_DELIMITERS if label == delim_choice
)
with col_le:
le_choice = st.selectbox(
"Line terminator",
options=[label for label, _ in _OUTPUT_LINE_TERMINATORS],
index=0,
key="output_le_select",
)
out_le = next(
ch for label, ch in _OUTPUT_LINE_TERMINATORS if label == le_choice
)
data, encode_warn = _build_output_bytes(
result.cleaned_df,
encoding=out_encoding,
delimiter=out_delim,
line_terminator=out_le,
)
if encode_warn:
st.warning(encode_warn)
ext = "tsv" if out_delim == "\t" else "csv"
mime = "text/tab-separated-values" if out_delim == "\t" else "text/csv"
file_name = f"{Path(upload_name).stem}.normalized.{ext}"
st.download_button(
f"⬇️ Download {file_name}",
data=data,
file_name=file_name,
mime=mime,
type="primary",
)

View File

@@ -23,14 +23,12 @@ from src.gui.components import (
match_group_card,
pickup_or_upload,
require_feature_or_render_upgrade,
require_normalization_gate,
results_summary,
)
from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR)
require_normalization_gate()
# ---------------------------------------------------------------------------
# Session state defaults

View File

@@ -19,7 +19,6 @@ from src.gui.components import (
pickup_or_upload,
render_hidden_aware_preview,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.license import FeatureFlag
from src.core.text_clean import (
@@ -32,7 +31,6 @@ from src.core.text_clean import (
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER)
require_normalization_gate()
# ---------------------------------------------------------------------------

View File

@@ -18,7 +18,6 @@ from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.core.format_standardize import (
PRESETS,
@@ -30,7 +29,6 @@ from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER)
require_normalization_gate()
# ---------------------------------------------------------------------------

View File

@@ -18,7 +18,6 @@ from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.core.missing import (
DEFAULT_SENTINELS,
@@ -31,7 +30,6 @@ from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER)
require_normalization_gate()
# ---------------------------------------------------------------------------

View File

@@ -18,7 +18,6 @@ from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.core.column_mapper import (
MapOptions,
@@ -32,7 +31,6 @@ from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)
require_normalization_gate()
# ---------------------------------------------------------------------------

View File

@@ -14,13 +14,11 @@ if str(_project_root) not in sys.path:
from src.gui.components import (
hide_streamlit_chrome,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR)
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header

View File

@@ -14,13 +14,11 @@ if str(_project_root) not in sys.path:
from src.gui.components import (
hide_streamlit_chrome,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER)
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header

View File

@@ -14,13 +14,11 @@ if str(_project_root) not in sys.path:
from src.gui.components import (
hide_streamlit_chrome,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER)
require_normalization_gate()
# ---------------------------------------------------------------------------
# Header

View File

@@ -18,7 +18,6 @@ from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
require_feature_or_render_upgrade,
require_normalization_gate,
)
from src.core.pipeline import (
Pipeline,
@@ -33,7 +32,6 @@ from src.license import FeatureFlag
hide_streamlit_chrome()
require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER)
require_normalization_gate()
# ---------------------------------------------------------------------------

View File

@@ -22,10 +22,9 @@ from typing import Literal
Tier = Literal["core", "pro", "enterprise"]
Status = Literal["Ready", "Coming Soon"]
# Sidebar grouping. The Review gate is its own section; cleaners,
# transformations, and automations group the tools by what the user is
# trying to accomplish rather than by implementation detail.
Section = Literal["review", "cleaners", "transformations", "automations"]
# Sidebar grouping. Tools are bucketed by what the user is trying to
# accomplish rather than by implementation detail.
Section = Literal["cleaners", "transformations", "automations"]
@dataclass(frozen=True)
@@ -152,7 +151,6 @@ TOOLS: list[Tool] = [
# Display labels for each sidebar section. Kept here so i18n falls back
# to a sensible English string if a translation pack is missing the key.
SECTION_LABELS: dict[Section, str] = {
"review": "Data Review",
"cleaners": "Data Cleaners",
"transformations": "Transformations",
"automations": "Automations",

View File

@@ -109,49 +109,19 @@ def app_factory():
# ---------------------------------------------------------------------------
def stash_upload(app: AppTest, *, name: str, data: bytes) -> str:
"""Pre-populate the home-screen upload stash + the gate's normalisation
result so a tool page renders past ``require_normalization_gate()``.
"""Pre-populate the home-screen upload stash so a tool page renders
as if the user had uploaded *name* / *data* on the home screen.
Returns the SHA-256 hex of *data* (used as the gate key) in case the
test wants to assert against it.
The gate checks::
- ``home_uploaded_bytes`` is set
- ``normalization_for == sha256(home_uploaded_bytes)``
- ``normalization_result.passed is True``
We synthesise a passing result via a tiny stub object that satisfies
the gate's only attribute access (``.passed``). Tests that want to
exercise gate-blocking behaviour should NOT call this helper — they
should stash bytes without the normalisation result.
Returns the SHA-256 hex of *data* in case the test wants to assert
against it.
"""
sha = hashlib.sha256(data).hexdigest()
app.session_state["home_uploaded_bytes"] = data
app.session_state["home_uploaded_name"] = name
app.session_state["home_uploaded_size"] = len(data)
app.session_state["normalization_for"] = sha
app.session_state["normalization_result"] = _PassedGateResult()
return sha
class _PassedGateResult:
    """Stub result object exposing nothing but ``passed``.

    Used in place of the real NormalizationResult: the gate is described
    as reading only ``.passed``, and constructing the real type would
    drag core.normalize (and its constructor surface) into the GUI tests.
    """

    # Class-level default, so every instance reports a passing result.
    passed: bool = True
def stash_upload_without_gate(app: AppTest, *, name: str, data: bytes) -> None:
    """Stash the raw upload in session state without pre-passing the gate.

    Writes ``home_uploaded_bytes``, ``home_uploaded_name`` and
    ``home_uploaded_size`` only; no normalization keys are set. Meant for
    gate tests that expect the warning + Go-to-Review button to appear.
    """
    upload_state = {
        "home_uploaded_bytes": data,
        "home_uploaded_name": name,
        "home_uploaded_size": len(data),
    }
    # Dict order is insertion order, so the keys are written in the same
    # sequence as three explicit assignments would be.
    for key, value in upload_state.items():
        app.session_state[key] = value
# ---------------------------------------------------------------------------
# i18n helpers
# ---------------------------------------------------------------------------

View File

@@ -1,157 +0,0 @@
"""Gate tests — ``require_normalization_gate()`` behaviour.
The gate sits between every tool page and the user's data. Three states
exist, each pinned here:
1. **No upload** — gate is a no-op; the page proceeds and its own
uploader handles the file.
2. **Upload but no normalization result** — gate shows a warning and a
"Go to Review & Normalize" button, then ``st.stop()`` short-circuits
the rest of the page.
3. **Upload + matching passed normalization** — gate is a no-op; the
page proceeds.
We exercise the gate via the Find Duplicates page (any tool page would
work; dedup is the smallest one that doesn't depend on heavy widgets).
"""
from __future__ import annotations
import pytest
from .conftest import (
collected_text,
stash_upload,
stash_upload_without_gate,
with_language,
)
# Find Duplicates is our canary — it calls ``require_normalization_gate``
# on the second line of the module. If the gate blocks, the dedup-
# specific title shouldn't even render.
GATED_PAGE = "1_Deduplicator"
class TestGateNoUpload:
    """With nothing uploaded the gate must stay out of the way.

    This is the "user opened the dedup page first instead of coming from
    home" path: the page renders normally and shows its own uploader.
    """

    def test_no_upload_lets_page_render(self, app_factory):
        page = app_factory(GATED_PAGE)
        page.run()

        assert not page.exception
        # Seeing the dedup page title is the unambiguous signal that the
        # gate did not short-circuit the render.
        rendered = collected_text(page)
        assert "Find Duplicates" in rendered

    def test_no_upload_no_gate_warning(self, app_factory):
        page = app_factory(GATED_PAGE)
        page.run()

        # Without an upload, no rendered warning may carry the gate's
        # marker text. A missing body is treated as an empty string.
        bodies = [warning.body or "" for warning in page.warning]
        assert all("normalization gate" not in body for body in bodies)
class TestGateBlocksWithoutNormalization:
    """An upload with no passing normalization must trip the gate.

    Expected outcome: warning + Go-to-Review button + page short-circuit.
    """

    @staticmethod
    def _render_blocked(app_factory, data, *, language=None):
        """Run the gated page with an un-normalized ``messy.csv`` stashed.

        When *language* is given it is applied before the upload is
        stashed, then the page is run and returned for inspection.
        """
        page = app_factory(GATED_PAGE)
        if language is not None:
            with_language(page, language)
        stash_upload_without_gate(page, name="messy.csv", data=data)
        page.run()
        return page

    def test_gate_warning_renders(self, app_factory, small_csv_bytes):
        page = self._render_blocked(app_factory, small_csv_bytes)

        bodies = [warning.body for warning in page.warning if warning.body]
        combined = " ".join(bodies)
        assert "normalization gate" in combined, (
            f"expected gate warning; got warnings: {bodies}"
        )
        assert "messy.csv" in combined, (
            "gate warning should name the offending file"
        )

    def test_gate_renders_go_to_review_button(self, app_factory, small_csv_bytes):
        page = self._render_blocked(app_factory, small_csv_bytes)

        button_labels = [button.label for button in page.button]
        assert any("Review & Normalize" in label for label in button_labels), (
            f"missing 'Go to Review & Normalize' button; got: {button_labels}"
        )

    def test_gate_short_circuits_page(self, app_factory, small_csv_bytes):
        page = self._render_blocked(app_factory, small_csv_bytes)

        # The gate is expected to call ``st.stop()`` after its warning, so
        # page-body widgets below it must be absent. The Run-Dedup primary
        # action lives below the gate and serves as the probe.
        button_labels = [button.label for button in page.button]
        assert all("Run Deduplication" not in label for label in button_labels), (
            f"gate failed to short-circuit; saw button: {button_labels}"
        )

    def test_gate_warning_localizes_to_spanish(self, app_factory, small_csv_bytes):
        page = self._render_blocked(app_factory, small_csv_bytes, language="es")

        combined = " ".join(
            warning.body for warning in page.warning if warning.body
        )
        # Spanish pack: ``debe pasar la verificación de normalización CSV``.
        assert "normalización" in combined

    def test_gate_button_localizes_to_spanish(self, app_factory, small_csv_bytes):
        page = self._render_blocked(app_factory, small_csv_bytes, language="es")

        button_labels = [button.label for button in page.button]
        assert any("Revisar y Normalizar" in label for label in button_labels), (
            f"Spanish gate button missing; got: {button_labels}"
        )
class TestGateAllowsWithPassedNormalization:
    """Upload plus a passed normalization: the gate is a no-op and the
    page renders past it."""

    def test_passed_gate_lets_page_render(self, app_factory, small_csv_bytes):
        page = app_factory(GATED_PAGE)
        stash_upload(page, name="messy.csv", data=small_csv_bytes)
        page.run()

        assert not page.exception, f"page raised past gate: {page.exception}"
        # The pickup banner uses the upload name, so finding it shows both
        # that the gate let the page through and that the pickup helper
        # engaged.
        rendered = collected_text(page)
        assert "messy.csv" in rendered
class TestGateMismatchedHash:
    """A normalization result keyed to an older upload must not unlock a
    new one.

    When the bytes change but ``normalization_for`` still holds the old
    hash, the result is stale and the gate should fire again. Pins the
    security-relevant "stale fix doesn't carry over to a new file"
    invariant.
    """

    def test_stale_normalization_blocks_new_upload(self, app_factory, small_csv_bytes):
        # A passed-result object exists but is keyed to a different file.
        class _Passed:
            passed = True

        page = app_factory(GATED_PAGE)
        state = page.session_state
        # Bytes for upload A, paired with a hash that points at some B.
        state["home_uploaded_bytes"] = small_csv_bytes
        state["home_uploaded_name"] = "new.csv"
        state["home_uploaded_size"] = len(small_csv_bytes)
        state["normalization_for"] = "different-hash-from-an-old-upload"
        state["normalization_result"] = _Passed()
        page.run()

        combined = " ".join(
            warning.body for warning in page.warning if warning.body
        )
        assert "normalization gate" in combined, (
            "stale gate result should not unlock a new upload"
        )

View File

@@ -25,7 +25,6 @@ from .conftest import collected_text, with_language
# Every page that ships in the sidebar nav. Slugs match the filenames
# under ``src/gui/pages/`` so failures point at a real file.
PAGE_SLUGS = [
"0_Review",
"1_Deduplicator",
"2_Text_Cleaner",
"3_Format_Standardizer",
@@ -53,7 +52,6 @@ PAGE_SLUGS = [
# When a page gains real Spanish translation, flip its 'es' entry to
# the localized substring — the test surface stays the same.
EXPECTED_SUBSTRINGS: dict[str, dict[str, str]] = {
"0_Review": {"en": "Review", "es": "Review"},
"1_Deduplicator": {"en": "Find Duplicates", "es": "Find Duplicates"},
"2_Text_Cleaner": {"en": "Clean Text", "es": "Clean Text"},
"3_Format_Standardizer": {"en": "Standardize", "es": "Standardize"},

View File

@@ -151,50 +151,6 @@ class TestPipelineRunnerWorkflow:
assert "Automated Workflows" in text
# ---------------------------------------------------------------------------
# Review page — special: doesn't gate on upload, has its own analyzer flow
# ---------------------------------------------------------------------------
class TestReviewWorkflow:
    """Workflow checks for the Review page (the gate-fixer).

    With no upload the page offers its own file uploader so the flow can
    start there; with an upload it runs the analyzer and shows findings.
    """

    def test_no_upload_shows_inline_uploader(self, app_factory):
        page = app_factory("0_Review")
        page.run()

        rendered = collected_text(page)
        # The page should invite an upload rather than redirect home.
        assert any(hint in rendered for hint in ("Upload", "Choose a file")), (
            f"Review page should expose an inline uploader; got:\n{rendered[:400]}"
        )
        # The page is self-contained now, so no 'Back to home' button.
        button_labels = [button.label for button in page.button]
        assert all("Back to home" not in label for label in button_labels), (
            f"Back-to-home button should be removed; got buttons: {button_labels}"
        )

    def test_with_upload_shows_review_content(
        self, app_factory, small_csv_bytes,
    ):
        page = app_factory("0_Review")
        # Only the upload bytes are stashed; no pre-passed gate is needed.
        state = page.session_state
        state["home_uploaded_bytes"] = small_csv_bytes
        state["home_uploaded_name"] = "messy.csv"
        state["home_uploaded_size"] = len(small_csv_bytes)
        page.run()

        assert not page.exception
        rendered = collected_text(page)
        # Either the "already clean" message or the encoding section
        # confirms the analyzer ran end-to-end on the stashed bytes.
        analyzer_markers = ("No findings to review", "File encoding")
        assert any(marker in rendered for marker in analyzer_markers), (
            f"Review page didn't surface analyzer output; got:\n{rendered[:400]}"
        )
# ---------------------------------------------------------------------------
# Coming-Soon pages still render (just a stub) — pinned so we know if a