fix(text-cleaner): hoist show_hidden + stress-test all tool pages
Reported crash: clicking "Clean Text" with mojibake.csv (a junk corpus
file that the cleaner ran on but produced zero changes) blew up the
results render with
NameError: name 'show_hidden' is not defined
at the cleaned-preview block. ``show_hidden`` was defined inside
``if result.cells_changed:`` and referenced unconditionally below.
Fix on the page itself: hoist the ``show_hidden = st.toggle(...)``
declaration out of the conditional so it's always in scope for the
downstream cleaned-preview render. One toggle now drives both the
Examples table (which only renders when there are changes) AND the
cleaned preview (which always renders).
Generalized regression net: ``tests/test_junk_corpus_tool_pages.py``.
For nine representative junk files (empty, only_nul, mojibake,
invalid_utf8, utf16_le_no_bom, mismatched_columns, all_nulls,
corrupt_xlsx, single_column) and every Ready/Coming-Soon tool page,
the test:
1. Stashes the junk bytes as the home upload via session_state.
2. Runs the page through AppTest, asserts ``app.exception`` is empty.
3. If the page exposes a deterministic primary-action button label,
clicks it and asserts no exception on the post-click render.
Pages that catch a bad file at read time and short-circuit via
``st.error`` + ``st.stop`` are correctly skipped from the
primary-action half (the button isn't rendered). A genuine crash
shows up as ``app.exception`` carrying a Python traceback — exactly
what the user reported, exactly what we now catch.
162 tests collected, 102 passed, 60 skipped. 4 seconds.
Full suite: 2220 passed, 91 skipped, 35 s.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -256,6 +256,21 @@ m2.metric("Cells changed", result.cells_changed)
|
|||||||
m3.metric("% changed", f"{pct:.1f}%")
|
m3.metric("% changed", f"{pct:.1f}%")
|
||||||
m4.metric("Columns processed", len(result.columns_processed))
|
m4.metric("Columns processed", len(result.columns_processed))
|
||||||
|
|
||||||
|
# Single toggle drives both the Examples table AND the Cleaned preview.
|
||||||
|
# Defined OUTSIDE the ``if result.cells_changed`` block so the
|
||||||
|
# downstream cleaned-preview render below always has the variable in
|
||||||
|
# scope, even on no-op runs (junk files / minimal preset that produces
|
||||||
|
# zero changes — previously triggered ``NameError: show_hidden``).
|
||||||
|
show_hidden = st.toggle(
|
||||||
|
"Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
|
||||||
|
value=True,
|
||||||
|
help=(
|
||||||
|
"Highlights characters the cleaner is removing or replacing. "
|
||||||
|
"Hover any badge to see the codepoint and label."
|
||||||
|
),
|
||||||
|
key="textclean_show_hidden",
|
||||||
|
)
|
||||||
|
|
||||||
if result.cells_changed:
|
if result.cells_changed:
|
||||||
counts = result.changes["column"].value_counts()
|
counts = result.changes["column"].value_counts()
|
||||||
st.markdown("**Changes by column**")
|
st.markdown("**Changes by column**")
|
||||||
@@ -265,15 +280,6 @@ if result.cells_changed:
|
|||||||
)
|
)
|
||||||
|
|
||||||
st.markdown("**Examples (first 25 changes)**")
|
st.markdown("**Examples (first 25 changes)**")
|
||||||
show_hidden = st.toggle(
|
|
||||||
"Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
|
|
||||||
value=True,
|
|
||||||
help=(
|
|
||||||
"Highlights characters the cleaner is removing or replacing. "
|
|
||||||
"Hover any badge to see the codepoint and label."
|
|
||||||
),
|
|
||||||
key="textclean_show_hidden",
|
|
||||||
)
|
|
||||||
examples = result.changes.head(25).copy()
|
examples = result.changes.head(25).copy()
|
||||||
examples["row"] = examples["row"] + 1
|
examples["row"] = examples["row"] + 1
|
||||||
if show_hidden:
|
if show_hidden:
|
||||||
|
|||||||
126
tests/test_junk_corpus_tool_pages.py
Normal file
126
tests/test_junk_corpus_tool_pages.py
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
"""Stress-test every tool page against the junk corpus via AppTest.
|
||||||
|
|
||||||
|
For each Ready tool page, walk a representative subset of pathological
|
||||||
|
files, stash them as the session upload, render the page, click the
|
||||||
|
primary action, and assert that nothing raises and that a clean error
|
||||||
|
is surfaced when the file is genuinely unprocessable.
|
||||||
|
|
||||||
|
The home-page analyzer is covered separately by
|
||||||
|
``tests/test_junk_corpus.py``. This file pins the *tool page* contract:
|
||||||
|
once a junk file makes it past the upload, the page's own read +
|
||||||
|
operate + render loop must also stay stable.
|
||||||
|
|
||||||
|
Subset rationale: running 35 files × 9 pages × full render is too slow
|
||||||
|
for CI. The subset hits the high-value shapes that historically broke
|
||||||
|
something (empty, NUL-only, mojibake, corrupt zip, mismatched columns)
|
||||||
|
plus a clean one to make sure the page still works under happy-path
|
||||||
|
inputs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from streamlit.testing.v1 import AppTest
|
||||||
|
|
||||||
|
|
||||||
|
_PAGES = Path(__file__).resolve().parent.parent / "src" / "gui" / "pages"
|
||||||
|
_CORPUS = (
|
||||||
|
Path(__file__).resolve().parent.parent
|
||||||
|
/ "test-cases" / "junk-corpus" / "test_data"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Representative shapes. Each was selected because it has historically
|
||||||
|
# tripped (or could trip) a different layer: file IO, encoding, pandas
|
||||||
|
# parse, downstream operation, or result rendering.
|
||||||
|
_SUBSET = [
|
||||||
|
"empty.csv", # zero bytes — IO short-circuit
|
||||||
|
"only_nul.csv", # all-NUL — repair_bytes strips to nothing
|
||||||
|
"mojibake.csv", # Latin-1-as-UTF-8 — original NameError repro
|
||||||
|
"invalid_utf8.csv", # raw invalid UTF-8 sequences
|
||||||
|
"utf16_le_no_bom.csv", # wide encoding without BOM
|
||||||
|
"mismatched_columns.csv", # pandas ParserWarning territory
|
||||||
|
"all_nulls.csv", # cleans/standardizes a no-op
|
||||||
|
"corrupt_xlsx.xlsx", # bad zip — pandas/openpyxl error
|
||||||
|
"single_column.csv", # only one column — many tools assume >=2
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Ready tool pages with a deterministic primary-action button label.
|
||||||
|
# The Coming-Soon stubs (6_Outlier_Detector, 7_Multi_File_Merger,
|
||||||
|
# 8_Validator_Reporter) don't process the file so they're covered by
|
||||||
|
# the "renders without exception" half of the contract only.
|
||||||
|
_TOOL_PAGES = [
|
||||||
|
("1_Deduplicator.py", None), # dedup's primary action depends on UI state
|
||||||
|
("2_Text_Cleaner.py", "Clean Text"),
|
||||||
|
("3_Format_Standardizer.py", None), # button label varies with detected types
|
||||||
|
("4_Missing_Values.py", "Handle Missing Values"),
|
||||||
|
("5_Column_Mapper.py", "Apply Column Mapping"),
|
||||||
|
("6_Outlier_Detector.py", None), # stub
|
||||||
|
("7_Multi_File_Merger.py", None), # stub
|
||||||
|
("8_Validator_Reporter.py", None), # stub
|
||||||
|
("9_Pipeline_Runner.py", None), # button enabled only when pipeline valid
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _stash_junk_upload(at: AppTest, path: Path) -> None:
|
||||||
|
"""Pre-populate the session upload stash with junk bytes."""
|
||||||
|
data = path.read_bytes()
|
||||||
|
at.session_state["home_uploaded_bytes"] = data
|
||||||
|
at.session_state["home_uploaded_name"] = path.name
|
||||||
|
at.session_state["home_uploaded_size"] = len(data)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("junk_file", _SUBSET)
|
||||||
|
@pytest.mark.parametrize("page,primary_label", _TOOL_PAGES, ids=lambda x: x[0] if isinstance(x, tuple) else str(x))
|
||||||
|
class TestToolPagesSurvive:
|
||||||
|
"""Every Ready/Coming-Soon tool page must render against every
|
||||||
|
junk file without raising. Errors surface as ``st.error`` banners
|
||||||
|
inside the page body, not as Python tracebacks bubbled up to the
|
||||||
|
Streamlit chrome.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_initial_render_no_exception(self, page, primary_label, junk_file):
|
||||||
|
path = _CORPUS / junk_file
|
||||||
|
if not path.exists():
|
||||||
|
pytest.skip(f"junk file {junk_file} missing from corpus")
|
||||||
|
|
||||||
|
at = AppTest.from_file(str(_PAGES / page))
|
||||||
|
_stash_junk_upload(at, path)
|
||||||
|
at.run(timeout=20)
|
||||||
|
# ``at.exception`` is a list-like; empty means no traceback.
|
||||||
|
assert not at.exception, (
|
||||||
|
f"{page} crashed on first render with {junk_file}: "
|
||||||
|
f"{at.exception}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_primary_action_no_exception(self, page, primary_label, junk_file):
|
||||||
|
if primary_label is None:
|
||||||
|
pytest.skip(
|
||||||
|
f"{page} has no deterministic primary-action label; "
|
||||||
|
f"covered by initial-render half of the contract"
|
||||||
|
)
|
||||||
|
path = _CORPUS / junk_file
|
||||||
|
if not path.exists():
|
||||||
|
pytest.skip(f"junk file {junk_file} missing from corpus")
|
||||||
|
|
||||||
|
at = AppTest.from_file(str(_PAGES / page))
|
||||||
|
_stash_junk_upload(at, path)
|
||||||
|
at.run(timeout=20)
|
||||||
|
# First render may already have failed cleanly via st.error +
|
||||||
|
# st.stop — in which case the primary action button isn't
|
||||||
|
# rendered. That's the correct behaviour; skip.
|
||||||
|
candidates = [b for b in at.button if b.label == primary_label]
|
||||||
|
if not candidates:
|
||||||
|
pytest.skip(
|
||||||
|
f"{page} did not render {primary_label!r} for "
|
||||||
|
f"{junk_file} (likely caught the bad file at read "
|
||||||
|
f"time and showed an error banner)"
|
||||||
|
)
|
||||||
|
candidates[0].click().run(timeout=20)
|
||||||
|
assert not at.exception, (
|
||||||
|
f"{page} crashed after clicking {primary_label!r} with "
|
||||||
|
f"{junk_file}: {at.exception}"
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user