From 5128d35961be83a036e97f48bf31ddf8d5c4acdf Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 16 May 2026 21:41:14 +0000 Subject: [PATCH] fix(text-cleaner): hoist show_hidden + stress-test all tool pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported crash: clicking "Clean Text" with mojibake.csv (a junk corpus file that the cleaner ran on but produced zero changes) blew up the results render with NameError: name 'show_hidden' is not defined at the cleaned-preview block. ``show_hidden`` was defined inside ``if result.cells_changed:`` and referenced unconditionally below. Fix on the page itself: hoist the ``show_hidden = st.toggle(...)`` declaration out of the conditional so it's always in scope for the downstream cleaned-preview render. One toggle now drives both the Examples table (which only renders when there are changes) AND the cleaned preview (which always renders). Generalized regression net: ``tests/test_junk_corpus_tool_pages.py``. For nine representative junk files (empty, only_nul, mojibake, invalid_utf8, utf16_le_no_bom, mismatched_columns, all_nulls, corrupt_xlsx, single_column) and every Ready/Coming-Soon tool page, the test: 1. Stashes the junk bytes as the home upload via session_state. 2. Runs the page through AppTest, asserts ``app.exception`` is empty. 3. If the page exposes a deterministic primary-action button label, clicks it and asserts no exception on the post-click render. Pages that catch a bad file at read time and short-circuit via ``st.error`` + ``st.stop`` are correctly skipped from the primary-action half (the button isn't rendered). A genuine crash shows up as ``app.exception`` carrying a Python traceback — exactly what the user reported, exactly what we now catch. 162 tests collected, 102 passed, 60 skipped. 4 seconds. Full suite: 2220 passed, 91 skipped, 35 s. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/pages/2_Text_Cleaner.py | 24 +++-- tests/test_junk_corpus_tool_pages.py | 126 +++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 9 deletions(-) create mode 100644 tests/test_junk_corpus_tool_pages.py diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index 151739e..779db2c 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -256,6 +256,21 @@ m2.metric("Cells changed", result.cells_changed) m3.metric("% changed", f"{pct:.1f}%") m4.metric("Columns processed", len(result.columns_processed)) +# Single toggle drives both the Examples table AND the Cleaned preview. +# Defined OUTSIDE the ``if result.cells_changed`` block so the +# downstream cleaned-preview render below always has the variable in +# scope, even on no-op runs (junk files / minimal preset that produces +# zero changes — previously triggered ``NameError: show_hidden``). +show_hidden = st.toggle( + "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)", + value=True, + help=( + "Highlights characters the cleaner is removing or replacing. " + "Hover any badge to see the codepoint and label." + ), + key="textclean_show_hidden", +) + if result.cells_changed: counts = result.changes["column"].value_counts() st.markdown("**Changes by column**") @@ -265,15 +280,6 @@ if result.cells_changed: ) st.markdown("**Examples (first 25 changes)**") - show_hidden = st.toggle( - "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)", - value=True, - help=( - "Highlights characters the cleaner is removing or replacing. " - "Hover any badge to see the codepoint and label." - ), - key="textclean_show_hidden", - ) examples = result.changes.head(25).copy() examples["row"] = examples["row"] + 1 if show_hidden: diff --git a/tests/test_junk_corpus_tool_pages.py b/tests/test_junk_corpus_tool_pages.py new file mode 100644 index 0000000..ead802a --- /dev/null +++ b/tests/test_junk_corpus_tool_pages.py @@ -0,0 +1,126 @@ +"""Stress-test every tool page against the junk corpus via AppTest. + +For each Ready tool page, walk a representative subset of pathological +files, stash them as the session upload, render the page, click the +primary action, and assert that nothing raises and that a clean error +is surfaced when the file is genuinely unprocessable. + +The home-page analyzer is covered separately by +``tests/test_junk_corpus.py``. This file pins the *tool page* contract: +once a junk file makes it past the upload, the page's own read + +operate + render loop must also stay stable. + +Subset rationale: running 35 files × 9 pages × full render is too slow +for CI. The subset hits the high-value shapes that historically broke +something (empty, NUL-only, mojibake, corrupt zip, mismatched columns) +plus a clean one to make sure the page still works under happy-path +inputs. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from streamlit.testing.v1 import AppTest + + +_PAGES = Path(__file__).resolve().parent.parent / "src" / "gui" / "pages" +_CORPUS = ( + Path(__file__).resolve().parent.parent + / "test-cases" / "junk-corpus" / "test_data" +) + + +# Representative shapes. Each was selected because it has historically +# tripped (or could trip) a different layer: file IO, encoding, pandas +# parse, downstream operation, or result rendering. +_SUBSET = [ + "empty.csv", # zero bytes — IO short-circuit + "only_nul.csv", # all-NUL — repair_bytes strips to nothing + "mojibake.csv", # Latin-1-as-UTF-8 — original NameError repro + "invalid_utf8.csv", # raw invalid UTF-8 sequences + "utf16_le_no_bom.csv", # wide encoding without BOM + "mismatched_columns.csv", # pandas ParserWarning territory + "all_nulls.csv", # cleans/standardizes a no-op + "corrupt_xlsx.xlsx", # bad zip — pandas/openpyxl error + "single_column.csv", # only one column — many tools assume >=2 +] + + +# Ready tool pages with a deterministic primary-action button label. +# The Coming-Soon stubs (6_Outlier_Detector, 7_Multi_File_Merger, +# 8_Validator_Reporter) don't process the file so they're covered by +# the "renders without exception" half of the contract only. +_TOOL_PAGES = [ + ("1_Deduplicator.py", None), # dedup's primary action depends on UI state + ("2_Text_Cleaner.py", "Clean Text"), + ("3_Format_Standardizer.py", None), # button label varies with detected types + ("4_Missing_Values.py", "Handle Missing Values"), + ("5_Column_Mapper.py", "Apply Column Mapping"), + ("6_Outlier_Detector.py", None), # stub + ("7_Multi_File_Merger.py", None), # stub + ("8_Validator_Reporter.py", None), # stub + ("9_Pipeline_Runner.py", None), # button enabled only when pipeline valid +] + + +def _stash_junk_upload(at: AppTest, path: Path) -> None: + """Pre-populate the session upload stash with junk bytes.""" + data = path.read_bytes() + at.session_state["home_uploaded_bytes"] = data + at.session_state["home_uploaded_name"] = path.name + at.session_state["home_uploaded_size"] = len(data) + + +@pytest.mark.parametrize("junk_file", _SUBSET) +@pytest.mark.parametrize("page,primary_label", _TOOL_PAGES, ids=lambda x: x[0] if isinstance(x, tuple) else str(x)) +class TestToolPagesSurvive: + """Every Ready/Coming-Soon tool page must render against every + junk file without raising. Errors surface as ``st.error`` banners + inside the page body, not as Python tracebacks bubbled up to the + Streamlit chrome. + """ + + def test_initial_render_no_exception(self, page, primary_label, junk_file): + path = _CORPUS / junk_file + if not path.exists(): + pytest.skip(f"junk file {junk_file} missing from corpus") + + at = AppTest.from_file(str(_PAGES / page)) + _stash_junk_upload(at, path) + at.run(timeout=20) + # ``at.exception`` is a list-like; empty means no traceback. + assert not at.exception, ( + f"{page} crashed on first render with {junk_file}: " + f"{at.exception}" + ) + + def test_primary_action_no_exception(self, page, primary_label, junk_file): + if primary_label is None: + pytest.skip( + f"{page} has no deterministic primary-action label; " + f"covered by initial-render half of the contract" + ) + path = _CORPUS / junk_file + if not path.exists(): + pytest.skip(f"junk file {junk_file} missing from corpus") + + at = AppTest.from_file(str(_PAGES / page)) + _stash_junk_upload(at, path) + at.run(timeout=20) + # First render may already have failed cleanly via st.error + + # st.stop — in which case the primary action button isn't + # rendered. That's the correct behaviour; skip. + candidates = [b for b in at.button if b.label == primary_label] + if not candidates: + pytest.skip( + f"{page} did not render {primary_label!r} for " + f"{junk_file} (likely caught the bad file at read " + f"time and showed an error banner)" + ) + candidates[0].click().run(timeout=20) + assert not at.exception, ( + f"{page} crashed after clicking {primary_label!r} with " + f"{junk_file}: {at.exception}" + )