From 5128d35961be83a036e97f48bf31ddf8d5c4acdf Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Sat, 16 May 2026 21:41:14 +0000
Subject: [PATCH] fix(text-cleaner): hoist show_hidden + stress-test all tool
 pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported crash: clicking "Clean Text" with mojibake.csv (a junk corpus
file that the cleaner ran on but produced zero changes) blew up the
results render with

    NameError: name 'show_hidden' is not defined

at the cleaned-preview block. ``show_hidden`` was defined inside
``if result.cells_changed:`` and referenced unconditionally below.

Fix on the page itself: hoist the ``show_hidden = st.toggle(...)``
declaration out of the conditional so it's always in scope for the
downstream cleaned-preview render. One toggle now drives both the
Examples table (which only renders when there are changes) AND the
cleaned preview (which always renders).

Generalized regression net: ``tests/test_junk_corpus_tool_pages.py``.
For nine representative junk files (empty, only_nul, mojibake,
invalid_utf8, utf16_le_no_bom, mismatched_columns, all_nulls,
corrupt_xlsx, single_column) and every Ready/Coming-Soon tool page,
the test:

1. Stashes the junk bytes as the home upload via session_state.
2. Runs the page through AppTest, asserts ``app.exception`` is empty.
3. If the page exposes a deterministic primary-action button label,
   clicks it and asserts no exception on the post-click render.

Pages that catch a bad file at read time and short-circuit via
``st.error`` + ``st.stop`` are correctly skipped from the
primary-action half (the button isn't rendered). A genuine crash
shows up as ``app.exception`` carrying a Python traceback — exactly
what the user reported, exactly what we now catch.

162 tests collected, 102 passed, 60 skipped. 4 seconds.

Full suite: 2220 passed, 91 skipped, 35 s.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/pages/2_Text_Cleaner.py      |  24 +++--
 tests/test_junk_corpus_tool_pages.py | 126 +++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_junk_corpus_tool_pages.py

diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py
index 151739e..779db2c 100644
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -256,6 +256,21 @@ m2.metric("Cells changed", result.cells_changed)
 m3.metric("% changed", f"{pct:.1f}%")
 m4.metric("Columns processed", len(result.columns_processed))
 
+# Single toggle drives both the Examples table AND the Cleaned preview.
+# Defined OUTSIDE the ``if result.cells_changed`` block so the
+# downstream cleaned-preview render below always has the variable in
+# scope, even on no-op runs (junk files / minimal preset that produces
+# zero changes — previously triggered ``NameError: show_hidden``).
+show_hidden = st.toggle(
+    "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
+    value=True,
+    help=(
+        "Highlights characters the cleaner is removing or replacing. "
+        "Hover any badge to see the codepoint and label."
+    ),
+    key="textclean_show_hidden",
+)
+
 if result.cells_changed:
     counts = result.changes["column"].value_counts()
     st.markdown("**Changes by column**")
@@ -265,15 +280,6 @@ if result.cells_changed:
     )
 
     st.markdown("**Examples (first 25 changes)**")
-    show_hidden = st.toggle(
-        "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
-        value=True,
-        help=(
-            "Highlights characters the cleaner is removing or replacing. "
-            "Hover any badge to see the codepoint and label."
-        ),
-        key="textclean_show_hidden",
-    )
     examples = result.changes.head(25).copy()
     examples["row"] = examples["row"] + 1
     if show_hidden:
diff --git a/tests/test_junk_corpus_tool_pages.py b/tests/test_junk_corpus_tool_pages.py
new file mode 100644
index 0000000..ead802a
--- /dev/null
+++ b/tests/test_junk_corpus_tool_pages.py
@@ -0,0 +1,126 @@
+"""Stress-test every tool page against the junk corpus via AppTest.
+
+For each Ready tool page, walk a representative subset of pathological
+files, stash them as the session upload, render the page, click the
+primary action, and assert that nothing raises and that a clean error
+is surfaced when the file is genuinely unprocessable.
+
+The home-page analyzer is covered separately by
+``tests/test_junk_corpus.py``. This file pins the *tool page* contract:
+once a junk file makes it past the upload, the page's own read +
+operate + render loop must also stay stable.
+
+Subset rationale: running 35 files × 9 pages × full render is too slow
+for CI. The subset hits the high-value shapes that historically broke
+something (empty, NUL-only, mojibake, corrupt zip, mismatched columns)
+plus a clean one to make sure the page still works under happy-path
+inputs.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+from streamlit.testing.v1 import AppTest
+
+
+_PAGES = Path(__file__).resolve().parent.parent / "src" / "gui" / "pages"
+_CORPUS = (
+    Path(__file__).resolve().parent.parent
+    / "test-cases" / "junk-corpus" / "test_data"
+)
+
+
+# Representative shapes. Each was selected because it has historically
+# tripped (or could trip) a different layer: file IO, encoding, pandas
+# parse, downstream operation, or result rendering.
+_SUBSET = [
+    "empty.csv",                # zero bytes — IO short-circuit
+    "only_nul.csv",             # all-NUL — repair_bytes strips to nothing
+    "mojibake.csv",             # Latin-1-as-UTF-8 — original NameError repro
+    "invalid_utf8.csv",         # raw invalid UTF-8 sequences
+    "utf16_le_no_bom.csv",      # wide encoding without BOM
+    "mismatched_columns.csv",   # pandas ParserWarning territory
+    "all_nulls.csv",            # cleans/standardizes a no-op
+    "corrupt_xlsx.xlsx",        # bad zip — pandas/openpyxl error
+    "single_column.csv",        # only one column — many tools assume >=2
+]
+
+
+# Ready tool pages with a deterministic primary-action button label.
+# The Coming-Soon stubs (6_Outlier_Detector, 7_Multi_File_Merger,
+# 8_Validator_Reporter) don't process the file so they're covered by
+# the "renders without exception" half of the contract only.
+_TOOL_PAGES = [
+    ("1_Deduplicator.py", None),  # dedup's primary action depends on UI state
+    ("2_Text_Cleaner.py", "Clean Text"),
+    ("3_Format_Standardizer.py", None),  # button label varies with detected types
+    ("4_Missing_Values.py", "Handle Missing Values"),
+    ("5_Column_Mapper.py", "Apply Column Mapping"),
+    ("6_Outlier_Detector.py", None),  # stub
+    ("7_Multi_File_Merger.py", None),  # stub
+    ("8_Validator_Reporter.py", None),  # stub
+    ("9_Pipeline_Runner.py", None),  # button enabled only when pipeline valid
+]
+
+
+def _stash_junk_upload(at: AppTest, path: Path) -> None:
+    """Pre-populate the session upload stash with junk bytes."""
+    data = path.read_bytes()
+    at.session_state["home_uploaded_bytes"] = data
+    at.session_state["home_uploaded_name"] = path.name
+    at.session_state["home_uploaded_size"] = len(data)
+
+
+@pytest.mark.parametrize("junk_file", _SUBSET)
+@pytest.mark.parametrize("page,primary_label", _TOOL_PAGES, ids=lambda x: x[0] if isinstance(x, tuple) else str(x))
+class TestToolPagesSurvive:
+    """Every Ready/Coming-Soon tool page must render against every
+    junk file without raising. Errors surface as ``st.error`` banners
+    inside the page body, not as Python tracebacks bubbled up to the
+    Streamlit chrome.
+    """
+
+    def test_initial_render_no_exception(self, page, primary_label, junk_file):
+        path = _CORPUS / junk_file
+        if not path.exists():
+            pytest.skip(f"junk file {junk_file} missing from corpus")
+
+        at = AppTest.from_file(str(_PAGES / page))
+        _stash_junk_upload(at, path)
+        at.run(timeout=20)
+        # ``at.exception`` is a list-like; empty means no traceback.
+        assert not at.exception, (
+            f"{page} crashed on first render with {junk_file}: "
+            f"{at.exception}"
+        )
+
+    def test_primary_action_no_exception(self, page, primary_label, junk_file):
+        if primary_label is None:
+            pytest.skip(
+                f"{page} has no deterministic primary-action label; "
+                f"covered by initial-render half of the contract"
+            )
+        path = _CORPUS / junk_file
+        if not path.exists():
+            pytest.skip(f"junk file {junk_file} missing from corpus")
+
+        at = AppTest.from_file(str(_PAGES / page))
+        _stash_junk_upload(at, path)
+        at.run(timeout=20)
+        # First render may already have failed cleanly via st.error +
+        # st.stop — in which case the primary action button isn't
+        # rendered. That's the correct behaviour; skip.
+        candidates = [b for b in at.button if b.label == primary_label]
+        if not candidates:
+            pytest.skip(
+                f"{page} did not render {primary_label!r} for "
+                f"{junk_file} (likely caught the bad file at read "
+                f"time and showed an error banner)"
+            )
+        candidates[0].click().run(timeout=20)
+        assert not at.exception, (
+            f"{page} crashed after clicking {primary_label!r} with "
+            f"{junk_file}: {at.exception}"
+        )