Files
datatools-dev/tests/test_junk_corpus_tool_pages.py
Michael 5128d35961 fix(text-cleaner): hoist show_hidden + stress-test all tool pages
Reported crash: clicking "Clean Text" with mojibake.csv (a junk corpus
file that the cleaner ran on but produced zero changes) blew up the
results render with

    NameError: name 'show_hidden' is not defined

at the cleaned-preview block. ``show_hidden`` was defined inside
``if result.cells_changed:`` and referenced unconditionally below.

Fix on the page itself: hoist the ``show_hidden = st.toggle(...)``
declaration out of the conditional so it's always in scope for the
downstream cleaned-preview render. One toggle now drives both the
Examples table (which only renders when there are changes) AND the
cleaned preview (which always renders).

Generalized regression net: ``tests/test_junk_corpus_tool_pages.py``.
For nine representative junk files (empty, only_nul, mojibake,
invalid_utf8, utf16_le_no_bom, mismatched_columns, all_nulls,
corrupt_xlsx, single_column) and every Ready/Coming-Soon tool page,
the test:

1. Stashes the junk bytes as the home upload via session_state.
2. Runs the page through AppTest, asserts ``app.exception`` is empty.
3. If the page exposes a deterministic primary-action button label,
   clicks it and asserts no exception on the post-click render.

Pages that catch a bad file at read time and short-circuit via
``st.error`` + ``st.stop`` are correctly skipped from the
primary-action half (the button isn't rendered). A genuine crash
shows up as ``app.exception`` carrying a Python traceback — exactly
what the user reported, exactly what we now catch.

162 tests collected, 102 passed, 60 skipped. 4 seconds.

Full suite: 2220 passed, 91 skipped, 35 s.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 21:41:14 +00:00

127 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Stress-test every tool page against the junk corpus via AppTest.
For each Ready tool page, walk a representative subset of pathological
files, stash them as the session upload, render the page, click the
primary action, and assert that nothing raises and that a clean error
is surfaced when the file is genuinely unprocessable.
The home-page analyzer is covered separately by
``tests/test_junk_corpus.py``. This file pins the *tool page* contract:
once a junk file makes it past the upload, the page's own read +
operate + render loop must also stay stable.
Subset rationale: running 35 files × 9 pages × full render is too slow
for CI. The subset hits the high-value shapes that historically broke
something (empty, NUL-only, mojibake, corrupt zip, mismatched columns)
plus a clean one to make sure the page still works under happy-path
inputs.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from streamlit.testing.v1 import AppTest
_PAGES = Path(__file__).resolve().parent.parent / "src" / "gui" / "pages"
_CORPUS = (
Path(__file__).resolve().parent.parent
/ "test-cases" / "junk-corpus" / "test_data"
)
# Representative shapes. Each was selected because it has historically
# tripped (or could trip) a different layer: file IO, encoding, pandas
# parse, downstream operation, or result rendering.
_SUBSET = [
"empty.csv", # zero bytes — IO short-circuit
"only_nul.csv", # all-NUL — repair_bytes strips to nothing
"mojibake.csv", # Latin-1-as-UTF-8 — original NameError repro
"invalid_utf8.csv", # raw invalid UTF-8 sequences
"utf16_le_no_bom.csv", # wide encoding without BOM
"mismatched_columns.csv", # pandas ParserWarning territory
"all_nulls.csv", # cleans/standardizes a no-op
"corrupt_xlsx.xlsx", # bad zip — pandas/openpyxl error
"single_column.csv", # only one column — many tools assume >=2
]
# Ready tool pages with a deterministic primary-action button label.
# The Coming-Soon stubs (6_Outlier_Detector, 7_Multi_File_Merger,
# 8_Validator_Reporter) don't process the file so they're covered by
# the "renders without exception" half of the contract only.
_TOOL_PAGES = [
("1_Deduplicator.py", None), # dedup's primary action depends on UI state
("2_Text_Cleaner.py", "Clean Text"),
("3_Format_Standardizer.py", None), # button label varies with detected types
("4_Missing_Values.py", "Handle Missing Values"),
("5_Column_Mapper.py", "Apply Column Mapping"),
("6_Outlier_Detector.py", None), # stub
("7_Multi_File_Merger.py", None), # stub
("8_Validator_Reporter.py", None), # stub
("9_Pipeline_Runner.py", None), # button enabled only when pipeline valid
]
def _stash_junk_upload(at: AppTest, path: Path) -> None:
"""Pre-populate the session upload stash with junk bytes."""
data = path.read_bytes()
at.session_state["home_uploaded_bytes"] = data
at.session_state["home_uploaded_name"] = path.name
at.session_state["home_uploaded_size"] = len(data)
@pytest.mark.parametrize("junk_file", _SUBSET)
@pytest.mark.parametrize("page,primary_label", _TOOL_PAGES, ids=lambda x: x[0] if isinstance(x, tuple) else str(x))
class TestToolPagesSurvive:
"""Every Ready/Coming-Soon tool page must render against every
junk file without raising. Errors surface as ``st.error`` banners
inside the page body, not as Python tracebacks bubbled up to the
Streamlit chrome.
"""
def test_initial_render_no_exception(self, page, primary_label, junk_file):
path = _CORPUS / junk_file
if not path.exists():
pytest.skip(f"junk file {junk_file} missing from corpus")
at = AppTest.from_file(str(_PAGES / page))
_stash_junk_upload(at, path)
at.run(timeout=20)
# ``at.exception`` is a list-like; empty means no traceback.
assert not at.exception, (
f"{page} crashed on first render with {junk_file}: "
f"{at.exception}"
)
def test_primary_action_no_exception(self, page, primary_label, junk_file):
if primary_label is None:
pytest.skip(
f"{page} has no deterministic primary-action label; "
f"covered by initial-render half of the contract"
)
path = _CORPUS / junk_file
if not path.exists():
pytest.skip(f"junk file {junk_file} missing from corpus")
at = AppTest.from_file(str(_PAGES / page))
_stash_junk_upload(at, path)
at.run(timeout=20)
# First render may already have failed cleanly via st.error +
# st.stop — in which case the primary action button isn't
# rendered. That's the correct behaviour; skip.
candidates = [b for b in at.button if b.label == primary_label]
if not candidates:
pytest.skip(
f"{page} did not render {primary_label!r} for "
f"{junk_file} (likely caught the bad file at read "
f"time and showed an error banner)"
)
candidates[0].click().run(timeout=20)
assert not at.exception, (
f"{page} crashed after clicking {primary_label!r} with "
f"{junk_file}: {at.exception}"
)