Build a corpus of 35 deliberately broken files (empty bytes, NUL
bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped
quotes, corrupt zip, etc.) and pin the analyzer's stability contract
against them.
Files land in ``test-cases/junk-corpus/test_data/``. The generator
``make_junk_corpus.py`` produces them deterministically, except for
one random sample that uses ``secrets.token_bytes``; that file's bytes
are captured at commit time, so the committed corpus is stable even
though the generator's output for that one file is not reproducible.
README documents the categories and how to add new shapes.
``tests/test_junk_corpus.py`` parametrizes over every file in the
corpus and asserts:
1. ``_run_analysis_on_upload`` never raises — exceptions must be
caught and surfaced as a synthetic ``Finding`` with
severity="error". This was the user-reported crash for
13_non_latin_scripts.csv that the previous fix in ae9d4a2
defensively wrapped; the corpus now stops the regression
from re-landing on a different shape.
2. Every Finding in the result list is well-formed (string id,
valid severity, non-empty description).
3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv,
corrupt_xlsx.xlsx) MUST surface at least one error-level
Finding — otherwise the GUI would render "no issues found"
for a structurally broken file.
4. Error-level Finding descriptions are at least 20 chars so the
UI banner gives the user something to act on.
Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py``
since that sweep is happy-path (round-trip the text cleaner) and
fights with files designed to break it. The contract is enforced
by the dedicated junk-corpus test, not the sweep.
Runtime: 12 s for the junk-corpus tests, 30 s for the full
project suite (was 19 s without these). 2118 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
157 lines
5.9 KiB
Python
157 lines
5.9 KiB
Python
"""Stress-test the upload analyzer against a corpus of pathological files.

Every file under ``test-cases/junk-corpus/test_data/`` is fed through
``_run_analysis_on_upload`` — the same path the GUI takes when a user
drops a file on the home page. The contract under test is:

* The call never raises. Errors must surface as a synthetic ``Finding``
  with severity ``"error"``, not a Python traceback that the page
  chrome bubbles up to the user.
* The return is always a list of :class:`Finding` (possibly empty for
  files the analyzer judges clean).
* Specific high-risk files (empty bytes, corrupt zip, etc.) MUST
  produce at least one error-level Finding so the UI shows a red
  banner rather than silently rendering "no issues found".

To add a new pathological shape:

1. Edit ``test-cases/junk-corpus/make_junk_corpus.py`` to write the new
   file under ``test_data/``.
2. Re-run that script to materialize the file on disk.
3. (Optional) Add the filename to ``_MUST_BE_ERROR`` below if the file
   represents a state where "no findings" would be a silent failure.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from src.core.analyze import Finding
|
|
from src.gui.components._legacy import _run_analysis_on_upload
|
|
|
|
|
|
# Location of the generated pathological files, resolved relative to this
# module: two directory levels up, then test-cases/junk-corpus/test_data.
_CORPUS = Path(__file__).resolve().parent.parent.joinpath(
    "test-cases", "junk-corpus", "test_data"
)
|
|
|
|
|
|
class _FakeUpload:
    """Minimal stand-in for Streamlit's ``UploadedFile``, backed by a file on disk.

    Only the two members used here are provided: the ``name`` attribute
    and the ``getvalue()`` method.
    """

    def __init__(self, path: Path) -> None:
        """Record the file's base name and read its full contents into memory."""
        self.name = path.name
        self._payload = path.read_bytes()

    def getvalue(self) -> bytes:
        """Return the bytes that were read from disk at construction time."""
        return self._payload
|
|
|
|
|
|
def _corpus_files() -> list[Path]:
    """Return every regular file directly inside ``_CORPUS``, sorted by path.

    Sub-directories are ignored; the listing is not recursive.

    Raises:
        RuntimeError: If ``_CORPUS`` is not an existing directory or holds
            no regular files. The message names the generator script to run.
    """
    # A missing (or non-directory) corpus path is treated the same as an
    # empty corpus so the caller gets the regeneration hint below instead of
    # a bare FileNotFoundError / NotADirectoryError from ``iterdir()``.
    if _CORPUS.is_dir():
        files = sorted(p for p in _CORPUS.iterdir() if p.is_file())
    else:
        files = []
    if not files:
        raise RuntimeError(
            "Junk corpus is empty. Run "
            "`python test-cases/junk-corpus/make_junk_corpus.py` "
            f"to generate {_CORPUS}."
        )
    return files
|
|
|
|
|
|
# Corpus file names for which an empty result would be a silent failure.
# These inputs are structurally broken, so the analyzer is required to
# report at least one error-level Finding for each of them; that Finding
# is what puts a red banner in front of the user instead of the
# misleading "no issues found" success path.
_MUST_BE_ERROR = {
    "corrupt_xlsx.xlsx",
    "empty.csv",
    "only_bom.csv",
    "only_nul.csv",
}
|
|
|
|
|
|
@pytest.mark.parametrize("path", _corpus_files(), ids=lambda p: p.name)
class TestJunkCorpus:
    """Per-file contract checks, run once for every file in the junk corpus.

    Each test wraps the file in a ``_FakeUpload`` and passes it to
    ``_run_analysis_on_upload``. The tests cover, in order: the call
    returns a list without raising, each returned item is a well-formed
    ``Finding``, the files named in ``_MUST_BE_ERROR`` yield at least one
    error-level Finding, and error-level Findings carry a usable
    description.
    """

    def test_no_exception_propagates(self, path: Path) -> None:
        """The analyzer call must return a list rather than raise."""
        # Anything thrown by analyze() / pandas / repair_bytes / openpyxl is
        # expected to have been converted into an error Finding inside
        # ``_run_analysis_on_upload``. An exception escaping here means the
        # home page would crash on this file in production.
        result = _run_analysis_on_upload(_FakeUpload(path))
        assert isinstance(result, list), (
            f"{path.name}: expected list[Finding], got {type(result).__name__}"
        )

    def test_findings_are_well_formed(self, path: Path) -> None:
        """Every returned item is a Finding with an id, a known severity and a description."""
        known_severities = ("info", "warn", "error")
        for finding in _run_analysis_on_upload(_FakeUpload(path)):
            assert isinstance(finding, Finding), (
                f"{path.name}: non-Finding in result list: {finding!r}"
            )
            assert isinstance(finding.id, str) and finding.id, (
                f"{path.name}: Finding has empty id"
            )
            assert finding.severity in known_severities, (
                f"{path.name}: Finding has bad severity {finding.severity!r}"
            )
            assert isinstance(finding.description, str) and finding.description, (
                f"{path.name}: Finding has empty description"
            )

    def test_must_be_error_files_actually_flag(self, path: Path) -> None:
        """Files listed in ``_MUST_BE_ERROR`` must produce an error-level Finding."""
        # Only the must-be-error subset is checked; every other corpus file
        # is skipped because a clean result is acceptable for it.
        if path.name not in _MUST_BE_ERROR:
            pytest.skip(f"{path.name} is allowed to pass clean")
        findings = _run_analysis_on_upload(_FakeUpload(path))
        error_findings = list(filter(lambda item: item.severity == "error", findings))
        assert error_findings, (
            f"{path.name} should surface at least one error-level "
            f"Finding so the UI shows a red banner; got {len(findings)} "
            "findings (none of severity 'error')."
        )

    def test_error_findings_have_a_description(self, path: Path) -> None:
        """Error-level Findings need a description of at least 20 characters.

        When a file is empty or corrupt, the description is the only thing
        the user gets to see, so it has to carry enough context (the file
        name, or what went wrong) for them to fix the underlying problem.
        """
        for finding in _run_analysis_on_upload(_FakeUpload(path)):
            if finding.severity == "error":
                # Synthetic error Findings always interpolate the file name;
                # analyzer-generated ones include the column or a description
                # of what was wrong. Both should clear this length floor.
                assert len(finding.description) >= 20, (
                    f"{path.name}: error Finding description is too short "
                    f"to be useful: {finding.description!r}"
                )
|
|
|
|
|
|
def test_corpus_contains_expected_shapes() -> None:
    """Check that every name in ``_MUST_BE_ERROR`` exists in the corpus.

    The per-file must-be-error test skips any file whose name is not in
    ``_MUST_BE_ERROR``. If a file is renamed in ``make_junk_corpus.py``
    and the set is not updated, that assertion would be skipped silently;
    this test fails instead and lists the stale names.
    """
    present = {entry.name for entry in _corpus_files()}
    absent = sorted(name for name in _MUST_BE_ERROR if name not in present)
    assert not absent, (
        "_MUST_BE_ERROR references files that don't exist in the "
        f"corpus: {absent}. Regenerate the corpus or update "
        "_MUST_BE_ERROR."
    )
|