Files
datatools-dev/tests/test_junk_corpus.py
Michael 696996c119 test(junk-corpus): pathological-input stress suite for the analyzer
Build a corpus of 35 deliberately-broken files (empty bytes, NUL
bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped
quotes, corrupt zip, etc.) and pin the analyzer's stability contract
against them.

Files land in ``test-cases/junk-corpus/test_data/``. The generator
``make_junk_corpus.py`` produces them deterministically (one random
sample uses ``secrets.token_bytes`` — committed bytes are stable
across regenerations because the byte stream is captured at commit
time). README documents the categories and how to add new shapes.

``tests/test_junk_corpus.py`` parametrizes over every file in the
corpus and asserts:

1. ``_run_analysis_on_upload`` never raises — exceptions must be
   caught and surfaced as a synthetic ``Finding`` with
   severity="error". This was the user-reported crash for
   13_non_latin_scripts.csv that the previous fix in ae9d4a2
   defensively wrapped; the corpus now stops the regression
   from re-landing on a different shape.
2. Every Finding in the result list is well-formed (string id,
   valid severity, non-empty description).
3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv,
   corrupt_xlsx.xlsx) MUST surface at least one error-level
   Finding — otherwise the GUI would render "no issues found"
   for a structurally broken file.
4. Error-level Finding descriptions are at least 20 chars so the
   UI banner gives the user something to act on.

Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py``
since that sweep is happy-path (round-trip the text cleaner) and
fights with files designed to break it. The contract is enforced
by the dedicated junk-corpus test, not the sweep.

Runtime: 12 s for the junk-corpus tests, 30 s for the full
project suite (was 19 s without these). 2118 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 21:35:22 +00:00

157 lines
5.9 KiB
Python

"""Stress-test the upload analyzer against a corpus of pathological files.

Every file under ``test-cases/junk-corpus/test_data/`` is fed through
``_run_analysis_on_upload`` — the same path the GUI takes when a user
drops a file on the home page. The contract under test is:

* The call never raises. Errors must surface as a synthetic ``Finding``
  with severity ``"error"``, not a Python traceback that the page
  chrome bubbles up to the user.
* The return is always a list of :class:`Finding` (possibly empty for
  files the analyzer judges clean).
* Specific high-risk files (empty bytes, corrupt zip, etc.) MUST
  produce at least one error-level Finding so the UI shows a red
  banner rather than silently rendering "no issues found".

To add a new pathological shape:

1. Edit ``test-cases/junk-corpus/make_junk_corpus.py`` to write the new
   file under ``test_data/``.
2. Re-run that script to materialize the file on disk.
3. (Optional) Add the filename to ``_MUST_BE_ERROR`` below if the file
   represents a state where "no findings" would be a silent failure.
"""
from __future__ import annotations
from pathlib import Path
import pytest
from src.core.analyze import Finding
from src.gui.components._legacy import _run_analysis_on_upload
# Directory holding the pathological input files: two levels above this
# module, then ``test-cases/junk-corpus/test_data``.
_CORPUS = Path(__file__).resolve().parent.parent.joinpath(
    "test-cases", "junk-corpus", "test_data"
)
class _FakeUpload:
    """Path-backed stand-in for the Streamlit ``UploadedFile`` interface.

    Exposes just a ``name`` attribute and a ``getvalue()`` method.
    """

    def __init__(self, path: Path) -> None:
        # Only the final path component is kept as the upload's name.
        self.name = path.name
        # The file is read once, up front; ``getvalue()`` never touches disk.
        self._content = path.read_bytes()

    def getvalue(self) -> bytes:
        """Return the bytes that were read from the file at construction."""
        return self._content
def _corpus_files() -> list[Path]:
    """Return every regular file directly under ``_CORPUS``, sorted by path.

    Subdirectories are ignored and the listing is not recursive.

    Raises:
        RuntimeError: if the corpus directory does not exist or holds no
            files. The message names the generator script to run.
    """
    # A checkout where the generator has never been run has no corpus
    # directory at all. Treat that like an empty corpus so the caller gets
    # the actionable message below rather than a bare FileNotFoundError
    # from ``iterdir()``.
    if _CORPUS.is_dir():
        files = sorted(p for p in _CORPUS.iterdir() if p.is_file())
    else:
        files = []
    if not files:
        raise RuntimeError(
            "Junk corpus is empty. Run "
            "`python test-cases/junk-corpus/make_junk_corpus.py` "
            f"to generate {_CORPUS}."
        )
    return files
# Corpus files that are structurally broken. For these, a result with no
# error-severity Finding counts as a silent failure: the user would get
# the misleading "no issues found" success path instead of a red banner,
# so the analyzer MUST flag them.
_MUST_BE_ERROR: set[str] = {
    "corrupt_xlsx.xlsx",
    "empty.csv",
    "only_bom.csv",
    "only_nul.csv",
}
@pytest.mark.parametrize(
    "path",
    _corpus_files(),
    ids=lambda corpus_path: corpus_path.name,
)
class TestJunkCorpus:
    """Per-file stability checks, run once for every file in the corpus.

    Each test wraps the file in a ``_FakeUpload`` and passes it to
    ``_run_analysis_on_upload``. The quality of error messages is checked
    in :meth:`test_error_findings_have_a_description`.
    """

    def test_no_exception_propagates(self, path: Path) -> None:
        """The analyzer call must return a list rather than raise."""
        # Anything raised by analyze() / pandas / repair_bytes / openpyxl
        # SHOULD already have been converted into an error Finding by
        # ``_run_analysis_on_upload``. An exception escaping here means
        # the home page would crash on this file in production.
        findings = _run_analysis_on_upload(_FakeUpload(path))
        assert isinstance(findings, list), (
            f"{path.name}: expected list[Finding], got {type(findings).__name__}"
        )

    def test_findings_are_well_formed(self, path: Path) -> None:
        """Every returned item is a Finding with usable id, severity, text."""
        known_severities = ("info", "warn", "error")
        for finding in _run_analysis_on_upload(_FakeUpload(path)):
            assert isinstance(finding, Finding), (
                f"{path.name}: non-Finding in result list: {finding!r}"
            )
            assert isinstance(finding.id, str) and finding.id, (
                f"{path.name}: Finding has empty id"
            )
            assert finding.severity in known_severities, (
                f"{path.name}: Finding has bad severity {finding.severity!r}"
            )
            assert isinstance(finding.description, str) and finding.description, (
                f"{path.name}: Finding has empty description"
            )

    def test_must_be_error_files_actually_flag(self, path: Path) -> None:
        """Files named in ``_MUST_BE_ERROR`` yield at least one error Finding."""
        if path.name not in _MUST_BE_ERROR:
            # Every other corpus file may legitimately come back clean.
            pytest.skip(f"{path.name} is allowed to pass clean")
        findings = _run_analysis_on_upload(_FakeUpload(path))
        error_count = sum(1 for finding in findings if finding.severity == "error")
        assert error_count > 0, (
            f"{path.name} should surface at least one error-level "
            f"Finding so the UI shows a red banner; got {len(findings)} "
            f"findings (none of severity 'error')."
        )

    def test_error_findings_have_a_description(self, path: Path) -> None:
        """Error findings need a description long enough to act on.

        For an empty / corrupt file the description is the ONLY thing
        the user sees, so it has to name the file or carry enough
        context for them to fix the underlying problem.
        """
        min_length = 20
        findings = _run_analysis_on_upload(_FakeUpload(path))
        # Only error-severity findings are held to the length floor.
        for finding in (item for item in findings if item.severity == "error"):
            # Synthetic error Findings always interpolate the file name;
            # analyzer-generated errors include the column or a
            # description that mentions what was wrong.
            assert len(finding.description) >= min_length, (
                f"{path.name}: error Finding description is too short "
                f"to be useful: {finding.description!r}"
            )
def test_corpus_contains_expected_shapes() -> None:
    """Check that every name in ``_MUST_BE_ERROR`` exists in the corpus.

    If a file is renamed in ``make_junk_corpus.py`` without updating
    ``_MUST_BE_ERROR``, the per-file parametrization would silently skip
    the must-be-error assertion for it; this test fails first and lists
    the unmatched names.
    """
    present = {entry.name for entry in _corpus_files()}
    missing = {name for name in _MUST_BE_ERROR if name not in present}
    assert not missing, (
        f"_MUST_BE_ERROR references files that don't exist in the "
        f"corpus: {sorted(missing)}. Regenerate the corpus or update "
        f"_MUST_BE_ERROR."
    )