# datatools-dev/tests/test_junk_corpus.py

"""Stress-test the upload analyzer against a corpus of pathological files.

Every file under ``test-cases/junk-corpus/test_data/`` is fed through
``_run_analysis_on_upload`` — the same path the GUI takes when a user
drops a file on the home page. The contract under test is:

* The call never raises. Errors must surface as a synthetic ``Finding``
  with severity ``"error"``, not a Python traceback that the page
  chrome bubbles up to the user.
* The return is always a list of :class:`Finding` (possibly empty for
  files the analyzer judges clean).
* Specific high-risk files (empty bytes, corrupt zip, etc.) MUST
  produce at least one error-level Finding so the UI shows a red
  banner rather than silently rendering "no issues found".

To add a new pathological shape:

1. Edit ``test-cases/junk-corpus/make_junk_corpus.py`` to write the new
   file under ``test_data/``.
2. Re-run that script to materialize the file on disk.
3. (Optional) Add the filename to ``_MUST_BE_ERROR`` below if the file
   represents a state where "no findings" would be a silent failure.
"""

from __future__ import annotations

from pathlib import Path

import pytest

from src.core.analyze import Finding
from src.gui.components._legacy import _run_analysis_on_upload


# Directory holding the generated junk files: two levels up from this
# file, then test-cases/junk-corpus/test_data.
_CORPUS = (
    Path(__file__)
    .resolve()
    .parent.parent.joinpath("test-cases", "junk-corpus", "test_data")
)


class _FakeUpload:
    """Path-backed stand-in for the Streamlit ``UploadedFile`` interface.

    Only the pieces duck-typed here are provided: a ``name`` attribute
    (the file's base name) and ``getvalue()`` (the raw file bytes).
    """

    def __init__(self, path: Path) -> None:
        # The whole file is read up front, so ``getvalue()`` never
        # touches the disk again.
        with path.open("rb") as handle:
            content = handle.read()
        self.name = path.name
        self._bytes = content

    def getvalue(self) -> bytes:
        """Return the bytes captured from the file at construction time."""
        return self._bytes


def _corpus_files() -> list[Path]:
    """Return the regular files directly under ``_CORPUS``, sorted by path.

    Subdirectories and other non-file entries are ignored.

    Raises:
        RuntimeError: If the corpus directory does not exist or contains
            no files. The message names the generator script to run.
    """
    # This function is called while the module is being imported (it
    # feeds ``pytest.mark.parametrize``). On a checkout where the corpus
    # was never generated the directory itself is absent, and
    # ``iterdir()`` would raise a bare FileNotFoundError. Treat that the
    # same as an empty corpus so the developer gets the actionable
    # message below instead.
    if _CORPUS.is_dir():
        files = sorted(p for p in _CORPUS.iterdir() if p.is_file())
    else:
        files = []
    if not files:
        raise RuntimeError(
            "Junk corpus is empty. Run "
            "`python test-cases/junk-corpus/make_junk_corpus.py` "
            f"to generate {_CORPUS}."
        )
    return files


# Corpus files so structurally broken that an empty result would be a
# silent failure. For each of these the analyzer MUST return at least
# one error-level Finding, because that Finding is what gives the user
# a red banner rather than the misleading "no issues found" success
# path.
_MUST_BE_ERROR: set[str] = {
    "corrupt_xlsx.xlsx",
    "empty.csv",
    "only_bom.csv",
    "only_nul.csv",
}


@pytest.mark.parametrize("path", _corpus_files(), ids=lambda p: p.name)
class TestJunkCorpus:
    """Per-file contract checks, run once for every file in the corpus.

    Each pathological file must make it through the analyzer without
    an exception escaping. The usefulness of error messages is covered
    separately by :meth:`test_error_findings_have_a_description`.
    """

    @staticmethod
    def _analyze(path: Path) -> list[Finding]:
        """Wrap *path* as a fake upload and run the upload analyzer on it."""
        return _run_analysis_on_upload(_FakeUpload(path))

    def test_no_exception_propagates(self, path: Path) -> None:
        """The analyzer call returns a list instead of raising."""
        # This is the core check. ``_run_analysis_on_upload`` is
        # expected to catch anything thrown by analyze() / pandas /
        # repair_bytes / openpyxl and report it as an error Finding.
        # An exception escaping here means the home page would crash
        # on this file in production.
        result = self._analyze(path)
        assert isinstance(result, list), (
            f"{path.name}: expected list[Finding], got {type(result).__name__}"
        )

    def test_findings_are_well_formed(self, path: Path) -> None:
        """Every returned item is a Finding with id, severity, description."""
        allowed_severities = ("info", "warn", "error")
        for finding in self._analyze(path):
            assert isinstance(finding, Finding), (
                f"{path.name}: non-Finding in result list: {finding!r}"
            )
            assert isinstance(finding.id, str) and finding.id, (
                f"{path.name}: Finding has empty id"
            )
            assert finding.severity in allowed_severities, (
                f"{path.name}: Finding has bad severity {finding.severity!r}"
            )
            assert isinstance(finding.description, str) and finding.description, (
                f"{path.name}: Finding has empty description"
            )

    def test_must_be_error_files_actually_flag(self, path: Path) -> None:
        """Files named in ``_MUST_BE_ERROR`` yield an error-level Finding."""
        # Only the must-be-error files are checked; the rest are skipped.
        if path.name not in _MUST_BE_ERROR:
            pytest.skip(f"{path.name} is allowed to pass clean")
        findings = self._analyze(path)
        error_count = sum(1 for item in findings if item.severity == "error")
        assert error_count > 0, (
            f"{path.name} should surface at least one error-level "
            f"Finding so the UI shows a red banner; got {len(findings)} "
            f"findings (none of severity 'error')."
        )

    def test_error_findings_have_a_description(self, path: Path) -> None:
        """Error findings need a description long enough to act on.

        When a file is empty or corrupt, the description is the ONLY
        thing shown to the user, so it must name the file or otherwise
        give enough context to fix the underlying problem.
        """
        error_findings = (
            item for item in self._analyze(path) if item.severity == "error"
        )
        for finding in error_findings:
            # Length is used as a proxy for usefulness: synthetic error
            # Findings always interpolate the file name, and
            # analyzer-generated errors include the column or a
            # description of what was wrong.
            assert len(finding.description) >= 20, (
                f"{path.name}: error Finding description is too short "
                f"to be useful: {finding.description!r}"
            )


def test_corpus_contains_expected_shapes() -> None:
    """Check that every ``_MUST_BE_ERROR`` name exists in the corpus.

    Guards against a file being renamed in ``make_junk_corpus.py``
    while ``_MUST_BE_ERROR`` still lists the old name. Without this
    check the per-file parametrization would simply skip the
    must-be-error assertion for it and nobody would notice.
    """
    present = {entry.name for entry in _corpus_files()}
    absent = [name for name in sorted(_MUST_BE_ERROR) if name not in present]
    assert not absent, (
        f"_MUST_BE_ERROR references files that don't exist in the "
        f"corpus: {absent}. Regenerate the corpus or update "
        f"_MUST_BE_ERROR."
    )