# datatools-dev/test-cases/junk-corpus/make_junk_corpus.py

"""Generate a corpus of pathological files for stress-testing the upload
analyzer.

Each file in ``test_data/`` is deliberately broken in a different way:
empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns,
unescaped quotes, etc. The goal is to make sure ``_run_analysis_on_upload``
returns a clean error Finding (never a Python traceback) for any of them,
in any combination, on every operating system the GUI ships on.

Run::

    python test-cases/junk-corpus/make_junk_corpus.py

The matching pytest at ``tests/test_junk_corpus.py`` iterates every file
in ``test_data/`` and asserts the analyzer either returns findings or an
error Finding — never raises.
"""

from __future__ import annotations

import io
import os
import secrets
import struct
import zipfile
from pathlib import Path


_HERE = Path(__file__).resolve().parent
_OUT = _HERE / "test_data"


def write(name: str, data: bytes) -> None:
    """Store *data* as ``test_data/<name>`` and print a one-line size report.

    The file is created (or overwritten) inside the module-level ``_OUT``
    directory; that directory is not created here, so it must already exist.
    The report line shows the name left-aligned in 40 columns followed by
    the byte count right-aligned in 10 columns.
    """
    destination = _OUT.joinpath(name)
    destination.write_bytes(data)
    byte_count = len(data)
    print(f"  {name:<40} {byte_count:>10} bytes")

def _valid_xlsx_bytes(*, sheet_xml: str) -> bytes:
    """Build a minimal but valid .xlsx (zip with the required parts).

    ``sheet_xml`` is the inner ``<sheetData>`` content; the rest of the
    workbook scaffolding is filled in around it. Good enough for pandas
    to load.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?>'
            '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Default Extension="xml" ContentType="application/xml"/>'
            '<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>'
            '<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>'
            "</Relationships>",
        )
        z.writestr(
            "xl/_rels/workbook.xml.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>'
            "</Relationships>",
        )
        z.writestr(
            "xl/workbook.xml",
            '<?xml version="1.0"?>'
            '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"'
            ' xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
            '<sheets><sheet name="Sheet1" sheetId="1" r:id="rId1"/></sheets>'
            "</workbook>",
        )
        z.writestr(
            "xl/worksheets/sheet1.xml",
            '<?xml version="1.0"?>'
            '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
            f"<sheetData>{sheet_xml}</sheetData>"
            "</worksheet>",
        )
    return buf.getvalue()


def _corpus_entries() -> list[tuple[str, bytes]]:
    """Return every junk file as an ordered ``(file name, payload)`` pair.

    The list order is the order in which the files are written and reported.
    Two payloads (``random_bytes.csv`` and ``png_magic_as_csv.csv``) come from
    ``secrets.token_bytes`` and therefore differ on every run; the rest are
    fixed literals or built by ``_valid_xlsx_bytes``.
    """
    # Shared text for the encoding cases; the accented "é" / "ï" are what
    # make the different encodings produce different byte sequences.
    sample_text = "id,name,note\n1,café,hello\n2,naïve,world\n"
    # Perfectly ordinary CSV content, reused under misleading file names.
    plain_csv = b"id,name,note\n1,alice,hi\n"

    return [
        # ---- Empty / near-empty ---------------------------------------------
        ("empty.csv", b""),
        ("only_whitespace.csv", b"   \t\n  \n\t  \n"),
        ("only_bom.csv", b"\xef\xbb\xbf"),
        ("only_nul.csv", b"\x00" * 64),
        ("just_newlines.csv", b"\n\n\n\n\n"),
        ("header_only.csv", b"id,name,note\n"),

        # ---- Random / binary garbage ----------------------------------------
        ("random_bytes.csv", secrets.token_bytes(2048)),
        # Bytes that look like a PNG signature plus garbage; would mislead any
        # naive file-type sniffer.
        ("png_magic_as_csv.csv", b"\x89PNG\r\n\x1a\n" + secrets.token_bytes(512)),

        # ---- Truncated / structurally damaged -------------------------------
        # Row 2 ends mid-cell, with no trailing newline.
        ("truncated_mid_row.csv", b"id,name,note\n1,alice,hello\n2,bob,wor"),
        # 10,000 bytes on a single line, no newline anywhere.
        ("one_huge_line.csv", b"a," * 5_000),
        (
            "massive_columns.csv",
            (
                ",".join(f"c{i}" for i in range(500)) + "\n"
                + ",".join("x" for _ in range(500)) + "\n"
            ).encode(),
        ),
        (
            "single_column.csv",
            b"\n".join([b"id"] + [str(i).encode() for i in range(20)]) + b"\n",
        ),

        # ---- Wrong / misleading delimiter -----------------------------------
        ("tsv_as_csv.csv", b"id\tname\tnote\n1\talice\thi\n2\tbob\tworld\n"),
        ("mixed_delimiters.csv", b"id,name\tnote;extra|tail\n1,alice\thi;x|y\n"),

        # ---- Encoding chaos -------------------------------------------------
        ("utf16_le_no_bom.csv", sample_text.encode("utf-16-le")),
        ("utf16_be_with_bom.csv", b"\xfe\xff" + sample_text.encode("utf-16-be")),
        ("utf32_le.csv", sample_text.encode("utf-32-le")),
        # Latin-1 bytes for the accented text. Read as UTF-8 they are invalid
        # (0xE9 / 0xEF are multi-byte lead bytes that are not followed by
        # continuation bytes), so a UTF-8 decoder must either fail or
        # substitute replacement characters.
        ("mojibake.csv", sample_text.encode("latin-1")),
        # 0xFF / 0xFE / 0xFD can never occur in well-formed UTF-8. The data
        # row also carries one more field than the two-column header.
        ("invalid_utf8.csv", b"id,name\n1,\xff\xfe\xfd,hello\n"),
        # cp1252-encoded smart quotes in column values. cp1252 assigns
        # smart-quote glyphs to bytes 0x91-0x94; the surrounding ASCII +
        # accented "é" is just there to keep the value realistic.
        (
            "cp1252_smart_quotes.csv",
            b"id,quote\n1,"
            + "café ".encode("cp1252")
            + b"\x93smart\x94 \x91quote\x92"
            + b"\n",
        ),

        # ---- Quoting and field-shape pathologies ----------------------------
        (
            "unescaped_quotes.csv",
            b'id,note\n1,"this has " unescaped quote"\n2,"normal"\n',
        ),
        (
            "embedded_newlines.csv",
            b'id,note\n1,"line one\nline two"\n2,"single line"\n',
        ),
        (
            "mismatched_columns.csv",
            b"id,name,note\n1,alice,hi\n2,bob\n3,carol,hi,extra,fields\n",
        ),
        ("duplicate_headers.csv", b"col,col,col\n1,2,3\n4,5,6\n"),
        ("empty_header_names.csv", b",,,\n1,2,3,4\n5,6,7,8\n"),
        ("trailing_commas.csv", b"id,name,note,\n1,alice,hi,\n2,bob,wo,\n"),

        # ---- Content pathologies --------------------------------------------
        (
            "all_nulls.csv",
            b"id,name,note\nNULL,NULL,NULL\nN/A,NA,(null)\nNone,nan,?\n",
        ),
        ("very_wide_cell.csv", b'id,blob\n1,"' + b"x" * 10_000 + b'"\n'),
        ("all_same_row.csv", b"id,name,note\n" + b"1,alice,hello\n" * 100),

        # ---- Extension confusion --------------------------------------------
        ("no_extension", plain_csv),
        ("weird_extension.foo", plain_csv),
        ("double_extension.csv.txt", plain_csv),

        # ---- Excel-specific pathologies -------------------------------------
        # Not a real zip — pandas/openpyxl should error cleanly.
        ("corrupt_xlsx.xlsx", b"PK\x03\x04 not really a zip file"),
        # Valid xlsx with an entirely empty sheet.
        ("excel_empty.xlsx", _valid_xlsx_bytes(sheet_xml="")),
        # Valid xlsx with one row of headers and no data.
        (
            "excel_header_only.xlsx",
            _valid_xlsx_bytes(
                sheet_xml=(
                    '<row r="1">'
                    '<c r="A1" t="inlineStr"><is><t>id</t></is></c>'
                    '<c r="B1" t="inlineStr"><is><t>name</t></is></c>'
                    "</row>"
                ),
            ),
        ),
    ]


def main() -> None:
    """Create ``test_data/`` if needed and (re)write every junk-corpus file.

    Prints one report line per file, then a summary. The summary counts the
    files written by this run rather than listing the directory, so stale or
    unrelated files already sitting in ``test_data/`` do not inflate it.
    """
    _OUT.mkdir(parents=True, exist_ok=True)
    print(f"Writing junk corpus to {_OUT}")

    entries = _corpus_entries()
    for name, data in entries:
        write(name, data)

    print(f"\nWrote {len(entries)} files.")


if __name__ == "__main__":
    main()