"""Generate a corpus of pathological files for stress-testing the upload analyzer. Each file in ``test_data/`` is deliberately broken in a different way: empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped quotes, etc. The goal is to make sure ``_run_analysis_on_upload`` returns a clean error Finding (never a Python traceback) for any of them, in any combination, on every operating system the GUI ships on. Run:: python test-cases/junk-corpus/make_junk_corpus.py The matching pytest at ``tests/test_junk_corpus.py`` iterates every file in ``test_data/`` and asserts the analyzer either returns findings or an error Finding — never raises. """ from __future__ import annotations import io import os import secrets import struct import zipfile from pathlib import Path _HERE = Path(__file__).resolve().parent _OUT = _HERE / "test_data" def write(name: str, data: bytes) -> None: """Write *data* to ``test_data/name`` and report the size.""" path = _OUT / name path.write_bytes(data) print(f" {name:<40} {len(data):>10} bytes") def _valid_xlsx_bytes(*, sheet_xml: str) -> bytes: """Build a minimal but valid .xlsx (zip with the required parts). ``sheet_xml`` is the inner ```` content; the rest of the workbook scaffolding is filled in around it. Good enough for pandas to load. """ buf = io.BytesIO() with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z: z.writestr( "[Content_Types].xml", '' '' '' '' '' '' "", ) z.writestr( "_rels/.rels", '' '' '' "", ) z.writestr( "xl/_rels/workbook.xml.rels", '' '' '' "", ) z.writestr( "xl/workbook.xml", '' '' '' "", ) z.writestr( "xl/worksheets/sheet1.xml", '' '' f"{sheet_xml}" "", ) return buf.getvalue() def main() -> None: _OUT.mkdir(parents=True, exist_ok=True) print(f"Writing junk corpus to {_OUT}") # ---- Empty / near-empty ------------------------------------------------- write("empty.csv", b"") write("only_whitespace.csv", b" \t\n \n\t \n") write("only_bom.csv", b"\xef\xbb\xbf") write("only_nul.csv", b"\x00" * 64) write("just_newlines.csv", b"\n\n\n\n\n") write("header_only.csv", b"id,name,note\n") # ---- Random / binary garbage ------------------------------------------- write("random_bytes.csv", secrets.token_bytes(2048)) # Bytes that look like a PNG signature plus garbage; would mislead any # naive file-type sniffer. write("png_magic_as_csv.csv", b"\x89PNG\r\n\x1a\n" + secrets.token_bytes(512)) # ---- Truncated / structurally damaged ---------------------------------- write( "truncated_mid_row.csv", b"id,name,note\n1,alice,hello\n2,bob,wor", # row 2 ends mid-cell ) write( "one_huge_line.csv", b"a," * 5_000, # 10KB single line, no newline anywhere ) write( "massive_columns.csv", (",".join(f"c{i}" for i in range(500)) + "\n" + ",".join("x" for _ in range(500)) + "\n").encode(), ) write( "single_column.csv", b"\n".join([b"id"] + [str(i).encode() for i in range(20)]) + b"\n", ) # ---- Wrong / misleading delimiter -------------------------------------- write( "tsv_as_csv.csv", b"id\tname\tnote\n1\talice\thi\n2\tbob\tworld\n", ) write( "mixed_delimiters.csv", b"id,name\tnote;extra|tail\n1,alice\thi;x|y\n", ) # ---- Encoding chaos ---------------------------------------------------- sample_text = "id,name,note\n1,café,hello\n2,naïve,world\n" write("utf16_le_no_bom.csv", sample_text.encode("utf-16-le")) write("utf16_be_with_bom.csv", b"\xfe\xff" + sample_text.encode("utf-16-be")) write("utf32_le.csv", sample_text.encode("utf-32-le")) # Latin-1 bytes that decode as UTF-8 produce mojibake (Ã©, Ã¯ etc.) write("mojibake.csv", sample_text.encode("latin-1")) # Bytes that aren't valid UTF-8 (lone continuation bytes) write("invalid_utf8.csv", b"id,name\n1,\xff\xfe\xfd,hello\n") # cp1252-encoded smart quotes in column values. cp1252 ascribes # smart-quote glyphs to bytes 0x91-0x94; the surrounding ASCII + # accented "é" is just there to keep the value realistic. write( "cp1252_smart_quotes.csv", b"id,quote\n1," + "café ".encode("cp1252") + b"\x93smart\x94 \x91quote\x92" + b"\n", ) # ---- Quoting and field-shape pathologies ------------------------------- write( "unescaped_quotes.csv", b'id,note\n1,"this has " unescaped quote"\n2,"normal"\n', ) write( "embedded_newlines.csv", b'id,note\n1,"line one\nline two"\n2,"single line"\n', ) write( "mismatched_columns.csv", b"id,name,note\n1,alice,hi\n2,bob\n3,carol,hi,extra,fields\n", ) write( "duplicate_headers.csv", b"col,col,col\n1,2,3\n4,5,6\n", ) write( "empty_header_names.csv", b",,,\n1,2,3,4\n5,6,7,8\n", ) write( "trailing_commas.csv", b"id,name,note,\n1,alice,hi,\n2,bob,wo,\n", ) # ---- Content pathologies ---------------------------------------------- write( "all_nulls.csv", b"id,name,note\nNULL,NULL,NULL\nN/A,NA,(null)\nNone,nan,?\n", ) write( "very_wide_cell.csv", b'id,blob\n1,"' + b"x" * 10_000 + b'"\n', ) write( "all_same_row.csv", b"id,name,note\n" + b"1,alice,hello\n" * 100, ) # ---- Extension confusion ---------------------------------------------- write("no_extension", b"id,name,note\n1,alice,hi\n") write( "weird_extension.foo", b"id,name,note\n1,alice,hi\n", ) write( "double_extension.csv.txt", b"id,name,note\n1,alice,hi\n", ) # ---- Excel-specific pathologies ---------------------------------------- # Not a real zip — pandas/openpyxl should error cleanly. write("corrupt_xlsx.xlsx", b"PK\x03\x04 not really a zip file") # Valid xlsx with an entirely empty sheet. write("excel_empty.xlsx", _valid_xlsx_bytes(sheet_xml="")) # Valid xlsx with one row of headers and no data. write( "excel_header_only.xlsx", _valid_xlsx_bytes( sheet_xml=( '' 'id' 'name' "" ), ), ) print(f"\nWrote {len(list(_OUT.iterdir()))} files.") if __name__ == "__main__": main()