Files
datatools-dev/test-cases/junk-corpus/make_junk_corpus.py
Michael 696996c119 test(junk-corpus): pathological-input stress suite for the analyzer
Build a corpus of 35 deliberately-broken files (empty bytes, NUL
bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped
quotes, corrupt zip, etc.) and pin the analyzer's stability contract
against them.

Files land in ``test-cases/junk-corpus/test_data/``. The generator
``make_junk_corpus.py`` rebuilds them. Two samples
(``random_bytes.csv`` and ``png_magic_as_csv.csv``) draw from
``secrets.token_bytes``, so regenerating the corpus changes their
bytes; the committed copies are what keep the test inputs stable.
README documents the categories and how to add new shapes.

``tests/test_junk_corpus.py`` parametrizes over every file in the
corpus and asserts:

1. ``_run_analysis_on_upload`` never raises — exceptions must be
   caught and surfaced as a synthetic ``Finding`` with
   severity="error". This was the user-reported crash for
   13_non_latin_scripts.csv that the previous fix in ae9d4a2
   defensively wrapped; the corpus now stops the regression
   from re-landing on a different shape.
2. Every Finding in the result list is well-formed (string id,
   valid severity, non-empty description).
3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv,
   corrupt_xlsx.xlsx) MUST surface at least one error-level
   Finding — otherwise the GUI would render "no issues found"
   for a structurally broken file.
4. Error-level Finding descriptions are at least 20 chars so the
   UI banner gives the user something to act on.

Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py``
since that sweep is happy-path (round-trip the text cleaner) and
fights with files designed to break it. The contract is enforced
by the dedicated junk-corpus test, not the sweep.

Runtime: 12 s for the junk-corpus tests, 30 s for the full
project suite (was 19 s without these). 2118 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 21:35:22 +00:00

232 lines
8.3 KiB
Python

"""Generate a corpus of pathological files for stress-testing the upload
analyzer.

Each file in ``test_data/`` is deliberately broken in a different way:
empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns,
unescaped quotes, etc. The goal is to make sure ``_run_analysis_on_upload``
returns a clean error Finding (never a Python traceback) for any of them,
in any combination, on every operating system the GUI ships on.

Run::

    python test-cases/junk-corpus/make_junk_corpus.py

The matching pytest at ``tests/test_junk_corpus.py`` iterates every file
in ``test_data/`` and asserts the analyzer either returns findings or an
error Finding — never raises.
"""
from __future__ import annotations
import io
import os
import secrets
import struct
import zipfile
from pathlib import Path
# Directory that contains this script, resolved to an absolute path so the
# output location does not depend on the current working directory.
_HERE = Path(__file__).resolve().parents[0]
# Every generated sample is written directly into this sub-directory.
_OUT = _HERE.joinpath("test_data")
def write(name: str, data: bytes) -> None:
    """Store *data* as ``test_data/<name>`` and print one summary line.

    The summary line shows the file name left-aligned in 40 columns and
    the byte count right-aligned in 10 columns.
    """
    target = _OUT.joinpath(name)
    target.write_bytes(data)
    size = len(data)
    print(f" {name:<40} {size:>10} bytes")
def _valid_xlsx_bytes(*, sheet_xml: str) -> bytes:
    """Build a minimal .xlsx (a zip holding the required OOXML parts).

    ``sheet_xml`` is inserted verbatim as the inner ``<sheetData>``
    content of the single worksheet; the rest of the workbook scaffolding
    (content types, package and workbook relationships, workbook part) is
    filled in around it. The result is intended to be just enough for
    pandas to load.

    Every zip member carries a fixed timestamp instead of the current
    time, so two calls with the same ``sheet_xml`` return identical
    bytes regardless of when they run.

    Returns:
        The complete archive as bytes.
    """
    # (archive name, XML payload) in the order the members are written.
    parts = (
        (
            "[Content_Types].xml",
            '<?xml version="1.0"?>'
            '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Default Extension="xml" ContentType="application/xml"/>'
            '<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>'
            '<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>'
            "</Types>",
        ),
        (
            "_rels/.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>'
            "</Relationships>",
        ),
        (
            "xl/_rels/workbook.xml.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>'
            "</Relationships>",
        ),
        (
            "xl/workbook.xml",
            '<?xml version="1.0"?>'
            '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"'
            ' xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
            '<sheets><sheet name="Sheet1" sheetId="1" r:id="rId1"/></sheets>'
            "</workbook>",
        ),
        (
            "xl/worksheets/sheet1.xml",
            '<?xml version="1.0"?>'
            '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
            f"<sheetData>{sheet_xml}</sheetData>"
            "</worksheet>",
        ),
    )

    # Earliest timestamp the zip format can represent. Passing a plain
    # string name to ``writestr`` would stamp each member with the current
    # local time and make the output differ from run to run.
    fixed_time = (1980, 1, 1, 0, 0, 0)

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as archive:
        for arcname, xml in parts:
            member = zipfile.ZipInfo(arcname, date_time=fixed_time)
            # Mirror what ``writestr`` sets up for a plain string name:
            # the archive's compression method and rw------- permissions.
            member.compress_type = zipfile.ZIP_DEFLATED
            member.external_attr = 0o600 << 16
            archive.writestr(member, xml)
    return buf.getvalue()
def main() -> None:
    """Regenerate every junk-corpus sample under ``test_data/``.

    Creates the output directory when it is missing, writes each
    pathological sample through ``write`` (which prints one line per
    file), and finally prints how many entries the directory holds.

    The two "random garbage" samples are derived from a SHAKE-256 stream
    keyed by file name rather than from ``secrets.token_bytes``, so
    re-running the generator reproduces the same bytes instead of
    rewriting those files with fresh random data each time.
    """
    # Function-scope import: only the deterministic noise helper needs it.
    import hashlib

    def noise(label: str, size: int) -> bytes:
        """Return *size* pseudo-random bytes that depend only on *label*."""
        return hashlib.shake_256(label.encode("utf-8")).digest(size)

    _OUT.mkdir(parents=True, exist_ok=True)
    print(f"Writing junk corpus to {_OUT}")

    # ---- Empty / near-empty -------------------------------------------------
    write("empty.csv", b"")
    write("only_whitespace.csv", b" \t\n \n\t \n")
    write("only_bom.csv", b"\xef\xbb\xbf")
    write("only_nul.csv", b"\x00" * 64)
    write("just_newlines.csv", b"\n\n\n\n\n")
    write("header_only.csv", b"id,name,note\n")

    # ---- Random / binary garbage -------------------------------------------
    write("random_bytes.csv", noise("junk-corpus/random_bytes.csv", 2048))
    # Bytes that look like a PNG signature plus garbage; would mislead any
    # naive file-type sniffer.
    write(
        "png_magic_as_csv.csv",
        b"\x89PNG\r\n\x1a\n" + noise("junk-corpus/png_magic_as_csv.csv", 512),
    )

    # ---- Truncated / structurally damaged ----------------------------------
    write(
        "truncated_mid_row.csv",
        b"id,name,note\n1,alice,hello\n2,bob,wor",  # row 2 ends mid-cell
    )
    write(
        "one_huge_line.csv",
        b"a," * 5_000,  # 10KB single line, no newline anywhere
    )
    write(
        "massive_columns.csv",
        (",".join(f"c{i}" for i in range(500)) + "\n"
         + ",".join("x" for _ in range(500)) + "\n").encode(),
    )
    write(
        "single_column.csv",
        b"\n".join([b"id"] + [str(i).encode() for i in range(20)]) + b"\n",
    )

    # ---- Wrong / misleading delimiter --------------------------------------
    write(
        "tsv_as_csv.csv",
        b"id\tname\tnote\n1\talice\thi\n2\tbob\tworld\n",
    )
    write(
        "mixed_delimiters.csv",
        b"id,name\tnote;extra|tail\n1,alice\thi;x|y\n",
    )

    # ---- Encoding chaos ----------------------------------------------------
    sample_text = "id,name,note\n1,café,hello\n2,naïve,world\n"
    write("utf16_le_no_bom.csv", sample_text.encode("utf-16-le"))
    write("utf16_be_with_bom.csv", b"\xfe\xff" + sample_text.encode("utf-16-be"))
    write("utf32_le.csv", sample_text.encode("utf-32-le"))
    # Latin-1 encoded text: the single bytes for "é" (0xE9) and "ï" (0xEF)
    # are each followed by ASCII, which is not a valid UTF-8 sequence, so a
    # UTF-8 reader has to fail or fall back to another codec.
    write("mojibake.csv", sample_text.encode("latin-1"))
    # Bytes that can never occur in well-formed UTF-8 (0xFF, 0xFE, 0xFD).
    write("invalid_utf8.csv", b"id,name\n1,\xff\xfe\xfd,hello\n")
    # cp1252-encoded smart quotes in column values. cp1252 ascribes
    # smart-quote glyphs to bytes 0x91-0x94; the surrounding ASCII +
    # accented "é" is just there to keep the value realistic.
    write(
        "cp1252_smart_quotes.csv",
        b"id,quote\n1,"
        + "café ".encode("cp1252")
        + b"\x93smart\x94 \x91quote\x92"
        + b"\n",
    )

    # ---- Quoting and field-shape pathologies -------------------------------
    write(
        "unescaped_quotes.csv",
        b'id,note\n1,"this has " unescaped quote"\n2,"normal"\n',
    )
    write(
        "embedded_newlines.csv",
        b'id,note\n1,"line one\nline two"\n2,"single line"\n',
    )
    write(
        "mismatched_columns.csv",
        b"id,name,note\n1,alice,hi\n2,bob\n3,carol,hi,extra,fields\n",
    )
    write(
        "duplicate_headers.csv",
        b"col,col,col\n1,2,3\n4,5,6\n",
    )
    write(
        "empty_header_names.csv",
        b",,,\n1,2,3,4\n5,6,7,8\n",
    )
    write(
        "trailing_commas.csv",
        b"id,name,note,\n1,alice,hi,\n2,bob,wo,\n",
    )

    # ---- Content pathologies ----------------------------------------------
    write(
        "all_nulls.csv",
        b"id,name,note\nNULL,NULL,NULL\nN/A,NA,(null)\nNone,nan,?\n",
    )
    write(
        "very_wide_cell.csv",
        b'id,blob\n1,"' + b"x" * 10_000 + b'"\n',
    )
    write(
        "all_same_row.csv",
        b"id,name,note\n" + b"1,alice,hello\n" * 100,
    )

    # ---- Extension confusion ----------------------------------------------
    write("no_extension", b"id,name,note\n1,alice,hi\n")
    write(
        "weird_extension.foo",
        b"id,name,note\n1,alice,hi\n",
    )
    write(
        "double_extension.csv.txt",
        b"id,name,note\n1,alice,hi\n",
    )

    # ---- Excel-specific pathologies ----------------------------------------
    # Starts with the zip local-file-header magic but is not a real zip.
    write("corrupt_xlsx.xlsx", b"PK\x03\x04 not really a zip file")
    # Valid xlsx with an entirely empty sheet.
    write("excel_empty.xlsx", _valid_xlsx_bytes(sheet_xml=""))
    # Valid xlsx with one row of headers and no data.
    write(
        "excel_header_only.xlsx",
        _valid_xlsx_bytes(
            sheet_xml=(
                '<row r="1">'
                '<c r="A1" t="inlineStr"><is><t>id</t></is></c>'
                '<c r="B1" t="inlineStr"><is><t>name</t></is></c>'
                "</row>"
            ),
        ),
    )

    # Counts every entry currently in the output directory, including any
    # left over from earlier runs, not just the files written above.
    print(f"\nWrote {sum(1 for _ in _OUT.iterdir())} files.")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()