Build a corpus of 35 deliberately-broken files (empty bytes, NUL
bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped
quotes, corrupt zip, etc.) and pin the analyzer's stability contract
against them.
Files land in ``test-cases/junk-corpus/test_data/``. The generator
``make_junk_corpus.py`` produces most of them deterministically; the
two random samples (``random_bytes.csv`` and ``png_magic_as_csv.csv``)
draw from ``secrets.token_bytes``, so rerunning the generator rewrites
them with fresh bytes and the committed copies are the stable reference.
README documents the categories and how to add new shapes.
``tests/test_junk_corpus.py`` parametrizes over every file in the
corpus and asserts:
1. ``_run_analysis_on_upload`` never raises — exceptions must be
caught and surfaced as a synthetic ``Finding`` with
severity="error". This was the user-reported crash for
13_non_latin_scripts.csv that the previous fix in ae9d4a2
defensively wrapped; the corpus now stops the regression
from re-landing on a different shape.
2. Every Finding in the result list is well-formed (string id,
valid severity, non-empty description).
3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv,
corrupt_xlsx.xlsx) MUST surface at least one error-level
Finding — otherwise the GUI would render "no issues found"
for a structurally broken file.
4. Error-level Finding descriptions are at least 20 chars so the
UI banner gives the user something to act on.
Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py``
since that sweep is happy-path (round-trip the text cleaner) and
fights with files designed to break it. The contract is enforced
by the dedicated junk-corpus test, not the sweep.
Runtime: 12 s for the junk-corpus tests, 30 s for the full
project suite (was 19 s without these). 2118 tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
232 lines
8.3 KiB
Python
232 lines
8.3 KiB
Python
"""Generate a corpus of pathological files for stress-testing the upload
|
|
analyzer.
|
|
|
|
Each file in ``test_data/`` is deliberately broken in a different way:
|
|
empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns,
|
|
unescaped quotes, etc. The goal is to make sure ``_run_analysis_on_upload``
|
|
returns a clean error Finding (never a Python traceback) for any of them,
|
|
in any combination, on every operating system the GUI ships on.
|
|
|
|
Run::
|
|
|
|
python test-cases/junk-corpus/make_junk_corpus.py
|
|
|
|
The matching pytest at ``tests/test_junk_corpus.py`` iterates every file
|
|
in ``test_data/`` and asserts the analyzer either returns findings or an
|
|
error Finding — never raises.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import os
|
|
import secrets
|
|
import struct
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory that holds this generator script; the corpus is written to a
# ``test_data`` folder right next to it, independent of the caller's CWD.
_HERE = Path(__file__).resolve().parent
_OUT = _HERE.joinpath("test_data")
|
|
|
|
|
|
def write(name: str, data: bytes) -> None:
    """Store *data* as ``test_data/<name>`` and print a one-line size report.

    The payload is written verbatim in binary mode, so no newline or
    encoding translation is applied to the deliberately broken bytes.
    """
    target = _OUT.joinpath(name)
    with target.open("wb") as sink:
        sink.write(data)
    print(f" {name:<40} {len(data):>10} bytes")
|
|
|
|
|
|
def _valid_xlsx_bytes(*, sheet_xml: str) -> bytes:
    """Build a minimal .xlsx (a zip holding the five required XML parts).

    ``sheet_xml`` is inserted verbatim between ``<sheetData>`` and
    ``</sheetData>`` of the single worksheet; the rest of the workbook
    scaffolding is fixed. It is not escaped or validated, so the caller is
    responsible for passing well-formed row/cell XML (or an empty string
    for an empty sheet).

    Every archive member is written with a pinned timestamp, creator
    system and permission bits. ``ZipFile.writestr`` called with a plain
    name would instead record the current local time and a
    platform-dependent creator, which made the generated fixtures differ
    on every run.

    Returns:
        The complete archive as bytes.
    """
    # Earliest timestamp the zip format can represent.
    fixed_timestamp = (1980, 1, 1, 0, 0, 0)

    parts = (
        (
            "[Content_Types].xml",
            '<?xml version="1.0"?>'
            '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Default Extension="xml" ContentType="application/xml"/>'
            '<Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>'
            '<Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>'
            "</Types>",
        ),
        (
            "_rels/.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>'
            "</Relationships>",
        ),
        (
            "xl/_rels/workbook.xml.rels",
            '<?xml version="1.0"?>'
            '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>'
            "</Relationships>",
        ),
        (
            "xl/workbook.xml",
            '<?xml version="1.0"?>'
            '<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"'
            ' xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">'
            '<sheets><sheet name="Sheet1" sheetId="1" r:id="rId1"/></sheets>'
            "</workbook>",
        ),
        (
            "xl/worksheets/sheet1.xml",
            '<?xml version="1.0"?>'
            '<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">'
            f"<sheetData>{sheet_xml}</sheetData>"
            "</worksheet>",
        ),
    )

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as archive:
        for arcname, xml in parts:
            info = zipfile.ZipInfo(arcname, date_time=fixed_timestamp)
            # A hand-built ZipInfo defaults to ZIP_STORED; keep the members
            # deflated as before.
            info.compress_type = zipfile.ZIP_DEFLATED
            # Pin the creator system (3 = Unix) and use the same rw-------
            # mode that writestr() assigns when given a plain name.
            info.create_system = 3
            info.external_attr = 0o600 << 16
            archive.writestr(info, xml)
    return buf.getvalue()
|
|
|
|
|
|
def _junk_files() -> list[tuple[str, bytes]]:
    """Return ``(file_name, payload)`` for every corpus entry, in write order."""
    import hashlib  # local import keeps this block self-contained

    def noise(label: bytes, size: int) -> bytes:
        # SHAKE-256 output for a fixed label: looks random, but is the same
        # bytes on every run, so regenerating the corpus does not rewrite
        # the committed samples.
        return hashlib.shake_256(label).digest(size)

    sample_text = "id,name,note\n1,café,hello\n2,naïve,world\n"
    header_row_xml = (
        '<row r="1">'
        '<c r="A1" t="inlineStr"><is><t>id</t></is></c>'
        '<c r="B1" t="inlineStr"><is><t>name</t></is></c>'
        "</row>"
    )

    return [
        # ---- Empty / near-empty ---------------------------------------------
        ("empty.csv", b""),
        ("only_whitespace.csv", b" \t\n \n\t \n"),
        ("only_bom.csv", b"\xef\xbb\xbf"),
        ("only_nul.csv", b"\x00" * 64),
        ("just_newlines.csv", b"\n\n\n\n\n"),
        ("header_only.csv", b"id,name,note\n"),
        # ---- Random-looking / binary garbage --------------------------------
        ("random_bytes.csv", noise(b"junk-corpus/random_bytes.csv", 2048)),
        # Bytes that look like a PNG signature plus garbage; would mislead
        # any naive file-type sniffer.
        (
            "png_magic_as_csv.csv",
            b"\x89PNG\r\n\x1a\n" + noise(b"junk-corpus/png_magic_as_csv.csv", 512),
        ),
        # ---- Truncated / structurally damaged -------------------------------
        # Row 2 ends mid-cell.
        ("truncated_mid_row.csv", b"id,name,note\n1,alice,hello\n2,bob,wor"),
        # 10 KB single line, no newline anywhere.
        ("one_huge_line.csv", b"a," * 5_000),
        (
            "massive_columns.csv",
            (
                ",".join(f"c{i}" for i in range(500)) + "\n"
                + ",".join(["x"] * 500) + "\n"
            ).encode(),
        ),
        (
            "single_column.csv",
            b"\n".join([b"id"] + [str(i).encode() for i in range(20)]) + b"\n",
        ),
        # ---- Wrong / misleading delimiter -----------------------------------
        ("tsv_as_csv.csv", b"id\tname\tnote\n1\talice\thi\n2\tbob\tworld\n"),
        ("mixed_delimiters.csv", b"id,name\tnote;extra|tail\n1,alice\thi;x|y\n"),
        # ---- Encoding chaos -------------------------------------------------
        ("utf16_le_no_bom.csv", sample_text.encode("utf-16-le")),
        ("utf16_be_with_bom.csv", b"\xfe\xff" + sample_text.encode("utf-16-be")),
        ("utf32_le.csv", sample_text.encode("utf-32-le")),
        # Latin-1 encoded text: the accented letters become the single bytes
        # 0xE9 / 0xEF, which are not valid UTF-8 in this position. A strict
        # UTF-8 decode fails; a reader that guesses the wrong fallback codec
        # shows mojibake.
        ("mojibake.csv", sample_text.encode("latin-1")),
        # Bytes that can never be part of valid UTF-8: 0xFF and 0xFE are
        # illegal everywhere and 0xFD is not a valid lead byte. The data row
        # also has one more field than the header.
        ("invalid_utf8.csv", b"id,name\n1,\xff\xfe\xfd,hello\n"),
        # cp1252-encoded smart quotes in column values. cp1252 maps the
        # smart-quote glyphs to bytes 0x91-0x94; the surrounding ASCII and
        # the accented "é" are just there to keep the value realistic.
        (
            "cp1252_smart_quotes.csv",
            b"id,quote\n1,"
            + "café ".encode("cp1252")
            + b"\x93smart\x94 \x91quote\x92"
            + b"\n",
        ),
        # ---- Quoting and field-shape pathologies ----------------------------
        (
            "unescaped_quotes.csv",
            b'id,note\n1,"this has " unescaped quote"\n2,"normal"\n',
        ),
        (
            "embedded_newlines.csv",
            b'id,note\n1,"line one\nline two"\n2,"single line"\n',
        ),
        (
            "mismatched_columns.csv",
            b"id,name,note\n1,alice,hi\n2,bob\n3,carol,hi,extra,fields\n",
        ),
        ("duplicate_headers.csv", b"col,col,col\n1,2,3\n4,5,6\n"),
        ("empty_header_names.csv", b",,,\n1,2,3,4\n5,6,7,8\n"),
        ("trailing_commas.csv", b"id,name,note,\n1,alice,hi,\n2,bob,wo,\n"),
        # ---- Content pathologies --------------------------------------------
        (
            "all_nulls.csv",
            b"id,name,note\nNULL,NULL,NULL\nN/A,NA,(null)\nNone,nan,?\n",
        ),
        ("very_wide_cell.csv", b'id,blob\n1,"' + b"x" * 10_000 + b'"\n'),
        ("all_same_row.csv", b"id,name,note\n" + b"1,alice,hello\n" * 100),
        # ---- Extension confusion --------------------------------------------
        ("no_extension", b"id,name,note\n1,alice,hi\n"),
        ("weird_extension.foo", b"id,name,note\n1,alice,hi\n"),
        ("double_extension.csv.txt", b"id,name,note\n1,alice,hi\n"),
        # ---- Excel-specific pathologies -------------------------------------
        # Starts with the zip local-file-header magic but is not a real zip.
        ("corrupt_xlsx.xlsx", b"PK\x03\x04 not really a zip file"),
        # Valid xlsx with an entirely empty sheet.
        ("excel_empty.xlsx", _valid_xlsx_bytes(sheet_xml="")),
        # Valid xlsx with one row of headers and no data.
        ("excel_header_only.xlsx", _valid_xlsx_bytes(sheet_xml=header_row_xml)),
    ]


def main() -> None:
    """Create ``test_data/`` if needed and (re)write every corpus file.

    Prints one line per file via ``write`` and finishes with the number of
    files written by this run. Files already present in the directory that
    are not part of the corpus are left alone and are not counted.
    """
    _OUT.mkdir(parents=True, exist_ok=True)
    print(f"Writing junk corpus to {_OUT}")

    corpus = _junk_files()
    for name, data in corpus:
        write(name, data)

    # Count what this run produced rather than listing the directory, which
    # may also hold stale or unrelated entries.
    print(f"\nWrote {len(corpus)} files.")
|
|
|
|
|
|
# Regenerate the corpus only when executed as a script, never on import.
if __name__ == "__main__":
    main()
|