From 696996c11962df4b9f23b4273c0d221bf2756d32 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 16 May 2026 21:35:22 +0000 Subject: [PATCH] test(junk-corpus): pathological-input stress suite for the analyzer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build a corpus of 35 deliberately-broken files (empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns, unescaped quotes, corrupt zip, etc.) and pin the analyzer's stability contract against them. Files land in ``test-cases/junk-corpus/test_data/``. The generator ``make_junk_corpus.py`` produces them deterministically, except for two random samples (``random_bytes.csv`` and ``png_magic_as_csv.csv``) that draw from ``secrets.token_bytes`` — the committed bytes are the stable reference, and re-running the generator rewrites those two files with fresh random content. README documents the categories and how to add new shapes. ``tests/test_junk_corpus.py`` parametrizes over every file in the corpus and asserts: 1. ``_run_analysis_on_upload`` never raises — exceptions must be caught and surfaced as a synthetic ``Finding`` with severity="error". This was the user-reported crash for 13_non_latin_scripts.csv that the previous fix in ae9d4a2 defensively wrapped; the corpus now stops the regression from re-landing on a different shape. 2. Every Finding in the result list is well-formed (string id, valid severity, non-empty description). 3. A high-risk subset (empty.csv, only_bom.csv, only_nul.csv, corrupt_xlsx.xlsx) MUST surface at least one error-level Finding — otherwise the GUI would render "no issues found" for a structurally broken file. 4. Error-level Finding descriptions are at least 20 chars so the UI banner gives the user something to act on. Also exclude ``junk-corpus`` from ``tests/test_fixtures_sweep.py`` since that sweep is happy-path (round-trip the text cleaner) and fights with files designed to break it. The contract is enforced by the dedicated junk-corpus test, not the sweep. 
Runtime: 12 s for the junk-corpus tests, 30 s for the full project suite (was 19 s without these). 2118 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- test-cases/junk-corpus/README.md | 63 +++++ test-cases/junk-corpus/make_junk_corpus.py | 231 ++++++++++++++++++ .../junk-corpus/test_data/all_nulls.csv | 4 + .../junk-corpus/test_data/all_same_row.csv | 101 ++++++++ .../junk-corpus/test_data/corrupt_xlsx.xlsx | 1 + .../test_data/cp1252_smart_quotes.csv | 2 + .../test_data/double_extension.csv.txt | 2 + .../test_data/duplicate_headers.csv | 3 + .../test_data/embedded_newlines.csv | 4 + test-cases/junk-corpus/test_data/empty.csv | 0 .../test_data/empty_header_names.csv | 3 + .../junk-corpus/test_data/excel_empty.xlsx | Bin 0 -> 1392 bytes .../test_data/excel_header_only.xlsx | Bin 0 -> 1442 bytes .../junk-corpus/test_data/header_only.csv | 1 + .../junk-corpus/test_data/invalid_utf8.csv | 2 + .../junk-corpus/test_data/just_newlines.csv | 5 + .../junk-corpus/test_data/massive_columns.csv | 2 + .../test_data/mismatched_columns.csv | 4 + .../test_data/mixed_delimiters.csv | 2 + test-cases/junk-corpus/test_data/mojibake.csv | 3 + test-cases/junk-corpus/test_data/no_extension | 2 + .../junk-corpus/test_data/one_huge_line.csv | 1 + test-cases/junk-corpus/test_data/only_bom.csv | 1 + test-cases/junk-corpus/test_data/only_nul.csv | Bin 0 -> 64 bytes .../junk-corpus/test_data/only_whitespace.csv | 3 + .../test_data/png_magic_as_csv.csv | Bin 0 -> 520 bytes .../junk-corpus/test_data/random_bytes.csv | Bin 0 -> 2048 bytes .../junk-corpus/test_data/single_column.csv | 21 ++ .../junk-corpus/test_data/trailing_commas.csv | 3 + .../test_data/truncated_mid_row.csv | 3 + .../junk-corpus/test_data/tsv_as_csv.csv | 3 + .../test_data/unescaped_quotes.csv | 3 + .../test_data/utf16_be_with_bom.csv | Bin 0 -> 82 bytes .../junk-corpus/test_data/utf16_le_no_bom.csv | Bin 0 -> 80 bytes test-cases/junk-corpus/test_data/utf32_le.csv | Bin 0 -> 160 bytes 
.../junk-corpus/test_data/very_wide_cell.csv | 2 + .../junk-corpus/test_data/weird_extension.foo | 2 + tests/test_fixtures_sweep.py | 6 +- tests/test_junk_corpus.py | 156 ++++++++++++ 39 files changed, 637 insertions(+), 2 deletions(-) create mode 100644 test-cases/junk-corpus/README.md create mode 100644 test-cases/junk-corpus/make_junk_corpus.py create mode 100644 test-cases/junk-corpus/test_data/all_nulls.csv create mode 100644 test-cases/junk-corpus/test_data/all_same_row.csv create mode 100644 test-cases/junk-corpus/test_data/corrupt_xlsx.xlsx create mode 100644 test-cases/junk-corpus/test_data/cp1252_smart_quotes.csv create mode 100644 test-cases/junk-corpus/test_data/double_extension.csv.txt create mode 100644 test-cases/junk-corpus/test_data/duplicate_headers.csv create mode 100644 test-cases/junk-corpus/test_data/embedded_newlines.csv create mode 100644 test-cases/junk-corpus/test_data/empty.csv create mode 100644 test-cases/junk-corpus/test_data/empty_header_names.csv create mode 100644 test-cases/junk-corpus/test_data/excel_empty.xlsx create mode 100644 test-cases/junk-corpus/test_data/excel_header_only.xlsx create mode 100644 test-cases/junk-corpus/test_data/header_only.csv create mode 100644 test-cases/junk-corpus/test_data/invalid_utf8.csv create mode 100644 test-cases/junk-corpus/test_data/just_newlines.csv create mode 100644 test-cases/junk-corpus/test_data/massive_columns.csv create mode 100644 test-cases/junk-corpus/test_data/mismatched_columns.csv create mode 100644 test-cases/junk-corpus/test_data/mixed_delimiters.csv create mode 100644 test-cases/junk-corpus/test_data/mojibake.csv create mode 100644 test-cases/junk-corpus/test_data/no_extension create mode 100644 test-cases/junk-corpus/test_data/one_huge_line.csv create mode 100644 test-cases/junk-corpus/test_data/only_bom.csv create mode 100644 test-cases/junk-corpus/test_data/only_nul.csv create mode 100644 test-cases/junk-corpus/test_data/only_whitespace.csv create mode 100644 
test-cases/junk-corpus/test_data/png_magic_as_csv.csv create mode 100644 test-cases/junk-corpus/test_data/random_bytes.csv create mode 100644 test-cases/junk-corpus/test_data/single_column.csv create mode 100644 test-cases/junk-corpus/test_data/trailing_commas.csv create mode 100644 test-cases/junk-corpus/test_data/truncated_mid_row.csv create mode 100644 test-cases/junk-corpus/test_data/tsv_as_csv.csv create mode 100644 test-cases/junk-corpus/test_data/unescaped_quotes.csv create mode 100644 test-cases/junk-corpus/test_data/utf16_be_with_bom.csv create mode 100644 test-cases/junk-corpus/test_data/utf16_le_no_bom.csv create mode 100644 test-cases/junk-corpus/test_data/utf32_le.csv create mode 100644 test-cases/junk-corpus/test_data/very_wide_cell.csv create mode 100644 test-cases/junk-corpus/test_data/weird_extension.foo create mode 100644 tests/test_junk_corpus.py diff --git a/test-cases/junk-corpus/README.md b/test-cases/junk-corpus/README.md new file mode 100644 index 0000000..d46abb5 --- /dev/null +++ b/test-cases/junk-corpus/README.md @@ -0,0 +1,63 @@ +# Junk Corpus — pathological-input stress tests + +This corpus exists to make the upload analyzer prove it can survive any +file a user (or an adversary) might drop on it. Every file under +`test_data/` is deliberately broken in a different way: empty bytes, +NUL bytes, mojibake, UTF-16 without a BOM, mismatched columns, +unescaped quotes, corrupt `.xlsx`, and so on. + +The contract enforced by `tests/test_junk_corpus.py`: + +1. `_run_analysis_on_upload(file)` MUST NOT raise. Errors are caught + and surfaced as a synthetic `Finding` with severity `"error"`. +2. The return is always a `list[Finding]` (possibly empty for files + the analyzer judges clean). +3. A specific subset of files (`empty.csv`, `only_bom.csv`, + `only_nul.csv`, `corrupt_xlsx.xlsx`) MUST produce at least one + error-level Finding so the GUI shows a red banner instead of + silently rendering "no issues found". 
+ +## Why this matters + +In a multi-file home-page upload, one bad file used to bubble a +Python traceback up through the page chrome and kill every other +file's analysis. The defensive wrap in `_run_analysis_on_upload` plus +this stress test together prevent that regression. + +## Regenerating the corpus + +```bash +python test-cases/junk-corpus/make_junk_corpus.py +``` + +The generator writes 35-ish files into `test_data/`. They are small +(< 100 KB each) and committed to the repo so the stress test runs +without depending on a regenerate step. Two of them (`random_bytes.csv` and `png_magic_as_csv.csv`) are filled from `secrets.token_bytes`, so every regeneration rewrites them with different bytes. + +## Adding a new pathological shape + +1. Add a `write(...)` call to `make_junk_corpus.py`. +2. Re-run that script to materialize the file on disk. +3. (Optional) Add the filename to `_MUST_BE_ERROR` in + `tests/test_junk_corpus.py` if "no findings" would be a silent + failure for that shape. + +## What's already covered + +| Category | Files | +|---|---| +| Empty / near-empty | `empty.csv`, `only_whitespace.csv`, `only_bom.csv`, `only_nul.csv`, `just_newlines.csv`, `header_only.csv` | +| Random / binary garbage | `random_bytes.csv`, `png_magic_as_csv.csv` | +| Truncated or huge | `truncated_mid_row.csv`, `one_huge_line.csv`, `massive_columns.csv`, `single_column.csv` | +| Wrong delimiter | `tsv_as_csv.csv`, `mixed_delimiters.csv` | +| Encoding chaos | `utf16_le_no_bom.csv`, `utf16_be_with_bom.csv`, `utf32_le.csv`, `mojibake.csv`, `invalid_utf8.csv`, `cp1252_smart_quotes.csv` | +| Quoting / shape | `unescaped_quotes.csv`, `embedded_newlines.csv`, `mismatched_columns.csv`, `duplicate_headers.csv`, `empty_header_names.csv`, `trailing_commas.csv` | +| Content | `all_nulls.csv`, `very_wide_cell.csv`, `all_same_row.csv` | +| Extension confusion | `no_extension`, `weird_extension.foo`, `double_extension.csv.txt` | +| Excel pathologies | `corrupt_xlsx.xlsx`, `excel_empty.xlsx`, `excel_header_only.xlsx` | + +## Manually loading a junk file in the GUI + +The files are real on-disk artifacts. 
Drag any of them into the home +page uploader to verify the GUI renders a sensible error (or clean +findings, for files the analyzer is OK with) instead of crashing. diff --git a/test-cases/junk-corpus/make_junk_corpus.py b/test-cases/junk-corpus/make_junk_corpus.py new file mode 100644 index 0000000..fb6817f --- /dev/null +++ b/test-cases/junk-corpus/make_junk_corpus.py @@ -0,0 +1,231 @@ +"""Generate a corpus of pathological files for stress-testing the upload +analyzer. + +Each file in ``test_data/`` is deliberately broken in a different way: +empty bytes, NUL bytes, mojibake, UTF-16 without BOM, mismatched columns, +unescaped quotes, etc. The goal is to make sure ``_run_analysis_on_upload`` +returns a clean error Finding (never a Python traceback) for any of them, +in any combination, on every operating system the GUI ships on. + +Run:: + + python test-cases/junk-corpus/make_junk_corpus.py + +The matching pytest at ``tests/test_junk_corpus.py`` iterates every file +in ``test_data/`` and asserts the analyzer either returns findings or an +error Finding — never raises. +""" + +from __future__ import annotations + +import io +import os +import secrets +import struct +import zipfile +from pathlib import Path + + +_HERE = Path(__file__).resolve().parent +_OUT = _HERE / "test_data" + + +def write(name: str, data: bytes) -> None: + """Write *data* to ``test_data/name`` and report the size.""" + path = _OUT / name + path.write_bytes(data) + print(f" {name:<40} {len(data):>10} bytes") + + +def _valid_xlsx_bytes(*, sheet_xml: str) -> bytes: + """Build a minimal but valid .xlsx (zip with the required parts). + + ``sheet_xml`` is the inner ```` content; the rest of the + workbook scaffolding is filled in around it. Good enough for pandas + to load. 
+ """ + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z: + z.writestr( + "[Content_Types].xml", + '' + '' + '' + '' + '' + '' + "", + ) + z.writestr( + "_rels/.rels", + '' + '' + '' + "", + ) + z.writestr( + "xl/_rels/workbook.xml.rels", + '' + '' + '' + "", + ) + z.writestr( + "xl/workbook.xml", + '' + '' + '' + "", + ) + z.writestr( + "xl/worksheets/sheet1.xml", + '' + '' + f"{sheet_xml}" + "", + ) + return buf.getvalue() + + +def main() -> None: + _OUT.mkdir(parents=True, exist_ok=True) + print(f"Writing junk corpus to {_OUT}") + + # ---- Empty / near-empty ------------------------------------------------- + write("empty.csv", b"") + write("only_whitespace.csv", b" \t\n \n\t \n") + write("only_bom.csv", b"\xef\xbb\xbf") + write("only_nul.csv", b"\x00" * 64) + write("just_newlines.csv", b"\n\n\n\n\n") + write("header_only.csv", b"id,name,note\n") + + # ---- Random / binary garbage ------------------------------------------- + write("random_bytes.csv", secrets.token_bytes(2048)) + # Bytes that look like a PNG signature plus garbage; would mislead any + # naive file-type sniffer. 
+ write("png_magic_as_csv.csv", b"\x89PNG\r\n\x1a\n" + secrets.token_bytes(512)) + + # ---- Truncated / structurally damaged ---------------------------------- + write( + "truncated_mid_row.csv", + b"id,name,note\n1,alice,hello\n2,bob,wor", # row 2 ends mid-cell + ) + write( + "one_huge_line.csv", + b"a," * 5_000, # 10KB single line, no newline anywhere + ) + write( + "massive_columns.csv", + (",".join(f"c{i}" for i in range(500)) + "\n" + + ",".join("x" for _ in range(500)) + "\n").encode(), + ) + write( + "single_column.csv", + b"\n".join([b"id"] + [str(i).encode() for i in range(20)]) + b"\n", + ) + + # ---- Wrong / misleading delimiter -------------------------------------- + write( + "tsv_as_csv.csv", + b"id\tname\tnote\n1\talice\thi\n2\tbob\tworld\n", + ) + write( + "mixed_delimiters.csv", + b"id,name\tnote;extra|tail\n1,alice\thi;x|y\n", + ) + + # ---- Encoding chaos ---------------------------------------------------- + sample_text = "id,name,note\n1,café,hello\n2,naïve,world\n" + write("utf16_le_no_bom.csv", sample_text.encode("utf-16-le")) + write("utf16_be_with_bom.csv", b"\xfe\xff" + sample_text.encode("utf-16-be")) + write("utf32_le.csv", sample_text.encode("utf-32-le")) + # Latin-1 bytes that decode as UTF-8 produce mojibake (é, ï etc.) + write("mojibake.csv", sample_text.encode("latin-1")) + # Bytes that aren't valid UTF-8 (lone continuation bytes) + write("invalid_utf8.csv", b"id,name\n1,\xff\xfe\xfd,hello\n") + # cp1252-encoded smart quotes in column values. cp1252 ascribes + # smart-quote glyphs to bytes 0x91-0x94; the surrounding ASCII + + # accented "é" is just there to keep the value realistic. 
+ write( + "cp1252_smart_quotes.csv", + b"id,quote\n1," + + "café ".encode("cp1252") + + b"\x93smart\x94 \x91quote\x92" + + b"\n", + ) + + # ---- Quoting and field-shape pathologies ------------------------------- + write( + "unescaped_quotes.csv", + b'id,note\n1,"this has " unescaped quote"\n2,"normal"\n', + ) + write( + "embedded_newlines.csv", + b'id,note\n1,"line one\nline two"\n2,"single line"\n', + ) + write( + "mismatched_columns.csv", + b"id,name,note\n1,alice,hi\n2,bob\n3,carol,hi,extra,fields\n", + ) + write( + "duplicate_headers.csv", + b"col,col,col\n1,2,3\n4,5,6\n", + ) + write( + "empty_header_names.csv", + b",,,\n1,2,3,4\n5,6,7,8\n", + ) + write( + "trailing_commas.csv", + b"id,name,note,\n1,alice,hi,\n2,bob,wo,\n", + ) + + # ---- Content pathologies ---------------------------------------------- + write( + "all_nulls.csv", + b"id,name,note\nNULL,NULL,NULL\nN/A,NA,(null)\nNone,nan,?\n", + ) + write( + "very_wide_cell.csv", + b'id,blob\n1,"' + b"x" * 10_000 + b'"\n', + ) + write( + "all_same_row.csv", + b"id,name,note\n" + b"1,alice,hello\n" * 100, + ) + + # ---- Extension confusion ---------------------------------------------- + write("no_extension", b"id,name,note\n1,alice,hi\n") + write( + "weird_extension.foo", + b"id,name,note\n1,alice,hi\n", + ) + write( + "double_extension.csv.txt", + b"id,name,note\n1,alice,hi\n", + ) + + # ---- Excel-specific pathologies ---------------------------------------- + # Not a real zip — pandas/openpyxl should error cleanly. + write("corrupt_xlsx.xlsx", b"PK\x03\x04 not really a zip file") + # Valid xlsx with an entirely empty sheet. + write("excel_empty.xlsx", _valid_xlsx_bytes(sheet_xml="")) + # Valid xlsx with one row of headers and no data. 
+ write( + "excel_header_only.xlsx", + _valid_xlsx_bytes( + sheet_xml=( + '' + 'id' + 'name' + "" + ), + ), + ) + + print(f"\nWrote {len(list(_OUT.iterdir()))} files.") + + +if __name__ == "__main__": + main() diff --git a/test-cases/junk-corpus/test_data/all_nulls.csv b/test-cases/junk-corpus/test_data/all_nulls.csv new file mode 100644 index 0000000..7c2a26b --- /dev/null +++ b/test-cases/junk-corpus/test_data/all_nulls.csv @@ -0,0 +1,4 @@ +id,name,note +NULL,NULL,NULL +N/A,NA,(null) +None,nan,? diff --git a/test-cases/junk-corpus/test_data/all_same_row.csv b/test-cases/junk-corpus/test_data/all_same_row.csv new file mode 100644 index 0000000..20960d9 --- /dev/null +++ b/test-cases/junk-corpus/test_data/all_same_row.csv @@ -0,0 +1,101 @@ +id,name,note +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello 
+1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello +1,alice,hello diff --git a/test-cases/junk-corpus/test_data/corrupt_xlsx.xlsx b/test-cases/junk-corpus/test_data/corrupt_xlsx.xlsx new file mode 100644 index 0000000..93ef80b --- /dev/null +++ b/test-cases/junk-corpus/test_data/corrupt_xlsx.xlsx @@ -0,0 +1 @@ +PK not really a zip file \ No newline at end of file diff --git a/test-cases/junk-corpus/test_data/cp1252_smart_quotes.csv b/test-cases/junk-corpus/test_data/cp1252_smart_quotes.csv new file mode 100644 index 0000000..f56f128 --- /dev/null +++ b/test-cases/junk-corpus/test_data/cp1252_smart_quotes.csv @@ -0,0 +1,2 @@ +id,quote +1,café “smart” ‘quote’ diff --git a/test-cases/junk-corpus/test_data/double_extension.csv.txt b/test-cases/junk-corpus/test_data/double_extension.csv.txt new file mode 100644 index 0000000..fb3798b --- /dev/null +++ b/test-cases/junk-corpus/test_data/double_extension.csv.txt @@ -0,0 +1,2 @@ +id,name,note +1,alice,hi diff --git a/test-cases/junk-corpus/test_data/duplicate_headers.csv b/test-cases/junk-corpus/test_data/duplicate_headers.csv new file mode 100644 index 0000000..df36192 --- /dev/null +++ b/test-cases/junk-corpus/test_data/duplicate_headers.csv @@ -0,0 +1,3 @@ +col,col,col +1,2,3 +4,5,6 diff --git a/test-cases/junk-corpus/test_data/embedded_newlines.csv b/test-cases/junk-corpus/test_data/embedded_newlines.csv new file mode 100644 index 0000000..4a04eda --- /dev/null +++ b/test-cases/junk-corpus/test_data/embedded_newlines.csv @@ -0,0 +1,4 @@ +id,note +1,"line one +line two" +2,"single line" diff --git a/test-cases/junk-corpus/test_data/empty.csv b/test-cases/junk-corpus/test_data/empty.csv new file mode 100644 index 0000000..e69de29 diff --git a/test-cases/junk-corpus/test_data/empty_header_names.csv 
b/test-cases/junk-corpus/test_data/empty_header_names.csv new file mode 100644 index 0000000..b806a1b --- /dev/null +++ b/test-cases/junk-corpus/test_data/empty_header_names.csv @@ -0,0 +1,3 @@ +,,, +1,2,3,4 +5,6,7,8 diff --git a/test-cases/junk-corpus/test_data/excel_empty.xlsx b/test-cases/junk-corpus/test_data/excel_empty.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0b31d85f918707e60bff7d4eae220a9a4a629db3 GIT binary patch literal 1392 zcmWIWW@Zs#U|`^2__KOLO!Ulk>z)F6+)NA%!azFOIX|x?HLoN-q_QBjI99JBH)rj{ zy+MZ!1X|)<7n=#so!r#4ajwz@Hm{{NhN-Yol5&onahGouLS#9Zm-(K zEL)>C|9a&;#oNCct88xS$Sp6bU8u52=jgwrIg!0bB-UP>(ao2BzjpDO^(>*2y4%@> zD?@pN@`Tc!rXMi<9x`vPkNYFZV7mp{dNXwPw3_T|Ggx(h`kf5PU-vRzHQDVH3X~B} zJkA_g7N#3)X;Syqyx!1QC;9Vv*L~mWzcoDFeVac3B~-kd7h6pOh6XDmFl2yqd{Jsn zvA!ON>^*PGcgTQ;^}%WvJBPbQ1`C$OM(k436A1e5p8MzoXF#66TbG2y-#=d>*WEt5 z_LZYh`ZuXni(0m=)Za2|O0&gT#Z`Ac1RkCeWTEJ0J<)jM0|&c&HnH>|xzr;GZ<5>& zIY-I`+VS3)z3R28$Mx_9>()E`zhT3(d8zU3-OBUp=KRP?xp~6F^YFGulT&vZ>btrz z&Xy|YH_JzLey-Wel374!vjLqi1*9u-^dUYd&o9bO%FoXRMJv=d)*#=oA^RpmP`o2( z`^sB3BD1f*?UWFhDltRn|2DhRx&J?`-WB-Fl8x6l)cO9_^r+q`4R_~6Ozk}&HOIqB zVN>Z7-f+V}jmd^-`AfD`9!hzWq;@Da_Pt@f^aj(bnHMJbJ2SufB=sy8(Vk$+)&Mk>Lc-JnPUrpKKTj6{16`@`=z(v zTBzB8$7TJai;Ma1Hr&}ZK0bkC!l~X6Vci$@z6>fWUX5yZcOPB6`;*z1Zt>fnUPa)USU-oiu z5RUrQF8KUrZ$;H7z_U~fcc(!fj5(dY$+D&aM z8COoVx%OnDPdx*wXL1?scVz=T(F(*82+tH}q^6b>>w_snPz?F@I`RR1#j?NY6T_Tu z7o0^8EtTJRK$T;T%eJeZYLxgw^R2C;awYih?dI=Xr^^5CTDAGH54xK&l{T%rxu)4W z?JMVkU?sP=-b?SC+0C)obPDShu7@?f745sf^C!e??tb_xE<=L(&-449Ho(wkWD;S< zT`T}y2?hz)F6+)NA%!azFOIX|x?HLoN-q_QBjI99JBH)rj{ zy+MZ!1X|)<7n=#so!r#4ajwz@Hm{{NhN-Yol5&onahGouLS#9Zm-(K zEL)>C|9a&;#oNCct88xS$Sp6bU8u52=jgwrIg!0bB-UP>(ao2BzjpDO^(>*2y4%@> zD?@pN@`Tc!rXMi<9x`vPkNYFZV7mp{dNXwPw3_T|Ggx(h`kf5PU-vRzHQDVH3X~B} zJkA_g7N#3)X;Syqyx!1QC;9Vv*L~mWzcoDFeVac3B~-kd7h6pOh6XDmFl2yqd{Jsn zvA!ON>^*PGcgTQ;^}%WvJBPbQ1`C$OM(k436A1e5p8MzoXF#66TbG2y-#=d>*WEt5 z_LZYh`ZuXni(0m=)Za2|O0&gT#Z`Ac1RkCeWTEJ0J<)jM0|&c&HnH>|xzr;GZ<5>& 
zIY-I`+VS3)z3R28$Mx_9>()E`zhT3(d8zU3-OBUp=KRP?xp~6F^YFGulT&vZ>btrz z&Xy|YH_JzLey-Wel374!vjLqi1*9u-^dUYd&o9bO%FoXRMJv=d)*#=oA^RpmP`o2( z`^sB3BD1f*?UWFhDltRn|2DhRx&J?`-WB-Fl8x6l)cO9_^r+q`4R_~6Ozk}&HOIqB zVN>Z7-f+V}jmd^-`AfD`9!hzWq;@Da_Pt@f^aj(bnHMJbJ2SufB=sy8(Vk$+)&Mk>Lc-JnPUrpKKTj6{16`@`=z(v zTBzB8$7TJai;Ma1Hr&}ZK0bkC!l~X6Vci$@z6>fWUX5yZcOPB6`;*z1Zt>fnUPa)USU-oiu z5RUrQF8KUrZ$;H7z_U~fcc(!fj5(dY$+D&aM z8COoVx%OnDPdx*wXI8{$GR_5h;tLQ#&wleE><#v%>pF1?% z?w(mI>-vnD;k)v(1EDhwR~PF}X#DGpsTdYmFOmoURMW5jTPH!B-RB`XlR0v+ql0^$JxY;7r} literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/header_only.csv b/test-cases/junk-corpus/test_data/header_only.csv new file mode 100644 index 0000000..402416f --- /dev/null +++ b/test-cases/junk-corpus/test_data/header_only.csv @@ -0,0 +1 @@ +id,name,note diff --git a/test-cases/junk-corpus/test_data/invalid_utf8.csv b/test-cases/junk-corpus/test_data/invalid_utf8.csv new file mode 100644 index 0000000..1827e39 --- /dev/null +++ b/test-cases/junk-corpus/test_data/invalid_utf8.csv @@ -0,0 +1,2 @@ +id,name +1,ÿþý,hello diff --git a/test-cases/junk-corpus/test_data/just_newlines.csv b/test-cases/junk-corpus/test_data/just_newlines.csv new file mode 100644 index 0000000..3f2ff2d --- /dev/null +++ b/test-cases/junk-corpus/test_data/just_newlines.csv @@ -0,0 +1,5 @@ + + + + + diff --git a/test-cases/junk-corpus/test_data/massive_columns.csv b/test-cases/junk-corpus/test_data/massive_columns.csv new file mode 100644 index 0000000..088b2b7 --- /dev/null +++ b/test-cases/junk-corpus/test_data/massive_columns.csv @@ -0,0 +1,2 @@ 
+c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,c21,c22,c23,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43,c44,c45,c46,c47,c48,c49,c50,c51,c52,c53,c54,c55,c56,c57,c58,c59,c60,c61,c62,c63,c64,c65,c66,c67,c68,c69,c70,c71,c72,c73,c74,c75,c76,c77,c78,c79,c80,c81,c82,c83,c84,c85,c86,c87,c88,c89,c90,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100,c101,c102,c103,c104,c105,c106,c107,c108,c109,c110,c111,c112,c113,c114,c115,c116,c117,c118,c119,c120,c121,c122,c123,c124,c125,c126,c127,c128,c129,c130,c131,c132,c133,c134,c135,c136,c137,c138,c139,c140,c141,c142,c143,c144,c145,c146,c147,c148,c149,c150,c151,c152,c153,c154,c155,c156,c157,c158,c159,c160,c161,c162,c163,c164,c165,c166,c167,c168,c169,c170,c171,c172,c173,c174,c175,c176,c177,c178,c179,c180,c181,c182,c183,c184,c185,c186,c187,c188,c189,c190,c191,c192,c193,c194,c195,c196,c197,c198,c199,c200,c201,c202,c203,c204,c205,c206,c207,c208,c209,c210,c211,c212,c213,c214,c215,c216,c217,c218,c219,c220,c221,c222,c223,c224,c225,c226,c227,c228,c229,c230,c231,c232,c233,c234,c235,c236,c237,c238,c239,c240,c241,c242,c243,c244,c245,c246,c247,c248,c249,c250,c251,c252,c253,c254,c255,c256,c257,c258,c259,c260,c261,c262,c263,c264,c265,c266,c267,c268,c269,c270,c271,c272,c273,c274,c275,c276,c277,c278,c279,c280,c281,c282,c283,c284,c285,c286,c287,c288,c289,c290,c291,c292,c293,c294,c295,c296,c297,c298,c299,c300,c301,c302,c303,c304,c305,c306,c307,c308,c309,c310,c311,c312,c313,c314,c315,c316,c317,c318,c319,c320,c321,c322,c323,c324,c325,c326,c327,c328,c329,c330,c331,c332,c333,c334,c335,c336,c337,c338,c339,c340,c341,c342,c343,c344,c345,c346,c347,c348,c349,c350,c351,c352,c353,c354,c355,c356,c357,c358,c359,c360,c361,c362,c363,c364,c365,c366,c367,c368,c369,c370,c371,c372,c373,c374,c375,c376,c377,c378,c379,c380,c381,c382,c383,c384,c385,c386,c387,c388,c389,c390,c391,c392,c393,c394,c395,c396,c397,c398,c399,c400,c401,c402,c403,c404,c405,c406,c407,c408,c409,c410,c411,c412,c413,c414,c415,c416,c417,c418,c419,c420,c421
,c422,c423,c424,c425,c426,c427,c428,c429,c430,c431,c432,c433,c434,c435,c436,c437,c438,c439,c440,c441,c442,c443,c444,c445,c446,c447,c448,c449,c450,c451,c452,c453,c454,c455,c456,c457,c458,c459,c460,c461,c462,c463,c464,c465,c466,c467,c468,c469,c470,c471,c472,c473,c474,c475,c476,c477,c478,c479,c480,c481,c482,c483,c484,c485,c486,c487,c488,c489,c490,c491,c492,c493,c494,c495,c496,c497,c498,c499 +x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x diff --git a/test-cases/junk-corpus/test_data/mismatched_columns.csv b/test-cases/junk-corpus/test_data/mismatched_columns.csv new file mode 100644 index 0000000..5140c62 --- /dev/null +++ b/test-cases/junk-corpus/test_data/mismatched_columns.csv @@ -0,0 +1,4 @@ +id,name,note +1,alice,hi +2,bob +3,carol,hi,extra,fields diff --git a/test-cases/junk-corpus/test_data/mixed_delimiters.csv b/test-cases/junk-corpus/test_data/mixed_delimiters.csv new file mode 100644 index 0000000..dcb18e4 --- /dev/null +++ b/test-cases/junk-corpus/test_data/mixed_delimiters.csv @@ -0,0 +1,2 @@ +id,name note;extra|tail 
+1,alice hi;x|y diff --git a/test-cases/junk-corpus/test_data/mojibake.csv b/test-cases/junk-corpus/test_data/mojibake.csv new file mode 100644 index 0000000..b6487d8 --- /dev/null +++ b/test-cases/junk-corpus/test_data/mojibake.csv @@ -0,0 +1,3 @@ +id,name,note +1,café,hello +2,naïve,world diff --git a/test-cases/junk-corpus/test_data/no_extension b/test-cases/junk-corpus/test_data/no_extension new file mode 100644 index 0000000..fb3798b --- /dev/null +++ b/test-cases/junk-corpus/test_data/no_extension @@ -0,0 +1,2 @@ +id,name,note +1,alice,hi diff --git a/test-cases/junk-corpus/test_data/one_huge_line.csv b/test-cases/junk-corpus/test_data/one_huge_line.csv new file mode 100644 index 0000000..bc0d242 --- /dev/null +++ b/test-cases/junk-corpus/test_data/one_huge_line.csv @@ -0,0 +1 @@ +a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,
a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,
a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,
a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,
a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,
a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a, \ No newline at end of file diff --git a/test-cases/junk-corpus/test_data/only_bom.csv b/test-cases/junk-corpus/test_data/only_bom.csv new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/test-cases/junk-corpus/test_data/only_bom.csv @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/test-cases/junk-corpus/test_data/only_nul.csv b/test-cases/junk-corpus/test_data/only_nul.csv new file mode 100644 index 0000000000000000000000000000000000000000..9017fd98b5f67d928cc64c59b2c025472ce74f8c GIT binary patch literal 64 LcmZQzpbP*206+i% literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/only_whitespace.csv b/test-cases/junk-corpus/test_data/only_whitespace.csv new file mode 100644 index 0000000..fa3940d --- /dev/null +++ b/test-cases/junk-corpus/test_data/only_whitespace.csv @@ -0,0 +1,3 @@ + + + diff --git a/test-cases/junk-corpus/test_data/png_magic_as_csv.csv b/test-cases/junk-corpus/test_data/png_magic_as_csv.csv new file mode 100644 index 0000000000000000000000000000000000000000..27954ede0c34a1fd1e89617dc81a413c0227aeed GIT binary patch literal 520 zcmV+j0{8uiP)v6{b%91#5g6UJ#Pe+ZemOd=j&}5FEuJUPCKBlrKkC{v63B 
z-(I+2iz0eJw?#CPP|_h@O%mo9@s4^}b*aXM42Wj=%zT!QO-n(MTLoA0_GCKKd(wJt zo_Kc|bf&tX;;u`f*@&sr+W3;-)|i|2LWJ)M2)Buz?CGNs4jIYCC_wvHI2 ze$S=z%SGf2wV>$O*R!h!AOCgi2A(#2;25PoRvCoh`7HoGJ|}PeUbkCEJ=Wd66}jY| zaZ**qUPhUfZLexWi$r}zk{P{MpuQ6X~*)95JmK?boVZpm}VvXQSd^y_8`9$=lbVLmF zc-51e1<$PUSv{p}rEEmf^{K&K$ULLCV*d&ynM5NtPPwvm&S()ta>?nKtRQ_SQW9 zE%LPI9o<7npR}S!V!;?!lOlG7WgJ+L{Od~N3Po{zNFw&TRvuB6zcE{kbL7G8)EcOR KYyGYGq^8Ww+z3tp literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/random_bytes.csv b/test-cases/junk-corpus/test_data/random_bytes.csv new file mode 100644 index 0000000000000000000000000000000000000000..110995a5bcd2c8d453803e15d41048c3ebf6735a GIT binary patch literal 2048 zcmV+b2>->rSWL~BGx0~k(rHcj0 zcnYYfwPyo!r?YNSmX)LyYR5Vjb3xb|h!Aln%3Gp0|5||jZt`z%57qNU^)7P1n>eV> zIoPhhPdT9P9fmupWoNjm>{!$3HMLl~qWs|fWlKeJ|>OV*MTz6D1x%SR8K zT%I4ZdA3}AkrVKrZwbSwXwLW#?>$4PuX==D36(I8bIJk58QeClilG^5KXOo{d3x`+Gi0Y;``qJOuIc4s;t2L zbq9oqx!6<)CJ5n1sjw0z0+ud*8Ar=fgM%dF@(bazi!f&(Bt|6ad39&lTt(fK{>q4W zF(*ZEF}6(1XM@)Rz^eFX0I(Vx@~?=e4B+eu2Utv9c$gnU1F7(Shhbst)ay(J@9K8> zB_lNlg@OYS^K zIGuJpDjh7D1R@-*Tk@l7DJ7<2c$L_nkp>m>lJ+!XFg_8@%^9& z{GRSxQS*^j;N)Ot7~(yG!`)PeUM;wk<>O$9JF(4H!V&jA7ybS5x=UNm>@ULZ)EBuc zdK?w;^LdLGy$7*|&?D)NH63wCd_WTIVJfX}Bro?B!X^o_5OOU6L-$FUiBDXCh_S9( zf^=9U6D#L8pZ-${LP{EoHyeP5jizbmbPa+%;J9DVlP~*@)~I<^T7J)zA2SUDoR*(G zE~ocPEJOH065G2tbAY7gZx884;B@bwLykro@?qyi#>(!+cfD%79Evy;cjzefl`&*LDQ?S6_wg~`hUx; z{xM$dGbs|moZaPI!UO`7b|^4MiC_GTnRUUXJ7QVgJf#W@Odb7;#dIER1F#wRqV{uF z=&f=E%M8cm0THp-X1%Tfh=3L?XwyeYpM%Feq@y<_N;nSDpreK#6`Mpeu=*2pxJU_4 zcaja!KSwBXhrvf*Da|1 zGg9ygy&;)V>EC9_uP}u6xedS4){>FE7 z4`%h8@-VP3xTFHW<00v4mS8zVm3++1RwL&`Lr)m32nZT0`I!OTC} zJIK(XqBDb~Nm=;%^d$GO?U}EMZdwGp;&@;7b_BZefcY22u*`S)mY1+OxFlpG=+VeD zbq;kskk*h%B+>2FG6o`ZSi*wFaBE7#wv0)Wj-!X+{Q6CuP9mL3Q42;1R9{BYKMh!b zoGm_#yN{Z&Y6=|b#)+XxRWwT+(j%pHB{3p1inbr#748c=2tP^-41;3o;>&O(%{f*% zGLdm$p`Vap>OU(4rlQ;pIuMa$)+)qKh5kb<$qQ5X8xSlDKQ(>In9I;JS_rW{rrZ6z zK~_Ml3u|UDqcW9X3QEbJO=@I%fsco`_KpWeT-OzxsL}Vwz}4Tk!V)s6o^IE$)w#Ko 
zyHak*Kj5ay@jVN*{KJKkt6(2+O;ux{7_%_wFnEaq+jRGU8>ia3jOsLgapK0k;M;pw z^|_^z?>tC%Qla)gtJW2DQ+Pmhu;WO@wl#7J_Y3cd4McR>9aNGW^s~x>k(z-cuHT9< zq%=ehf=*mhA-Lz5CGp~$1e5R}s`tJRvma;ny5u(Ityigq1C@7dCtkr14oDG1VzyTq z%S2ETFfmfY@qqUl_j&(G4L$suj>s?G{p20bN46hb4*=BZQ&hd}@Wt4;ZM78QFYP!+ zG9+aAH}wLx9#KQhTvMqY5uB6JLN{=cKGZV7io=+5JdhYZ?%Aq%Vt|YDY4DwZvKwRI zWC<2%gJok;!a_^3$|glI$$W}XjTHL5s4ktX*<5SHN;9`Mvz9B@IvfrLV%$K|sx_R` zgE|yb^|?dpb}T-VO5G^FW1yx#*}Kv4+TM6pn~SQ=AA0I#g%Bj~f*5~*LscH~ z1rmbi2*>KJ?q1}=`+Jhdg)ASGW z(z3f~K=CV^l;=vae)(_n_W`HI4dt5i;PU9Z{d!zT&ea?l9Yf7QQx!1CY3H=fvL}ku e$(Eqc2|G4COnXT*AqYi$>`Q>%qpMAK-pxqlx$;{8 literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/single_column.csv b/test-cases/junk-corpus/test_data/single_column.csv new file mode 100644 index 0000000..4e6ad35 --- /dev/null +++ b/test-cases/junk-corpus/test_data/single_column.csv @@ -0,0 +1,21 @@ +id +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 diff --git a/test-cases/junk-corpus/test_data/trailing_commas.csv b/test-cases/junk-corpus/test_data/trailing_commas.csv new file mode 100644 index 0000000..10dd119 --- /dev/null +++ b/test-cases/junk-corpus/test_data/trailing_commas.csv @@ -0,0 +1,3 @@ +id,name,note, +1,alice,hi, +2,bob,wo, diff --git a/test-cases/junk-corpus/test_data/truncated_mid_row.csv b/test-cases/junk-corpus/test_data/truncated_mid_row.csv new file mode 100644 index 0000000..cd2f439 --- /dev/null +++ b/test-cases/junk-corpus/test_data/truncated_mid_row.csv @@ -0,0 +1,3 @@ +id,name,note +1,alice,hello +2,bob,wor \ No newline at end of file diff --git a/test-cases/junk-corpus/test_data/tsv_as_csv.csv b/test-cases/junk-corpus/test_data/tsv_as_csv.csv new file mode 100644 index 0000000..682f7ca --- /dev/null +++ b/test-cases/junk-corpus/test_data/tsv_as_csv.csv @@ -0,0 +1,3 @@ +id name note +1 alice hi +2 bob world diff --git a/test-cases/junk-corpus/test_data/unescaped_quotes.csv b/test-cases/junk-corpus/test_data/unescaped_quotes.csv new file mode 100644 index 
0000000..20fef35 --- /dev/null +++ b/test-cases/junk-corpus/test_data/unescaped_quotes.csv @@ -0,0 +1,3 @@ +id,note +1,"this has " unescaped quote" +2,"normal" diff --git a/test-cases/junk-corpus/test_data/utf16_be_with_bom.csv b/test-cases/junk-corpus/test_data/utf16_be_with_bom.csv new file mode 100644 index 0000000000000000000000000000000000000000..153023fb5f7fe483089441ad1897b6572dabee50 GIT binary patch literal 82 zcmezOpCOYWg+YfQk0FsEmmw9*%4aA6vbY!wfxKj(cpAe?AS(kXmIH+OKp7*bj`s{@ PU_IqP(ITK+3Ii7a;t~*( literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/utf16_le_no_bom.csv b/test-cases/junk-corpus/test_data/utf16_le_no_bom.csv new file mode 100644 index 0000000000000000000000000000000000000000..72bb74ae04dad9fc6bde63eb3bb9f3a70738e949 GIT binary patch literal 80 zcmc~~NMX=n$YV%k$Yn?cv+@~AfGjQsLm)31D4xdf63EH`isb-dK2XL8s^dLF8CXv_ MP_zgrmjaXn08W|?mH+?% literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/utf32_le.csv b/test-cases/junk-corpus/test_data/utf32_le.csv new file mode 100644 index 0000000000000000000000000000000000000000..7a9398eadb3742295cd50543211e92289c08e428 GIT binary patch literal 160 zcmXwzOA3H63&rrSFNx6q3SIz`+pPU9C#5+H;b{~+dKgsKzG!B*k literal 0 HcmV?d00001 diff --git a/test-cases/junk-corpus/test_data/very_wide_cell.csv b/test-cases/junk-corpus/test_data/very_wide_cell.csv new file mode 100644 index 0000000..b12611a --- /dev/null +++ b/test-cases/junk-corpus/test_data/very_wide_cell.csv @@ -0,0 +1,2 @@ +id,blob 
+1,"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxx" diff --git a/test-cases/junk-corpus/test_data/weird_extension.foo b/test-cases/junk-corpus/test_data/weird_extension.foo new file mode 100644 index 0000000..fb3798b --- /dev/null +++ b/test-cases/junk-corpus/test_data/weird_extension.foo @@ -0,0 +1,2 @@ +id,name,note +1,alice,hi diff --git a/tests/test_fixtures_sweep.py b/tests/test_fixtures_sweep.py index fb41a66..7a4cac2 100644 --- a/tests/test_fixtures_sweep.py +++ b/tests/test_fixtures_sweep.py @@ -35,8 +35,10 @@ TEST_CASES_DIR = Path(__file__).resolve().parent.parent / "test-cases" # Subdirectories in test-cases/ that are exercised by their own dedicated # tests. The sweep ignores these so we don't double-test or fight expected -# byte-exact outputs. -_EXCLUDED_SUBDIRS = {"text-cleaner-corpus"} +# byte-exact outputs. ``junk-corpus`` is intentionally pathological — +# files there are designed to break the cleaner/analyzer; the contract is +# enforced by ``tests/test_junk_corpus.py``, not this happy-path sweep. +_EXCLUDED_SUBDIRS = {"text-cleaner-corpus", "junk-corpus"} # File suffixes we know how to load. _SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xlsx", ".xls"} diff --git a/tests/test_junk_corpus.py b/tests/test_junk_corpus.py new file mode 100644 index 0000000..4f051d8 --- /dev/null +++ b/tests/test_junk_corpus.py @@ -0,0 +1,156 @@ +"""Stress-test the upload analyzer against a corpus of pathological files. + +Every file under ``test-cases/junk-corpus/test_data/`` is fed through +``_run_analysis_on_upload`` — the same path the GUI takes when a user +drops a file on the home page. The contract under test is: + +* The call never raises. Errors must surface as a synthetic ``Finding`` + with severity ``"error"``, not a Python traceback that the page + chrome bubbles up to the user. +* The return is always a list of :class:`Finding` (possibly empty for + files the analyzer judges clean). +* Specific high-risk files (empty bytes, corrupt zip, etc.) 
MUST + produce at least one error-level Finding so the UI shows a red + banner rather than silently rendering "no issues found". + +To add a new pathological shape: + +1. Edit ``test-cases/junk-corpus/make_junk_corpus.py`` to write the new + file under ``test_data/``. +2. Re-run that script to materialize the file on disk. +3. (Optional) Add the filename to ``_MUST_BE_ERROR`` below if the file + represents a state where "no findings" would be a silent failure. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from src.core.analyze import Finding +from src.gui.components._legacy import _run_analysis_on_upload + + +_CORPUS = Path(__file__).resolve().parent.parent / "test-cases" / "junk-corpus" / "test_data" + + +class _FakeUpload: + """Duck-type the Streamlit ``UploadedFile`` interface from a path.""" + + def __init__(self, path: Path) -> None: + self.name = path.name + self._bytes = path.read_bytes() + + def getvalue(self) -> bytes: + return self._bytes + + +def _corpus_files() -> list[Path]: + files = sorted(p for p in _CORPUS.iterdir() if p.is_file()) + if not files: + raise RuntimeError( + f"Junk corpus is empty. Run " + f"`python test-cases/junk-corpus/make_junk_corpus.py` " + f"to generate {_CORPUS}." + ) + return files + + +# Files where "zero findings" would be a silent failure — these are +# structurally broken enough that the analyzer MUST flag them. The +# error-level Finding is what shows the user a red banner instead of +# the misleading "no issues found" success path. +_MUST_BE_ERROR = { + "empty.csv", + "only_bom.csv", + "only_nul.csv", + "corrupt_xlsx.xlsx", +} + + +@pytest.mark.parametrize( + "path", + _corpus_files(), + ids=lambda p: p.name, +) +class TestJunkCorpus: + """Every pathological file must round-trip through the analyzer + without raising. The error message format is checked separately + via :func:`TestJunkCorpus.test_error_findings_have_a_description`. 
+ """ + + def test_no_exception_propagates(self, path: Path) -> None: + upload = _FakeUpload(path) + # The point of the test: any exception from analyze() / pandas / + # repair_bytes / openpyxl SHOULD have been caught and turned + # into an error Finding by ``_run_analysis_on_upload``. If this + # raises, the home page would crash on this file in production. + findings = _run_analysis_on_upload(upload) + assert isinstance(findings, list), ( + f"{path.name}: expected list[Finding], got {type(findings).__name__}" + ) + + def test_findings_are_well_formed(self, path: Path) -> None: + upload = _FakeUpload(path) + findings = _run_analysis_on_upload(upload) + for f in findings: + assert isinstance(f, Finding), ( + f"{path.name}: non-Finding in result list: {f!r}" + ) + assert isinstance(f.id, str) and f.id, ( + f"{path.name}: Finding has empty id" + ) + assert f.severity in ("info", "warn", "error"), ( + f"{path.name}: Finding has bad severity {f.severity!r}" + ) + assert isinstance(f.description, str) and f.description, ( + f"{path.name}: Finding has empty description" + ) + + def test_must_be_error_files_actually_flag(self, path: Path) -> None: + if path.name not in _MUST_BE_ERROR: + pytest.skip(f"{path.name} is allowed to pass clean") + upload = _FakeUpload(path) + findings = _run_analysis_on_upload(upload) + errors = [f for f in findings if f.severity == "error"] + assert errors, ( + f"{path.name} should surface at least one error-level " + f"Finding so the UI shows a red banner; got {len(findings)} " + f"findings (none of severity 'error')." + ) + + def test_error_findings_have_a_description(self, path: Path) -> None: + """Error findings must carry a description the user can act on. + + For an empty / corrupt file the description is the ONLY thing + the user sees — it has to name the file or include enough + context that they can fix the underlying problem. 
+ """ + upload = _FakeUpload(path) + findings = _run_analysis_on_upload(upload) + for f in findings: + if f.severity != "error": + continue + # The synthetic error Findings always interpolate the file + # name; analyzer-generated errors include the column or a + # description that mentions what was wrong. + assert len(f.description) >= 20, ( + f"{path.name}: error Finding description is too short " + f"to be useful: {f.description!r}" + ) + + +def test_corpus_contains_expected_shapes() -> None: + """Sanity-check that the corpus generator wrote the files we rely + on for the must-be-error matrix. If somebody renames a file in + ``make_junk_corpus.py`` without updating ``_MUST_BE_ERROR``, this + test catches it before the per-file parametrization silently + skips the must-be-error assertion.""" + names = {p.name for p in _corpus_files()} + missing = _MUST_BE_ERROR - names + assert not missing, ( + f"_MUST_BE_ERROR references files that don't exist in the " + f"corpus: {sorted(missing)}. Regenerate the corpus or update " + f"_MUST_BE_ERROR." + )