"""
Generator for the 02_text_cleaner test corpus.

Writes raw bytes where exact control over encoding/line-endings/invisible
characters matters. Do not edit the output files in a text editor that
"helpfully" normalizes anything; it will silently break the tests.

Run from the corpus root:
    python generate_test_data.py

Inputs are written to ``test_data/`` and expected outputs to ``expected/``,
both resolved relative to this file (not the current working directory).
"""
from pathlib import Path

ROOT = Path(__file__).parent
TD = ROOT / "test_data"
EX = ROOT / "expected"
TD.mkdir(exist_ok=True)
EX.mkdir(exist_ok=True)


def write_bytes(path: "str | Path", data: bytes) -> None:
    """Write ``data`` to ``path`` verbatim, replacing any existing file."""
    Path(path).write_bytes(data)


def write_text(
    path: "str | Path",
    text: str,
    encoding: str = "utf-8",
    newline: str = "\n",
) -> None:
    """Encode ``text`` and write it to ``path`` as raw bytes.

    Args:
        path: Destination file.
        text: Content, written with ``\\n`` as the line terminator.
        encoding: Codec used to encode ``text``.
        newline: Terminator that every ``\\n`` in ``text`` is replaced with.
            With the default ``"\\n"`` the text is written untouched, so any
            ``\\r`` / ``\\r\\n`` already embedded in it is preserved as-is.
    """
    # Explicit byte write so we control line endings exactly (text-mode
    # open() would apply platform newline translation).
    if newline != "\n":
        text = text.replace("\n", newline)
    Path(path).write_bytes(text.encode(encoding))


# ---------------------------------------------------------------------------
# 01 Whitespace - basic (ASCII space + tab)
# ---------------------------------------------------------------------------
# Runs of more than one space are spelled with \x20 escapes so that editors,
# formatters, or copy/paste that collapse whitespace cannot silently remove
# the very thing these rows are meant to test.
write_text(TD / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1, Alice ,New York\n"                   # leading + trailing spaces
    "2,Bob, Chicago\n"                       # leading spaces
    "3,Carol ,San Francisco \n"              # trailing spaces
    "4,Dan\x20\x20\x20Smith,Austin\n"        # internal multi-space
    "5,\tEve\t,\tBoston\t\n"                 # tab padding
    # multiple internal multi-space runs
    "6,Frank\x20\x20van\x20\x20\x20der\x20\x20Berg,Denver\n"
    # everything at once: leading, trailing and internal runs
    "7, Grace\x20\x20Hopper , Palo\x20\x20Alto \n"
))
write_text(EX / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
    "3,Carol,San Francisco\n"
    "4,Dan Smith,Austin\n"
    "5,Eve,Boston\n"
    "6,Frank van der Berg,Denver\n"
    "7,Grace Hopper,Palo Alto\n"
))

# ---------------------------------------------------------------------------
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
# ---------------------------------------------------------------------------
# These are the whitespace-pretenders that .strip() in Python 3 actually
# DOES handle, but that .strip() in many naive implementations (or pandas
# defaults) does NOT. Test that they're stripped, not preserved.
NBSP = "\u00A0"        # non-breaking space (very common from Word/Excel paste)
NNBSP = "\u202F"       # narrow no-break space
IDEO = "\u3000"        # ideographic space (CJK)
EM_SPACE = "\u2003"    # em space
THIN_SPACE = "\u2009"  # thin space

write_text(TD / "02_whitespace_unicode.csv", (
    "id,label,note\n"
    f"1,{NBSP}Premium{NBSP},NBSP padding\n"
    f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
    f"3,{IDEO}Standard{IDEO},ideographic space\n"
    f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
    f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
    f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n"
))
write_text(EX / "02_whitespace_unicode.csv", (
    "id,label,note\n"
    "1,Premium,NBSP padding\n"
    "2,Discount,narrow NBSP\n"
    "3,Standard,ideographic space\n"
    "4,Tier One,em-space internal\n"
    "5,Cost Plus,thin-space internal\n"
    "6,mixed,ascii + NBSP combined\n"
))

# ---------------------------------------------------------------------------
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
# ---------------------------------------------------------------------------
# This is the #1 source of pollution from data that ever passed through
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
write_text(TD / "03_smart_punctuation.csv", (
    "id,quote,measurement\n"
    # curly double quotes, prime/double-prime
    "1,\u201cHello world\u201d,5\u2032 11\u2033\n"
    # curly apostrophe, em-dash alone
    "2,it\u2019s working,\u2014\n"
    # en-dash range, curly singles
    "3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n"
    # ellipsis char, multiplication sign
    "4,wait\u2026,3 \u00d7 4\n"
    # guillemets, plus-minus
    "5,\u00abquoted\u00bb,5 \u00b1 0.1\n"
))
# Default policy: ASCII-fy where round-trip-safe.
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
# (case 4 col3, case 5 col3). Document this in TEST-CASES.md.
# Cells that end up containing a straight double quote must be CSV-quoted
# with the inner quotes doubled, hence the runs of \" below.
write_text(EX / "03_smart_punctuation.csv", (
    "id,quote,measurement\n"
    "1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
    "2,it's working,-\n"
    "3,2020-2024,from 'a' to 'z'\n"
    "4,wait...,3 \u00d7 4\n"
    "5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
))

# ---------------------------------------------------------------------------
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
# ---------------------------------------------------------------------------
# "café" can be either:
#   NFC: "caf\u00e9"   (e-acute as single code point)
#   NFD: "cafe\u0301"  (e + combining acute accent, two code points)
# These look identical but compare unequal. Normalize to NFC.
write_text(TD / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,cafe\u0301,NFD form (e + combining accent)\n"
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,nai\u0308ve,NFD i + combining diaeresis\n"
    # 'office' written with the 'ffi' ligature.
    # NOTE(review): U+FB03 is LATIN SMALL LIGATURE FFI; the "fi-ligature"
    # label in the data is kept as-is because the expected file mirrors it.
    "5,o\uFB03ce,fi-ligature (\uFB03)\n"
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"  # fullwidth A B C
    # U+2168 ROMAN NUMERAL NINE
    "7,\u2168 century,roman numeral nine (single code point)\n"
))
# Policy: NFC by default (most compatible, smallest, what Excel emits).
# NFKC option would also fold ligatures and fullwidth digits/letters,
# but is destructive for some legitimate text. Default = NFC.
# So:
#   - Cases 1 vs 2 should produce identical output after normalization
#   - Cases 3 vs 4 should produce identical output
#   - Case 5 ligature stays as ligature under NFC (would fold under NFKC)
#   - Case 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
write_text(EX / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,caf\u00e9,NFD form (e + combining accent)\n"  # same bytes as row 1 now
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,na\u00efve,NFD i + combining diaeresis\n"     # same as row 3 now
    "5,o\uFB03ce,fi-ligature (\uFB03)\n"
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
    "7,\u2168 century,roman numeral nine (single code point)\n"
))

# ---------------------------------------------------------------------------
# 05 Zero-width / invisible characters
# ---------------------------------------------------------------------------
ZWSP = "\u200B"  # zero-width space
ZWNJ = "\u200C"  # zero-width non-joiner
ZWJ = "\u200D"   # zero-width joiner
LRM = "\u200E"   # left-to-right mark
RLM = "\u200F"   # right-to-left mark
SOFT_HYPHEN = "\u00AD"
WORD_JOINER = "\u2060"

write_text(TD / "05_zero_width_invisible.csv", (
    "id,value,note\n"
    f"1,Hel{ZWSP}lo,zero-width space inside word\n"
    f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
    f"3,Trail{ZWSP},trailing ZWSP\n"
    f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
    f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
    f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
    f"7,no{WORD_JOINER}break,word joiner\n"
))
write_text(EX / "05_zero_width_invisible.csv", (
    "id,value,note\n"
    "1,Hello,zero-width space inside word\n"
    "2,Leading,leading + internal ZWSP\n"
    "3,Trail,trailing ZWSP\n"
    "4,abc,ZWNJ and ZWJ\n"
    "5,Marked,LTR + RTL marks bracketing\n"
    "6,cooperate,soft hyphen\n"
    "7,nobreak,word joiner\n"
))

# ---------------------------------------------------------------------------
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
# ---------------------------------------------------------------------------
# These bytes show up in real exports from broken systems, terminals, or
# binary data accidentally exported as text.
# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
write_text(TD / "06_control_characters.csv", (
    "id,value,note\n"
    "1,Hello\x00World,NUL byte inside\n"
    "2,Bell\x07Sound,BEL character\n"
    "3,Back\x08space,backspace\n"
    "4,Vert\x0BTab,vertical tab\n"
    "5,Form\x0CFeed,form feed\n"
    "6,Esc\x1Bape,ESC character\n"
    "7,Del\x7Fete,DEL character\n"
    "8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
))
write_text(EX / "06_control_characters.csv", (
    "id,value,note\n"
    "1,HelloWorld,NUL byte inside\n"
    "2,BellSound,BEL character\n"
    "3,Backspace,backspace\n"
    "4,VertTab,vertical tab\n"
    "5,FormFeed,form feed\n"
    "6,Escape,ESC character\n"
    "7,Delete,DEL character\n"
    "8,Mixedjunk,multiple controls in one cell\n"
))

# ---------------------------------------------------------------------------
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
# ---------------------------------------------------------------------------
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
# leaves the BOM as part of the first column's header name if you're not
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
bom = b"\xef\xbb\xbf"
content = (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
).encode("utf-8")
write_bytes(TD / "07_bom_utf8.csv", bom + content)
# Expected: BOM stripped on read, output written WITHOUT BOM, header is
# clean "id" not "\ufeffid".
write_bytes(EX / "07_bom_utf8.csv", content)

# ---------------------------------------------------------------------------
# 08 Line endings - all CRLF (Windows)
# ---------------------------------------------------------------------------
# Default policy: normalize to LF on output.
write_text(TD / "08_line_endings_crlf.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
), newline="\r\n")
write_text(EX / "08_line_endings_crlf.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
))

# ---------------------------------------------------------------------------
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
# ---------------------------------------------------------------------------
write_text(TD / "09_line_endings_cr.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
), newline="\r")
write_text(EX / "09_line_endings_cr.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
))

# ---------------------------------------------------------------------------
# 10 Line endings - mixed within the same file
# ---------------------------------------------------------------------------
# Real-world disaster mode: file edited on multiple OSes, or concatenated
# from sources with different conventions.
mixed = (
    b"id,name\r\n"
    b"1,Alice\n"
    b"2,Bob\r"
    b"3,Carol\r\n"
    b"4,Dan\n"
)
write_bytes(TD / "10_line_endings_mixed.csv", mixed)
write_text(EX / "10_line_endings_mixed.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
    "4,Dan\n"
))

# ---------------------------------------------------------------------------
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
# ---------------------------------------------------------------------------
# This is the trap: line-ending normalization at the FILE level must not
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
# notes column or an address column).
# But the embedded line endings should also be normalized to LF for
# consistency.
# Written with the default newline="\n", so the \r\n and \r inside the
# quoted cells reach the file exactly as spelled here.
write_text(TD / "11_embedded_newlines.csv", (
    "id,address,notes\n"
    "1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
    "2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
    "3,\"normal\",\"no newlines here\"\n"
))
# Expected: file-level CRLF normalized to LF; embedded CRLF/CR also
# normalized to LF; cells stay multi-line.
write_text(EX / "11_embedded_newlines.csv", (
    "id,address,notes\n"
    "1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
    "2,Single line,\"contains\nclassic mac\ninternal\"\n"
    "3,normal,no newlines here\n"
))

# ---------------------------------------------------------------------------
# 12 Case operations (opt-in, default = preserve)
# ---------------------------------------------------------------------------
# This file tests case operations IF the user requests them.
# Default behavior: PRESERVE. So the "__default" expected file == input.
# The "__email_lower" and "__name_title" expected files show what the
# opt-in lower-case and title-case modes produce.
write_text(TD / "12_case_variations.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
    "2,bob jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))
# Default expected: identical to input (case ops are opt-in).
write_text(EX / "12_case_variations__default.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
    "2,bob jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))
# With --case-email=lower applied to email column only:
write_text(EX / "12_case_variations__email_lower.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,alice@example.com,Widget\n"
    "2,bob jones,bob@example.com,GADGET\n"
    "3,Carol Brown,carol@example.com,wIdGeT\n"
    "4,DAN O'CONNOR,dan@example.com,gizmo\n"
))
# With --case=title applied to name column:
write_text(EX / "12_case_variations__name_title.csv", (
    "id,name,email,product\n"
    "1,Alice Smith,Alice@Example.COM,Widget\n"
    "2,Bob Jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,Dan O'Connor,Dan@Example.com,gizmo\n"  # title-case must not break O'C
))

# ---------------------------------------------------------------------------
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
# ---------------------------------------------------------------------------
# This is a negative test: the cleaner must not damage characters that
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
write_text(TD / "13_non_latin_scripts.csv", (
    "id,name,note\n"
    "1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
    "6,caf\u00e9 \u2615,emoji + accent combo\n"
))
write_text(EX / "13_non_latin_scripts.csv", (
    "id,name,note\n"
    "1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
    "6,caf\u00e9 \u2615,emoji + accent combo\n"
))

# ---------------------------------------------------------------------------
# 14 Mojibake (double-encoded UTF-8 / cp1252 misread as Latin-1)
# ---------------------------------------------------------------------------
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
write_text(TD / "14_mojibake.csv", (
    "id,name,city\n"
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"              # café, München mojibaked
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"   # naïve, résumé
    # don't (curly apostrophe) via cp1252-mojibake
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
    "4,Alice,New York\n"                                  # clean control row
))
# Expected output WITHOUT mojibake fix (default): bytes preserved, but
# reader emits a warning to logs.
write_text(EX / "14_mojibake__default.csv", (
    "id,name,city\n"
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
    "4,Alice,New York\n"
))
# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
write_text(EX / "14_mojibake__fixed.csv", (
    "id,name,city\n"
    "1,caf\u00e9,M\u00fcnchen\n"
    "2,na\u00efve,r\u00e9sum\u00e9\n"
    "3,don't,smart-apostrophe mojibake\n"  # smart apostrophe also fixed
    "4,Alice,New York\n"
))

# ---------------------------------------------------------------------------
# 15 Whitespace-only cells (boundary case with script 04)
# ---------------------------------------------------------------------------
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
# 04 then detects empty strings as disguised null. So 02's job here is
# just to convert " " into "".
write_text(TD / "15_whitespace_only_cells.csv", (
    "id,value\n"
    "1,real\n"
    "2, \n"                # spaces only
    "3,\t\t\n"             # tabs only
    "4,\u00A0\u00A0\n"     # NBSP only
    "5, \t \u00A0 \n"      # mixed whitespace
    "6,\n"                 # already empty
    "7,actual value\n"
))
write_text(EX / "15_whitespace_only_cells.csv", (
    "id,value\n"
    "1,real\n"
    "2,\n"  # all whitespace -> empty
    "3,\n"
    "4,\n"
    "5,\n"
    "6,\n"
    "7,actual value\n"
))

# ---------------------------------------------------------------------------
# 16 Dirty headers
# ---------------------------------------------------------------------------
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
# These break downstream lookups (df["email"] fails because the column
# is actually called " Email " with NBSP padding).
write_text(TD / "16_dirty_headers.csv", (
    " id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
))
# Expected: headers cleaned by SAME rules as data cells.
# Note: smart quotes around "Email" become straight quotes. The header
# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
write_text(EX / "16_dirty_headers.csv", (
    "id,Customer Name,\"\"\"Email\"\"\",Phone\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
))

# ---------------------------------------------------------------------------
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
# ---------------------------------------------------------------------------
# Numbers that LOOK like they have whitespace are tricky: " 123 " is
# a number with padding (trim) but "1 234" might be a thousands-separator
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
# collapse internal whitespace in cells that parse as numeric. This is a
# judgment call; document it.
#
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
# Do not detect or replace null-like values. That's 04.
write_text(TD / "17_preserve_intended.csv", (
    "id,price,european_number,date,phone,quantity\n"
    "1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
    "2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
    "3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
))
# Expected: outer whitespace trimmed everywhere, but:
#   - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
#   - "$1,500.00" stays unchanged (currency, that's 03's domain)
#   - "15/01/2024" stays unchanged (date, that's 03's domain)
#   - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
#   - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
#   - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
write_text(EX / "17_preserve_intended.csv", (
    "id,price,european_number,date,phone,quantity\n"
    "1,100,1 234,2024-01-15,(555) 123-4567,42\n"
    "2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
    "3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
))

# ---------------------------------------------------------------------------
# 18 Empty file (zero bytes)
# ---------------------------------------------------------------------------
write_bytes(TD / "18_empty_file.csv", b"")
# Expected: graceful handling, output is also empty (or warning emitted).
write_bytes(EX / "18_empty_file.csv", b"")

# ---------------------------------------------------------------------------
# 19 Headers only (no data rows)
# ---------------------------------------------------------------------------
write_text(TD / "19_headers_only.csv", (
    " id ,Name\u00a0,Email\u200b\n"
))
# Expected: headers cleaned, no data rows in output.
write_text(EX / "19_headers_only.csv", (
    "id,Name,Email\n"
))

# ---------------------------------------------------------------------------
# 20 Real-world kitchen sink (everything combined)
# ---------------------------------------------------------------------------
# Simulates a typical messy export: came from Excel via cp1252 paste,
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
# has NBSP from copy/paste, has trailing whitespace.
content = (
    " id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
    "1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
    # \x20 run: the internal multi-space that the cleaner must collapse
    "3, Carol\x20\x20\x20Brown ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
    "4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
)
# Replace LF with CRLF wherever it isn't already, to be unambiguous: collapse
# existing CRLF to LF first so no terminator is doubled into "\r\r\n".
# Without this the header row would end in a bare LF and the fixture would
# be a mixed-line-ending file (that scenario is case 10's job, not this one).
content = content.replace("\r\n", "\n").replace("\n", "\r\n")
data_bytes = b"\xef\xbb\xbf" + content.encode("utf-8")
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
# stripped, internal multi-space collapsed, CRLF normalized to LF,
# whitespace-only cells become empty, multiplication sign preserved,
# em-dash and ellipsis converted, prime/double-prime converted.
write_text(EX / "20_kitchen_sink.csv", (
    "id,Name,\"\"\"Email\"\"\",Notes\n"
    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
))

print("All CSV test files written.")
print(f" inputs: {TD}")
print(f" expected: {EX}")