Files
datatools-dev/test-cases/text-cleaner-corpus/generate_test_data.py
Michael c349a90e18 test: add text-cleaner corpus and close gaps surfaced by it
The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
  with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
  Smith") while still preserving embedded acronyms; preserve uppercase after
  apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (C parser truncates at NUL,
  python engine is too strict about embedded literal "), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03
  expected; quote the rogue-comma price field in case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:35 +00:00

546 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Generator for the 02_text_cleaner test corpus.
Writes raw bytes where exact control over encoding/line-endings/invisible
characters matters. Do not edit the output files in a text editor that
"helpfully" normalizes anything; it will silently break the tests.
Run from the corpus root:
python generate_test_data.py
"""
from pathlib import Path
# Corpus layout: raw inputs go in "test_data" and the matching expected
# outputs in "expected", both directly beside this script.
ROOT = Path(__file__).parent
TD, EX = (ROOT.joinpath(folder) for folder in ("test_data", "expected"))
# Create each folder when missing; an already-existing folder is fine.
TD.mkdir(exist_ok=True)
EX.mkdir(exist_ok=True)
def write_bytes(path, data):
    """Write the bytes-like *data* to *path* exactly as given.

    The file is created or overwritten in binary mode, so no encoding or
    newline translation is applied.
    """
    target = Path(path)
    target.write_bytes(data)
def write_text(path, text, encoding="utf-8", newline="\n"):
    """Encode *text* and write it to *path* as raw bytes.

    Writing bytes (rather than opening the file in text mode) keeps the
    platform from translating line endings. When *newline* is anything other
    than "\\n", every "\\n" in *text* is swapped for *newline* before
    encoding; other characters, including any "\\r" already present, are
    left untouched.
    """
    if newline == "\n":
        payload = text
    else:
        # Splitting on LF and re-joining is the same as replacing each LF.
        payload = newline.join(text.split("\n"))
    Path(path).write_bytes(payload.encode(encoding))
# ---------------------------------------------------------------------------
# 01 Whitespace - basic (ASCII space + tab)
# ---------------------------------------------------------------------------
# Input fixture: cells padded with ASCII spaces and tabs. The per-row comments
# name the kind of padding each row is meant to carry.
# NOTE(review): rows 4, 6 and 7 are described as containing internal
# multi-space runs - confirm the literals still contain them, since runs of
# spaces are easily collapsed when this file is copied around.
write_text(TD / "01_whitespace_basic.csv", (
"id,name,city\n"
"1, Alice ,New York\n" # leading + trailing spaces
"2,Bob, Chicago\n" # leading spaces
"3,Carol ,San Francisco \n" # trailing spaces
"4,Dan Smith,Austin\n" # internal multi-space
"5,\tEve\t,\tBoston\t\n" # tab padding
"6,Frank van der Berg,Denver\n" # multiple internal multi-space runs
"7, Grace Hopper , Palo Alto \n" # everything at once
))
# Expected output: the same rows with the space/tab padding removed.
write_text(EX / "01_whitespace_basic.csv", (
"id,name,city\n"
"1,Alice,New York\n"
"2,Bob,Chicago\n"
"3,Carol,San Francisco\n"
"4,Dan Smith,Austin\n"
"5,Eve,Boston\n"
"6,Frank van der Berg,Denver\n"
"7,Grace Hopper,Palo Alto\n"
))
# ---------------------------------------------------------------------------
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
# ---------------------------------------------------------------------------
# These are the whitespace-pretenders that .strip() in Python 3 actually
# DOES handle, but that .strip() in many naive implementations (or pandas
# defaults) does NOT. Test that they're stripped, not preserved.
NBSP = "\u00A0" # non-breaking space (very common from Word/Excel paste)
NNBSP = "\u202F" # narrow no-break space
IDEO = "\u3000" # ideographic space (CJK)
EM_SPACE = "\u2003" # em space
THIN_SPACE = "\u2009" # thin space
# Input fixture: rows 1-3 and 6 use the characters as cell padding; rows 4-5
# use them between two words inside the cell.
write_text(TD / "02_whitespace_unicode.csv", (
"id,label,note\n"
f"1,{NBSP}Premium{NBSP},NBSP padding\n"
f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
f"3,{IDEO}Standard{IDEO},ideographic space\n"
f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n"
))
# Expected output: padding removed; the internal em-space pair and the thin
# space each become one ASCII space ("Tier One", "Cost Plus").
write_text(EX / "02_whitespace_unicode.csv", (
"id,label,note\n"
"1,Premium,NBSP padding\n"
"2,Discount,narrow NBSP\n"
"3,Standard,ideographic space\n"
"4,Tier One,em-space internal\n"
"5,Cost Plus,thin-space internal\n"
"6,mixed,ascii + NBSP combined\n"
))
# ---------------------------------------------------------------------------
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
# ---------------------------------------------------------------------------
# This is the #1 source of pollution from data that ever passed through
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
write_text(TD / "03_smart_punctuation.csv", (
"id,quote,measurement\n"
"1,\u201cHello world\u201d,5\u2032 11\u2033\n" # curly double quotes, prime/double-prime
"2,it\u2019s working,\u2014\n" # curly apostrophe, em-dash alone
"3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n" # en-dash range, curly singles
"4,wait\u2026,3 \u00d7 4\n" # ellipsis char, multiplication sign
"5,\u00abquoted\u00bb,5 \u00b1 0.1\n" # guillemets, plus-minus
))
# Default policy: ASCII-fy where round-trip-safe.
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
# (row 4 column 3, row 5 column 3). Document this in TEST-CASES.md.
#
# Expected output. Curly double quotes, guillemets and the double prime all
# become a straight '"'; because that character is the CSV quote character,
# those cells are written wrapped in quotes with each literal '"' doubled
# (rows 1 and 5). Em-dash and en-dash become "-", the ellipsis becomes "...",
# curly singles and the prime become "'".
write_text(EX / "03_smart_punctuation.csv", (
"id,quote,measurement\n"
"1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
"2,it's working,-\n"
"3,2020-2024,from 'a' to 'z'\n"
"4,wait...,3 \u00d7 4\n"
"5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
))
# ---------------------------------------------------------------------------
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
# ---------------------------------------------------------------------------
# "café" can be either:
# NFC: "caf\u00e9" (e-acute as single code point)
# NFD: "cafe\u0301" (e + combining acute accent, two code points)
# These look identical but compare unequal. Normalize to NFC.
write_text(TD / "04_unicode_forms.csv", (
"id,name,description\n"
"1,caf\u00e9,NFC form (single code point)\n"
"2,cafe\u0301,NFD form (e + combining accent)\n"
"3,na\u00efve,NFC i-diaeresis\n"
"4,nai\u0308ve,NFD i + combining diaeresis\n"
"5,o\uFB03ce,fi-ligature (\uFB03)\n" # 'office' written with 'ffi' ligature (U+FB03); the cell text labels it "fi-ligature"
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n" # fullwidth Latin capitals U+FF21..U+FF23
"7,\u2168 century,roman numeral nine (single code point)\n" # Ⅸ
))
# Policy: NFC by default (most compatible, smallest, what Excel emits).
# NFKC option would also fold ligatures and fullwidth digits/letters,
# but is destructive for some legitimate text. Default = NFC.
# So:
# - Rows 1 vs 2 should produce identical output after normalization
# - Rows 3 vs 4 should produce identical output
# - Row 5 ligature stays as ligature under NFC (would fold under NFKC)
# - Row 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
# - Row 7 is written unchanged to the expected file as well
write_text(EX / "04_unicode_forms.csv", (
"id,name,description\n"
"1,caf\u00e9,NFC form (single code point)\n"
"2,caf\u00e9,NFD form (e + combining accent)\n" # same name bytes as row 1 now
"3,na\u00efve,NFC i-diaeresis\n"
"4,na\u00efve,NFD i + combining diaeresis\n" # same name bytes as row 3 now
"5,o\uFB03ce,fi-ligature (\uFB03)\n"
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
"7,\u2168 century,roman numeral nine (single code point)\n"
))
# ---------------------------------------------------------------------------
# 05 Zero-width / invisible characters
# ---------------------------------------------------------------------------
ZWSP = "\u200B" # zero-width space
ZWNJ = "\u200C" # zero-width non-joiner
ZWJ = "\u200D" # zero-width joiner
LRM = "\u200E" # left-to-right mark
RLM = "\u200F" # right-to-left mark
SOFT_HYPHEN = "\u00AD" # soft hyphen
WORD_JOINER = "\u2060" # word joiner
# Input fixture: each row embeds one or more of the characters above at the
# position named in its note column.
write_text(TD / "05_zero_width_invisible.csv", (
"id,value,note\n"
f"1,Hel{ZWSP}lo,zero-width space inside word\n"
f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
f"3,Trail{ZWSP},trailing ZWSP\n"
f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
f"7,no{WORD_JOINER}break,word joiner\n"
))
# Expected output: every invisible character is deleted outright (no space
# is left in its place), so the surrounding letters join up.
write_text(EX / "05_zero_width_invisible.csv", (
"id,value,note\n"
"1,Hello,zero-width space inside word\n"
"2,Leading,leading + internal ZWSP\n"
"3,Trail,trailing ZWSP\n"
"4,abc,ZWNJ and ZWJ\n"
"5,Marked,LTR + RTL marks bracketing\n"
"6,cooperate,soft hyphen\n"
"7,nobreak,word joiner\n"
))
# ---------------------------------------------------------------------------
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
# ---------------------------------------------------------------------------
# These bytes show up in real exports from broken systems, terminals, or
# binary data accidentally exported as text.
# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
# (SOH is listed for reference only; no row below contains it.)
write_text(TD / "06_control_characters.csv", (
"id,value,note\n"
"1,Hello\x00World,NUL byte inside\n"
"2,Bell\x07Sound,BEL character\n"
"3,Back\x08space,backspace\n"
"4,Vert\x0BTab,vertical tab\n"
"5,Form\x0CFeed,form feed\n"
"6,Esc\x1Bape,ESC character\n"
"7,Del\x7Fete,DEL character\n"
"8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
))
# Expected output: each control character is removed with nothing inserted
# in its place.
write_text(EX / "06_control_characters.csv", (
"id,value,note\n"
"1,HelloWorld,NUL byte inside\n"
"2,BellSound,BEL character\n"
"3,Backspace,backspace\n"
"4,VertTab,vertical tab\n"
"5,FormFeed,form feed\n"
"6,Escape,ESC character\n"
"7,Delete,DEL character\n"
"8,Mixedjunk,multiple controls in one cell\n"
))
# ---------------------------------------------------------------------------
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
# ---------------------------------------------------------------------------
# Excel saves UTF-8 with a leading BOM. A reader that does not strip it ends
# up with a first column named "\ufeffid" instead of "id", which silently
# breaks lookups by column name.
bom = b"\xef\xbb\xbf"
content = "".join([
    "id,name,city\n",
    "1,Alice,New York\n",
    "2,Bob,Chicago\n",
]).encode("utf-8")
# Input: the three BOM bytes followed by the plain CSV payload.
write_bytes(TD / "07_bom_utf8.csv", b"".join((bom, content)))
# Expected: the payload alone - BOM dropped on read and not written back, so
# the header is a clean "id" rather than "\ufeffid".
write_bytes(EX / "07_bom_utf8.csv", content)
# ---------------------------------------------------------------------------
# 08 Line endings - all CRLF (Windows)
# ---------------------------------------------------------------------------
# Default policy: normalize to LF on output.
# Input: every record terminated with CRLF (via the newline argument).
write_text(
    TD / "08_line_endings_crlf.csv",
    "".join(["id,name\n", "1,Alice\n", "2,Bob\n", "3,Carol\n"]),
    newline="\r\n",
)
# Expected: the same records terminated with LF.
write_text(
    EX / "08_line_endings_crlf.csv",
    "".join(["id,name\n", "1,Alice\n", "2,Bob\n", "3,Carol\n"]),
)
# ---------------------------------------------------------------------------
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
# ---------------------------------------------------------------------------
# Input: every record terminated with a bare CR (via the newline argument).
write_text(
    TD / "09_line_endings_cr.csv",
    "".join(["id,name\n", "1,Alice\n", "2,Bob\n", "3,Carol\n"]),
    newline="\r",
)
# Expected: the same records terminated with LF.
write_text(
    EX / "09_line_endings_cr.csv",
    "".join(["id,name\n", "1,Alice\n", "2,Bob\n", "3,Carol\n"]),
)
# ---------------------------------------------------------------------------
# 10 Line endings - mixed within the same file
# ---------------------------------------------------------------------------
# Worst case seen in the wild: a file touched on several operating systems,
# or stitched together from sources with different conventions. Built as raw
# bytes so each record keeps exactly the terminator shown.
mixed = b"".join([
    b"id,name\r\n",   # CRLF
    b"1,Alice\n",     # LF
    b"2,Bob\r",       # bare CR
    b"3,Carol\r\n",   # CRLF
    b"4,Dan\n",       # LF
])
write_bytes(TD / "10_line_endings_mixed.csv", mixed)
# Expected: the same five records, all terminated with LF.
write_text(
    EX / "10_line_endings_mixed.csv",
    "".join(["id,name\n", "1,Alice\n", "2,Bob\n", "3,Carol\n", "4,Dan\n"]),
)
# ---------------------------------------------------------------------------
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
# ---------------------------------------------------------------------------
# This is the trap: line-ending normalization at the FILE level must not
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
# notes column or an address column).
# But the embedded line endings should also be normalized to LF for
# consistency.
# Input: records are terminated with LF (write_text's default newline); the
# CRLF, LF and bare-CR sequences all sit inside quoted cells.
write_text(TD / "11_embedded_newlines.csv", (
"id,address,notes\n"
"1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
"2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
"3,\"normal\",\"no newlines here\"\n"
))
# Expected: embedded CRLF/CR normalized to LF and the cells stay multi-line.
# Cells that still contain a newline or comma remain quoted; cells that no
# longer need quoting ("Single line", "normal", "no newlines here") are
# written bare.
write_text(EX / "11_embedded_newlines.csv", (
"id,address,notes\n"
"1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
"2,Single line,\"contains\nclassic mac\ninternal\"\n"
"3,normal,no newlines here\n"
))
# ---------------------------------------------------------------------------
# 12 Case operations (opt-in, default = preserve)
# ---------------------------------------------------------------------------
# This file tests case operations IF the user requests them.
# Default behavior: PRESERVE. So the "__default" expected file == input.
# Two further expected files show opt-in modes: "__email_lower" (email column
# lower-cased) and "__name_title" (name column title-cased).
write_text(TD / "12_case_variations.csv", (
"id,name,email,product\n"
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
"2,bob jones,BOB@example.com,GADGET\n"
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))
# Default expected: identical to input (case ops are opt-in).
write_text(EX / "12_case_variations__default.csv", (
"id,name,email,product\n"
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
"2,bob jones,BOB@example.com,GADGET\n"
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))
# With --case-email=lower applied to email column only (name and product
# columns are left exactly as in the input):
write_text(EX / "12_case_variations__email_lower.csv", (
"id,name,email,product\n"
"1,ALICE SMITH,alice@example.com,Widget\n"
"2,bob jones,bob@example.com,GADGET\n"
"3,Carol Brown,carol@example.com,wIdGeT\n"
"4,DAN O'CONNOR,dan@example.com,gizmo\n"
))
# With --case=title applied to name column (email and product columns are
# left exactly as in the input):
write_text(EX / "12_case_variations__name_title.csv", (
"id,name,email,product\n"
"1,Alice Smith,Alice@Example.COM,Widget\n"
"2,Bob Jones,BOB@example.com,GADGET\n"
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
"4,Dan O'Connor,Dan@Example.com,gizmo\n" # title-case must not break O'C
))
# ---------------------------------------------------------------------------
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
# ---------------------------------------------------------------------------
# This is a negative test: the cleaner must not damage characters that
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
write_text(TD / "13_non_latin_scripts.csv", (
"id,name,note\n"
"1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
"6,caf\u00e9 \u2615,emoji + accent combo\n"
))
# Expected output: identical to the input except that row 1 loses its
# leading/trailing ASCII space.
write_text(EX / "13_non_latin_scripts.csv", (
"id,name,note\n"
"1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
"6,caf\u00e9 \u2615,emoji + accent combo\n"
))
# ---------------------------------------------------------------------------
# 14 Mojibake (UTF-8 bytes misread as Windows-1252, then re-saved as UTF-8)
# ---------------------------------------------------------------------------
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
write_text(TD / "14_mojibake.csv", (
"id,name,city\n"
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n" # café, München mojibaked
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n" # naïve, résumé
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n" # don't via cp1252-mojibake
"4,Alice,New York\n" # clean control row
))
# Expected output WITHOUT mojibake fix (default): text written back exactly
# as in the input; the reader is expected to emit a warning to logs.
write_text(EX / "14_mojibake__default.csv", (
"id,name,city\n"
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
"4,Alice,New York\n"
))
# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
write_text(EX / "14_mojibake__fixed.csv", (
"id,name,city\n"
"1,caf\u00e9,M\u00fcnchen\n"
"2,na\u00efve,r\u00e9sum\u00e9\n"
"3,don't,smart-apostrophe mojibake\n" # repaired apostrophe ends up as a straight ASCII "'"
"4,Alice,New York\n"
))
# ---------------------------------------------------------------------------
# 15 Whitespace-only cells (boundary case with script 04)
# ---------------------------------------------------------------------------
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
# 04 then detects empty strings as disguised null. So 02's job here is
# just to convert " " into "".
write_text(TD / "15_whitespace_only_cells.csv", (
"id,value\n"
"1,real\n"
"2, \n" # spaces only
"3,\t\t\n" # tabs only
"4,\u00A0\u00A0\n" # NBSP only
"5, \t \u00A0 \n" # mixed whitespace
"6,\n" # already empty
"7,actual value\n"
))
# Expected output: rows 2-6 all end up with an empty value cell; rows 1 and 7
# are unchanged.
write_text(EX / "15_whitespace_only_cells.csv", (
"id,value\n"
"1,real\n"
"2,\n" # all whitespace -> empty
"3,\n"
"4,\n"
"5,\n"
"6,\n"
"7,actual value\n"
))
# ---------------------------------------------------------------------------
# 16 Dirty headers
# ---------------------------------------------------------------------------
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
# These break downstream lookups (df["email"] fails because the column
# is actually called " Email " with NBSP padding).
# Header cells below, in order: ASCII-space padding, NBSP padding, curly
# double quotes, and a trailing zero-width space (U+200B).
write_text(TD / "16_dirty_headers.csv", (
" id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
"1,Alice,alice@example.com,555-1234\n"
"2,Bob,bob@example.com,555-5678\n"
))
# Expected: headers cleaned by SAME rules as data cells.
# Note: smart quotes around "Email" become straight quotes. The header
# "\"Email\"" with embedded quotes needs CSV-quoting in the output, hence
# the wrapped-and-doubled quotes in the literal below.
write_text(EX / "16_dirty_headers.csv", (
"id,Customer Name,\"\"\"Email\"\"\",Phone\n"
"1,Alice,alice@example.com,555-1234\n"
"2,Bob,bob@example.com,555-5678\n"
))
# ---------------------------------------------------------------------------
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
# ---------------------------------------------------------------------------
# Numbers that LOOK like they have whitespace are tricky: " 123 " is
# a number with padding (trim) but "1 234" might be a thousands-separator
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
# collapse internal whitespace in cells that parse as numeric. This is a
# judgment call; document it.
#
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
# Do not detect or replace null-like values. That's 04.
#
# The price cell in row 2 contains a comma, so it is CSV-quoted in the input;
# its padding spaces sit inside the quotes.
write_text(TD / "17_preserve_intended.csv", (
"id,price,european_number,date,phone,quantity\n"
"1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
"2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
"3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
))
# Expected: outer whitespace trimmed everywhere, but:
# - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
# - "$1,500.00" keeps its value (currency, that's 03's domain); it stays
#   CSV-quoted because of the comma
# - "15/01/2024" stays unchanged (date, that's 03's domain)
# - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
# - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
# - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
write_text(EX / "17_preserve_intended.csv", (
"id,price,european_number,date,phone,quantity\n"
"1,100,1 234,2024-01-15,(555) 123-4567,42\n"
"2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
"3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
))
# ---------------------------------------------------------------------------
# 18 Empty file (zero bytes)
# ---------------------------------------------------------------------------
# Input: a file with no content at all.
write_bytes(TD.joinpath("18_empty_file.csv"), b"")
# Expected: handled gracefully - the output is empty too (a warning may be
# emitted instead of failing).
write_bytes(EX.joinpath("18_empty_file.csv"), b"")
# ---------------------------------------------------------------------------
# 19 Headers only (no data rows)
# ---------------------------------------------------------------------------
# Input: a single header record whose cells carry ASCII-space padding, a
# trailing NBSP, and a trailing zero-width space (U+200B) respectively.
write_text(TD / "19_headers_only.csv", (
" id ,Name\u00a0,Email\u200b\n"
))
# Expected: headers cleaned, no data rows in output.
write_text(EX / "19_headers_only.csv", (
"id,Name,Email\n"
))
# ---------------------------------------------------------------------------
# 20 Real-world kitchen sink (everything combined)
# ---------------------------------------------------------------------------
# Simulates a typical messy export: came from Excel via cp1252 paste,
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
# has NBSP from copy/paste, has trailing whitespace.
content = (
    " id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
    "1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
    "3, Carol Brown ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
    "4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
)
# Replace LF with CRLF wherever it isn't already, to be unambiguous: the
# header literal above ends in a bare LF, and without this step the fixture
# would mix LF and CRLF instead of being the all-CRLF export it simulates.
# Collapsing CRLF to LF first keeps existing CRLFs from turning into CR CR LF.
content = content.replace("\r\n", "\n").replace("\n", "\r\n")
# UTF-8 BOM (EF BB BF) followed by the UTF-8 encoded text.
data_bytes = b"\xef\xbb\xbf" + content.encode("utf-8")
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
# stripped, internal multi-space collapsed, CRLF normalized to LF,
# whitespace-only cells become empty, multiplication sign preserved,
# em-dash and ellipsis converted, prime/double-prime converted.
# Cells that end up containing a straight '"' are CSV-quoted with the quote
# doubled (the Email header and the Notes cells of rows 1 and 2).
write_text(EX / "20_kitchen_sink.csv", (
    "id,Name,\"\"\"Email\"\"\",Notes\n"
    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
))
# Final summary: confirm completion and show where both folders live.
print(
    "All CSV test files written.",
    f" inputs: {TD}",
    f" expected: {EX}",
    sep="\n",
)