# datatools-dev/test-cases/text-cleaner-corpus/generate_test_data.py

"""
Generator for the 02_text_cleaner test corpus.

Writes raw bytes where exact control over encoding/line-endings/invisible
characters matters. Do not edit the output files in a text editor that
"helpfully" normalizes anything; it will silently break the tests.

Run from the corpus root:
    python generate_test_data.py
"""
from pathlib import Path

ROOT = Path(__file__).parent
TD = ROOT / "test_data"
EX = ROOT / "expected"
TD.mkdir(exist_ok=True)
EX.mkdir(exist_ok=True)


def write_bytes(path, data):
    """Write *data* to *path* verbatim, replacing any existing content.

    Used for fixtures whose exact byte sequence matters (BOM, mixed line
    endings, empty file).
    """
    target = Path(path)
    target.write_bytes(data)


def write_text(path, text, encoding="utf-8", newline="\n"):
    """Encode *text* and write it to *path* as raw bytes.

    The file is written in binary mode so no platform newline translation
    happens; the bytes on disk are exactly what this function produces.

    Args:
        path: Destination file (str or Path).
        text: Content to write, with "\n" marking line breaks.
        encoding: Codec used to encode *text*.
        newline: Line terminator to emit. With the default "\n" the text is
            written untouched, so embedded "\r" / "\r\n" sequences survive.
            With any other value every line break is converted to it.
    """
    if newline != "\n":
        # Fold pre-existing CRLF to LF first; otherwise a CRLF target would
        # turn "\r\n" into "\r\r\n".
        text = text.replace("\r\n", "\n").replace("\n", newline)
    Path(path).write_bytes(text.encode(encoding))


# ---------------------------------------------------------------------------
# 01 Whitespace - basic (ASCII space + tab)
# ---------------------------------------------------------------------------
# Input: plain spaces and tabs used as padding around values and as
# multi-character runs inside them.
write_text(TD / "01_whitespace_basic.csv", "".join([
    "id,name,city\n",
    "1,  Alice  ,New York\n",                  # leading + trailing spaces
    "2,Bob,   Chicago\n",                      # leading spaces
    "3,Carol   ,San Francisco   \n",           # trailing spaces
    "4,Dan    Smith,Austin\n",                 # internal multi-space
    "5,\tEve\t,\tBoston\t\n",                  # tab padding
    "6,Frank  van  der  Berg,Denver\n",        # several internal runs
    "7,  Grace   Hopper  ,  Palo  Alto  \n",   # everything at once
]))

# Expected: outer padding gone, each internal run reduced to one space.
write_text(EX / "01_whitespace_basic.csv", "".join([
    "id,name,city\n",
    "1,Alice,New York\n",
    "2,Bob,Chicago\n",
    "3,Carol,San Francisco\n",
    "4,Dan Smith,Austin\n",
    "5,Eve,Boston\n",
    "6,Frank van der Berg,Denver\n",
    "7,Grace Hopper,Palo Alto\n",
]))

# ---------------------------------------------------------------------------
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
# ---------------------------------------------------------------------------
# Whitespace look-alikes. Python 3's .strip() actually DOES handle these,
# but .strip() in many naive implementations (or pandas defaults) does NOT.
# The corpus requires them to be stripped, not preserved.
NBSP = "\u00A0"          # non-breaking space (very common from Word/Excel paste)
NNBSP = "\u202F"         # narrow no-break space
IDEO = "\u3000"          # ideographic space (CJK)
EM_SPACE = "\u2003"      # em space
THIN_SPACE = "\u2009"    # thin space
write_text(TD / "02_whitespace_unicode.csv", "".join([
    "id,label,note\n",
    f"1,{NBSP}Premium{NBSP},NBSP padding\n",
    f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n",
    f"3,{IDEO}Standard{IDEO},ideographic space\n",
    f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n",
    f"5,Cost{THIN_SPACE}Plus,thin-space internal\n",
    f"6,  {NBSP} mixed {NBSP}  ,ascii + NBSP combined\n",
]))

# Expected: padding removed; internal exotic spaces become one ASCII space.
write_text(EX / "02_whitespace_unicode.csv", "".join([
    "id,label,note\n",
    "1,Premium,NBSP padding\n",
    "2,Discount,narrow NBSP\n",
    "3,Standard,ideographic space\n",
    "4,Tier One,em-space internal\n",
    "5,Cost Plus,thin-space internal\n",
    "6,mixed,ascii + NBSP combined\n",
]))

# ---------------------------------------------------------------------------
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
# ---------------------------------------------------------------------------
# This is the #1 source of pollution from data that ever passed through
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
write_text(TD / "03_smart_punctuation.csv", "".join([
    "id,quote,measurement\n",
    # curly double quotes, prime / double-prime
    "1,\u201cHello world\u201d,5\u2032 11\u2033\n",
    # curly apostrophe, em-dash alone
    "2,it\u2019s working,\u2014\n",
    # en-dash range, curly singles
    "3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n",
    # ellipsis char, multiplication sign
    "4,wait\u2026,3 \u00d7 4\n",
    # guillemets, plus-minus
    "5,\u00abquoted\u00bb,5 \u00b1 0.1\n",
]))

# Default policy: ASCII-fy where round-trip-safe.
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
# meaningful and not safely round-trippable to ASCII, so they are PRESERVED
# (case 4 col3, case 5 col3). Document this in TEST-CASES.md.
# Cells that end up containing a straight double quote are CSV-quoted with
# the quote doubled.
write_text(EX / "03_smart_punctuation.csv", "".join([
    "id,quote,measurement\n",
    "1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n",
    "2,it's working,-\n",
    "3,2020-2024,from 'a' to 'z'\n",
    "4,wait...,3 \u00d7 4\n",
    "5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n",
]))

# ---------------------------------------------------------------------------
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
# ---------------------------------------------------------------------------
# "café" can be either:
#   NFC: "caf\u00e9"           (e-acute as single code point)
#   NFD: "cafe\u0301"          (e + combining acute accent, two code points)
# These look identical but compare unequal. Normalize to NFC.
#
# Row 5 uses U+FB03, which is the "ffi" ligature (U+FB01 would be "fi"), so
# the note column is labelled "ffi-ligature" in both input and expected.
write_text(TD / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,cafe\u0301,NFD form (e + combining accent)\n"
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,nai\u0308ve,NFD i + combining diaeresis\n"
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"            # 'office' written with 'ffi' ligature
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"          # Ａ Ｂ Ｃ
    "7,\u2168 century,roman numeral nine (single code point)\n"  # Ⅸ
))

# Policy: NFC by default (most compatible, smallest, what Excel emits).
# NFKC option would also fold ligatures and fullwidth digits/letters,
# but is destructive for some legitimate text. Default = NFC.
# So:
#  - Cases 1 vs 2 should produce identical output after normalization
#  - Cases 3 vs 4 should produce identical output
#  - Case 5 ligature stays as ligature under NFC (would fold under NFKC)
#  - Case 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
#  - Case 7 roman numeral is written unchanged in the expected file
write_text(EX / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,caf\u00e9,NFD form (e + combining accent)\n"  # same name bytes as row 1 now
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,na\u00efve,NFD i + combining diaeresis\n"      # same name bytes as row 3 now
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
    "7,\u2168 century,roman numeral nine (single code point)\n"
))

# ---------------------------------------------------------------------------
# 05 Zero-width / invisible characters
# ---------------------------------------------------------------------------
# Each constant is one invisible code point that the input embeds in an
# otherwise ordinary word.
ZWSP = "\u200B"     # zero-width space
ZWNJ = "\u200C"     # zero-width non-joiner
ZWJ = "\u200D"      # zero-width joiner
LRM = "\u200E"      # left-to-right mark
RLM = "\u200F"      # right-to-left mark
SOFT_HYPHEN = "\u00AD"
WORD_JOINER = "\u2060"
write_text(TD / "05_zero_width_invisible.csv", "".join([
    "id,value,note\n",
    f"1,Hel{ZWSP}lo,zero-width space inside word\n",
    f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n",
    f"3,Trail{ZWSP},trailing ZWSP\n",
    f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n",
    f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n",
    f"6,co{SOFT_HYPHEN}operate,soft hyphen\n",
    f"7,no{WORD_JOINER}break,word joiner\n",
]))

# Expected: every invisible character removed, nothing inserted in its place.
write_text(EX / "05_zero_width_invisible.csv", "".join([
    "id,value,note\n",
    "1,Hello,zero-width space inside word\n",
    "2,Leading,leading + internal ZWSP\n",
    "3,Trail,trailing ZWSP\n",
    "4,abc,ZWNJ and ZWJ\n",
    "5,Marked,LTR + RTL marks bracketing\n",
    "6,cooperate,soft hyphen\n",
    "7,nobreak,word joiner\n",
]))

# ---------------------------------------------------------------------------
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
# ---------------------------------------------------------------------------
# These bytes show up in real exports from broken systems, terminals, or
# binary data accidentally exported as text.
# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
write_text(TD / "06_control_characters.csv", "".join([
    "id,value,note\n",
    "1,Hello\x00World,NUL byte inside\n",
    "2,Bell\x07Sound,BEL character\n",
    "3,Back\x08space,backspace\n",
    "4,Vert\x0BTab,vertical tab\n",
    "5,Form\x0CFeed,form feed\n",
    "6,Esc\x1Bape,ESC character\n",
    "7,Del\x7Fete,DEL character\n",
    "8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n",
]))

# Expected: the control characters are dropped; surrounding text is joined.
write_text(EX / "06_control_characters.csv", "".join([
    "id,value,note\n",
    "1,HelloWorld,NUL byte inside\n",
    "2,BellSound,BEL character\n",
    "3,Backspace,backspace\n",
    "4,VertTab,vertical tab\n",
    "5,FormFeed,form feed\n",
    "6,Escape,ESC character\n",
    "7,Delete,DEL character\n",
    "8,Mixedjunk,multiple controls in one cell\n",
]))

# ---------------------------------------------------------------------------
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
# ---------------------------------------------------------------------------
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
# leaves the BOM as part of the first column's header name if you're not
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
bom = b"\xef\xbb\xbf"
content = "".join([
    "id,name,city\n",
    "1,Alice,New York\n",
    "2,Bob,Chicago\n",
]).encode("utf-8")

# Input: the three BOM bytes followed by the plain CSV payload.
write_bytes(TD / "07_bom_utf8.csv", bom + content)

# Expected: BOM stripped on read, output written WITHOUT BOM, header is
# clean "id" not "\ufeffid" - i.e. the payload alone.
write_bytes(EX / "07_bom_utf8.csv", content)

# ---------------------------------------------------------------------------
# 08 Line endings - all CRLF (Windows)
# ---------------------------------------------------------------------------
# Default policy: normalize to LF on output.
# Input: rows authored with "\n", converted to "\r\n" by write_text.
write_text(TD / "08_line_endings_crlf.csv", "".join([
    "id,name\n",
    "1,Alice\n",
    "2,Bob\n",
    "3,Carol\n",
]), newline="\r\n")

# Expected: same rows, LF terminators.
write_text(EX / "08_line_endings_crlf.csv", "".join([
    "id,name\n",
    "1,Alice\n",
    "2,Bob\n",
    "3,Carol\n",
]))

# ---------------------------------------------------------------------------
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
# ---------------------------------------------------------------------------
# Input: rows authored with "\n", converted to bare "\r" by write_text.
write_text(TD / "09_line_endings_cr.csv", "".join([
    "id,name\n",
    "1,Alice\n",
    "2,Bob\n",
    "3,Carol\n",
]), newline="\r")

# Expected: same rows, LF terminators.
write_text(EX / "09_line_endings_cr.csv", "".join([
    "id,name\n",
    "1,Alice\n",
    "2,Bob\n",
    "3,Carol\n",
]))

# ---------------------------------------------------------------------------
# 10 Line endings - mixed within the same file
# ---------------------------------------------------------------------------
# Real-world disaster mode: file edited on multiple OSes, or concatenated
# from sources with different conventions.
# Written as bytes so each row keeps its own terminator (CRLF, LF, CR).
mixed = b"".join([
    b"id,name\r\n",
    b"1,Alice\n",
    b"2,Bob\r",
    b"3,Carol\r\n",
    b"4,Dan\n",
])
write_bytes(TD / "10_line_endings_mixed.csv", mixed)

# Expected: one row per record, all terminated by LF.
write_text(EX / "10_line_endings_mixed.csv", "".join([
    "id,name\n",
    "1,Alice\n",
    "2,Bob\n",
    "3,Carol\n",
    "4,Dan\n",
]))

# ---------------------------------------------------------------------------
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
# ---------------------------------------------------------------------------
# This is the trap: line-ending normalization at the FILE level must not
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
# notes column or an address column).
# But the embedded line endings should also be normalized to LF for
# consistency.
# Input: records end in LF; the quoted cells contain CRLF, LF and bare CR.
# write_text is called with its default newline so those embedded sequences
# reach the file untouched.
write_text(TD / "11_embedded_newlines.csv", "".join([
    "id,address,notes\n",
    "1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n",
    "2,\"Single line\",\"contains\rclassic mac\rinternal\"\n",
    "3,\"normal\",\"no newlines here\"\n",
]))

# Expected: embedded CRLF/CR normalized to LF; cells stay multi-line and
# quoted; cells that no longer need quoting are written bare.
write_text(EX / "11_embedded_newlines.csv", "".join([
    "id,address,notes\n",
    "1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n",
    "2,Single line,\"contains\nclassic mac\ninternal\"\n",
    "3,normal,no newlines here\n",
]))

# ---------------------------------------------------------------------------
# 12 Case operations (opt-in, default = preserve)
# ---------------------------------------------------------------------------
# This file tests case operations IF the user requests them.
# Default behavior: PRESERVE, so the "__default" expected file equals the
# input. Two more expected files show the opt-in modes: "__email_lower"
# (email column lower-cased) and "__name_title" (name column title-cased).
write_text(TD / "12_case_variations.csv", "".join([
    "id,name,email,product\n",
    "1,ALICE SMITH,Alice@Example.COM,Widget\n",
    "2,bob jones,BOB@example.com,GADGET\n",
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n",
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n",
]))

# Default expected: identical to input (case ops are opt-in).
write_text(EX / "12_case_variations__default.csv", "".join([
    "id,name,email,product\n",
    "1,ALICE SMITH,Alice@Example.COM,Widget\n",
    "2,bob jones,BOB@example.com,GADGET\n",
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n",
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n",
]))

# With --case-email=lower applied to email column only:
write_text(EX / "12_case_variations__email_lower.csv", "".join([
    "id,name,email,product\n",
    "1,ALICE SMITH,alice@example.com,Widget\n",
    "2,bob jones,bob@example.com,GADGET\n",
    "3,Carol Brown,carol@example.com,wIdGeT\n",
    "4,DAN O'CONNOR,dan@example.com,gizmo\n",
]))

# With --case=title applied to name column:
write_text(EX / "12_case_variations__name_title.csv", "".join([
    "id,name,email,product\n",
    "1,Alice Smith,Alice@Example.COM,Widget\n",
    "2,Bob Jones,BOB@example.com,GADGET\n",
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n",
    "4,Dan O'Connor,Dan@Example.com,gizmo\n",   # title-case must not break O'C
]))

# ---------------------------------------------------------------------------
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
# ---------------------------------------------------------------------------
# This is a negative test: the cleaner must not damage characters that
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
write_text(TD / "13_non_latin_scripts.csv", "".join([
    "id,name,note\n",
    "1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n",
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n",
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n",
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n",
    "5,\U0001F389 launch \U0001F680,emoji preserved\n",
    "6,caf\u00e9 \u2615,emoji + accent combo\n",
]))

# Expected: only row 1 changes (outer spaces trimmed); all else is as input.
write_text(EX / "13_non_latin_scripts.csv", "".join([
    "id,name,note\n",
    "1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n",
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n",
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n",
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n",
    "5,\U0001F389 launch \U0001F680,emoji preserved\n",
    "6,caf\u00e9 \u2615,emoji + accent combo\n",
]))

# ---------------------------------------------------------------------------
# 14 Mojibake (double-encoded UTF-8 / cp1252 misread as Latin-1)
# ---------------------------------------------------------------------------
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
write_text(TD / "14_mojibake.csv", "".join([
    "id,name,city\n",
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n",                 # café, München mojibaked
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n",       # naïve, résumé
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n",    # don't via cp1252-mojibake
    "4,Alice,New York\n",                                      # clean control row
]))

# Expected output WITHOUT mojibake fix (default): bytes preserved, but
# reader emits a warning to logs.
write_text(EX / "14_mojibake__default.csv", "".join([
    "id,name,city\n",
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n",
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n",
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n",
    "4,Alice,New York\n",
]))

# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
write_text(EX / "14_mojibake__fixed.csv", "".join([
    "id,name,city\n",
    "1,caf\u00e9,M\u00fcnchen\n",
    "2,na\u00efve,r\u00e9sum\u00e9\n",
    "3,don't,smart-apostrophe mojibake\n",      # smart apostrophe also fixed
    "4,Alice,New York\n",
]))

# ---------------------------------------------------------------------------
# 15 Whitespace-only cells (boundary case with script 04)
# ---------------------------------------------------------------------------
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
# 04 then detects empty strings as disguised null. So 02's job here is
# just to convert "   " into "".
write_text(TD / "15_whitespace_only_cells.csv", "".join([
    "id,value\n",
    "1,real\n",
    "2,   \n",                  # spaces only
    "3,\t\t\n",                 # tabs only
    "4,\u00A0\u00A0\n",         # NBSP only
    "5, \t \u00A0 \n",          # mixed whitespace
    "6,\n",                     # already empty
    "7,actual value\n",
]))

# Expected: rows 2-6 all end up with an empty value cell.
write_text(EX / "15_whitespace_only_cells.csv", "".join([
    "id,value\n",
    "1,real\n",
    "2,\n",                     # all whitespace -> empty
    "3,\n",
    "4,\n",
    "5,\n",
    "6,\n",
    "7,actual value\n",
]))

# ---------------------------------------------------------------------------
# 16 Dirty headers
# ---------------------------------------------------------------------------
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
# These break downstream lookups (df["email"] fails because the column
# is actually called "  Email  " with NBSP padding).
# Header row here: space padding, NBSP padding, curly quotes, trailing ZWSP.
write_text(TD / "16_dirty_headers.csv", "".join([
    "  id  ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n",
    "1,Alice,alice@example.com,555-1234\n",
    "2,Bob,bob@example.com,555-5678\n",
]))

# Expected: headers cleaned by SAME rules as data cells.
# Note: smart quotes around "Email" become straight quotes. The header
# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
write_text(EX / "16_dirty_headers.csv", "".join([
    "id,Customer Name,\"\"\"Email\"\"\",Phone\n",
    "1,Alice,alice@example.com,555-1234\n",
    "2,Bob,bob@example.com,555-5678\n",
]))

# ---------------------------------------------------------------------------
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
# ---------------------------------------------------------------------------
# Numbers that LOOK like they have whitespace are tricky: "  123  " is
# a number with padding (trim) but "1 234" might be a thousands-separator
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
# collapse internal whitespace in cells that parse as numeric. This is a
# judgment call; document it.
#
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
# Do not detect or replace null-like values. That's 04.
write_text(TD / "17_preserve_intended.csv", "".join([
    "id,price,european_number,date,phone,quantity\n",
    "1,  100  ,1 234,2024-01-15,(555) 123-4567,42\n",
    "2,\"  $1,500.00  \",12 345,15/01/2024,555.123.4567,7\n",
    "3,  N/A  ,nan,Jan 15 2024,+1 555 123 4567,0\n",
]))

# Expected: outer whitespace trimmed everywhere, but:
#  - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
#  - "$1,500.00" stays unchanged (currency, that's 03's domain)
#  - "15/01/2024" stays unchanged (date, that's 03's domain)
#  - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
#  - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
#  - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
write_text(EX / "17_preserve_intended.csv", "".join([
    "id,price,european_number,date,phone,quantity\n",
    "1,100,1 234,2024-01-15,(555) 123-4567,42\n",
    "2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n",
    "3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n",
]))

# ---------------------------------------------------------------------------
# 18 Empty file (zero bytes)
# ---------------------------------------------------------------------------
# Input: a file that exists but holds no bytes at all.
write_bytes(TD / "18_empty_file.csv", bytes())

# Expected: graceful handling, output is also empty (or warning emitted).
write_bytes(EX / "18_empty_file.csv", bytes())

# ---------------------------------------------------------------------------
# 19 Headers only (no data rows)
# ---------------------------------------------------------------------------
# Input: a single dirty header line (space padding, NBSP, ZWSP) and nothing else.
write_text(TD / "19_headers_only.csv", "  id  ,Name\u00a0,Email\u200b\n")

# Expected: headers cleaned, no data rows in output.
write_text(EX / "19_headers_only.csv", "id,Name,Email\n")

# ---------------------------------------------------------------------------
# 20 Real-world kitchen sink (everything combined)
# ---------------------------------------------------------------------------
# Simulates a typical messy export: came from Excel via cp1252 paste,
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
# has NBSP from copy/paste, has trailing whitespace.
content = (
    "  id  ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\r\n"
    "1,\u00a0Alice  Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
    "3,  Carol  Brown  ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
    "4,   ,empty@example.com,whitespace-only name (becomes empty)\r\n"
)
# Every record must end in CRLF. The header row previously ended in a bare
# LF, which made this file a mixed-ending case (already covered by test 10)
# instead of the all-CRLF export described above. The replace chain turns
# any LF that is not already part of a CRLF into CRLF, so a row added later
# with a plain "\n" cannot reintroduce the mix.
data_bytes = b"\xef\xbb\xbf" + (
    content.replace("\r\n", "\n").replace("\n", "\r\n").encode("utf-8")
)
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)

# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
# stripped, internal multi-space collapsed, CRLF normalized to LF,
# whitespace-only cells become empty, multiplication sign preserved,
# em-dash and ellipsis converted, prime/double-prime converted.
# Cells that contain a straight double quote after conversion are
# CSV-quoted with the quote doubled.
write_text(EX / "20_kitchen_sink.csv", (
    "id,Name,\"\"\"Email\"\"\",Notes\n"
    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
))

# Final report: confirmation line plus the two output directories.
print(
    "All CSV test files written.",
    f"  inputs:   {TD}",
    f"  expected: {EX}",
    sep="\n",
)