test: add text-cleaner corpus and close gaps surfaced by it

The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
  with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
  Smith") while still preserving embedded acronyms; preserve uppercase after
  apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (C parser truncates at NUL,
  python engine is too strict about embedded literal "), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03
  expected; quote the rogue-comma price field in case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 15:37:35 +00:00
parent 54f92ae47e
commit c349a90e18
50 changed files with 1644 additions and 4 deletions

View File

@@ -0,0 +1,545 @@
"""
Generator for the 02_text_cleaner test corpus.
Writes raw bytes where exact control over encoding/line-endings/invisible
characters matters. Do not edit the output files in a text editor that
"helpfully" normalizes anything; it will silently break the tests.
Run from the corpus root:
python generate_test_data.py
"""
from pathlib import Path
# Corpus layout: inputs and expected outputs live in two directories that sit
# next to this script.
ROOT = Path(__file__).parent
TD, EX = ROOT / "test_data", ROOT / "expected"
for _fixture_dir in (TD, EX):
    # exist_ok=True: an already-existing directory is not an error.
    _fixture_dir.mkdir(exist_ok=True)
def write_bytes(path, data):
    """Write ``data`` to ``path`` verbatim, replacing any existing content.

    Args:
        path: Destination file; anything ``pathlib.Path`` accepts.
        data: Bytes-like payload, written without any translation.
    """
    Path(path).write_bytes(data)
def write_text(path, text, encoding="utf-8", newline="\n"):
    """Encode ``text`` and write it to ``path`` with exact line endings.

    The encoded bytes are written directly, so no platform-specific newline
    translation is applied.

    Args:
        path: Destination file; anything ``pathlib.Path`` accepts.
        text: Content to write, with LF as the line separator.
        encoding: Codec used to encode ``text``.
        newline: Terminator substituted for every LF. With the default (LF)
            the text is written untouched, so CR / CRLF sequences embedded
            in it are preserved as-is.
    """
    if newline != "\n":
        # Fold any pre-existing CRLF pairs to LF first; otherwise the
        # substitution below would turn CRLF into CR CR LF.
        text = text.replace("\r\n", "\n").replace("\n", newline)
    # Explicit byte write so we control line endings exactly.
    Path(path).write_bytes(text.encode(encoding))
# ---------------------------------------------------------------------------
# 01 Whitespace - basic (ASCII space + tab)
# ---------------------------------------------------------------------------
# The input deliberately uses RUNS of spaces (two or more): a single internal
# space is already clean, so it would not exercise the collapse rule at all.
write_text(TD / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1,  Alice  ,New York\n"                  # leading + trailing spaces
    "2,Bob,   Chicago\n"                      # leading spaces
    "3,Carol   ,San Francisco  \n"            # trailing spaces
    "4,Dan   Smith,Austin\n"                  # internal multi-space
    "5,\tEve\t,\tBoston\t\n"                  # tab padding
    "6,Frank  van   der  Berg,Denver\n"       # multiple internal multi-space runs
    "7,  Grace   Hopper  ,  Palo   Alto  \n"  # everything at once
))
# Expected: outer whitespace trimmed, each internal run reduced to one space.
write_text(EX / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
    "3,Carol,San Francisco\n"
    "4,Dan Smith,Austin\n"
    "5,Eve,Boston\n"
    "6,Frank van der Berg,Denver\n"
    "7,Grace Hopper,Palo Alto\n"
))
# ---------------------------------------------------------------------------
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
# ---------------------------------------------------------------------------
# Look-alike spaces: Python 3's str.strip() DOES treat these as whitespace,
# but many naive implementations (or pandas defaults) do NOT. The expected
# file shows them trimmed at cell edges and replaced by a single ASCII space
# inside a cell.
NBSP = "\u00A0"  # non-breaking space (very common from Word/Excel paste)
NNBSP = "\u202F"  # narrow no-break space
IDEO = "\u3000"  # ideographic space (CJK)
EM_SPACE = "\u2003"  # em space
THIN_SPACE = "\u2009"  # thin space

unicode_ws_input = [
    "id,label,note\n",
    f"1,{NBSP}Premium{NBSP},NBSP padding\n",
    f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n",
    f"3,{IDEO}Standard{IDEO},ideographic space\n",
    f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n",
    f"5,Cost{THIN_SPACE}Plus,thin-space internal\n",
    f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n",
]
write_text(TD / "02_whitespace_unicode.csv", "".join(unicode_ws_input))

unicode_ws_expected = [
    "id,label,note\n",
    "1,Premium,NBSP padding\n",
    "2,Discount,narrow NBSP\n",
    "3,Standard,ideographic space\n",
    "4,Tier One,em-space internal\n",
    "5,Cost Plus,thin-space internal\n",
    "6,mixed,ascii + NBSP combined\n",
]
write_text(EX / "02_whitespace_unicode.csv", "".join(unicode_ws_expected))
# ---------------------------------------------------------------------------
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
# ---------------------------------------------------------------------------
# The #1 source of pollution in data that ever passed through Word, Outlook,
# or Excel autocorrect. ASCII-fy it.
smart_punct_input = (
    "id,quote,measurement\n"
    "1,\u201cHello world\u201d,5\u2032 11\u2033\n"  # curly double quotes, prime/double-prime
    "2,it\u2019s working,\u2014\n"  # curly apostrophe, em-dash alone
    "3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n"  # en-dash range, curly singles
    "4,wait\u2026,3 \u00d7 4\n"  # ellipsis char, multiplication sign
    "5,\u00abquoted\u00bb,5 \u00b1 0.1\n"  # guillemets, plus-minus
)
write_text(TD / "03_smart_punctuation.csv", smart_punct_input)

# Default policy: ASCII-fy where round-trip-safe.
# \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
# meaningful and not safely round-trippable to ASCII, so they are PRESERVED
# (rows 4 and 5, third column). Document this in TEST-CASES.md.
# Cells that end up containing a straight double quote are CSV-quoted, with
# the quote itself doubled (rows 1 and 5).
smart_punct_expected = (
    "id,quote,measurement\n"
    "1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
    "2,it's working,-\n"
    "3,2020-2024,from 'a' to 'z'\n"
    "4,wait...,3 \u00d7 4\n"
    "5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
)
write_text(EX / "03_smart_punctuation.csv", smart_punct_expected)
# ---------------------------------------------------------------------------
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
# ---------------------------------------------------------------------------
# "café" can be either:
# NFC: "caf\u00e9" (e-acute as single code point)
# NFD: "cafe\u0301" (e + combining acute accent, two code points)
# These look identical but compare unequal. Normalize to NFC.
write_text(TD / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,cafe\u0301,NFD form (e + combining accent)\n"
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,nai\u0308ve,NFD i + combining diaeresis\n"
    # U+FB03 is the "ffi" ligature (not "fi"), so the note column says so.
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"  # 'office' written with 'ffi' ligature
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"  # U+FF21..U+FF23, fullwidth A B C
    "7,\u2168 century,roman numeral nine (single code point)\n"  # Ⅸ
))
# Policy: NFC by default (most compatible, smallest, what Excel emits).
# NFKC option would also fold ligatures and fullwidth digits/letters,
# but is destructive for some legitimate text. Default = NFC.
# So:
# - Rows 1 vs 2 should produce identical output after normalization
# - Rows 3 vs 4 should produce identical output
# - Row 5 ligature stays as ligature under NFC (would fold under NFKC)
# - Row 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
write_text(EX / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,caf\u00e9,NFD form (e + combining accent)\n"  # same name bytes as row 1 now
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,na\u00efve,NFD i + combining diaeresis\n"  # same name bytes as row 3 now
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
    "7,\u2168 century,roman numeral nine (single code point)\n"
))
# ---------------------------------------------------------------------------
# 05 Zero-width / invisible characters
# ---------------------------------------------------------------------------
# Each row hides one or more invisible code points in the value column; the
# expected file has them removed with nothing inserted in their place.
ZWSP = "\u200B"  # zero-width space
ZWNJ = "\u200C"  # zero-width non-joiner
ZWJ = "\u200D"  # zero-width joiner
LRM = "\u200E"  # left-to-right mark
RLM = "\u200F"  # right-to-left mark
SOFT_HYPHEN = "\u00AD"
WORD_JOINER = "\u2060"

invisible_input = [
    "id,value,note\n",
    f"1,Hel{ZWSP}lo,zero-width space inside word\n",
    f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n",
    f"3,Trail{ZWSP},trailing ZWSP\n",
    f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n",
    f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n",
    f"6,co{SOFT_HYPHEN}operate,soft hyphen\n",
    f"7,no{WORD_JOINER}break,word joiner\n",
]
write_text(TD / "05_zero_width_invisible.csv", "".join(invisible_input))

invisible_expected = [
    "id,value,note\n",
    "1,Hello,zero-width space inside word\n",
    "2,Leading,leading + internal ZWSP\n",
    "3,Trail,trailing ZWSP\n",
    "4,abc,ZWNJ and ZWJ\n",
    "5,Marked,LTR + RTL marks bracketing\n",
    "6,cooperate,soft hyphen\n",
    "7,nobreak,word joiner\n",
]
write_text(EX / "05_zero_width_invisible.csv", "".join(invisible_expected))
# ---------------------------------------------------------------------------
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
# ---------------------------------------------------------------------------
# Bytes like these show up in real exports from broken systems, terminals,
# or binary data accidentally exported as text:
# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
control_input = (
    "id,value,note\n"
    "1,Hello\x00World,NUL byte inside\n"
    "2,Bell\x07Sound,BEL character\n"
    "3,Back\x08space,backspace\n"
    "4,Vert\x0BTab,vertical tab\n"
    "5,Form\x0CFeed,form feed\n"
    "6,Esc\x1Bape,ESC character\n"
    "7,Del\x7Fete,DEL character\n"
    "8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
)
write_text(TD / "06_control_characters.csv", control_input)

# Expected: every control character is dropped; the surrounding text closes
# up around it.
control_expected = (
    "id,value,note\n"
    "1,HelloWorld,NUL byte inside\n"
    "2,BellSound,BEL character\n"
    "3,Backspace,backspace\n"
    "4,VertTab,vertical tab\n"
    "5,FormFeed,form feed\n"
    "6,Escape,ESC character\n"
    "7,Delete,DEL character\n"
    "8,Mixedjunk,multiple controls in one cell\n"
)
write_text(EX / "06_control_characters.csv", control_expected)
# ---------------------------------------------------------------------------
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
# ---------------------------------------------------------------------------
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
# leaves the BOM as part of the first column's header name if you're not
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
bom = b"\xef\xbb\xbf"
# Pure-ASCII payload, spelled directly as bytes.
content = (
    b"id,name,city\n"
    b"1,Alice,New York\n"
    b"2,Bob,Chicago\n"
)
# Input is the payload prefixed with the BOM. Expected: BOM stripped on read,
# output written WITHOUT BOM, header is clean "id" not "\ufeffid".
for _target, _payload in (
    (TD / "07_bom_utf8.csv", bom + content),
    (EX / "07_bom_utf8.csv", content),
):
    write_bytes(_target, _payload)
# ---------------------------------------------------------------------------
# 08 Line endings - all CRLF (Windows)
# ---------------------------------------------------------------------------
# Default policy: normalize to LF on output. Input and expected carry the
# same rows; only the line terminator differs.
crlf_case_text = "".join(
    f"{row}\n" for row in ("id,name", "1,Alice", "2,Bob", "3,Carol")
)
write_text(TD / "08_line_endings_crlf.csv", crlf_case_text, newline="\r\n")
write_text(EX / "08_line_endings_crlf.csv", crlf_case_text)
# ---------------------------------------------------------------------------
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
# ---------------------------------------------------------------------------
# Same rows in input and expected; the input terminates lines with a bare CR,
# the expected output with LF.
cr_case_text = "".join(
    f"{row}\n" for row in ("id,name", "1,Alice", "2,Bob", "3,Carol")
)
write_text(TD / "09_line_endings_cr.csv", cr_case_text, newline="\r")
write_text(EX / "09_line_endings_cr.csv", cr_case_text)
# ---------------------------------------------------------------------------
# 10 Line endings - mixed within the same file
# ---------------------------------------------------------------------------
# Real-world disaster mode: file edited on multiple OSes, or concatenated
# from sources with different conventions.
mixed = b"".join([
    b"id,name\r\n",  # CRLF
    b"1,Alice\n",    # LF
    b"2,Bob\r",      # bare CR
    b"3,Carol\r\n",  # CRLF
    b"4,Dan\n",      # LF
])
write_bytes(TD / "10_line_endings_mixed.csv", mixed)

# Expected: every terminator normalized to LF.
mixed_expected = "".join(
    f"{row}\n" for row in ("id,name", "1,Alice", "2,Bob", "3,Carol", "4,Dan")
)
write_text(EX / "10_line_endings_mixed.csv", mixed_expected)
# ---------------------------------------------------------------------------
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
# ---------------------------------------------------------------------------
# The trap: line-ending normalization at the FILE level must not destroy
# intentional newlines INSIDE quoted multi-line cells (e.g., a notes column
# or an address column). The embedded line endings should still be
# normalized to LF for consistency.
# In this input the rows themselves end in LF; the CRLF and bare-CR
# sequences appear only inside quoted cells.
embedded_nl_input = (
    "id,address,notes\n"
    "1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
    "2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
    "3,\"normal\",\"no newlines here\"\n"
)
write_text(TD / "11_embedded_newlines.csv", embedded_nl_input)

# Expected: embedded CRLF/CR normalized to LF; cells stay multi-line.
# Cells without a comma, quote, or newline are written unquoted.
embedded_nl_expected = (
    "id,address,notes\n"
    "1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
    "2,Single line,\"contains\nclassic mac\ninternal\"\n"
    "3,normal,no newlines here\n"
)
write_text(EX / "11_embedded_newlines.csv", embedded_nl_expected)
# ---------------------------------------------------------------------------
# 12 Case operations (opt-in, default = preserve)
# ---------------------------------------------------------------------------
# Case operations only happen IF the user requests them. The default is to
# PRESERVE, so the default expected file is the input itself; the other
# expected files show what each opt-in mode produces.
case_input = (
    "id,name,email,product\n"
    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
    "2,bob jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
)
write_text(TD / "12_case_variations.csv", case_input)
# Default expected: identical to input (case ops are opt-in).
write_text(EX / "12_case_variations__default.csv", case_input)

# With --case-email=lower applied to email column only:
case_email_lower = (
    "id,name,email,product\n"
    "1,ALICE SMITH,alice@example.com,Widget\n"
    "2,bob jones,bob@example.com,GADGET\n"
    "3,Carol Brown,carol@example.com,wIdGeT\n"
    "4,DAN O'CONNOR,dan@example.com,gizmo\n"
)
write_text(EX / "12_case_variations__email_lower.csv", case_email_lower)

# With --case=title applied to name column:
case_name_title = (
    "id,name,email,product\n"
    "1,Alice Smith,Alice@Example.COM,Widget\n"
    "2,Bob Jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,Dan O'Connor,Dan@Example.com,gizmo\n"  # title-case must not break O'C
)
write_text(EX / "12_case_variations__name_title.csv", case_name_title)
# ---------------------------------------------------------------------------
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
# ---------------------------------------------------------------------------
# Negative test: the cleaner must not damage characters that look "foreign"
# to it. Whitespace trimming and Unicode NFC are still applied, so the only
# difference between input and expected is the padding around row 1's name.
non_latin_input = [
    "id,name,note\n",
    "1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n",
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n",
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n",
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n",
    "5,\U0001F389 launch \U0001F680,emoji preserved\n",
    "6,caf\u00e9 \u2615,emoji + accent combo\n",
]
write_text(TD / "13_non_latin_scripts.csv", "".join(non_latin_input))

non_latin_expected = [
    "id,name,note\n",
    "1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n",
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n",
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n",
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n",
    "5,\U0001F389 launch \U0001F680,emoji preserved\n",
    "6,caf\u00e9 \u2615,emoji + accent combo\n",
]
write_text(EX / "13_non_latin_scripts.csv", "".join(non_latin_expected))
# ---------------------------------------------------------------------------
# 14 Mojibake (UTF-8 bytes misread as cp1252 / Latin-1, then re-saved)
# ---------------------------------------------------------------------------
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
mojibake_input = (
    "id,name,city\n"
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"  # café, München mojibaked
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"  # naïve, résumé
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"  # don't via cp1252-mojibake
    "4,Alice,New York\n"  # clean control row
)
write_text(TD / "14_mojibake.csv", mojibake_input)
# Expected output WITHOUT mojibake fix (default): text preserved exactly as
# in the input, but the reader emits a warning to logs.
write_text(EX / "14_mojibake__default.csv", mojibake_input)

# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
mojibake_fixed = (
    "id,name,city\n"
    "1,caf\u00e9,M\u00fcnchen\n"
    "2,na\u00efve,r\u00e9sum\u00e9\n"
    "3,don't,smart-apostrophe mojibake\n"  # smart apostrophe also fixed
    "4,Alice,New York\n"
)
write_text(EX / "14_mojibake__fixed.csv", mojibake_fixed)
# ---------------------------------------------------------------------------
# 15 Whitespace-only cells (boundary case with script 04)
# ---------------------------------------------------------------------------
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
# 04 then detects empty strings as disguised null. So 02's job here is
# just to convert " " into "".
blank_cells_input = (
    "id,value\n"
    "1,real\n"
    "2, \n"  # spaces only
    "3,\t\t\n"  # tabs only
    "4,\u00A0\u00A0\n"  # NBSP only
    "5, \t \u00A0 \n"  # mixed whitespace
    "6,\n"  # already empty
    "7,actual value\n"
)
write_text(TD / "15_whitespace_only_cells.csv", blank_cells_input)

# Rows 2-6 all collapse to an empty value (all whitespace -> empty).
blank_cells_expected = (
    "id,value\n"
    "1,real\n"
    + "".join(f"{row_id},\n" for row_id in range(2, 7))
    + "7,actual value\n"
)
write_text(EX / "15_whitespace_only_cells.csv", blank_cells_expected)
# ---------------------------------------------------------------------------
# 16 Dirty headers
# ---------------------------------------------------------------------------
# Headers themselves have whitespace, NBSP padding, smart quotes, zero-width
# characters, etc. These break downstream lookups (df["email"] fails because
# the column name carries invisible padding).
dirty_header_input = (
    " id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
)
write_text(TD / "16_dirty_headers.csv", dirty_header_input)

# Expected: headers cleaned by the SAME rules as data cells.
# Note: smart quotes around "Email" become straight quotes, so that header
# contains literal quotes and needs CSV-quoting (doubled quotes) in the
# output.
dirty_header_expected = (
    "id,Customer Name,\"\"\"Email\"\"\",Phone\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
)
write_text(EX / "16_dirty_headers.csv", dirty_header_expected)
# ---------------------------------------------------------------------------
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
# ---------------------------------------------------------------------------
# Numbers that LOOK like they have whitespace are tricky: " 123 " is a number
# with padding (trim), but "1 234" might be a thousands-separator locale
# (don't collapse). Default: trim outer whitespace, but DO NOT collapse
# internal whitespace in cells that parse as numeric. This is a judgment
# call; document it.
#
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
# Do not detect or replace null-like values. That's 04.
preserve_input = (
    "id,price,european_number,date,phone,quantity\n"
    "1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
    "2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
    "3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
)
write_text(TD / "17_preserve_intended.csv", preserve_input)

# Expected: outer whitespace trimmed everywhere, but:
# - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
# - "$1,500.00" keeps its format (currency, that's 03's domain); it stays
#   CSV-quoted because of the comma
# - "15/01/2024" stays unchanged (date, that's 03's domain)
# - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
# - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide
#   what's null)
# - phone "+1 555 123 4567" keeps its internal spaces (03's domain)
preserve_expected = (
    "id,price,european_number,date,phone,quantity\n"
    "1,100,1 234,2024-01-15,(555) 123-4567,42\n"
    "2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
    "3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
)
write_text(EX / "17_preserve_intended.csv", preserve_expected)
# ---------------------------------------------------------------------------
# 18 Empty file (zero bytes)
# ---------------------------------------------------------------------------
# Input first, then expected: graceful handling means the output is also
# empty (or a warning is emitted).
for _folder in (TD, EX):
    write_bytes(_folder / "18_empty_file.csv", b"")
# ---------------------------------------------------------------------------
# 19 Headers only (no data rows)
# ---------------------------------------------------------------------------
# The header row carries ASCII padding, a trailing NBSP, and a trailing
# zero-width space.
headers_only_input = " id ,Name\u00a0,Email\u200b\n"
write_text(TD / "19_headers_only.csv", headers_only_input)

# Expected: headers cleaned, no data rows in output.
headers_only_expected = "id,Name,Email\n"
write_text(EX / "19_headers_only.csv", headers_only_expected)
# ---------------------------------------------------------------------------
# 20 Real-world kitchen sink (everything combined)
# ---------------------------------------------------------------------------
# Simulates a typical messy export: came from Excel via cp1252 paste,
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
# has NBSP from copy/paste, has trailing whitespace.
content = (
    " id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
    "1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
    # Row 3 uses runs of spaces so internal multi-space collapsing is
    # actually exercised by this fixture.
    "3,  Carol   Brown  ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
    "4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
)
# Replace LF with CRLF wherever it isn't already, to be unambiguous: without
# this the header line would end in a bare LF while the data rows use CRLF.
# CRLF pairs are folded to LF first so they are not doubled into CR CR LF.
content = content.replace("\r\n", "\n").replace("\n", "\r\n")
data_bytes = b"\xef\xbb\xbf" + content.encode("utf-8")
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
# stripped, internal multi-space collapsed, CRLF normalized to LF,
# whitespace-only cells become empty, multiplication sign preserved,
# em-dash and ellipsis converted, prime/double-prime converted.
write_text(EX / "20_kitchen_sink.csv", (
    "id,Name,\"\"\"Email\"\"\",Notes\n"
    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
))
# Final summary: confirm completion and show where the files were written.
for _summary_line in (
    "All CSV test files written.",
    f" inputs: {TD}",
    f" expected: {EX}",
):
    print(_summary_line)