test: add text-cleaner corpus and close gaps surfaced by it

The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20), with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice Smith") while still preserving embedded acronyms; preserve uppercase after apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (the C parser truncates at NUL, and the python engine is too strict about embedded literal " characters), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03 expected; quote the rogue-comma price field in case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:35 +00:00
parent 54f92ae47e
commit c349a90e18
50 changed files with 1644 additions and 4 deletions
--- a/test-cases/text-cleaner-corpus/generate_test_data.py
+++ b/test-cases/text-cleaner-corpus/generate_test_data.py
@@ -0,0 +1,545 @@
+"""
+Generator for the 02_text_cleaner test corpus.
+
+Writes raw bytes where exact control over encoding/line-endings/invisible
+characters matters. Do not edit the output files in a text editor that
+"helpfully" normalizes anything; it will silently break the tests.
+
+Run from the corpus root:
+    python generate_test_data.py
+"""
+from pathlib import Path
+
+ROOT = Path(__file__).parent
+TD = ROOT / "test_data"
+EX = ROOT / "expected"
+TD.mkdir(exist_ok=True)
+EX.mkdir(exist_ok=True)
+
+
+def write_bytes(path, data):  # write `data` (bytes) to `path` verbatim, replacing any existing file
+    Path(path).write_bytes(data)
+
+
+def write_text(path, text, encoding="utf-8", newline="\n"):
+    # Encode and write as bytes (never text mode) so line endings land exactly as requested.
+    if newline != "\n":
+        text = text.replace("\n", newline)  # rewrites EVERY "\n"; text already holding "\r\n" would become "\r\r\n"
+    Path(path).write_bytes(text.encode(encoding))
+
+
+# ---------------------------------------------------------------------------
+# 01 Whitespace - basic (ASCII space + tab)
+# ---------------------------------------------------------------------------
+write_text(TD / "01_whitespace_basic.csv", (
+    "id,name,city\n"
+    "1,  Alice  ,New York\n"            # leading + trailing spaces
+    "2,Bob,   Chicago\n"                # leading spaces
+    "3,Carol   ,San Francisco   \n"     # trailing spaces
+    "4,Dan    Smith,Austin\n"           # internal multi-space
+    "5,\tEve\t,\tBoston\t\n"            # tab padding
+    "6,Frank  van  der  Berg,Denver\n"  # multiple internal multi-space runs
+    "7,  Grace   Hopper  ,  Palo  Alto  \n"  # everything at once
+))
+
+write_text(EX / "01_whitespace_basic.csv", (
+    "id,name,city\n"
+    "1,Alice,New York\n"
+    "2,Bob,Chicago\n"
+    "3,Carol,San Francisco\n"
+    "4,Dan Smith,Austin\n"
+    "5,Eve,Boston\n"
+    "6,Frank van der Berg,Denver\n"
+    "7,Grace Hopper,Palo Alto\n"
+))
+
+# ---------------------------------------------------------------------------
+# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
+# ---------------------------------------------------------------------------
+# These are the whitespace-pretenders that .strip() in Python 3 actually
+# DOES handle, but that .strip() in many naive implementations (or pandas
+# defaults) does NOT. Test that they're stripped, not preserved.
+NBSP = "\u00A0"          # non-breaking space (very common from Word/Excel paste)
+NNBSP = "\u202F"         # narrow no-break space
+IDEO = "\u3000"          # ideographic space (CJK)
+EM_SPACE = "\u2003"      # em space
+THIN_SPACE = "\u2009"    # thin space
+write_text(TD / "02_whitespace_unicode.csv", (
+    "id,label,note\n"
+    f"1,{NBSP}Premium{NBSP},NBSP padding\n"
+    f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
+    f"3,{IDEO}Standard{IDEO},ideographic space\n"
+    f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
+    f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
+    f"6,  {NBSP} mixed {NBSP}  ,ascii + NBSP combined\n"
+))
+
+write_text(EX / "02_whitespace_unicode.csv", (
+    "id,label,note\n"
+    "1,Premium,NBSP padding\n"
+    "2,Discount,narrow NBSP\n"
+    "3,Standard,ideographic space\n"
+    "4,Tier One,em-space internal\n"
+    "5,Cost Plus,thin-space internal\n"
+    "6,mixed,ascii + NBSP combined\n"
+))
+
+# ---------------------------------------------------------------------------
+# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
+# ---------------------------------------------------------------------------
+# This is the #1 source of pollution from data that ever passed through
+# Word, Outlook, or Excel autocorrect. ASCII-fy it.
+write_text(TD / "03_smart_punctuation.csv", (
+    "id,quote,measurement\n"
+    "1,\u201cHello world\u201d,5\u2032 11\u2033\n"        # curly double quotes, prime/double-prime
+    "2,it\u2019s working,\u2014\n"                          # curly apostrophe, em-dash alone
+    "3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n"  # en-dash range, curly singles
+    "4,wait\u2026,3 \u00d7 4\n"                              # ellipsis char, multiplication sign
+    "5,\u00abquoted\u00bb,5 \u00b1 0.1\n"                    # guillemets, plus-minus
+))
+
+# Default policy: ASCII-fy where round-trip-safe.
+# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
+# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
+# (row 4 col 3, row 5 col 3). Document this in TEST-CASES.md.
+write_text(EX / "03_smart_punctuation.csv", (
+    "id,quote,measurement\n"
+    "1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
+    "2,it's working,-\n"
+    "3,2020-2024,from 'a' to 'z'\n"
+    "4,wait...,3 \u00d7 4\n"
+    "5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
+))
+
+# ---------------------------------------------------------------------------
+# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
+# ---------------------------------------------------------------------------
+# "café" can be either:
+#   NFC: "caf\u00e9"           (e-acute as single code point)
+#   NFD: "cafe\u0301"          (e + combining acute accent, two code points)
+# These look identical but compare unequal. Normalize to NFC.
+write_text(TD / "04_unicode_forms.csv", (
+    "id,name,description\n"
+    "1,caf\u00e9,NFC form (single code point)\n"
+    "2,cafe\u0301,NFD form (e + combining accent)\n"
+    "3,na\u00efve,NFC i-diaeresis\n"
+    "4,nai\u0308ve,NFD i + combining diaeresis\n"
+    "5,o\uFB03ce,fi-ligature (\uFB03)\n"             # 'office' written with 'ffi' ligature
+    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"          # Ａ Ｂ Ｃ
+    "7,\u2168 century,roman numeral nine (single code point)\n"  # Ⅸ
+))
+
+# Policy: NFC by default (most compatible, smallest, what Excel emits).
+# NFKC option would also fold ligatures and fullwidth digits/letters,
+# but is destructive for some legitimate text. Default = NFC.
+# So:
+#  - Rows 1 and 2 should produce identical output after normalization
+#  - Rows 3 and 4 should produce identical output
+#  - Row 5: the ffi ligature (U+FB03) stays a ligature under NFC (would fold under NFKC)
+#  - Row 6: fullwidth stays fullwidth under NFC (would fold under NFKC)
+write_text(EX / "04_unicode_forms.csv", (
+    "id,name,description\n"
+    "1,caf\u00e9,NFC form (single code point)\n"
+    "2,caf\u00e9,NFD form (e + combining accent)\n"  # same bytes as row 1 now
+    "3,na\u00efve,NFC i-diaeresis\n"
+    "4,na\u00efve,NFD i + combining diaeresis\n"      # same as row 3 now
+    "5,o\uFB03ce,fi-ligature (\uFB03)\n"
+    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
+    "7,\u2168 century,roman numeral nine (single code point)\n"
+))
+
+# ---------------------------------------------------------------------------
+# 05 Zero-width / invisible characters
+# ---------------------------------------------------------------------------
+ZWSP = "\u200B"     # zero-width space
+ZWNJ = "\u200C"     # zero-width non-joiner
+ZWJ = "\u200D"      # zero-width joiner
+LRM = "\u200E"      # left-to-right mark
+RLM = "\u200F"      # right-to-left mark
+SOFT_HYPHEN = "\u00AD"
+WORD_JOINER = "\u2060"
+write_text(TD / "05_zero_width_invisible.csv", (
+    "id,value,note\n"
+    f"1,Hel{ZWSP}lo,zero-width space inside word\n"
+    f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
+    f"3,Trail{ZWSP},trailing ZWSP\n"
+    f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
+    f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
+    f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
+    f"7,no{WORD_JOINER}break,word joiner\n"
+))
+
+write_text(EX / "05_zero_width_invisible.csv", (
+    "id,value,note\n"
+    "1,Hello,zero-width space inside word\n"
+    "2,Leading,leading + internal ZWSP\n"
+    "3,Trail,trailing ZWSP\n"
+    "4,abc,ZWNJ and ZWJ\n"
+    "5,Marked,LTR + RTL marks bracketing\n"
+    "6,cooperate,soft hyphen\n"
+    "7,nobreak,word joiner\n"
+))
+
+# ---------------------------------------------------------------------------
+# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
+# ---------------------------------------------------------------------------
+# These bytes show up in real exports from broken systems, terminals, or
+# binary data accidentally exported as text.
+# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
+write_text(TD / "06_control_characters.csv", (
+    "id,value,note\n"
+    "1,Hello\x00World,NUL byte inside\n"
+    "2,Bell\x07Sound,BEL character\n"
+    "3,Back\x08space,backspace\n"
+    "4,Vert\x0BTab,vertical tab\n"
+    "5,Form\x0CFeed,form feed\n"
+    "6,Esc\x1Bape,ESC character\n"
+    "7,Del\x7Fete,DEL character\n"
+    "8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
+))
+
+write_text(EX / "06_control_characters.csv", (
+    "id,value,note\n"
+    "1,HelloWorld,NUL byte inside\n"
+    "2,BellSound,BEL character\n"
+    "3,Backspace,backspace\n"
+    "4,VertTab,vertical tab\n"
+    "5,FormFeed,form feed\n"
+    "6,Escape,ESC character\n"
+    "7,Delete,DEL character\n"
+    "8,Mixedjunk,multiple controls in one cell\n"
+))
+
+# ---------------------------------------------------------------------------
+# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
+# ---------------------------------------------------------------------------
+# Excel writes UTF-8 with BOM by default. pandas usually handles it but
+# leaves the BOM as part of the first column's header name if you're not
+# careful, producing a mystery column called "\ufeffid" that breaks lookups.
+bom = b"\xef\xbb\xbf"
+content = (
+    "id,name,city\n"
+    "1,Alice,New York\n"
+    "2,Bob,Chicago\n"
+).encode("utf-8")
+write_bytes(TD / "07_bom_utf8.csv", bom + content)
+
+# Expected: BOM stripped on read, output written WITHOUT BOM, header is
+# clean "id" not "\ufeffid".
+write_bytes(EX / "07_bom_utf8.csv", content)
+
+# ---------------------------------------------------------------------------
+# 08 Line endings - all CRLF (Windows)
+# ---------------------------------------------------------------------------
+# Default policy: normalize to LF on output.
+write_text(TD / "08_line_endings_crlf.csv", (
+    "id,name\n"
+    "1,Alice\n"
+    "2,Bob\n"
+    "3,Carol\n"
+), newline="\r\n")
+
+write_text(EX / "08_line_endings_crlf.csv", (
+    "id,name\n"
+    "1,Alice\n"
+    "2,Bob\n"
+    "3,Carol\n"
+))
+
+# ---------------------------------------------------------------------------
+# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
+# ---------------------------------------------------------------------------
+write_text(TD / "09_line_endings_cr.csv", (
+    "id,name\n"
+    "1,Alice\n"
+    "2,Bob\n"
+    "3,Carol\n"
+), newline="\r")
+
+write_text(EX / "09_line_endings_cr.csv", (
+    "id,name\n"
+    "1,Alice\n"
+    "2,Bob\n"
+    "3,Carol\n"
+))
+
+# ---------------------------------------------------------------------------
+# 10 Line endings - mixed within the same file
+# ---------------------------------------------------------------------------
+# Real-world disaster mode: file edited on multiple OSes, or concatenated
+# from sources with different conventions.
+mixed = (
+    b"id,name\r\n"
+    b"1,Alice\n"
+    b"2,Bob\r"
+    b"3,Carol\r\n"
+    b"4,Dan\n"
+)
+write_bytes(TD / "10_line_endings_mixed.csv", mixed)
+
+write_text(EX / "10_line_endings_mixed.csv", (
+    "id,name\n"
+    "1,Alice\n"
+    "2,Bob\n"
+    "3,Carol\n"
+    "4,Dan\n"
+))
+
+# ---------------------------------------------------------------------------
+# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
+# ---------------------------------------------------------------------------
+# This is the trap: line-ending normalization at the FILE level must not
+# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
+# notes column or an address column).
+# But the embedded line endings should also be normalized to LF for
+# consistency.
+write_text(TD / "11_embedded_newlines.csv", (
+    "id,address,notes\n"
+    "1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
+    "2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
+    "3,\"normal\",\"no newlines here\"\n"
+))
+
+# Expected: row terminators stay LF (the input already uses LF between rows);
+# embedded CRLF/CR inside quoted cells are normalized to LF; cells stay multi-line.
+write_text(EX / "11_embedded_newlines.csv", (
+    "id,address,notes\n"
+    "1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
+    "2,Single line,\"contains\nclassic mac\ninternal\"\n"
+    "3,normal,no newlines here\n"
+))
+
+# ---------------------------------------------------------------------------
+# 12 Case operations (opt-in, default = preserve)
+# ---------------------------------------------------------------------------
+# This file tests case operations IF the user requests them.
+# Default behavior: PRESERVE. So expected_default == input.
+# An expected_lower.csv shows what lower-case mode produces.
+write_text(TD / "12_case_variations.csv", (
+    "id,name,email,product\n"
+    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
+    "2,bob jones,BOB@example.com,GADGET\n"
+    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
+    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
+))
+
+# Default expected: identical to input (case ops are opt-in).
+write_text(EX / "12_case_variations__default.csv", (
+    "id,name,email,product\n"
+    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
+    "2,bob jones,BOB@example.com,GADGET\n"
+    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
+    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
+))
+
+# With --case-email=lower applied to email column only:
+write_text(EX / "12_case_variations__email_lower.csv", (
+    "id,name,email,product\n"
+    "1,ALICE SMITH,alice@example.com,Widget\n"
+    "2,bob jones,bob@example.com,GADGET\n"
+    "3,Carol Brown,carol@example.com,wIdGeT\n"
+    "4,DAN O'CONNOR,dan@example.com,gizmo\n"
+))
+
+# With --case=title applied to name column:
+write_text(EX / "12_case_variations__name_title.csv", (
+    "id,name,email,product\n"
+    "1,Alice Smith,Alice@Example.COM,Widget\n"
+    "2,Bob Jones,BOB@example.com,GADGET\n"
+    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
+    "4,Dan O'Connor,Dan@Example.com,gizmo\n"          # title-case must not break O'C
+))
+
+# ---------------------------------------------------------------------------
+# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
+# ---------------------------------------------------------------------------
+# This is a negative test: the cleaner must not damage characters that
+# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
+write_text(TD / "13_non_latin_scripts.csv", (
+    "id,name,note\n"
+    "1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
+    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
+    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
+    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
+    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
+    "6,caf\u00e9 \u2615,emoji + accent combo\n"
+))
+
+write_text(EX / "13_non_latin_scripts.csv", (
+    "id,name,note\n"
+    "1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
+    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
+    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
+    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
+    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
+    "6,caf\u00e9 \u2615,emoji + accent combo\n"
+))
+
+# ---------------------------------------------------------------------------
+# 14 Mojibake (UTF-8 bytes misread as cp1252/Latin-1, then re-saved as UTF-8)
+# ---------------------------------------------------------------------------
+# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
+# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
+# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
+# false-positive on legitimate strings). Default = WARN, do not auto-fix.
+# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
+write_text(TD / "14_mojibake.csv", (
+    "id,name,city\n"
+    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"        # café, München mojibaked
+    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"  # naïve, résumé
+    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"  # don't via cp1252-mojibake
+    "4,Alice,New York\n"                              # clean control row
+))
+
+# Expected output WITHOUT mojibake fix (default): bytes preserved, but
+# reader emits a warning to logs.
+write_text(EX / "14_mojibake__default.csv", (
+    "id,name,city\n"
+    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
+    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
+    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
+    "4,Alice,New York\n"
+))
+
+# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
+write_text(EX / "14_mojibake__fixed.csv", (
+    "id,name,city\n"
+    "1,caf\u00e9,M\u00fcnchen\n"
+    "2,na\u00efve,r\u00e9sum\u00e9\n"
+    "3,don't,smart-apostrophe mojibake\n"          # smart apostrophe also fixed
+    "4,Alice,New York\n"
+))
+
+# ---------------------------------------------------------------------------
+# 15 Whitespace-only cells (boundary case with script 04)
+# ---------------------------------------------------------------------------
+# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
+# 04 then detects empty strings as disguised null. So 02's job here is
+# just to convert "   " into "".
+write_text(TD / "15_whitespace_only_cells.csv", (
+    "id,value\n"
+    "1,real\n"
+    "2,   \n"                  # spaces only
+    "3,\t\t\n"                 # tabs only
+    "4,\u00A0\u00A0\n"         # NBSP only
+    "5, \t \u00A0 \n"          # mixed whitespace
+    "6,\n"                     # already empty
+    "7,actual value\n"
+))
+
+write_text(EX / "15_whitespace_only_cells.csv", (
+    "id,value\n"
+    "1,real\n"
+    "2,\n"                      # all whitespace -> empty
+    "3,\n"
+    "4,\n"
+    "5,\n"
+    "6,\n"
+    "7,actual value\n"
+))
+
+# ---------------------------------------------------------------------------
+# 16 Dirty headers
+# ---------------------------------------------------------------------------
+# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
+# These break downstream lookups (df["email"] fails because the column
+# is actually called "  Email  " with NBSP padding).
+write_text(TD / "16_dirty_headers.csv", (
+    "  id  ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
+    "1,Alice,alice@example.com,555-1234\n"
+    "2,Bob,bob@example.com,555-5678\n"
+))
+
+# Expected: headers cleaned by SAME rules as data cells.
+# Note: smart quotes around "Email" become straight quotes. The header
+# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
+write_text(EX / "16_dirty_headers.csv", (
+    "id,Customer Name,\"\"\"Email\"\"\",Phone\n"
+    "1,Alice,alice@example.com,555-1234\n"
+    "2,Bob,bob@example.com,555-5678\n"
+))
+
+# ---------------------------------------------------------------------------
+# 17 Preserve-intended (negative tests - things 02 must NOT touch)
+# ---------------------------------------------------------------------------
+# Numbers that LOOK like they have whitespace are tricky: "  123  " is
+# a number with padding (trim) but "1 234" might be a thousands-separator
+# locale (don't collapse). Default: trim outer whitespace, but DO NOT
+# collapse internal whitespace in cells that parse as numeric. This is a
+# judgment call; document it.
+#
+# Also: do not reformat dates, currencies, or phone numbers. That's 03.
+# Do not detect or replace null-like values. That's 04.
+write_text(TD / "17_preserve_intended.csv", (
+    "id,price,european_number,date,phone,quantity\n"
+    "1,  100  ,1 234,2024-01-15,(555) 123-4567,42\n"
+    "2,\"  $1,500.00  \",12 345,15/01/2024,555.123.4567,7\n"
+    "3,  N/A  ,nan,Jan 15 2024,+1 555 123 4567,0\n"
+))
+
+# Expected: outer whitespace trimmed everywhere, but:
+#  - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
+#  - "$1,500.00" stays unchanged (currency, that's 03's domain)
+#  - "15/01/2024" stays unchanged (date, that's 03's domain)
+#  - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
+#  - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
+#  - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
+write_text(EX / "17_preserve_intended.csv", (
+    "id,price,european_number,date,phone,quantity\n"
+    "1,100,1 234,2024-01-15,(555) 123-4567,42\n"
+    "2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
+    "3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
+))
+
+# ---------------------------------------------------------------------------
+# 18 Empty file (zero bytes)
+# ---------------------------------------------------------------------------
+write_bytes(TD / "18_empty_file.csv", b"")
+
+# Expected: graceful handling, output is also empty (or warning emitted).
+write_bytes(EX / "18_empty_file.csv", b"")
+
+# ---------------------------------------------------------------------------
+# 19 Headers only (no data rows)
+# ---------------------------------------------------------------------------
+write_text(TD / "19_headers_only.csv", (
+    "  id  ,Name\u00a0,Email\u200b\n"
+))
+
+# Expected: headers cleaned, no data rows in output.
+write_text(EX / "19_headers_only.csv", (
+    "id,Name,Email\n"
+))
+
+# ---------------------------------------------------------------------------
+# 20 Real-world kitchen sink (everything combined)
+# ---------------------------------------------------------------------------
+# Simulates a typical messy export: came from Excel via cp1252 paste,
+# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
+# has NBSP from copy/paste, has trailing whitespace.
+content = (
+    "  id  ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
+    "1,\u00a0Alice  Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
+    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
+    "3,  Carol  Brown  ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
+    "4,   ,empty@example.com,whitespace-only name (becomes empty)\r\n"
+)
+# Force CRLF on every line ending (the header literal above ends in a bare "\n"), then prepend the UTF-8 BOM.
+data_bytes = b"\xef\xbb\xbf" + content.replace("\r\n", "\n").replace("\n", "\r\n").encode("utf-8")
+write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
+
+# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
+# stripped, internal multi-space collapsed, CRLF normalized to LF,
+# whitespace-only cells become empty, multiplication sign preserved,
+# em-dash and ellipsis converted, prime/double-prime converted.
+write_text(EX / "20_kitchen_sink.csv", (
+    "id,Name,\"\"\"Email\"\"\",Notes\n"
+    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
+    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
+    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
+    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
+))
+
+print("All CSV test files written.")
+print(f"  inputs:   {TD}")
+print(f"  expected: {EX}")