The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
  with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
  Smith") while still preserving embedded acronyms; preserve uppercase after
  apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (the C parser truncates at NUL,
  and the python engine is too strict about embedded literal " characters),
  per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03
  expected; quote the rogue-comma price field in case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
546 lines
22 KiB
Python
"""
Generator for the 02_text_cleaner test corpus.

Writes raw bytes where exact control over encoding/line-endings/invisible
characters matters. Do not edit the output files in a text editor that
"helpfully" normalizes anything; it will silently break the tests.

Run from the corpus root:
    python generate_test_data.py
"""
|
||
from pathlib import Path
|
||
|
||
# Corpus layout, resolved relative to this script so the working directory
# does not matter: raw inputs go to test_data/, reference outputs to expected/.
ROOT = Path(__file__).parent
TD = ROOT / "test_data"
EX = ROOT / "expected"
# exist_ok=True makes re-runs safe; existing fixtures are simply overwritten.
TD.mkdir(exist_ok=True)
EX.mkdir(exist_ok=True)
def write_bytes(path, data):
    """Write ``data`` to ``path`` exactly as given, replacing any existing file.

    Used for fixtures whose byte content (BOM, mixed line endings, empty
    file) must not pass through any text-mode translation.

    Args:
        path: Destination file; a ``str`` or ``Path``.
        data: The bytes to write.
    """
    Path(path).write_bytes(data)
def write_text(path, text, encoding="utf-8", newline="\n"):
    """Encode ``text`` and write it to ``path`` with explicit line endings.

    Args:
        path: Destination file; a ``str`` or ``Path``.
        text: Content, written with ``"\\n"`` as the row terminator.
        encoding: Codec used to encode ``text``.
        newline: Replacement for every ``"\\n"`` in ``text``. With the default
            the text is written unchanged.

    Note:
        Only ``"\\n"`` is substituted. Any ``"\\r"`` already present in
        ``text`` is left alone, so embedded CR / CRLF sequences survive as
        written when ``newline`` is left at its default.
    """
    # Explicit byte write so we control line endings exactly (text-mode
    # open() would apply the platform's newline translation).
    if newline != "\n":
        text = text.replace("\n", newline)
    Path(path).write_bytes(text.encode(encoding))
# ---------------------------------------------------------------------------
# 01 Whitespace - basic (ASCII space + tab)
# ---------------------------------------------------------------------------
# Rows 4, 6 and 7 carry runs of two or more spaces between words so that
# internal-whitespace collapsing is actually exercised; a single space there
# would pass even if the cleaner never collapsed anything.
write_text(TD / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1, Alice ,New York\n"                  # leading + trailing spaces
    "2,Bob, Chicago\n"                      # leading spaces
    "3,Carol ,San Francisco \n"             # trailing spaces
    "4,Dan  Smith,Austin\n"                 # internal multi-space
    "5,\tEve\t,\tBoston\t\n"                # tab padding
    "6,Frank  van  der  Berg,Denver\n"      # multiple internal multi-space runs
    "7,  Grace  Hopper  ,  Palo  Alto  \n"  # everything at once
))

write_text(EX / "01_whitespace_basic.csv", (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
    "3,Carol,San Francisco\n"
    "4,Dan Smith,Austin\n"
    "5,Eve,Boston\n"
    "6,Frank van der Berg,Denver\n"
    "7,Grace Hopper,Palo Alto\n"
))
# ---------------------------------------------------------------------------
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
# ---------------------------------------------------------------------------
# These are the whitespace-pretenders that .strip() in Python 3 actually
# DOES handle, but that .strip() in many naive implementations (or pandas
# defaults) does NOT. Test that they're stripped, not preserved.
NBSP = "\u00A0"        # non-breaking space (very common from Word/Excel paste)
NNBSP = "\u202F"       # narrow no-break space
IDEO = "\u3000"        # ideographic space (CJK)
EM_SPACE = "\u2003"    # em space
THIN_SPACE = "\u2009"  # thin space
write_text(TD / "02_whitespace_unicode.csv", (
    "id,label,note\n"
    f"1,{NBSP}Premium{NBSP},NBSP padding\n"
    f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
    f"3,{IDEO}Standard{IDEO},ideographic space\n"
    f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
    f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
    f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n"
))

# Expected: outer unicode whitespace trimmed; internal runs (rows 4-5)
# replaced by a single ASCII space.
write_text(EX / "02_whitespace_unicode.csv", (
    "id,label,note\n"
    "1,Premium,NBSP padding\n"
    "2,Discount,narrow NBSP\n"
    "3,Standard,ideographic space\n"
    "4,Tier One,em-space internal\n"
    "5,Cost Plus,thin-space internal\n"
    "6,mixed,ascii + NBSP combined\n"
))
# ---------------------------------------------------------------------------
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
# ---------------------------------------------------------------------------
# This is the #1 source of pollution from data that ever passed through
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
write_text(TD / "03_smart_punctuation.csv", (
    "id,quote,measurement\n"
    "1,\u201cHello world\u201d,5\u2032 11\u2033\n"  # curly double quotes, prime/double-prime
    "2,it\u2019s working,\u2014\n"  # curly apostrophe, em-dash alone
    "3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n"  # en-dash range, curly singles
    "4,wait\u2026,3 \u00d7 4\n"  # ellipsis char, multiplication sign
    "5,\u00abquoted\u00bb,5 \u00b1 0.1\n"  # guillemets, plus-minus
))

# Default policy: ASCII-fy where round-trip-safe.
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
# (case 4 col3, case 5 col3). Document this in TEST-CASES.md.
#
# Cells that end up containing a literal straight double quote must be
# CSV-escaped: the cell is wrapped in quotes and each inner quote is doubled.
write_text(EX / "03_smart_punctuation.csv", (
    "id,quote,measurement\n"
    "1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
    "2,it's working,-\n"
    "3,2020-2024,from 'a' to 'z'\n"
    "4,wait...,3 \u00d7 4\n"
    "5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
))
# ---------------------------------------------------------------------------
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
# ---------------------------------------------------------------------------
# "café" can be either:
#   NFC: "caf\u00e9"   (e-acute as single code point)
#   NFD: "cafe\u0301"  (e + combining acute accent, two code points)
# These look identical but compare unequal. Normalize to NFC.
write_text(TD / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,cafe\u0301,NFD form (e + combining accent)\n"
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,nai\u0308ve,NFD i + combining diaeresis\n"
    # U+FB03 is LATIN SMALL LIGATURE FFI, so the note cell says "ffi", not "fi".
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"  # 'office' written with 'ffi' ligature
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"  # fullwidth A, B, C (U+FF21..U+FF23)
    "7,\u2168 century,roman numeral nine (single code point)\n"  # Ⅸ
))

# Policy: NFC by default (most compatible, smallest, what Excel emits).
# NFKC option would also fold ligatures and fullwidth digits/letters,
# but is destructive for some legitimate text. Default = NFC.
# So:
# - Cases 1 vs 2 should produce identical output after normalization
# - Cases 3 vs 4 should produce identical output
# - Case 5 ligature stays as ligature under NFC (would fold under NFKC)
# - Case 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
write_text(EX / "04_unicode_forms.csv", (
    "id,name,description\n"
    "1,caf\u00e9,NFC form (single code point)\n"
    "2,caf\u00e9,NFD form (e + combining accent)\n"  # same bytes as row 1 now
    "3,na\u00efve,NFC i-diaeresis\n"
    "4,na\u00efve,NFD i + combining diaeresis\n"  # same as row 3 now
    "5,o\uFB03ce,ffi-ligature (\uFB03)\n"
    "6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
    "7,\u2168 century,roman numeral nine (single code point)\n"
))
# ---------------------------------------------------------------------------
# 05 Zero-width / invisible characters
# ---------------------------------------------------------------------------
ZWSP = "\u200B"         # zero-width space
ZWNJ = "\u200C"         # zero-width non-joiner
ZWJ = "\u200D"          # zero-width joiner
LRM = "\u200E"          # left-to-right mark
RLM = "\u200F"          # right-to-left mark
SOFT_HYPHEN = "\u00AD"  # soft hyphen
WORD_JOINER = "\u2060"  # word joiner
write_text(TD / "05_zero_width_invisible.csv", (
    "id,value,note\n"
    f"1,Hel{ZWSP}lo,zero-width space inside word\n"
    f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
    f"3,Trail{ZWSP},trailing ZWSP\n"
    f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
    f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
    f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
    f"7,no{WORD_JOINER}break,word joiner\n"
))

# Expected: every invisible character is removed outright (not replaced by
# a space), so the surrounding letters join up.
write_text(EX / "05_zero_width_invisible.csv", (
    "id,value,note\n"
    "1,Hello,zero-width space inside word\n"
    "2,Leading,leading + internal ZWSP\n"
    "3,Trail,trailing ZWSP\n"
    "4,abc,ZWNJ and ZWJ\n"
    "5,Marked,LTR + RTL marks bracketing\n"
    "6,cooperate,soft hyphen\n"
    "7,nobreak,word joiner\n"
))
# ---------------------------------------------------------------------------
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
# ---------------------------------------------------------------------------
# These bytes show up in real exports from broken systems, terminals, or
# binary data accidentally exported as text.
#   \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
write_text(TD / "06_control_characters.csv", (
    "id,value,note\n"
    "1,Hello\x00World,NUL byte inside\n"
    "2,Bell\x07Sound,BEL character\n"
    "3,Back\x08space,backspace\n"
    "4,Vert\x0BTab,vertical tab\n"
    "5,Form\x0CFeed,form feed\n"
    "6,Esc\x1Bape,ESC character\n"
    "7,Del\x7Fete,DEL character\n"
    "8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
))

# Expected: each control character is dropped with nothing put in its place.
write_text(EX / "06_control_characters.csv", (
    "id,value,note\n"
    "1,HelloWorld,NUL byte inside\n"
    "2,BellSound,BEL character\n"
    "3,Backspace,backspace\n"
    "4,VertTab,vertical tab\n"
    "5,FormFeed,form feed\n"
    "6,Escape,ESC character\n"
    "7,Delete,DEL character\n"
    "8,Mixedjunk,multiple controls in one cell\n"
))
# ---------------------------------------------------------------------------
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
# ---------------------------------------------------------------------------
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
# leaves the BOM as part of the first column's header name if you're not
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
bom = b"\xef\xbb\xbf"
content = (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
).encode("utf-8")
# Written as raw bytes so the BOM lands at offset 0 exactly.
write_bytes(TD / "07_bom_utf8.csv", bom + content)

# Expected: BOM stripped on read, output written WITHOUT BOM, header is
# clean "id" not "\ufeffid".
write_bytes(EX / "07_bom_utf8.csv", content)
# ---------------------------------------------------------------------------
# 08 Line endings - all CRLF (Windows)
# ---------------------------------------------------------------------------
# Default policy: normalize to LF on output.
write_text(TD / "08_line_endings_crlf.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
), newline="\r\n")  # every "\n" above is written as "\r\n"

write_text(EX / "08_line_endings_crlf.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
))
# ---------------------------------------------------------------------------
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
# ---------------------------------------------------------------------------
write_text(TD / "09_line_endings_cr.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
), newline="\r")  # every "\n" above is written as a bare "\r"

write_text(EX / "09_line_endings_cr.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
))
# ---------------------------------------------------------------------------
# 10 Line endings - mixed within the same file
# ---------------------------------------------------------------------------
# Real-world disaster mode: file edited on multiple OSes, or concatenated
# from sources with different conventions.
# Built as a bytes literal so each row keeps its own terminator.
mixed = (
    b"id,name\r\n"
    b"1,Alice\n"
    b"2,Bob\r"
    b"3,Carol\r\n"
    b"4,Dan\n"
)
write_bytes(TD / "10_line_endings_mixed.csv", mixed)

write_text(EX / "10_line_endings_mixed.csv", (
    "id,name\n"
    "1,Alice\n"
    "2,Bob\n"
    "3,Carol\n"
    "4,Dan\n"
))
# ---------------------------------------------------------------------------
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
# ---------------------------------------------------------------------------
# This is the trap: line-ending normalization at the FILE level must not
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
# notes column or an address column).
# But the embedded line endings should also be normalized to LF for
# consistency.
#
# The input is written with the default newline, so row terminators are LF
# and the "\r\n" / "\r" sequences inside the quoted cells reach the file
# exactly as spelled here.
write_text(TD / "11_embedded_newlines.csv", (
    "id,address,notes\n"
    "1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
    "2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
    "3,\"normal\",\"no newlines here\"\n"
))

# Expected: embedded CRLF/CR normalized to LF; cells stay multi-line.
# Cells that need no quoting (no comma, quote, or newline) are written
# unquoted.
write_text(EX / "11_embedded_newlines.csv", (
    "id,address,notes\n"
    "1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
    "2,Single line,\"contains\nclassic mac\ninternal\"\n"
    "3,normal,no newlines here\n"
))
# ---------------------------------------------------------------------------
# 12 Case operations (opt-in, default = preserve)
# ---------------------------------------------------------------------------
# This file tests case operations IF the user requests them.
# Default behavior: PRESERVE. So the __default expected file == input.
# Two further expected files show opt-in modes:
#   12_case_variations__email_lower.csv  (email column lower-cased)
#   12_case_variations__name_title.csv   (name column title-cased)
write_text(TD / "12_case_variations.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
    "2,bob jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))

# Default expected: identical to input (case ops are opt-in).
write_text(EX / "12_case_variations__default.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,Alice@Example.COM,Widget\n"
    "2,bob jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
))

# With --case-email=lower applied to email column only:
write_text(EX / "12_case_variations__email_lower.csv", (
    "id,name,email,product\n"
    "1,ALICE SMITH,alice@example.com,Widget\n"
    "2,bob jones,bob@example.com,GADGET\n"
    "3,Carol Brown,carol@example.com,wIdGeT\n"
    "4,DAN O'CONNOR,dan@example.com,gizmo\n"
))

# With --case=title applied to name column:
write_text(EX / "12_case_variations__name_title.csv", (
    "id,name,email,product\n"
    "1,Alice Smith,Alice@Example.COM,Widget\n"
    "2,Bob Jones,BOB@example.com,GADGET\n"
    "3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
    "4,Dan O'Connor,Dan@Example.com,gizmo\n"  # title-case must not break O'C
))
# ---------------------------------------------------------------------------
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
# ---------------------------------------------------------------------------
# This is a negative test: the cleaner must not damage characters that
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
write_text(TD / "13_non_latin_scripts.csv", (
    "id,name,note\n"
    "1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
    "6,caf\u00e9 \u2615,emoji + accent combo\n"
))

# Expected: only row 1 changes (outer ASCII spaces trimmed); every other
# cell is identical to the input.
write_text(EX / "13_non_latin_scripts.csv", (
    "id,name,note\n"
    "1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
    "2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
    "3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
    "4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
    "5,\U0001F389 launch \U0001F680,emoji preserved\n"
    "6,caf\u00e9 \u2615,emoji + accent combo\n"
))
# ---------------------------------------------------------------------------
# 14 Mojibake (double-encoded UTF-8 / cp1252 misread as Latin-1)
# ---------------------------------------------------------------------------
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
# saved as UTF-8 again. "café" becomes "café", "naïve" becomes "naïve".
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
write_text(TD / "14_mojibake.csv", (
    "id,name,city\n"
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"  # café, München mojibaked
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"  # naïve, résumé
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"  # don't via cp1252-mojibake
    "4,Alice,New York\n"  # clean control row
))

# Expected output WITHOUT mojibake fix (default): bytes preserved, but
# reader emits a warning to logs.
write_text(EX / "14_mojibake__default.csv", (
    "id,name,city\n"
    "1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
    "2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
    "3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
    "4,Alice,New York\n"
))

# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
write_text(EX / "14_mojibake__fixed.csv", (
    "id,name,city\n"
    "1,caf\u00e9,M\u00fcnchen\n"
    "2,na\u00efve,r\u00e9sum\u00e9\n"
    "3,don't,smart-apostrophe mojibake\n"  # smart apostrophe also fixed
    "4,Alice,New York\n"
))
# ---------------------------------------------------------------------------
# 15 Whitespace-only cells (boundary case with script 04)
# ---------------------------------------------------------------------------
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
# 04 then detects empty strings as disguised null. So 02's job here is
# just to convert " " into "".
write_text(TD / "15_whitespace_only_cells.csv", (
    "id,value\n"
    "1,real\n"
    "2, \n"  # spaces only
    "3,\t\t\n"  # tabs only
    "4,\u00A0\u00A0\n"  # NBSP only
    "5, \t \u00A0 \n"  # mixed whitespace
    "6,\n"  # already empty
    "7,actual value\n"
))

write_text(EX / "15_whitespace_only_cells.csv", (
    "id,value\n"
    "1,real\n"
    "2,\n"  # all whitespace -> empty
    "3,\n"
    "4,\n"
    "5,\n"
    "6,\n"
    "7,actual value\n"
))
# ---------------------------------------------------------------------------
# 16 Dirty headers
# ---------------------------------------------------------------------------
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
# These break downstream lookups (df["email"] fails because the column
# is actually called " Email " with NBSP padding).
#
# The header row below has, in order: ASCII-space padding, NBSP padding,
# curly double quotes, and a trailing zero-width space.
write_text(TD / "16_dirty_headers.csv", (
    " id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
))

# Expected: headers cleaned by SAME rules as data cells.
# Note: smart quotes around "Email" become straight quotes. The header
# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
write_text(EX / "16_dirty_headers.csv", (
    "id,Customer Name,\"\"\"Email\"\"\",Phone\n"
    "1,Alice,alice@example.com,555-1234\n"
    "2,Bob,bob@example.com,555-5678\n"
))
# ---------------------------------------------------------------------------
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
# ---------------------------------------------------------------------------
# Numbers that LOOK like they have whitespace are tricky: " 123 " is
# a number with padding (trim) but "1 234" might be a thousands-separator
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
# collapse internal whitespace in cells that parse as numeric. This is a
# judgment call; document it.
#
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
# Do not detect or replace null-like values. That's 04.
write_text(TD / "17_preserve_intended.csv", (
    "id,price,european_number,date,phone,quantity\n"
    "1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
    # The price cell contains a comma, so it must be quoted to stay one field.
    "2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
    "3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
))

# Expected: outer whitespace trimmed everywhere, but:
# - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
# - "$1,500.00" stays unchanged (currency, that's 03's domain)
# - "15/01/2024" stays unchanged (date, that's 03's domain)
# - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
# - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
# - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
write_text(EX / "17_preserve_intended.csv", (
    "id,price,european_number,date,phone,quantity\n"
    "1,100,1 234,2024-01-15,(555) 123-4567,42\n"
    "2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
    "3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
))
# ---------------------------------------------------------------------------
# 18 Empty file (zero bytes)
# ---------------------------------------------------------------------------
write_bytes(TD / "18_empty_file.csv", b"")

# Expected: graceful handling, output is also empty (or warning emitted).
write_bytes(EX / "18_empty_file.csv", b"")
# ---------------------------------------------------------------------------
# 19 Headers only (no data rows)
# ---------------------------------------------------------------------------
# Header cells carry ASCII-space padding, a trailing NBSP, and a trailing
# zero-width space respectively.
write_text(TD / "19_headers_only.csv", (
    " id ,Name\u00a0,Email\u200b\n"
))

# Expected: headers cleaned, no data rows in output.
write_text(EX / "19_headers_only.csv", (
    "id,Name,Email\n"
))
# ---------------------------------------------------------------------------
# 20 Real-world kitchen sink (everything combined)
# ---------------------------------------------------------------------------
# Simulates a typical messy export: came from Excel via cp1252 paste,
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
# has NBSP from copy/paste, has trailing whitespace.
content = (
    " id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
    "1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
    "2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
    "3, Carol Brown ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
    "4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
)
data_bytes = b"\xef\xbb\xbf" + content.encode("utf-8")
# Replace LF with CRLF wherever it isn't already, to be unambiguous: the
# header row above ends in a bare "\n". Collapsing CRLF to LF first keeps
# the existing CRLF rows from turning into "\r\r\n".
data_bytes = data_bytes.replace(b"\r\n", b"\n").replace(b"\n", b"\r\n")
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)

# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
# stripped, internal multi-space collapsed, CRLF normalized to LF,
# whitespace-only cells become empty, multiplication sign preserved,
# em-dash and ellipsis converted, prime/double-prime converted.
write_text(EX / "20_kitchen_sink.csv", (
    "id,Name,\"\"\"Email\"\"\",Notes\n"
    "1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
    "2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
    "3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
    "4,,empty@example.com,whitespace-only name (becomes empty)\n"
))
# Final summary so the operator can see where the fixtures were written.
print("All CSV test files written.")
print(f" inputs: {TD}")
print(f" expected: {EX}")