test: add text-cleaner corpus and close gaps surfaced by it

The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove five small cleaner fixes plus two fixture-generation fixes:

- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20), with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice Smith") while still preserving embedded acronyms; preserve uppercase after apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (C parser truncates at NUL, python engine is too strict about embedded literal "), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03 expected; quote the rogue-comma price field in case 17 input

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:37:35 +00:00
parent 54f92ae47e
commit c349a90e18
50 changed files with 1644 additions and 4 deletions
--- a/test-cases/text-cleaner-corpus/generate_xlsx.py
+++ b/test-cases/text-cleaner-corpus/generate_xlsx.py
@@ -0,0 +1,74 @@
+"""
+Generate the XLSX test workbook for 02_text_cleaner.
+
+Excel-specific pollution patterns that don't appear in CSV:
+  - Cells with leading apostrophe (Excel's force-text prefix; openpyxl
+    surfaces these as plain strings but they show up in real exports)
+  - Multi-line cells from Alt+Enter (carry \\n internally)
+  - Smart quotes from Excel's autocorrect-as-you-type
+  - NBSP padding from copy/paste from Word or web pages
+  - Multiple sheets with different pollution profiles
+"""
+from pathlib import Path
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment
+
+OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx"
+
+wb = Workbook()
+
+# --------------------------------------------------------------------
+# Sheet 1: Customers - whitespace + smart quotes + NBSP
+# --------------------------------------------------------------------
+ws = wb.active
+ws.title = "Customers"
+ws.append(["  id  ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"])  # dirty headers: space padding, NBSP, curly quotes, zero-width space
+ws.append([1, "  Alice  Smith  ", "Alice@Example.COM", "555-1234"])
+ws.append([2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"])  # NBSP as padding and as separator
+ws.append([3, "\tCarol\tBrown\t", "CAROL@example.com", "  555-9012  "])  # tab padding/separator; space-padded phone
+ws.append([4, "Dan O\u2019Connor", "dan@example.com", "555-3456"])  # curly apostrophe
+ws.append([5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"])  # em-dash
+ws.append([6, "   ", "frank@example.com", "555-2468"])  # whitespace-only -> empty for 04
+
+# --------------------------------------------------------------------
+# Sheet 2: Notes - multi-line cells, embedded line breaks
+# --------------------------------------------------------------------
+ws2 = wb.create_sheet("Notes")
+ws2.append(["id", "title", "body"])
+ws2.append([1, "Welcome", "Line one\nLine two\nLine three"])
+ws2.append([2, "Address", "123 Main St\r\nApt 4B\r\nNew York"])  # CRLF line endings inside the cell (row 1 uses LF)
+ws2.append([3, "Quote", "She said \u201chello\u201d and left\u2026"])  # smart quotes + ellipsis
+ws2.append([4, " padded ", "  multiline\n  with leading whitespace per line  "])
+# Mark column C ("body", min_col=max_col=3) with wrap_text so line breaks render in Excel
+for row in ws2.iter_rows(min_row=2, max_row=ws2.max_row, min_col=3, max_col=3):
+    for cell in row:
+        cell.alignment = Alignment(wrap_text=True)
+
+# --------------------------------------------------------------------
+# Sheet 3: International - non-Latin scripts and emoji (preserve!)
+# --------------------------------------------------------------------
+ws3 = wb.create_sheet("International")
+ws3.append(["id", "city", "language"])
+ws3.append([1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"])
+ws3.append([2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"])
+ws3.append([3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"])
+ws3.append([4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"])
+ws3.append([5, "Caf\u00e9 \u2615", "emoji preserved"])
+ws3.append([6, "Launch \U0001F389\U0001F680", "more emoji"])
+
+# --------------------------------------------------------------------
+# Sheet 4: ForceText - cells originally entered with leading apostrophe
+# --------------------------------------------------------------------
+# Excel's force-text prefix '0001234 stores as string "0001234" when read
+# by openpyxl. Sometimes (broken exports) the apostrophe leaks through as
+# part of the value. Test that 02 doesn't try to "clean" leading
+# apostrophes - they may be intentional for ID columns.
+ws4 = wb.create_sheet("ForceText")
+ws4.append(["id", "sku", "zip"])
+ws4.append([1, "0001234", "08540"])     # legitimate leading-zero IDs
+ws4.append([2, "  0005678  ", "01001"])  # padded - trim outer space, keep zeros
+ws4.append([3, "'9999999", "10001"])    # leaked apostrophe - PRESERVE (judgment call)
+
+wb.save(OUT)
+print(f"Wrote {OUT}")
+print(f"Sheets: {wb.sheetnames}")