test: add text-cleaner corpus and close gaps surfaced by it
The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:
- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
Smith") while still preserving embedded acronyms; preserve uppercase after
apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (C parser truncates at NUL,
python engine is too strict about embedded literal "), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03
expected; quote the rogue-comma price field in case 17 input
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Generate the XLSX test workbook for 02_text_cleaner.
|
||||
|
||||
Excel-specific pollution patterns that don't appear in CSV:
|
||||
- Cells with leading apostrophe (Excel's force-text prefix; openpyxl
|
||||
surfaces these as plain strings but they show up in real exports)
|
||||
- Multi-line cells from Alt+Enter (carry \\n internally)
|
||||
- Smart quotes from Excel's autocorrect-as-you-type
|
||||
- NBSP padding from copy/paste from Word or web pages
|
||||
- Multiple sheets with different pollution profiles
|
||||
"""
|
||||
from pathlib import Path
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, Alignment
|
||||
|
||||
OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx"
|
||||
|
||||
wb = Workbook()
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 1: Customers - whitespace + smart quotes + NBSP
|
||||
# --------------------------------------------------------------------
|
||||
ws = wb.active
|
||||
ws.title = "Customers"
|
||||
ws.append([" id ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"]) # dirty headers
|
||||
ws.append([1, " Alice Smith ", "Alice@Example.COM", "555-1234"])
|
||||
ws.append([2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"])
|
||||
ws.append([3, "\tCarol\tBrown\t", "CAROL@example.com", " 555-9012 "])
|
||||
ws.append([4, "Dan O\u2019Connor", "dan@example.com", "555-3456"]) # curly apostrophe
|
||||
ws.append([5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"]) # em-dash
|
||||
ws.append([6, " ", "frank@example.com", "555-2468"]) # whitespace-only -> empty for 04
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 2: Notes - multi-line cells, embedded line breaks
|
||||
# --------------------------------------------------------------------
|
||||
ws2 = wb.create_sheet("Notes")
|
||||
ws2.append(["id", "title", "body"])
|
||||
ws2.append([1, "Welcome", "Line one\nLine two\nLine three"])
|
||||
ws2.append([2, "Address", "123 Main St\r\nApt 4B\r\nNew York"]) # mixed line endings inside
|
||||
ws2.append([3, "Quote", "She said \u201chello\u201d and left\u2026"]) # smart quotes + ellipsis
|
||||
ws2.append([4, " padded ", " multiline\n with leading whitespace per line "])
|
||||
# Mark column B with wrap_text so line breaks render in Excel
|
||||
for row in ws2.iter_rows(min_row=2, max_row=ws2.max_row, min_col=3, max_col=3):
|
||||
for cell in row:
|
||||
cell.alignment = Alignment(wrap_text=True)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 3: International - non-Latin scripts and emoji (preserve!)
|
||||
# --------------------------------------------------------------------
|
||||
ws3 = wb.create_sheet("International")
|
||||
ws3.append(["id", "city", "language"])
|
||||
ws3.append([1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"])
|
||||
ws3.append([2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"])
|
||||
ws3.append([3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"])
|
||||
ws3.append([4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"])
|
||||
ws3.append([5, "Caf\u00e9 \u2615", "emoji preserved"])
|
||||
ws3.append([6, "Launch \U0001F389\U0001F680", "more emoji"])
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 4: ForceText - cells originally entered with leading apostrophe
|
||||
# --------------------------------------------------------------------
|
||||
# Excel's force-text prefix '0001234 stores as string "0001234" when read
|
||||
# by openpyxl. Sometimes (broken exports) the apostrophe leaks through as
|
||||
# part of the value. Test that 02 doesn't try to "clean" leading
|
||||
# apostrophes - they may be intentional for ID columns.
|
||||
ws4 = wb.create_sheet("ForceText")
|
||||
ws4.append(["id", "sku", "zip"])
|
||||
ws4.append([1, "0001234", "08540"]) # legitimate leading-zero IDs
|
||||
ws4.append([2, " 0005678 ", "01001"]) # padded - trim outer space, keep zeros
|
||||
ws4.append([3, "'9999999", "10001"]) # leaked apostrophe - PRESERVE (judgment call)
|
||||
|
||||
wb.save(OUT)
|
||||
print(f"Wrote {OUT}")
|
||||
print(f"Sheets: {wb.sheetnames}")
|
||||
Reference in New Issue
Block a user