# datatools-dev/test-cases/text-cleaner-corpus/generate_xlsx.py

"""
Generate the XLSX test workbook for 02_text_cleaner.

Excel-specific pollution patterns that don't appear in CSV:
  - Cells with leading apostrophe (Excel's force-text prefix; openpyxl
    surfaces these as plain strings but they show up in real exports)
  - Multi-line cells from Alt+Enter (carry \\n internally)
  - Smart quotes from Excel's autocorrect-as-you-type
  - NBSP padding from copy/paste from Word or web pages
  - Multiple sheets with different pollution profiles
"""
from pathlib import Path
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

# Destination workbook: test_data/ inside the directory holding this script.
OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx"

wb = Workbook()

# --------------------------------------------------------------------
# Sheet 1: Customers - whitespace + smart quotes + NBSP
# --------------------------------------------------------------------
# The active sheet of the fresh workbook is renamed rather than adding one.
ws = wb.active
ws.title = "Customers"
_customer_rows = (
    # Dirty headers: space padding, NBSP, curly quotes, trailing U+200B.
    ["  id  ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"],
    # Outer padding plus doubled inner spaces; mixed-case email.
    [1, "  Alice  Smith  ", "Alice@Example.COM", "555-1234"],
    # NBSP used both as padding and as the word separator.
    [2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"],
    # Tabs around/between the name parts; space-padded phone.
    [3, "\tCarol\tBrown\t", "CAROL@example.com", "  555-9012  "],
    # Curly apostrophe (U+2019).
    [4, "Dan O\u2019Connor", "dan@example.com", "555-3456"],
    # Em-dash (U+2014).
    [5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"],
    # Whitespace-only name -> empty for 04.
    [6, "   ", "frank@example.com", "555-2468"],
)
for _customer in _customer_rows:
    ws.append(_customer)

# --------------------------------------------------------------------
# Sheet 2: Notes - multi-line cells, embedded line breaks
# --------------------------------------------------------------------
ws2 = wb.create_sheet("Notes")
_note_rows = (
    ["id", "title", "body"],
    # LF line breaks inside the cell.
    [1, "Welcome", "Line one\nLine two\nLine three"],
    # CRLF line breaks inside the cell (the row above uses bare LF).
    [2, "Address", "123 Main St\r\nApt 4B\r\nNew York"],
    # Smart quotes + ellipsis.
    [3, "Quote", "She said \u201chello\u201d and left\u2026"],
    # Padded title; body with leading whitespace on each line.
    [4, " padded ", "  multiline\n  with leading whitespace per line  "],
)
for _note in _note_rows:
    ws2.append(_note)

# Enable wrap_text on the data cells of column C ("body", the third column)
# so the embedded line breaks render in Excel. The header row is skipped.
_body_column = ws2.iter_rows(
    min_row=2, max_row=ws2.max_row, min_col=3, max_col=3
)
for (_body_cell,) in _body_column:
    _body_cell.alignment = Alignment(wrap_text=True)

# --------------------------------------------------------------------
# Sheet 3: International - non-Latin scripts and emoji (preserve!)
# --------------------------------------------------------------------
ws3 = wb.create_sheet("International")
_international_rows = (
    ["id", "city", "language"],
    # CJK text with a space on both sides.
    [1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"],
    # Katakana followed by a trailing space.
    [2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"],
    # Cyrillic preceded by a leading space.
    [3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"],
    # Arabic, no padding.
    [4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"],
    # Accented Latin letter plus a BMP emoji (U+2615).
    [5, "Caf\u00e9 \u2615", "emoji preserved"],
    # Emoji outside the BMP (U+1F389, U+1F680).
    [6, "Launch \U0001F389\U0001F680", "more emoji"],
)
for _place in _international_rows:
    ws3.append(_place)

# --------------------------------------------------------------------
# Sheet 4: ForceText - cells originally entered with leading apostrophe
# --------------------------------------------------------------------
# In Excel, typing '0001234 forces text; openpyxl reads such a cell back as
# the plain string "0001234". Broken exports sometimes leak the apostrophe
# into the stored value itself. This sheet checks that 02 leaves a leading
# apostrophe alone - it may be intentional for ID columns.
ws4 = wb.create_sheet("ForceText")
_force_text_rows = (
    ["id", "sku", "zip"],
    # Legitimate leading-zero IDs.
    [1, "0001234", "08540"],
    # Padded - trim outer space, keep zeros.
    [2, "  0005678  ", "01001"],
    # Leaked apostrophe - PRESERVE (judgment call).
    [3, "'9999999", "10001"],
)
for _item in _force_text_rows:
    ws4.append(_item)

# Create the output directory on demand: saving the workbook does not create
# missing parent directories, so a checkout without test_data/ would
# otherwise fail with FileNotFoundError.
OUT.parent.mkdir(parents=True, exist_ok=True)
wb.save(OUT)
# Report the written path and the sheet names for a quick sanity check.
print(f"Wrote {OUT}")
print(f"Sheets: {wb.sheetnames}")