""" Generate the XLSX test workbook for 02_text_cleaner. Excel-specific pollution patterns that don't appear in CSV: - Cells with leading apostrophe (Excel's force-text prefix; openpyxl surfaces these as plain strings but they show up in real exports) - Multi-line cells from Alt+Enter (carry \\n internally) - Smart quotes from Excel's autocorrect-as-you-type - NBSP padding from copy/paste from Word or web pages - Multiple sheets with different pollution profiles """ from pathlib import Path from openpyxl import Workbook from openpyxl.styles import Font, Alignment OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx" wb = Workbook() # -------------------------------------------------------------------- # Sheet 1: Customers - whitespace + smart quotes + NBSP # -------------------------------------------------------------------- ws = wb.active ws.title = "Customers" ws.append([" id ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"]) # dirty headers ws.append([1, " Alice Smith ", "Alice@Example.COM", "555-1234"]) ws.append([2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"]) ws.append([3, "\tCarol\tBrown\t", "CAROL@example.com", " 555-9012 "]) ws.append([4, "Dan O\u2019Connor", "dan@example.com", "555-3456"]) # curly apostrophe ws.append([5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"]) # em-dash ws.append([6, " ", "frank@example.com", "555-2468"]) # whitespace-only -> empty for 04 # -------------------------------------------------------------------- # Sheet 2: Notes - multi-line cells, embedded line breaks # -------------------------------------------------------------------- ws2 = wb.create_sheet("Notes") ws2.append(["id", "title", "body"]) ws2.append([1, "Welcome", "Line one\nLine two\nLine three"]) ws2.append([2, "Address", "123 Main St\r\nApt 4B\r\nNew York"]) # mixed line endings inside ws2.append([3, "Quote", "She said \u201chello\u201d and left\u2026"]) # smart quotes + ellipsis ws2.append([4, " padded ", " multiline\n with leading whitespace per line "]) # Mark column B with wrap_text so line breaks render in Excel for row in ws2.iter_rows(min_row=2, max_row=ws2.max_row, min_col=3, max_col=3): for cell in row: cell.alignment = Alignment(wrap_text=True) # -------------------------------------------------------------------- # Sheet 3: International - non-Latin scripts and emoji (preserve!) # -------------------------------------------------------------------- ws3 = wb.create_sheet("International") ws3.append(["id", "city", "language"]) ws3.append([1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"]) ws3.append([2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"]) ws3.append([3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"]) ws3.append([4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"]) ws3.append([5, "Caf\u00e9 \u2615", "emoji preserved"]) ws3.append([6, "Launch \U0001F389\U0001F680", "more emoji"]) # -------------------------------------------------------------------- # Sheet 4: ForceText - cells originally entered with leading apostrophe # -------------------------------------------------------------------- # Excel's force-text prefix '0001234 stores as string "0001234" when read # by openpyxl. Sometimes (broken exports) the apostrophe leaks through as # part of the value. Test that 02 doesn't try to "clean" leading # apostrophes - they may be intentional for ID columns. ws4 = wb.create_sheet("ForceText") ws4.append(["id", "sku", "zip"]) ws4.append([1, "0001234", "08540"]) # legitimate leading-zero IDs ws4.append([2, " 0005678 ", "01001"]) # padded - trim outer space, keep zeros ws4.append([3, "'9999999", "10001"]) # leaked apostrophe - PRESERVE (judgment call) wb.save(OUT) print(f"Wrote {OUT}") print(f"Sheets: {wb.sheetnames}")