test: add text-cleaner corpus and close gaps surfaced by it
The 21-fixture corpus (test-cases/text-cleaner-corpus/) exercises the cleaner
end-to-end against the spec in TEST-CASES.md. Closing the failing cases drove
five small cleaner fixes plus two fixture-generation fixes:
- _SMART_CHARS: add prime, double prime, guillemets (case 03)
- _ZERO_WIDTH: add soft hyphen U+00AD (case 05)
- clean_dataframe: clean column headers via the same pipeline (cases 16/19/20),
with a clean_headers toggle on CleanOptions
- smart_title_case: title-case full-shout strings ("ALICE SMITH" -> "Alice
Smith") while still preserving embedded acronyms; preserve uppercase after
apostrophe in names ("O'CONNOR" -> "O'Connor", "o'neil" -> "O'neil")
- test_corpus.py reader: pre-strip NUL bytes (C parser truncates at NUL,
python engine is too strict about embedded literal "), per spec case 06
- generate_test_data.py: properly CSV-escape literal-quote cells in case 03
expected; quote the rogue-comma price field in case 17 input
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,10 @@ _SMART_CHARS: dict[str, str] = {
|
||||
"―": "-", # HORIZONTAL BAR
|
||||
"−": "-", # MINUS SIGN
|
||||
"…": "...", # HORIZONTAL ELLIPSIS
|
||||
"′": "'", # PRIME (foot / minute marker)
|
||||
"″": '"', # DOUBLE PRIME (inch / second marker)
|
||||
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
" ": " ", # NO-BREAK SPACE
|
||||
" ": " ", # NARROW NO-BREAK SPACE
|
||||
" ": " ", # THIN SPACE
|
||||
@@ -62,6 +66,7 @@ _ZERO_WIDTH = (
|
||||
"" # LEFT-TO-RIGHT MARK
|
||||
"" # RIGHT-TO-LEFT MARK
|
||||
"" # ZERO WIDTH NO-BREAK SPACE / BOM
|
||||
"" # SOFT HYPHEN
|
||||
)
|
||||
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
|
||||
|
||||
@@ -159,27 +164,37 @@ def _is_all_caps_token(token: str) -> bool:
|
||||
def smart_title_case(s: str) -> str:
|
||||
"""Title-case that preserves all-caps tokens and lowercases mid-string particles.
|
||||
|
||||
- ``USA`` stays ``USA``.
|
||||
- ``USA`` stays ``USA`` when surrounded by mixed-case words (acronym).
|
||||
- ``ALICE SMITH`` becomes ``Alice Smith`` (entire string is shouting).
|
||||
- ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
|
||||
- Apostrophes inside words don't restart capitalization (``O'Neil``).
|
||||
"""
|
||||
if not isinstance(s, str) or not s:
|
||||
return s
|
||||
tokens = s.split(" ")
|
||||
# If every cased token is all-caps, treat the whole string as SHOUT and
|
||||
# title-case it. Otherwise preserve all-caps tokens as acronyms.
|
||||
cased_tokens = [t for t in tokens if any(c.isalpha() for c in t)]
|
||||
all_shouting = bool(cased_tokens) and all(
|
||||
not any(c.islower() for c in t) for t in cased_tokens
|
||||
)
|
||||
out: list[str] = []
|
||||
last_idx = len(tokens) - 1
|
||||
for i, tok in enumerate(tokens):
|
||||
if not tok:
|
||||
out.append(tok)
|
||||
continue
|
||||
if _is_all_caps_token(tok):
|
||||
if not all_shouting and _is_all_caps_token(tok):
|
||||
out.append(tok)
|
||||
continue
|
||||
lowered = tok.lower()
|
||||
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
|
||||
out.append(lowered)
|
||||
continue
|
||||
# Capitalize first cased character; preserve apostrophes/hyphens
|
||||
# Capitalize first cased character. Inside a token, preserve the
|
||||
# original capitalization of the letter immediately after an
|
||||
# apostrophe so name patterns like ``O'Connor``/``D'Angelo`` survive
|
||||
# while ``o'neil`` -> ``O'neil`` stays lowercase.
|
||||
chars = list(tok)
|
||||
capitalized = False
|
||||
for j, c in enumerate(chars):
|
||||
@@ -188,7 +203,12 @@ def smart_title_case(s: str) -> str:
|
||||
chars[j] = c.upper()
|
||||
capitalized = True
|
||||
else:
|
||||
chars[j] = c.lower()
|
||||
prev = chars[j - 1] if j > 0 else ""
|
||||
if prev == "'" and c.isupper():
|
||||
# Preserve original uppercase after apostrophe.
|
||||
pass
|
||||
else:
|
||||
chars[j] = c.lower()
|
||||
out.append("".join(chars))
|
||||
return " ".join(out)
|
||||
|
||||
@@ -291,6 +311,11 @@ class CleanOptions:
|
||||
strip_control: bool = True
|
||||
normalize_line_endings: bool = True
|
||||
|
||||
# Apply the same character-level pipeline to column headers. Headers carry
|
||||
# the same pollution as data cells (NBSP padding, smart quotes, ZWSP);
|
||||
# not cleaning them silently breaks df["col"] lookups downstream.
|
||||
clean_headers: bool = True
|
||||
|
||||
# Case conversion: either a single mode applied to all selected columns,
|
||||
# or a dict mapping column name -> mode for per-column control.
|
||||
case: Optional[CaseMode] = None
|
||||
@@ -440,6 +465,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
|
||||
out = df.copy()
|
||||
columns = _select_columns(out, options)
|
||||
|
||||
if options.clean_headers:
|
||||
new_columns = [clean_value(c, options)[0] for c in out.columns]
|
||||
if new_columns != list(out.columns):
|
||||
# Track column mapping so case_columns/columns/skip_columns based
|
||||
# on the original (dirty) names continue to work after rename.
|
||||
rename = dict(zip(out.columns, new_columns))
|
||||
columns = [rename.get(c, c) for c in columns]
|
||||
out.columns = new_columns
|
||||
|
||||
case_per_col: dict[str, CaseMode] = dict(options.case_columns)
|
||||
if options.case is not None:
|
||||
for c in columns:
|
||||
|
||||
51
test-cases/text-cleaner-corpus/README.md
Normal file
51
test-cases/text-cleaner-corpus/README.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Text Cleaner Test Corpus
|
||||
|
||||
Test fixtures for `02_text_cleaner.py` (Excel & CSV Data Cleaning Mastery Bundle).
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
text-cleaner-corpus/
|
||||
├── README.md # This file
|
||||
├── TEST-CASES.md # Full taxonomy and expected behavior per test
|
||||
├── generate_test_data.py # Regenerates the 20 CSV inputs and expected outputs
|
||||
├── generate_xlsx.py # Regenerates the multi-sheet XLSX fixture
|
||||
├── test_data/ # Inputs (21 fixtures: 20 CSV + 1 XLSX)
|
||||
└── expected/ # Expected outputs (with default and flag variants)
|
||||
```
|
||||
|
||||
## Quick start
|
||||
|
||||
Read `TEST-CASES.md` from top to bottom. Sections 1 (scope boundary) and 2 (default config assumed) are load-bearing; the per-test details in Section 4 don't make sense without them.
|
||||
|
||||
To regenerate the test files (e.g., after editing the generator):
|
||||
```bash
|
||||
python generate_test_data.py
|
||||
python generate_xlsx.py
|
||||
```
|
||||
|
||||
To use as pytest fixtures: see Section 6 of `TEST-CASES.md`.
|
||||
|
||||
## Coverage summary
|
||||
|
||||
| Category | Fixtures |
|
||||
|---|---|
|
||||
| Whitespace (ASCII + Unicode) | 01, 02 |
|
||||
| Smart punctuation | 03 |
|
||||
| Unicode normalization | 04 |
|
||||
| Invisible / zero-width / control | 05, 06 |
|
||||
| BOM | 07 |
|
||||
| Line endings (file-level + embedded) | 08, 09, 10, 11 |
|
||||
| Case operations (opt-in) | 12 |
|
||||
| International script preservation | 13 |
|
||||
| Mojibake | 14 |
|
||||
| Boundary with script 04 (missing values) | 15 |
|
||||
| Headers | 16, 19 |
|
||||
| Negative tests (must NOT touch) | 17 |
|
||||
| File-level edge cases | 18, 19 |
|
||||
| Integration | 20 |
|
||||
| Excel-specific (multi-sheet, Alt+Enter) | 21 |
|
||||
|
||||
## Out of scope
|
||||
|
||||
Documented in `TEST-CASES.md` Section 5: encoding detection, large-file performance, GUI behavior, file-locking, CLI argument parsing. Each needs its own test layer.
|
||||
509
test-cases/text-cleaner-corpus/TEST-CASES.md
Normal file
509
test-cases/text-cleaner-corpus/TEST-CASES.md
Normal file
@@ -0,0 +1,509 @@
|
||||
# TEST-CASES.md - `02_text_cleaner.py` Test Corpus
|
||||
|
||||
**Version**: 1.0
|
||||
**Last updated**: April 29, 2026
|
||||
**Companion to**: TECHNICAL.md Section 9 (script boundaries) and the per-script functional spec template introduced in TECHNICAL.md Section 10.1.
|
||||
|
||||
## Purpose of this document
|
||||
|
||||
Defines the complete set of behaviors `02_text_cleaner.py` is expected to exhibit, with one test fixture per behavior. Used as:
|
||||
|
||||
1. The build target when porting the (currently skeleton) script to working state.
|
||||
2. The pytest input set once the script ships.
|
||||
3. The acceptance criteria for the GUI port (every fixture must produce its expected output through both CLI and Streamlit GUI).
|
||||
|
||||
Each test case has an input file in `test_data/` and (where exact-diff comparison applies) an expected-output file in `expected/`.
|
||||
|
||||
---
|
||||
|
||||
## 1. Scope boundary (what 02 owns vs what it doesn't)
|
||||
|
||||
This is the load-bearing decision. Every contested case routes back to it.
|
||||
|
||||
**02 owns: character-level hygiene only.**
|
||||
|
||||
- Whitespace normalization (outer trim + internal collapse for text columns).
|
||||
- Unicode normalization (NFC by default, NFKC opt-in).
|
||||
- Smart-punctuation ASCII-fication (curly quotes, em/en dash, ellipsis, primes).
|
||||
- Invisible / zero-width character stripping.
|
||||
- Control character stripping (with explicit allowlist for tab/newline inside quoted cells).
|
||||
- BOM detection on input, never written on output.
|
||||
- Line-ending normalization at the file level AND inside multi-line cells.
|
||||
- Optional case operations (per-column, opt-in only).
|
||||
|
||||
**02 does NOT own:**
|
||||
|
||||
| Concern | Owned by |
|
||||
|---|---|
|
||||
| Detecting and replacing nulls / sentinel codes | `04_missing_value_handler` |
|
||||
| Reformatting dates, currencies, phones, names, addresses | `03_format_standardizer` |
|
||||
| Outlier detection or domain-rule violations | `06_outlier_detector` |
|
||||
| Renaming or reordering columns | `05_column_mapper_enforcer` |
|
||||
| Deduplication (even though dedup normalizes internally) | `01_deduplicator` |
|
||||
| File encoding detection on read | The shared I/O layer in `src/core/io.py` |
|
||||
|
||||
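**Invariant 02 must preserve:** after running 02, the schema (column count, column order, row count) is unchanged. 02 changes cell *content*, never *structure*. Two nuances: a cell containing only whitespace becomes an empty string, but the cell still exists and the row is not dropped; and header *text* is cleaned by the same character-level rules as data cells (case 16), but columns are never added, dropped, reordered, or semantically renamed.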
|
||||
|
||||
---
|
||||
|
||||
## 2. Default configuration assumed by these tests
|
||||
|
||||
Tests assume the default config below. Any test that exercises a non-default flag explicitly says so in its description.
|
||||
|
||||
| Setting | Default | Notes |
|
||||
|---|---|---|
|
||||
| `--trim` | on | Strip leading/trailing whitespace including Unicode whitespace (NBSP, NNBSP, ideographic space, etc.) |
|
||||
| `--collapse-internal` | on (text columns only) | Collapse runs of internal whitespace to a single ASCII space, ONLY in cells that don't parse as numeric, date, or phone-shaped |
|
||||
| `--unicode-form` | NFC | NFKC available as opt-in; folds ligatures and fullwidth |
|
||||
| `--smart-quotes` | on | Curly to straight, em/en dash to hyphen, ellipsis to `...`, primes to `'`/`"` |
|
||||
| `--strip-zero-width` | on | ZWSP, ZWJ, ZWNJ, LRM, RLM, soft hyphen, word joiner |
|
||||
| `--strip-controls` | on | Strip C0 (except `\t\n\r` inside quoted cells) and DEL |
|
||||
| `--strip-bom` | on | BOM removed on read; never written on output |
|
||||
| `--line-endings` | LF | File-level AND embedded-cell line endings normalized to LF |
|
||||
| `--case` | none | Case operations are opt-in per column |
|
||||
| `--fix-mojibake` | off | Logged as warning by default; opt-in repair via ftfy |
|
||||
| `--columns` | all | All text columns processed; `--columns name,email` restricts |
|
||||
|
||||
**Idempotency requirement:** for any input X, `clean(clean(X)) == clean(X)`. This is a property test, not a fixture-comparison test. Every fixture below should be run through the cleaner twice and produce identical output both times.
|
||||
|
||||
---
|
||||
|
||||
## 3. Test case index
|
||||
|
||||
| # | File | Category | What it tests | Diff-testable |
|
||||
|---|---|---|---|---|
|
||||
| 01 | `01_whitespace_basic.csv` | Whitespace | ASCII space + tab, leading/trailing/internal | Yes |
|
||||
| 02 | `02_whitespace_unicode.csv` | Whitespace | NBSP, narrow NBSP, ideographic, em/thin space | Yes |
|
||||
| 03 | `03_smart_punctuation.csv` | Punctuation | Curly quotes, em/en dash, ellipsis, primes | Yes |
|
||||
| 04 | `04_unicode_forms.csv` | Unicode | NFC vs NFD, ligatures, fullwidth, presentation forms | Yes |
|
||||
| 05 | `05_zero_width_invisible.csv` | Invisible | ZWSP, ZWJ, ZWNJ, LRM, RLM, soft hyphen | Yes |
|
||||
| 06 | `06_control_characters.csv` | Control | NUL, BEL, BS, VT, FF, ESC, DEL | Yes |
|
||||
| 07 | `07_bom_utf8.csv` | Encoding | UTF-8 BOM at file start | Yes (byte-exact) |
|
||||
| 08 | `08_line_endings_crlf.csv` | Line endings | All CRLF (Windows) | Yes (byte-exact) |
|
||||
| 09 | `09_line_endings_cr.csv` | Line endings | All CR (classic Mac) | Yes (byte-exact) |
|
||||
| 10 | `10_line_endings_mixed.csv` | Line endings | CRLF + LF + CR mixed in one file | Yes (byte-exact) |
|
||||
| 11 | `11_embedded_newlines.csv` | Line endings | Newlines inside quoted cells (preserve, normalize) | Yes |
|
||||
| 12 | `12_case_variations.csv` | Case | Mixed case across name/email/product columns | 3 outputs (default + 2 modes) |
|
||||
| 13 | `13_non_latin_scripts.csv` | Preservation | Chinese, Japanese, Arabic, Russian, emoji | Yes |
|
||||
| 14 | `14_mojibake.csv` | Encoding | Double-encoded UTF-8 (warn-by-default; fix opt-in) | 2 outputs (default + fixed) |
|
||||
| 15 | `15_whitespace_only_cells.csv` | Boundary (vs 04) | Cells containing only whitespace become empty | Yes |
|
||||
| 16 | `16_dirty_headers.csv` | Headers | Headers themselves have whitespace, BOM, smart quotes | Yes |
|
||||
| 17 | `17_preserve_intended.csv` | Negative | Things 02 must NOT touch | Yes |
|
||||
| 18 | `18_empty_file.csv` | Edge | Zero-byte file | Yes |
|
||||
| 19 | `19_headers_only.csv` | Edge | Headers but no data rows | Yes |
|
||||
| 20 | `20_kitchen_sink.csv` | Integration | Everything combined in one file | Yes |
|
||||
| 21 | `21_excel_pollution.xlsx` | Excel-specific | Multi-sheet, Alt+Enter cells, force-text, copy-paste pollution | No (manual) |
|
||||
|
||||
---
|
||||
|
||||
## 4. Per-test details
|
||||
|
||||
### 01 - Whitespace basic
|
||||
|
||||
**File**: `test_data/01_whitespace_basic.csv` -> `expected/01_whitespace_basic.csv`
|
||||
|
||||
Tests the core whitespace contract on ASCII space and tab characters. Every kind of placement: leading-only, trailing-only, both, internal-multiple, tab-padded, multiple internal multi-space runs in one cell, all of the above combined.
|
||||
|
||||
**Expected behavior:**
|
||||
- Leading and trailing whitespace stripped from every cell.
|
||||
- Internal runs of whitespace collapsed to a single ASCII space.
|
||||
- Tabs treated as whitespace by both rules.
|
||||
|
||||
**Why it matters:** This is the highest-frequency real-world pollution. Trailing-space pollution alone is what the v1.5 audit identified as the gap that motivated creating script 02 in the first place (DECISIONS.md v1.5 entry).
|
||||
|
||||
---
|
||||
|
||||
### 02 - Whitespace, Unicode
|
||||
|
||||
**File**: `test_data/02_whitespace_unicode.csv` -> `expected/02_whitespace_unicode.csv`
|
||||
|
||||
The whitespace pretenders. Python's `str.strip()` with no argument actually does strip these in 3.x, but a lot of cleaners written by people who were burned in 2.x explicitly pass `' \t\n'` and miss them. Excel and Word produce these constantly when you copy from a styled document.
|
||||
|
||||
Characters covered: NBSP (U+00A0), narrow NBSP (U+202F), ideographic space (U+3000), em space (U+2003), thin space (U+2009).
|
||||
|
||||
**Expected behavior:** treated identically to ASCII space - trimmed at edges, collapsed internally.
|
||||
|
||||
**Why it matters:** "It looks fine but the join doesn't match" debugging sessions almost always end here. NBSP-padded keys are the silent killer.
|
||||
|
||||
---
|
||||
|
||||
### 03 - Smart punctuation
|
||||
|
||||
**File**: `test_data/03_smart_punctuation.csv` -> `expected/03_smart_punctuation.csv`
|
||||
|
||||
Curly quotes, dashes, ellipsis, primes - the autocorrect-as-you-type damage from Word/Excel. ASCII-fy where round-trip-safe.
|
||||
|
||||
| Input | Output | Notes |
|
||||
|---|---|---|
|
||||
| `\u201c` `\u201d` (curly double) | `"` | |
|
||||
| `\u2018` `\u2019` (curly single) | `'` | Includes apostrophe |
|
||||
| `\u2014` (em-dash) | `-` | |
|
||||
| `\u2013` (en-dash) | `-` | |
|
||||
| `\u2026` (ellipsis) | `...` | |
|
||||
| `\u2032` (prime) | `'` | |
|
||||
| `\u2033` (double prime) | `"` | |
|
||||
| `\u00ab` `\u00bb` (guillemets) | `"` | |
|
||||
| `\u00d7` (multiplication sign) | **preserved** | Not safely round-trip-able to ASCII; `x` would be wrong |
|
||||
| `\u00b1` (plus-minus) | **preserved** | Same reasoning |
|
||||
|
||||
**Why it matters:** smart-quote pollution breaks regex, breaks downstream parsers, and breaks string equality joins. The two preservation cases (multiplication, plus-minus) are deliberate - they have no faithful ASCII equivalent and forcing one is destructive.
|
||||
|
||||
---
|
||||
|
||||
### 04 - Unicode normalization forms
|
||||
|
||||
**File**: `test_data/04_unicode_forms.csv` -> `expected/04_unicode_forms.csv`
|
||||
|
||||
`café` can be encoded two ways:
|
||||
|
||||
- NFC: `caf\u00e9` (one code point, e-acute as a unit)
|
||||
- NFD: `cafe\u0301` (two code points, plain e + combining accent)
|
||||
|
||||
These render identically. They compare unequal. They have different lengths. macOS filesystem defaults to NFD, which means a CSV exported from a Mac and joined against a CSV from Excel can silently fail.
|
||||
|
||||
Default normalization: NFC (most compact, what Excel emits, what most Western databases expect).
|
||||
|
||||
**Cases covered:**
|
||||
- Pre-composed (NFC) e-acute and i-diaeresis.
|
||||
- Decomposed (NFD) versions of the same.
|
||||
- The `\uFB03` `ffi` ligature - **preserved** under NFC (NFKC would fold it to `ffi`).
|
||||
- Fullwidth Latin letters (`\uFF21\uFF22\uFF23` = `ABC`) - **preserved** under NFC.
|
||||
- Roman numeral nine character (`\u2168`) - **preserved** under NFC.
|
||||
|
||||
After cleaning, rows 1 and 2 must produce identical bytes (NFC and NFD both normalized to NFC). Same for rows 3 and 4.
|
||||
|
||||
**Why it matters:** Mac-vs-Windows data joins. Catches "they look the same but won't match" bugs.
|
||||
|
||||
**Opt-in `--unicode-form=NFKC` test:** not provided as a fixture but should exist as a unit test. Under NFKC, ligature folds to `ffi`, fullwidth folds to ASCII `ABC`, roman numeral folds to `IX`. NFKC is destructive for some legitimate text (mathematical notation, some CJK content) so it stays opt-in.
|
||||
|
||||
---
|
||||
|
||||
### 05 - Zero-width and invisible characters
|
||||
|
||||
**File**: `test_data/05_zero_width_invisible.csv` -> `expected/05_zero_width_invisible.csv`
|
||||
|
||||
These bytes show up from rich-text copy/paste, from RTL text, from accidentally-included U+FEFF in the middle of a cell (yes, this happens), and from some web-form pastes.
|
||||
|
||||
Characters covered: U+200B (ZWSP), U+200C (ZWNJ), U+200D (ZWJ), U+200E (LRM), U+200F (RLM), U+00AD (soft hyphen), U+2060 (word joiner).
|
||||
|
||||
**Expected behavior:** all stripped unconditionally. None of these has a legitimate role in tabular data cells, even when there's a domain reason for them in prose (typesetting Arabic, hyphenation hints in long-form text). For a CSV, they're noise.
|
||||
|
||||
**Why it matters:** these are the *truly invisible* polluters. You can stare at the cell forever and not see them. They break joins, they bloat string lengths, they hash differently. The first time a buyer hits a zero-width-space in a customer name, this test is what saves them.
|
||||
|
||||
---
|
||||
|
||||
### 06 - Control characters
|
||||
|
||||
**File**: `test_data/06_control_characters.csv` -> `expected/06_control_characters.csv`
|
||||
|
||||
The C0 controls (U+0000 through U+001F) plus DEL (U+007F). Test cases: NUL, BEL, BS, VT, FF, ESC, DEL, and a multi-control combination.
|
||||
|
||||
**Expected behavior:** all stripped from cell content.
|
||||
|
||||
**The exception:** tab (U+0009), LF (U+000A), and CR (U+000D) are NOT stripped from inside quoted cells. Tab might be intentional formatting; LF/CR are handled by line-ending normalization (case 11). Outside of quoted cells, tab is whitespace and gets normalized like space.
|
||||
|
||||
**Why it matters:** real-world exports from broken systems, half-corrupted database dumps, copy-paste from terminals (including ANSI escape sequences starting with ESC), and binary data accidentally exported as text all leave these in cells. A NUL byte mid-string breaks C-string-based parsers; a BEL makes terminals beep when you `cat` the file; ESC sequences corrupt logs.
|
||||
|
||||
---
|
||||
|
||||
### 07 - UTF-8 BOM
|
||||
|
||||
**File**: `test_data/07_bom_utf8.csv` -> `expected/07_bom_utf8.csv` (byte-exact comparison)
|
||||
|
||||
File starts with the three-byte sequence `EF BB BF`. Excel writes UTF-8 with BOM by default. Pandas `read_csv` usually handles this but leaves the BOM as part of the first column header name unless you pass `encoding='utf-8-sig'`. Result: a mystery column called `\ufeffid` that breaks every `df["id"]` lookup downstream.
|
||||
|
||||
**Expected behavior:**
|
||||
- BOM stripped on read.
|
||||
- First column header is the clean string `id`, not `\ufeffid`.
|
||||
- Output file is written WITHOUT a BOM.
|
||||
|
||||
**Diff target:** byte-for-byte equality with `expected/07_bom_utf8.csv`. The expected file must NOT have the BOM.
|
||||
|
||||
**Why it matters:** Excel-origin data is the dominant input for the target buyer. Getting BOM handling wrong silently breaks the rest of the pipeline.
|
||||
|
||||
---
|
||||
|
||||
### 08, 09, 10 - Line endings: CRLF, CR-only, mixed
|
||||
|
||||
**Files**: `08_line_endings_crlf.csv`, `09_line_endings_cr.csv`, `10_line_endings_mixed.csv`
|
||||
|
||||
- 08: every line ends with CRLF (`\r\n`). Standard Windows.
|
||||
- 09: every line ends with CR (`\r`) only. Classic Mac. Rare but seen.
|
||||
- 10: the same file mixes all three styles, line by line: CRLF, LF, CR, CRLF, LF.
|
||||
|
||||
**Expected behavior on output:** all lines end with LF (`\n`). Byte-exact match to the expected files.
|
||||
|
||||
**Why LF as the default output:** it's what Linux uses, what every modern code editor handles, what Git stores by default, and what Streamlit / pandas write by default. CRLF is an option for buyers who specifically need Windows-style output, but the default minimizes round-trip surprises.
|
||||
|
||||
**Why it matters:** mixed line endings cause "ghost rows" in some parsers, blank lines in some editors, and silent data loss in any tool that splits on one specific newline pattern. Case 10 is the disaster scenario - multi-source concat - and is the most important of the three.
|
||||
|
||||
---
|
||||
|
||||
### 11 - Embedded newlines inside quoted cells
|
||||
|
||||
**File**: `test_data/11_embedded_newlines.csv` -> `expected/11_embedded_newlines.csv`
|
||||
|
||||
The trap. File-level line-ending normalization must NOT collapse intentional newlines inside multi-line cells (addresses, notes columns). But the embedded line endings *should still* be normalized to LF for consistency.
|
||||
|
||||
**Expected behavior:**
|
||||
- File-level line endings: LF.
|
||||
- Embedded CRLF inside a quoted cell: normalized to LF.
|
||||
- Embedded CR inside a quoted cell: normalized to LF.
|
||||
- Cell stays multi-line; the newline character count inside the cell is preserved.
|
||||
|
||||
**Why it matters:** an address column with `123 Main St\r\nApt 4B\r\nNew York` is the canonical legitimate multi-line cell. A naive `text.replace('\r\n', '\n')` works correctly. A naive `text.split('\n')` to "remove blank lines" destroys the address. The cleaner must understand CSV quoting.
|
||||
|
||||
---
|
||||
|
||||
### 12 - Case operations (opt-in)
|
||||
|
||||
**Files**: input `12_case_variations.csv`; three expected outputs:
|
||||
- `expected/12_case_variations__default.csv` (no flag - identity)
|
||||
- `expected/12_case_variations__email_lower.csv` (`--case email=lower`)
|
||||
- `expected/12_case_variations__name_title.csv` (`--case name=title`)
|
||||
|
||||
Default behavior is **preserve case**. Case operations are opt-in per column because:
|
||||
|
||||
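- Lowercasing emails is almost always right (per RFC 5321 the domain is case-insensitive; the local part is formally case-sensitive, but in practice virtually every mail provider treats it as case-insensitive).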
|
||||
- Title-casing names is almost always right (`ALICE SMITH` -> `Alice Smith`), but must handle apostrophes correctly (`O'CONNOR` -> `O'Connor`, not `O'connor`).
|
||||
- Lowercasing product codes is almost always WRONG (`SKU-A1B2` is a code, not prose).
|
||||
|
||||
So the tool offers per-column case ops, never a global one. The expected outputs cover the two most common configurations.
|
||||
|
||||
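**Tricky case to verify:** row 4 name `DAN O'CONNOR`. Under `--case name=title` this must become `Dan O'Connor`, not `Dan O'connor`. Neither stdlib shortcut is safe: `string.capwords()` produces `Dan O'connor` (it only capitalizes the first letter of each whitespace-separated word), and `str.title()` gets this row right only by accident - it restarts capitalization after every apostrophe, so `don't` becomes `Don'T`. Implementations should use apostrophe-aware logic, e.g. preserving an uppercase letter that directly follows an apostrophe in the input.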
|
||||
|
||||
**Why it matters:** dedup quality (script `01_deduplicator`) depends on consistent case in the comparison columns. Buyers running 02 before 01 expect this to "just work" for the email column.
|
||||
|
||||
---
|
||||
|
||||
### 13 - Non-Latin scripts and emoji (preservation negative test)
|
||||
|
||||
**File**: `test_data/13_non_latin_scripts.csv` -> `expected/13_non_latin_scripts.csv`
|
||||
|
||||
Negative test: cleaning must not damage characters outside the Latin/punctuation block. Trim and NFC still apply (row 1 has leading and trailing space, which gets trimmed).
|
||||
|
||||
Coverage: Chinese (Beijing), Japanese (katakana test), Arabic RTL, Cyrillic Russian, supplementary-plane emoji (party popper U+1F389, rocket U+1F680), accent + emoji combo (`café ☕`).
|
||||
|
||||
**Expected behavior:** only whitespace and NFC normalization apply. All script-significant characters preserved exactly.
|
||||
|
||||
**Why it matters:** the cleaner must be safe on international buyer data. Stripping "weird-looking" characters because they're outside ASCII is a textbook bug. Emoji in particular are in the supplementary planes (above U+FFFF) and naive byte-level filters often mangle them.
|
||||
|
||||
---
|
||||
|
||||
### 14 - Mojibake
|
||||
|
||||
**Files**: input `14_mojibake.csv`; two expected outputs:
|
||||
- `expected/14_mojibake__default.csv` (no flag - bytes preserved, warning logged)
|
||||
- `expected/14_mojibake__fixed.csv` (`--fix-mojibake` - heuristic repair)
|
||||
|
||||
Mojibake is the result of UTF-8 bytes being interpreted as cp1252 or Latin-1 and re-saved as UTF-8. Classic patterns:
|
||||
|
||||
- `café` becomes `cafÃ©`
|
||||
- `München` becomes `MÃ¼nchen`
|
||||
- `naïve` becomes `naÃ¯ve`
|
||||
- The smart apostrophe in `don’t` becomes `donâ€™t`
|
||||
|
||||
**Default behavior: warn, do NOT auto-fix.** Reasoning: mojibake repair is heuristic, and the heuristic can false-positive on legitimate strings that happen to contain `Ã` followed by another Latin-1 character. The right call for a tool sold to non-experts is to flag the suspicious pattern in the log and let the user opt in.
|
||||
|
||||
**With `--fix-mojibake` (uses ftfy or equivalent):** repair attempted. The expected output shows fully repaired text including the smart-apostrophe-via-cp1252 case, which ftfy specifically handles.
|
||||
|
||||
**Why it matters:** mojibake is silent corruption. The customer doesn't know it happened until a name shows up wrong on a printed invoice. Flagging it is the responsible default.
|
||||
|
||||
---
|
||||
|
||||
### 15 - Whitespace-only cells (the 02-vs-04 boundary)
|
||||
|
||||
**File**: `test_data/15_whitespace_only_cells.csv` -> `expected/15_whitespace_only_cells.csv`
|
||||
|
||||
Per TECHNICAL.md Section 9.3: 02 trims whitespace first, leaving an empty string. Script 04 then detects empty strings as disguised null. So 02's job in this file is to convert `" "`, `"\t\t"`, `"\u00A0\u00A0"`, and mixed-whitespace cells all into `""`.
|
||||
|
||||
**What 02 does NOT do here:**
|
||||
- Does not decide whether the cell is "missing." That's 04's call.
|
||||
- Does not write `NaN` or `N/A` or any other sentinel. Just produces empty string.
|
||||
- Does not drop the row. Schema is invariant.
|
||||
|
||||
**Expected behavior:** every whitespace-only cell becomes empty. Row count unchanged. Headers untouched.
|
||||
|
||||
**Why it matters:** this is the single most-relitigated boundary in the bundle. Documenting it via fixture prevents drift.
|
||||
|
||||
---
|
||||
|
||||
### 16 - Dirty headers
|
||||
|
||||
**File**: `test_data/16_dirty_headers.csv` -> `expected/16_dirty_headers.csv`
|
||||
|
||||
Headers themselves are subject to all the same pollution as data cells. A header `" Email "` (NBSP-padded) breaks `df["Email"]` lookups because the actual column name has NBSP padding. Smart-quoted header `"\u201cEmail\u201d"` is even worse.
|
||||
|
||||
**Expected behavior:** headers cleaned by the same rules as data. Note that the smart-quoted header `"Email"` (with surrounding quotes) becomes a header value containing literal ASCII double quotes, which then requires CSV-quoting in the output. The expected file is written with proper CSV escaping.
|
||||
|
||||
**Why it matters:** broken column names break every downstream join, every selectbox in the GUI, and every CLI flag that takes a column name. Cleaning headers is non-negotiable.
|
||||
|
||||
---
|
||||
|
||||
### 17 - Preserve-intended (negative tests)
|
||||
|
||||
**File**: `test_data/17_preserve_intended.csv` -> `expected/17_preserve_intended.csv`
|
||||
|
||||
The negative-test file. Things 02 must NOT touch because they belong to other scripts:
|
||||
|
||||
| Cell content | What 02 does | What 02 does NOT do |
|
||||
|---|---|---|
|
||||
| ` 100 ` | Trims to `100` | Doesn't reformat as `$100.00` (that's 03) |
|
||||
| `1 234` | Preserves as `1 234` | Doesn't collapse internal space (looks numeric, European thousand-sep) |
|
||||
| `$1,500.00` | Trims outer whitespace | Doesn't reformat currency (that's 03) |
|
||||
| `2024-01-15` | Trims outer whitespace | Doesn't reformat date (that's 03) |
|
||||
| `(555) 123-4567` | Trims outer whitespace | Doesn't reformat phone (that's 03); does not collapse internal space |
|
||||
| `+1 555 123 4567` | Trims outer whitespace | Same; phone-shaped, leave internal spacing alone |
|
||||
| `N/A` | Trims to `N/A` | Doesn't replace with empty or NaN (that's 04) |
|
||||
| `nan` | Trims to `nan` | Doesn't replace with empty or NaN (that's 04) |
|
||||
|
||||
The internal-whitespace heuristic: if a cell parses as numeric, looks like a date, or matches a phone-shape regex (digits + common separators), do NOT collapse internal whitespace. Only collapse in cells classified as free text. This requires a per-cell check; document it in the implementation.
|
||||
|
||||
**Why it matters:** scope discipline. If 02 starts reformatting dates because "while we're trimming whitespace anyway", it stops being 02 and starts being a worse 03. The DECISIONS.md Section 4a rule (functional scope) cuts the other way too: 02 must not reach into other scripts' territory.
|
||||
|
||||
---
|
||||
|
||||
### 18 - Empty file
|
||||
|
||||
**File**: `test_data/18_empty_file.csv` (zero bytes) -> `expected/18_empty_file.csv` (zero bytes)
|
||||
|
||||
**Expected behavior:** graceful no-op. Either produces an empty output file with a logged warning, or emits a clean error message naming the problem ("Input file is empty"). What it MUST NOT do: crash with `pandas.errors.EmptyDataError` traceback in the GUI.
|
||||
|
||||
**Why it matters:** error UX standard from DECISIONS.md Section 4b - errors that name the problem and the fix, not stack traces.
|
||||
|
||||
---
|
||||
|
||||
### 19 - Headers only (no data rows)
|
||||
|
||||
**File**: `test_data/19_headers_only.csv` -> `expected/19_headers_only.csv`
|
||||
|
||||
Just headers, no data. Headers themselves are dirty (whitespace + NBSP + ZWSP).
|
||||
|
||||
**Expected behavior:** headers cleaned, output is clean headers + no data rows. No crash, no warning required (it's a legitimate state).
|
||||
|
||||
**Why it matters:** template files often look like this. The buyer might be cleaning a template before populating it. Don't punish them for it.
|
||||
|
||||
---
|
||||
|
||||
### 20 - Kitchen sink (integration)
|
||||
|
||||
**File**: `test_data/20_kitchen_sink.csv` -> `expected/20_kitchen_sink.csv`
|
||||
|
||||
The integration test. Combines:
|
||||
|
||||
- UTF-8 BOM at file start.
|
||||
- CRLF line endings throughout.
|
||||
- Headers with leading/trailing space, NBSP, smart quotes, ZWSP.
|
||||
- Data cells with NBSP, internal multi-space, smart quotes, em-dash, ellipsis, primes (foot/inch markers).
|
||||
- A whitespace-only cell that should become empty.
|
||||
- Multiplication sign (preserved).
|
||||
|
||||
**Expected output:** every transformation applied correctly, schema unchanged, file written as UTF-8 (no BOM) with LF line endings.
|
||||
|
||||
**Why it matters:** this is the one fixture that catches transformation-order bugs. If smart-quote replacement runs before whitespace trim, you get different output than the other order. Picking and locking the order is part of the implementation; the fixture verifies it.
|
||||
|
||||
**Recommended transformation pipeline order** (informative, not normative):
|
||||
|
||||
1. Decode bytes -> strip BOM at file level.
|
||||
2. Normalize file-level line endings -> LF.
|
||||
3. Parse CSV (with proper quoting for embedded newlines).
|
||||
4. Per cell, in order:
|
||||
a. Unicode NFC normalize.
|
||||
b. Strip zero-width and control characters (but not tab, CR, or LF - those are handled by steps e-g).
|
||||
c. Strip BOM if it appears mid-cell.
|
||||
d. Smart-quote ASCII-fy.
|
||||
e. Normalize embedded line endings to LF.
|
||||
f. Whitespace trim (outer).
|
||||
g. Internal whitespace collapse (text columns only - check after trim).
|
||||
h. Per-column case op (if configured).
|
||||
5. Headers go through the same per-cell pipeline.
|
||||
6. Write as UTF-8, LF line endings, no BOM.
|
||||
|
||||
---
|
||||
|
||||
### 21 - Excel pollution (multi-sheet XLSX)
|
||||
|
||||
**File**: `test_data/21_excel_pollution.xlsx` (no expected file - manual / programmatic verification per sheet)
|
||||
|
||||
Four sheets, each isolating an Excel-specific concern:
|
||||
|
||||
**Sheet `Customers`** - dirty headers (NBSP, smart quotes, ZWSP) and dirty data cells (NBSP padding, tab padding, smart apostrophe in `O'Connor`, em-dash). One whitespace-only `name` cell to verify the 02/04 boundary applies on XLSX too.
|
||||
|
||||
**Sheet `Notes`** - multi-line cells from Alt+Enter (LF inside cell), plus a cell with mixed CRLF inside (from someone pasting Windows text into Excel). Cells have wrap_text formatting set so the line breaks render in Excel. After cleaning, all in-cell line breaks should be LF.
|
||||
|
||||
**Sheet `International`** - non-Latin scripts and emoji with surrounding whitespace. Verifies the preservation contract from case 13 holds for XLSX.
|
||||
|
||||
**Sheet `ForceText`** - leading-zero IDs (e.g., `0001234`). These must not be stripped of leading zeros (that's not 02's job - it doesn't change semantic content). Row 3 has a leaked apostrophe (`'9999999`) from a force-text cell - this is a judgment call but the default is to preserve it; trying to detect "leaked apostrophe" is too error-prone.
|
||||
|
||||
**Why it matters:** XLSX has pollution patterns that don't appear in CSV (Alt+Enter cells, force-text apostrophes, sheet structure). The XLSX reader path needs the same cleaning logic as the CSV reader path; this fixture verifies that.
|
||||
|
||||
---
|
||||
|
||||
## 5. What this corpus does NOT cover
|
||||
|
||||
Listed so the gap is explicit, not hidden:
|
||||
|
||||
1. **Encoding detection** (cp1252 input, Latin-1 input, UTF-16). That's the I/O layer's job, not 02's transformation logic. Once the reader produces a Python `str`, 02 operates the same regardless of source encoding. Add I/O-layer fixtures separately when that layer is built.
|
||||
2. **Performance / large files**. No multi-GB fixture is included because it bloats the repo. Add a benchmark (not a unit test) targeting a 500MB CSV; verify processing completes without OOM via chunked reads.
|
||||
3. **Streamlit UI behavior**. The fixtures verify cleaning correctness; verifying the GUI shows the right preview, applies the right defaults, and renders cleaning in the diff view is a separate test layer (probably manual, possibly Playwright).
|
||||
4. **Concurrency / file-locking** (e.g., user has the input file open in Excel). Expected to fail with a clean error, not corrupt data. Add a manual test, not a fixture.
|
||||
5. **CLI argument parsing** for the various flags. Each flag should have a Typer-level test, separate from the fixtures here.
|
||||
|
||||
---
|
||||
|
||||
## 6. How to use this corpus
|
||||
|
||||
### As a build target
|
||||
Each fixture is one piece of the spec. Implement the cleaner against fixture 01, run, diff, fix, repeat. Then move on to fixture 02, and so on. By the time fixture 20 passes (and fixture 21 has been checked per sheet), the script is done.
|
||||
|
||||
### As pytest fixtures
|
||||
```python
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from src.core.text_cleaner import clean_csv
|
||||
|
||||
CORPUS = Path("tests/corpus") # wherever this folder lands
|
||||
|
||||
@pytest.mark.parametrize("name", [
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "18_empty_file",
    "19_headers_only",
    "20_kitchen_sink",
])
def test_default_config(name, tmp_path):
    """Clean one fixture with the default config and compare bytes exactly.

    The comparison is byte-for-byte, so encoding, BOM handling and line
    endings are all part of the assertion, not just the cell text.
    """
    # NOTE: for 18_empty_file the spec also allows a clean error message
    # instead of an empty output file; this test assumes the output-file
    # option. Test the error path separately if the cleaner raises.
    inp = CORPUS / "test_data" / f"{name}.csv"
    expected = (CORPUS / "expected" / f"{name}.csv").read_bytes()
    out = tmp_path / "out.csv"
    clean_csv(inp, out)  # default config
    assert out.read_bytes() == expected
|
||||
|
||||
# Cases 12 and 14 have multiple expected files; parametrize them separately
|
||||
# with the relevant flags.
|
||||
|
||||
# Idempotency property test - applies to every fixture:
|
||||
@pytest.mark.parametrize(
    "name",
    # Every CSV input fixture, discovered from disk so the list cannot drift
    # from the corpus. Sorted for a stable test order.
    sorted(p.stem for p in (CORPUS / "test_data").glob("*.csv")),
)
def test_idempotent(name, tmp_path):
    """Cleaning an already-cleaned file must not change it.

    Runs the cleaner on the raw fixture, then again on its own output, and
    requires the two outputs to be byte-identical.
    """
    inp = CORPUS / "test_data" / f"{name}.csv"
    out1 = tmp_path / "out1.csv"
    out2 = tmp_path / "out2.csv"
    clean_csv(inp, out1)
    clean_csv(out1, out2)
    assert out1.read_bytes() == out2.read_bytes()
|
||||
```
|
||||
|
||||
### Regenerating fixtures
|
||||
If a default policy changes (e.g., switch the default Unicode form from NFC to NFKC, which would be a meaningful policy decision), the fixtures in `expected/` need regenerating. Edit `generate_test_data.py` and re-run. Document the policy change in DECISIONS.md before doing this.
|
||||
@@ -0,0 +1,8 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
3,Carol,San Francisco
|
||||
4,Dan Smith,Austin
|
||||
5,Eve,Boston
|
||||
6,Frank van der Berg,Denver
|
||||
7,Grace Hopper,Palo Alto
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,label,note
|
||||
1,Premium,NBSP padding
|
||||
2,Discount,narrow NBSP
|
||||
3,Standard,ideographic space
|
||||
4,Tier One,em-space internal
|
||||
5,Cost Plus,thin-space internal
|
||||
6,mixed,ascii + NBSP combined
|
||||
|
@@ -0,0 +1,6 @@
|
||||
id,quote,measurement
|
||||
1,"""Hello world""","5' 11"""
|
||||
2,it's working,-
|
||||
3,2020-2024,from 'a' to 'z'
|
||||
4,wait...,3 × 4
|
||||
5,"""quoted""",5 ± 0.1
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,name,description
|
||||
1,café,NFC form (single code point)
|
||||
2,café,NFD form (e + combining accent)
|
||||
3,naïve,NFC i-diaeresis
|
||||
4,naïve,NFD i + combining diaeresis
|
||||
5,office,fi-ligature (ffi)
|
||||
6,ABC,fullwidth ABC
|
||||
7,Ⅸ century,roman numeral nine (single code point)
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value,note
|
||||
1,Hello,zero-width space inside word
|
||||
2,Leading,leading + internal ZWSP
|
||||
3,Trail,trailing ZWSP
|
||||
4,abc,ZWNJ and ZWJ
|
||||
5,Marked,LTR + RTL marks bracketing
|
||||
6,cooperate,soft hyphen
|
||||
7,nobreak,word joiner
|
||||
|
@@ -0,0 +1,9 @@
|
||||
id,value,note
|
||||
1,HelloWorld,NUL byte inside
|
||||
2,BellSound,BEL character
|
||||
3,Backspace,backspace
|
||||
4,VertTab,vertical tab
|
||||
5,FormFeed,form feed
|
||||
6,Escape,ESC character
|
||||
7,Delete,DEL character
|
||||
8,Mixedjunk,multiple controls in one cell
|
||||
|
3
test-cases/text-cleaner-corpus/expected/07_bom_utf8.csv
Normal file
3
test-cases/text-cleaner-corpus/expected/07_bom_utf8.csv
Normal file
@@ -0,0 +1,3 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
4,Dan
|
||||
|
@@ -0,0 +1,9 @@
|
||||
id,address,notes
|
||||
1,"123 Main St
|
||||
Apt 4B
|
||||
New York, NY","line1
|
||||
line2"
|
||||
2,Single line,"contains
|
||||
classic mac
|
||||
internal"
|
||||
3,normal,no newlines here
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,Alice@Example.COM,Widget
|
||||
2,bob jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,DAN O'CONNOR,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,alice@example.com,Widget
|
||||
2,bob jones,bob@example.com,GADGET
|
||||
3,Carol Brown,carol@example.com,wIdGeT
|
||||
4,DAN O'CONNOR,dan@example.com,gizmo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,Alice Smith,Alice@Example.COM,Widget
|
||||
2,Bob Jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,Dan O'Connor,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,name,note
|
||||
1,中国北京,Beijing in Chinese (with leading/trailing space)
|
||||
2,テスト,Japanese katakana (test)
|
||||
3,تجربة,Arabic (test) - RTL
|
||||
4,Москва,Russian (Moscow)
|
||||
5,🎉 launch 🚀,emoji preserved
|
||||
6,café ☕,emoji + accent combo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don’t,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don't,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value
|
||||
1,real
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,actual value
|
||||
|
@@ -0,0 +1,3 @@
|
||||
id,Customer Name,"""Email""",Phone
|
||||
1,Alice,alice@example.com,555-1234
|
||||
2,Bob,bob@example.com,555-5678
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,price,european_number,date,phone,quantity
|
||||
1,100,1 234,2024-01-15,(555) 123-4567,42
|
||||
2,"$1,500.00",12 345,15/01/2024,555.123.4567,7
|
||||
3,N/A,nan,Jan 15 2024,+1 555 123 4567,0
|
||||
|
@@ -0,0 +1 @@
|
||||
id,Name,Email
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,Name,"""Email""",Notes
|
||||
1,Alice Smith,Alice@Example.COM,"""VIP"" customer - contact ASAP..."
|
||||
2,Bob Jones,bob@example.com,"it's 5'6"" tall"
|
||||
3,Carol Brown,CAROL@EXAMPLE.COM,3 × 4 = 12 (preserve ×)
|
||||
4,,empty@example.com,whitespace-only name (becomes empty)
|
||||
|
545
test-cases/text-cleaner-corpus/generate_test_data.py
Normal file
545
test-cases/text-cleaner-corpus/generate_test_data.py
Normal file
@@ -0,0 +1,545 @@
|
||||
"""
|
||||
Generator for the 02_text_cleaner test corpus.
|
||||
|
||||
Writes raw bytes where exact control over encoding/line-endings/invisible
|
||||
characters matters. Do not edit the output files in a text editor that
|
||||
"helpfully" normalizes anything; it will silently break the tests.
|
||||
|
||||
Run from the corpus root:
|
||||
python generate_test_data.py
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).parent
|
||||
TD = ROOT / "test_data"
|
||||
EX = ROOT / "expected"
|
||||
TD.mkdir(exist_ok=True)
|
||||
EX.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
def write_bytes(path, data: bytes) -> None:
    """Write *data* to *path* exactly as given.

    Used for fixtures whose raw bytes matter (BOM prefix, mixed line
    endings), where any text-mode translation would corrupt the test input.

    Args:
        path: Destination file; anything ``pathlib.Path`` accepts.
        data: Raw bytes to write. An existing file is overwritten.
    """
    Path(path).write_bytes(data)
|
||||
|
||||
|
||||
def write_text(path, text: str, encoding: str = "utf-8", newline: str = "\n") -> None:
    """Encode *text* and write it to *path* as raw bytes.

    The file is written in binary form rather than text mode so the line
    endings on disk are exactly the ones requested here.

    Args:
        path: Destination file; anything ``pathlib.Path`` accepts.
        text: Content, written with ``"\\n"`` as its line separator.
        encoding: Codec used to encode *text*.
        newline: Replacement for every ``"\\n"`` in *text*. With the default
            ``"\\n"`` the text is written untouched. Only ``"\\n"`` is
            rewritten; any ``"\\r"`` already present is written unchanged.
    """
    # Explicit byte write so we control line endings exactly.
    if newline != "\n":
        text = text.replace("\n", newline)
    Path(path).write_bytes(text.encode(encoding))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 01 Whitespace - basic (ASCII space + tab)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "01_whitespace_basic.csv", (
|
||||
"id,name,city\n"
|
||||
"1, Alice ,New York\n" # leading + trailing spaces
|
||||
"2,Bob, Chicago\n" # leading spaces
|
||||
"3,Carol ,San Francisco \n" # trailing spaces
|
||||
"4,Dan Smith,Austin\n" # internal multi-space
|
||||
"5,\tEve\t,\tBoston\t\n" # tab padding
|
||||
"6,Frank van der Berg,Denver\n" # multiple internal multi-space runs
|
||||
"7, Grace Hopper , Palo Alto \n" # everything at once
|
||||
))
|
||||
|
||||
write_text(EX / "01_whitespace_basic.csv", (
|
||||
"id,name,city\n"
|
||||
"1,Alice,New York\n"
|
||||
"2,Bob,Chicago\n"
|
||||
"3,Carol,San Francisco\n"
|
||||
"4,Dan Smith,Austin\n"
|
||||
"5,Eve,Boston\n"
|
||||
"6,Frank van der Berg,Denver\n"
|
||||
"7,Grace Hopper,Palo Alto\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
|
||||
# ---------------------------------------------------------------------------
|
||||
# These are the whitespace-pretenders that .strip() in Python 3 actually
|
||||
# DOES handle, but that .strip() in many naive implementations (or pandas
|
||||
# defaults) does NOT. Test that they're stripped, not preserved.
|
||||
NBSP = "\u00A0" # non-breaking space (very common from Word/Excel paste)
|
||||
NNBSP = "\u202F" # narrow no-break space
|
||||
IDEO = "\u3000" # ideographic space (CJK)
|
||||
EM_SPACE = "\u2003" # em space
|
||||
THIN_SPACE = "\u2009" # thin space
|
||||
write_text(TD / "02_whitespace_unicode.csv", (
|
||||
"id,label,note\n"
|
||||
f"1,{NBSP}Premium{NBSP},NBSP padding\n"
|
||||
f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
|
||||
f"3,{IDEO}Standard{IDEO},ideographic space\n"
|
||||
f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
|
||||
f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
|
||||
f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n"
|
||||
))
|
||||
|
||||
write_text(EX / "02_whitespace_unicode.csv", (
|
||||
"id,label,note\n"
|
||||
"1,Premium,NBSP padding\n"
|
||||
"2,Discount,narrow NBSP\n"
|
||||
"3,Standard,ideographic space\n"
|
||||
"4,Tier One,em-space internal\n"
|
||||
"5,Cost Plus,thin-space internal\n"
|
||||
"6,mixed,ascii + NBSP combined\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is the #1 source of pollution from data that ever passed through
|
||||
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
|
||||
write_text(TD / "03_smart_punctuation.csv", (
|
||||
"id,quote,measurement\n"
|
||||
"1,\u201cHello world\u201d,5\u2032 11\u2033\n" # curly double quotes, prime/double-prime
|
||||
"2,it\u2019s working,\u2014\n" # curly apostrophe, em-dash alone
|
||||
"3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n" # en-dash range, curly singles
|
||||
"4,wait\u2026,3 \u00d7 4\n" # ellipsis char, multiplication sign
|
||||
"5,\u00abquoted\u00bb,5 \u00b1 0.1\n" # guillemets, plus-minus
|
||||
))
|
||||
|
||||
# Default policy: ASCII-fy where round-trip-safe.
|
||||
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
|
||||
# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
|
||||
# (case 4 col3, case 5 col3). Document this in TEST-CASES.md.
|
||||
write_text(EX / "03_smart_punctuation.csv", (
|
||||
"id,quote,measurement\n"
|
||||
"1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
|
||||
"2,it's working,-\n"
|
||||
"3,2020-2024,from 'a' to 'z'\n"
|
||||
"4,wait...,3 \u00d7 4\n"
|
||||
"5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
|
||||
# ---------------------------------------------------------------------------
|
||||
# "café" can be either:
|
||||
# NFC: "caf\u00e9" (e-acute as single code point)
|
||||
# NFD: "cafe\u0301" (e + combining acute accent, two code points)
|
||||
# These look identical but compare unequal. Normalize to NFC.
|
||||
write_text(TD / "04_unicode_forms.csv", (
|
||||
"id,name,description\n"
|
||||
"1,caf\u00e9,NFC form (single code point)\n"
|
||||
"2,cafe\u0301,NFD form (e + combining accent)\n"
|
||||
"3,na\u00efve,NFC i-diaeresis\n"
|
||||
"4,nai\u0308ve,NFD i + combining diaeresis\n"
|
||||
"5,o\uFB03ce,fi-ligature (\uFB03)\n" # 'office' written with 'ffi' ligature
|
||||
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n" # A B C
|
||||
"7,\u2168 century,roman numeral nine (single code point)\n" # Ⅸ
|
||||
))
|
||||
|
||||
# Policy: NFC by default (most compatible, smallest, what Excel emits).
|
||||
# NFKC option would also fold ligatures and fullwidth digits/letters,
|
||||
# but is destructive for some legitimate text. Default = NFC.
|
||||
# So:
|
||||
# - Cases 1 vs 2 should produce identical output after normalization
|
||||
# - Cases 3 vs 4 should produce identical output
|
||||
# - Case 5 ligature stays as ligature under NFC (would fold under NFKC)
|
||||
# - Case 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
|
||||
write_text(EX / "04_unicode_forms.csv", (
|
||||
"id,name,description\n"
|
||||
"1,caf\u00e9,NFC form (single code point)\n"
|
||||
"2,caf\u00e9,NFD form (e + combining accent)\n" # same bytes as row 1 now
|
||||
"3,na\u00efve,NFC i-diaeresis\n"
|
||||
"4,na\u00efve,NFD i + combining diaeresis\n" # same as row 3 now
|
||||
"5,o\uFB03ce,fi-ligature (\uFB03)\n"
|
||||
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
|
||||
"7,\u2168 century,roman numeral nine (single code point)\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 05 Zero-width / invisible characters
|
||||
# ---------------------------------------------------------------------------
|
||||
ZWSP = "\u200B" # zero-width space
|
||||
ZWNJ = "\u200C" # zero-width non-joiner
|
||||
ZWJ = "\u200D" # zero-width joiner
|
||||
LRM = "\u200E" # left-to-right mark
|
||||
RLM = "\u200F" # right-to-left mark
|
||||
SOFT_HYPHEN = "\u00AD"
|
||||
WORD_JOINER = "\u2060"
|
||||
write_text(TD / "05_zero_width_invisible.csv", (
|
||||
"id,value,note\n"
|
||||
f"1,Hel{ZWSP}lo,zero-width space inside word\n"
|
||||
f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
|
||||
f"3,Trail{ZWSP},trailing ZWSP\n"
|
||||
f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
|
||||
f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
|
||||
f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
|
||||
f"7,no{WORD_JOINER}break,word joiner\n"
|
||||
))
|
||||
|
||||
write_text(EX / "05_zero_width_invisible.csv", (
|
||||
"id,value,note\n"
|
||||
"1,Hello,zero-width space inside word\n"
|
||||
"2,Leading,leading + internal ZWSP\n"
|
||||
"3,Trail,trailing ZWSP\n"
|
||||
"4,abc,ZWNJ and ZWJ\n"
|
||||
"5,Marked,LTR + RTL marks bracketing\n"
|
||||
"6,cooperate,soft hyphen\n"
|
||||
"7,nobreak,word joiner\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
|
||||
# ---------------------------------------------------------------------------
|
||||
# These bytes show up in real exports from broken systems, terminals, or
|
||||
# binary data accidentally exported as text.
|
||||
# \x00 NUL, \x01 SOH, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL
|
||||
write_text(TD / "06_control_characters.csv", (
|
||||
"id,value,note\n"
|
||||
"1,Hello\x00World,NUL byte inside\n"
|
||||
"2,Bell\x07Sound,BEL character\n"
|
||||
"3,Back\x08space,backspace\n"
|
||||
"4,Vert\x0BTab,vertical tab\n"
|
||||
"5,Form\x0CFeed,form feed\n"
|
||||
"6,Esc\x1Bape,ESC character\n"
|
||||
"7,Del\x7Fete,DEL character\n"
|
||||
"8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
|
||||
))
|
||||
|
||||
write_text(EX / "06_control_characters.csv", (
|
||||
"id,value,note\n"
|
||||
"1,HelloWorld,NUL byte inside\n"
|
||||
"2,BellSound,BEL character\n"
|
||||
"3,Backspace,backspace\n"
|
||||
"4,VertTab,vertical tab\n"
|
||||
"5,FormFeed,form feed\n"
|
||||
"6,Escape,ESC character\n"
|
||||
"7,Delete,DEL character\n"
|
||||
"8,Mixedjunk,multiple controls in one cell\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
|
||||
# leaves the BOM as part of the first column's header name if you're not
|
||||
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
|
||||
bom = b"\xef\xbb\xbf"
|
||||
content = (
|
||||
"id,name,city\n"
|
||||
"1,Alice,New York\n"
|
||||
"2,Bob,Chicago\n"
|
||||
).encode("utf-8")
|
||||
write_bytes(TD / "07_bom_utf8.csv", bom + content)
|
||||
|
||||
# Expected: BOM stripped on read, output written WITHOUT BOM, header is
|
||||
# clean "id" not "\ufeffid".
|
||||
write_bytes(EX / "07_bom_utf8.csv", content)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 08 Line endings - all CRLF (Windows)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default policy: normalize to LF on output.
|
||||
write_text(TD / "08_line_endings_crlf.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
), newline="\r\n")
|
||||
|
||||
write_text(EX / "08_line_endings_crlf.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "09_line_endings_cr.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
), newline="\r")
|
||||
|
||||
write_text(EX / "09_line_endings_cr.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 10 Line endings - mixed within the same file
|
||||
# ---------------------------------------------------------------------------
|
||||
# Real-world disaster mode: file edited on multiple OSes, or concatenated
|
||||
# from sources with different conventions.
|
||||
mixed = (
|
||||
b"id,name\r\n"
|
||||
b"1,Alice\n"
|
||||
b"2,Bob\r"
|
||||
b"3,Carol\r\n"
|
||||
b"4,Dan\n"
|
||||
)
|
||||
write_bytes(TD / "10_line_endings_mixed.csv", mixed)
|
||||
|
||||
write_text(EX / "10_line_endings_mixed.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
"4,Dan\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is the trap: line-ending normalization at the FILE level must not
|
||||
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
|
||||
# notes column or an address column).
|
||||
# But the embedded line endings should also be normalized to LF for
|
||||
# consistency.
|
||||
write_text(TD / "11_embedded_newlines.csv", (
|
||||
"id,address,notes\n"
|
||||
"1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
|
||||
"2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
|
||||
"3,\"normal\",\"no newlines here\"\n"
|
||||
))
|
||||
|
||||
# Expected: file-level line endings are already LF here; embedded CRLF/CR are
|
||||
# normalized to LF; cells stay multi-line.
|
||||
write_text(EX / "11_embedded_newlines.csv", (
|
||||
"id,address,notes\n"
|
||||
"1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
|
||||
"2,Single line,\"contains\nclassic mac\ninternal\"\n"
|
||||
"3,normal,no newlines here\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 12 Case operations (opt-in, default = preserve)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This file tests case operations IF the user requests them.
|
||||
# Default behavior: PRESERVE. So expected_default == input.
|
||||
# An expected_lower.csv shows what lower-case mode produces.
|
||||
write_text(TD / "12_case_variations.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
|
||||
"2,bob jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# Default expected: identical to input (case ops are opt-in).
|
||||
write_text(EX / "12_case_variations__default.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
|
||||
"2,bob jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# With --case-email=lower applied to email column only:
|
||||
write_text(EX / "12_case_variations__email_lower.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,alice@example.com,Widget\n"
|
||||
"2,bob jones,bob@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@example.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,dan@example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# With --case=title applied to name column:
|
||||
write_text(EX / "12_case_variations__name_title.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,Alice Smith,Alice@Example.COM,Widget\n"
|
||||
"2,Bob Jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,Dan O'Connor,Dan@Example.com,gizmo\n" # title-case must not break O'C
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is a negative test: the cleaner must not damage characters that
|
||||
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
|
||||
write_text(TD / "13_non_latin_scripts.csv", (
|
||||
"id,name,note\n"
|
||||
"1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
|
||||
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
|
||||
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
|
||||
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
|
||||
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
|
||||
"6,caf\u00e9 \u2615,emoji + accent combo\n"
|
||||
))
|
||||
|
||||
write_text(EX / "13_non_latin_scripts.csv", (
|
||||
"id,name,note\n"
|
||||
"1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
|
||||
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
|
||||
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
|
||||
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
|
||||
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
|
||||
"6,caf\u00e9 \u2615,emoji + accent combo\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 14 Mojibake (UTF-8 bytes misread as cp1252, then re-saved as UTF-8)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
|
||||
# saved as UTF-8 again. "café" becomes "café", "naïve" becomes "naïve".
|
||||
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
|
||||
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
|
||||
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
|
||||
write_text(TD / "14_mojibake.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n" # café, München mojibaked
|
||||
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n" # naïve, résumé
|
||||
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n" # don't via cp1252-mojibake
|
||||
"4,Alice,New York\n" # clean control row
|
||||
))
|
||||
|
||||
# Expected output WITHOUT mojibake fix (default): bytes preserved, but
|
||||
# reader emits a warning to logs.
|
||||
write_text(EX / "14_mojibake__default.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
|
||||
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
|
||||
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
|
||||
"4,Alice,New York\n"
|
||||
))
|
||||
|
||||
# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
|
||||
write_text(EX / "14_mojibake__fixed.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00e9,M\u00fcnchen\n"
|
||||
"2,na\u00efve,r\u00e9sum\u00e9\n"
|
||||
"3,don't,smart-apostrophe mojibake\n" # smart apostrophe also fixed
|
||||
"4,Alice,New York\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 15 Whitespace-only cells (boundary case with script 04)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
|
||||
# 04 then detects empty strings as disguised null. So 02's job here is
|
||||
# just to convert " " into "".
|
||||
write_text(TD / "15_whitespace_only_cells.csv", (
|
||||
"id,value\n"
|
||||
"1,real\n"
|
||||
"2, \n" # spaces only
|
||||
"3,\t\t\n" # tabs only
|
||||
"4,\u00A0\u00A0\n" # NBSP only
|
||||
"5, \t \u00A0 \n" # mixed whitespace
|
||||
"6,\n" # already empty
|
||||
"7,actual value\n"
|
||||
))
|
||||
|
||||
write_text(EX / "15_whitespace_only_cells.csv", (
|
||||
"id,value\n"
|
||||
"1,real\n"
|
||||
"2,\n" # all whitespace -> empty
|
||||
"3,\n"
|
||||
"4,\n"
|
||||
"5,\n"
|
||||
"6,\n"
|
||||
"7,actual value\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 16 Dirty headers
|
||||
# ---------------------------------------------------------------------------
|
||||
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
|
||||
# These break downstream lookups (df["email"] fails because the column
|
||||
# is actually called " Email " with NBSP padding).
|
||||
write_text(TD / "16_dirty_headers.csv", (
|
||||
" id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
|
||||
"1,Alice,alice@example.com,555-1234\n"
|
||||
"2,Bob,bob@example.com,555-5678\n"
|
||||
))
|
||||
|
||||
# Expected: headers cleaned by SAME rules as data cells.
|
||||
# Note: smart quotes around "Email" become straight quotes. The header
|
||||
# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
|
||||
write_text(EX / "16_dirty_headers.csv", (
|
||||
"id,Customer Name,\"\"\"Email\"\"\",Phone\n"
|
||||
"1,Alice,alice@example.com,555-1234\n"
|
||||
"2,Bob,bob@example.com,555-5678\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Numbers that LOOK like they have whitespace are tricky: " 123 " is
|
||||
# a number with padding (trim) but "1 234" might be a thousands-separator
|
||||
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
|
||||
# collapse internal whitespace in cells that parse as numeric. This is a
|
||||
# judgment call; document it.
|
||||
#
|
||||
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
|
||||
# Do not detect or replace null-like values. That's 04.
|
||||
write_text(TD / "17_preserve_intended.csv", (
|
||||
"id,price,european_number,date,phone,quantity\n"
|
||||
"1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
|
||||
"2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
|
||||
"3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
|
||||
))
|
||||
|
||||
# Expected: outer whitespace trimmed everywhere, but:
|
||||
# - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
|
||||
# - "$1,500.00" stays unchanged (currency, that's 03's domain)
|
||||
# - "15/01/2024" stays unchanged (date, that's 03's domain)
|
||||
# - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
|
||||
# - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
|
||||
# - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
|
||||
write_text(EX / "17_preserve_intended.csv", (
|
||||
"id,price,european_number,date,phone,quantity\n"
|
||||
"1,100,1 234,2024-01-15,(555) 123-4567,42\n"
|
||||
"2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
|
||||
"3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 18 Empty file (zero bytes)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_bytes(TD / "18_empty_file.csv", b"")
|
||||
|
||||
# Expected: graceful handling, output is also empty (or warning emitted).
|
||||
write_bytes(EX / "18_empty_file.csv", b"")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 19 Headers only (no data rows)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "19_headers_only.csv", (
|
||||
" id ,Name\u00a0,Email\u200b\n"
|
||||
))
|
||||
|
||||
# Expected: headers cleaned, no data rows in output.
|
||||
write_text(EX / "19_headers_only.csv", (
|
||||
"id,Name,Email\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 20 Real-world kitchen sink (everything combined)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simulates a typical messy export: came from Excel via cp1252 paste,
|
||||
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
|
||||
# has NBSP from copy/paste, has trailing whitespace.
|
||||
content = (
|
||||
" id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
|
||||
"1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
|
||||
"2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
|
||||
"3, Carol Brown ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
|
||||
"4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
|
||||
)
|
||||
data_bytes = b"\xef\xbb\xbf" + content.encode("utf-8")
|
||||
# NOTE: bytes are written as-is; the header row ends in LF while the data rows end in CRLF (no LF -> CRLF normalization is applied here)
|
||||
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
|
||||
|
||||
# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
|
||||
# stripped, internal multi-space collapsed, CRLF normalized to LF,
|
||||
# whitespace-only cells become empty, multiplication sign preserved,
|
||||
# em-dash and ellipsis converted, prime/double-prime converted.
|
||||
write_text(EX / "20_kitchen_sink.csv", (
|
||||
"id,Name,\"\"\"Email\"\"\",Notes\n"
|
||||
"1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
|
||||
"2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
|
||||
"3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
|
||||
"4,,empty@example.com,whitespace-only name (becomes empty)\n"
|
||||
))
|
||||
|
||||
print("All CSV test files written.")
|
||||
print(f" inputs: {TD}")
|
||||
print(f" expected: {EX}")
|
||||
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Generate the XLSX test workbook for 02_text_cleaner.
|
||||
|
||||
Excel-specific pollution patterns that don't appear in CSV:
|
||||
- Cells with leading apostrophe (Excel's force-text prefix; openpyxl
|
||||
surfaces these as plain strings but they show up in real exports)
|
||||
- Multi-line cells from Alt+Enter (carry \\n internally)
|
||||
- Smart quotes from Excel's autocorrect-as-you-type
|
||||
- NBSP padding from copy/paste from Word or web pages
|
||||
- Multiple sheets with different pollution profiles
|
||||
"""
|
||||
from pathlib import Path

from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

# Destination workbook; it lives in the test_data/ directory next to this script.
OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx"

wb = Workbook()

# --------------------------------------------------------------------
# Sheet 1: Customers - whitespace + smart quotes + NBSP
# --------------------------------------------------------------------
ws = wb.active
ws.title = "Customers"
ws.append([" id ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"])  # dirty headers
ws.append([1, " Alice Smith ", "Alice@Example.COM", "555-1234"])
ws.append([2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"])
ws.append([3, "\tCarol\tBrown\t", "CAROL@example.com", " 555-9012 "])
ws.append([4, "Dan O\u2019Connor", "dan@example.com", "555-3456"])  # curly apostrophe
ws.append([5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"])  # em-dash
ws.append([6, " ", "frank@example.com", "555-2468"])  # whitespace-only -> empty for 04

# --------------------------------------------------------------------
# Sheet 2: Notes - multi-line cells, embedded line breaks
# --------------------------------------------------------------------
ws2 = wb.create_sheet("Notes")
ws2.append(["id", "title", "body"])
ws2.append([1, "Welcome", "Line one\nLine two\nLine three"])  # LF line breaks inside the cell
ws2.append([2, "Address", "123 Main St\r\nApt 4B\r\nNew York"])  # CRLF line breaks inside the cell
ws2.append([3, "Quote", "She said \u201chello\u201d and left\u2026"])  # smart quotes + ellipsis
ws2.append([4, " padded ", " multiline\n with leading whitespace per line "])
# Mark column C ("body", the third column) with wrap_text so line breaks
# render in Excel. Row 1 is the header, so styling starts at row 2.
for row in ws2.iter_rows(min_row=2, max_row=ws2.max_row, min_col=3, max_col=3):
    for cell in row:
        cell.alignment = Alignment(wrap_text=True)

# --------------------------------------------------------------------
# Sheet 3: International - non-Latin scripts and emoji (preserve!)
# --------------------------------------------------------------------
ws3 = wb.create_sheet("International")
ws3.append(["id", "city", "language"])
ws3.append([1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"])
ws3.append([2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"])
ws3.append([3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"])
ws3.append([4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"])
ws3.append([5, "Caf\u00e9 \u2615", "emoji preserved"])
ws3.append([6, "Launch \U0001F389\U0001F680", "more emoji"])

# --------------------------------------------------------------------
# Sheet 4: ForceText - cells originally entered with leading apostrophe
# --------------------------------------------------------------------
# Excel's force-text prefix '0001234 stores as string "0001234" when read
# by openpyxl. Sometimes (broken exports) the apostrophe leaks through as
# part of the value. Test that 02 doesn't try to "clean" leading
# apostrophes - they may be intentional for ID columns.
ws4 = wb.create_sheet("ForceText")
ws4.append(["id", "sku", "zip"])
ws4.append([1, "0001234", "08540"])  # legitimate leading-zero IDs
ws4.append([2, " 0005678 ", "01001"])  # padded - trim outer space, keep zeros
ws4.append([3, "'9999999", "10001"])  # leaked apostrophe - PRESERVE (judgment call)

# Create the output directory first so the script also works when
# test_data/ does not exist yet; wb.save() does not create parents.
OUT.parent.mkdir(parents=True, exist_ok=True)
wb.save(OUT)
print(f"Wrote {OUT}")
print(f"Sheets: {wb.sheetnames}")
|
||||
@@ -0,0 +1,8 @@
|
||||
id,name,city
|
||||
1, Alice ,New York
|
||||
2,Bob, Chicago
|
||||
3,Carol ,San Francisco
|
||||
4,Dan Smith,Austin
|
||||
5, Eve , Boston
|
||||
6,Frank van der Berg,Denver
|
||||
7, Grace Hopper , Palo Alto
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,label,note
|
||||
1, Premium ,NBSP padding
|
||||
2, Discount ,narrow NBSP
|
||||
3, Standard ,ideographic space
|
||||
4,Tier One,em-space internal
|
||||
5,Cost Plus,thin-space internal
|
||||
6, mixed ,ascii + NBSP combined
|
||||
|
@@ -0,0 +1,6 @@
|
||||
id,quote,measurement
|
||||
1,“Hello world”,5′ 11″
|
||||
2,it’s working,—
|
||||
3,2020–2024,from ‘a’ to ‘z’
|
||||
4,wait…,3 × 4
|
||||
5,«quoted»,5 ± 0.1
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,name,description
|
||||
1,café,NFC form (single code point)
|
||||
2,café,NFD form (e + combining accent)
|
||||
3,naïve,NFC i-diaeresis
|
||||
4,naïve,NFD i + combining diaeresis
|
||||
5,office,fi-ligature (ffi)
|
||||
6,ABC,fullwidth ABC
|
||||
7,Ⅸ century,roman numeral nine (single code point)
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value,note
|
||||
1,Hello,zero-width space inside word
|
||||
2,Leading,leading + internal ZWSP
|
||||
3,Trail,trailing ZWSP
|
||||
4,abc,ZWNJ and ZWJ
|
||||
5,Marked,LTR + RTL marks bracketing
|
||||
6,cooperate,soft hyphen
|
||||
7,nobreak,word joiner
|
||||
|
Binary file not shown.
|
3
test-cases/text-cleaner-corpus/test_data/07_bom_utf8.csv
Normal file
3
test-cases/text-cleaner-corpus/test_data/07_bom_utf8.csv
Normal file
@@ -0,0 +1,3 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1 @@
|
||||
id,name
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,address,notes
|
||||
1,"123 Main St
|
||||
Apt 4B
|
||||
New York, NY","line1
|
||||
line2"
|
||||
2,"Single line","contains
|
||||
classic mac
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,Alice@Example.COM,Widget
|
||||
2,bob jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,DAN O'CONNOR,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,name,note
|
||||
1, 中国北京 ,Beijing in Chinese (with leading/trailing space)
|
||||
2,テスト,Japanese katakana (test)
|
||||
3,تجربة,Arabic (test) - RTL
|
||||
4,Москва,Russian (Moscow)
|
||||
5,🎉 launch 🚀,emoji preserved
|
||||
6,café ☕,emoji + accent combo
|
||||
|
5
test-cases/text-cleaner-corpus/test_data/14_mojibake.csv
Normal file
5
test-cases/text-cleaner-corpus/test_data/14_mojibake.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don’t,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value
|
||||
1,real
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,actual value
|
||||
|
@@ -0,0 +1,3 @@
|
||||
id , Customer Name ,“Email”,Phone
|
||||
1,Alice,alice@example.com,555-1234
|
||||
2,Bob,bob@example.com,555-5678
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,price,european_number,date,phone,quantity
|
||||
1, 100 ,1 234,2024-01-15,(555) 123-4567,42
|
||||
2," $1,500.00 ",12 345,15/01/2024,555.123.4567,7
|
||||
3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0
|
||||
|
@@ -0,0 +1 @@
|
||||
id ,Name ,Email
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id , Name ,“Email”,Notes
|
||||
1, Alice Smith ,Alice@Example.COM,“VIP” customer — contact ASAP…
|
||||
2, Bob Jones ,bob@example.com,it’s 5′6″ tall
|
||||
3, Carol Brown ,CAROL@EXAMPLE.COM,3 × 4 = 12 (preserve ×)
|
||||
4, ,empty@example.com,whitespace-only name (becomes empty)
|
||||
|
BIN
test-cases/text-cleaner-corpus/test_data/21_excel_pollution.xlsx
Normal file
BIN
test-cases/text-cleaner-corpus/test_data/21_excel_pollution.xlsx
Normal file
Binary file not shown.
209
tests/test_corpus.py
Normal file
209
tests/test_corpus.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""Run every corpus fixture through the current text cleaner and report diffs.
|
||||
|
||||
This is an *acceptance* test against an external corpus shipped in
|
||||
``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps
|
||||
between the current implementation and the spec target in TEST-CASES.md.
|
||||
The test fails on diff — that's the point. Each failure is informative.
|
||||
|
||||
Cases 12 and 14 produce multiple expected outputs depending on flags;
|
||||
case 21 is XLSX-only and verified separately (manual / smoke).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.text_clean import CleanOptions, clean_dataframe
|
||||
|
||||
# Corpus layout, resolved relative to this test module:
#   <parent of this file's directory>/test-cases/text-cleaner-corpus/
#       test_data/  - input fixtures
#       expected/   - expected cleaner output
CORPUS = Path(__file__).parent.parent.joinpath("test-cases", "text-cleaner-corpus")
TEST_DATA = CORPUS.joinpath("test_data")
EXPECTED = CORPUS.joinpath("expected")

# Cases where a single default run should produce the expected file.
# Cases 12, 14, 18 and 21 are absent on purpose: they are exercised by
# the dedicated test classes further down in this module.
DEFAULT_CASES = [
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "19_headers_only",
    "20_kitchen_sink",
]
|
||||
|
||||
|
||||
def _read_csv_strict(path: Path) -> pd.DataFrame:
    """Load a corpus CSV with every cell kept as a string.

    The raw bytes have NUL (0x00) removed before parsing: the pandas C
    engine truncates fields at NUL, while the python engine is too strict
    about embedded literal double quotes. Dropping NUL up front is the
    file-level pre-clean step the spec describes for case 06.

    The data is decoded as ``utf-8-sig`` (a leading UTF-8 BOM is consumed),
    and ``keep_default_na=False`` keeps empty cells and tokens such as
    "NA" as literal strings instead of converting them to NaN.
    """
    sanitized = path.read_bytes().replace(b"\x00", b"")
    buffer = io.BytesIO(sanitized)
    frame = pd.read_csv(
        buffer,
        dtype=str,
        keep_default_na=False,
        encoding="utf-8-sig",
    )
    return frame
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DataFrame-level diff (covers cell content; ignores file-level encoding/EOL)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("name", DEFAULT_CASES)
def test_corpus_dataframe_diff(name):
    """Run clean_dataframe on the input fixture and diff it against the expected frame.

    The comparison has three stages, each with its own failure message:
    column headers, row count, then every cell. Zero-byte inputs are
    skipped because they cannot be parsed into a frame at all.
    """
    inp_path = TEST_DATA / f"{name}.csv"
    exp_path = EXPECTED / f"{name}.csv"

    if inp_path.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty (file-level test)")

    df_in = _read_csv_strict(inp_path)
    df_expected = _read_csv_strict(exp_path)

    result = clean_dataframe(df_in)

    # Use a fresh 0..n-1 index on both sides so rows line up positionally.
    actual = result.cleaned_df.reset_index(drop=True)
    expected = df_expected.reset_index(drop=True)

    # Stage 1: headers must match exactly, including order.
    assert list(actual.columns) == list(expected.columns), (
        f"{name}: header mismatch.\n"
        f"  actual:   {list(actual.columns)!r}\n"
        f"  expected: {list(expected.columns)!r}"
    )

    # Stage 2: row counts must match. The cell loop below pairs rows with
    # zip(), which stops at the shorter frame, so without this check a
    # dropped or extra row would pass unnoticed.
    assert len(actual) == len(expected), (
        f"{name}: row count mismatch "
        f"(actual={len(actual)}, expected={len(expected)})"
    )

    # Stage 3: cell-by-cell comparison, collecting every mismatch.
    diffs = []
    for col in expected.columns:
        for i, (a, e) in enumerate(zip(actual[col].tolist(), expected[col].tolist())):
            if a != e:
                diffs.append((i, col, repr(a), repr(e)))
    assert not diffs, (
        f"{name}: {len(diffs)} cell mismatch(es). First 5:\n"
        + "\n".join(f"  row {i} col {c}: actual={a} expected={e}"
                    for i, c, a, e in diffs[:5])
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Idempotency property (every case)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("name", DEFAULT_CASES + ["12_case_variations", "14_mojibake"])
def test_corpus_idempotent(name):
    """Cleaning an already-cleaned frame must leave it unchanged.

    Runs the default-case fixtures plus cases 12 and 14. Zero-byte inputs
    are skipped.
    """
    source = TEST_DATA / f"{name}.csv"
    if source.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty")

    raw = _read_csv_strict(source)
    first_pass = clean_dataframe(raw).cleaned_df.reset_index(drop=True)
    second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)

    assert first_pass.equals(second_pass), f"{name}: not idempotent"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCaseVariations:
    """Case 12: --case email=lower and --case name=title variants.

    All three tests read the same input fixture and differ only in the
    CleanOptions passed and the expected file they compare against.
    """

    def test_default_is_identity_for_case(self):
        """With no case options the cleaner must not alter letter case."""
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        want = _read_csv_strict(EXPECTED / "12_case_variations__default.csv")
        got = clean_dataframe(source).cleaned_df.reset_index(drop=True)
        # Default should not change case
        assert got.equals(want), (
            "12 default: cells differ (case mutated under default config)"
        )

    def test_email_lower(self):
        """Requesting ``lower`` for the email column matches the email_lower fixture."""
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        want = _read_csv_strict(EXPECTED / "12_case_variations__email_lower.csv")
        options = CleanOptions(case_columns={"email": "lower"})
        got = clean_dataframe(source, options).cleaned_df.reset_index(drop=True)
        assert got.equals(want), "12 email_lower variant differs"

    def test_name_title(self):
        """Requesting ``title`` for the name column matches the name_title fixture."""
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        want = _read_csv_strict(EXPECTED / "12_case_variations__name_title.csv")
        options = CleanOptions(case_columns={"name": "title"})
        got = clean_dataframe(source, options).cleaned_df.reset_index(drop=True)
        assert got.equals(want), "12 name_title variant differs"
|
||||
|
||||
|
||||
class TestMojibake:
    """Case 14: mojibake input, with and without the repair option."""

    def test_default_no_repair(self):
        """A default run must match the ``__default`` expected fixture."""
        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        want = _read_csv_strict(EXPECTED / "14_mojibake__default.csv")
        got = clean_dataframe(source).cleaned_df.reset_index(drop=True)
        assert got.equals(want), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
        """Placeholder for the repaired variant; reported as xfail for now."""
        # --fix-mojibake is Tier 2; the cleaner does not implement it. Mark xfail.
        pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
|
||||
|
||||
|
||||
class TestEmptyFile:
    """Case 18: the zero-byte input fixture."""

    def test_empty_no_crash(self, tmp_path):
        """Case 18: zero-byte file should not crash."""
        empty_csv = TEST_DATA / "18_empty_file.csv"
        size_in_bytes = empty_csv.stat().st_size
        assert size_in_bytes == 0
        # Reading an empty CSV with pandas raises EmptyDataError; corpus says
        # the cleaner must handle it gracefully. Not yet wired in core, so
        # for now this only pins the pandas behaviour on the fixture.
        with pytest.raises(pd.errors.EmptyDataError):
            pd.read_csv(empty_csv)
|
||||
|
||||
|
||||
class TestXlsxPollution:
    """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet."""

    @pytest.fixture(scope="class")
    def workbook(self):
        """Yield the corpus workbook, opened once for the whole class.

        The ExcelFile is used as a context manager so the underlying file
        handle is closed once the class's tests have finished.
        """
        path = TEST_DATA / "21_excel_pollution.xlsx"
        with pd.ExcelFile(path, engine="openpyxl") as xlsx:
            yield xlsx

    def test_sheets_present(self, workbook):
        """All four sheets written by generate_xlsx.py must be present."""
        names = set(workbook.sheet_names)
        assert {"Customers", "Notes", "International", "ForceText"}.issubset(names)

    def test_each_sheet_runs_without_error(self, workbook):
        """clean_dataframe must process every sheet and keep its row count."""
        for sheet in workbook.sheet_names:
            df = pd.read_excel(
                workbook, sheet_name=sheet, dtype=str, keep_default_na=False,
            )
            result = clean_dataframe(df)
            assert result.cleaned_df.shape[0] == df.shape[0], (
                f"sheet {sheet}: row count changed"
            )

    def test_force_text_leading_zeros_preserved(self, workbook):
        """Leading zeros (and a leaked apostrophe) in ID-like columns survive cleaning.

        The ForceText sheet is written by generate_xlsx.py with the columns
        ``id``, ``sku`` and ``zip``. The leading-zero values live in ``sku``
        and ``zip`` (``id`` only holds 1, 2, 3), so those two columns are
        the ones checked.
        """
        df = pd.read_excel(
            workbook, sheet_name="ForceText", dtype=str, keep_default_na=False,
        )
        cleaned = clean_dataframe(df).cleaned_df.reset_index(drop=True)

        # Outer padding on " 0005678 " is trimmed, zeros are kept, and the
        # leaked leading apostrophe on "'9999999" is preserved as-is.
        assert cleaned["sku"].tolist() == ["0001234", "0005678", "'9999999"], (
            f"ForceText sku column altered: {cleaned['sku'].tolist()!r}"
        )
        assert cleaned["zip"].tolist() == ["08540", "01001", "10001"], (
            f"ForceText zip column altered: {cleaned['zip'].tolist()!r}"
        )
|
||||
Reference in New Issue
Block a user