Closes 12 bugs and 8 gaps surfaced by parallel audits across all core modules, plus aligns the dedup-side normalizers with the new format_standardize behavior where they had silently diverged. Bugs (data integrity / correctness): - dedup: NaN/None values matched as duplicates because str(None)='None'. Two rows with missing email silently merged. - dedup: removed_df had 0 columns when nothing was removed; downstream code expecting matching schema broke. Now preserves column shape. - dedup: ColumnMatchStrategy threshold accepted any value; out-of-range silently broke matching. Validated to [0, 100] in __post_init__. - dedup: strategy referencing a missing column was silently skipped. Now raises ValueError listing available columns. - fixes: replace_null_sentinels crashed on non-string sentinels (int/None from JSON payload). Coerced to str. - fixes: _vectorized_regex_sub raised raw re.error on bad patterns. Now wraps as ValueError with clear message. - io: detect_header_row mis-identified all-empty and metadata-only rows as headers (all([]) is True). Now requires ≥2 non-empty cells. - config: from_dict crashed when JSON had unknown fields, breaking forward compat. Now filters to known fields. - analyze: mixed-case email detector flagged all-None columns because str(None)='None' contains both N and one. Now drops NaN before stringify. New features and gap closures: - io: _detect_excel_header_row mirrors detect_header_row for Excel via openpyxl read-only; _read_excel uses it when header_row=None. - io: write_file gains delimiter + encoding params; .tsv extension defaults to tab. - normalizers: normalize_phone preserves extensions as ;ext=N suffix. - normalizers: normalize_address folds spelled-out US state names to 2-letter codes (California ≡ CA). - normalizers: normalize_name drops surname particles (van, de, von) so "Charles de Gaulle" ≡ "Charles Gaulle" for matching. 
- analyze: new _detect_inconsistent_date_format detector flags columns with mixed ISO/US/EU date shapes; routes to format standardizer. - analyze: _NULL_LIKE recognizes "<na>" (pd.NA repr). - analyze: duplicate-row finding renamed count → n_extra (rows that would actually be removed) with clarified description. - dedup: group_confidence no longer falsely 100.0 when transitive group members lack a recorded direct pair; falls back to 100.0 only when truly no pairs were observed. - dedup: MatchResult / DeduplicationResult docstrings clarify that row_indices refer to the input frame's positional index (output index is reset). - text_clean: visualize_hidden_html(None) now returns None (matches visualize_hidden_text); strip_bom strips at most one BOM per call; sentence_case dead elif branch removed. Tests: - tests/test_audit_fixes.py — 28 regression tests, one or more per numbered finding, named after BUG/GAP/NIT tags so future readers can trace each test back to its audit. - tests/test_fixes_unit.py — 26 isolated unit tests for previously integration-only fix functions (trim_whitespace, strip_nbsp, strip_zero_width, normalize_line_endings, clean_headers, repair_mojibake — last skipped if ftfy unavailable). - tests/test_io.py — adds CSV / TSV / semicolon / UTF-8-BOM round-trip tests + Excel auto-header-detection tests. - tests/test_normalizers.py — adds 8 tests for the alignment work above (phone extension, state names, particles). Adds .claude/ to .gitignore (agent worktrees + local settings). Full project suite: 1197 passed, 4 skipped, 17 xfailed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
569 lines
20 KiB
Python
569 lines
20 KiB
Python
"""Tests for src/core/text_clean.py.
|
||
|
||
Covers edge cases E1-E50 from TECHNICAL.md Section 10.2 plan.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
import pytest
|
||
|
||
from src.core.text_clean import (
|
||
CleanOptions,
|
||
PRESETS,
|
||
apply_case,
|
||
clean_dataframe,
|
||
clean_value,
|
||
collapse_whitespace,
|
||
fold_smart_chars,
|
||
normalize_line_endings,
|
||
sentence_case,
|
||
smart_title_case,
|
||
strip_bom,
|
||
strip_control,
|
||
strip_zero_width,
|
||
to_nfc,
|
||
to_nfkc,
|
||
trim,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Per-string helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestTrim:
    """``trim`` removes outer whitespace and leaves the interior alone."""

    def test_strips_leading_and_trailing(self):
        padded = " hello "
        assert trim(padded) == "hello"

    def test_preserves_internal_spaces(self):
        padded = " a b "
        assert trim(padded) == "a b"

    def test_empty_string(self):
        nothing = ""
        assert trim(nothing) == ""

    def test_idempotent(self):
        # A second pass must not change what the first pass produced.
        twice = trim(trim(" x "))
        once = trim(" x ")
        assert twice == once
|
||
|
||
|
||
class TestCollapseWhitespace:
    """``collapse_whitespace`` reduces an interior whitespace run to one space."""

    def test_multiple_spaces(self):
        # The input needs a genuine run of spaces: with a single space the
        # assertion would hold even if nothing were collapsed.
        assert collapse_whitespace("a    b") == "a b"

    def test_tab_inside_cell(self):  # E2
        assert collapse_whitespace("a\tb") == "a b"

    def test_mixed_tabs_and_spaces(self):  # E3
        assert collapse_whitespace("a \t \t b") == "a b"

    def test_idempotent(self):
        # Start from an input that actually changes on the first pass.
        once = collapse_whitespace("a    b")
        assert collapse_whitespace(once) == once
|
||
|
||
|
||
class TestNFC:
    """``to_nfc`` composes a base letter plus combining mark into one code point."""

    def test_combining_acute(self):  # E6
        # Escapes keep the two forms distinguishable in source; typed literally
        # they render identically and an editor or copy/paste may normalise
        # one into the other, silently turning the test into a no-op.
        decomposed = "e\u0301"  # e + U+0301 COMBINING ACUTE ACCENT
        composed = "\u00e9"  # U+00E9 LATIN SMALL LETTER E WITH ACUTE
        assert decomposed != composed  # guard: the input really is decomposed
        assert to_nfc(decomposed) == composed

    def test_idempotent(self):
        s = "cafe\u0301"  # decomposed, so the first pass has work to do
        assert to_nfc(to_nfc(s)) == to_nfc(s)
|
||
|
||
|
||
class TestNFKC:
    """``to_nfkc`` folds compatibility characters to their plain equivalents."""

    def test_circled_digit(self):  # E7
        assert to_nfkc("\u2460") == "1"  # U+2460 CIRCLED DIGIT ONE

    def test_ligature(self):  # E7
        # U+FB01 LATIN SMALL LIGATURE FI. Written as an escape because the
        # ligature is easily replaced by the two letters "fi" in transit,
        # which would leave the test comparing "fi" with "fi".
        assert to_nfkc("\ufb01") == "fi"

    def test_idempotent(self):
        s = "\u2460\ufb01"
        assert to_nfkc(to_nfkc(s)) == to_nfkc(s)
|
||
|
||
|
||
class TestSmartChars:
    """``fold_smart_chars`` maps typographic punctuation to ASCII.

    Every non-ASCII input is spelled as a ``\\u`` escape so the exact code
    point under test stays visible and survives copy/paste.
    """

    def test_curly_quotes(self):  # E11
        assert fold_smart_chars("\u2018hi\u2019") == "'hi'"  # single curly quotes
        assert fold_smart_chars("\u201chi\u201d") == '"hi"'  # double curly quotes

    def test_dashes(self):  # E12
        assert fold_smart_chars("a\u2014b") == "a-b"  # em dash
        assert fold_smart_chars("a\u2013b") == "a-b"  # en dash

    def test_ellipsis(self):  # E13
        assert fold_smart_chars("wait\u2026") == "wait..."

    def test_nbsp(self):  # E14
        # A literal NBSP looks exactly like a space, which would make this
        # assertion compare "a b" with "a b".
        assert fold_smart_chars("a\u00a0b") == "a b"

    def test_idempotent(self):
        s = "\u201chi\u201d \u2014 a\u00a0b"
        assert fold_smart_chars(fold_smart_chars(s)) == fold_smart_chars(s)
|
||
|
||
|
||
class TestZeroWidth:
    """``strip_zero_width`` removes invisible formatting characters.

    The characters under test have no glyph, so each one is written as an
    escape; typed literally the inputs look identical to the expected output
    and the tests pass vacuously if the character is lost.
    """

    def test_zwsp_midword(self):  # E16
        assert strip_zero_width("foo\u200bbar") == "foobar"  # ZERO WIDTH SPACE

    def test_bidi_marks_stripped(self):  # E17
        # NOTE(review): LRM (U+200E) and RLM (U+200F) are assumed to be the
        # marks this case targets - confirm against the implementation's set.
        assert strip_zero_width("a\u200eb\u200fc") == "abc"

    def test_word_joiner(self):  # E18
        assert strip_zero_width("a\u2060b") == "ab"  # WORD JOINER

    def test_mid_string_feff(self):  # E22
        # U+FEFF away from position 0 is a zero-width no-break space, not a BOM.
        assert strip_zero_width("foo\ufeffbar") == "foobar"
|
||
|
||
|
||
class TestStripBOM:
    """``strip_bom`` removes a leading U+FEFF byte-order mark."""

    def test_leading_bom(self):
        # The BOM is written as an escape; as an invisible literal this test
        # is indistinguishable from ``test_no_bom``.
        assert strip_bom("\ufeffhello") == "hello"

    def test_no_bom(self):
        assert strip_bom("hello") == "hello"

    def test_idempotent(self):
        once = strip_bom("\ufeffx")
        assert strip_bom(once) == once
|
||
|
||
|
||
class TestStripControl:
    """``strip_control`` drops control characters but keeps tab, LF and CR."""

    def test_null_byte(self):  # E20
        cleaned = strip_control("a\x00b")
        assert cleaned == "ab"

    def test_preserves_tab_newline_cr(self):  # E19
        kept = "a\tb\nc\rd"
        assert strip_control(kept) == "a\tb\nc\rd"

    def test_strips_other_control(self):
        # Within 0x01..0x1F only \t, \n and \r are kept; everything else goes.
        noisy = "a\x01b\x07c\x1fd"
        assert strip_control(noisy) == "abcd"

    def test_strips_del(self):
        cleaned = strip_control("a\x7fb")
        assert cleaned == "ab"
|
||
|
||
|
||
class TestLineEndings:
    """``normalize_line_endings`` rewrites CRLF and bare CR as LF."""

    def test_crlf(self):  # E23
        windows_text = "a\r\nb"
        assert normalize_line_endings(windows_text) == "a\nb"

    def test_bare_cr(self):  # E24
        old_mac_text = "a\rb"
        assert normalize_line_endings(old_mac_text) == "a\nb"

    def test_idempotent(self):
        # Mixed CRLF and bare CR in one value.
        twice = normalize_line_endings(normalize_line_endings("a\r\nb\rc"))
        once = normalize_line_endings("a\r\nb\rc")
        assert twice == once
|
||
|
||
|
||
class TestSmartTitleCase:
    """``smart_title_case`` title-cases while respecting acronyms and particles."""

    def test_preserves_acronym(self):  # E26
        expectations = [
            ("USA report", "USA Report"),
            ("nasa launch", "Nasa Launch"),  # already lower, so not an acronym
            ("NASA launch", "NASA Launch"),
        ]
        for text, expected in expectations:
            assert smart_title_case(text) == expected

    def test_lowercases_particles_midstring(self):  # E27
        expectations = [
            ("the lord of the rings", "The Lord of the Rings"),
            ("a tale of two cities", "A Tale of Two Cities"),
        ]
        for text, expected in expectations:
            assert smart_title_case(text) == expected

    def test_keeps_first_and_last_capitalized(self):
        # A particle in final position ("of") is still capitalised.
        assert smart_title_case("kingdom of") == "Kingdom Of"

    def test_apostrophe(self):
        titled = smart_title_case("o'neil")
        assert titled == "O'neil"
|
||
|
||
|
||
class TestSentenceCase:
    """``sentence_case`` capitalises the first letter of each sentence."""

    def test_basic(self):  # E28
        text = "hello. how are you? fine!"
        assert sentence_case(text) == "Hello. How are you? Fine!"

    def test_preserves_punctuation(self):
        # All-caps input is lowered apart from each sentence's first letter;
        # the terminators stay where they were.
        shouted = "WHAT? OK."
        assert sentence_case(shouted) == "What? Ok."
|
||
|
||
|
||
class TestApplyCase:
    """``apply_case`` dispatches on a mode name and rejects unknown modes."""

    def test_modes(self):
        expectations = [
            ("Hello World", "upper", "HELLO WORLD"),
            ("Hello World", "lower", "hello world"),
            ("hello world", "title", "Hello World"),
            ("hello. world.", "sentence", "Hello. World."),
        ]
        for text, mode, expected in expectations:
            assert apply_case(text, mode) == expected

    def test_unknown_mode_raises(self):
        with pytest.raises(ValueError):
            apply_case("x", "weird")  # type: ignore[arg-type]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# clean_value composition
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestCleanValue:
    """``clean_value`` returns ``(cleaned, ops)`` where ``ops`` names the steps
    that actually changed the value."""

    def test_default_excel_hygiene(self):
        opts = CleanOptions()
        out, ops = clean_value("\u201cHello world\u201d ", opts)
        assert out == '"Hello world"'
        assert "fold_smart_chars" in ops
        assert "trim" in ops

    def test_pure_whitespace_to_empty(self):  # E1
        out, _ = clean_value(" ", CleanOptions())
        assert out == ""

    def test_nbsp_only_cell(self):  # E5
        # U+00A0 is written as an escape: a literal NBSP cannot be told apart
        # from an ASCII space, which would make this a duplicate of E1.
        out, _ = clean_value("\u00a0", CleanOptions())
        assert out == ""

    def test_non_string_passthrough(self):  # E32
        opts = CleanOptions()
        for val in (None, 42, 3.14, True, np.nan):
            out, ops = clean_value(val, opts)
            # NaN compares unequal to itself; check pd.isna for that case
            if isinstance(val, float) and pd.isna(val):
                assert pd.isna(out)
            else:
                assert out == val
            assert ops == []

    def test_empty_string(self):
        out, ops = clean_value("", CleanOptions())
        assert out == ""
        assert ops == []

    def test_only_unchanged_ops_not_logged(self):
        # trim and collapse_whitespace are enabled but have nothing to do on
        # "hello", so neither may appear in the ops list.
        opts = CleanOptions(
            trim=True,
            collapse_whitespace=True,
            nfc=False,
            nfkc=False,
            fold_smart_chars=False,
            strip_zero_width=False,
            strip_bom=False,
            strip_control=False,
            normalize_line_endings=False,
        )
        out, ops = clean_value("hello", opts)
        assert out == "hello"
        assert ops == []
|
||
|
||
|
||
class TestIdempotency:
    """E40 — applying the pipeline twice yields the same result as once."""

    @pytest.mark.parametrize("preset", list(PRESETS))
    def test_preset_idempotent(self, preset):
        """Run every sample through ``clean_value`` twice under ``preset``."""
        opts = CleanOptions.from_preset(preset)
        # Invisible and look-alike characters are written as escapes so each
        # case really contains what its text claims (a BOM, a combining mark).
        cases = [
            "\u201cHello world\u201d ",
            " \t multi space \r\n ",
            "caf\u00e9",  # precomposed e-acute
            "e\u0301clair",  # decomposed: e + combining acute
            "\ufeffleading-bom",
            "USA and the Rings",
            "a\x00b\x01c",
            "",
            " ",
        ]
        for s in cases:
            once, _ = clean_value(s, opts)
            twice, _ = clean_value(once, opts)
            assert once == twice, f"not idempotent on {s!r} (preset {preset})"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# clean_dataframe
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestCleanDataframe:
    """``clean_dataframe``: column selection, dtype handling and change audit."""

    def test_only_string_columns_touched(self):  # E31, E33, E35
        frame = pd.DataFrame({
            "name": [" Alice ", "Bob"],
            "age": [30, 25],
            "joined": pd.to_datetime(["2024-01-01", "2024-02-01"]),
            "active": [True, False],
        })
        outcome = clean_dataframe(frame)
        assert outcome.cleaned_df["name"].tolist() == ["Alice", "Bob"]
        assert outcome.cleaned_df["age"].tolist() == [30, 25]
        assert outcome.cleaned_df["active"].tolist() == [True, False]
        assert "name" in outcome.columns_processed
        assert "age" not in outcome.columns_processed

    def test_explicit_columns(self):  # E41
        frame = pd.DataFrame({"a": [" x "], "b": [" y "]})
        only_a = CleanOptions(columns=["a"])
        outcome = clean_dataframe(frame, only_a)
        assert outcome.cleaned_df["a"].iloc[0] == "x"
        # "b" was not selected, so its padding survives.
        assert outcome.cleaned_df["b"].iloc[0] == " y "
        assert outcome.columns_processed == ["a"]

    def test_skip_columns(self):  # E42
        frame = pd.DataFrame({"name": [" A "], "notes": [" free text "]})
        skip_notes = CleanOptions(skip_columns=["notes"])
        outcome = clean_dataframe(frame, skip_notes)
        assert outcome.cleaned_df["name"].iloc[0] == "A"
        assert outcome.cleaned_df["notes"].iloc[0] == " free text "

    def test_unknown_column_raises(self):
        frame = pd.DataFrame({"a": ["x"]})
        with pytest.raises(ValueError):
            clean_dataframe(frame, CleanOptions(columns=["missing"]))

    def test_empty_dataframe(self):  # E43
        outcome = clean_dataframe(pd.DataFrame())
        assert outcome.cells_changed == 0
        assert outcome.cells_total == 0
        assert outcome.cleaned_df.empty

    def test_single_column_file(self):  # E44
        frame = pd.DataFrame({"only": [" hello "]})
        outcome = clean_dataframe(frame)
        assert outcome.cleaned_df["only"].iloc[0] == "hello"

    def test_all_numeric_no_op(self):  # E45
        frame = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
        outcome = clean_dataframe(frame)
        assert outcome.columns_processed == []
        assert outcome.cells_changed == 0

    def test_mixed_object_column_strings_only(self):  # E34
        frame = pd.DataFrame({"mix": [" hello ", 42, None]})
        outcome = clean_dataframe(frame)
        # Only the string cell is cleaned; the int and None pass through.
        assert outcome.cleaned_df["mix"].iloc[0] == "hello"
        assert outcome.cleaned_df["mix"].iloc[1] == 42
        assert outcome.cleaned_df["mix"].iloc[2] is None

    def test_nan_preserved(self):  # E32
        frame = pd.DataFrame({"a": [" x ", np.nan]})
        outcome = clean_dataframe(frame)
        assert outcome.cleaned_df["a"].iloc[0] == "x"
        assert pd.isna(outcome.cleaned_df["a"].iloc[1])

    def test_changes_audit_count(self):  # E48
        frame = pd.DataFrame({"a": [" x ", "y", " z"]})
        outcome = clean_dataframe(frame)
        # Rows 0 and 2 carry padding; row 1 is already clean.
        assert outcome.cells_changed == 2
        assert len(outcome.changes) == 2
        assert set(outcome.changes["row"].tolist()) == {0, 2}

    def test_does_not_mutate_input(self):
        frame = pd.DataFrame({"a": [" x "]})
        snapshot = frame.copy()
        clean_dataframe(frame)
        assert frame.equals(snapshot)

    def test_per_column_case_via_case_columns(self):
        frame = pd.DataFrame({"name": ["alice"], "code": ["abc"]})
        upper_code = CleanOptions(case_columns={"code": "upper"})
        outcome = clean_dataframe(frame, upper_code)
        assert outcome.cleaned_df["name"].iloc[0] == "alice"
        assert outcome.cleaned_df["code"].iloc[0] == "ABC"

    def test_global_case_applied_to_selected_only(self):
        frame = pd.DataFrame({"name": ["alice"], "notes": ["bob"]})
        upper_name_only = CleanOptions(columns=["name"], case="upper")
        outcome = clean_dataframe(frame, upper_name_only)
        assert outcome.cleaned_df["name"].iloc[0] == "ALICE"
        assert outcome.cleaned_df["notes"].iloc[0] == "bob"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Presets and config round-trip
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestPresets:
    """``CleanOptions.from_preset`` yields the documented flag combinations."""

    def test_minimal_only_trim_collapse(self):
        minimal = CleanOptions.from_preset("minimal")
        assert minimal.trim is True
        assert minimal.collapse_whitespace is True
        assert minimal.nfc is False
        assert minimal.fold_smart_chars is False

    def test_excel_hygiene_smart_chars_on_nfkc_off(self):
        hygiene = CleanOptions.from_preset("excel-hygiene")
        assert hygiene.fold_smart_chars is True
        assert hygiene.nfc is True
        assert hygiene.nfkc is False

    def test_paranoid_includes_nfkc(self):
        paranoid = CleanOptions.from_preset("paranoid")
        assert paranoid.nfkc is True

    def test_unknown_preset_raises(self):
        with pytest.raises(ValueError):
            CleanOptions.from_preset("does-not-exist")
|
||
|
||
|
||
class TestConfigRoundTrip:
    """``CleanOptions`` survives dict and file serialisation unchanged."""

    def test_dict_roundtrip(self):  # E49
        original = CleanOptions(
            trim=False,
            nfc=True,
            columns=["a", "b"],
            skip_columns=["c"],
            case="upper",
        )
        as_dict = original.to_dict()
        assert CleanOptions.from_dict(as_dict) == original

    def test_file_roundtrip(self, tmp_path):
        target = tmp_path / "opts.json"
        original = CleanOptions(
            case_columns={"code": "upper"},
            fold_smart_chars=False,
        )
        original.to_file(target)
        assert CleanOptions.from_file(target) == original

    def test_unknown_keys_ignored(self):  # E50
        # An unrecognised key must not stop the known ones from loading.
        payload = {"trim": True, "totally_made_up_key": 42}
        loaded = CleanOptions.from_dict(payload)
        assert loaded.trim is True
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Use-case smoke tests (whole-pipeline)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestUseCases:
    """Whole-pipeline smoke tests, one per real-world export scenario.

    Invisible characters are written as ``\\u`` escapes so each fixture really
    contains the character its scenario is about.
    """

    def test_excel_save_as_csv_utf8_bom(self):
        # UC3: BOM at start of first cell
        df = pd.DataFrame({"name": ["\ufeffAlice", "Bob"], "city": ["NYC", "LA"]})
        result = clean_dataframe(df)
        assert result.cleaned_df["name"].iloc[0] == "Alice"

    def test_word_smart_quotes_in_product_titles(self):
        # UC2
        df = pd.DataFrame(
            {"title": ["\u201cBest Dog Collar\u201d", "Cat Toy \u2014 Red"]}
        )
        result = clean_dataframe(df)
        assert result.cleaned_df["title"].iloc[0] == '"Best Dog Collar"'
        assert result.cleaned_df["title"].iloc[1] == "Cat Toy - Red"

    def test_nbsp_in_email_field(self):
        # UC10: invisible Unicode hiding in emails
        # NOTE(review): the exact position of the ZWSP in the first address is
        # assumed; any position exercises the same behaviour.
        df = pd.DataFrame(
            {"email": ["alice\u200b@test.com", "bob\u00a0@test.com"]}
        )
        result = clean_dataframe(df)
        # ZWSP stripped; NBSP folded to space then collapsed but trim won't remove
        # internal space. So "bob @test.com" remains. That's correct: the cleaner
        # doesn't know that's an email — script 03 owns email format. Just confirm
        # the invisible char is gone.
        #
        # The needles must be the invisible characters themselves: an empty
        # string is contained in every string, and a plain space is expected
        # to remain in the second address.
        assert "\u200b" not in result.cleaned_df["email"].iloc[0]
        assert "\u00a0" not in result.cleaned_df["email"].iloc[1]

    def test_quickbooks_trailing_spaces(self):
        # UC6: VLOOKUP fails because of trailing spaces
        df = pd.DataFrame({"vendor": ["ACME Corp ", "ACME Corp"]})
        result = clean_dataframe(df)
        vendors = result.cleaned_df["vendor"]
        assert vendors.iloc[0] == vendors.iloc[1]

    def test_bank_export_crlf_in_memo(self):
        # UC5: \r\n inside multi-line memo cells
        df = pd.DataFrame({"memo": ["line one\r\nline two\r\nline three"]})
        result = clean_dataframe(df)
        memo = result.cleaned_df["memo"].iloc[0]
        assert "\r" not in memo
        assert memo.count("\n") == 2
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Reporting / dtype edge cases
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestReporting:
    """Shape and counters of the change report from ``clean_dataframe``."""

    def test_changes_columns_present(self):
        outcome = clean_dataframe(pd.DataFrame({"a": [" x "]}))
        expected_columns = ["row", "column", "old", "new", "ops_applied"]
        assert list(outcome.changes.columns) == expected_columns

    def test_changes_empty_when_no_changes(self):
        already_clean = pd.DataFrame({"a": ["x", "y"]})
        outcome = clean_dataframe(already_clean)
        assert outcome.cells_changed == 0
        assert outcome.changes.empty

    def test_cells_total_counts_only_processed_columns(self):
        frame = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
        outcome = clean_dataframe(frame)
        # Only the string column "a" is processed; numeric "n" is not counted.
        assert outcome.cells_total == 3
|
||
|
||
|
||
class TestVisualizeHidden:
    """``visualize_hidden_*`` makes invisible characters visible to the user."""

    def test_text_passes_ascii_through(self):
        from src.core.text_clean import visualize_hidden_text

        assert visualize_hidden_text("hello") == "hello"

    def test_text_labels_nbsp(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("Hi\u00a0there")
        assert "[NBSP]" in out

    def test_text_labels_zwsp(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\u200bb")
        assert "[ZWSP]" in out

    def test_text_labels_tab_and_newline(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\tb\nc")
        assert "[TAB]" in out
        assert "[LF]" in out

    def test_text_labels_smart_quotes(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("\u201chi\u201d")
        assert "[L DQUOTE]" in out and "[R DQUOTE]" in out

    def test_text_labels_unmapped_control_with_codepoint(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\x07b")  # BEL
        assert "[U+0007]" in out

    def test_html_wraps_invisibles_in_span(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("Hi\u00a0\u200bthere")
        assert '<span class="hidden-char' in out
        assert "U+00A0" in out and "U+200B" in out

    def test_html_escapes_dangerous_chars(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("<a&b>")
        # The output must carry the HTML entities, not the raw characters;
        # asserting on raw "<", "&" and ">" would check the opposite of
        # escaping.
        assert "&lt;" in out and "&amp;" in out and "&gt;" in out

    def test_html_passes_normal_text_through(self):
        from src.core.text_clean import visualize_hidden_html

        assert visualize_hidden_html("plain") == "plain"

    def test_css_returns_a_style_block(self):
        from src.core.text_clean import hidden_char_css

        css = hidden_char_css()
        assert "<style>" in css and "hidden-char" in css

    def test_non_string_passthrough(self):
        from src.core.text_clean import visualize_hidden_text, visualize_hidden_html

        # Both functions now consistently pass non-strings through
        # unchanged (audit NIT-13).
        assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
        assert visualize_hidden_html(None) is None  # type: ignore[arg-type]

    def test_html_marks_leading_trailing_ascii_space(self):
        from src.core.text_clean import visualize_hidden_html

        # Two spaces on each side, matching the badge counts asserted below.
        out = visualize_hidden_html("  Alice  ", mark_outer_whitespace=True)
        # Two leading and two trailing space badges
        assert out.count("SP LEAD") == 2
        assert out.count("SP TRAIL") == 2
        # Inner "Alice" untouched
        assert "Alice" in out

    def test_html_default_does_not_mark_outer_ascii_space(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html(" Alice ")
        assert "SP LEAD" not in out and "SP TRAIL" not in out

    def test_html_marks_leading_tab(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("\tAlice", mark_outer_whitespace=True)
        assert "TAB" in out  # tab gets a badge

    def test_html_only_whitespace_string_marked_as_leading(self):
        from src.core.text_clean import visualize_hidden_html

        # Exactly three spaces, matching the count asserted below.
        out = visualize_hidden_html("   ", mark_outer_whitespace=True)
        # All three chars treated as leading; trailing run is empty.
        assert out.count("SP LEAD") == 3
|
||
|