feat(text_clean): visualize hidden characters in the cleaner GUI
The whole point of the cleaner is to remove characters the user can't
see — which makes the "before / after" preview nearly useless by default.
A cell with NBSP padding looks identical to a cell with regular spaces.
Two new helpers in src.core.text_clean:
visualize_hidden_text(s)
Plain-text rendering: each invisible/control/smart character is
replaced by a glyph + [LABEL] (e.g. "·[NBSP]", "→[TAB]", "∅[ZWSP]",
"""[L DQUOTE]"). Suitable for terminal output, CSV exports, or anywhere
HTML is not appropriate. Unmapped C0 controls render as [U+XXXX].
visualize_hidden_html(s) + hidden_char_css()
HTML rendering: every flagged character is wrapped in a <span> with
a CSS class and a tooltip showing the codepoint and label. Pair with
hidden_char_css() to inject the matching styles. Three colour bands
(whitespace, special, control) so the user can scan an audit table
and spot what's being changed at a glance.
Mapping covers: ASCII tab/LF/CR, every NBSP variant (U+00A0, U+202F,
U+2009, …), zero-width family (ZWSP/ZWNJ/ZWJ/WJ/BOM/SHY), bidi marks
(LRM/RLM), all smart quotes, en/em dashes, ellipsis, prime/double-prime,
and guillemets. ASCII printable text passes through; HTML output also
escapes the characters &, <, and >.
GUI wiring (src/gui/pages/2_Text_Cleaner.py)
The "Examples" changes table now defaults to a hidden-char-rendered
HTML view: every NBSP/ZWSP/smart-quote/control char is shown with its
badge and codepoint tooltip. A "Show hidden characters" toggle lets
the user fall back to the raw st.dataframe view if they prefer.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -480,3 +480,62 @@ class TestReporting:
|
||||
df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
|
||||
result = clean_dataframe(df)
|
||||
assert result.cells_total == 3 # only "a" is processed
|
||||
|
||||
|
||||
class TestVisualizeHidden:
    """``visualize_hidden_*`` makes invisible characters visible to the user."""

    def test_text_passes_ascii_through(self):
        """Plain ASCII input is returned unchanged by the text renderer."""
        from src.core.text_clean import visualize_hidden_text

        assert visualize_hidden_text("hello") == "hello"

    def test_text_labels_nbsp(self):
        """A no-break space (U+00A0) is rendered with an ``[NBSP]`` label."""
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("Hi\u00a0there")
        assert "[NBSP]" in out

    def test_text_labels_zwsp(self):
        """A zero-width space (U+200B) is rendered with a ``[ZWSP]`` label."""
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\u200bb")
        assert "[ZWSP]" in out

    def test_text_labels_tab_and_newline(self):
        """Tab and line feed are rendered with ``[TAB]`` and ``[LF]`` labels."""
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\tb\nc")
        assert "[TAB]" in out
        assert "[LF]" in out

    def test_text_labels_smart_quotes(self):
        """Curly double quotes are labelled as left/right double quotes."""
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("“hi”")
        assert "[L DQUOTE]" in out and "[R DQUOTE]" in out

    def test_text_labels_unmapped_control_with_codepoint(self):
        """A control character with no named label falls back to ``[U+XXXX]``."""
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\x07b")  # BEL
        assert "[U+0007]" in out

    def test_html_wraps_invisibles_in_span(self):
        """The HTML renderer wraps flagged characters in a ``hidden-char`` span."""
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("Hi\u00a0\u200bthere")
        assert '<span class="hidden-char' in out
        # The codepoint of each flagged character must appear in the markup.
        assert "U+00A0" in out and "U+200B" in out

    def test_html_escapes_dangerous_chars(self):
        """HTML metacharacters in the input come out as entities, not raw."""
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("<a&b>")
        # Check for the escaped entities. Asserting on the raw characters
        # would contradict the escaping this test is meant to verify (and a
        # bare "&" would match inside any entity, proving nothing).
        assert "&lt;" in out and "&amp;" in out and "&gt;" in out

    def test_html_passes_normal_text_through(self):
        """Text with nothing to flag or escape is returned unchanged."""
        from src.core.text_clean import visualize_hidden_html

        assert visualize_hidden_html("plain") == "plain"

    def test_css_returns_a_style_block(self):
        """``hidden_char_css`` returns a ``<style>`` block for ``hidden-char``."""
        from src.core.text_clean import hidden_char_css

        css = hidden_char_css()
        assert "<style>" in css and "hidden-char" in css

    def test_non_string_passthrough(self):
        """``None`` passes through the text renderer; the HTML one yields ``""``."""
        from src.core.text_clean import visualize_hidden_text, visualize_hidden_html

        assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
        assert visualize_hidden_html(None) == ""
|
||||
|
||||
Reference in New Issue
Block a user