feat(text_clean): visualize hidden characters in the cleaner GUI
The whole point of the cleaner is to remove characters the user can't
see — which makes the "before / after" preview nearly useless by default.
A cell with NBSP padding looks identical to a cell with regular spaces.
Two new helpers in src.core.text_clean:
visualize_hidden_text(s)
Plain-text rendering: each invisible/control/smart character is
replaced by a glyph + [LABEL] (e.g. "·[NBSP]", "→[TAB]", "∅[ZWSP]",
'"[L DQUOTE]'). Suitable for terminal output, CSV exports, or anywhere
HTML is inappropriate. Unmapped C0 controls render as [U+XXXX].
visualize_hidden_html(s) + hidden_char_css()
HTML rendering: every flagged character is wrapped in a <span> with
a CSS class and a tooltip showing the codepoint and label. Pair with
hidden_char_css() to inject the matching styles. Three colour bands
(whitespace, special, control) so the user can scan an audit table
and spot what's being changed at a glance.
Mapping covers: ASCII tab/LF/CR, NBSP and the other special-width spaces
(U+00A0, U+202F, U+2009, …), the zero-width family (ZWSP/ZWNJ/ZWJ/WJ/BOM/SHY),
bidi marks (LRM/RLM), all smart quotes, en/em dashes, ellipsis,
prime/double-prime, and guillemets. ASCII printable text passes through
unchanged; the HTML output additionally escapes &, <, and >.
GUI wiring (src/gui/pages/2_Text_Cleaner.py)
The "Examples" changes table now defaults to a hidden-char-rendered
HTML view: every NBSP/ZWSP/smart-quote/control char is shown with its
badge and codepoint tooltip. A "Show hidden characters" toggle lets
the user fall back to the raw st.dataframe view if they prefer.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -78,6 +78,7 @@ from .text_clean import (
|
|||||||
clean_value,
|
clean_value,
|
||||||
collapse_whitespace,
|
collapse_whitespace,
|
||||||
fold_smart_chars,
|
fold_smart_chars,
|
||||||
|
hidden_char_css,
|
||||||
normalize_line_endings,
|
normalize_line_endings,
|
||||||
sentence_case,
|
sentence_case,
|
||||||
smart_title_case,
|
smart_title_case,
|
||||||
@@ -87,6 +88,8 @@ from .text_clean import (
|
|||||||
to_nfc,
|
to_nfc,
|
||||||
to_nfkc,
|
to_nfkc,
|
||||||
trim,
|
trim,
|
||||||
|
visualize_hidden_html,
|
||||||
|
visualize_hidden_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -146,4 +149,7 @@ __all__ = [
|
|||||||
"smart_title_case",
|
"smart_title_case",
|
||||||
"sentence_case",
|
"sentence_case",
|
||||||
"apply_case",
|
"apply_case",
|
||||||
|
"visualize_hidden_text",
|
||||||
|
"visualize_hidden_html",
|
||||||
|
"hidden_char_css",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -587,3 +587,174 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
|
|||||||
cells_total=cells_total,
|
cells_total=cells_total,
|
||||||
columns_processed=columns,
|
columns_processed=columns,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Hidden-character visualization
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# The whole point of the cleaner is to remove characters the user can't see.
|
||||||
|
# That makes the GUI's "before / after" view nearly useless by default — a
|
||||||
|
# cell with NBSP padding looks identical to a cell with regular spaces. The
|
||||||
|
# visualizers below render those characters with visible glyphs (and
|
||||||
|
# tooltips, in the HTML variant) so the user can actually verify what the
|
||||||
|
# cleaner is doing.
|
||||||
|
#
|
||||||
|
# Mapping rule: every character that's normally invisible OR that the
|
||||||
|
# cleaner treats specially gets a glyph + label. ASCII printable characters
|
||||||
|
# pass through untouched. Tabs and newlines are visualized because they're
|
||||||
|
# whitespace the user often wants to see.
|
||||||
|
|
||||||
|
# Unicode codepoint -> (visible glyph, short label). The label appears as a
|
||||||
|
# tooltip in the HTML variant and as ``[label]`` in the text variant.
|
||||||
|
# Keys are written as explicit ``\uXXXX`` escapes rather than raw literals:
# most of these characters are invisible (or look like an ordinary space) in
# an editor or diff, so a raw literal cannot be reviewed and is silently lost
# if any tool strips or normalizes it. Values are (visible glyph, short label).
_VISIBLE_CHAR_MAP: dict[str, tuple[str, str]] = {
    # Whitespace
    "\t": ("→", "TAB"),
    "\n": ("↵", "LF"),
    "\r": ("␍", "CR"),
    "\u00a0": ("·", "NBSP"),  # NO-BREAK SPACE
    "\u202f": ("·", "NNBSP"),  # NARROW NO-BREAK SPACE
    "\u2009": ("·", "THIN SP"),  # THIN SPACE
    "\u200a": ("·", "HAIR SP"),  # HAIR SPACE
    "\u2002": ("·", "EN SP"),  # EN SPACE
    "\u2003": ("·", "EM SP"),  # EM SPACE
    "\u3000": ("·", "IDEO SP"),  # IDEOGRAPHIC SPACE
    # Zero-width / invisible
    "\u200b": ("∅", "ZWSP"),  # ZERO WIDTH SPACE
    "\u200c": ("∅", "ZWNJ"),  # ZERO WIDTH NON-JOINER
    "\u200d": ("∅", "ZWJ"),  # ZERO WIDTH JOINER
    "\u2060": ("∅", "WJ"),  # WORD JOINER
    "\u200e": ("⮕", "LRM"),  # LEFT-TO-RIGHT MARK
    "\u200f": ("⬅", "RLM"),  # RIGHT-TO-LEFT MARK
    "\ufeff": ("∅", "BOM"),  # ZERO WIDTH NO-BREAK SPACE / byte-order mark
    "\u00ad": ("-", "SHY"),  # SOFT HYPHEN
    # Smart quotes / dashes / ellipsis (visible but the user often wants
    # them flagged because they're being folded)
    "\u2018": ("'", "L QUOTE"),  # ‘
    "\u2019": ("'", "R QUOTE"),  # ’
    "\u201c": ('"', "L DQUOTE"),  # “
    "\u201d": ('"', "R DQUOTE"),  # ”
    "\u2014": ("—", "EM DASH"),
    "\u2013": ("–", "EN DASH"),
    "\u2026": ("…", "ELLIPSIS"),
    "\u2032": ("′", "PRIME"),
    "\u2033": ("″", "DPRIME"),
    "\u00ab": ("«", "L GUILL"),
    "\u00bb": ("»", "R GUILL"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_control(ch: str) -> bool:
    """Tell whether *ch* is DEL or a C0 control other than tab, LF, or CR.

    Tab, line feed, and carriage return are excluded here; only the
    remaining code points below U+0020, plus U+007F, count as controls.
    """
    codepoint = ord(ch)
    is_del = codepoint == 0x7F
    is_other_c0 = codepoint < 0x20 and codepoint not in (0x09, 0x0A, 0x0D)
    return is_del or is_other_c0
|
||||||
|
|
||||||
|
|
||||||
|
def visualize_hidden_text(s: str) -> str:
    """Render *s* as plain text with hidden characters spelled out.

    A character found in ``_VISIBLE_CHAR_MAP`` becomes its glyph followed
    by ``[LABEL]``. A control character (per ``_is_control``) that is not
    in the map becomes ``[U+XXXX]``. Everything else is copied unchanged.

    Non-string input is returned as-is, without conversion.

    Intended for terminals, CSV exports, and other places where HTML
    markup would be out of place.
    """
    if not isinstance(s, str):
        return s

    def render(char: str) -> str:
        # Map lookup wins over the generic control-character fallback.
        entry = _VISIBLE_CHAR_MAP.get(char)
        if entry is not None:
            symbol, tag = entry
            return f"{symbol}[{tag}]"
        if _is_control(char):
            return f"[U+{ord(char):04X}]"
        return char

    return "".join(render(char) for char in s)
|
||||||
|
|
||||||
|
|
||||||
|
def visualize_hidden_html(s: str) -> str:
    """Return an HTML rendering of *s* with hidden characters highlighted.

    Each character found in ``_VISIBLE_CHAR_MAP`` is wrapped in a
    ``<span class="hidden-char ...">`` whose ``title`` attribute holds the
    codepoint and label, so the user gets a tooltip on hover. Control
    characters (per ``_is_control``) that are not in the map are rendered
    as a ``hidden-control`` badge. All remaining text is emitted as-is
    except that ``&``, ``<`` and ``>`` are replaced by their HTML entities,
    so cell content cannot inject markup when the result is rendered with
    ``unsafe_allow_html=True``.

    Non-string input yields an empty string.

    Pair with :func:`hidden_char_css` to inject the matching styles into
    the page.
    """
    if not isinstance(s, str):
        return ""

    # The three characters that are unsafe inside HTML element content.
    entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}

    parts: list[str] = []
    for ch in s:
        mapped = _VISIBLE_CHAR_MAP.get(ch)
        if mapped is not None:
            glyph, label = mapped
            cp = f"U+{ord(ch):04X}"
            # Labels ending in "SP" (this includes "NBSP" and "ZWSP") and
            # the ASCII TAB/LF/CR get the whitespace band; every other
            # mapped character gets the "special" band.
            is_whitespace = label.endswith("SP") or label in ("TAB", "LF", "CR")
            css = "hidden-whitespace" if is_whitespace else "hidden-special"
            parts.append(
                f'<span class="hidden-char {css}" '
                f'title="{cp} {label}">{glyph}</span>'
            )
        elif _is_control(ch):
            cp = f"U+{ord(ch):04X}"
            parts.append(
                f'<span class="hidden-char hidden-control" '
                f'title="{cp} CTRL">␣</span>'
            )
        else:
            # Escape only the dangerous characters; preserve everything
            # else so the visible content reads naturally.
            parts.append(entities.get(ch, ch))
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def hidden_char_css() -> str:
    """Return the ``<style>`` block that styles hidden-character badges.

    The rules target the ``hidden-char`` class and its three variants
    (``hidden-whitespace``, ``hidden-special``, ``hidden-control``) that
    :func:`visualize_hidden_html` puts on its ``<span>`` elements.

    The result is a plain string, so a Streamlit page can inject it with
    ``st.markdown(hidden_char_css(), unsafe_allow_html=True)``.
    """
    stylesheet = """
<style>
.hidden-char {
display: inline-block;
padding: 0 2px;
margin: 0 1px;
border-radius: 3px;
font-family: ui-monospace, SFMono-Regular, monospace;
font-size: 0.85em;
cursor: help;
}
.hidden-char.hidden-whitespace {
background: #fff3cd;
color: #856404;
border: 1px solid #ffeaa7;
}
.hidden-char.hidden-special {
background: #d1ecf1;
color: #0c5460;
border: 1px solid #bee5eb;
}
.hidden-char.hidden-control {
background: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
}
</style>
"""
    return stylesheet
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ from src.core.text_clean import (
|
|||||||
PRESETS,
|
PRESETS,
|
||||||
CleanOptions,
|
CleanOptions,
|
||||||
clean_dataframe,
|
clean_dataframe,
|
||||||
|
hidden_char_css,
|
||||||
|
visualize_hidden_html,
|
||||||
)
|
)
|
||||||
|
|
||||||
hide_streamlit_chrome()
|
hide_streamlit_chrome()
|
||||||
@@ -205,8 +207,53 @@ if result.cells_changed:
|
|||||||
)
|
)
|
||||||
|
|
||||||
st.markdown("**Examples (first 25 changes)**")
|
st.markdown("**Examples (first 25 changes)**")
|
||||||
|
show_hidden = st.toggle(
|
||||||
|
"Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
|
||||||
|
value=True,
|
||||||
|
help=(
|
||||||
|
"Highlights characters the cleaner is removing or replacing. "
|
||||||
|
"Hover any badge to see the codepoint and label."
|
||||||
|
),
|
||||||
|
key="textclean_show_hidden",
|
||||||
|
)
|
||||||
examples = result.changes.head(25).copy()
|
examples = result.changes.head(25).copy()
|
||||||
examples["row"] = examples["row"] + 1
|
examples["row"] = examples["row"] + 1
|
||||||
|
if show_hidden:
|
||||||
|
# Inject the badge CSS once, then render an HTML table so the
|
||||||
|
# invisibles in old/new are actually visible to the user.
|
||||||
|
st.markdown(hidden_char_css(), unsafe_allow_html=True)
|
||||||
|
rows_html = []
|
||||||
|
for _, row in examples.iterrows():
|
||||||
|
rows_html.append(
|
||||||
|
"<tr>"
|
||||||
|
f"<td>{row['row']}</td>"
|
||||||
|
f"<td><code>{visualize_hidden_html(str(row['column']))}</code></td>"
|
||||||
|
f"<td>{visualize_hidden_html(str(row['old']))}</td>"
|
||||||
|
f"<td>{visualize_hidden_html(str(row['new']))}</td>"
|
||||||
|
f"<td><code>{row['ops_applied']}</code></td>"
|
||||||
|
"</tr>"
|
||||||
|
)
|
||||||
|
st.markdown(
|
||||||
|
"<table class='hidden-char-table'>"
|
||||||
|
"<thead><tr>"
|
||||||
|
"<th style='text-align:left'>Row</th>"
|
||||||
|
"<th style='text-align:left'>Column</th>"
|
||||||
|
"<th style='text-align:left'>Before</th>"
|
||||||
|
"<th style='text-align:left'>After</th>"
|
||||||
|
"<th style='text-align:left'>Ops applied</th>"
|
||||||
|
"</tr></thead>"
|
||||||
|
f"<tbody>{''.join(rows_html)}</tbody>"
|
||||||
|
"</table>"
|
||||||
|
"<style>"
|
||||||
|
".hidden-char-table { width: 100%; border-collapse: collapse; }"
|
||||||
|
".hidden-char-table th, .hidden-char-table td { "
|
||||||
|
" padding: 4px 8px; border-bottom: 1px solid #eee; "
|
||||||
|
" vertical-align: top; }"
|
||||||
|
".hidden-char-table tbody tr:hover { background: #fafafa; }"
|
||||||
|
"</style>",
|
||||||
|
unsafe_allow_html=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
st.dataframe(examples, use_container_width=True, hide_index=True)
|
st.dataframe(examples, use_container_width=True, hide_index=True)
|
||||||
|
|
||||||
st.markdown("**Cleaned preview (first 10 rows)**")
|
st.markdown("**Cleaned preview (first 10 rows)**")
|
||||||
|
|||||||
@@ -480,3 +480,62 @@ class TestReporting:
|
|||||||
df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
|
df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
|
||||||
result = clean_dataframe(df)
|
result = clean_dataframe(df)
|
||||||
assert result.cells_total == 3 # only "a" is processed
|
assert result.cells_total == 3 # only "a" is processed
|
||||||
|
|
||||||
|
|
||||||
|
class TestVisualizeHidden:
    """``visualize_hidden_*`` makes invisible characters visible to the user."""

    def test_text_passes_ascii_through(self):
        from src.core.text_clean import visualize_hidden_text

        assert visualize_hidden_text("hello") == "hello"

    def test_text_labels_nbsp(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("Hi\u00a0there")
        assert "[NBSP]" in out

    def test_text_labels_zwsp(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\u200bb")
        assert "[ZWSP]" in out

    def test_text_labels_tab_and_newline(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\tb\nc")
        assert "[TAB]" in out
        assert "[LF]" in out

    def test_text_labels_smart_quotes(self):
        from src.core.text_clean import visualize_hidden_text

        # Escapes instead of literal curly quotes so the input survives
        # editors and tools that "straighten" quotes.
        out = visualize_hidden_text("\u201chi\u201d")
        assert "[L DQUOTE]" in out and "[R DQUOTE]" in out

    def test_text_labels_unmapped_control_with_codepoint(self):
        from src.core.text_clean import visualize_hidden_text

        out = visualize_hidden_text("a\x07b")  # BEL
        assert "[U+0007]" in out

    def test_html_wraps_invisibles_in_span(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("Hi\u00a0\u200bthere")
        assert '<span class="hidden-char' in out
        assert "U+00A0" in out and "U+200B" in out

    def test_html_escapes_dangerous_chars(self):
        from src.core.text_clean import visualize_hidden_html

        out = visualize_hidden_html("<a&b>")
        # Pin the exact entities: merely checking that "<", "&" and ">"
        # occur in the output would also pass with no escaping at all.
        assert out == "&lt;a&amp;b&gt;"
        assert "<" not in out and ">" not in out

    def test_html_passes_normal_text_through(self):
        from src.core.text_clean import visualize_hidden_html

        assert visualize_hidden_html("plain") == "plain"

    def test_css_returns_a_style_block(self):
        from src.core.text_clean import hidden_char_css

        css = hidden_char_css()
        assert "<style>" in css and "hidden-char" in css

    def test_non_string_passthrough(self):
        from src.core.text_clean import visualize_hidden_html, visualize_hidden_text

        # The text variant hands non-strings back untouched; the HTML
        # variant returns an empty string instead.
        assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
        assert visualize_hidden_html(None) == ""
|
||||||
|
|||||||
Reference in New Issue
Block a user