feat(text_clean): visualize hidden characters in the cleaner GUI

The whole point of the cleaner is to remove characters the user can't
see — which makes the "before / after" preview nearly useless by default.
A cell with NBSP padding looks identical to a cell with regular spaces.

Two new helpers in src.core.text_clean:

  visualize_hidden_text(s)
    Plain-text rendering: each invisible/control/smart character is
    replaced by a glyph + [LABEL] (e.g. "·[NBSP]", "→[TAB]", "∅[ZWSP]",
    and '"[L DQUOTE]'). Suitable for terminal output, CSV exports, anywhere
    HTML is wrong. Unmapped C0 controls render as [U+XXXX].

  visualize_hidden_html(s) + hidden_char_css()
    HTML rendering: every flagged character is wrapped in a <span> with
    a CSS class and a tooltip showing the codepoint and label. Pair with
    hidden_char_css() to inject the matching styles. Three colour bands
    (whitespace, special, control) so the user can scan an audit table
    and spot what's being changed at a glance.

Mapping covers: ASCII tab/LF/CR, every NBSP variant (U+00A0, U+202F,
U+2009, …), zero-width family (ZWSP/ZWNJ/ZWJ/WJ/BOM/SHY), bidi marks
(LRM/RLM), all smart quotes, en/em dashes, ellipsis, prime/double-prime,
and guillemets. ASCII printable text passes through; HTML output also
escapes &, <, and >.

GUI wiring (src/gui/pages/2_Text_Cleaner.py)
  The "Examples" changes table now defaults to a hidden-char-rendered
  HTML view: every NBSP/ZWSP/smart-quote/control char is shown with its
  badge and codepoint tooltip. A "Show hidden characters" toggle lets
  the user fall back to the raw st.dataframe view if they prefer.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 16:14:14 +00:00
parent 794d4cda94
commit 90ceada2d1
4 changed files with 284 additions and 1 deletions

View File

@@ -78,6 +78,7 @@ from .text_clean import (
clean_value,
collapse_whitespace,
fold_smart_chars,
hidden_char_css,
normalize_line_endings,
sentence_case,
smart_title_case,
@@ -87,6 +88,8 @@ from .text_clean import (
to_nfc,
to_nfkc,
trim,
visualize_hidden_html,
visualize_hidden_text,
)
__all__ = [
@@ -146,4 +149,7 @@ __all__ = [
"smart_title_case",
"sentence_case",
"apply_case",
"visualize_hidden_text",
"visualize_hidden_html",
"hidden_char_css",
]

View File

@@ -587,3 +587,174 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
cells_total=cells_total,
columns_processed=columns,
)
# ---------------------------------------------------------------------------
# Hidden-character visualization
# ---------------------------------------------------------------------------
#
# The whole point of the cleaner is to remove characters the user can't see.
# That makes the GUI's "before / after" view nearly useless by default — a
# cell with NBSP padding looks identical to a cell with regular spaces. The
# visualizers below render those characters with visible glyphs (and
# tooltips, in the HTML variant) so the user can actually verify what the
# cleaner is doing.
#
# Mapping rule: every character that's normally invisible OR that the
# cleaner treats specially gets a glyph + label. ASCII printable characters
# pass through untouched. Tabs and newlines are visualized because they're
# whitespace the user often wants to see.
# Unicode codepoint -> (visible glyph, short label). The label appears as a
# tooltip in the HTML variant and as ``[label]`` in the text variant.
_VISIBLE_CHAR_MAP: dict[str, tuple[str, str]] = {
# Whitespace
"\t": ("", "TAB"),
"\n": ("", "LF"),
"\r": ("", "CR"),
" ": ("·", "NBSP"),
"": ("·", "NNBSP"),
"": ("·", "THIN SP"),
"": ("·", "HAIR SP"),
"": ("·", "EN SP"),
"": ("·", "EM SP"),
" ": ("·", "IDEO SP"),
# Zero-width / invisible
"": ("", "ZWSP"),
"": ("", "ZWNJ"),
"": ("", "ZWJ"),
"": ("", "WJ"),
"": ("", "LRM"),
"": ("", "RLM"),
"": ("", "BOM"),
"­": ("-", "SHY"),
# Smart quotes / dashes / ellipsis (visible but the user often wants
# them flagged because they're being folded)
"": ("'", "L QUOTE"),
"": ("'", "R QUOTE"),
"": ('"', "L DQUOTE"),
"": ('"', "R DQUOTE"),
"": ("", "EM DASH"),
"": ("", "EN DASH"),
"": ("", "ELLIPSIS"),
"": ("", "PRIME"),
"": ("", "DPRIME"),
"«": ("«", "L GUILL"),
"»": ("»", "R GUILL"),
}
def _is_control(ch: str) -> bool:
"""C0 control char (other than \\t \\n \\r) or DEL."""
cp = ord(ch)
if cp == 0x7F:
return True
if cp < 0x20 and ch not in ("\t", "\n", "\r"):
return True
return False
def visualize_hidden_text(s: str) -> str:
    """Render *s* as plain text with every hidden character spelled out.

    A character found in ``_VISIBLE_CHAR_MAP`` becomes its glyph followed
    by ``[LABEL]``. A character for which ``_is_control`` is true and that
    has no map entry becomes ``[U+XXXX]`` (four or more uppercase hex
    digits). Any other character is copied unchanged.

    A non-string argument is returned as-is rather than raising.

    Intended for terminals, CSV exports and other places where HTML
    markup is not appropriate.
    """
    if not isinstance(s, str):
        return s

    def _render(char: str) -> str:
        entry = _VISIBLE_CHAR_MAP.get(char)
        if entry is not None:
            return f"{entry[0]}[{entry[1]}]"
        if _is_control(char):
            return f"[U+{ord(char):04X}]"
        return char

    return "".join(_render(char) for char in s)
def visualize_hidden_html(s: str) -> str:
    """Render *s* as an HTML fragment with hidden characters shown as badges.

    A character found in ``_VISIBLE_CHAR_MAP`` is replaced by its glyph
    inside ``<span class="hidden-char ...">`` whose ``title`` holds the
    codepoint and label (the hover tooltip). The second class is
    ``hidden-whitespace`` when the label is TAB, LF, CR or ends in "SP",
    and ``hidden-special`` otherwise. A character for which
    ``_is_control`` is true and that has no map entry is shown as an
    open-box glyph in a ``hidden-control`` span labelled ``CTRL``.

    Every other character is emitted unchanged, except ``&``, ``<`` and
    ``>``, which are replaced by their HTML entities.

    A non-string argument yields an empty string. Combine with
    :func:`hidden_char_css` to style the badges.
    """
    if not isinstance(s, str):
        return ""

    # Only the characters that can break markup are escaped, so that the
    # visible text still reads naturally.
    entities = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}
    whitespace_labels = ("TAB", "LF", "CR", "NBSP")

    chunks: list[str] = []
    for char in s:
        entry = _VISIBLE_CHAR_MAP.get(char)
        if entry is not None:
            glyph, label = entry
            is_whitespace = label in whitespace_labels or label.endswith("SP")
            band = "hidden-whitespace" if is_whitespace else "hidden-special"
            chunks.append(
                f'<span class="hidden-char {band}" '
                f'title="U+{ord(char):04X} {label}">{glyph}</span>'
            )
            continue
        if _is_control(char):
            chunks.append(
                '<span class="hidden-char hidden-control" '
                f'title="U+{ord(char):04X} CTRL">␣</span>'
            )
            continue
        chunks.append(entities.get(char, char))
    return "".join(chunks)
def hidden_char_css() -> str:
    """Return the ``<style>`` block that styles the hidden-character badges.

    The rules target the classes emitted by :func:`visualize_hidden_html`:
    a shared ``hidden-char`` base plus one colour band each for
    ``hidden-whitespace``, ``hidden-special`` and ``hidden-control``.

    The result is a plain string, so it can be injected into a Streamlit
    page with ``st.markdown(hidden_char_css(), unsafe_allow_html=True)``.
    """
    stylesheet = """
<style>
.hidden-char {
display: inline-block;
padding: 0 2px;
margin: 0 1px;
border-radius: 3px;
font-family: ui-monospace, SFMono-Regular, monospace;
font-size: 0.85em;
cursor: help;
}
.hidden-char.hidden-whitespace {
background: #fff3cd;
color: #856404;
border: 1px solid #ffeaa7;
}
.hidden-char.hidden-special {
background: #d1ecf1;
color: #0c5460;
border: 1px solid #bee5eb;
}
.hidden-char.hidden-control {
background: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
}
</style>
"""
    return stylesheet

View File

@@ -19,6 +19,8 @@ from src.core.text_clean import (
PRESETS,
CleanOptions,
clean_dataframe,
hidden_char_css,
visualize_hidden_html,
)
hide_streamlit_chrome()
@@ -205,9 +207,54 @@ if result.cells_changed:
)
# --- Examples table -------------------------------------------------------
# Shows the first 25 recorded changes. Exactly one view is rendered: the
# hidden-character HTML table when the toggle is on, or the raw dataframe
# when it is off.
st.markdown("**Examples (first 25 changes)**")
show_hidden = st.toggle(
    "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
    value=True,
    help=(
        "Highlights characters the cleaner is removing or replacing. "
        "Hover any badge to see the codepoint and label."
    ),
    key="textclean_show_hidden",
)
examples = result.changes.head(25).copy()
# Presumably converts a 0-based row index to a 1-based number for display;
# verify against how ``result.changes`` is built.
examples["row"] = examples["row"] + 1
if show_hidden:
    # Inject the badge CSS, then render an HTML table so the invisibles in
    # old/new are actually visible to the user.
    st.markdown(hidden_char_css(), unsafe_allow_html=True)
    # Every text cell goes through visualize_hidden_html, which also escapes
    # &, < and >, so cell content cannot inject markup into the table.
    rows_html = [
        "<tr>"
        f"<td>{row['row']}</td>"
        f"<td><code>{visualize_hidden_html(str(row['column']))}</code></td>"
        f"<td>{visualize_hidden_html(str(row['old']))}</td>"
        f"<td>{visualize_hidden_html(str(row['new']))}</td>"
        f"<td><code>{visualize_hidden_html(str(row['ops_applied']))}</code></td>"
        "</tr>"
        for _, row in examples.iterrows()
    ]
    st.markdown(
        "<table class='hidden-char-table'>"
        "<thead><tr>"
        "<th style='text-align:left'>Row</th>"
        "<th style='text-align:left'>Column</th>"
        "<th style='text-align:left'>Before</th>"
        "<th style='text-align:left'>After</th>"
        "<th style='text-align:left'>Ops applied</th>"
        "</tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody>"
        "</table>"
        "<style>"
        ".hidden-char-table { width: 100%; border-collapse: collapse; }"
        ".hidden-char-table th, .hidden-char-table td { "
        "  padding: 4px 8px; border-bottom: 1px solid #eee; "
        "  vertical-align: top; }"
        # Without pre-wrap the browser collapses runs of ordinary spaces and
        # drops leading/trailing ones, hiding whitespace-only differences
        # between the Before and After cells.
        ".hidden-char-table td { white-space: pre-wrap; }"
        ".hidden-char-table tbody tr:hover { background: #fafafa; }"
        "</style>",
        unsafe_allow_html=True,
    )
else:
    st.dataframe(examples, use_container_width=True, hide_index=True)
st.markdown("**Cleaned preview (first 10 rows)**")
st.dataframe(result.cleaned_df.head(10), use_container_width=True)

View File

@@ -480,3 +480,62 @@ class TestReporting:
df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
result = clean_dataframe(df)
assert result.cells_total == 3 # only "a" is processed
class TestVisualizeHidden:
    """The ``visualize_hidden_*`` helpers must expose invisible characters."""

    def test_text_passes_ascii_through(self):
        """Plain ASCII is returned unchanged by the text renderer."""
        from src.core.text_clean import visualize_hidden_text

        plain = "hello"
        assert visualize_hidden_text(plain) == plain

    def test_text_labels_nbsp(self):
        """A no-break space is tagged with its label."""
        from src.core.text_clean import visualize_hidden_text

        rendered = visualize_hidden_text("Hi\u00a0there")
        assert "[NBSP]" in rendered

    def test_text_labels_zwsp(self):
        """A zero-width space is tagged with its label."""
        from src.core.text_clean import visualize_hidden_text

        rendered = visualize_hidden_text("a\u200bb")
        assert "[ZWSP]" in rendered

    def test_text_labels_tab_and_newline(self):
        """Tab and line feed are both tagged."""
        from src.core.text_clean import visualize_hidden_text

        rendered = visualize_hidden_text("a\tb\nc")
        for tag in ("[TAB]", "[LF]"):
            assert tag in rendered

    def test_text_labels_smart_quotes(self):
        """Left and right curly double quotes are both tagged."""
        from src.core.text_clean import visualize_hidden_text

        rendered = visualize_hidden_text("\u201chi\u201d")
        assert "[L DQUOTE]" in rendered
        assert "[R DQUOTE]" in rendered

    def test_text_labels_unmapped_control_with_codepoint(self):
        """A control character without a map entry shows its codepoint."""
        from src.core.text_clean import visualize_hidden_text

        rendered = visualize_hidden_text("a\x07b")  # BEL
        assert "[U+0007]" in rendered

    def test_html_wraps_invisibles_in_span(self):
        """Hidden characters become badge spans carrying their codepoints."""
        from src.core.text_clean import visualize_hidden_html

        markup = visualize_hidden_html("Hi\u00a0\u200bthere")
        assert '<span class="hidden-char' in markup
        assert "U+00A0" in markup
        assert "U+200B" in markup

    def test_html_escapes_dangerous_chars(self):
        """``<``, ``&`` and ``>`` are emitted as entities."""
        from src.core.text_clean import visualize_hidden_html

        markup = visualize_hidden_html("<a&b>")
        for entity in ("&lt;", "&amp;", "&gt;"):
            assert entity in markup

    def test_html_passes_normal_text_through(self):
        """Ordinary text needs no markup."""
        from src.core.text_clean import visualize_hidden_html

        plain = "plain"
        assert visualize_hidden_html(plain) == plain

    def test_css_returns_a_style_block(self):
        """The stylesheet helper returns a ``<style>`` block for the badges."""
        from src.core.text_clean import hidden_char_css

        stylesheet = hidden_char_css()
        assert "<style>" in stylesheet
        assert "hidden-char" in stylesheet

    def test_non_string_passthrough(self):
        """Non-strings: text renderer returns the input, HTML renderer ``""``."""
        from src.core.text_clean import visualize_hidden_html, visualize_hidden_text

        assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
        assert visualize_hidden_html(None) == ""