feat(text_clean): visualize hidden characters in the cleaner GUI

The whole point of the cleaner is to remove characters the user can't see — which makes the "before / after" preview nearly useless by default. A cell with NBSP padding looks identical to a cell with regular spaces.

New helpers in src.core.text_clean:

visualize_hidden_text(s)
    Plain-text rendering: each invisible/control/smart character is replaced by a glyph + [LABEL] (e.g. `·[NBSP]`, `→[TAB]`, `∅[ZWSP]`, `"[L DQUOTE]`). Suitable for terminal output, CSV exports, or anywhere HTML is wrong. Unmapped C0 controls (and DEL) render as [U+XXXX].

visualize_hidden_html(s) + hidden_char_css()
    HTML rendering: every flagged character is wrapped in a <span> with a CSS class and a tooltip showing the codepoint and label. Pair with hidden_char_css() to inject the matching styles. Three colour bands (whitespace, special, control) so the user can scan an audit table and spot what's being changed at a glance.

Mapping covers: ASCII tab/LF/CR, every NBSP variant (U+00A0, U+202F, U+2009, …), zero-width family (ZWSP/ZWNJ/ZWJ/WJ/BOM/SHY), bidi marks (LRM/RLM), curly single/double quotes, en/em dashes, ellipsis, prime/double-prime, and guillemets. ASCII printable text passes through; HTML output also escapes &, < and >.

GUI wiring (src/gui/pages/2_Text_Cleaner.py)
    The "Examples" changes table now defaults to a hidden-char-rendered HTML view: every NBSP/ZWSP/smart-quote/control char is shown with its badge and codepoint tooltip. A "Show hidden characters" toggle lets the user fall back to the raw st.dataframe view if they prefer.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 16:14:14 +00:00
parent 794d4cda94
commit 90ceada2d1
4 changed files with 284 additions and 1 deletions
--- a/src/core/__init__.py
+++ b/src/core/__init__.py
@@ -78,6 +78,7 @@ from .text_clean import (
    clean_value,
    collapse_whitespace,
    fold_smart_chars,
+    hidden_char_css,
    normalize_line_endings,
    sentence_case,
    smart_title_case,
@@ -87,6 +88,8 @@ from .text_clean import (
    to_nfc,
    to_nfkc,
    trim,
+    visualize_hidden_html,
+    visualize_hidden_text,
 )

 __all__ = [
@@ -146,4 +149,7 @@ __all__ = [
    "smart_title_case",
    "sentence_case",
    "apply_case",
+    "visualize_hidden_text",
+    "visualize_hidden_html",
+    "hidden_char_css",
 ]
--- a/src/core/text_clean.py
+++ b/src/core/text_clean.py
@@ -587,3 +587,174 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
        cells_total=cells_total,
        columns_processed=columns,
    )
+
+
+# ---------------------------------------------------------------------------
+# Hidden-character visualization
+# ---------------------------------------------------------------------------
+#
+# The whole point of the cleaner is to remove characters the user can't see.
+# That makes the GUI's "before / after" view nearly useless by default — a
+# cell with NBSP padding looks identical to a cell with regular spaces. The
+# visualizers below render those characters with visible glyphs (and
+# tooltips, in the HTML variant) so the user can actually verify what the
+# cleaner is doing.
+#
+# Mapping rule: every character that's normally invisible OR that the
+# cleaner treats specially gets a glyph + label. ASCII printable characters
+# pass through untouched. Tabs and newlines are visualized because they're
+# whitespace the user often wants to see.
+
+# Character -> (visible glyph, short label). The label appears as a
+# tooltip in the HTML variant and as ``[label]`` in the text variant.
+#
+# Every non-ASCII key is written as a ``\uXXXX`` escape rather than as a
+# literal character: most of these characters are invisible (or look the
+# same as one another) in an editor or a diff, so a literal key cannot be
+# reviewed and is easily dropped or rewritten by tooling that normalizes
+# whitespace. Glyphs stay literal because they are meant to be seen.
+_VISIBLE_CHAR_MAP: dict[str, tuple[str, str]] = {
+    # Whitespace
+    "\t":     ("→",  "TAB"),
+    "\n":     ("↵",  "LF"),
+    "\r":     ("␍",  "CR"),
+    "\u00a0": ("·",  "NBSP"),      # NO-BREAK SPACE
+    "\u202f": ("·",  "NNBSP"),     # NARROW NO-BREAK SPACE
+    "\u2009": ("·",  "THIN SP"),   # THIN SPACE
+    "\u200a": ("·",  "HAIR SP"),   # HAIR SPACE
+    "\u2002": ("·",  "EN SP"),     # EN SPACE
+    "\u2003": ("·",  "EM SP"),     # EM SPACE
+    "\u3000": ("·",  "IDEO SP"),   # IDEOGRAPHIC SPACE
+    # Zero-width / invisible
+    "\u200b": ("∅",  "ZWSP"),      # ZERO WIDTH SPACE
+    "\u200c": ("∅",  "ZWNJ"),      # ZERO WIDTH NON-JOINER
+    "\u200d": ("∅",  "ZWJ"),       # ZERO WIDTH JOINER
+    "\u2060": ("∅",  "WJ"),        # WORD JOINER
+    "\u200e": ("⮕",  "LRM"),       # LEFT-TO-RIGHT MARK
+    "\u200f": ("⬅",  "RLM"),       # RIGHT-TO-LEFT MARK
+    "\ufeff": ("∅",  "BOM"),       # ZERO WIDTH NO-BREAK SPACE (byte order mark)
+    "\u00ad": ("-",  "SHY"),       # SOFT HYPHEN
+    # Smart quotes / dashes / ellipsis (visible but the user often wants
+    # them flagged because they're being folded)
+    "\u2018": ("'",  "L QUOTE"),   # LEFT SINGLE QUOTATION MARK
+    "\u2019": ("'",  "R QUOTE"),   # RIGHT SINGLE QUOTATION MARK
+    "\u201c": ('"',  "L DQUOTE"),  # LEFT DOUBLE QUOTATION MARK
+    "\u201d": ('"',  "R DQUOTE"),  # RIGHT DOUBLE QUOTATION MARK
+    "\u2014": ("—",  "EM DASH"),   # EM DASH
+    "\u2013": ("–",  "EN DASH"),   # EN DASH
+    "\u2026": ("…",  "ELLIPSIS"),  # HORIZONTAL ELLIPSIS
+    "\u2032": ("′",  "PRIME"),     # PRIME
+    "\u2033": ("″",  "DPRIME"),    # DOUBLE PRIME
+    "\u00ab": ("«",  "L GUILL"),   # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    "\u00bb": ("»",  "R GUILL"),   # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+}
+
+
+def _is_control(ch: str) -> bool:
+    """Return True for DEL and for C0 controls other than tab, LF and CR.
+
+    *ch* must be a single character; its code point is tested directly.
+    """
+    codepoint = ord(ch)
+    # 0x09 / 0x0A / 0x0D are tab, line feed and carriage return.
+    is_c0 = codepoint < 0x20 and codepoint not in (0x09, 0x0A, 0x0D)
+    return is_c0 or codepoint == 0x7F
+
+
+def visualize_hidden_text(s: str) -> str:
+    """Render *s* as plain text with its hidden characters spelled out.
+
+    Characters found in ``_VISIBLE_CHAR_MAP`` become their glyph followed
+    by ``[LABEL]``. Control characters that are not in the map (as decided
+    by ``_is_control``) become ``[U+XXXX]``. Everything else is copied
+    through unchanged.
+
+    A value that is not a ``str`` is returned as-is, so the function can be
+    applied to cells of mixed type.
+
+    Intended for terminals, CSV exports and other places where HTML
+    markup would be out of place.
+    """
+    if not isinstance(s, str):
+        return s
+
+    def render(char: str) -> str:
+        entry = _VISIBLE_CHAR_MAP.get(char)
+        if entry is not None:
+            return f"{entry[0]}[{entry[1]}]"
+        if _is_control(char):
+            return f"[U+{ord(char):04X}]"
+        return char
+
+    return "".join(map(render, s))
+
+
+def visualize_hidden_html(s: str) -> str:
+    """Render *s* as an HTML fragment with hidden characters badged.
+
+    Characters found in ``_VISIBLE_CHAR_MAP`` are emitted as a ``<span>``
+    containing the glyph, with a ``title`` of ``U+XXXX LABEL`` for the hover
+    tooltip. The span is classed ``hidden-whitespace`` when the label is
+    TAB, LF, CR or ends in ``SP``, and ``hidden-special`` otherwise.
+    Unmapped control characters get a ``hidden-control`` span.
+
+    All remaining characters are copied through, with ``&``, ``<`` and
+    ``>`` replaced by their entities. Quotes are not escaped, so the
+    result is meant for element content, not for attribute values.
+
+    A value that is not a ``str`` yields an empty string.
+
+    Pair with :func:`hidden_char_css` to style the spans.
+    """
+    if not isinstance(s, str):
+        return ""
+
+    entity_for = {"&": "&amp;", "<": "&lt;", ">": "&gt;"}
+
+    def render(char: str) -> str:
+        entry = _VISIBLE_CHAR_MAP.get(char)
+        if entry is None:
+            if _is_control(char):
+                return (
+                    '<span class="hidden-char hidden-control" '
+                    f'title="U+{ord(char):04X} CTRL">␣</span>'
+                )
+            # Ordinary text: escape only the markup-significant characters.
+            return entity_for.get(char, char)
+        glyph, label = entry
+        is_whitespace = label in ("TAB", "LF", "CR") or label.endswith("SP")
+        band = "hidden-whitespace" if is_whitespace else "hidden-special"
+        return (
+            f'<span class="hidden-char {band}" '
+            f'title="U+{ord(char):04X} {label}">{glyph}</span>'
+        )
+
+    return "".join(map(render, s))
+
+
+def hidden_char_css() -> str:
+    """Return the ``<style>`` block that styles the hidden-character badges.
+
+    The stylesheet has one shared ``.hidden-char`` rule plus one colour
+    rule per band (whitespace, special, control). It is returned as a plain
+    string so callers can inject it into Streamlit via
+    ``st.markdown(hidden_char_css(), unsafe_allow_html=True)``.
+    """
+    base_rule = (
+        ".hidden-char {\n"
+        "    display: inline-block;\n"
+        "    padding: 0 2px;\n"
+        "    margin: 0 1px;\n"
+        "    border-radius: 3px;\n"
+        "    font-family: ui-monospace, SFMono-Regular, monospace;\n"
+        "    font-size: 0.85em;\n"
+        "    cursor: help;\n"
+        "}\n"
+    )
+    # (band, background, text colour, border colour)
+    palette = (
+        ("whitespace", "#fff3cd", "#856404", "#ffeaa7"),
+        ("special", "#d1ecf1", "#0c5460", "#bee5eb"),
+        ("control", "#f8d7da", "#721c24", "#f5c6cb"),
+    )
+    band_rules = "".join(
+        f".hidden-char.hidden-{band} {{\n"
+        f"    background: {background};\n"
+        f"    color: {foreground};\n"
+        f"    border: 1px solid {border};\n"
+        "}\n"
+        for band, background, foreground, border in palette
+    )
+    return f"\n<style>\n{base_rule}{band_rules}</style>\n"
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -19,6 +19,8 @@ from src.core.text_clean import (
    PRESETS,
    CleanOptions,
    clean_dataframe,
+    hidden_char_css,
+    visualize_hidden_html,
 )

 hide_streamlit_chrome()
@@ -205,9 +207,54 @@ if result.cells_changed:
    )

    st.markdown("**Examples (first 25 changes)**")
+    # Toggle between the badge-annotated HTML table (default) and the raw
+    # dataframe view of the same example rows.
+    show_hidden = st.toggle(
+        "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)",
+        value=True,
+        help=(
+            "Highlights characters the cleaner is removing or replacing. "
+            "Hover any badge to see the codepoint and label."
+        ),
+        key="textclean_show_hidden",
+    )
    examples = result.changes.head(25).copy()
    examples["row"] = examples["row"] + 1
-    st.dataframe(examples, use_container_width=True, hide_index=True)
+    if show_hidden:
+        # Inject the badge CSS, then render an HTML table so the
+        # invisibles in old/new are actually visible to the user.
+        st.markdown(hidden_char_css(), unsafe_allow_html=True)
+        rows_html = []
+        for _, row in examples.iterrows():
+            rows_html.append(
+                "<tr>"
+                f"<td>{row['row']}</td>"
+                f"<td><code>{visualize_hidden_html(str(row['column']))}</code></td>"
+                f"<td>{visualize_hidden_html(str(row['old']))}</td>"
+                f"<td>{visualize_hidden_html(str(row['new']))}</td>"
+                f"<td><code>{row['ops_applied']}</code></td>"
+                "</tr>"
+            )
+        # ``white-space: pre-wrap`` on the cells is required for a faithful
+        # preview: with the default HTML whitespace handling, runs of
+        # ordinary spaces collapse to one and leading/trailing spaces
+        # vanish, so a trim or whitespace-collapse change would render as
+        # identical Before/After cells.
+        st.markdown(
+            "<table class='hidden-char-table'>"
+            "<thead><tr>"
+            "<th style='text-align:left'>Row</th>"
+            "<th style='text-align:left'>Column</th>"
+            "<th style='text-align:left'>Before</th>"
+            "<th style='text-align:left'>After</th>"
+            "<th style='text-align:left'>Ops applied</th>"
+            "</tr></thead>"
+            f"<tbody>{''.join(rows_html)}</tbody>"
+            "</table>"
+            "<style>"
+            ".hidden-char-table { width: 100%; border-collapse: collapse; }"
+            ".hidden-char-table th, .hidden-char-table td { "
+            "  padding: 4px 8px; border-bottom: 1px solid #eee; "
+            "  vertical-align: top; white-space: pre-wrap; }"
+            ".hidden-char-table tbody tr:hover { background: #fafafa; }"
+            "</style>",
+            unsafe_allow_html=True,
+        )
+    else:
+        st.dataframe(examples, use_container_width=True, hide_index=True)

 st.markdown("**Cleaned preview (first 10 rows)**")
 st.dataframe(result.cleaned_df.head(10), use_container_width=True)
--- a/tests/test_text_clean.py
+++ b/tests/test_text_clean.py
@@ -480,3 +480,62 @@ class TestReporting:
        df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
        result = clean_dataframe(df)
        assert result.cells_total == 3  # only "a" is processed
+
+
+class TestVisualizeHidden:
+    """``visualize_hidden_*`` makes invisible characters visible to the user.
+
+    The text-rendering tests compare the complete output rather than
+    searching for a label, so a wrong glyph or a dropped neighbouring
+    character fails the test too. Invisible and look-alike input
+    characters are written as escapes so the test source stays reviewable.
+    """
+
+    def test_text_passes_ascii_through(self):
+        from src.core.text_clean import visualize_hidden_text
+        assert visualize_hidden_text("hello") == "hello"
+
+    def test_text_labels_nbsp(self):
+        from src.core.text_clean import visualize_hidden_text
+        assert visualize_hidden_text("Hi\u00a0there") == "Hi·[NBSP]there"
+
+    def test_text_labels_zwsp(self):
+        from src.core.text_clean import visualize_hidden_text
+        assert visualize_hidden_text("a\u200bb") == "a∅[ZWSP]b"
+
+    def test_text_labels_tab_and_newline(self):
+        from src.core.text_clean import visualize_hidden_text
+        assert visualize_hidden_text("a\tb\nc") == "a→[TAB]b↵[LF]c"
+
+    def test_text_labels_smart_quotes(self):
+        from src.core.text_clean import visualize_hidden_text
+        out = visualize_hidden_text("\u201chi\u201d")
+        assert out == '"[L DQUOTE]hi"[R DQUOTE]'
+
+    def test_text_labels_unmapped_control_with_codepoint(self):
+        from src.core.text_clean import visualize_hidden_text
+        assert visualize_hidden_text("a\x07b") == "a[U+0007]b"  # BEL
+        assert visualize_hidden_text("a\x7fb") == "a[U+007F]b"  # DEL
+
+    def test_html_wraps_invisibles_in_span(self):
+        from src.core.text_clean import visualize_hidden_html
+        out = visualize_hidden_html("Hi\u00a0\u200bthere")
+        assert '<span class="hidden-char' in out
+        assert 'title="U+00A0 NBSP"' in out
+        assert 'title="U+200B ZWSP"' in out
+        # The surrounding visible text must survive untouched.
+        assert out.startswith("Hi<span") and out.endswith("</span>there")
+
+    def test_html_flags_unmapped_control(self):
+        from src.core.text_clean import visualize_hidden_html
+        out = visualize_hidden_html("a\x07b")  # BEL
+        assert 'class="hidden-char hidden-control"' in out
+        assert 'title="U+0007 CTRL"' in out
+
+    def test_html_escapes_dangerous_chars(self):
+        from src.core.text_clean import visualize_hidden_html
+        assert visualize_hidden_html("<a&b>") == "&lt;a&amp;b&gt;"
+
+    def test_html_passes_normal_text_through(self):
+        from src.core.text_clean import visualize_hidden_html
+        assert visualize_hidden_html("plain") == "plain"
+
+    def test_css_returns_a_style_block(self):
+        from src.core.text_clean import hidden_char_css
+        css = hidden_char_css()
+        assert "<style>" in css and "hidden-char" in css
+
+    def test_non_string_passthrough(self):
+        from src.core.text_clean import visualize_hidden_text, visualize_hidden_html
+        assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
+        assert visualize_hidden_html(None) == ""