diff --git a/src/core/__init__.py b/src/core/__init__.py index 23e3302..1233447 100644 --- a/src/core/__init__.py +++ b/src/core/__init__.py @@ -78,6 +78,7 @@ from .text_clean import ( clean_value, collapse_whitespace, fold_smart_chars, + hidden_char_css, normalize_line_endings, sentence_case, smart_title_case, @@ -87,6 +88,8 @@ from .text_clean import ( to_nfc, to_nfkc, trim, + visualize_hidden_html, + visualize_hidden_text, ) __all__ = [ @@ -146,4 +149,7 @@ __all__ = [ "smart_title_case", "sentence_case", "apply_case", + "visualize_hidden_text", + "visualize_hidden_html", + "hidden_char_css", ] diff --git a/src/core/text_clean.py b/src/core/text_clean.py index 432d40b..4eb2fc4 100644 --- a/src/core/text_clean.py +++ b/src/core/text_clean.py @@ -587,3 +587,174 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> cells_total=cells_total, columns_processed=columns, ) + + +# --------------------------------------------------------------------------- +# Hidden-character visualization +# --------------------------------------------------------------------------- +# +# The whole point of the cleaner is to remove characters the user can't see. +# That makes the GUI's "before / after" view nearly useless by default — a +# cell with NBSP padding looks identical to a cell with regular spaces. The +# visualizers below render those characters with visible glyphs (and +# tooltips, in the HTML variant) so the user can actually verify what the +# cleaner is doing. +# +# Mapping rule: every character that's normally invisible OR that the +# cleaner treats specially gets a glyph + label. ASCII printable characters +# pass through untouched. Tabs and newlines are visualized because they're +# whitespace the user often wants to see. + +# Unicode codepoint -> (visible glyph, short label). The label appears as a +# tooltip in the HTML variant and as ``[label]`` in the text variant. +_VISIBLE_CHAR_MAP: dict[str, tuple[str, str]] = { + # Whitespace + "\t": ("→", "TAB"), + "\n": ("↵", "LF"), + "\r": ("␍", "CR"), + " ": ("·", "NBSP"), + " ": ("·", "NNBSP"), + " ": ("·", "THIN SP"), + " ": ("·", "HAIR SP"), + " ": ("·", "EN SP"), + " ": ("·", "EM SP"), + " ": ("·", "IDEO SP"), + # Zero-width / invisible + "​": ("∅", "ZWSP"), + "‌": ("∅", "ZWNJ"), + "‍": ("∅", "ZWJ"), + "⁠": ("∅", "WJ"), + "‎": ("⮕", "LRM"), + "‏": ("⬅", "RLM"), + "": ("∅", "BOM"), + "­": ("-", "SHY"), + # Smart quotes / dashes / ellipsis (visible but the user often wants + # them flagged because they're being folded) + "‘": ("'", "L QUOTE"), + "’": ("'", "R QUOTE"), + "“": ('"', "L DQUOTE"), + "”": ('"', "R DQUOTE"), + "—": ("—", "EM DASH"), + "–": ("–", "EN DASH"), + "…": ("…", "ELLIPSIS"), + "′": ("′", "PRIME"), + "″": ("″", "DPRIME"), + "«": ("«", "L GUILL"), + "»": ("»", "R GUILL"), +} + + +def _is_control(ch: str) -> bool: + """C0 control char (other than \\t \\n \\r) or DEL.""" + cp = ord(ch) + if cp == 0x7F: + return True + if cp < 0x20 and ch not in ("\t", "\n", "\r"): + return True + return False + + +def visualize_hidden_text(s: str) -> str: + """Return a plain-text rendering of *s* with hidden characters labelled. + + Each invisible/control/smart character is replaced by its glyph from + ``_VISIBLE_CHAR_MAP`` followed by ``[LABEL]``; ASCII printable + characters pass through untouched. C0 control characters not in the + map are rendered as ``[U+XXXX]``. + + Suitable for terminal output, CSV exports, and any context where HTML + is not appropriate. + """ + if not isinstance(s, str): + return s + out: list[str] = [] + for ch in s: + mapped = _VISIBLE_CHAR_MAP.get(ch) + if mapped is not None: + glyph, label = mapped + out.append(f"{glyph}[{label}]") + elif _is_control(ch): + out.append(f"[U+{ord(ch):04X}]") + else: + out.append(ch) + return "".join(out) + + +def visualize_hidden_html(s: str) -> str: + """Return an HTML rendering of *s* with hidden characters highlighted. + + Each invisible/control/smart character is wrapped in a ```` with + a CSS class and a ``title`` attribute showing the codepoint and label, + so the user gets a tooltip on hover. ASCII printable text is HTML- + escaped but otherwise left as-is. + + Pair with :func:`hidden_char_css` to inject the matching styles into + the page. + """ + if not isinstance(s, str): + return "" + parts: list[str] = [] + for ch in s: + mapped = _VISIBLE_CHAR_MAP.get(ch) + if mapped is not None: + glyph, label = mapped + cp = f"U+{ord(ch):04X}" + css = "hidden-whitespace" if label.endswith("SP") or label in ("TAB", "LF", "CR", "NBSP") else "hidden-special" + parts.append( + f'{glyph}' + ) + elif _is_control(ch): + cp = f"U+{ord(ch):04X}" + parts.append( + f'' + ) + else: + # HTML-escape only the dangerous characters; preserve everything + # else so the visible content reads naturally. + if ch == "&": + parts.append("&") + elif ch == "<": + parts.append("<") + elif ch == ">": + parts.append(">") + else: + parts.append(ch) + return "".join(parts) + + +def hidden_char_css() -> str: + """CSS used by :func:`visualize_hidden_html` rendering. + + Returned as a plain string so callers can inject it into Streamlit via + ``st.markdown(hidden_char_css(), unsafe_allow_html=True)``. + """ + return """ + +""" diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index ab0b8fb..eb1d341 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -19,6 +19,8 @@ from src.core.text_clean import ( PRESETS, CleanOptions, clean_dataframe, + hidden_char_css, + visualize_hidden_html, ) hide_streamlit_chrome() @@ -205,9 +207,54 @@ if result.cells_changed: ) st.markdown("**Examples (first 25 changes)**") + show_hidden = st.toggle( + "Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)", + value=True, + help=( + "Highlights characters the cleaner is removing or replacing. " + "Hover any badge to see the codepoint and label." + ), + key="textclean_show_hidden", + ) examples = result.changes.head(25).copy() examples["row"] = examples["row"] + 1 - st.dataframe(examples, use_container_width=True, hide_index=True) + if show_hidden: + # Inject the badge CSS once, then render an HTML table so the + # invisibles in old/new are actually visible to the user. + st.markdown(hidden_char_css(), unsafe_allow_html=True) + rows_html = [] + for _, row in examples.iterrows(): + rows_html.append( + "" + f"{row['row']}" + f"{visualize_hidden_html(str(row['column']))}" + f"{visualize_hidden_html(str(row['old']))}" + f"{visualize_hidden_html(str(row['new']))}" + f"{row['ops_applied']}" + "" + ) + st.markdown( + "" + "" + "" + "" + "" + "" + "" + "" + f"{''.join(rows_html)}" + "
RowColumnBeforeAfterOps applied
" + "", + unsafe_allow_html=True, + ) + else: + st.dataframe(examples, use_container_width=True, hide_index=True) st.markdown("**Cleaned preview (first 10 rows)**") st.dataframe(result.cleaned_df.head(10), use_container_width=True) diff --git a/tests/test_text_clean.py b/tests/test_text_clean.py index 0ae2fff..4c88cba 100644 --- a/tests/test_text_clean.py +++ b/tests/test_text_clean.py @@ -480,3 +480,62 @@ class TestReporting: df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]}) result = clean_dataframe(df) assert result.cells_total == 3 # only "a" is processed + + +class TestVisualizeHidden: + """``visualize_hidden_*`` makes invisible characters visible to the user.""" + + def test_text_passes_ascii_through(self): + from src.core.text_clean import visualize_hidden_text + assert visualize_hidden_text("hello") == "hello" + + def test_text_labels_nbsp(self): + from src.core.text_clean import visualize_hidden_text + out = visualize_hidden_text("Hi\u00a0there") + assert "[NBSP]" in out + + def test_text_labels_zwsp(self): + from src.core.text_clean import visualize_hidden_text + out = visualize_hidden_text("a\u200bb") + assert "[ZWSP]" in out + + def test_text_labels_tab_and_newline(self): + from src.core.text_clean import visualize_hidden_text + out = visualize_hidden_text("a\tb\nc") + assert "[TAB]" in out + assert "[LF]" in out + + def test_text_labels_smart_quotes(self): + from src.core.text_clean import visualize_hidden_text + out = visualize_hidden_text("“hi”") + assert "[L DQUOTE]" in out and "[R DQUOTE]" in out + + def test_text_labels_unmapped_control_with_codepoint(self): + from src.core.text_clean import visualize_hidden_text + out = visualize_hidden_text("a\x07b") # BEL + assert "[U+0007]" in out + + def test_html_wraps_invisibles_in_span(self): + from src.core.text_clean import visualize_hidden_html + out = visualize_hidden_html("Hi\u00a0\u200bthere") + assert '") + assert "<" in out and "&" in out and ">" in out + + def test_html_passes_normal_text_through(self): + from src.core.text_clean import visualize_hidden_html + assert visualize_hidden_html("plain") == "plain" + + def test_css_returns_a_style_block(self): + from src.core.text_clean import hidden_char_css + css = hidden_char_css() + assert "