From e9c490ae1b496054c0f4481679270bb04f9081b5 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 29 Apr 2026 16:26:30 +0000 Subject: [PATCH] feat(gui): hidden-char-aware preview tables in Text Cleaner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Text Cleaner had two st.dataframe previews — the initial upload preview ("Preview: filename") and the post-clean "Cleaned preview" table — that both rendered cells with the same browser-collapses-whitespace, hides-invisibles problem the analyzer findings panel had before commit 1049c03. components.render_hidden_aware_preview(df, n_rows, caption) renders a DataFrame as an HTML table where: - every cell uses visualize_hidden_html(mark_outer_whitespace=True), so leading/trailing ASCII spaces appear as per-character "·" badges - white-space: pre-wrap on every cell preserves internal multi-space runs and embedded newlines visually - headers route through the same visualizer so dirty column names (NBSP padding, ZWSP, smart quotes) show their badges too - NaN cells render as a faint "NaN" placeholder - rows are sticky-headed and scrollable inside a 26rem capped container so a 10-row preview doesn't push the rest of the UI off screen 2_Text_Cleaner.py wires it into both previews: - The upload preview gains its own "Show hidden characters in preview" toggle (default on). - The cleaned preview reuses the existing show_hidden toggle that already governs the Examples changes table, so one switch controls the whole results section. Either toggle off falls back to the original st.dataframe view. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/gui/components.py | 105 ++++++++++++++++++++++++++++++++ src/gui/pages/2_Text_Cleaner.py | 24 +++++++- 2 files changed, 126 insertions(+), 3 deletions(-) diff --git a/src/gui/components.py b/src/gui/components.py index 25641d1..59c47a3 100644 --- a/src/gui/components.py +++ b/src/gui/components.py @@ -797,6 +797,111 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None: _render_one_finding(f) +_PREVIEW_TABLE_CSS = """ + +""" + + +def render_hidden_aware_preview( + df, + *, + n_rows: int = 10, + caption: str | None = None, +) -> None: + """Render a DataFrame preview that shows hidden characters in every cell. + + Used for the Text Cleaner's "before" and "after" previews so the user + can actually see the leading/trailing whitespace, NBSP padding, + zero-width characters, and smart punctuation that the cleaner is going + to remove (or just removed). A plain ``st.dataframe`` collapses outer + ASCII whitespace and renders invisibles as nothing, defeating the + point of a preview in a cleanup tool. + + Headers and cell values are both routed through + :func:`visualize_hidden_html` with ``mark_outer_whitespace=True``. 
+ """ + import pandas as pd + from src.core.text_clean import hidden_char_css, visualize_hidden_html + + if df is None or len(df) == 0: + st.info("No rows to preview.") + return + + sliced = df.head(n_rows) if len(df) > n_rows else df + + st.markdown(hidden_char_css() + _PREVIEW_TABLE_CSS, unsafe_allow_html=True) + if caption: + st.caption(caption) + + header_cells = "".join( + f"{visualize_hidden_html(str(c), mark_outer_whitespace=True)}" + for c in sliced.columns + ) + + body_rows: list[str] = [] + for row_idx, (orig_idx, row) in enumerate(sliced.iterrows(), start=1): + cells = ["" + str(row_idx) + ""] + for col in sliced.columns: + value = row[col] + if isinstance(value, str): + rendered = visualize_hidden_html(value, mark_outer_whitespace=True) + elif pd.isna(value): + rendered = "NaN" + else: + # Non-string scalars (numerics, bools) just stringify; they + # won't have invisible chars but we still need html-escape. + rendered = visualize_hidden_html(str(value)) + cells.append(f"{rendered}") + body_rows.append("" + "".join(cells) + "") + + st.markdown( + "
" + "" + f"{header_cells}" + f"{''.join(body_rows)}" + "
#
" + "
", + unsafe_allow_html=True, + ) + + _SAMPLE_TABLE_CSS = """