feat(gui): hidden-char-aware preview tables in Text Cleaner

The Text Cleaner had two st.dataframe previews — the initial upload
preview ("Preview: filename") and the post-clean "Cleaned preview"
table — that both rendered cells with the same browser-collapses-
whitespace, hides-invisibles problem the analyzer findings panel had
before commit 1049c03.

components.render_hidden_aware_preview(df, n_rows, caption) renders a
DataFrame as an HTML table where:
  - every string cell is rendered with
    visualize_hidden_html(mark_outer_whitespace=True), so leading/trailing
    ASCII spaces appear as per-character "·" badges (non-string scalars
    are stringified and passed through the same visualizer)
  - white-space: pre-wrap on every cell preserves internal multi-space
    runs and embedded newlines visually
  - headers route through the same visualizer so dirty column names
    (NBSP padding, ZWSP, smart quotes) show their badges too
  - NaN cells render as a faint "NaN" placeholder
  - the header row is sticky and the table scrolls inside a container
    capped at 26rem, so a 10-row preview doesn't push the rest of the UI
    off screen

2_Text_Cleaner.py wires it into both previews:
  - The upload preview gains its own "Show hidden characters in preview"
    toggle (default on).
  - The cleaned preview reuses the existing show_hidden toggle that
    already governs the Examples changes table, so one switch controls
    the whole results section.

Switching either toggle off falls back to the original st.dataframe view
for that preview.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 16:26:30 +00:00
parent 1049c033cb
commit e9c490ae1b
2 changed files with 126 additions and 3 deletions

View File

@@ -797,6 +797,111 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
_render_one_finding(f)
# Stylesheet for the HTML table emitted by render_hidden_aware_preview().
# It targets two classes: ``.hidden-aware-preview`` (the <table> itself:
# monospace cells with ``white-space: pre-wrap``, a sticky header row,
# zebra-striped body rows and a muted ``.row-num`` column) and
# ``.hidden-aware-preview-wrap`` (the scrollable container, capped at
# 26rem high). The whole value is injected verbatim via st.markdown, so the
# CSS comment inside the string is part of the runtime payload.
_PREVIEW_TABLE_CSS = """
<style>
.hidden-aware-preview {
width: 100%;
border-collapse: collapse;
font-size: 0.9em;
}
.hidden-aware-preview th,
.hidden-aware-preview td {
padding: 4px 8px;
border: 1px solid #eee;
text-align: left;
vertical-align: top;
font-family: ui-monospace, SFMono-Regular, monospace;
/* pre-wrap so internal ASCII whitespace and embedded newlines render
as the user wrote them; otherwise browsers collapse adjacent spaces. */
white-space: pre-wrap;
word-break: break-word;
max-width: 32em;
}
.hidden-aware-preview thead th {
background: #f6f8fa;
position: sticky;
top: 0;
}
.hidden-aware-preview tbody tr:nth-child(even) { background: #fafafa; }
.hidden-aware-preview .row-num {
color: #888;
font-family: inherit;
background: #f6f8fa;
text-align: right;
}
.hidden-aware-preview-wrap {
max-height: 26rem;
overflow: auto;
border: 1px solid #eee;
border-radius: 4px;
}
</style>
"""
def render_hidden_aware_preview(
    df,
    *,
    n_rows: int = 10,
    caption: str | None = None,
) -> None:
    """Render a DataFrame preview that shows hidden characters in every cell.

    Used for the Text Cleaner's "before" and "after" previews so the user
    can actually see the leading/trailing whitespace, NBSP padding,
    zero-width characters, and smart punctuation that the cleaner is going
    to remove (or just removed). A plain ``st.dataframe`` collapses outer
    ASCII whitespace and renders invisibles as nothing, defeating the
    point of a preview in a cleanup tool.

    Column headers and string cells are routed through
    :func:`visualize_hidden_html` with ``mark_outer_whitespace=True``.
    Missing scalar values render as a faint "NaN" placeholder; every other
    value is stringified and passed through the visualizer.

    Args:
        df: DataFrame to preview. ``None`` or an empty frame shows an
            informational message instead of a table.
        n_rows: Number of leading rows to render (passed to ``df.head``).
        caption: Optional caption shown above the table.
    """
    import pandas as pd

    from src.core.text_clean import hidden_char_css, visualize_hidden_html

    if df is None or len(df) == 0:
        st.info("No rows to preview.")
        return

    # head() already returns the whole frame when it is shorter than n_rows.
    sliced = df.head(n_rows)

    st.markdown(hidden_char_css() + _PREVIEW_TABLE_CSS, unsafe_allow_html=True)
    if caption:
        st.caption(caption)

    header_cells = "".join(
        f"<th>{visualize_hidden_html(str(c), mark_outer_whitespace=True)}</th>"
        for c in sliced.columns
    )

    body_rows: list[str] = []
    # itertuples() walks the row positionally. Unlike iterrows() it does not
    # coerce the row to one common dtype (which would display ints as "3.0"
    # in mixed numeric frames), and it never yields a Series for a cell when
    # column labels are duplicated.
    rows = sliced.itertuples(index=False, name=None)
    for row_num, values in enumerate(rows, start=1):
        cells = [f"<td class='row-num'>{row_num}</td>"]
        for value in values:
            if isinstance(value, str):
                rendered = visualize_hidden_html(value, mark_outer_whitespace=True)
            elif pd.api.types.is_scalar(value) and pd.isna(value):
                # is_scalar guard: pd.isna() on a list/array cell returns an
                # array, whose truth value is ambiguous and would raise.
                rendered = "<span style='color:#aaa'>NaN</span>"
            else:
                # Non-string values (numerics, bools, containers) just
                # stringify; they won't have invisible chars but we still
                # need html-escape.
                rendered = visualize_hidden_html(str(value))
            cells.append(f"<td>{rendered}</td>")
        body_rows.append("<tr>" + "".join(cells) + "</tr>")

    st.markdown(
        "<div class='hidden-aware-preview-wrap'>"
        "<table class='hidden-aware-preview'>"
        f"<thead><tr><th class='row-num'>#</th>{header_cells}</tr></thead>"
        f"<tbody>{''.join(body_rows)}</tbody>"
        "</table>"
        "</div>",
        unsafe_allow_html=True,
    )
_SAMPLE_TABLE_CSS = """
<style>
.findings-sample-table {

View File

@@ -14,7 +14,11 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome, pickup_or_upload
from src.gui.components import (
hide_streamlit_chrome,
pickup_or_upload,
render_hidden_aware_preview,
)
from src.core.text_clean import (
PRESETS,
CleanOptions,
@@ -81,7 +85,16 @@ except Exception as e:
# --- Upload preview -------------------------------------------------------
# Show the first rows of the uploaded file. The preview is rendered exactly
# once: either through the hidden-character-aware HTML table or, when the
# toggle is off, through the plain st.dataframe view. (A leftover
# unconditional st.dataframe call used to draw the plain table regardless of
# the toggle, duplicating the preview.)
st.subheader(f"Preview: {uploaded.name}")
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
preview_show_hidden = st.toggle(
    "Show hidden characters in preview",
    value=True,
    help="Highlights NBSP, zero-width chars, smart quotes, and leading/trailing whitespace.",
    key="textclean_preview_show_hidden",
)
if preview_show_hidden:
    render_hidden_aware_preview(df, n_rows=10)
else:
    st.dataframe(df.head(10), use_container_width=True)
st.divider()
@@ -257,7 +270,12 @@ if result.cells_changed:
st.dataframe(examples, use_container_width=True, hide_index=True)
st.markdown("**Cleaned preview (first 10 rows)**")
# Reuse the same toggle the Examples table uses so the user controls both
# the changes audit and the cleaned preview with one switch. The preview is
# rendered exactly once; a leftover unconditional st.dataframe call used to
# draw the plain table in addition to the branch below.
if show_hidden:
    render_hidden_aware_preview(result.cleaned_df, n_rows=10)
else:
    st.dataframe(result.cleaned_df.head(10), use_container_width=True)
# ---------------------------------------------------------------------------
# Downloads