feat(i18n): add language-pack scaffold with English and Spanish

Introduces ``src/i18n`` with a tiny JSON-backed t() lookup, an in-session
language preference, and a sidebar selector wired through
``hide_streamlit_chrome`` so every page picks up the same picker. Covers
home, tool cards, findings panel, gate, shutdown, and pickup banner
strings. Tests pin pack parity and the farewell-overlay JS escape so
future packs can't silently regress.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-13 15:11:30 +00:00
parent 4706ed571e
commit c4ce86bd64
8 changed files with 649 additions and 75 deletions

View File

@@ -26,13 +26,16 @@ from src.gui.components import (
hide_streamlit_chrome,
upload_and_analyze_section,
)
from src.i18n import t
st.set_page_config(
page_title="DataTools — Data Cleaning Mastery",
page_title=t("home.page_title"),
page_icon="🧹",
layout="wide",
)
# ``hide_streamlit_chrome`` also renders the sidebar language selector,
# so every page that hides chrome picks up the same picker.
hide_streamlit_chrome()
@@ -40,8 +43,8 @@ hide_streamlit_chrome()
# Home page
# ---------------------------------------------------------------------------
st.title("🧹 DataTools — Data Cleaning Mastery")
st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.")
st.title(t("home.title"))
st.caption(t("home.caption"))
st.divider()
@@ -57,7 +60,7 @@ st.divider()
# Tool cards
# ---------------------------------------------------------------------------
from src.gui.tools_registry import TOOLS
from src.gui.tools_registry import TOOLS, tool_description, tool_name
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
# coloured "N findings" badge so the user can see at a glance which tools
@@ -70,15 +73,17 @@ for row_start in range(0, len(TOOLS), 3):
break
tool = TOOLS[idx]
with col:
status_key = "status.ready" if tool.status == "Ready" else "status.coming_soon"
status_color = "green" if tool.status == "Ready" else "orange"
badge = ""
n = findings_count_for_tool(tool.tool_id)
if n:
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
badge_key = "home.findings_badge_one" if n == 1 else "home.findings_badge_other"
badge = f" :red-background[**{t(badge_key, n=n)}**]"
st.markdown(
f"### {tool.icon} {tool.name}{badge}\n\n"
f"{tool.description}\n\n"
f":{status_color}[**{tool.status}**]"
f"### {tool.icon} {tool_name(tool.tool_id)}{badge}\n\n"
f"{tool_description(tool.tool_id)}\n\n"
f":{status_color}[**{t(status_key)}**]"
)
@@ -87,7 +92,4 @@ for row_start in range(0, len(TOOLS), 3):
# ---------------------------------------------------------------------------
st.divider()
st.caption(
"Runs locally. Your data never leaves this computer. "
"| DataTools v3.0"
)
st.caption(t("chrome.footer"))

View File

@@ -11,6 +11,7 @@ from typing import Optional
import pandas as pd
import streamlit as st
from src.i18n import t as _t
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
@@ -72,15 +73,26 @@ footer {
def hide_streamlit_chrome() -> None:
"""Inject CSS to hide Streamlit's default header, menu, and footer."""
"""Inject CSS to hide Streamlit's default header, menu, and footer.
Also renders the sidebar language selector, since every entrypoint
that hides the default chrome wants the picker visible in the
same place. Pages that want a clean chrome without the selector can
inject ``_HIDE_CHROME_CSS`` themselves instead of calling this.
"""
st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
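# Imported inside the function rather than at module top.
# NOTE(review): ``src.i18n`` is already imported at module level
# (``t as _t``), so this lazy import does not make the module importable
# without the i18n package — confirm whether it is still needed.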
from src.i18n import render_language_selector
render_language_selector()
# ---------------------------------------------------------------------------
# Clean shutdown
# ---------------------------------------------------------------------------
_FAREWELL_SCRIPT = """
_FAREWELL_SCRIPT_TEMPLATE = """
<script>
(function () {
// Strategy: append a full-screen overlay directly to the parent's
@@ -104,8 +116,8 @@ _FAREWELL_SCRIPT = """
'<div style="text-align:center;padding:32px 40px;border:1px solid #252a36;' +
'border-radius:12px;background:#161922;max-width:480px;">' +
'<h1 style="margin:0 0 8px 0;font-weight:600;letter-spacing:-0.01em;">' +
'DataTools has shut down</h1>' +
'<p style="opacity:0.7;margin:0;">You can close this window.</p>' +
'__TITLE__</h1>' +
'<p style="opacity:0.7;margin:0;">__SUBTITLE__</p>' +
'</div>';
return overlay;
}
@@ -127,7 +139,32 @@ _FAREWELL_SCRIPT = """
"""
def quit_button(label: str = "Quit app", *, key: str = "quit_app_button") -> None:
def _js_html_safe(s: str) -> str:
"""Escape *s* so it can be embedded inside the farewell overlay's
JS-single-quoted, innerHTML-bound payload.
Order matters: backslash first (so subsequent escapes don't get
re-escaped), then the JS string-terminator, then HTML-special chars.
"""
return (
s.replace("\\", "\\\\")
.replace("'", "\\'")
.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
)
def _farewell_script() -> str:
    """Build the farewell overlay script for the active language.

    Looks up ``quit.farewell_title`` and ``quit.farewell_subtitle``,
    escapes each with ``_js_html_safe`` and substitutes them for the
    ``__TITLE__`` and ``__SUBTITLE__`` placeholders of the template.
    """
    title = _js_html_safe(_t("quit.farewell_title"))
    subtitle = _js_html_safe(_t("quit.farewell_subtitle"))
    with_title = _FAREWELL_SCRIPT_TEMPLATE.replace("__TITLE__", title)
    return with_title.replace("__SUBTITLE__", subtitle)
def quit_button(label: str | None = None, *, key: str = "quit_app_button") -> None:
"""Render a Quit button that terminates the Streamlit server.
Streamlit has no first-class shutdown hook, and signalling the
@@ -140,10 +177,13 @@ def quit_button(label: str = "Quit app", *, key: str = "quit_app_button") -> Non
a self-contained "App closed" page so the user never sees
Streamlit's red connection-error overlay.
"""
if label is None:
label = _t("quit.button")
if st.session_state.get("_app_shutting_down"):
from streamlit.components.v1 import html as _components_html
_components_html(_FAREWELL_SCRIPT, height=0)
st.success("Shutting down… you can close this window.")
_components_html(_farewell_script(), height=0)
st.success(_t("quit.shutting_down"))
st.stop()
if st.button(label, key=key, type="secondary"):
@@ -824,15 +864,25 @@ _TOOL_PAGE_PATHS: dict[str, str] = {
def tool_display_name(tool_id: str) -> str:
"""Map a stable tool id to its GUI display name; falls back to the id."""
return TOOL_DISPLAY_NAMES.get(tool_id, tool_id) if tool_id else "Informational"
"""Map a stable tool id to its GUI display name; falls back to the id.
Routes through the active language pack so the home grid, findings
panel headers, and "Open tool" buttons all stay in sync with the
sidebar's language selection.
"""
if not tool_id:
return _t("findings.untargeted_label")
translated = _t(f"tools.{tool_id}.name")
if translated != f"tools.{tool_id}.name":
return translated
return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)
def _tool_page_slug(tool_id: str) -> str:
return _TOOL_PAGE_PATHS.get(tool_id, "")
def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
def render_findings_panel(findings, *, header: str | None = None) -> None:
"""Render a list of :class:`Finding` objects grouped by tool.
Each tool gets a header with the count, an open-tool button, and a list
@@ -842,8 +892,11 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
from src.core.analyze import findings_by_tool # local import to avoid cycle
from src.core.text_clean import hidden_char_css
if header is None:
header = _t("findings.header")
if not findings:
st.success("No issues detected. Open any tool below to start working.")
st.success(_t("findings.none"))
return
# Inject the hidden-char badge styles once so every sample value below
@@ -854,7 +907,10 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
for f in findings:
by_sev[f.severity] = by_sev.get(f.severity, 0) + 1
sev_summary = " · ".join(
f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
_t(
"findings.severity_summary_segment",
icon=_SEVERITY_ICON[s], n=by_sev[s], severity=s,
)
for s in ("error", "warn", "info") if by_sev.get(s)
)
st.markdown(f"### {header}")
@@ -865,8 +921,9 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
for tool_id in sorted(grouped):
items = grouped[tool_id]
name = tool_display_name(tool_id)
with st.expander(
f"{tool_display_name(tool_id)}{len(items)} finding(s)",
_t("findings.tool_section_label", tool=name, n=len(items)),
expanded=any(f.severity == "error" for f in items),
):
for f in items:
@@ -876,11 +933,11 @@ def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
# Streamlit resolves page paths relative to the entrypoint
# (src/gui/app.py), so a leading ``src/gui/`` would point
# outside the allowed page tree on Windows.
st.page_link(page_slug, label=f"Open {tool_display_name(tool_id)}")
st.page_link(page_slug, label=_t("findings.open_tool", tool=name))
if untargeted:
with st.expander(
f"Other / file-level — {len(untargeted)} finding(s)",
_t("findings.other_section_label", n=len(untargeted)),
expanded=False,
):
for f in untargeted:
@@ -1066,28 +1123,15 @@ def upload_and_analyze_section() -> None:
own uploader. Each tool page already has its own uploader today, so
this is purely additive.
"""
st.markdown("### 📤 Upload a file to start")
st.caption(
"Optional: scan an uploaded file for data quality issues and see "
"which tools can fix each one. Skip if you already know what you need."
)
st.caption(
"**Up to 1 GB.** Formats: CSV, TSV, XLSX, XLS. "
"Delimiters auto-detected: comma, tab, semicolon, pipe. "
"Encodings auto-detected: UTF-8 (with/without BOM), UTF-16, "
"cp1252, Latin-1/9, cp1250, ISO-8859-2, cp1251, KOI8-R, "
"Mac Roman, Shift_JIS, GB18030, Big5, EUC-KR — and override on the Review page."
)
st.markdown(f"### {_t('upload.heading')}")
st.caption(_t("upload.intro"))
st.caption(_t("upload.limits"))
uploaded = st.file_uploader(
"Upload CSV or Excel",
_t("upload.uploader_label"),
type=["csv", "tsv", "xlsx", "xls"],
key="home_upload",
help=(
"Up to 1 GB. Comma / tab / semicolon / pipe delimiters all "
"auto-detected. Encoding auto-detected with override on the "
"Review page if needed."
),
help=_t("upload.uploader_help"),
)
if uploaded is None:
return
@@ -1106,16 +1150,16 @@ def upload_and_analyze_section() -> None:
col_run, col_skip, _ = st.columns([1, 1, 4])
with col_run:
run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
run_clicked = st.button(_t("upload.run_button"), type="primary", key="home_run_analysis")
with col_skip:
skip_clicked = st.button("Skip", key="home_skip_analysis")
skip_clicked = st.button(_t("upload.skip_button"), key="home_skip_analysis")
if skip_clicked:
st.session_state["home_findings"] = []
st.session_state["home_skipped"] = True
if run_clicked:
with st.spinner("Scanning"):
with st.spinner(_t("upload.scanning")):
findings = _run_analysis_on_upload(uploaded)
st.session_state["home_findings"] = findings
st.session_state["home_skipped"] = False
@@ -1125,7 +1169,7 @@ def upload_and_analyze_section() -> None:
return
if st.session_state.get("home_skipped"):
st.info("Analysis skipped. Open any tool below to start working.")
st.info(_t("upload.skipped_notice"))
return
st.divider()
@@ -1230,13 +1274,9 @@ def require_normalization_gate() -> None:
if matched:
return
name = st.session_state.get("home_uploaded_name", "the uploaded file")
st.warning(
f"**{name}** must pass the CSV-normalization gate before you can "
f"use this tool. Open the Review page to apply the fixes our "
f"analyzer recommends."
)
if st.button("Go to Review & Normalize", type="primary"):
name = st.session_state.get("home_uploaded_name") or _t("gate.default_name")
st.warning(_t("gate.warning", name=name))
if st.button(_t("gate.open_review"), type="primary"):
st.switch_page("pages/0_Review.py")
st.stop()
@@ -1269,27 +1309,22 @@ def pickup_or_upload(
use_session = has_session_upload and not st.session_state.get(override_key, False)
if use_session:
name = st.session_state.get("home_uploaded_name", "uploaded file")
st.info(f"Using **{name}** from the upload screen.")
if st.button("Use a different file", key=f"{key}__pick_diff"):
name = st.session_state.get("home_uploaded_name") or _t("gate.default_name")
st.info(_t("upload.using_session_file", name=name))
if st.button(_t("upload.use_different_file"), key=f"{key}__pick_diff"):
st.session_state[override_key] = True
st.rerun()
return _StashedUpload(name, st.session_state["home_uploaded_bytes"])
if {"csv", "tsv", "xlsx", "xls"} & set(types):
st.caption(
"Up to 1 GB. Delimiters auto-detected: comma, tab, semicolon, pipe. "
"Encoding auto-detected (UTF-8 / UTF-16 / cp1252 / Latin-1 family / "
"cp1250 / cp1251 / KOI8-R / Mac Roman / Shift_JIS / GB18030 / Big5 / "
"EUC-KR), with override on the Review page."
)
st.caption(_t("upload.pickup_caption"))
uploaded = st.file_uploader(label, type=types, key=key, help=help)
if uploaded is not None and st.session_state.get(override_key):
# User has uploaded their own file on this page; clear the override
# so the next visit to a tool page starts fresh.
pass
if uploaded is None and st.session_state.get(override_key) and has_session_upload:
if st.button("Switch back to upload-screen file", key=f"{key}__switch_back"):
if st.button(_t("upload.switch_back"), key=f"{key}__switch_back"):
st.session_state[override_key] = False
st.rerun()
return uploaded

View File

@@ -18,24 +18,21 @@ if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.gui.components import hide_streamlit_chrome, quit_button
from src.i18n import t
st.set_page_config(
page_title="DataTools — Close",
page_title=t("close_page.page_title"),
page_icon="🛑",
layout="wide",
)
hide_streamlit_chrome()
st.title("🛑 Close DataTools")
st.caption("Shut down the local app and free the terminal.")
st.title(t("close_page.title"))
st.caption(t("close_page.caption"))
st.divider()
st.markdown(
"Clicking the button below will terminate the DataTools server. "
"Any unsaved work in other tools will be lost. Once the app shuts "
"down you can close this window."
)
st.markdown(t("close_page.body"))
st.write("")
quit_button(label="Close the app", key="quit_app_button_page")
quit_button(label=t("close_page.button"), key="quit_app_button_page")

View File

@@ -150,3 +150,20 @@ def display_name(tool_id: str) -> str:
"""Return the human-readable name; fall back to the id when unknown."""
t = tool_by_id(tool_id)
return t.name if t else tool_id
def tool_name(tool_id: str) -> str:
    """Return the tool's name in the active language.

    The lookup key is ``tools.<tool_id>.name``. Getting that exact key
    string back from ``t()`` is treated as "no translation", in which case
    the registry's ``display_name`` is returned instead.
    """
    from src.i18n import t as _t

    key = f"tools.{tool_id}.name"
    registry_default = display_name(tool_id)
    localized = _t(key)
    if localized == key:
        return registry_default
    return localized
def tool_description(tool_id: str) -> str:
    """Return the tool's description in the active language.

    The lookup key is ``tools.<tool_id>.description``. Getting that exact
    key string back from ``t()`` is treated as "no translation"; the
    registry description is then used, or ``""`` for an unknown tool id.
    """
    from src.i18n import t as _t

    entry = tool_by_id(tool_id)
    registry_default = entry.description if entry else ""
    key = f"tools.{tool_id}.description"
    localized = _t(key)
    return registry_default if localized == key else localized

155
src/i18n/__init__.py Normal file
View File

@@ -0,0 +1,155 @@
"""Language packs for the DataTools GUI.
A language pack is a JSON file under ``src/i18n/packs/`` keyed by ISO 639-1
language code (``en.json``, ``es.json``, …). Keys are dotted paths
(``home.title``, ``tools.deduplicator.name``); values are the translated
strings. The English pack is canonical — missing keys in other packs fall
back to the English value, and missing keys in English fall back to the
key itself so a typo surfaces as a visible string instead of a crash.
Adding a language: drop a new ``<code>.json`` next to ``en.json`` mirroring
its key tree, then add a one-line entry to ``LANGUAGES``. The sidebar
selector picks it up automatically.
Translation lookup is intentionally tiny — no gettext, no babel, no
po-file pipeline. Callers pass placeholder values to ``t()`` as keyword
arguments, and ``t()`` applies them with ``str.format`` after resolving
the key.
"""
from __future__ import annotations
import json
from functools import lru_cache
from pathlib import Path
from typing import Any
# Directory holding the ``<code>.json`` pack files, next to this module.
_PACK_DIR = Path(__file__).resolve().parent / "packs"
# Canonical language: used when no language is selected and as the
# fallback pack for keys missing from another language.
_DEFAULT_LANG = "en"
# ``st.session_state`` key under which the selected language code is kept.
_SESSION_KEY = "ui_lang"
# Display registry. ``label`` is what the sidebar shows; ``code`` is the
# JSON filename stem. Keep this list short — every entry must have a
# fully-translated pack in ``packs/``.
LANGUAGES: list[dict[str, str]] = [
    {"code": "en", "label": "English"},
    {"code": "es", "label": "Español"},
]
def available_languages() -> list[dict[str, str]]:
    """Return a copy of the language registry.

    Both the list and every entry dict are new objects, so callers can
    mutate the result without touching ``LANGUAGES``.
    """
    return list(map(dict, LANGUAGES))
@lru_cache(maxsize=8)
def _load_pack(lang: str) -> dict[str, Any]:
    """Load and parse ``<lang>.json`` from the pack directory.

    Returns an empty dict when no such file exists. Results are memoised
    per language code by ``lru_cache``, so re-renders do not reparse a
    pack that is still in the cache.
    """
    pack_file = _PACK_DIR / f"{lang}.json"
    if pack_file.exists():
        return json.loads(pack_file.read_text(encoding="utf-8"))
    return {}
def _resolve(pack: dict[str, Any], key: str) -> Any:
"""Walk a dotted key through a nested dict. Returns None if absent."""
node: Any = pack
for part in key.split("."):
if not isinstance(node, dict) or part not in node:
return None
node = node[part]
return node
def t(key: str, lang: str | None = None, /, **kwargs: Any) -> str:
    """Look up *key* in a language pack and interpolate *kwargs*.

    Fallback chain: requested lang → English → the key itself, so a typo
    shows up as a visible dotted key instead of a crash.

    Args:
        key: Dotted path into the pack, e.g. ``"home.title"``.
        lang: Language code; ``None`` means the value of
            ``current_language()``. Positional-only, so ``key`` and
            ``lang`` stay usable as placeholder names in *kwargs*.
        **kwargs: Named placeholder values applied with ``str.format``
            (``t("upload.using_session_file", name=name)``).

    Returns:
        The resolved string. A non-string pack value is returned as
        ``str(value)`` without interpolation. With no *kwargs* the
        template is returned untouched. If formatting fails, the raw
        template is returned — a translation file must not be able to
        crash the UI.
    """
    if lang is None:
        lang = current_language()
    value = _resolve(_load_pack(lang), key)
    if value is None and lang != _DEFAULT_LANG:
        value = _resolve(_load_pack(_DEFAULT_LANG), key)
    if value is None:
        value = key
    if not isinstance(value, str):
        return str(value)
    if not kwargs:
        return value
    try:
        return value.format(**kwargs)
    except (LookupError, ValueError, AttributeError, TypeError):
        # LookupError: a placeholder the caller did not supply (KeyError)
        # or a positional ``{0}`` (IndexError). ValueError: a stray or
        # unbalanced brace in the template. AttributeError / TypeError:
        # ``{name.attr}`` / ``{name[0]}`` on a value that lacks them.
        return value
def current_language() -> str:
    """Return the active language code, defaulting to English.

    The code is read from ``st.session_state`` so the sidebar selector can
    change it for the current session. Streamlit is imported inside the
    function; if that import fails for any reason the default language is
    returned, which keeps this module importable without Streamlit.
    """
    try:
        import streamlit
    except Exception:
        # Best-effort: any import failure is treated as "no session".
        return _DEFAULT_LANG
    session = streamlit.session_state
    return session.get(_SESSION_KEY, _DEFAULT_LANG)
def set_language(lang: str) -> None:
    """Store *lang* as the session's language.

    Writes the code into ``st.session_state``. Does nothing when
    Streamlit cannot be imported.
    """
    try:
        import streamlit
    except Exception:
        return None
    streamlit.session_state[_SESSION_KEY] = lang
    return None
def render_language_selector(*, location: str = "sidebar") -> None:
    """Render the language picker and apply a changed selection.

    ``location="sidebar"`` (the default) draws the select box in
    ``st.sidebar``; any other value draws it through the top-level ``st``
    module. Options are the registered language codes, displayed by their
    labels. When the chosen code differs from the active one, it is stored
    with ``set_language`` and the script is rerun.
    """
    import streamlit as st

    container = st.sidebar if location == "sidebar" else st

    codes = []
    labels = {}
    for entry in LANGUAGES:
        codes.append(entry["code"])
        labels[entry["code"]] = entry["label"]

    active = current_language()
    if active not in codes:
        # An unregistered code in the session is shown as the default.
        active = _DEFAULT_LANG

    def _label_for(code):
        return labels.get(code, code)

    picked = container.selectbox(
        t("chrome.language_label"),
        codes,
        index=codes.index(active),
        format_func=_label_for,
        key="_ui_lang_select",
    )
    if picked == active:
        return
    set_language(picked)
    st.rerun()
# Public names of this package; this is also what ``from src.i18n import *``
# exports. Underscore-prefixed helpers are deliberately left out.
__all__ = [
    "LANGUAGES",
    "available_languages",
    "current_language",
    "render_language_selector",
    "set_language",
    "t",
]

97
src/i18n/packs/en.json Normal file
View File

@@ -0,0 +1,97 @@
{
"chrome": {
"language_label": "Language",
"footer": "Runs locally. Your data never leaves this computer. | DataTools v3.0"
},
"home": {
"page_title": "DataTools — Data Cleaning Mastery",
"title": "🧹 DataTools — Data Cleaning Mastery",
"caption": "A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.",
"findings_badge_one": "{n} finding",
"findings_badge_other": "{n} findings"
},
"status": {
"ready": "Ready",
"coming_soon": "Coming Soon"
},
"upload": {
"heading": "📤 Upload a file to start",
"intro": "Optional: scan an uploaded file for data quality issues and see which tools can fix each one. Skip if you already know what you need.",
"limits": "**Up to 1 GB.** Formats: CSV, TSV, XLSX, XLS. Delimiters auto-detected: comma, tab, semicolon, pipe. Encodings auto-detected: UTF-8 (with/without BOM), UTF-16, cp1252, Latin-1/9, cp1250, ISO-8859-2, cp1251, KOI8-R, Mac Roman, Shift_JIS, GB18030, Big5, EUC-KR — and override on the Review page.",
"uploader_label": "Upload CSV or Excel",
"uploader_help": "Up to 1 GB. Comma / tab / semicolon / pipe delimiters all auto-detected. Encoding auto-detected with override on the Review page if needed.",
"run_button": "Run analysis",
"skip_button": "Skip",
"scanning": "Scanning…",
"skipped_notice": "Analysis skipped. Open any tool below to start working.",
"using_session_file": "Using **{name}** from the upload screen.",
"use_different_file": "Use a different file",
"switch_back": "Switch back to upload-screen file",
"pickup_caption": "Up to 1 GB. Delimiters auto-detected: comma, tab, semicolon, pipe. Encoding auto-detected (UTF-8 / UTF-16 / cp1252 / Latin-1 family / cp1250 / cp1251 / KOI8-R / Mac Roman / Shift_JIS / GB18030 / Big5 / EUC-KR), with override on the Review page."
},
"findings": {
"header": "Detected issues",
"none": "No issues detected. Open any tool below to start working.",
"severity_summary_segment": "{icon} {n} {severity}",
"tool_section_label": "{tool} — {n} finding(s)",
"other_section_label": "Other / file-level — {n} finding(s)",
"open_tool": "Open {tool} →",
"untargeted_label": "Informational"
},
"gate": {
"warning": "**{name}** must pass the CSV-normalization gate before you can use this tool. Open the Review page to apply the fixes our analyzer recommends.",
"default_name": "the uploaded file",
"open_review": "Go to Review & Normalize"
},
"quit": {
"button": "Quit app",
"shutting_down": "Shutting down… you can close this window.",
"farewell_title": "DataTools has shut down",
"farewell_subtitle": "You can close this window."
},
"close_page": {
"page_title": "DataTools — Close",
"title": "🛑 Close DataTools",
"caption": "Shut down the local app and free the terminal.",
"body": "Clicking the button below will terminate the DataTools server. Any unsaved work in other tools will be lost. Once the app shuts down you can close this window.",
"button": "Close the app"
},
"tools": {
"01_deduplicator": {
"name": "Deduplicator",
"description": "Fuzzy matching, normalization, survivor selection, and interactive review."
},
"02_text_cleaner": {
"name": "Text Cleaner",
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling."
},
"03_format_standardizer": {
"name": "Format Standardizer",
"description": "Standardize dates, currencies, names, phone numbers, and addresses."
},
"04_missing_handler": {
"name": "Missing Value Handler",
"description": "Detect disguised nulls, missingness analysis, and imputation strategies."
},
"05_column_mapper": {
"name": "Column Mapper",
"description": "Rename columns, enforce a target schema, and coerce types."
},
"06_outlier_detector": {
"name": "Outlier Detector",
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization."
},
"07_multi_file_merger": {
"name": "Multi-File Merger",
"description": "Combine multiple CSV/Excel files with schema alignment."
},
"08_validator_reporter": {
"name": "Validator & Reporter",
"description": "Validate against rules and generate PDF/Excel quality reports."
},
"09_pipeline_runner": {
"name": "Pipeline Runner",
"description": "Chain tools in recommended order and pass output between steps."
}
}
}

97
src/i18n/packs/es.json Normal file
View File

@@ -0,0 +1,97 @@
{
"chrome": {
"language_label": "Idioma",
"footer": "Se ejecuta localmente. Tus datos nunca salen de este equipo. | DataTools v3.0"
},
"home": {
"page_title": "DataTools — Maestría en limpieza de datos",
"title": "🧹 DataTools — Maestría en limpieza de datos",
"caption": "Conjunto de 9 herramientas para limpiar, estandarizar y validar datos tabulares. Se ejecuta 100% en local.",
"findings_badge_one": "{n} hallazgo",
"findings_badge_other": "{n} hallazgos"
},
"status": {
"ready": "Listo",
"coming_soon": "Próximamente"
},
"upload": {
"heading": "📤 Sube un archivo para empezar",
"intro": "Opcional: analiza un archivo para detectar problemas de calidad de datos y ver qué herramientas pueden corregir cada uno. Sáltalo si ya sabes lo que necesitas.",
"limits": "**Hasta 1 GB.** Formatos: CSV, TSV, XLSX, XLS. Delimitadores detectados automáticamente: coma, tabulador, punto y coma, barra vertical. Codificaciones detectadas automáticamente: UTF-8 (con/sin BOM), UTF-16, cp1252, Latin-1/9, cp1250, ISO-8859-2, cp1251, KOI8-R, Mac Roman, Shift_JIS, GB18030, Big5, EUC-KR — y se pueden sustituir desde la página Revisar.",
"uploader_label": "Sube un archivo CSV o Excel",
"uploader_help": "Hasta 1 GB. Delimitadores coma / tabulador / punto y coma / barra vertical detectados automáticamente. Codificación detectada automáticamente, con opción de sustituirla en la página Revisar.",
"run_button": "Ejecutar análisis",
"skip_button": "Omitir",
"scanning": "Analizando…",
"skipped_notice": "Análisis omitido. Abre cualquier herramienta de abajo para empezar a trabajar.",
"using_session_file": "Usando **{name}** de la pantalla de carga.",
"use_different_file": "Usar otro archivo",
"switch_back": "Volver al archivo de la pantalla de carga",
"pickup_caption": "Hasta 1 GB. Delimitadores detectados automáticamente: coma, tabulador, punto y coma, barra vertical. Codificación detectada automáticamente (UTF-8 / UTF-16 / cp1252 / familia Latin-1 / cp1250 / cp1251 / KOI8-R / Mac Roman / Shift_JIS / GB18030 / Big5 / EUC-KR), con opción de sustituirla en la página Revisar."
},
"findings": {
"header": "Problemas detectados",
"none": "No se detectaron problemas. Abre cualquier herramienta de abajo para empezar a trabajar.",
"severity_summary_segment": "{icon} {n} {severity}",
"tool_section_label": "{tool} — {n} hallazgo(s)",
"other_section_label": "Otros / a nivel de archivo — {n} hallazgo(s)",
"open_tool": "Abrir {tool} →",
"untargeted_label": "Informativo"
},
"gate": {
"warning": "**{name}** debe pasar la verificación de normalización CSV antes de poder usar esta herramienta. Abre la página Revisar para aplicar las correcciones recomendadas por el analizador.",
"default_name": "el archivo cargado",
"open_review": "Ir a Revisar y Normalizar"
},
"quit": {
"button": "Cerrar app",
"shutting_down": "Cerrando… ya puedes cerrar esta ventana.",
"farewell_title": "DataTools se ha cerrado",
"farewell_subtitle": "Ya puedes cerrar esta ventana."
},
"close_page": {
"page_title": "DataTools — Cerrar",
"title": "🛑 Cerrar DataTools",
"caption": "Detén la aplicación local y libera la terminal.",
"body": "Al pulsar el botón de abajo se cerrará el servidor de DataTools. Cualquier trabajo sin guardar en otras herramientas se perderá. Una vez cerrada la app, puedes cerrar esta ventana.",
"button": "Cerrar la app"
},
"tools": {
"01_deduplicator": {
"name": "Eliminador de duplicados",
"description": "Coincidencia difusa, normalización, selección de superviviente y revisión interactiva."
},
"02_text_cleaner": {
"name": "Limpiador de texto",
"description": "Recorte de espacios, colapso de espacios múltiples, normalización Unicode, manejo de BOM y de finales de línea."
},
"03_format_standardizer": {
"name": "Estandarizador de formatos",
"description": "Estandariza fechas, monedas, nombres, números de teléfono y direcciones."
},
"04_missing_handler": {
"name": "Gestor de valores faltantes",
"description": "Detecta nulos disfrazados, analiza la ausencia de datos y aplica estrategias de imputación."
},
"05_column_mapper": {
"name": "Mapeador de columnas",
"description": "Renombra columnas, aplica un esquema objetivo y fuerza tipos de datos."
},
"06_outlier_detector": {
"name": "Detector de valores atípicos",
"description": "Detección por Z-score, IQR y MAD con reglas de dominio y winsorización."
},
"07_multi_file_merger": {
"name": "Combinador de varios archivos",
"description": "Combina varios archivos CSV/Excel alineando sus esquemas."
},
"08_validator_reporter": {
"name": "Validador e informes",
"description": "Valida contra reglas y genera informes de calidad en PDF/Excel."
},
"09_pipeline_runner": {
"name": "Ejecutor de canalizaciones",
"description": "Encadena herramientas en el orden recomendado y pasa la salida entre pasos."
}
}
}

174
tests/test_lang_packs.py Normal file
View File

@@ -0,0 +1,174 @@
"""Tests for the GUI language-pack i18n module.
Covers:
- t() basic lookup, missing-key fallback to English, then to the key.
- str.format kwargs interpolation and tolerant handling of missing keys.
- Parity between English and Spanish packs so a new key in en.json
doesn't silently regress to English when es is active.
- The JS-escape helper used by the farewell overlay.
"""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from src.i18n import LANGUAGES, available_languages, t
# Loaded once for the parity test.
_PACK_DIR = Path(__file__).resolve().parent.parent / "src" / "i18n" / "packs"
def _flatten(obj, prefix=""):
"""Yield dotted-key paths from a nested dict pack."""
if isinstance(obj, dict):
for k, v in obj.items():
path = f"{prefix}.{k}" if prefix else k
yield from _flatten(v, path)
else:
yield prefix
def _load_pack(code: str) -> dict:
    """Parse ``<code>.json`` straight from the pack directory."""
    pack_path = _PACK_DIR / f"{code}.json"
    return json.loads(pack_path.read_text(encoding="utf-8"))
class TestLookup:
    """Plain key lookup and the lang → English → key fallback chain."""

    def test_returns_english_value_by_default(self):
        title = t("home.title", "en")
        assert title.startswith("🧹 DataTools")

    def test_returns_spanish_value(self):
        title = t("home.title", "es")
        assert "Maestría" in title

    def test_missing_key_falls_back_to_english(self):
        # The key is absent from the Spanish pack and from the English
        # fallback, so the lookup ends at the literal key string — a
        # wrong key is visible on screen rather than silent.
        bogus = "definitely.not.a.real.key"
        assert t(bogus, "es") == bogus

    def test_spanish_missing_key_falls_back_to_english(self, tmp_path, monkeypatch):
        # A key that exists only in en.json must resolve through the
        # English fallback when Spanish is requested, not come back as
        # the dotted key.
        from src import i18n as i18n_mod

        english = {"a": {"b": "english-only"}}
        (tmp_path / "en.json").write_text(json.dumps(english), encoding="utf-8")
        (tmp_path / "es.json").write_text(json.dumps({}), encoding="utf-8")

        # Drop any cached real packs, then point the loader at the
        # temp dir holding the sparse Spanish pack.
        i18n_mod._load_pack.cache_clear()
        monkeypatch.setattr(i18n_mod, "_PACK_DIR", tmp_path)
        try:
            assert i18n_mod.t("a.b", "es") == "english-only"
        finally:
            # Do not leak the temp packs into later tests via the cache.
            i18n_mod._load_pack.cache_clear()
class TestInterpolation:
    """``str.format`` interpolation of keyword arguments passed to ``t()``."""

    def test_named_placeholder(self):
        # The English ``upload.using_session_file`` template has ``{name}``.
        rendered = t("upload.using_session_file", "en", name="data.csv")
        assert "data.csv" in rendered

    def test_missing_placeholder_is_tolerated(self):
        # A caller that forgets the placeholder gets the raw template
        # back, braces and all, instead of an exception.
        rendered = t("upload.using_session_file", "en")
        assert "{name}" in rendered
class TestPackParity:
    """Every key in en.json must exist in every other registered pack.

    A divergence means a user with that language sees an English
    fallback for a string the translator hasn't been told about, which
    is a translation gap we want CI to surface.

    Both checks run over every non-English code in ``LANGUAGES`` (Spanish
    is always included), so a newly registered language is covered
    without touching this class. The method names are kept from when
    Spanish was the only other pack.
    """

    @staticmethod
    def _other_codes():
        """Return the non-English pack codes to compare against en.json."""
        registered = {entry["code"] for entry in LANGUAGES}
        return sorted((registered | {"es"}) - {"en"})

    def test_es_mirrors_en(self):
        en_keys = set(_flatten(_load_pack("en")))
        for code in self._other_codes():
            missing = en_keys - set(_flatten(_load_pack(code)))
            assert not missing, f"{code!r} pack missing keys: {sorted(missing)}"

    def test_no_orphan_keys_in_es(self):
        # The other direction: keys that no longer exist in English are
        # dead weight in a translated pack; flag them too.
        en_keys = set(_flatten(_load_pack("en")))
        for code in self._other_codes():
            orphans = set(_flatten(_load_pack(code))) - en_keys
            assert not orphans, f"{code!r} pack has stale keys: {sorted(orphans)}"
class TestRegistry:
    """The language registry and the pack files on disk agree."""

    def test_languages_listed(self):
        registered = {entry["code"] for entry in available_languages()}
        assert registered.issuperset({"en", "es"})

    def test_every_registered_lang_has_a_pack(self):
        for entry in LANGUAGES:
            pack_path = _PACK_DIR / f"{entry['code']}.json"
            assert pack_path.exists()
class TestFarewellEscape:
    """Pin the escaping contract of the farewell overlay helper.

    Pack strings are interpolated into a JS payload, so a quote or angle
    bracket in a translation — malicious or accidental — must not be able
    to break out of the JS string or the surrounding HTML. The helper is
    tested directly.
    """

    def test_escapes_quotes_and_html(self):
        from src.gui.components._legacy import _js_html_safe

        escaped = _js_html_safe("Cerrando 'app' <script>x</script>")
        # At least one escaped quote is present, and once the escaped
        # ones are removed no bare single quote is left to terminate
        # the JS string literal wrapping the payload.
        assert "\\'" in escaped
        without_escaped_quotes = escaped.replace("\\'", "")
        assert "'" not in without_escaped_quotes
        # Tags arrive as entities, never as markup.
        assert "<script>" not in escaped
        assert "&lt;script&gt;" in escaped

    def test_backslash_doubled(self):
        from src.gui.components._legacy import _js_html_safe

        doubled = _js_html_safe("a\\b")
        assert doubled == "a\\\\b"
class TestKeyCoverage:
    """Spot-check keys the GUI depends on.

    A key renamed in the code or in a pack, but not in both, would
    otherwise vanish silently from one side.
    """

    # Keys that must resolve to a real string in English and Spanish.
    _REQUIRED_KEYS = [
        "home.title",
        "home.caption",
        "chrome.footer",
        "chrome.language_label",
        "upload.heading",
        "upload.run_button",
        "upload.skip_button",
        "findings.header",
        "findings.none",
        "gate.warning",
        "gate.open_review",
        "quit.button",
        "quit.shutting_down",
        "quit.farewell_title",
        "quit.farewell_subtitle",
        "close_page.title",
        "close_page.button",
        "status.ready",
        "status.coming_soon",
        "tools.01_deduplicator.name",
        "tools.09_pipeline_runner.description",
    ]

    @pytest.mark.parametrize("key", _REQUIRED_KEYS)
    def test_key_resolves_in_both_packs(self, key):
        for lang in ("en", "es"):
            resolved = t(key, lang)
            # Empty means a blank translation; the key itself means the
            # lookup fell all the way through the fallback chain.
            assert resolved not in ("", key), f"missing {key!r} in {lang}"