feat(i18n): add language-pack scaffold with English and Spanish

Introduces ``src/i18n`` with a tiny JSON-backed t() lookup, an in-session language preference, and a sidebar selector wired through ``hide_streamlit_chrome`` so every page picks up the same picker. Covers home, tool cards, findings panel, gate, shutdown, and pickup banner strings. Tests pin pack parity and the farewell-overlay JS escape so future packs can't silently regress.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 15:11:30 +00:00
parent 4706ed571e
commit c4ce86bd64
8 changed files with 649 additions and 75 deletions
--- a/src/i18n/__init__.py
+++ b/src/i18n/__init__.py
@@ -0,0 +1,155 @@
+"""Language packs for the DataTools GUI.
+
+A language pack is a JSON file under ``src/i18n/packs/`` keyed by ISO 639-1
+language code (``en.json``, ``es.json``, …). Keys are dotted paths
+(``home.title``, ``tools.01_deduplicator.name``); values are the translated
+strings. The English pack is canonical — missing keys in other packs fall
+back to the English value, and missing keys in English fall back to the
+key itself so a typo surfaces as a visible string instead of a crash.
+
+Adding a language: drop a new ``<code>.json`` next to ``en.json`` mirroring
+its key tree, then add a one-line entry to ``LANGUAGES``. The sidebar
+selector picks it up automatically.
+
+Translation lookup is intentionally tiny — no gettext, no babel, no
+po-file pipeline. ``t()`` resolves a key and, when the caller passes
+keyword arguments, interpolates them into the result via ``str.format``.
+"""
+
+from __future__ import annotations
+
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+# Directory holding the ``<code>.json`` pack files, resolved relative to
+# this module's own location.
+_PACK_DIR = Path(__file__).resolve().parent / "packs"
+# Code of the canonical pack; lookups fall back to it when the requested
+# language has no value for a key.
+_DEFAULT_LANG = "en"
+# ``st.session_state`` key under which the active language code is stored.
+_SESSION_KEY = "ui_lang"
+
+
+# Display registry. ``label`` is what the sidebar shows; ``code`` is the
+# JSON filename stem. Keep this list short — every entry must have a
+# fully-translated pack in ``packs/``.
+LANGUAGES: list[dict[str, str]] = [
+    {"code": "en", "label": "English"},
+    {"code": "es", "label": "Español"},
+]
+
+
+def available_languages() -> list[dict[str, str]]:
+    """Return a copy of the language registry.
+
+    Every call builds a new list of new dicts, so callers can mutate the
+    result without touching ``LANGUAGES``.
+    """
+    return [entry.copy() for entry in LANGUAGES]
+
+
+@lru_cache(maxsize=8)
+def _load_pack(lang: str) -> dict[str, Any]:
+    """Parse and return the pack for *lang*, or ``{}`` if no file exists.
+
+    Results are memoised per language code, so re-renders reuse the
+    parsed pack instead of reading the JSON file again.
+    """
+    pack_file = _PACK_DIR / f"{lang}.json"
+    if pack_file.exists():
+        return json.loads(pack_file.read_text(encoding="utf-8"))
+    return {}
+
+
+def _resolve(pack: dict[str, Any], key: str) -> Any:
+    """Follow the dot-separated *key* down through nested dicts in *pack*.
+
+    Returns the value at the end of the path, or ``None`` as soon as a
+    segment is missing or the node being descended into is not a dict.
+    """
+    current: Any = pack
+    for segment in key.split("."):
+        if isinstance(current, dict) and segment in current:
+            current = current[segment]
+        else:
+            return None
+    return current
+
+
+def t(key: str, lang: str | None = None, /, **kwargs: Any) -> str:
+    """Look up *key* in a language pack and return the translated string.
+
+    Fallback chain: requested lang → English → the key itself.
+
+    Args:
+        key: Dotted path into the pack, e.g. ``"home.title"``.
+        lang: Language code to look up; ``None`` means the current
+            session language. ``key`` and ``lang`` are positional-only,
+            so both names stay available as placeholders in ``kwargs``.
+        **kwargs: Named placeholders passed through ``str.format``, e.g.
+            ``t("upload.using_session_file", name=name)``.
+
+    Returns:
+        The translated string with placeholders filled in. A resolved
+        value that is not a string is returned as ``str(value)`` without
+        interpolation. If formatting fails for any reason attributable
+        to the template or the supplied placeholders, the unformatted
+        template is returned (raw braces included) rather than raising,
+        because a translation file shouldn't be able to crash the UI.
+    """
+    if lang is None:
+        lang = current_language()
+
+    value = _resolve(_load_pack(lang), key)
+    if value is None and lang != _DEFAULT_LANG:
+        value = _resolve(_load_pack(_DEFAULT_LANG), key)
+    if value is None:
+        value = key
+
+    if not isinstance(value, str):
+        return str(value)
+    if not kwargs:
+        return value
+
+    try:
+        return value.format(**kwargs)
+    except (KeyError, IndexError, ValueError, AttributeError, TypeError):
+        # KeyError / IndexError: a placeholder the caller did not supply.
+        # ValueError: malformed template (stray brace, bad format spec).
+        # AttributeError / TypeError: ``{x.attr}`` or ``{x[0]}`` applied
+        # to a value that does not support it.
+        return value
+
+
+def current_language() -> str:
+    """Return the active language code, defaulting to English.
+
+    The code is read from ``st.session_state`` so that a selector can
+    change it for the current session. If Streamlit cannot be imported,
+    or no language has been stored on the session yet, the default code
+    is returned; this keeps the module usable without Streamlit (e.g. in
+    tests).
+    """
+    try:
+        import streamlit
+    except Exception:
+        active = _DEFAULT_LANG
+    else:
+        active = streamlit.session_state.get(_SESSION_KEY, _DEFAULT_LANG)
+    return active
+
+
+def set_language(lang: str) -> None:
+    """Store *lang* as the session's language code.
+
+    Does nothing when Streamlit cannot be imported.
+    """
+    try:
+        import streamlit
+    except Exception:
+        return
+    else:
+        streamlit.session_state[_SESSION_KEY] = lang
+
+
+def render_language_selector(*, location: str = "sidebar") -> None:
+    """Draw the language select box and apply a changed selection.
+
+    ``location="sidebar"`` (the default) renders into ``st.sidebar``; any
+    other value renders inline through the top-level ``st`` module. When
+    the chosen code differs from the active one, it is stored with
+    ``set_language`` and the script is rerun.
+    """
+    import streamlit as st
+
+    codes = [entry["code"] for entry in LANGUAGES]
+    label_by_code = {entry["code"]: entry["label"] for entry in LANGUAGES}
+
+    # A stored code that is no longer in the registry falls back to the
+    # default so the select box always has a valid initial index.
+    active = current_language()
+    if active not in codes:
+        active = _DEFAULT_LANG
+
+    container = st if location != "sidebar" else st.sidebar
+
+    def _display(code: str) -> str:
+        return label_by_code.get(code, code)
+
+    selected = container.selectbox(
+        t("chrome.language_label"),
+        codes,
+        index=codes.index(active),
+        format_func=_display,
+        key="_ui_lang_select",
+    )
+    if selected == active:
+        return
+    set_language(selected)
+    st.rerun()
+
+
+# Names exported by a star-import of this module; the underscore-prefixed
+# helpers and constants are deliberately left out.
+__all__ = [
+    "LANGUAGES",
+    "available_languages",
+    "current_language",
+    "render_language_selector",
+    "set_language",
+    "t",
+]
--- a/src/i18n/packs/en.json
+++ b/src/i18n/packs/en.json
@@ -0,0 +1,97 @@
+{
+  "chrome": {
+    "language_label": "Language",
+    "footer": "Runs locally. Your data never leaves this computer. | DataTools v3.0"
+  },
+  "home": {
+    "page_title": "DataTools — Data Cleaning Mastery",
+    "title": "🧹 DataTools — Data Cleaning Mastery",
+    "caption": "A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.",
+    "findings_badge_one": "{n} finding",
+    "findings_badge_other": "{n} findings"
+  },
+  "status": {
+    "ready": "Ready",
+    "coming_soon": "Coming Soon"
+  },
+  "upload": {
+    "heading": "📤 Upload a file to start",
+    "intro": "Optional: scan an uploaded file for data quality issues and see which tools can fix each one. Skip if you already know what you need.",
+    "limits": "**Up to 1 GB.** Formats: CSV, TSV, XLSX, XLS. Delimiters auto-detected: comma, tab, semicolon, pipe. Encodings auto-detected: UTF-8 (with/without BOM), UTF-16, cp1252, Latin-1/9, cp1250, ISO-8859-2, cp1251, KOI8-R, Mac Roman, Shift_JIS, GB18030, Big5, EUC-KR — and override on the Review page.",
+    "uploader_label": "Upload CSV or Excel",
+    "uploader_help": "Up to 1 GB. Comma / tab / semicolon / pipe delimiters all auto-detected. Encoding auto-detected with override on the Review page if needed.",
+    "run_button": "Run analysis",
+    "skip_button": "Skip",
+    "scanning": "Scanning…",
+    "skipped_notice": "Analysis skipped. Open any tool below to start working.",
+    "using_session_file": "Using **{name}** from the upload screen.",
+    "use_different_file": "Use a different file",
+    "switch_back": "Switch back to upload-screen file",
+    "pickup_caption": "Up to 1 GB. Delimiters auto-detected: comma, tab, semicolon, pipe. Encoding auto-detected (UTF-8 / UTF-16 / cp1252 / Latin-1 family / cp1250 / cp1251 / KOI8-R / Mac Roman / Shift_JIS / GB18030 / Big5 / EUC-KR), with override on the Review page."
+  },
+  "findings": {
+    "header": "Detected issues",
+    "none": "No issues detected. Open any tool below to start working.",
+    "severity_summary_segment": "{icon} {n} {severity}",
+    "tool_section_label": "{tool} — {n} finding(s)",
+    "other_section_label": "Other / file-level — {n} finding(s)",
+    "open_tool": "Open {tool} →",
+    "untargeted_label": "Informational"
+  },
+  "gate": {
+    "warning": "**{name}** must pass the CSV-normalization gate before you can use this tool. Open the Review page to apply the fixes our analyzer recommends.",
+    "default_name": "the uploaded file",
+    "open_review": "Go to Review & Normalize"
+  },
+  "quit": {
+    "button": "Quit app",
+    "shutting_down": "Shutting down… you can close this window.",
+    "farewell_title": "DataTools has shut down",
+    "farewell_subtitle": "You can close this window."
+  },
+  "close_page": {
+    "page_title": "DataTools — Close",
+    "title": "🛑 Close DataTools",
+    "caption": "Shut down the local app and free the terminal.",
+    "body": "Clicking the button below will terminate the DataTools server. Any unsaved work in other tools will be lost. Once the app shuts down you can close this window.",
+    "button": "Close the app"
+  },
+  "tools": {
+    "01_deduplicator": {
+      "name": "Deduplicator",
+      "description": "Fuzzy matching, normalization, survivor selection, and interactive review."
+    },
+    "02_text_cleaner": {
+      "name": "Text Cleaner",
+      "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling."
+    },
+    "03_format_standardizer": {
+      "name": "Format Standardizer",
+      "description": "Standardize dates, currencies, names, phone numbers, and addresses."
+    },
+    "04_missing_handler": {
+      "name": "Missing Value Handler",
+      "description": "Detect disguised nulls, missingness analysis, and imputation strategies."
+    },
+    "05_column_mapper": {
+      "name": "Column Mapper",
+      "description": "Rename columns, enforce a target schema, and coerce types."
+    },
+    "06_outlier_detector": {
+      "name": "Outlier Detector",
+      "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization."
+    },
+    "07_multi_file_merger": {
+      "name": "Multi-File Merger",
+      "description": "Combine multiple CSV/Excel files with schema alignment."
+    },
+    "08_validator_reporter": {
+      "name": "Validator & Reporter",
+      "description": "Validate against rules and generate PDF/Excel quality reports."
+    },
+    "09_pipeline_runner": {
+      "name": "Pipeline Runner",
+      "description": "Chain tools in recommended order and pass output between steps."
+    }
+  }
+}
--- a/src/i18n/packs/es.json
+++ b/src/i18n/packs/es.json
@@ -0,0 +1,97 @@
+{
+  "chrome": {
+    "language_label": "Idioma",
+    "footer": "Se ejecuta localmente. Tus datos nunca salen de este equipo. | DataTools v3.0"
+  },
+  "home": {
+    "page_title": "DataTools — Maestría en limpieza de datos",
+    "title": "🧹 DataTools — Maestría en limpieza de datos",
+    "caption": "Conjunto de 9 herramientas para limpiar, estandarizar y validar datos tabulares. Se ejecuta 100% en local.",
+    "findings_badge_one": "{n} hallazgo",
+    "findings_badge_other": "{n} hallazgos"
+  },
+  "status": {
+    "ready": "Listo",
+    "coming_soon": "Próximamente"
+  },
+  "upload": {
+    "heading": "📤 Sube un archivo para empezar",
+    "intro": "Opcional: analiza un archivo para detectar problemas de calidad de datos y ver qué herramientas pueden corregir cada uno. Sáltalo si ya sabes lo que necesitas.",
+    "limits": "**Hasta 1 GB.** Formatos: CSV, TSV, XLSX, XLS. Delimitadores detectados automáticamente: coma, tabulador, punto y coma, barra vertical. Codificaciones detectadas automáticamente: UTF-8 (con/sin BOM), UTF-16, cp1252, Latin-1/9, cp1250, ISO-8859-2, cp1251, KOI8-R, Mac Roman, Shift_JIS, GB18030, Big5, EUC-KR — y se pueden sustituir desde la página Revisar.",
+    "uploader_label": "Sube un archivo CSV o Excel",
+    "uploader_help": "Hasta 1 GB. Delimitadores coma / tabulador / punto y coma / barra vertical detectados automáticamente. Codificación detectada automáticamente, con opción de sustituirla en la página Revisar.",
+    "run_button": "Ejecutar análisis",
+    "skip_button": "Omitir",
+    "scanning": "Analizando…",
+    "skipped_notice": "Análisis omitido. Abre cualquier herramienta de abajo para empezar a trabajar.",
+    "using_session_file": "Usando **{name}** de la pantalla de carga.",
+    "use_different_file": "Usar otro archivo",
+    "switch_back": "Volver al archivo de la pantalla de carga",
+    "pickup_caption": "Hasta 1 GB. Delimitadores detectados automáticamente: coma, tabulador, punto y coma, barra vertical. Codificación detectada automáticamente (UTF-8 / UTF-16 / cp1252 / familia Latin-1 / cp1250 / cp1251 / KOI8-R / Mac Roman / Shift_JIS / GB18030 / Big5 / EUC-KR), con opción de sustituirla en la página Revisar."
+  },
+  "findings": {
+    "header": "Problemas detectados",
+    "none": "No se detectaron problemas. Abre cualquier herramienta de abajo para empezar a trabajar.",
+    "severity_summary_segment": "{icon} {n} {severity}",
+    "tool_section_label": "{tool} — {n} hallazgo(s)",
+    "other_section_label": "Otros / a nivel de archivo — {n} hallazgo(s)",
+    "open_tool": "Abrir {tool} →",
+    "untargeted_label": "Informativo"
+  },
+  "gate": {
+    "warning": "**{name}** debe pasar la verificación de normalización CSV antes de poder usar esta herramienta. Abre la página Revisar para aplicar las correcciones recomendadas por el analizador.",
+    "default_name": "el archivo cargado",
+    "open_review": "Ir a Revisar y Normalizar"
+  },
+  "quit": {
+    "button": "Cerrar app",
+    "shutting_down": "Cerrando… ya puedes cerrar esta ventana.",
+    "farewell_title": "DataTools se ha cerrado",
+    "farewell_subtitle": "Ya puedes cerrar esta ventana."
+  },
+  "close_page": {
+    "page_title": "DataTools — Cerrar",
+    "title": "🛑 Cerrar DataTools",
+    "caption": "Detén la aplicación local y libera la terminal.",
+    "body": "Al pulsar el botón de abajo se cerrará el servidor de DataTools. Cualquier trabajo sin guardar en otras herramientas se perderá. Una vez cerrada la app, puedes cerrar esta ventana.",
+    "button": "Cerrar la app"
+  },
+  "tools": {
+    "01_deduplicator": {
+      "name": "Eliminador de duplicados",
+      "description": "Coincidencia difusa, normalización, selección de superviviente y revisión interactiva."
+    },
+    "02_text_cleaner": {
+      "name": "Limpiador de texto",
+      "description": "Recorte de espacios, colapso de espacios múltiples, normalización Unicode, manejo de BOM y de finales de línea."
+    },
+    "03_format_standardizer": {
+      "name": "Estandarizador de formatos",
+      "description": "Estandariza fechas, monedas, nombres, números de teléfono y direcciones."
+    },
+    "04_missing_handler": {
+      "name": "Gestor de valores faltantes",
+      "description": "Detecta nulos disfrazados, analiza la ausencia de datos y aplica estrategias de imputación."
+    },
+    "05_column_mapper": {
+      "name": "Mapeador de columnas",
+      "description": "Renombra columnas, aplica un esquema objetivo y fuerza tipos de datos."
+    },
+    "06_outlier_detector": {
+      "name": "Detector de valores atípicos",
+      "description": "Detección por Z-score, IQR y MAD con reglas de dominio y winsorización."
+    },
+    "07_multi_file_merger": {
+      "name": "Combinador de varios archivos",
+      "description": "Combina varios archivos CSV/Excel alineando sus esquemas."
+    },
+    "08_validator_reporter": {
+      "name": "Validador e informes",
+      "description": "Valida contra reglas y genera informes de calidad en PDF/Excel."
+    },
+    "09_pipeline_runner": {
+      "name": "Ejecutor de canalizaciones",
+      "description": "Encadena herramientas en el orden recomendado y pasa la salida entre pasos."
+    }
+  }
+}