fix(audit): defensive wrap so audit failures can never blank the GUI

Reported: after pulling commit c73d716 (audit log) the main body of every page showed empty. Sidebar nav still worked. Diagnosis: the most likely path is that something inside the audit calls — ``_render_diagnostics_sidebar()`` calling ``audit_log_path()``, or ``log_session_start()`` itself — raises during ``hide_streamlit_chrome`` on the user's environment (Python 3.14 on Windows, a less-tested combo than the test environment). Streamlit's script runner sees the exception, and on some chrome paths it eats it without surfacing an error block, leaving the page body empty. The audit log is best-effort by design. Make that contract real: 1. ``hide_streamlit_chrome`` now wraps both ``log_session_start()`` and ``_render_diagnostics_sidebar()`` in try/except. Errors print to stderr (so the developer running ``python -m src.gui`` sees them in the launcher's console) but never bubble up to kill the page render. 2. ``audit_log_path()`` already had a tempdir fallback for the primary mkdir failure, but the SECOND mkdir wasn't protected either. Restructured to a two-level fallback: configured dir → tempdir → ``/dev/null`` (or ``NUL`` on Windows). The last fallback ensures the function never raises; ``log_event``'s own try/except handles the eventual unwritable-file case. 3. ``log_page_open(slug)`` now has an outer try/except so it cannot raise either — protecting every tool page's render path. If a user reports the same symptom again, the launcher terminal will now show a real traceback explaining what's wrong, and the GUI will still render normally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 02:00:31 +00:00
parent ee0b1f6f6b
commit 59c6d0f914
2 changed files with 63 additions and 19 deletions
--- a/src/audit.py
+++ b/src/audit.py
@@ -85,23 +85,47 @@ def audit_log_path() -> Path:
    The path is created the first time it's queried so each Python
    process gets a single file regardless of how many Streamlit
    reruns happen.
+
+    Two-level fallback so this NEVER raises: first try the configured
+    audit dir, then a tempdir, then ``/dev/null``-equivalent (just a
+    nonexistent path) as a last resort. Callers downstream skip
+    writing when the file isn't writable.
    """
    global _LOG_PATH
    with _LOCK:
        if _LOG_PATH is None:
+            try:
                ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ")
                sid = _session_id()[:8]
-            d = audit_log_dir()
+            except Exception:
+                ts, sid = "unknown", "unknown"
+            name = f"datatools-{ts}-{sid}.jsonl"
+
+            # First choice: configured audit dir.
+            d = None
            try:
+                d = audit_log_dir()
                d.mkdir(parents=True, exist_ok=True)
            except Exception:
-                # If we can't create the dir, fall back to a tmpdir
-                # location so we never crash the app for the audit
-                # log's sake.
+                d = None
+
+            # Fallback: tempdir.
+            if d is None:
+                try:
                    import tempfile
                    d = Path(tempfile.gettempdir()) / "datatools-logs"
                    d.mkdir(parents=True, exist_ok=True)
-            _LOG_PATH = d / f"datatools-{ts}-{sid}.jsonl"
+                except Exception:
+                    d = None
+
+            # Last resort: point at a path we won't try to write to —
+            # ``log_event``'s own try/except handles the eventual
+            # write failure cleanly.
+            if d is None:
+                d = Path("/dev/null") if os.name != "nt" else Path("NUL")
+                _LOG_PATH = d
+            else:
+                _LOG_PATH = d / name
        return _LOG_PATH


@@ -194,7 +218,11 @@ def log_page_open(slug: str) -> None:
    they actually switch pages, not one per rerun. Falls back to
    always-emit when session state is unreachable (running outside
    Streamlit, e.g. in tests).
+
+    Whole body is wrapped in try/except — the audit log is best-
+    effort and MUST NOT crash the page that called it.
    """
+    try:
        try:
            import streamlit as st
            prev = st.session_state.get("_audit_current_page")
@@ -204,6 +232,8 @@ def log_page_open(slug: str) -> None:
        except Exception:
            pass
        log_event("nav", f"Opened {slug}", page=slug)
+    except Exception:
+        pass


 def reset_for_tests() -> None:
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -157,8 +157,14 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
    st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
    # Stamp a session-start record into the audit log the first time
    # any page renders. Idempotent — subsequent calls are no-ops.
+    # Wrapped because a broken audit log MUST NOT take the GUI down.
+    try:
        from src.audit import log_session_start
        log_session_start()
+    except Exception:
+        import traceback, sys
+        print("DataTools: audit log session-start failed:", file=sys.stderr)
+        traceback.print_exc()
    # Production-safe check runs first so a misconfigured shipped
    # build refuses to render anything (rather than rendering a
    # broken activation form that doesn't accept real blobs).
@@ -176,7 +182,15 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
        require_license_or_render_activation,
    )
    render_license_status_sidebar()
+    # Wrapped — a broken audit-log or filesystem permission issue must
+    # NOT kill the page-body render. Errors print to stderr so the
+    # developer can still see what went wrong.
+    try:
        _render_diagnostics_sidebar()
+    except Exception:
+        import traceback, sys
+        print("DataTools: diagnostics sidebar render failed:", file=sys.stderr)
+        traceback.print_exc()
    if gate_license:
        require_license_or_render_activation()