feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session JSONL file under ``~/.datatools/logs/`` (overrideable via ``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON object per line, each with a ``message`` field) AND trivially machine-parseable — the support flow is "client mails the file, we read it and explain what went wrong." Format example:: {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session", "session":"a1b2c3d4","message":"Session started", "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh", "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"} {"ts":"...","category":"upload","message":"Uploaded customers.csv", "filename":"customers.csv","bytes":24813} {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)", "filename":"customers.csv","findings":3,"rows":120,"cols":8} {"ts":"...","category":"tool_run","message":"Clean Text run", "page":"2_Text_Cleaner"} {"ts":"...","category":"error","level":"error", "message":"analyze(weird.csv): EmptyDataError: No columns to parse", "filename":"weird.csv","outcome":"empty_after_repair"} Public API: - ``log_event(category, message, **extra)`` - ``log_session_start()`` — idempotent banner with platform info - ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per Streamlit session so reruns don't spam the log - ``log_exception(where, exc, **extra)`` — convenience wrapper - ``audit_log_path()`` / ``audit_log_dir()`` — for the UI Wired in at: - ``hide_streamlit_chrome``: stamps session start, mounts a small "🩺 Diagnostics" expander in the sidebar with the log path and an "Open log folder" button so the user can grab the file to attach to a support email. - Home page: ``upload`` event on every new file, ``upload`` event on per-file remove, ``analyze`` event with file count when Run-analysis fires. 
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols / findings count per file, plus ``error`` events on every caught exception (empty upload, empty after repair, pandas EmptyDataError, generic Exception). - Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event immediately after the primary action stashes its result. - Every tool page (1-9): ``log_page_open(slug)`` on render — deduped via session state so we don't get one event per Streamlit rerun. Safety: - ``log_event`` wraps every write in try/except. A broken audit log must NOT crash the GUI. - Non-JSON-serializable extras are ``str()``-coerced before writing. - File CONTENTS are never logged. We capture filename, byte count, and (in the analyzer) a 12-char sha1 fingerprint of the bytes so the same file re-uploaded gets the same trace. - License keys, session cookies, etc. are not logged. - ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a tmp dir. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions
--- a/src/audit.py
+++ b/src/audit.py
@@ -0,0 +1,227 @@
+"""Audit log — records GUI actions for support diagnostics.
+
+A client running DataTools who hits a bug should be able to grab one
+file off disk, mail it to support, and have us reconstruct what they
+were doing when things broke. That file is the audit log written by
+this module.
+
+Design choices:
+
+- **JSONL**, one event per line. Each line is a valid JSON object; the
+  whole file is grep-friendly, ``jq``-friendly, and still readable in
+  Notepad / TextEdit if no tooling is available. Each event carries a
+  human-readable ``message`` field so the file is useful even without
+  any tooling.
+- **One file per session**, named ``datatools-<utc-timestamp>-<id>.jsonl``.
+  Multiple sessions on the same machine don't clobber each other, and
+  the filename sorts chronologically.
+- **Default location**: ``~/.datatools/logs/`` on every platform.
+  Overrideable via the ``DATATOOLS_AUDIT_DIR`` environment variable —
+  used by tests to redirect writes into a tmp dir.
+- **Never crashes the app**. Every write is wrapped in a try/except;
+  a broken audit log must not take down the GUI.
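+- **No PII bytes**: file CONTENTS are never logged. We log the
+  filename, byte size, and a short content hash so the same file
+  re-uploaded gets the same fingerprint, but the actual bytes stay
+  local. Note that the session-start record does include the OS
+  username, the current working directory, and the log file path.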
+
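+Public API:
+
+- ``log_event(category, message, **extra)`` — write one event.
+- ``log_session_start()`` — emit a session-start record with platform
+  info. Idempotent within a single session.
+- ``log_exception(where, exc, **extra)`` — write an ``error`` event
+  describing a caught exception.
+- ``log_page_open(slug)`` — write a ``nav`` event, deduplicated per
+  Streamlit session so reruns don't spam the log.
+- ``audit_log_path()`` — return the path to the current session's file
+  so the GUI can show it to the user.
+- ``audit_log_dir()`` — return the directory holding all session logs.
+- ``reset_for_tests()`` — clear the module-level session state
+  (test-only).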
+"""
+
+from __future__ import annotations
+
+import getpass
+import json
+import os
+import platform
+import sys
+import threading
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+# Module-level cache for per-session state. Streamlit reruns the script
+# many times per session but the module is imported once, so these
+# survive across reruns within the same Python process.
+#
+# The lock is re-entrant on purpose: a helper that holds it must be able
+# to call another helper that takes it too. With a plain
+# ``threading.Lock`` such a nested acquire blocks forever.
+_LOCK = threading.RLock()
+# Path of this process's log file; computed lazily on first use.
+_LOG_PATH: Path | None = None
+# Random hex id identifying this process's session; minted lazily.
+_SESSION_ID: str | None = None
+# True once the session-start banner has been requested.
+_SESSION_STARTED: bool = False
+
+
+def audit_log_dir() -> Path:
+    """Locate the folder that holds the audit log files.
+
+    The ``DATATOOLS_AUDIT_DIR`` environment variable wins whenever it is
+    set to a non-empty value (tests use it to redirect writes into
+    ``tmp_path``). Otherwise the folder is ``~/.datatools/logs/``.
+
+    The directory is only named here, not created.
+    """
+    custom_dir = os.environ.get("DATATOOLS_AUDIT_DIR")
+    if not custom_dir:
+        return Path.home().joinpath(".datatools", "logs")
+    return Path(custom_dir)
+
+
+def _session_id() -> str:
+    """Return the process-wide session id, minting it on first use.
+
+    The id is a random UUID4 rendered as hex and cached in
+    ``_SESSION_ID``; both the check and the assignment happen under
+    ``_LOCK``.
+    """
+    global _SESSION_ID
+    with _LOCK:
+        current = _SESSION_ID
+        if current is None:
+            current = _SESSION_ID = uuid.uuid4().hex
+        return current
+
+
+def audit_log_path() -> Path:
+    """Return this session's log file path.
+
+    The path is computed once, on the first call, and cached so each
+    Python process gets a single file regardless of how many Streamlit
+    reruns happen. Only the containing directory is created here.
+
+    Filesystem problems never propagate to the caller. If the preferred
+    directory cannot be resolved or created, the path falls back to
+    ``<system temp dir>/datatools-logs``. If that cannot be created
+    either, a path is still returned and later writes to it simply fail.
+    """
+    global _LOG_PATH
+    # Resolve the session id BEFORE taking _LOCK: _session_id() acquires
+    # the same lock, so calling it while holding a non-reentrant lock
+    # would block forever.
+    sid = _session_id()[:8]
+    with _LOCK:
+        if _LOG_PATH is None:
+            ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+            try:
+                # audit_log_dir() stays inside the try: Path.home() raises
+                # when no home directory can be determined.
+                d = audit_log_dir()
+                d.mkdir(parents=True, exist_ok=True)
+            except Exception:
+                # If we can't create the dir, fall back to a tmpdir
+                # location so we never crash the app for the audit
+                # log's sake.
+                import tempfile
+
+                # Last resort if even the temp dir cannot be located:
+                # a path relative to the current working directory.
+                d = Path("datatools-logs")
+                try:
+                    d = Path(tempfile.gettempdir()) / "datatools-logs"
+                    d.mkdir(parents=True, exist_ok=True)
+                except Exception:
+                    # Keep the best path we have; writes to it will fail
+                    # but the GUI keeps running.
+                    pass
+            _LOG_PATH = d / f"datatools-{ts}-{sid}.jsonl"
+        return _LOG_PATH
+
+
+def log_event(
+    category: str,
+    message: str,
+    *,
+    level: str = "info",
+    **extra: Any,
+) -> None:
+    """Append one event to the session log.
+
+    ``category`` groups related events (e.g. ``upload``, ``analyze``,
+    ``tool_run``, ``error``, ``nav``). ``message`` is the human
+    sentence that lands in the file. ``level`` is stored as given.
+
+    ``extra`` keys are attached to the JSON object so callers can add
+    structured context (filename, byte counts, finding counts, timings),
+    with two safety rules:
+
+    - A value that cannot be encoded as strict JSON (arbitrary objects,
+      NaN / Infinity floats, circular structures) is replaced by its
+      ``str()`` form, so every line stays parseable.
+    - A key that collides with a core field (``ts``, ``level``,
+      ``category``, ``session``, ``message``) is stored as
+      ``extra_<key>`` instead of overwriting that field.
+
+    Failures are swallowed silently — a broken audit log must not
+    take the GUI down.
+    """
+    try:
+        event: dict[str, Any] = {
+            "ts": datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds"),
+            "level": level,
+            "category": category,
+            "session": _session_id()[:8],
+            "message": message,
+        }
+        core_fields = frozenset(event)
+        for key, value in extra.items():
+            # Core fields are authoritative; a clashing extra is kept
+            # under a prefixed name rather than corrupting the record.
+            field = f"extra_{key}" if key in core_fields else key
+            try:
+                # allow_nan=False makes NaN/Infinity fail the probe; the
+                # default encoder would emit them as bare tokens that
+                # are not valid JSON.
+                json.dumps(value, allow_nan=False)
+            except (TypeError, ValueError):
+                value = str(value)
+            event[field] = value
+        line = json.dumps(event, ensure_ascii=False, allow_nan=False)
+        with audit_log_path().open("a", encoding="utf-8") as f:
+            f.write(line + "\n")
+    except Exception:
+        # Last-ditch silent swallow. Diagnostics is best-effort.
+        pass
+
+
+def log_session_start() -> None:
+    """Write the session-start banner. Idempotent within one process.
+
+    The banner records the OS name and release, the Python version, the
+    OS username, the current working directory and the log file path.
+    Each of those is gathered on a best-effort basis: a probe that fails
+    is recorded as ``"?"`` instead of propagating to the caller.
+    """
+    global _SESSION_STARTED
+    with _LOCK:
+        if _SESSION_STARTED:
+            return
+        # Flip the flag before writing so a concurrent caller cannot
+        # emit a second banner.
+        _SESSION_STARTED = True
+
+    def _probe(getter) -> str:
+        # Run one metadata getter; never let its failure escape.
+        try:
+            return str(getter())
+        except Exception:
+            return "?"
+
+    log_event(
+        "session",
+        "Session started",
+        platform=_probe(lambda: f"{platform.system()} {platform.release()}"),
+        python=_probe(lambda: sys.version.split()[0]),
+        user=_probe(getpass.getuser),
+        cwd=_probe(Path.cwd),
+        log_file=_probe(audit_log_path),
+    )
+
+
+def log_exception(where: str, exc: BaseException, **extra: Any) -> None:
+    """Record a caught exception as an ``error`` event.
+
+    The message is ``"<where>: <ExceptionType>: <exception text>"`` and
+    the event carries ``exc_type`` / ``exc_message`` fields next to any
+    caller-supplied ``extra`` context.
+
+    ``extra`` is merged defensively so that a clashing keyword cannot
+    raise ``TypeError`` here and mask the failure being reported:
+
+    - ``level`` in ``extra`` overrides the default ``"error"`` level.
+    - ``exc_type`` / ``exc_message`` in ``extra`` replace the computed
+      values.
+    - ``category`` / ``message`` in ``extra`` are stored as
+      ``extra_category`` / ``extra_message``; the event's category is
+      always ``error``.
+    """
+    exc_type = type(exc).__name__
+    try:
+        exc_message = str(exc)
+    except Exception:
+        # A broken __str__ must not stop the failure from being logged.
+        exc_message = "<unprintable exception>"
+    fields: dict[str, Any] = {"exc_type": exc_type, "exc_message": exc_message}
+    fields.update(extra)
+    level = str(fields.pop("level", "error"))
+    # These names are positional parameters of log_event; passing them
+    # again as keywords would raise "got multiple values".
+    for clash in ("category", "message"):
+        if clash in fields:
+            fields[f"extra_{clash}"] = fields.pop(clash)
+    log_event(
+        "error",
+        f"{where}: {exc_type}: {exc_message}",
+        level=level,
+        **fields,
+    )
+
+
+def log_page_open(slug: str) -> None:
+    """Log a ``nav`` event when the user lands on a different page.
+
+    Every widget interaction makes Streamlit rerun the page script, so
+    the slug of the last page logged is kept in ``st.session_state``
+    under ``_audit_current_page``. A call whose slug matches the stored
+    one is a rerun and writes nothing. When Streamlit or its session
+    state cannot be used (for example in tests), the event is always
+    written.
+    """
+    same_page = False
+    try:
+        import streamlit as st
+
+        state = st.session_state
+        if state.get("_audit_current_page") == slug:
+            same_page = True
+        else:
+            state["_audit_current_page"] = slug
+    except Exception:
+        # No usable session state: fall through and log unconditionally.
+        pass
+    if same_page:
+        return
+    log_event("nav", f"Opened {slug}", page=slug)
+
+
+def reset_for_tests() -> None:
+    """Forget the cached log path, session id and banner flag.
+
+    Test-only: call it from a pytest fixture when tests must not share
+    module-level state. Only the three module globals are reset; files
+    already written are left on disk.
+    """
+    global _LOG_PATH, _SESSION_ID, _SESSION_STARTED
+    with _LOCK:
+        _SESSION_STARTED = False
+        _LOG_PATH = _SESSION_ID = None
+
+
+# Names exported by ``from audit import *``. ``reset_for_tests`` is
+# listed so test fixtures can import it alongside the logging helpers.
+__all__ = [
+    "audit_log_dir",
+    "audit_log_path",
+    "log_event",
+    "log_exception",
+    "log_page_open",
+    "log_session_start",
+    "reset_for_tests",
+]