feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session
JSONL file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON
object per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file,
we read it and explain what went wrong."

Format example::

    {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
     "session":"a1b2c3d4","message":"Session started",
     "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
     "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
    {"ts":"...","category":"upload","message":"Uploaded customers.csv",
     "filename":"customers.csv","bytes":24813}
    {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
     "filename":"customers.csv","findings":3,"rows":120,"cols":8}
    {"ts":"...","category":"tool_run","message":"Clean Text run",
     "page":"2_Text_Cleaner"}
    {"ts":"...","category":"error","level":"error",
     "message":"analyze(weird.csv): EmptyDataError: No columns to parse",
     "filename":"weird.csv","outcome":"empty_after_repair"}

Public API:

- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
  Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI

Wired in at:

- ``hide_streamlit_chrome``: stamps session start, mounts a small
  "🩺  Diagnostics" expander in the sidebar with the log path and
  an "Open log folder" button so the user can grab the file to
  attach to a support email.
- Home page: ``upload`` event on every new file, ``upload`` event
  on per-file remove, ``analyze`` event with file count when
  Run-analysis fires.
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols /
  findings count per file, plus ``error`` events on every caught
  exception (empty upload, empty after repair, pandas EmptyDataError,
  generic Exception).
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
  immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped
  via session state so we don't get one event per Streamlit rerun.

Safety:

- ``log_event`` wraps every write in try/except. A broken audit
  log must NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count,
  and (in the analyzer) a 12-char sha1 fingerprint of the bytes so
  the same file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a
  tmp dir.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions

227
src/audit.py Normal file
View File

@@ -0,0 +1,227 @@
"""Audit log — records GUI actions for support diagnostics.
A client running DataTools who hits a bug should be able to grab one
file off disk, mail it to support, and have us reconstruct what they
were doing when things broke. That file is the audit log written by
this module.
Design choices:
- **JSONL**, one event per line. Each line is a valid JSON object; the
whole file is grep-friendly, ``jq``-friendly, and still readable in
Notepad / TextEdit if no tooling is available. Each event carries a
human-readable ``message`` field so the file is useful even without
any tooling.
- **One file per session**, named ``datatools-<utc-timestamp>-<id>.jsonl``.
Multiple sessions on the same machine don't clobber each other, and
the filename sorts chronologically.
- **Default location**: ``~/.datatools/logs/`` on every platform.
Overridable via the ``DATATOOLS_AUDIT_DIR`` environment variable —
used by tests to redirect writes into a tmp dir.
- **Never crashes the app**. Every write is wrapped in a try/except;
a broken audit log must not take down the GUI.
- **No PII bytes**: file CONTENTS are never logged. We log the
filename, byte size, and a short content hash so the same file
re-uploaded gets the same fingerprint, but the actual bytes stay
local. Note that the session-start record does include the OS
username and the current working directory.
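Public API:
- ``log_event(category, message, **extra)`` — write one event.
- ``log_session_start()`` — emit a session-start record with platform
info. Idempotent within a single session.
- ``log_exception(where, exc, **extra)`` — record a caught exception as
an ``error`` event.
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated across
Streamlit reruns.
- ``audit_log_path()`` — return the path to the current session's file
so the GUI can show it to the user.
- ``audit_log_dir()`` — return the directory holding all session logs.
- ``reset_for_tests()`` — clear the module-level state (test-only).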
"""
from __future__ import annotations
import getpass
import json
import os
import platform
import sys
import threading
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Module-level cache for per-process state. Streamlit reruns the script
# many times per session but the module is imported once, so these
# survive across reruns within the same Python process.
#
# The lock is re-entrant on purpose: the helpers that guard this state
# may call one another (audit_log_path() needs the id produced by
# _session_id()), and a plain ``threading.Lock`` deadlocks the calling
# thread as soon as it is acquired a second time by its current holder.
_LOCK = threading.RLock()
_LOG_PATH: Path | None = None  # resolved lazily by audit_log_path()
_SESSION_ID: str | None = None  # full uuid4 hex, assigned by _session_id()
_SESSION_STARTED: bool = False  # set once log_session_start() has fired
def audit_log_dir() -> Path:
    """Return the directory that holds the audit log files.

    A non-empty ``DATATOOLS_AUDIT_DIR`` environment variable wins (tests
    use it to point writes at ``tmp_path``); otherwise the location is
    ``~/.datatools/logs/``. The directory is not created here.
    """
    env_dir = os.environ.get("DATATOOLS_AUDIT_DIR")
    return Path(env_dir) if env_dir else Path.home() / ".datatools" / "logs"
def _session_id() -> str:
    """Return the session id, generating a uuid4 hex string on first use.

    The value is cached in the module-level ``_SESSION_ID`` and guarded
    by ``_LOCK`` so that every caller sees the same id.
    """
    global _SESSION_ID
    with _LOCK:
        current = _SESSION_ID
        if current is None:
            current = _SESSION_ID = uuid.uuid4().hex
        return current
def audit_log_path() -> Path:
    """Return this session's log file path, resolving it on first use.

    The path is computed once and cached in ``_LOG_PATH`` so each Python
    process gets a single file regardless of how many Streamlit reruns
    happen. The file name is ``datatools-<utc-timestamp>-<id>.jsonl``,
    where ``<id>`` is the first 8 characters of the session id.

    The target directory is created if needed. If it cannot be resolved
    or created, a ``datatools-logs`` directory under the system temp dir
    is used instead. This function is called directly by the UI, so it
    is written not to raise for directory problems: if even the fallback
    cannot be created, the path is still returned and later writes fail
    inside ``log_event``'s own guard.
    """
    global _LOG_PATH
    # Resolve the session id BEFORE taking _LOCK. _session_id() acquires
    # the same lock, and calling it while holding a non-reentrant lock
    # would block this thread forever.
    sid = _session_id()[:8]
    with _LOCK:
        if _LOG_PATH is None:
            ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ")
            try:
                # audit_log_dir() is inside the guard as well: Path.home()
                # can fail when no home directory can be determined.
                d = audit_log_dir()
                d.mkdir(parents=True, exist_ok=True)
            except Exception:
                # If we can't create the dir, fall back to a tmpdir
                # location so we never crash the app for the audit
                # log's sake.
                import tempfile

                try:
                    d = Path(tempfile.gettempdir()) / "datatools-logs"
                    d.mkdir(parents=True, exist_ok=True)
                except Exception:
                    # Nothing usable on disk: keep a deterministic path
                    # anyway; diagnostics are best-effort.
                    d = Path("datatools-logs")
            _LOG_PATH = d / f"datatools-{ts}-{sid}.jsonl"
        return _LOG_PATH
def log_event(
    category: str,
    message: str,
    *,
    level: str = "info",
    **extra: Any,
) -> None:
    """Append one event to the session log as a single JSON line.

    Args:
        category: Groups related events (e.g. ``upload``, ``analyze``,
            ``tool_run``, ``error``, ``nav``).
        message: The human sentence that lands in the file.
        level: Severity label stored in the ``level`` field.
        **extra: Structured context (filename, byte counts, finding
            counts, timings) merged into the JSON object. A key that
            matches a built-in field such as ``ts`` or ``session``
            replaces that field. Values that cannot be encoded as strict
            JSON — unsupported types, circular structures, and
            NaN / Infinity floats — are stored as their ``str()`` form.

    Failures are swallowed silently — a broken audit log must not take
    the GUI down.
    """
    try:
        event = {
            "ts": datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds"),
            "level": level,
            "category": category,
            "session": _session_id()[:8],
            "message": message,
        }
        # Attach extras with serialization safety: non-JSON values get
        # str()'d so a bad caller can't poison the whole entry.
        # allow_nan=False makes the probe reject NaN / Infinity too;
        # json.dumps would otherwise emit them as bare ``NaN`` tokens,
        # which strict JSON parsers refuse.
        for key, value in extra.items():
            try:
                json.dumps(value, allow_nan=False)
            except (TypeError, ValueError):
                value = str(value)
            event[key] = value
        line = json.dumps(event, ensure_ascii=False, allow_nan=False)
        with audit_log_path().open("a", encoding="utf-8") as f:
            f.write(line + "\n")
    except Exception:
        # Last-ditch silent swallow. Diagnostics is best-effort.
        pass
def log_session_start() -> None:
    """Emit the one-off "Session started" record for this process.

    Only the first call writes anything; every later call returns
    immediately. The record carries the platform, Python version, OS
    user, working directory and the path of the log file itself.
    """
    global _SESSION_STARTED
    with _LOCK:
        first_call = not _SESSION_STARTED
        _SESSION_STARTED = True
    if not first_call:
        return

    def _or_unknown(probe):
        # Metadata is best-effort: any failure collapses to "?".
        try:
            return probe()
        except Exception:
            return "?"

    username = _or_unknown(getpass.getuser)
    workdir = _or_unknown(lambda: str(Path.cwd()))
    log_event(
        "session",
        "Session started",
        platform=f"{platform.system()} {platform.release()}",
        python=sys.version.split()[0],
        user=username,
        cwd=workdir,
        log_file=str(audit_log_path()),
    )
def log_exception(where: str, exc: BaseException, **extra: Any) -> None:
    """Record a caught exception as an ``error`` event.

    Args:
        where: Short label for the failing operation; it prefixes the
            message as ``"<where>: <ExcType>: <exc>"``.
        exc: The exception that was caught.
        **extra: Additional structured context forwarded to
            ``log_event``. ``level`` may be passed to override the
            default ``"error"`` severity. ``exc_type`` / ``exc_message``
            are always derived from ``exc``; caller-supplied values for
            those two keys are ignored.

    Keyword collisions are resolved here instead of being forwarded:
    duplicated keywords would raise ``TypeError`` at the call below,
    outside ``log_event``'s own guard, and break the very ``except``
    block that is trying to report a failure.
    """
    exc_name = type(exc).__name__
    level = extra.pop("level", "error")
    fields: dict[str, Any] = {"exc_type": exc_name, "exc_message": str(exc)}
    for key, value in extra.items():
        # setdefault keeps the derived exc_* fields authoritative.
        fields.setdefault(key, value)
    log_event("error", f"{where}: {exc_name}: {exc}", level=level, **fields)
def log_page_open(slug: str) -> None:
    """Log a ``nav`` event for ``slug`` unless it is already the current page.

    Streamlit re-executes the page script on every widget interaction,
    so the last logged slug is remembered in ``st.session_state`` under
    ``"_audit_current_page"`` and a repeat of the same slug is skipped.
    If Streamlit cannot be imported or its session state cannot be used
    (for example in tests), the event is always emitted.
    """
    already_current = False
    try:
        import streamlit

        state = streamlit.session_state
        already_current = state.get("_audit_current_page") == slug
        if not already_current:
            state["_audit_current_page"] = slug
    except Exception:
        # No usable session state: fall through and emit the event.
        pass
    if already_current:
        return
    log_event("nav", f"Opened {slug}", page=slug)
def reset_for_tests() -> None:
    """Forget the cached log path, session id and session-started flag.

    Test-only: call it from a pytest fixture when tests must not share
    module-level state.
    """
    global _LOG_PATH, _SESSION_ID, _SESSION_STARTED
    with _LOCK:
        _LOG_PATH, _SESSION_ID, _SESSION_STARTED = None, None, False
# Names exported by a star-import of this module. ``reset_for_tests`` is
# listed so test fixtures can reach it; ``_session_id`` stays private.
__all__ = [
"audit_log_dir",
"audit_log_path",
"log_event",
"log_exception",
"log_page_open",
"log_session_start",
"reset_for_tests",
]