# datatools-dev/src/audit.py

"""Audit log — async, never-blocks-the-GUI file writer.

A client running DataTools who hits a bug should be able to grab one
file off disk, mail it to support, and have us reconstruct what they
were doing when things broke. That file is the audit log written by
this module.

Hard contract: ``log_event`` and friends are **non-blocking**. They
push events onto an in-memory queue in microseconds and return. A
single background daemon thread drains the queue and writes batches
to disk. If the disk is hostile (Windows antivirus locks, slow
network drive, permission failures), only the writer thread waits —
the GUI render thread keeps moving. This is the contract that lets
us re-enable the audit log after the earlier blank-pages incident:
a stuck ``open(...)`` call can't hang the page anymore because the
``open`` happens off the request path.

JSONL on-disk format (one event per line):

  {"ts": "...", "level": "info", "category": "upload",
   "session": "a1b2c3d4", "message": "Uploaded customers.csv",
   "filename": "customers.csv", "bytes": 24813}

Public API:

- ``log_event(category, message, **extra)`` — enqueue one event.
- ``log_session_start()`` — idempotent session banner.
- ``log_page_open(slug)`` — nav event, deduplicated per session.
- ``log_exception(where, exc, **extra)`` — convenience wrapper.
- ``audit_log_path()`` — pure path computation, no disk touch.
- ``audit_log_dir()`` — directory holding all session files.
- ``flush_audit_log(timeout_s=0.5)`` — drain queue (shutdown hook).
"""

from __future__ import annotations

import getpass
import json
import os
import platform
import sys
import threading
import time
import uuid
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# Module-level state. All access either lock-free (immutable after
# init) or guarded by ``_LOCK`` / ``_QUEUE_COND``.
#
# ``_LOCK`` serialises lazy creation of the session id, the once-only
# session-banner flag, writer-thread start-up, and the test reset.
_LOCK = threading.Lock()
# Full ``uuid4().hex`` for this process, created on first use by
# ``_session_id()``; events carry only its first 8 characters.
_SESSION_ID: str | None = None
# Flipped to True by the first ``log_session_start()`` call so later
# calls return without emitting a second banner.
_SESSION_STARTED: bool = False

# Kill switch — held off by default now that the blank-pages symptom
# is traced to the audit_log_path/_session_id deadlock fixed in
# a8ff8f4. End-to-end validation in session cf2ebbd5 (2026-05-19)
# wrote session/upload/analyze/nav/session-end events with no
# regression. Flip back to ``True`` if the symptom recurs; the env
# vars below stay as escape hatches.
#
# Env-var overrides (still wired):
#   DATATOOLS_AUDIT_ENABLED=1  — force-on even if _DISABLED is True
#   DATATOOLS_AUDIT_TRACE=1    — print "[audit] ..." lines to stderr
#   DATATOOLS_AUDIT_PROBE=...  — bisect the producer path. Values:
#     full (default) | noop | no-events | no-page-open | no-session-start
#
# All three env vars are read exactly once, at import time; changing
# the environment afterwards has no effect on this module. Only the
# literal value "1" enables the two boolean switches, and the probe
# value is lower-cased before comparison.
_DISABLED: bool = False
_ENABLE_OVERRIDE: bool = os.environ.get("DATATOOLS_AUDIT_ENABLED") == "1"
_TRACE: bool = os.environ.get("DATATOOLS_AUDIT_TRACE") == "1"
_PROBE: str = os.environ.get("DATATOOLS_AUDIT_PROBE", "full").lower()


def _trace(_msg: str, **_kw: Any) -> None:
    """Write one ``[audit] ...`` diagnostic line to stderr.

    Active only when ``DATATOOLS_AUDIT_TRACE=1`` was set at import;
    otherwise this returns immediately and performs no I/O. Keyword
    arguments are appended as ``key=repr(value)`` pairs. Any failure
    while formatting or printing is swallowed.
    """
    if _TRACE:
        try:
            pieces = [f"[audit] {_msg}"]
            pieces.extend(f"{name}={value!r}" for name, value in _kw.items())
            print(" ".join(pieces), file=sys.stderr, flush=True)
        except Exception:
            pass

# Bounded in-memory queue. ``deque(maxlen=N)`` drops the OLDEST entry
# when full so the most-recent events are always the ones that
# survive on disk — diagnostic-most-valuable at the moment of a
# crash. The number is generous; typical sessions produce well under
# a hundred events.
_MAX_QUEUED = 5000
# Pending event dicts, appended by ``log_event`` and drained in
# batches by the writer thread.
_QUEUE: deque = deque(maxlen=_MAX_QUEUED)
# Condition guarding ``_QUEUE``: producers ``notify()`` after an
# append, the writer ``wait()``s on it while the queue is empty.
_QUEUE_COND = threading.Condition()

# Writer-thread lifecycle.
# The daemon thread running ``_writer_loop``; ``None`` until the first
# event triggers ``_ensure_writer_started``.
_WRITER_THREAD: threading.Thread | None = None
# Set by ``flush_audit_log``; tells the writer to exit once the queue
# is empty.
_SHUTDOWN_REQUESTED: bool = False
# Latch so the "writes failing" stderr warning is printed only once.
_WRITE_FAILED_REPORTED: bool = False

# Import-time breadcrumb; a no-op unless DATATOOLS_AUDIT_TRACE=1.
_trace(
    "module-import",
    disabled=_DISABLED,
    enabled_override=_ENABLE_OVERRIDE,
    probe=_PROBE,
)


# ---------------------------------------------------------------------------
# Path helpers (pure — no I/O)
# ---------------------------------------------------------------------------

def audit_log_dir() -> Path:
    """Compute the directory that holds the audit log files.

    Resolution order:

    1. ``DATATOOLS_AUDIT_DIR`` when set to a non-empty value (tests
       use this to redirect output to a tmp dir).
    2. ``~/.datatools/logs``.
    3. ``<system temp dir>/datatools-logs`` when the home directory
       cannot be determined.

    Nothing is created here — callers that need the directory to
    exist must ``mkdir`` it themselves.
    """
    configured = os.environ.get("DATATOOLS_AUDIT_DIR")
    if configured:
        return Path(configured)

    try:
        home = Path.home()
    except Exception:
        # Some sandboxed environments cannot resolve a home directory;
        # fall back to the temp dir rather than failing.
        import tempfile
        return Path(tempfile.gettempdir()) / "datatools-logs"
    return home / ".datatools" / "logs"


def _session_id() -> str:
    """Return this process's session id, generating it on first use.

    The id is a ``uuid4`` hex string. The common case (already set)
    returns without taking ``_LOCK``; creation happens under the lock
    with a second check so only one id is ever generated.
    """
    global _SESSION_ID
    existing = _SESSION_ID
    if existing is not None:
        return existing
    with _LOCK:
        if _SESSION_ID is None:
            _SESSION_ID = uuid.uuid4().hex
        return _SESSION_ID


def audit_log_path() -> Path:
    """Compute the path of today's log file without touching the disk.

    The file is named ``datatools-YYYY-MM-DD.jsonl`` inside
    ``audit_log_dir()``. The date is the **local** calendar date
    (event ``ts`` fields are UTC): that is the date a user sees when
    looking for "today's log", whereas a UTC date would roll over
    early in the evening for west-coast users.

    The path is recomputed on every call, so a session that runs past
    midnight moves on to the next day's file. If the date cannot be
    formatted, the literal ``unknown`` takes its place.
    """
    try:
        day = f"{datetime.now():%Y-%m-%d}"
    except Exception:
        day = "unknown"
    filename = f"datatools-{day}.jsonl"
    return audit_log_dir() / filename


# Age in days (by file mtime) beyond which ``_sweep_old_logs`` deletes
# a log file.
_RETENTION_DAYS = 7


def _sweep_old_logs() -> None:
    """Remove expired ``datatools-*.jsonl`` files from the log directory.

    A file is expired when its mtime is more than ``_RETENTION_DAYS``
    days in the past. The glob covers both the daily filename and the
    legacy per-session pattern (``datatools-<ts>-<sid>.jsonl``).

    Strictly best-effort: a file that cannot be inspected or removed
    is skipped, and any other failure is only traced.
    """
    try:
        directory = audit_log_dir()
        if not directory.exists():
            return
        threshold = time.time() - (_RETENTION_DAYS * 86400)
        removed = 0
        for candidate in directory.glob("datatools-*.jsonl"):
            try:
                expired = candidate.stat().st_mtime < threshold
                if expired:
                    candidate.unlink()
                    removed += 1
            except Exception:
                # Locked or vanished file: leave it for a later sweep.
                pass
        _trace("sweep:done", deleted=removed, retention_days=_RETENTION_DAYS)
    except Exception as err:
        _trace("sweep:failed", exc=type(err).__name__, msg=str(err))


# ---------------------------------------------------------------------------
# Writer thread
# ---------------------------------------------------------------------------

def _writer_loop() -> None:
    """Drain ``_QUEUE`` to disk until shutdown is requested.

    Runs in a daemon thread. After a one-off retention sweep it waits
    for events, snapshots everything currently queued, and appends the
    batch to today's log file in a single ``open``/``write``/``close``
    cycle before going back to waiting. The loop exits once
    ``_SHUTDOWN_REQUESTED`` is set and the queue is empty.

    Failure handling:

    - Each event is serialised on its own, with ``default=str`` as a
      fallback for values ``json`` cannot encode. An event that still
      cannot be serialised is skipped (and traced) instead of taking
      the rest of the batch down with it.
    - The file is opened with ``errors="backslashreplace"`` so text
      UTF-8 cannot encode (e.g. lone surrogates from undecodable
      filenames) is written as a ``\\uXXXX`` escape rather than
      raising and losing the batch.
    - File-system errors are reported once per session to stderr and
      swallowed — the audit log is best-effort. The failed batch is
      dropped.
    - Anything that escapes the loop is printed to stderr and
      re-raised so a dead writer is visible in the launcher terminal.
    """
    global _WRITE_FAILED_REPORTED
    _trace("writer-loop:enter")
    _sweep_old_logs()
    try:
        while True:
            # Wait for something to do.
            with _QUEUE_COND:
                while not _QUEUE and not _SHUTDOWN_REQUESTED:
                    # The timeout only bounds each wait; the loop
                    # condition is re-checked after every wake-up.
                    _QUEUE_COND.wait(timeout=5.0)
                if _SHUTDOWN_REQUESTED and not _QUEUE:
                    _trace("writer-loop:exit-shutdown")
                    return
                # Snapshot + clear under the lock; release before doing
                # I/O so producers can keep enqueueing.
                batch = list(_QUEUE)
                _QUEUE.clear()
            _trace("writer-loop:wake", batch=len(batch))

            # Serialise before touching the disk, one event at a time,
            # so a single bad event cannot discard its neighbours.
            lines: list[str] = []
            for ev in batch:
                try:
                    lines.append(
                        json.dumps(ev, ensure_ascii=False, default=str) + "\n"
                    )
                except Exception as e:
                    _trace(
                        "writer-loop:skip-event",
                        exc=type(e).__name__,
                        msg=str(e),
                    )
            if not lines:
                continue

            try:
                path = audit_log_path()
                _trace("writer-loop:mkdir", dir=str(path.parent))
                path.parent.mkdir(parents=True, exist_ok=True)
                _trace("writer-loop:open", path=str(path))
                with path.open(
                    "a", encoding="utf-8", errors="backslashreplace"
                ) as f:
                    # One write for the whole batch instead of one per
                    # event.
                    f.write("".join(lines))
                _trace("writer-loop:wrote", count=len(lines))
            except Exception as e:
                _trace(
                    "writer-loop:write-failed",
                    exc=type(e).__name__,
                    msg=str(e),
                )
                if not _WRITE_FAILED_REPORTED:
                    _WRITE_FAILED_REPORTED = True
                    try:
                        print(
                            f"DataTools audit: writes failing — log degraded "
                            f"for the rest of this session. Error: "
                            f"{type(e).__name__}: {e}",
                            file=sys.stderr,
                            flush=True,
                        )
                    except Exception:
                        pass
    except BaseException as e:
        # Outer guard. If the writer thread dies for any reason
        # (unexpected exception, sys.exit from a foreign module),
        # surface it in the launcher terminal instead of going silent.
        try:
            print(
                f"DataTools audit: writer thread died: "
                f"{type(e).__name__}: {e}",
                file=sys.stderr,
                flush=True,
            )
        except Exception:
            pass
        raise


def _ensure_writer_started() -> None:
    """Start the writer thread if none is currently alive.

    Called on every event, so the usual case — a live writer — is
    answered without taking ``_LOCK``. Otherwise the liveness check is
    repeated under the lock and a new daemon thread running
    ``_writer_loop`` is started and recorded in ``_WRITER_THREAD``.
    """
    global _WRITER_THREAD
    current = _WRITER_THREAD
    if current is not None and current.is_alive():
        return
    with _LOCK:
        already_running = (
            _WRITER_THREAD is not None and _WRITER_THREAD.is_alive()
        )
        if already_running:
            return
        _trace("ensure-writer:starting")
        worker = threading.Thread(
            name="datatools-audit-writer",
            target=_writer_loop,
            daemon=True,
        )
        worker.start()
        _WRITER_THREAD = worker
        _trace("ensure-writer:started", ident=worker.ident)


# ---------------------------------------------------------------------------
# Public producer API
# ---------------------------------------------------------------------------

def _json_safe(value: Any) -> Any:
    """Return *value* if ``json.dumps`` accepts it, else a string stand-in."""
    try:
        json.dumps(value)
        return value
    except Exception:
        pass
    try:
        return str(value)
    except Exception:
        # Even ``str()`` failed; keep a marker so the event still
        # records that the field was present.
        return f"<unserializable {type(value).__name__}>"


def log_event(
    category: str,
    message: str,
    *,
    level: str = "info",
    **extra: Any,
) -> None:
    """Enqueue one event for the writer thread. Non-blocking.

    Args:
        category: Event category, stored under ``"category"``.
        message: Human-readable text, stored under ``"message"``.
        level: Severity label, stored under ``"level"``.
        **extra: Additional fields merged into the event dict.

    ``category``, ``message``, ``level`` and every extra value are
    checked with ``json.dumps``; anything that cannot be serialised is
    replaced by its ``str()`` form so the writer thread never receives
    an event it cannot encode. The event also carries a UTC
    millisecond timestamp (``"ts"``) and the first 8 characters of the
    session id (``"session"``).

    Does nothing when the audit log is disabled or when
    ``DATATOOLS_AUDIT_PROBE`` is ``noop`` / ``no-events``. Failures
    inside this function are swallowed; a broken audit log must never
    take the GUI down.
    """
    if _DISABLED and not _ENABLE_OVERRIDE:
        return
    if _PROBE in ("noop", "no-events"):
        _trace("log_event:probe-skip", category=category)
        return
    _trace("log_event:enter", category=category)
    try:
        try:
            ts = datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds")
        except Exception:
            ts = ""
        event: dict[str, Any] = {
            "ts": ts,
            "level": _json_safe(level),
            "category": _json_safe(category),
            "session": _session_id()[:8],
            "message": _json_safe(message),
        }
        for key, value in extra.items():
            event[key] = _json_safe(value)

        _ensure_writer_started()
        with _QUEUE_COND:
            _QUEUE.append(event)
            _QUEUE_COND.notify()
    except Exception:
        pass


def log_session_start() -> None:
    """Emit the one-per-process "Session started" banner.

    The banner records platform, Python version, user, working
    directory and the current log file path. Only the first call
    emits anything; later calls see ``_SESSION_STARTED`` already set
    and return. Skipped entirely when the audit log is disabled or
    ``DATATOOLS_AUDIT_PROBE`` is ``noop`` / ``no-session-start``.
    """
    global _SESSION_STARTED
    if _DISABLED and not _ENABLE_OVERRIDE:
        return
    if _PROBE in ("noop", "no-session-start"):
        _trace("log_session_start:probe-skip")
        return
    _trace("log_session_start:enter")

    # Claim the banner under the lock so concurrent callers cannot
    # both emit it.
    with _LOCK:
        if _SESSION_STARTED:
            return
        _SESSION_STARTED = True

    try:
        user = getpass.getuser()
    except Exception:
        user = "?"
    try:
        cwd = str(Path.cwd())
    except Exception:
        cwd = "?"

    banner = {
        "platform": f"{platform.system()} {platform.release()}",
        "python": sys.version.split()[0],
        "user": user,
        "cwd": cwd,
        "log_file": str(audit_log_path()),
    }
    log_event("session", "Session started", **banner)


def log_page_open(slug: str) -> None:
    """Record a navigation event for *slug*, skipping repeats.

    The last logged page is kept in Streamlit's ``session_state``
    under ``_audit_current_page``; if it already equals *slug* nothing
    is logged. When Streamlit is unavailable or the state lookup
    fails, the dedup step is skipped and the event is logged anyway.
    Skipped entirely when the audit log is disabled or
    ``DATATOOLS_AUDIT_PROBE`` is ``noop`` / ``no-page-open``. Never
    raises.
    """
    if _DISABLED and not _ENABLE_OVERRIDE:
        return
    if _PROBE in ("noop", "no-page-open"):
        _trace("log_page_open:probe-skip", slug=slug)
        return
    _trace("log_page_open:enter", slug=slug)
    try:
        is_repeat = False
        try:
            import streamlit as st
            state = st.session_state
            if state.get("_audit_current_page") == slug:
                is_repeat = True
            else:
                state["_audit_current_page"] = slug
        except Exception:
            # No usable Streamlit session: fall through and log.
            pass
        if not is_repeat:
            log_event("nav", f"Opened {slug}", page=slug)
    except Exception:
        pass


def log_exception(where: str, exc: BaseException, **extra: Any) -> None:
    """Log a caught exception as an ``error``-category event.

    Args:
        where: Short label for the code location that caught *exc*.
        exc: The exception being reported.
        **extra: Additional fields to attach to the event. ``level``
            overrides the default ``"error"`` severity. ``exc_type``
            and ``exc_message`` are always derived from *exc*, so
            caller-supplied values for those two keys are ignored.
            ``category`` / ``message`` are stored as
            ``extra_category`` / ``extra_message`` because those names
            are positional parameters of ``log_event``.

    This is typically called from inside an ``except`` block, so it
    must not raise itself: keyword collisions that would otherwise
    produce a ``TypeError`` are resolved here, an exception whose
    ``__str__`` fails is reported with a placeholder message, and any
    remaining failure is swallowed.
    """
    try:
        exc_type = type(exc).__name__
        try:
            exc_message = str(exc)
        except Exception:
            exc_message = "<unprintable exception>"

        fields: dict[str, Any] = {
            "exc_type": exc_type,
            "exc_message": exc_message,
        }
        level: Any = "error"
        for key, value in extra.items():
            if key == "level":
                level = value
            elif key in ("category", "message"):
                fields[f"extra_{key}"] = value
            elif key not in fields:
                fields[key] = value

        log_event(
            "error",
            f"{where}: {exc_type}: {exc_message}",
            level=level,
            **fields,
        )
    except Exception:
        pass


# ---------------------------------------------------------------------------
# Shutdown
# ---------------------------------------------------------------------------

def flush_audit_log(timeout_s: float = 0.5) -> None:
    """Signal the writer to exit and wait for it to finish writing.

    Called from ``shutdown_app`` so the closing session's events make
    it to disk before the process dies.

    Args:
        timeout_s: Upper bound, in seconds, on how long to wait.
            Negative values are treated as zero. A stuck disk can
            therefore never delay shutdown by more than this; events
            not yet written when the timer expires may be lost.

    The writer empties the queue *before* it opens the file, so an
    empty queue does not mean the last batch is on disk. This
    function therefore waits for the writer thread itself to exit,
    which it does only after its final write completes and the queue
    is empty. Does nothing when the audit log is disabled. Never
    raises.
    """
    global _SHUTDOWN_REQUESTED
    if _DISABLED and not _ENABLE_OVERRIDE:
        return
    _trace("flush:enter", timeout_s=timeout_s)
    deadline = time.monotonic() + max(0.0, timeout_s)
    with _QUEUE_COND:
        _SHUTDOWN_REQUESTED = True
        # Wake the writer if it is idling in ``wait``.
        _QUEUE_COND.notify_all()

    writer = _WRITER_THREAD
    if writer is None or not writer.is_alive():
        # Nothing is running that could still be writing.
        return
    remaining = deadline - time.monotonic()
    if remaining > 0:
        try:
            writer.join(timeout=remaining)
        except RuntimeError:
            # Joining the current thread: nothing to wait for.
            pass
    _trace("flush:exit", writer_alive=writer.is_alive())


# ---------------------------------------------------------------------------
# Test helpers
# ---------------------------------------------------------------------------

def reset_for_tests() -> None:
    """Reset module-level state. Call from a pytest fixture for
    isolation between tests.

    Any writer thread left over from the previous test is retired
    first: it is detached from ``_WRITER_THREAD``, the queue is
    emptied, shutdown is signalled, and the thread is joined for up
    to one second. Without this the old thread would never exit
    (clearing ``_SHUTDOWN_REQUESTED`` removes its only exit
    condition) and each reset would leak a live writer that keeps
    draining the shared queue. Only after that are the session id,
    banner flag, failure latch and shutdown flag cleared.
    """
    global _SESSION_ID, _SESSION_STARTED
    global _WRITER_THREAD, _SHUTDOWN_REQUESTED, _WRITE_FAILED_REPORTED

    with _LOCK:
        old_writer = _WRITER_THREAD
        _WRITER_THREAD = None

    with _QUEUE_COND:
        _QUEUE.clear()
        _SHUTDOWN_REQUESTED = True
        _QUEUE_COND.notify_all()

    if old_writer is not None:
        try:
            # Bounded so a writer wedged on disk I/O cannot hang the
            # test suite; such a thread is abandoned (it is a daemon).
            old_writer.join(timeout=1.0)
        except RuntimeError:
            pass

    with _LOCK:
        _SESSION_ID = None
        _SESSION_STARTED = False
        _WRITE_FAILED_REPORTED = False

    with _QUEUE_COND:
        _SHUTDOWN_REQUESTED = False
        _QUEUE.clear()


# Names exported by ``from audit import *``. ``reset_for_tests`` is
# exported for test fixtures; underscore-prefixed helpers stay private.
__all__ = [
    "audit_log_dir",
    "audit_log_path",
    "flush_audit_log",
    "log_event",
    "log_exception",
    "log_page_open",
    "log_session_start",
    "reset_for_tests",
]