From 76c9f5a679b2044504778975f6241de866148e8b Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Tue, 19 May 2026 14:46:27 +0000
Subject: [PATCH] feat(audit): diagnostic instrumentation env vars +
 writer-thread guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 of the audit-log re-enablement plan. Adds three opt-in env
vars that let us ship one instrumented build for the user to run,
without flipping the kill switch on for everybody. **Default
behaviour is byte-identical to today**: with no env vars set the
kill switch wins, no writer thread starts, no file is written, no
stderr line is printed.

Env vars (do NOT set in prod):

- ``DATATOOLS_AUDIT_ENABLED=1`` — bypass ``_DISABLED`` for one
  session. ``_DISABLED = True`` stays in the source so an upgrade
  with no env var is still safe.
- ``DATATOOLS_AUDIT_TRACE=1`` — print ``[audit] ...`` lines to
  stderr at module import, every writer-thread state change, and
  every producer entry point. Lets the user share a small log
  instead of attaching a debugger.
- ``DATATOOLS_AUDIT_PROBE=<value>`` — bisect the producer path
  for Phase 2. Values: ``full`` (default), ``noop``, ``no-events``,
  ``no-page-open``, ``no-session-start``. The named variants
  return early from the corresponding ``log_*`` function so we can
  isolate which call is implicated in the blank-pages symptom.

Also:

- ``_writer_loop`` gets an outer ``try/except BaseException`` so
  silent thread death now surfaces a ``"writer thread died: ..."``
  line in the launcher terminal instead of looking like a hang.
- Existing first-write-failure stderr print gets ``flush=True`` so
  the user actually sees it before the process is killed.
- Test fixture switches from the previous-commit ``_DISABLED = False``
  override to ``_ENABLE_OVERRIDE = True`` so tests exercise the same
  bypass path the diagnostic build uses.
- Two new tests pin the safety contract: with the kill switch on
  and no override, every producer is a true no-op (no writer
  thread, no file). And ``DATATOOLS_AUDIT_PROBE=no-events`` bypasses
  ``log_event`` even when the override is on — guards the bisect.

Rollback: ``git revert HEAD`` removes Phase 1 cleanly. The deadlock
fix from the previous commit stays in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/audit.py        | 143 +++++++++++++++++++++++++++++++++-----------
 tests/test_audit.py |  53 ++++++++++++++--
 2 files changed, 155 insertions(+), 41 deletions(-)
diff --git a/src/audit.py b/src/audit.py
index 10d48f0..6880973 100644
--- a/src/audit.py
+++ b/src/audit.py
@@ -58,11 +58,33 @@ _SESSION_STARTED: bool = False
 # RESTORED kill switch — the async-writer redesign still triggers the
 # blank-pages symptom on the user's machine despite no synchronous
 # file I/O on the request path. Cause is not yet identified; keep all
-# log_* calls as no-ops while we diagnose so the GUI keeps working.
-# Diagnostic plan: ask the user for the launcher terminal output AND
-# whether anything appears in ``~/.datatools/logs/`` at all — that
-# bisects "writer thread starts" from "writer thread can't write."
+# log_* calls as no-ops by default while we diagnose.
+#
+# Diagnostic overrides for an instrumented run (DO NOT set in prod):
+#   DATATOOLS_AUDIT_ENABLED=1  — bypass _DISABLED for one session
+#   DATATOOLS_AUDIT_TRACE=1    — print "[audit] ..." lines to stderr
+#   DATATOOLS_AUDIT_PROBE=...  — bisect the producer path. Values:
+#     full (default) | noop | no-events | no-page-open | no-session-start
 _DISABLED: bool = True
+_ENABLE_OVERRIDE: bool = os.environ.get("DATATOOLS_AUDIT_ENABLED") == "1"
+_TRACE: bool = os.environ.get("DATATOOLS_AUDIT_TRACE") == "1"
+_PROBE: str = os.environ.get("DATATOOLS_AUDIT_PROBE", "full").lower()
+
+
+def _trace(_msg: str, **_kw: Any) -> None:
+    """Emit a stderr trace line when DATATOOLS_AUDIT_TRACE=1.
+
+    No-op otherwise — the default configuration adds zero extra I/O."""
+    if not _TRACE:
+        return
+    try:
+        extras = " ".join(f"{k}={v!r}" for k, v in _kw.items())
+        line = f"[audit] {_msg}"
+        if extras:
+            line = f"{line} {extras}"
+        print(line, file=sys.stderr, flush=True)
+    except Exception:
+        pass
 
 # Bounded in-memory queue. ``deque(maxlen=N)`` drops the OLDEST entry
 # when full so the most-recent events are always the ones that
@@ -78,6 +100,13 @@ _WRITER_THREAD: threading.Thread | None = None
 _SHUTDOWN_REQUESTED: bool = False
 _WRITE_FAILED_REPORTED: bool = False
 
+_trace(
+    "module-import",
+    disabled=_DISABLED,
+    enabled_override=_ENABLE_OVERRIDE,
+    probe=_PROBE,
+)
+
 
 # ---------------------------------------------------------------------------
 # Path helpers (pure — no I/O)
@@ -151,36 +180,63 @@ def _writer_loop() -> None:
     swallowed — the audit log is best-effort.
     """
     global _WRITE_FAILED_REPORTED
-    while True:
-        # Wait for something to do.
-        with _QUEUE_COND:
-            while not _QUEUE and not _SHUTDOWN_REQUESTED:
-                _QUEUE_COND.wait(timeout=5.0)
-            if _SHUTDOWN_REQUESTED and not _QUEUE:
-                return
-            # Snapshot + clear under the lock; release before doing
-            # I/O so producers can keep enqueueing.
-            batch = list(_QUEUE)
-            _QUEUE.clear()
+    _trace("writer-loop:enter")
+    try:
+        while True:
+            # Wait for something to do.
+            with _QUEUE_COND:
+                while not _QUEUE and not _SHUTDOWN_REQUESTED:
+                    _QUEUE_COND.wait(timeout=5.0)
+                if _SHUTDOWN_REQUESTED and not _QUEUE:
+                    _trace("writer-loop:exit-shutdown")
+                    return
+                # Snapshot + clear under the lock; release before doing
+                # I/O so producers can keep enqueueing.
+                batch = list(_QUEUE)
+                _QUEUE.clear()
+            _trace("writer-loop:wake", batch=len(batch))
 
+            try:
+                path = audit_log_path()
+                _trace("writer-loop:mkdir", dir=str(path.parent))
+                path.parent.mkdir(parents=True, exist_ok=True)
+                _trace("writer-loop:open", path=str(path))
+                with path.open("a", encoding="utf-8") as f:
+                    for ev in batch:
+                        f.write(json.dumps(ev, ensure_ascii=False) + "\n")
+                _trace("writer-loop:wrote", count=len(batch))
+            except Exception as e:
+                _trace(
+                    "writer-loop:write-failed",
+                    exc=type(e).__name__,
+                    msg=str(e),
+                )
+                if not _WRITE_FAILED_REPORTED:
+                    _WRITE_FAILED_REPORTED = True
+                    try:
+                        print(
+                            f"DataTools audit: writes failing — log degraded "
+                            f"for the rest of this session. Error: "
+                            f"{type(e).__name__}: {e}",
+                            file=sys.stderr,
+                            flush=True,
+                        )
+                    except Exception:
+                        pass
+    except BaseException as e:
+        # Outer guard. If the writer thread dies for any reason
+        # (unexpected exception, sys.exit from a foreign module),
+        # surface it in the launcher terminal instead of going silent.
         try:
-            path = audit_log_path()
-            path.parent.mkdir(parents=True, exist_ok=True)
-            with path.open("a", encoding="utf-8") as f:
-                for ev in batch:
-                    f.write(json.dumps(ev, ensure_ascii=False) + "\n")
-        except Exception as e:
-            if not _WRITE_FAILED_REPORTED:
-                _WRITE_FAILED_REPORTED = True
-                try:
-                    print(
-                        f"DataTools audit: writes failing — log degraded "
-                        f"for the rest of this session. Error: "
-                        f"{type(e).__name__}: {e}",
-                        file=sys.stderr,
-                    )
-                except Exception:
-                    pass
+            print(
+                f"DataTools audit: writer thread died: "
+                f"{type(e).__name__}: {e}",
+                file=sys.stderr,
+                flush=True,
+            )
+        except Exception:
+            pass
+        raise
 
 
 def _ensure_writer_started() -> None:
@@ -194,6 +250,7 @@ def _ensure_writer_started() -> None:
         return
     with _LOCK:
         if _WRITER_THREAD is None or not _WRITER_THREAD.is_alive():
+            _trace("ensure-writer:starting")
             t = threading.Thread(
                 target=_writer_loop,
                 daemon=True,
@@ -201,6 +258,7 @@ def _ensure_writer_started() -> None:
             )
             t.start()
             _WRITER_THREAD = t
+            _trace("ensure-writer:started", ident=t.ident)
 
 
 # ---------------------------------------------------------------------------
@@ -219,8 +277,12 @@ def log_event(
     Failures inside this function are swallowed; a broken audit log
     must never take the GUI down.
     """
-    if _DISABLED:
+    if _DISABLED and not _ENABLE_OVERRIDE:
         return
+    if _PROBE in ("noop", "no-events"):
+        _trace("log_event:probe-skip", category=category)
+        return
+    _trace("log_event:enter", category=category)
     try:
         try:
             ts = datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds")
@@ -250,8 +312,12 @@ def log_event(
 
 def log_session_start() -> None:
     """Idempotent session-start banner with platform info."""
-    if _DISABLED:
+    if _DISABLED and not _ENABLE_OVERRIDE:
         return
+    if _PROBE in ("noop", "no-session-start"):
+        _trace("log_session_start:probe-skip")
+        return
+    _trace("log_session_start:enter")
     global _SESSION_STARTED
     with _LOCK:
         if _SESSION_STARTED:
@@ -278,8 +344,12 @@ def log_session_start() -> None:
 
 def log_page_open(slug: str) -> None:
     """Emit a deduplicated 'page open' nav event."""
-    if _DISABLED:
+    if _DISABLED and not _ENABLE_OVERRIDE:
         return
+    if _PROBE in ("noop", "no-page-open"):
+        _trace("log_page_open:probe-skip", slug=slug)
+        return
+    _trace("log_page_open:enter", slug=slug)
     try:
         try:
             import streamlit as st
@@ -318,8 +388,9 @@ def flush_audit_log(timeout_s: float = 0.5) -> None:
     stuck disk can never delay shutdown — events still in the queue
     when the timer expires are dropped.
     """
-    if _DISABLED:
+    if _DISABLED and not _ENABLE_OVERRIDE:
         return
+    _trace("flush:enter", timeout_s=timeout_s)
     global _SHUTDOWN_REQUESTED
     deadline = time.monotonic() + max(0.0, timeout_s)
     with _QUEUE_COND:
diff --git a/tests/test_audit.py b/tests/test_audit.py
index 2ee2f30..3550bde 100644
--- a/tests/test_audit.py
+++ b/tests/test_audit.py
@@ -28,12 +28,13 @@ def isolated_audit(monkeypatch, tmp_path):
     """Redirect audit writes into ``tmp_path`` and reset module state
     so each test starts fresh.
 
-    The kill switch is bypassed for the duration of the test by
-    patching the module-level constant directly — these tests need
-    the real producer path to run."""
+    The kill switch (``_DISABLED``) is bypassed for the duration of
+    the test by patching ``_ENABLE_OVERRIDE`` directly — env vars
+    won't reach the module-level constant after import."""
     monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
     from src import audit
-    monkeypatch.setattr(audit, "_DISABLED", False)
+    monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
+    monkeypatch.setattr(audit, "_PROBE", "full")
     audit.reset_for_tests()
     yield audit
     # Best-effort cleanup so a runaway writer thread doesn't keep
@@ -120,7 +121,8 @@ class TestUnwritableTargetDoesntCrash:
         not_a_dir.write_text("hi")
         monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(not_a_dir))
         from src import audit
-        monkeypatch.setattr(audit, "_DISABLED", False)
+        monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
+        monkeypatch.setattr(audit, "_PROBE", "full")
         audit.reset_for_tests()
         try:
             start = time.perf_counter()
@@ -131,6 +133,47 @@ class TestUnwritableTargetDoesntCrash:
             audit.reset_for_tests()
 
 
+class TestKillSwitchContract:
+    """With ``_DISABLED = True`` and no env-var override, every
+    producer is a true no-op. Pins the safety contract: the default
+    configuration must never touch disk or start a thread."""
+
+    def test_disabled_writes_nothing(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
+        from src import audit
+        monkeypatch.setattr(audit, "_DISABLED", True)
+        monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", False)
+        audit.reset_for_tests()
+        try:
+            audit.log_event("test", "should be a no-op")
+            audit.log_session_start()
+            audit.log_page_open("test_page")
+            audit.flush_audit_log(timeout_s=0.5)
+            assert audit._WRITER_THREAD is None, (
+                "Writer thread must not start when the kill switch is on."
+            )
+            assert list(tmp_path.glob("datatools-*.jsonl")) == [], (
+                "Kill switch leaked a log file."
+            )
+        finally:
+            audit.reset_for_tests()
+
+    def test_probe_no_events_drops_writes(self, monkeypatch, tmp_path):
+        """``DATATOOLS_AUDIT_PROBE=no-events`` bypasses log_event even
+        when the override is on. Bisect aid for Phase 2."""
+        monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
+        from src import audit
+        monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
+        monkeypatch.setattr(audit, "_PROBE", "no-events")
+        audit.reset_for_tests()
+        try:
+            audit.log_event("test", "should be dropped")
+            audit.flush_audit_log(timeout_s=0.5)
+            assert list(tmp_path.glob("datatools-*.jsonl")) == []
+        finally:
+            audit.reset_for_tests()
+
+
 class TestSerializationSafety:
     def test_non_json_extras_get_str_coerced(self, isolated_audit, tmp_path):
         audit = isolated_audit