feat(audit): diagnostic instrumentation env vars + writer-thread guard

Phase 1 of the audit-log re-enablement plan. Adds three opt-in env
vars that let us ship one instrumented build for the user to run,
without flipping the kill switch on for everybody. **Default
behaviour is byte-identical to today**: with no env vars set the
kill switch wins, no writer thread starts, no file is written, no
stderr line is printed.

Env vars (do NOT set in prod):

- ``DATATOOLS_AUDIT_ENABLED=1`` — bypass ``_DISABLED`` for one
  session. ``_DISABLED = True`` stays in the source so an upgrade
  with no env var is still safe.
- ``DATATOOLS_AUDIT_TRACE=1`` — print ``[audit] ...`` lines to
  stderr at module import, every writer-thread state change, and
  every producer entry point. Lets the user share a small log
  instead of attaching a debugger.
- ``DATATOOLS_AUDIT_PROBE=<value>`` — bisect the producer path
  for Phase 2. Values: ``full`` (default), ``noop``, ``no-events``,
  ``no-page-open``, ``no-session-start``. Each ``no-*`` variant
  returns early from the corresponding ``log_*`` function (e.g.
  ``no-events`` short-circuits ``log_event``) so we can isolate
  which call is implicated in the blank-pages symptom.

Also:

- ``_writer_loop`` gets an outer ``try/except BaseException`` so
  silent thread death now surfaces a ``"writer thread died: ..."``
  line in the launcher terminal instead of looking like a hang.
- Existing first-write-failure stderr print gets ``flush=True`` so
  the user actually sees it before the process is killed.
- Test fixture switches from the previous-commit ``_DISABLED = False``
  override to ``_ENABLE_OVERRIDE = True`` so tests exercise the same
  bypass path the diagnostic build uses.
- Two new tests. The first pins the safety contract: with the kill
  switch on and no override, every producer is a true no-op (no
  writer thread, no file). The second checks that
  ``DATATOOLS_AUDIT_PROBE=no-events`` bypasses ``log_event`` even
  when the override is on, which guards the Phase 2 bisect.

Rollback: ``git revert HEAD`` removes Phase 1 cleanly. The deadlock
fix from the previous commit stays in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-19 14:46:27 +00:00
parent a8ff8f4bd0
commit 76c9f5a679
2 changed files with 155 additions and 41 deletions

View File

@@ -28,12 +28,13 @@ def isolated_audit(monkeypatch, tmp_path):
"""Redirect audit writes into ``tmp_path`` and reset module state
so each test starts fresh.
The kill switch is bypassed for the duration of the test by
patching the module-level constant directly — these tests need
the real producer path to run."""
The kill switch (``_DISABLED``) is bypassed for the duration of
the test by patching ``_ENABLE_OVERRIDE`` directly — env vars
won't reach the module-level constant after import."""
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
from src import audit
monkeypatch.setattr(audit, "_DISABLED", False)
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
monkeypatch.setattr(audit, "_PROBE", "full")
audit.reset_for_tests()
yield audit
# Best-effort cleanup so a runaway writer thread doesn't keep
@@ -120,7 +121,8 @@ class TestUnwritableTargetDoesntCrash:
not_a_dir.write_text("hi")
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(not_a_dir))
from src import audit
monkeypatch.setattr(audit, "_DISABLED", False)
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
monkeypatch.setattr(audit, "_PROBE", "full")
audit.reset_for_tests()
try:
start = time.perf_counter()
@@ -131,6 +133,47 @@ class TestUnwritableTargetDoesntCrash:
audit.reset_for_tests()
class TestKillSwitchContract:
    """Pin the safety contract of the audit kill switch and probe.

    With ``_DISABLED = True`` and no env-var override, every producer
    is a true no-op: the default configuration must never touch disk
    or start a thread. The probe test additionally guards the
    ``no-events`` bisect variant used for Phase 2.
    """

    def test_disabled_writes_nothing(self, monkeypatch, tmp_path):
        """Kill switch on, override off: no writer thread, no log file."""
        monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
        from src import audit

        monkeypatch.setattr(audit, "_DISABLED", True)
        monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", False)
        # Pin the probe to its default so the producers below are
        # stopped by the kill switch itself, not by an ambient
        # ``DATATOOLS_AUDIT_PROBE`` value picked up at import time.
        monkeypatch.setattr(audit, "_PROBE", "full")
        audit.reset_for_tests()
        try:
            # Exercise every producer entry point, then give any
            # (unexpected) writer a bounded chance to flush to disk.
            audit.log_event("test", "should be a no-op")
            audit.log_session_start()
            audit.log_page_open("test_page")
            audit.flush_audit_log(timeout_s=0.5)
            assert audit._WRITER_THREAD is None, (
                "Writer thread must not start when the kill switch is on."
            )
            assert list(tmp_path.glob("datatools-*.jsonl")) == [], (
                "Kill switch leaked a log file."
            )
        finally:
            # Reset module state even on failure so later tests start
            # from a clean slate.
            audit.reset_for_tests()

    def test_probe_no_events_drops_writes(self, monkeypatch, tmp_path):
        """``DATATOOLS_AUDIT_PROBE=no-events`` bypasses log_event even
        when the override is on. Bisect aid for Phase 2."""
        monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
        from src import audit

        monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
        monkeypatch.setattr(audit, "_PROBE", "no-events")
        audit.reset_for_tests()
        try:
            audit.log_event("test", "should be dropped")
            audit.flush_audit_log(timeout_s=0.5)
            # The only producer called was log_event, so nothing may
            # have reached disk.
            assert list(tmp_path.glob("datatools-*.jsonl")) == []
        finally:
            audit.reset_for_tests()
class TestSerializationSafety:
def test_non_json_extras_get_str_coerced(self, isolated_audit, tmp_path):
audit = isolated_audit