Files
datatools-dev/tests/test_audit.py
Michael 76c9f5a679 feat(audit): diagnostic instrumentation env vars + writer-thread guard
Phase 1 of the audit-log re-enablement plan. Adds three opt-in env
vars that let us ship one instrumented build for the user to run,
without flipping the kill switch on for everybody. **Default
behaviour is byte-identical to today**: with no env vars set the
kill switch wins, no writer thread starts, no file is written, no
stderr line is printed.

Env vars (do NOT set in prod):

- ``DATATOOLS_AUDIT_ENABLED=1`` — bypass ``_DISABLED`` for one
  session. ``_DISABLED = True`` stays in the source so an upgrade
  with no env var is still safe.
- ``DATATOOLS_AUDIT_TRACE=1`` — print ``[audit] ...`` lines to
  stderr at module import, every writer-thread state change, and
  every producer entry point. Lets the user share a small log
  instead of attaching a debugger.
- ``DATATOOLS_AUDIT_PROBE=<value>`` — bisect the producer path
  for Phase 2. Values: ``full`` (default), ``noop``, ``no-events``,
  ``no-page-open``, ``no-session-start``. The named variants
  return early from the corresponding ``log_*`` function so we can
  isolate which call is implicated in the blank-pages symptom.

Also:

- ``_writer_loop`` gets an outer ``try/except BaseException`` so
  silent thread death now surfaces a ``"writer thread died: ..."``
  line in the launcher terminal instead of looking like a hang.
- Existing first-write-failure stderr print gets ``flush=True`` so
  the user actually sees it before the process is killed.
- Test fixture switches from the previous-commit ``_DISABLED = False``
  override to ``_ENABLE_OVERRIDE = True`` so tests exercise the same
  bypass path the diagnostic build uses.
- Two new tests pin the safety contract: with the kill switch on
  and no override, every producer is a true no-op (no writer
  thread, no file). And ``DATATOOLS_AUDIT_PROBE=no-events`` bypasses
  ``log_event`` even when the override is on — guards the bisect.

Rollback: ``git revert HEAD`` removes Phase 1 cleanly. The deadlock
fix from the previous commit stays in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 14:46:27 +00:00

191 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Audit-log smoke tests.
The audit log was rewritten to use an async queue + a background
writer thread after a synchronous-write implementation hung the GUI
on a hostile filesystem (Windows antivirus). These tests pin the
critical guarantees:
1. ``log_event`` returns in microseconds (non-blocking).
2. Events queued before ``flush_audit_log`` land on disk in the
correct JSONL shape.
3. Backpressure: when the queue is full, oldest events are dropped
and newest ones survive (most-recent-events-are-most-diagnostic).
4. The writer thread tolerates an unwritable target without
crashing or hanging.
"""
from __future__ import annotations
import json
import time
from pathlib import Path
import pytest
@pytest.fixture
def isolated_audit(monkeypatch, tmp_path):
"""Redirect audit writes into ``tmp_path`` and reset module state
so each test starts fresh.
The kill switch (``_DISABLED``) is bypassed for the duration of
the test by patching ``_ENABLE_OVERRIDE`` directly — env vars
won't reach the module-level constant after import."""
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
from src import audit
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
monkeypatch.setattr(audit, "_PROBE", "full")
audit.reset_for_tests()
yield audit
# Best-effort cleanup so a runaway writer thread doesn't keep
# touching the tmp_path after the test exits.
audit.flush_audit_log(timeout_s=0.5)
audit.reset_for_tests()
def _read_all_events(audit_dir: Path) -> list[dict]:
"""Read every JSONL file in *audit_dir* and return parsed events."""
out: list[dict] = []
for f in sorted(audit_dir.glob("datatools-*.jsonl")):
for line in f.read_text(encoding="utf-8").splitlines():
if line.strip():
out.append(json.loads(line))
return out
class TestLogEventIsNonBlocking:
def test_returns_in_under_one_millisecond(self, isolated_audit):
"""A pathological 1000-event burst should clear well under a
second on the request path. The write happens off-thread."""
audit = isolated_audit
start = time.perf_counter()
for i in range(1000):
audit.log_event("test", f"event {i}", n=i)
elapsed = time.perf_counter() - start
# 1000 events × 1ms ceiling = 1.0s. In practice this should
# be milliseconds total since put_nowait is microseconds.
assert elapsed < 1.0, (
f"log_event burst took {elapsed:.3f}s for 1000 events — "
f"that's over 1ms per call. Async queue is regressing to "
f"a sync write."
)
class TestEventsLandOnDisk:
def test_basic_round_trip(self, isolated_audit, tmp_path):
audit = isolated_audit
audit.log_event("upload", "Uploaded a.csv", filename="a.csv", bytes=42)
audit.log_event("analyze", "Analyzed a.csv", findings=3)
audit.flush_audit_log(timeout_s=2.0)
events = _read_all_events(tmp_path)
assert len(events) == 2
kinds = [e["category"] for e in events]
assert "upload" in kinds
assert "analyze" in kinds
upload = next(e for e in events if e["category"] == "upload")
assert upload["filename"] == "a.csv"
assert upload["bytes"] == 42
assert "ts" in upload
assert "session" in upload
def test_session_start_renders(self, isolated_audit, tmp_path):
audit = isolated_audit
audit.log_session_start()
audit.flush_audit_log(timeout_s=2.0)
events = _read_all_events(tmp_path)
assert any(
e["category"] == "session" and "Session started" in e["message"]
for e in events
)
def test_log_session_start_is_idempotent(self, isolated_audit, tmp_path):
audit = isolated_audit
for _ in range(5):
audit.log_session_start()
audit.flush_audit_log(timeout_s=2.0)
events = _read_all_events(tmp_path)
starts = [e for e in events if e["category"] == "session"]
assert len(starts) == 1
class TestUnwritableTargetDoesntCrash:
def test_pointing_at_a_nonexistent_parent_doesnt_hang(
self, monkeypatch, tmp_path,
):
# Point the audit dir at a path that DOES exist as a FILE,
# so any mkdir there fails. The writer should swallow the
# error and the GUI-facing producer should keep working.
not_a_dir = tmp_path / "iam-a-file"
not_a_dir.write_text("hi")
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(not_a_dir))
from src import audit
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
monkeypatch.setattr(audit, "_PROBE", "full")
audit.reset_for_tests()
try:
start = time.perf_counter()
audit.log_event("test", "should not crash")
assert time.perf_counter() - start < 0.5
audit.flush_audit_log(timeout_s=0.5)
finally:
audit.reset_for_tests()
class TestKillSwitchContract:
"""With ``_DISABLED = True`` and no env-var override, every
producer is a true no-op. Pins the safety contract: the default
configuration must never touch disk or start a thread."""
def test_disabled_writes_nothing(self, monkeypatch, tmp_path):
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
from src import audit
monkeypatch.setattr(audit, "_DISABLED", True)
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", False)
audit.reset_for_tests()
try:
audit.log_event("test", "should be a no-op")
audit.log_session_start()
audit.log_page_open("test_page")
audit.flush_audit_log(timeout_s=0.5)
assert audit._WRITER_THREAD is None, (
"Writer thread must not start when the kill switch is on."
)
assert list(tmp_path.glob("datatools-*.jsonl")) == [], (
"Kill switch leaked a log file."
)
finally:
audit.reset_for_tests()
def test_probe_no_events_drops_writes(self, monkeypatch, tmp_path):
"""``DATATOOLS_AUDIT_PROBE=no-events`` bypasses log_event even
when the override is on. Bisect aid for Phase 2."""
monkeypatch.setenv("DATATOOLS_AUDIT_DIR", str(tmp_path))
from src import audit
monkeypatch.setattr(audit, "_ENABLE_OVERRIDE", True)
monkeypatch.setattr(audit, "_PROBE", "no-events")
audit.reset_for_tests()
try:
audit.log_event("test", "should be dropped")
audit.flush_audit_log(timeout_s=0.5)
assert list(tmp_path.glob("datatools-*.jsonl")) == []
finally:
audit.reset_for_tests()
class TestSerializationSafety:
def test_non_json_extras_get_str_coerced(self, isolated_audit, tmp_path):
audit = isolated_audit
class Weird:
def __repr__(self):
return "<Weird>"
audit.log_event("test", "carries a weird extra", obj=Weird())
audit.flush_audit_log(timeout_s=2.0)
events = _read_all_events(tmp_path)
assert len(events) == 1
assert events[0]["obj"] == "<Weird>"