From b3ae913bb9fb26ebe395826419560f1e5e8264e2 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 19 May 2026 21:22:47 +0000 Subject: [PATCH] feat(audit): daily filename + 7-day retention sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the per-session ``datatools--.jsonl`` filename with a single daily file ``datatools-YYYY-MM-DD.jsonl`` (local date). Sessions on the same calendar day share a file via the writer thread's per-batch open+append; multiple DataTools instances running concurrently on the same day fan into the same file (append-mode small writes are atomic on POSIX, safe-enough on Windows under realistic load). Drops the ``_LOG_PATH`` module global and the lock around it — ``audit_log_path()`` is now pure date math, recomputed on every call so a session that crosses midnight follows the rollover into the next day's file. Adds ``_sweep_old_logs()`` invoked once per process at writer- thread start. Deletes any ``datatools-*.jsonl`` whose mtime is older than 7 days. The glob deliberately matches the legacy per-session filename too, so users upgrading from the previous build don't keep a permanent backlog of pre-retention files. Event ``ts`` fields stay UTC; only the filename uses local date, because users go looking for "today's log" on their wall clock. Tests cover: daily filename shape, sweep removes stale files, sweep keeps fresh files, sweep also clears legacy filenames. Rollback: ``git revert HEAD`` restores the per-session filename and removes the sweep. No data migration needed either way — existing files keep working as JSONL. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/audit.py | 71 ++++++++++++++++++++++++++++++--------------- tests/test_audit.py | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 23 deletions(-) diff --git a/src/audit.py b/src/audit.py index 882d969..cd3de3f 100644 --- a/src/audit.py +++ b/src/audit.py @@ -51,7 +51,6 @@ from typing import Any # Module-level state. All access either lock-free (immutable after # init) or guarded by ``_LOCK`` / ``_QUEUE_COND``. _LOCK = threading.Lock() -_LOG_PATH: Path | None = None _SESSION_ID: str | None = None _SESSION_STARTED: bool = False @@ -144,28 +143,54 @@ def _session_id() -> str: def audit_log_path() -> Path: - """Return this session's log file path. **No I/O performed.** + """Return today's log file path. **No I/O performed.** - Caller does not need to worry about hangs — this is pure path - arithmetic. The writer thread does the actual ``mkdir`` / - ``open`` work off the request path. + Filename is ``datatools-YYYY-MM-DD.jsonl`` keyed on **local** date + (event ``ts`` fields stay UTC). Local date is what a user sees on + their calendar when they go looking for "today's log"; UTC would + flip a day early in the evening on west-coast timezones, which is + surprising to humans. Concurrent DataTools instances writing on + the same day share the file via the writer thread's per-batch + ``open`` + append; append-mode for small writes is atomic on + POSIX and safe-enough on Windows. + + Recomputed on every call so a session that crosses midnight + follows the rollover. No caching, no locking — pure path math. """ - global _LOG_PATH - if _LOG_PATH is None: - # Resolve the session id BEFORE acquiring _LOCK. ``_session_id`` - # also takes ``_LOCK`` to write its first value, and the lock - # is non-reentrant — nesting them deadlocks the first caller - # in a clean module state. (Hit in ``log_session_start`` since - # it's the first GUI call and both globals are unset.) - sid = _session_id()[:8] - with _LOCK: - if _LOG_PATH is None: - try: - ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ") - except Exception: - ts = "unknown" - _LOG_PATH = audit_log_dir() / f"datatools-{ts}-{sid}.jsonl" - return _LOG_PATH + try: + date_str = datetime.now().strftime("%Y-%m-%d") + except Exception: + date_str = "unknown" + return audit_log_dir() / f"datatools-{date_str}.jsonl" + + +_RETENTION_DAYS = 7 + + +def _sweep_old_logs() -> None: + """Delete ``datatools-*.jsonl`` files older than ``_RETENTION_DAYS``. + + Best-effort housekeeping invoked once per process at writer-thread + start. Matches the new daily filename AND the legacy per-session + pattern (``datatools--.jsonl``), so this also clears + leftovers from before the retention switch. + """ + try: + log_dir = audit_log_dir() + if not log_dir.exists(): + return + cutoff = time.time() - (_RETENTION_DAYS * 86400) + deleted = 0 + for p in log_dir.glob("datatools-*.jsonl"): + try: + if p.stat().st_mtime < cutoff: + p.unlink() + deleted += 1 + except Exception: + continue + _trace("sweep:done", deleted=deleted, retention_days=_RETENTION_DAYS) + except Exception as e: + _trace("sweep:failed", exc=type(e).__name__, msg=str(e)) # --------------------------------------------------------------------------- @@ -183,6 +208,7 @@ def _writer_loop() -> None: """ global _WRITE_FAILED_REPORTED _trace("writer-loop:enter") + _sweep_old_logs() try: while True: # Wait for something to do. @@ -412,10 +438,9 @@ def flush_audit_log(timeout_s: float = 0.5) -> None: def reset_for_tests() -> None: """Reset module-level state. Call from a pytest fixture for isolation between tests.""" - global _LOG_PATH, _SESSION_ID, _SESSION_STARTED + global _SESSION_ID, _SESSION_STARTED global _WRITER_THREAD, _SHUTDOWN_REQUESTED, _WRITE_FAILED_REPORTED with _LOCK: - _LOG_PATH = None _SESSION_ID = None _SESSION_STARTED = False _SHUTDOWN_REQUESTED = False diff --git a/tests/test_audit.py b/tests/test_audit.py index 3550bde..2f1a5da 100644 --- a/tests/test_audit.py +++ b/tests/test_audit.py @@ -174,6 +174,63 @@ class TestKillSwitchContract: audit.reset_for_tests() +class TestDailyFileRetention: + """Daily filename + 7-day retention sweep on writer-thread start.""" + + def test_filename_is_daily_local_date(self, isolated_audit, tmp_path): + from datetime import datetime + audit = isolated_audit + audit.log_event("test", "today") + audit.flush_audit_log(timeout_s=2.0) + today = datetime.now().strftime("%Y-%m-%d") + expected = tmp_path / f"datatools-{today}.jsonl" + assert expected.exists(), ( + f"expected daily file {expected.name}, got " + f"{[p.name for p in tmp_path.iterdir()]}" + ) + + def test_sweep_deletes_files_older_than_retention( + self, isolated_audit, tmp_path, + ): + """Files with mtime > _RETENTION_DAYS old get pruned when the + writer thread starts. Files within the window survive.""" + import os + audit = isolated_audit + stale = tmp_path / "datatools-2025-01-01.jsonl" + stale.write_text('{"ts": "stale"}\n', encoding="utf-8") + old_mtime = time.time() - (audit._RETENTION_DAYS + 1) * 86400 + os.utime(stale, (old_mtime, old_mtime)) + + fresh = tmp_path / "datatools-2026-05-18.jsonl" + fresh.write_text('{"ts": "fresh"}\n', encoding="utf-8") + recent_mtime = time.time() - 1 * 86400 + os.utime(fresh, (recent_mtime, recent_mtime)) + + audit.log_event("test", "kick the writer") + audit.flush_audit_log(timeout_s=2.0) + + assert not stale.exists(), "Stale log should have been swept." + assert fresh.exists(), "Fresh log must not be swept." + + def test_sweep_also_clears_legacy_per_session_files( + self, isolated_audit, tmp_path, + ): + """Old ``datatools--.jsonl`` filenames must also match + the sweep glob, so upgrading users don't keep a permanent + backlog from before the retention switch.""" + import os + audit = isolated_audit + legacy = tmp_path / "datatools-20250101T120000Z-abc12345.jsonl" + legacy.write_text('{"ts": "legacy"}\n', encoding="utf-8") + old_mtime = time.time() - (audit._RETENTION_DAYS + 1) * 86400 + os.utime(legacy, (old_mtime, old_mtime)) + + audit.log_event("test", "kick the writer") + audit.flush_audit_log(timeout_s=2.0) + + assert not legacy.exists(), "Legacy per-session file not swept." + + class TestSerializationSafety: def test_non_json_extras_get_str_coerced(self, isolated_audit, tmp_path): audit = isolated_audit