feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session JSONL file under ``~/.datatools/logs/`` (overrideable via ``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON object per line, each with a ``message`` field) AND trivially machine-parseable — the support flow is "client mails the file, we read it and explain what went wrong." Format example:: {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session", "session":"a1b2c3d4","message":"Session started", "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh", "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"} {"ts":"...","category":"upload","message":"Uploaded customers.csv", "filename":"customers.csv","bytes":24813} {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)", "filename":"customers.csv","findings":3,"rows":120,"cols":8} {"ts":"...","category":"tool_run","message":"Clean Text run", "page":"2_Text_Cleaner"} {"ts":"...","category":"error","level":"error", "message":"analyze(weird.csv): EmptyDataError: No columns to parse", "filename":"weird.csv","outcome":"empty_after_repair"} Public API: - ``log_event(category, message, **extra)`` - ``log_session_start()`` — idempotent banner with platform info - ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per Streamlit session so reruns don't spam the log - ``log_exception(where, exc, **extra)`` — convenience wrapper - ``audit_log_path()`` / ``audit_log_dir()`` — for the UI Wired in at: - ``hide_streamlit_chrome``: stamps session start, mounts a small "🩺 Diagnostics" expander in the sidebar with the log path and an "Open log folder" button so the user can grab the file to attach to a support email. - Home page: ``upload`` event on every new file, ``upload`` event on per-file remove, ``analyze`` event with file count when Run-analysis fires. 
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols / findings count per file, plus ``error`` events on every caught exception (empty upload, empty after repair, pandas EmptyDataError, generic Exception). - Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event immediately after the primary action stashes its result. - Every tool page (1-9): ``log_page_open(slug)`` on render — deduped via session state so we don't get one event per Streamlit rerun. Safety: - ``log_event`` wraps every write in try/except. A broken audit log must NOT crash the GUI. - Non-JSON-serializable extras are ``str()``-coerced before writing. - File CONTENTS are never logged. We capture filename, byte count, and (in the analyzer) a 12-char sha1 fingerprint of the bytes so the same file re-uploaded gets the same trace. - License keys, session cookies, etc. are not logged. - ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a tmp dir. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions
--- a/src/audit.py
+++ b/src/audit.py
@@ -0,0 +1,227 @@
+"""Audit log — records GUI actions for support diagnostics.
+
+A client running DataTools who hits a bug should be able to grab one
+file off disk, mail it to support, and have us reconstruct what they
+were doing when things broke. That file is the audit log written by
+this module.
+
+Design choices:
+
+- **JSONL**, one event per line. Each line is a valid JSON object; the
+  whole file is grep-friendly, ``jq``-friendly, and still readable in
+  Notepad / TextEdit if no tooling is available. Each event carries a
+  human-readable ``message`` field so the file is useful even without
+  any tooling.
+- **One file per session**, named ``datatools-<utc-timestamp>-<id>.jsonl``.
+  Multiple sessions on the same machine don't clobber each other, and
+  the filename sorts chronologically.
+- **Default location**: ``~/.datatools/logs/`` on every platform.
+  Overrideable via the ``DATATOOLS_AUDIT_DIR`` environment variable —
+  used by tests to redirect writes into a tmp dir.
+- **Never crashes the app**. Every write is wrapped in a try/except;
+  a broken audit log must not take down the GUI.
+- **No file contents**: only the filename, byte size, and a short
+  content hash (same file re-uploaded → same fingerprint) are logged.
+  Note the session banner does record the OS user name and working
+  directory, so the log is not free of personal data.
+
+Public API:
+
+- ``log_event(category, message, **extra)`` — write one event.
+- ``log_exception(where, exc, **extra)`` — log a caught exception.
+- ``log_session_start()`` — once-per-process banner with platform info.
+- ``log_page_open(slug)`` — ``nav`` event, deduplicated across reruns.
+- ``audit_log_path()`` — path of the current session's file (for the GUI).
+- ``audit_log_dir()`` — return the directory holding all session logs.
+"""
+
+from __future__ import annotations
+
+import getpass
+import json
+import os
+import platform
+import sys
+import threading
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+# Module-level cache for per-session state. Streamlit reruns the script
+# many times per session but the module is imported once, so these
+# survive across reruns within the same Python process.
+_LOCK = threading.Lock()  # guards the three globals below; NOT reentrant
+_LOG_PATH: Path | None = None  # set on first audit_log_path() call
+_SESSION_ID: str | None = None  # full uuid4 hex; events log its first 8 chars
+_SESSION_STARTED: bool = False  # True once log_session_start() has run
+
+
+def audit_log_dir() -> Path:
+    """Locate the directory that holds the audit log files.
+
+    A non-empty ``DATATOOLS_AUDIT_DIR`` environment variable wins (the
+    test suite uses it to redirect writes into ``tmp_path``); otherwise
+    the default is ``~/.datatools/logs/``. Nothing is created here.
+    """
+    env_dir = os.environ.get("DATATOOLS_AUDIT_DIR", "")
+    if not env_dir:
+        return Path.home() / ".datatools" / "logs"
+    return Path(env_dir)
+
+
+def _session_id() -> str:
+    """Return the process-wide session id, minting a random one on first use."""
+    global _SESSION_ID
+    with _LOCK:
+        _SESSION_ID = uuid.uuid4().hex if _SESSION_ID is None else _SESSION_ID
+        return _SESSION_ID
+
+
+def audit_log_path() -> Path:
+    """Return this session's log file path, choosing it on first call.
+
+    The directory is created at that point (``audit_log_dir()``, else
+    ``<tmp>/datatools-logs``) and the path is cached, so a process keeps
+    one file across Streamlit reruns. The fallback ``mkdir`` may raise.
+    """
+    global _LOG_PATH
+    # Resolve the id BEFORE taking _LOCK: _session_id() acquires the same
+    # non-reentrant lock, so calling it with the lock held would deadlock.
+    sid = _session_id()[:8]
+    with _LOCK:
+        if _LOG_PATH is None:
+            ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+            d = audit_log_dir()
+            try:
+                d.mkdir(parents=True, exist_ok=True)
+            except Exception:
+                # Unwritable home dir: use a tmp location instead.
+                import tempfile
+                d = Path(tempfile.gettempdir()) / "datatools-logs"
+                d.mkdir(parents=True, exist_ok=True)
+            _LOG_PATH = d / f"datatools-{ts}-{sid}.jsonl"
+        return _LOG_PATH
+
+
+def log_event(
+    category: str,
+    message: str,
+    *,
+    level: str = "info",
+    **extra: Any,
+) -> None:
+    """Write a single JSON line describing one event to the session log.
+
+    ``category`` is the coarse bucket (``upload``, ``analyze``,
+    ``tool_run``, ``error``, ``nav``), ``message`` the sentence a human
+    reads, ``level`` the severity label. Each ``extra`` keyword becomes
+    a top-level key of the record; a value ``json`` cannot encode is
+    stored as ``str(value)`` instead.
+
+    Best-effort by design: any exception raised while building or
+    writing the record is swallowed so logging can't break the GUI.
+    """
+    def _jsonable(value: Any) -> Any:
+        # Probe-encode; fall back to text so one odd value can't poison the entry.
+        try:
+            json.dumps(value)
+        except (TypeError, ValueError):
+            return str(value)
+        return value
+
+    try:
+        record = {
+            "ts": datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds"),
+            "level": level,
+            "category": category,
+            "session": _session_id()[:8],
+            "message": message,
+        }
+        record.update((key, _jsonable(val)) for key, val in extra.items())
+        with audit_log_path().open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps(record, ensure_ascii=False) + "\n")
+    except Exception:  # deliberate: diagnostics must never crash the app
+        pass
+
+
+def log_session_start() -> None:
+    """Write the session-start banner; a no-op after the first call.
+
+    Records platform, Python version, OS user, cwd and log path. Never
+    raises: a probe that fails is recorded as ``"?"``."""
+    global _SESSION_STARTED
+    with _LOCK:
+        if _SESSION_STARTED:
+            return
+        _SESSION_STARTED = True
+    def _probe(fn: Any) -> str:
+        # Probes run outside log_event's try/except, so guard each one.
+        try:
+            return str(fn())
+        except Exception:
+            return "?"
+    log_event(
+        "session",
+        "Session started",
+        platform=_probe(lambda: f"{platform.system()} {platform.release()}"),
+        python=_probe(lambda: sys.version.split()[0]),
+        user=_probe(getpass.getuser),
+        cwd=_probe(Path.cwd),
+        log_file=_probe(audit_log_path),
+    )
+
+
+def log_exception(where: str, exc: BaseException, **extra: Any) -> None:
+    """Log a caught exception as an ``error`` event tagged with ``where``."""
+    kind = type(exc).__name__
+    detail = str(exc)
+    log_event(
+        "error",
+        f"{where}: {kind}: {exc}",
+        level="error", exc_type=kind, exc_message=detail,
+        **extra,
+    )
+
+
+def log_page_open(slug: str) -> None:
+    """Record a ``nav`` event when the user lands on page ``slug``.
+
+    Streamlit re-executes the page script on every widget interaction,
+    so the last logged slug is kept in ``st.session_state`` under
+    ``_audit_current_page`` and the event is skipped while it still
+    matches. Without a usable session state (e.g. plain pytest) the
+    event is always written.
+    """
+    try:
+        import streamlit as st
+        state = st.session_state
+        already_logged = state.get("_audit_current_page") == slug
+        if not already_logged:
+            state["_audit_current_page"] = slug
+    except Exception:
+        already_logged = False
+    if not already_logged:
+        log_event("nav", f"Opened {slug}", page=slug)
+
+
+def reset_for_tests() -> None:
+    """Forget the cached log path, session id and banner flag.
+
+    Test-only: call from a pytest fixture that needs a fresh session."""
+    global _LOG_PATH, _SESSION_ID, _SESSION_STARTED
+    with _LOCK:
+        _SESSION_STARTED = False
+        _LOG_PATH = _SESSION_ID = None
+
+
+__all__ = [
+    "audit_log_dir",
+    "audit_log_path",
+    "log_event",
+    "log_exception",
+    "log_page_open",
+    "log_session_start",
+    "reset_for_tests",  # test-only helper
+]
--- a/src/gui/_home.py
+++ b/src/gui/_home.py
@@ -86,6 +86,7 @@ def _home_page() -> None:
        help=t("upload.uploader_help"),
    )
    if new_files:
+        from src.audit import log_event
        changed = False
        for f in new_files:
            if f.name not in home_uploads:
@@ -94,6 +95,12 @@ def _home_page() -> None:
                    "size": f.size,
                }
                changed = True
+                log_event(
+                    "upload",
+                    f"Uploaded {f.name}",
+                    filename=f.name,
+                    bytes=f.size,
+                )
        if changed:
            st.session_state["home_uploads"] = home_uploads

@@ -139,6 +146,12 @@ def _home_page() -> None:
                to_remove = name

        if to_remove is not None:
+            from src.audit import log_event
+            log_event(
+                "upload",
+                f"Removed {to_remove}",
+                filename=to_remove,
+            )
            del home_uploads[to_remove]
            # Drop any findings/results tied to the removed file.
            findings_by_file_drop = st.session_state.get(
@@ -209,6 +222,12 @@ def _home_page() -> None:
        st.rerun()

    if run_clicked:
+        from src.audit import log_event
+        log_event(
+            "analyze",
+            f"Run analysis clicked on {len(pending)} file(s)",
+            files=list(pending),
+        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
    can render its own form without recursion.
    """
    st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
+    # Stamp a session-start record into the audit log the first time
+    # any page renders. Idempotent — subsequent calls are no-ops.
+    from src.audit import log_session_start
+    log_session_start()
    # Production-safe check runs first so a misconfigured shipped
    # build refuses to render anything (rather than rendering a
    # broken activation form that doesn't accept real blobs).
@@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
        require_license_or_render_activation,
    )
    render_license_status_sidebar()
+    _render_diagnostics_sidebar()
    if gate_license:
        require_license_or_render_activation()


+def _render_diagnostics_sidebar() -> None:
+    """Add a collapsed "Diagnostics" expander to the sidebar.
+
+    It shows this session's audit log path and a button that asks
+    ``_open_in_file_manager`` to reveal the log folder; when that call
+    returns a falsy result the user is told to paste the path by hand.
+    """
+    from src.audit import audit_log_dir, audit_log_path
+    session_log = audit_log_path()
+    with st.sidebar, st.expander("🩺  Diagnostics", expanded=False):
+        st.caption("Audit log for this session:")
+        st.code(str(session_log), language=None)
+        clicked = st.button(
+            "📂  Open log folder",
+            key="_diag_open_logs",
+            type="secondary",
+            use_container_width=True,
+        )
+        if not clicked:
+            return
+        if _open_in_file_manager(audit_log_dir(), select=session_log):
+            return
+        st.warning(
+            "Could not open the file manager from here. "
+            "Path is above — paste it into your file manager."
+        )
+
+
 # ---------------------------------------------------------------------------
 # Clean shutdown
 # ---------------------------------------------------------------------------
@@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded):
    one of several uploaded files) should yield a clean red banner for
    that file, not kill the whole multi-file analysis run.
    """
+    import hashlib
+    from src.audit import log_event, log_exception
    from src.core.analyze import Finding, analyze
    from src.core.errors import format_for_user
    from src.core.io import repair_bytes
@@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded):
    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+    digest = hashlib.sha1(
+        data, usedforsecurity=False,
+    ).hexdigest()[:12] if data else "empty"
+
+    log_event(
+        "analyze",
+        f"Analyzing {name}",
+        filename=name,
+        bytes=len(data),
+        sha1_12=digest,
+        suffix=suffix,
+    )

    def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
        return [Finding(
@@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded):
        )]

    if not data:
+        log_event(
+            "analyze",
+            f"Skipping {name} — 0 bytes",
+            level="warn",
+            filename=name,
+            outcome="empty_upload",
+        )
        return _error_finding(
            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
            f"may not have transferred correctly from your browser.",
@@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded):
    try:
        if suffix in ("xlsx", "xls"):
            df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
-            return analyze(df)
+            findings = analyze(df)
+            log_event(
+                "analyze",
+                f"Analyzed {name} ({len(findings)} findings)",
+                filename=name,
+                bytes=len(data),
+                sha1_12=digest,
+                findings=len(findings),
+                rows=len(df), cols=len(df.columns),
+            )
+            return findings

        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
        text_head = data[:4096].decode("utf-8", errors="replace")
@@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded):
                    break
        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
        if not repair.repaired_bytes:
+            log_event(
+                "analyze",
+                f"Skipping {name} — empty after repair",
+                level="warn",
+                filename=name,
+                outcome="empty_after_repair",
+            )
            return _error_finding(
                f"`{name}` is empty after pre-parse repair "
                f"(original was {len(data)} bytes — likely all NUL "
@@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded):
            encoding="utf-8", delimiter=delim,
            dtype=str, keep_default_na=False, on_bad_lines="warn",
        )
-        return analyze(df, repair_result=repair)
-    except pd.errors.EmptyDataError:
+        findings = analyze(df, repair_result=repair)
+        log_event(
+            "analyze",
+            f"Analyzed {name} ({len(findings)} findings)",
+            filename=name,
+            bytes=len(data),
+            sha1_12=digest,
+            findings=len(findings),
+            rows=len(df), cols=len(df.columns),
+            delimiter=repr(delim),
+        )
+        return findings
+    except pd.errors.EmptyDataError as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="empty_after_repair",
+        )
        return _error_finding(
            f"`{name}` could not be parsed — pandas reports no columns "
            f"in the file. Original size was {len(data)} bytes. Open "
@@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded):
            fid="empty_after_repair",
        )
    except Exception as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="analysis_failed",
+        )
        return _error_finding(
            f"`{name}` could not be analyzed: {format_for_user(e)}",
        )
--- a/src/gui/pages/1_Deduplicator.py
+++ b/src/gui/pages/1_Deduplicator.py
@@ -33,6 +33,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("1_Deduplicator")
 require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR)

 # ---------------------------------------------------------------------------
@@ -231,6 +233,8 @@ if uploaded is not None:

            progress_bar.empty()
            st.session_state["result"] = result
+            from src.audit import log_event
+            log_event("tool_run", "Find Duplicates run", page="1_Deduplicator")
            st.session_state["review_decisions"] = {}
            # One-shot flag for the scroll snippet at the bottom of the
            # page. Force a rerun so the Preview / Options expanders see
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -35,6 +35,8 @@ from src.core.text_clean import (

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("2_Text_Cleaner")
 require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER)


@@ -218,6 +220,8 @@ if st.button("Clean Text", type="primary", use_container_width=True):
            st.error(str(e))
            st.stop()
    st.session_state["textclean_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Clean Text run", page="2_Text_Cleaner")
    st.session_state["textclean_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
--- a/src/gui/pages/3_Format_Standardizer.py
+++ b/src/gui/pages/3_Format_Standardizer.py
@@ -33,6 +33,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("3_Format_Standardizer")
 require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER)


@@ -537,6 +539,8 @@ if st.button(
            st.error(str(e))
            st.stop()
    st.session_state["fmtstd_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Standardize Formats run", page="3_Format_Standardizer")
    st.session_state["fmtstd_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
--- a/src/gui/pages/4_Missing_Values.py
+++ b/src/gui/pages/4_Missing_Values.py
@@ -34,6 +34,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("4_Missing_Values")
 require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER)


@@ -291,6 +293,8 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True):
            st.error(format_for_user(e))
            st.stop()
    st.session_state["missing_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values")
    st.session_state["missing_input_name"] = uploaded.name
    st.session_state["missing_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
--- a/src/gui/pages/5_Column_Mapper.py
+++ b/src/gui/pages/5_Column_Mapper.py
@@ -35,6 +35,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("5_Column_Mapper")
 require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)


@@ -338,6 +340,8 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True):
            st.error(format_for_user(e))
            st.stop()
    st.session_state["colmap_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Map Columns run", page="5_Column_Mapper")
    st.session_state["colmap_input_name"] = uploaded.name
    st.session_state["colmap_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
--- a/src/gui/pages/6_Outlier_Detector.py
+++ b/src/gui/pages/6_Outlier_Detector.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("6_Outlier_Detector")
 require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/7_Multi_File_Merger.py
+++ b/src/gui/pages/7_Multi_File_Merger.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("7_Multi_File_Merger")
 require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/8_Validator_Reporter.py
+++ b/src/gui/pages/8_Validator_Reporter.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("8_Validator_Reporter")
 require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/9_Pipeline_Runner.py
+++ b/src/gui/pages/9_Pipeline_Runner.py
@@ -36,6 +36,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("9_Pipeline_Runner")
 require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER)


@@ -283,6 +285,8 @@ if st.button(

    progress.progress(1.0, text="Done")
    st.session_state["pipeline_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Automated Workflows run", page="9_Pipeline_Runner")
    st.session_state["pipeline_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet at end of file).