feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session
JSONL file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON
object per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file,
we read it and explain what went wrong."

Format example::

    {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
     "session":"a1b2c3d4","message":"Session started",
     "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
     "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
    {"ts":"...","category":"upload","message":"Uploaded customers.csv",
     "filename":"customers.csv","bytes":24813}
    {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
     "filename":"customers.csv","findings":3,"rows":120,"cols":8}
    {"ts":"...","category":"tool_run","message":"Clean Text run",
     "page":"2_Text_Cleaner"}
    {"ts":"...","category":"error","level":"error",
     "message":"analyze(weird.csv): EmptyDataError: No columns to parse",
     "filename":"weird.csv","outcome":"empty_after_repair"}

Public API:

- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
  Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI

Wired in at:

- ``hide_streamlit_chrome``: stamps session start, mounts a small
  "🩺  Diagnostics" expander in the sidebar with the log path and
  an "Open log folder" button so the user can grab the file to
  attach to a support email.
- Home page: ``upload`` event for every newly added file, another
  ``upload`` event when a single file is removed, and an ``analyze``
  event with the file count when Run-analysis fires.
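- ``_run_analysis_on_upload``: per-file ``analyze`` events — one when
  analysis starts (filename, byte count, sha1 fingerprint, suffix) and
  one on success with rows / cols / findings count; ``warn``-level
  ``analyze`` events for skipped files (empty upload, empty after
  repair); plus ``error`` events on every caught exception (pandas
  EmptyDataError, generic Exception).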
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
  immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped
  via session state so we don't get one event per Streamlit rerun.

Safety:

- ``log_event`` wraps every write in try/except. A broken audit
  log must NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count,
  and (in the analyzer) a 12-char sha1 fingerprint of the bytes so
  the same file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a
  tmp dir.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions

View File

@@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
can render its own form without recursion.
"""
st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
# Stamp a session-start record into the audit log the first time
# any page renders. Idempotent — subsequent calls are no-ops.
from src.audit import log_session_start
log_session_start()
# Production-safe check runs first so a misconfigured shipped
# build refuses to render anything (rather than rendering a
# broken activation form that doesn't accept real blobs).
@@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
require_license_or_render_activation,
)
render_license_status_sidebar()
_render_diagnostics_sidebar()
if gate_license:
require_license_or_render_activation()
def _render_diagnostics_sidebar() -> None:
    """Mount the collapsed "Diagnostics" expander in the sidebar.

    The expander displays the path of the current session's audit log
    and offers a button that asks the file manager to open the log
    folder with that file selected. It starts collapsed so it costs no
    screen space until the user wants it; the intended support flow is
    that the client mails us the log file and we explain what went
    wrong.

    If the file manager cannot be opened, a warning tells the user to
    copy the displayed path manually instead.
    """
    from src.audit import audit_log_dir, audit_log_path

    session_log = audit_log_path()

    with st.sidebar, st.expander("🩺 Diagnostics", expanded=False):
        st.caption("Audit log for this session:")
        st.code(str(session_log), language=None)

        clicked = st.button(
            "📂 Open log folder",
            key="_diag_open_logs",
            type="secondary",
            use_container_width=True,
        )
        # Nothing more to do on reruns where the button was not pressed.
        if not clicked:
            return

        # The helper reports success via a truthy return value; only a
        # failure needs any further UI.
        if _open_in_file_manager(audit_log_dir(), select=session_log):
            return

        st.warning(
            "Could not open the file manager from here. "
            "Path is above — paste it into your file manager."
        )
# ---------------------------------------------------------------------------
# Clean shutdown
# ---------------------------------------------------------------------------
@@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded):
one of several uploaded files) should yield a clean red banner for
that file, not kill the whole multi-file analysis run.
"""
import hashlib
from src.audit import log_event, log_exception
from src.core.analyze import Finding, analyze
from src.core.errors import format_for_user
from src.core.io import repair_bytes
@@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded):
name = uploaded.name
data = uploaded.getvalue()
suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
digest = hashlib.sha1(
data, usedforsecurity=False,
).hexdigest()[:12] if data else "empty"
log_event(
"analyze",
f"Analyzing {name}",
filename=name,
bytes=len(data),
sha1_12=digest,
suffix=suffix,
)
def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
return [Finding(
@@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded):
)]
if not data:
log_event(
"analyze",
f"Skipping {name} — 0 bytes",
level="warn",
filename=name,
outcome="empty_upload",
)
return _error_finding(
f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
f"may not have transferred correctly from your browser.",
@@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded):
try:
if suffix in ("xlsx", "xls"):
df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
return analyze(df)
findings = analyze(df)
log_event(
"analyze",
f"Analyzed {name} ({len(findings)} findings)",
filename=name,
bytes=len(data),
sha1_12=digest,
findings=len(findings),
rows=len(df), cols=len(df.columns),
)
return findings
# CSV / TSV: run repair_bytes so the user sees csv_* findings.
text_head = data[:4096].decode("utf-8", errors="replace")
@@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded):
break
repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
if not repair.repaired_bytes:
log_event(
"analyze",
f"Skipping {name} — empty after repair",
level="warn",
filename=name,
outcome="empty_after_repair",
)
return _error_finding(
f"`{name}` is empty after pre-parse repair "
f"(original was {len(data)} bytes — likely all NUL "
@@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded):
encoding="utf-8", delimiter=delim,
dtype=str, keep_default_na=False, on_bad_lines="warn",
)
return analyze(df, repair_result=repair)
except pd.errors.EmptyDataError:
findings = analyze(df, repair_result=repair)
log_event(
"analyze",
f"Analyzed {name} ({len(findings)} findings)",
filename=name,
bytes=len(data),
sha1_12=digest,
findings=len(findings),
rows=len(df), cols=len(df.columns),
delimiter=repr(delim),
)
return findings
except pd.errors.EmptyDataError as e:
log_exception(
f"analyze({name})",
e,
filename=name,
outcome="empty_after_repair",
)
return _error_finding(
f"`{name}` could not be parsed — pandas reports no columns "
f"in the file. Original size was {len(data)} bytes. Open "
@@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded):
fid="empty_after_repair",
)
except Exception as e:
log_exception(
f"analyze({name})",
e,
filename=name,
outcome="analysis_failed",
)
return _error_finding(
f"`{name}` could not be analyzed: {format_for_user(e)}",
)