From c73d716d064caf153185a077ab7dc3b0b5fc3daf Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 17 May 2026 01:36:35 +0000 Subject: [PATCH] feat(audit): JSONL audit log for support diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New ``src/audit.py`` module records GUI actions to a per-session JSONL file under ``~/.datatools/logs/`` (overrideable via ``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON object per line, each with a ``message`` field) AND trivially machine-parseable — the support flow is "client mails the file, we read it and explain what went wrong." Format example:: {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session", "session":"a1b2c3d4","message":"Session started", "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh", "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"} {"ts":"...","category":"upload","message":"Uploaded customers.csv", "filename":"customers.csv","bytes":24813} {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)", "filename":"customers.csv","findings":3,"rows":120,"cols":8} {"ts":"...","category":"tool_run","message":"Clean Text run", "page":"2_Text_Cleaner"} {"ts":"...","category":"error","level":"error", "message":"analyze(weird.csv): EmptyDataError: No columns to parse", "filename":"weird.csv","outcome":"empty_after_repair"} Public API: - ``log_event(category, message, **extra)`` - ``log_session_start()`` — idempotent banner with platform info - ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per Streamlit session so reruns don't spam the log - ``log_exception(where, exc, **extra)`` — convenience wrapper - ``audit_log_path()`` / ``audit_log_dir()`` — for the UI Wired in at: - ``hide_streamlit_chrome``: stamps session start, mounts a small "🩺 Diagnostics" expander in the sidebar with the log path and an "Open log folder" button so the user can grab the file to 
attach to a support email. - Home page: ``upload`` event on every new file, ``upload`` event on per-file remove, ``analyze`` event with file count when Run-analysis fires. - ``_run_analysis_on_upload``: ``analyze`` event with rows / cols / findings count per file, plus ``error`` events on every caught exception (empty upload, empty after repair, pandas EmptyDataError, generic Exception). - Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event immediately after the primary action stashes its result. - Every tool page (1-9): ``log_page_open(slug)`` on render — deduped via session state so we don't get one event per Streamlit rerun. Safety: - ``log_event`` wraps every write in try/except. A broken audit log must NOT crash the GUI. - Non-JSON-serializable extras are ``str()``-coerced before writing. - File CONTENTS are never logged. We capture filename, byte count, and (in the analyzer) a 12-char sha1 fingerprint of the bytes so the same file re-uploaded gets the same trace. - License keys, session cookies, etc. are not logged. - ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a tmp dir. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/audit.py | 227 +++++++++++++++++++++++++ src/gui/_home.py | 19 +++ src/gui/components/_legacy.py | 100 ++++++++++- src/gui/pages/1_Deduplicator.py | 4 + src/gui/pages/2_Text_Cleaner.py | 4 + src/gui/pages/3_Format_Standardizer.py | 4 + src/gui/pages/4_Missing_Values.py | 4 + src/gui/pages/5_Column_Mapper.py | 4 + src/gui/pages/6_Outlier_Detector.py | 2 + src/gui/pages/7_Multi_File_Merger.py | 2 + src/gui/pages/8_Validator_Reporter.py | 2 + src/gui/pages/9_Pipeline_Runner.py | 4 + 12 files changed, 373 insertions(+), 3 deletions(-) create mode 100644 src/audit.py diff --git a/src/audit.py b/src/audit.py new file mode 100644 index 0000000..e455169 --- /dev/null +++ b/src/audit.py @@ -0,0 +1,227 @@ +"""Audit log — records GUI actions for support diagnostics. 
+ +A client running DataTools who hits a bug should be able to grab one +file off disk, mail it to support, and have us reconstruct what they +were doing when things broke. That file is the audit log written by +this module. + +Design choices: + +- **JSONL**, one event per line. Each line is a valid JSON object; the + whole file is grep-friendly, ``jq``-friendly, and still readable in + Notepad / TextEdit if no tooling is available. Each event carries a + human-readable ``message`` field so the file is useful even without + any tooling. +- **One file per session**, named ``datatools-<UTC timestamp>-<session id>.jsonl``. + Multiple sessions on the same machine don't clobber each other, and + the filename sorts chronologically. +- **Default location**: ``~/.datatools/logs/`` on every platform. + Overrideable via the ``DATATOOLS_AUDIT_DIR`` environment variable — + used by tests to redirect writes into a tmp dir. +- **Never crashes the app**. Every write is wrapped in a try/except; + a broken audit log must not take down the GUI. +- **No PII bytes**: file CONTENTS are never logged. We log the + filename, byte size, and a short content hash so the same file + re-uploaded gets the same fingerprint, but the actual bytes stay + local. + +Public API: + +- ``log_event(category, message, **extra)`` — write one event. +- ``log_session_start()`` — emit a session-start record with platform + info. Idempotent within a single session. +- ``audit_log_path()`` — return the path to the current session's file + so the GUI can show it to the user. +- ``audit_log_dir()`` — return the directory holding all session logs. +""" + +from __future__ import annotations + +import getpass +import json +import os +import platform +import sys +import threading +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +# Module-level cache for per-session state. 
Streamlit reruns the script +# many times per session but the module is imported once, so these +# survive across reruns within the same Python process. +_LOCK = threading.RLock()  # re-entrant: audit_log_path() calls _session_id() while already holding it +_LOG_PATH: Path | None = None +_SESSION_ID: str | None = None +_SESSION_STARTED: bool = False + + +def audit_log_dir() -> Path: + """Return the directory where audit logs are written. + + Defaults to ``~/.datatools/logs/``. Overrideable via the + ``DATATOOLS_AUDIT_DIR`` environment variable so tests can redirect + writes into ``tmp_path``. + """ + override = os.environ.get("DATATOOLS_AUDIT_DIR") + if override: + return Path(override) + return Path.home() / ".datatools" / "logs" + + +def _session_id() -> str: + global _SESSION_ID + with _LOCK: + if _SESSION_ID is None: + _SESSION_ID = uuid.uuid4().hex + return _SESSION_ID + + +def audit_log_path() -> Path: + """Return this session's log file path. + + The path is created the first time it's queried so each Python + process gets a single file regardless of how many Streamlit + reruns happen. + """ + global _LOG_PATH + with _LOCK: + if _LOG_PATH is None: + ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%SZ") + sid = _session_id()[:8] + d = audit_log_dir() + try: + d.mkdir(parents=True, exist_ok=True) + except Exception: + # If we can't create the dir, fall back to a tmpdir + # location so we never crash the app for the audit + # log's sake. + import tempfile + d = Path(tempfile.gettempdir()) / "datatools-logs" + d.mkdir(parents=True, exist_ok=True) + _LOG_PATH = d / f"datatools-{ts}-{sid}.jsonl" + return _LOG_PATH + + +def log_event( + category: str, + message: str, + *, + level: str = "info", + **extra: Any, +) -> None: + """Append one event to the session log. + + ``category`` groups related events (e.g. ``upload``, ``analyze``, + ``tool_run``, ``error``, ``nav``). ``message`` is the human + sentence that lands in the file. 
``extra`` keys are passed through + to the JSON object verbatim, so callers can attach structured + context (filename, byte counts, finding counts, timings). + + Failures are swallowed silently — a broken audit log must not + take the GUI down. + """ + try: + event = { + "ts": datetime.now(tz=timezone.utc).isoformat(timespec="milliseconds"), + "level": level, + "category": category, + "session": _session_id()[:8], + "message": message, + } + # Attach extras with serialization safety: non-JSON values get + # str()'d so a bad caller can't poison the whole entry. + for k, v in extra.items(): + try: + json.dumps(v) + event[k] = v + except (TypeError, ValueError): + event[k] = str(v) + with audit_log_path().open("a", encoding="utf-8") as f: + f.write(json.dumps(event, ensure_ascii=False) + "\n") + except Exception: + # Last-ditch silent swallow. Diagnostics is best-effort. + pass + + +def log_session_start() -> None: + """Write the session-start banner. Idempotent within one process.""" + global _SESSION_STARTED + with _LOCK: + if _SESSION_STARTED: + return + _SESSION_STARTED = True + # Best-effort metadata. Failures don't propagate. + try: + user = getpass.getuser() + except Exception: + user = "?" + try: + cwd = str(Path.cwd()) + except Exception: + cwd = "?" + log_event( + "session", + "Session started", + platform=f"{platform.system()} {platform.release()}", + python=sys.version.split()[0], + user=user, + cwd=cwd, + log_file=str(audit_log_path()), + ) + + +def log_exception(where: str, exc: BaseException, **extra: Any) -> None: + """Convenience wrapper for caught exceptions.""" + log_event( + "error", + f"{where}: {type(exc).__name__}: {exc}", + level="error", + exc_type=type(exc).__name__, + exc_message=str(exc), + **extra, + ) + + +def log_page_open(slug: str) -> None: + """Emit a "page open" event, deduplicated within a session. + + Streamlit reruns the script many times per page (every widget + interaction triggers a rerun). 
Tracking the last page the user + visited in session state lets us emit a single ``nav`` event when + they actually switch pages, not one per rerun. Falls back to + always-emit when session state is unreachable (running outside + Streamlit, e.g. in tests). + """ + try: + import streamlit as st + prev = st.session_state.get("_audit_current_page") + if prev == slug: + return + st.session_state["_audit_current_page"] = slug + except Exception: + pass + log_event("nav", f"Opened {slug}", page=slug) + + +def reset_for_tests() -> None: + """Reset module-level state. Test-only — call from a pytest fixture + when isolation between tests matters.""" + global _LOG_PATH, _SESSION_ID, _SESSION_STARTED + with _LOCK: + _LOG_PATH = None + _SESSION_ID = None + _SESSION_STARTED = False + + +__all__ = [ + "audit_log_dir", + "audit_log_path", + "log_event", + "log_exception", + "log_page_open", + "log_session_start", + "reset_for_tests", +] diff --git a/src/gui/_home.py b/src/gui/_home.py index 317f59c..5f2a279 100644 --- a/src/gui/_home.py +++ b/src/gui/_home.py @@ -86,6 +86,7 @@ def _home_page() -> None: help=t("upload.uploader_help"), ) if new_files: + from src.audit import log_event changed = False for f in new_files: if f.name not in home_uploads: @@ -94,6 +95,12 @@ def _home_page() -> None: "size": f.size, } changed = True + log_event( + "upload", + f"Uploaded {f.name}", + filename=f.name, + bytes=f.size, + ) if changed: st.session_state["home_uploads"] = home_uploads @@ -139,6 +146,12 @@ def _home_page() -> None: to_remove = name if to_remove is not None: + from src.audit import log_event + log_event( + "upload", + f"Removed {to_remove}", + filename=to_remove, + ) del home_uploads[to_remove] # Drop any findings/results tied to the removed file. 
findings_by_file_drop = st.session_state.get( @@ -209,6 +222,12 @@ def _home_page() -> None: st.rerun() if run_clicked: + from src.audit import log_event + log_event( + "analyze", + f"Run analysis clicked on {len(pending)} file(s)", + files=list(pending), + ) progress = st.progress(0.0, text=t("upload.scanning")) for i, name in enumerate(pending, start=1): stashed = _StashedUpload(name, home_uploads[name]["bytes"]) diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py index 246e850..300fb9b 100644 --- a/src/gui/components/_legacy.py +++ b/src/gui/components/_legacy.py @@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None: can render its own form without recursion. """ st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True) + # Stamp a session-start record into the audit log the first time + # any page renders. Idempotent — subsequent calls are no-ops. + from src.audit import log_session_start + log_session_start() # Production-safe check runs first so a misconfigured shipped # build refuses to render anything (rather than rendering a # broken activation form that doesn't accept real blobs). @@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None: require_license_or_render_activation, ) render_license_status_sidebar() + _render_diagnostics_sidebar() if gate_license: require_license_or_render_activation() +def _render_diagnostics_sidebar() -> None: + """Render a small Diagnostics expander in the sidebar. + + Shows the path to the current session's audit log and an "Open + folder" button. Lives behind an expander so it doesn't take + screen space until the user opens it; the support flow is + "client mails us the file, we tell them what went wrong." 
+ """ + from src.audit import audit_log_dir, audit_log_path + log_path = audit_log_path() + with st.sidebar: + with st.expander("🩺 Diagnostics", expanded=False): + st.caption("Audit log for this session:") + st.code(str(log_path), language=None) + if st.button( + "📂 Open log folder", + key="_diag_open_logs", + type="secondary", + use_container_width=True, + ): + opened = _open_in_file_manager(audit_log_dir(), select=log_path) + if not opened: + st.warning( + "Could not open the file manager from here. " + "Path is above — paste it into your file manager." + ) + + # --------------------------------------------------------------------------- # Clean shutdown # --------------------------------------------------------------------------- @@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded): one of several uploaded files) should yield a clean red banner for that file, not kill the whole multi-file analysis run. """ + import hashlib + from src.audit import log_event, log_exception from src.core.analyze import Finding, analyze from src.core.errors import format_for_user from src.core.io import repair_bytes @@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded): name = uploaded.name data = uploaded.getvalue() suffix = name.rsplit(".", 1)[-1].lower() if "." in name else "" + digest = hashlib.sha1( + data, usedforsecurity=False, + ).hexdigest()[:12] if data else "empty" + + log_event( + "analyze", + f"Analyzing {name}", + filename=name, + bytes=len(data), + sha1_12=digest, + suffix=suffix, + ) def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]: return [Finding( @@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded): )] if not data: + log_event( + "analyze", + f"Skipping {name} — 0 bytes", + level="warn", + filename=name, + outcome="empty_upload", + ) return _error_finding( f"`{name}` is empty (0 bytes). 
Please re-upload — the bytes " f"may not have transferred correctly from your browser.", @@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded): try: if suffix in ("xlsx", "xls"): df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False) - return analyze(df) + findings = analyze(df) + log_event( + "analyze", + f"Analyzed {name} ({len(findings)} findings)", + filename=name, + bytes=len(data), + sha1_12=digest, + findings=len(findings), + rows=len(df), cols=len(df.columns), + ) + return findings # CSV / TSV: run repair_bytes so the user sees csv_* findings. text_head = data[:4096].decode("utf-8", errors="replace") @@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded): break repair = repair_bytes(data, encoding="utf-8", delimiter=delim) if not repair.repaired_bytes: + log_event( + "analyze", + f"Skipping {name} — empty after repair", + level="warn", + filename=name, + outcome="empty_after_repair", + ) return _error_finding( f"`{name}` is empty after pre-parse repair " f"(original was {len(data)} bytes — likely all NUL " @@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded): encoding="utf-8", delimiter=delim, dtype=str, keep_default_na=False, on_bad_lines="warn", ) - return analyze(df, repair_result=repair) - except pd.errors.EmptyDataError: + findings = analyze(df, repair_result=repair) + log_event( + "analyze", + f"Analyzed {name} ({len(findings)} findings)", + filename=name, + bytes=len(data), + sha1_12=digest, + findings=len(findings), + rows=len(df), cols=len(df.columns), + delimiter=repr(delim), + ) + return findings + except pd.errors.EmptyDataError as e: + log_exception( + f"analyze({name})", + e, + filename=name, + outcome="empty_after_repair", + ) return _error_finding( f"`{name}` could not be parsed — pandas reports no columns " f"in the file. Original size was {len(data)} bytes. 
Open " @@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded): fid="empty_after_repair", ) except Exception as e: + log_exception( + f"analyze({name})", + e, + filename=name, + outcome="analysis_failed", + ) return _error_finding( f"`{name}` could not be analyzed: {format_for_user(e)}", ) diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py index 05716dc..ed03144 100644 --- a/src/gui/pages/1_Deduplicator.py +++ b/src/gui/pages/1_Deduplicator.py @@ -33,6 +33,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("1_Deduplicator") require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR) # --------------------------------------------------------------------------- @@ -231,6 +233,8 @@ if uploaded is not None: progress_bar.empty() st.session_state["result"] = result + from src.audit import log_event + log_event("tool_run", "Find Duplicates run", page="1_Deduplicator") st.session_state["review_decisions"] = {} # One-shot flag for the scroll snippet at the bottom of the # page. 
Force a rerun so the Preview / Options expanders see diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index e808668..495a631 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -35,6 +35,8 @@ from src.core.text_clean import ( hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("2_Text_Cleaner") require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER) @@ -218,6 +220,8 @@ if st.button("Clean Text", type="primary", use_container_width=True): st.error(str(e)) st.stop() st.session_state["textclean_result"] = result + from src.audit import log_event + log_event("tool_run", "Clean Text run", page="2_Text_Cleaner") st.session_state["textclean_input_name"] = uploaded.name # One-shot flag picked up on the next pass to scroll the parent # document to the Results anchor (see scroll snippet below). diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py index c043f95..35d7ab1 100644 --- a/src/gui/pages/3_Format_Standardizer.py +++ b/src/gui/pages/3_Format_Standardizer.py @@ -33,6 +33,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("3_Format_Standardizer") require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER) @@ -537,6 +539,8 @@ if st.button( st.error(str(e)) st.stop() st.session_state["fmtstd_result"] = result + from src.audit import log_event + log_event("tool_run", "Standardize Formats run", page="3_Format_Standardizer") st.session_state["fmtstd_input_name"] = uploaded.name # One-shot flag picked up on the next pass to scroll the parent # document to the Results anchor (see scroll snippet below). 
diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py index 0bb7d75..ee997af 100644 --- a/src/gui/pages/4_Missing_Values.py +++ b/src/gui/pages/4_Missing_Values.py @@ -34,6 +34,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("4_Missing_Values") require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER) @@ -291,6 +293,8 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True): st.error(format_for_user(e)) st.stop() st.session_state["missing_result"] = result + from src.audit import log_event + log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values") st.session_state["missing_input_name"] = uploaded.name st.session_state["missing_options"] = options.to_dict() # One-shot flag picked up on the next pass to scroll the parent diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py index 52e0636..38b28e7 100644 --- a/src/gui/pages/5_Column_Mapper.py +++ b/src/gui/pages/5_Column_Mapper.py @@ -35,6 +35,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("5_Column_Mapper") require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER) @@ -338,6 +340,8 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True): st.error(format_for_user(e)) st.stop() st.session_state["colmap_result"] = result + from src.audit import log_event + log_event("tool_run", "Map Columns run", page="5_Column_Mapper") st.session_state["colmap_input_name"] = uploaded.name st.session_state["colmap_options"] = options.to_dict() # One-shot flag picked up on the next pass to scroll the parent diff --git a/src/gui/pages/6_Outlier_Detector.py b/src/gui/pages/6_Outlier_Detector.py index 033e921..e878877 100644 --- a/src/gui/pages/6_Outlier_Detector.py +++ b/src/gui/pages/6_Outlier_Detector.py @@ -22,6 +22,8 @@ from 
src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("6_Outlier_Detector") require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR) # --------------------------------------------------------------------------- diff --git a/src/gui/pages/7_Multi_File_Merger.py b/src/gui/pages/7_Multi_File_Merger.py index f4c6616..dc94ce4 100644 --- a/src/gui/pages/7_Multi_File_Merger.py +++ b/src/gui/pages/7_Multi_File_Merger.py @@ -22,6 +22,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("7_Multi_File_Merger") require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER) # --------------------------------------------------------------------------- diff --git a/src/gui/pages/8_Validator_Reporter.py b/src/gui/pages/8_Validator_Reporter.py index 184a171..c0ee773 100644 --- a/src/gui/pages/8_Validator_Reporter.py +++ b/src/gui/pages/8_Validator_Reporter.py @@ -22,6 +22,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("8_Validator_Reporter") require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER) # --------------------------------------------------------------------------- diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py index d954c5a..d598133 100644 --- a/src/gui/pages/9_Pipeline_Runner.py +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -36,6 +36,8 @@ from src.license import FeatureFlag hide_streamlit_chrome() render_sticky_footer() +from src.audit import log_page_open +log_page_open("9_Pipeline_Runner") require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER) @@ -283,6 +285,8 @@ if st.button( progress.progress(1.0, text="Done") st.session_state["pipeline_result"] = result + from src.audit import log_event + log_event("tool_run", "Automated Workflows run", 
page="9_Pipeline_Runner") st.session_state["pipeline_input_name"] = uploaded.name # One-shot flag picked up on the next pass to scroll the parent # document to the Results anchor (see scroll snippet at end of file).