feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session JSONL
file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON object
per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file, we
read it and explain what went wrong."

Format example::

    {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
     "session":"a1b2c3d4","message":"Session started",
     "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
     "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
    {"ts":"...","category":"upload","message":"Uploaded customers.csv",
     "filename":"customers.csv","bytes":24813}
    {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
     "filename":"customers.csv","findings":3,"rows":120,"cols":8}
    {"ts":"...","category":"tool_run","message":"Clean Text run",
     "page":"2_Text_Cleaner"}
    {"ts":"...","category":"error","level":"error",
     "message":"analyze(weird.csv): EmptyDataError: No columns to parse",
     "filename":"weird.csv","outcome":"empty_after_repair"}

Public API:

- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
  Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI

Wired in at:

- ``hide_streamlit_chrome``: stamps session start, mounts a small
  "🩺 Diagnostics" expander in the sidebar with the log path and an
  "Open log folder" button so the user can grab the file to attach to
  a support email.
- Home page: ``upload`` event on every new file, ``upload`` event on
  per-file remove, ``analyze`` event with file count when Run-analysis
  fires.
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols /
  findings count per file, ``warn``-level ``analyze`` events for
  skipped files (empty upload, empty after repair), plus ``error``
  events on every caught exception (pandas EmptyDataError, generic
  Exception).
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
  immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped
  via session state so we don't get one event per Streamlit rerun.

Safety:

- ``log_event`` wraps every write in try/except. A broken audit log
  must NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count, and
  (in the analyzer) a 12-char sha1 fingerprint of the bytes so the
  same file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a
  tmp dir.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
    can render its own form without recursion.
    """
    st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
+    # Stamp a session-start record into the audit log the first time
+    # any page renders. Idempotent — subsequent calls are no-ops.
+    from src.audit import log_session_start
+    log_session_start()
    # Production-safe check runs first so a misconfigured shipped
    # build refuses to render anything (rather than rendering a
    # broken activation form that doesn't accept real blobs).
@@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
        require_license_or_render_activation,
    )
    render_license_status_sidebar()
+    _render_diagnostics_sidebar()
    if gate_license:
        require_license_or_render_activation()


+def _render_diagnostics_sidebar() -> None:
+    """Render a small Diagnostics expander in the sidebar.
+
+    Shows the path to the current session's audit log and an "Open
+    log folder" button. Lives behind an expander so it doesn't take
+    screen space until the user opens it; the support flow is
+    "client mails us the file, we tell them what went wrong."
+
+    Pressing the button hands the audit log directory to
+    ``_open_in_file_manager`` (with this session's log file passed as
+    ``select``). If that call returns a falsy value, a warning is
+    shown telling the user to paste the displayed path into their
+    file manager instead.
+    """
+    from src.audit import audit_log_dir, audit_log_path
+    # Looked up once; reused for the displayed path and for the
+    # ``select`` argument passed to the file-manager helper below.
+    log_path = audit_log_path()
+    with st.sidebar:
+        with st.expander("🩺  Diagnostics", expanded=False):
+            st.caption("Audit log for this session:")
+            # language=None renders the path as plain, unhighlighted text.
+            st.code(str(log_path), language=None)
+            if st.button(
+                "📂  Open log folder",
+                key="_diag_open_logs",
+                type="secondary",
+                use_container_width=True,
+            ):
+                # NOTE(review): ``select`` presumably asks the file manager
+                # to highlight the log file — confirm against
+                # ``_open_in_file_manager``, which is defined elsewhere.
+                opened = _open_in_file_manager(audit_log_dir(), select=log_path)
+                if not opened:
+                    st.warning(
+                        "Could not open the file manager from here. "
+                        "Path is above — paste it into your file manager."
+                    )
+
+
 # ---------------------------------------------------------------------------
 # Clean shutdown
 # ---------------------------------------------------------------------------
@@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded):
    one of several uploaded files) should yield a clean red banner for
    that file, not kill the whole multi-file analysis run.
    """
+    import hashlib
+    from src.audit import log_event, log_exception
    from src.core.analyze import Finding, analyze
    from src.core.errors import format_for_user
    from src.core.io import repair_bytes
@@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded):
    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+    digest = hashlib.sha1(
+        data, usedforsecurity=False,
+    ).hexdigest()[:12] if data else "empty"
+
+    log_event(
+        "analyze",
+        f"Analyzing {name}",
+        filename=name,
+        bytes=len(data),
+        sha1_12=digest,
+        suffix=suffix,
+    )

    def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
        return [Finding(
@@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded):
        )]

    if not data:
+        log_event(
+            "analyze",
+            f"Skipping {name} — 0 bytes",
+            level="warn",
+            filename=name,
+            outcome="empty_upload",
+        )
        return _error_finding(
            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
            f"may not have transferred correctly from your browser.",
@@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded):
    try:
        if suffix in ("xlsx", "xls"):
            df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
-            return analyze(df)
+            findings = analyze(df)
+            log_event(
+                "analyze",
+                f"Analyzed {name} ({len(findings)} findings)",
+                filename=name,
+                bytes=len(data),
+                sha1_12=digest,
+                findings=len(findings),
+                rows=len(df), cols=len(df.columns),
+            )
+            return findings

        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
        text_head = data[:4096].decode("utf-8", errors="replace")
@@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded):
                    break
        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
        if not repair.repaired_bytes:
+            log_event(
+                "analyze",
+                f"Skipping {name} — empty after repair",
+                level="warn",
+                filename=name,
+                outcome="empty_after_repair",
+            )
            return _error_finding(
                f"`{name}` is empty after pre-parse repair "
                f"(original was {len(data)} bytes — likely all NUL "
@@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded):
            encoding="utf-8", delimiter=delim,
            dtype=str, keep_default_na=False, on_bad_lines="warn",
        )
-        return analyze(df, repair_result=repair)
-    except pd.errors.EmptyDataError:
+        findings = analyze(df, repair_result=repair)
+        log_event(
+            "analyze",
+            f"Analyzed {name} ({len(findings)} findings)",
+            filename=name,
+            bytes=len(data),
+            sha1_12=digest,
+            findings=len(findings),
+            rows=len(df), cols=len(df.columns),
+            delimiter=repr(delim),
+        )
+        return findings
+    except pd.errors.EmptyDataError as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="empty_after_repair",
+        )
        return _error_finding(
            f"`{name}` could not be parsed — pandas reports no columns "
            f"in the file. Original size was {len(data)} bytes. Open "
@@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded):
            fid="empty_after_repair",
        )
    except Exception as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="analysis_failed",
+        )
        return _error_finding(
            f"`{name}` could not be analyzed: {format_for_user(e)}",
        )