feat(audit): JSONL audit log for support diagnostics

New ``src/audit.py`` module records GUI actions to a per-session JSONL
file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON object
per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file, we read
it and explain what went wrong."

Format example (each record is one physical line in the real file; long
records are wrapped here for readability)::

    {"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
     "session":"a1b2c3d4","message":"Session started",
     "platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
     "log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
    {"ts":"...","category":"upload","message":"Uploaded customers.csv",
     "filename":"customers.csv","bytes":24813}
    {"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
     "filename":"customers.csv","findings":3,"rows":120,"cols":8}
    {"ts":"...","category":"tool_run","message":"Clean Text run",
     "page":"2_Text_Cleaner"}
    {"ts":"...","category":"error","level":"error",
     "message":"analyze(weird.csv): EmptyDataError: No columns to parse",
     "filename":"weird.csv","outcome":"empty_after_repair"}

Public API:

- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
  Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI

Wired in at:

- ``hide_streamlit_chrome``: stamps session start, mounts a small
  "🩺 Diagnostics" expander in the sidebar with the log path and an
  "Open log folder" button so the user can grab the file to attach to a
  support email.
- Home page: ``upload`` event on every new file, ``upload`` event on
  per-file remove, ``analyze`` event with file count when Run-analysis
  fires.
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols /
  findings count per file, ``warn``-level ``analyze`` events when a file
  is skipped (empty upload, empty after repair), plus ``error`` events
  (via ``log_exception``) on every caught exception (pandas
  EmptyDataError, generic Exception).
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
  immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped via
  session state so we don't get one event per Streamlit rerun.

Safety:

- ``log_event`` wraps every write in try/except. A broken audit log must
  NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count, and
  (in the analyzer) a 12-char sha1 fingerprint of the bytes so the same
  file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a tmp
  dir.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 01:36:35 +00:00
parent f0885aeb1e
commit c73d716d06
12 changed files with 373 additions and 3 deletions
--- a/src/gui/_home.py
+++ b/src/gui/_home.py
@@ -86,6 +86,7 @@ def _home_page() -> None:
        help=t("upload.uploader_help"),
    )
    if new_files:
+        from src.audit import log_event
        changed = False
        for f in new_files:
            if f.name not in home_uploads:
@@ -94,6 +95,12 @@ def _home_page() -> None:
                    "size": f.size,
                }
                changed = True
+                log_event(
+                    "upload",
+                    f"Uploaded {f.name}",
+                    filename=f.name,
+                    bytes=f.size,
+                )
        if changed:
            st.session_state["home_uploads"] = home_uploads

@@ -139,6 +146,12 @@ def _home_page() -> None:
                to_remove = name

        if to_remove is not None:
+            from src.audit import log_event
+            log_event(
+                "upload",
+                f"Removed {to_remove}",
+                filename=to_remove,
+            )
            del home_uploads[to_remove]
            # Drop any findings/results tied to the removed file.
            findings_by_file_drop = st.session_state.get(
@@ -209,6 +222,12 @@ def _home_page() -> None:
        st.rerun()

    if run_clicked:
+        from src.audit import log_event
+        log_event(
+            "analyze",
+            f"Run analysis clicked on {len(pending)} file(s)",
+            files=list(pending),
+        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
    can render its own form without recursion.
    """
    st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
+    # Stamp a session-start record into the audit log the first time
+    # any page renders. Idempotent — subsequent calls are no-ops.
+    from src.audit import log_session_start
+    log_session_start()
    # Production-safe check runs first so a misconfigured shipped
    # build refuses to render anything (rather than rendering a
    # broken activation form that doesn't accept real blobs).
@@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
        require_license_or_render_activation,
    )
    render_license_status_sidebar()
+    _render_diagnostics_sidebar()
    if gate_license:
        require_license_or_render_activation()


+def _render_diagnostics_sidebar() -> None:
+    """Render a small Diagnostics expander in the sidebar.
+
+    Shows the path to the current session's audit log and an "Open
+    folder" button. Lives behind an expander so it doesn't take
+    screen space until the user opens it; the support flow is
+    "client mails us the file, we tell them what went wrong."
+    """
+    from src.audit import audit_log_dir, audit_log_path
+    log_path = audit_log_path()
+    with st.sidebar:
+        with st.expander("🩺  Diagnostics", expanded=False):
+            st.caption("Audit log for this session:")
+            st.code(str(log_path), language=None)
+            if st.button(
+                "📂  Open log folder",
+                key="_diag_open_logs",
+                type="secondary",
+                use_container_width=True,
+            ):
+                opened = _open_in_file_manager(audit_log_dir(), select=log_path)
+                if not opened:
+                    st.warning(
+                        "Could not open the file manager from here. "
+                        "Path is above — paste it into your file manager."
+                    )
+
+
 # ---------------------------------------------------------------------------
 # Clean shutdown
 # ---------------------------------------------------------------------------
@@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded):
    one of several uploaded files) should yield a clean red banner for
    that file, not kill the whole multi-file analysis run.
    """
+    import hashlib
+    from src.audit import log_event, log_exception
    from src.core.analyze import Finding, analyze
    from src.core.errors import format_for_user
    from src.core.io import repair_bytes
@@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded):
    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
+    digest = hashlib.sha1(
+        data, usedforsecurity=False,
+    ).hexdigest()[:12] if data else "empty"
+
+    log_event(
+        "analyze",
+        f"Analyzing {name}",
+        filename=name,
+        bytes=len(data),
+        sha1_12=digest,
+        suffix=suffix,
+    )

    def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
        return [Finding(
@@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded):
        )]

    if not data:
+        log_event(
+            "analyze",
+            f"Skipping {name} — 0 bytes",
+            level="warn",
+            filename=name,
+            outcome="empty_upload",
+        )
        return _error_finding(
            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
            f"may not have transferred correctly from your browser.",
@@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded):
    try:
        if suffix in ("xlsx", "xls"):
            df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
-            return analyze(df)
+            findings = analyze(df)
+            log_event(
+                "analyze",
+                f"Analyzed {name} ({len(findings)} findings)",
+                filename=name,
+                bytes=len(data),
+                sha1_12=digest,
+                findings=len(findings),
+                rows=len(df), cols=len(df.columns),
+            )
+            return findings

        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
        text_head = data[:4096].decode("utf-8", errors="replace")
@@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded):
                    break
        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
        if not repair.repaired_bytes:
+            log_event(
+                "analyze",
+                f"Skipping {name} — empty after repair",
+                level="warn",
+                filename=name,
+                outcome="empty_after_repair",
+            )
            return _error_finding(
                f"`{name}` is empty after pre-parse repair "
                f"(original was {len(data)} bytes — likely all NUL "
@@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded):
            encoding="utf-8", delimiter=delim,
            dtype=str, keep_default_na=False, on_bad_lines="warn",
        )
-        return analyze(df, repair_result=repair)
-    except pd.errors.EmptyDataError:
+        findings = analyze(df, repair_result=repair)
+        log_event(
+            "analyze",
+            f"Analyzed {name} ({len(findings)} findings)",
+            filename=name,
+            bytes=len(data),
+            sha1_12=digest,
+            findings=len(findings),
+            rows=len(df), cols=len(df.columns),
+            delimiter=repr(delim),
+        )
+        return findings
+    except pd.errors.EmptyDataError as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="empty_after_repair",
+        )
        return _error_finding(
            f"`{name}` could not be parsed — pandas reports no columns "
            f"in the file. Original size was {len(data)} bytes. Open "
@@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded):
            fid="empty_after_repair",
        )
    except Exception as e:
+        log_exception(
+            f"analyze({name})",
+            e,
+            filename=name,
+            outcome="analysis_failed",
+        )
        return _error_finding(
            f"`{name}` could not be analyzed: {format_for_user(e)}",
        )
--- a/src/gui/pages/1_Deduplicator.py
+++ b/src/gui/pages/1_Deduplicator.py
@@ -33,6 +33,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("1_Deduplicator")
 require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR)

 # ---------------------------------------------------------------------------
@@ -231,6 +233,8 @@ if uploaded is not None:

            progress_bar.empty()
            st.session_state["result"] = result
+            from src.audit import log_event
+            log_event("tool_run", "Find Duplicates run", page="1_Deduplicator")
            st.session_state["review_decisions"] = {}
            # One-shot flag for the scroll snippet at the bottom of the
            # page. Force a rerun so the Preview / Options expanders see
--- a/src/gui/pages/2_Text_Cleaner.py
+++ b/src/gui/pages/2_Text_Cleaner.py
@@ -35,6 +35,8 @@ from src.core.text_clean import (

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("2_Text_Cleaner")
 require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER)


@@ -218,6 +220,8 @@ if st.button("Clean Text", type="primary", use_container_width=True):
            st.error(str(e))
            st.stop()
    st.session_state["textclean_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Clean Text run", page="2_Text_Cleaner")
    st.session_state["textclean_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
--- a/src/gui/pages/3_Format_Standardizer.py
+++ b/src/gui/pages/3_Format_Standardizer.py
@@ -33,6 +33,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("3_Format_Standardizer")
 require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER)


@@ -537,6 +539,8 @@ if st.button(
            st.error(str(e))
            st.stop()
    st.session_state["fmtstd_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Standardize Formats run", page="3_Format_Standardizer")
    st.session_state["fmtstd_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet below).
--- a/src/gui/pages/4_Missing_Values.py
+++ b/src/gui/pages/4_Missing_Values.py
@@ -34,6 +34,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("4_Missing_Values")
 require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER)


@@ -291,6 +293,8 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True):
            st.error(format_for_user(e))
            st.stop()
    st.session_state["missing_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values")
    st.session_state["missing_input_name"] = uploaded.name
    st.session_state["missing_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
--- a/src/gui/pages/5_Column_Mapper.py
+++ b/src/gui/pages/5_Column_Mapper.py
@@ -35,6 +35,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("5_Column_Mapper")
 require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)


@@ -338,6 +340,8 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True):
            st.error(format_for_user(e))
            st.stop()
    st.session_state["colmap_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Map Columns run", page="5_Column_Mapper")
    st.session_state["colmap_input_name"] = uploaded.name
    st.session_state["colmap_options"] = options.to_dict()
    # One-shot flag picked up on the next pass to scroll the parent
--- a/src/gui/pages/6_Outlier_Detector.py
+++ b/src/gui/pages/6_Outlier_Detector.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("6_Outlier_Detector")
 require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/7_Multi_File_Merger.py
+++ b/src/gui/pages/7_Multi_File_Merger.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("7_Multi_File_Merger")
 require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/8_Validator_Reporter.py
+++ b/src/gui/pages/8_Validator_Reporter.py
@@ -22,6 +22,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("8_Validator_Reporter")
 require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER)

 # ---------------------------------------------------------------------------
--- a/src/gui/pages/9_Pipeline_Runner.py
+++ b/src/gui/pages/9_Pipeline_Runner.py
@@ -36,6 +36,8 @@ from src.license import FeatureFlag

 hide_streamlit_chrome()
 render_sticky_footer()
+from src.audit import log_page_open
+log_page_open("9_Pipeline_Runner")
 require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER)


@@ -283,6 +285,8 @@ if st.button(

    progress.progress(1.0, text="Done")
    st.session_state["pipeline_result"] = result
+    from src.audit import log_event
+    log_event("tool_run", "Automated Workflows run", page="9_Pipeline_Runner")
    st.session_state["pipeline_input_name"] = uploaded.name
    # One-shot flag picked up on the next pass to scroll the parent
    # document to the Results anchor (see scroll snippet at end of file).