feat(audit): JSONL audit log for support diagnostics
New ``src/audit.py`` module records GUI actions to a per-session
JSONL file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON
object per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file,
we read it and explain what went wrong."
Format example::
{"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
"session":"a1b2c3d4","message":"Session started",
"platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
"log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
{"ts":"...","category":"upload","message":"Uploaded customers.csv",
"filename":"customers.csv","bytes":24813}
{"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
"filename":"customers.csv","findings":3,"rows":120,"cols":8}
{"ts":"...","category":"tool_run","message":"Clean Text run",
"page":"2_Text_Cleaner"}
{"ts":"...","category":"error","level":"error",
"message":"analyze(weird.csv): EmptyDataError: No columns to parse",
"filename":"weird.csv","outcome":"empty_after_repair"}
Public API:
- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI
Wired in at:
- ``hide_streamlit_chrome``: stamps session start, mounts a small
"🩺 Diagnostics" expander in the sidebar with the log path and
an "Open log folder" button so the user can grab the file to
attach to a support email.
- Home page: ``upload`` event on every new file, ``upload`` event
on per-file removal, ``analyze`` event with the pending file count
and file names when Run-analysis fires.
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols /
findings count per file, ``warn``-level ``analyze`` events when a
file is skipped (empty upload, empty after repair), plus ``error``
events on every caught exception (pandas EmptyDataError, generic
Exception).
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped
via session state so we don't get one event per Streamlit rerun.
Safety:
- ``log_event`` wraps every write in try/except. A broken audit
log must NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count,
and (in the analyzer) a 12-char sha1 fingerprint of the bytes so
the same file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a
tmp dir.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -86,6 +86,7 @@ def _home_page() -> None:
|
||||
help=t("upload.uploader_help"),
|
||||
)
|
||||
if new_files:
|
||||
from src.audit import log_event
|
||||
changed = False
|
||||
for f in new_files:
|
||||
if f.name not in home_uploads:
|
||||
@@ -94,6 +95,12 @@ def _home_page() -> None:
|
||||
"size": f.size,
|
||||
}
|
||||
changed = True
|
||||
log_event(
|
||||
"upload",
|
||||
f"Uploaded {f.name}",
|
||||
filename=f.name,
|
||||
bytes=f.size,
|
||||
)
|
||||
if changed:
|
||||
st.session_state["home_uploads"] = home_uploads
|
||||
|
||||
@@ -139,6 +146,12 @@ def _home_page() -> None:
|
||||
to_remove = name
|
||||
|
||||
if to_remove is not None:
|
||||
from src.audit import log_event
|
||||
log_event(
|
||||
"upload",
|
||||
f"Removed {to_remove}",
|
||||
filename=to_remove,
|
||||
)
|
||||
del home_uploads[to_remove]
|
||||
# Drop any findings/results tied to the removed file.
|
||||
findings_by_file_drop = st.session_state.get(
|
||||
@@ -209,6 +222,12 @@ def _home_page() -> None:
|
||||
st.rerun()
|
||||
|
||||
if run_clicked:
|
||||
from src.audit import log_event
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Run analysis clicked on {len(pending)} file(s)",
|
||||
files=list(pending),
|
||||
)
|
||||
progress = st.progress(0.0, text=t("upload.scanning"))
|
||||
for i, name in enumerate(pending, start=1):
|
||||
stashed = _StashedUpload(name, home_uploads[name]["bytes"])
|
||||
|
||||
@@ -155,6 +155,10 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
|
||||
can render its own form without recursion.
|
||||
"""
|
||||
st.markdown(_HIDE_CHROME_CSS, unsafe_allow_html=True)
|
||||
# Stamp a session-start record into the audit log the first time
|
||||
# any page renders. Idempotent — subsequent calls are no-ops.
|
||||
from src.audit import log_session_start
|
||||
log_session_start()
|
||||
# Production-safe check runs first so a misconfigured shipped
|
||||
# build refuses to render anything (rather than rendering a
|
||||
# broken activation form that doesn't accept real blobs).
|
||||
@@ -172,10 +176,39 @@ def hide_streamlit_chrome(*, gate_license: bool = True) -> None:
|
||||
require_license_or_render_activation,
|
||||
)
|
||||
render_license_status_sidebar()
|
||||
_render_diagnostics_sidebar()
|
||||
if gate_license:
|
||||
require_license_or_render_activation()
|
||||
|
||||
|
||||
def _render_diagnostics_sidebar() -> None:
    """Mount the collapsed "Diagnostics" expander in the sidebar.

    The expander displays the path of this session's audit log and
    offers a button that tries to open the log folder in the system
    file manager. If that attempt reports failure, a warning asks the
    user to paste the displayed path into their file manager instead.
    The expander starts collapsed so it takes no screen space until
    the user opens it.
    """
    from src.audit import audit_log_dir, audit_log_path

    session_log = audit_log_path()

    # Entering both contexts in one statement nests the expander
    # inside the sidebar, exactly like two stacked ``with`` blocks.
    with st.sidebar, st.expander("🩺 Diagnostics", expanded=False):
        st.caption("Audit log for this session:")
        st.code(str(session_log), language=None)

        open_clicked = st.button(
            "📂 Open log folder",
            key="_diag_open_logs",
            type="secondary",
            use_container_width=True,
        )
        # Short-circuit: the file manager is only launched on a click,
        # and the warning only appears when that launch reports failure.
        if open_clicked and not _open_in_file_manager(
            audit_log_dir(), select=session_log
        ):
            st.warning(
                "Could not open the file manager from here. "
                "Path is above — paste it into your file manager."
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Clean shutdown
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1669,6 +1702,8 @@ def _run_analysis_on_upload(uploaded):
|
||||
one of several uploaded files) should yield a clean red banner for
|
||||
that file, not kill the whole multi-file analysis run.
|
||||
"""
|
||||
import hashlib
|
||||
from src.audit import log_event, log_exception
|
||||
from src.core.analyze import Finding, analyze
|
||||
from src.core.errors import format_for_user
|
||||
from src.core.io import repair_bytes
|
||||
@@ -1676,6 +1711,18 @@ def _run_analysis_on_upload(uploaded):
|
||||
name = uploaded.name
|
||||
data = uploaded.getvalue()
|
||||
suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
|
||||
digest = hashlib.sha1(
|
||||
data, usedforsecurity=False,
|
||||
).hexdigest()[:12] if data else "empty"
|
||||
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Analyzing {name}",
|
||||
filename=name,
|
||||
bytes=len(data),
|
||||
sha1_12=digest,
|
||||
suffix=suffix,
|
||||
)
|
||||
|
||||
def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
|
||||
return [Finding(
|
||||
@@ -1689,6 +1736,13 @@ def _run_analysis_on_upload(uploaded):
|
||||
)]
|
||||
|
||||
if not data:
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Skipping {name} — 0 bytes",
|
||||
level="warn",
|
||||
filename=name,
|
||||
outcome="empty_upload",
|
||||
)
|
||||
return _error_finding(
|
||||
f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
|
||||
f"may not have transferred correctly from your browser.",
|
||||
@@ -1698,7 +1752,17 @@ def _run_analysis_on_upload(uploaded):
|
||||
try:
|
||||
if suffix in ("xlsx", "xls"):
|
||||
df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
|
||||
return analyze(df)
|
||||
findings = analyze(df)
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Analyzed {name} ({len(findings)} findings)",
|
||||
filename=name,
|
||||
bytes=len(data),
|
||||
sha1_12=digest,
|
||||
findings=len(findings),
|
||||
rows=len(df), cols=len(df.columns),
|
||||
)
|
||||
return findings
|
||||
|
||||
# CSV / TSV: run repair_bytes so the user sees csv_* findings.
|
||||
text_head = data[:4096].decode("utf-8", errors="replace")
|
||||
@@ -1710,6 +1774,13 @@ def _run_analysis_on_upload(uploaded):
|
||||
break
|
||||
repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
|
||||
if not repair.repaired_bytes:
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Skipping {name} — empty after repair",
|
||||
level="warn",
|
||||
filename=name,
|
||||
outcome="empty_after_repair",
|
||||
)
|
||||
return _error_finding(
|
||||
f"`{name}` is empty after pre-parse repair "
|
||||
f"(original was {len(data)} bytes — likely all NUL "
|
||||
@@ -1723,8 +1794,25 @@ def _run_analysis_on_upload(uploaded):
|
||||
encoding="utf-8", delimiter=delim,
|
||||
dtype=str, keep_default_na=False, on_bad_lines="warn",
|
||||
)
|
||||
return analyze(df, repair_result=repair)
|
||||
except pd.errors.EmptyDataError:
|
||||
findings = analyze(df, repair_result=repair)
|
||||
log_event(
|
||||
"analyze",
|
||||
f"Analyzed {name} ({len(findings)} findings)",
|
||||
filename=name,
|
||||
bytes=len(data),
|
||||
sha1_12=digest,
|
||||
findings=len(findings),
|
||||
rows=len(df), cols=len(df.columns),
|
||||
delimiter=repr(delim),
|
||||
)
|
||||
return findings
|
||||
except pd.errors.EmptyDataError as e:
|
||||
log_exception(
|
||||
f"analyze({name})",
|
||||
e,
|
||||
filename=name,
|
||||
outcome="empty_after_repair",
|
||||
)
|
||||
return _error_finding(
|
||||
f"`{name}` could not be parsed — pandas reports no columns "
|
||||
f"in the file. Original size was {len(data)} bytes. Open "
|
||||
@@ -1733,6 +1821,12 @@ def _run_analysis_on_upload(uploaded):
|
||||
fid="empty_after_repair",
|
||||
)
|
||||
except Exception as e:
|
||||
log_exception(
|
||||
f"analyze({name})",
|
||||
e,
|
||||
filename=name,
|
||||
outcome="analysis_failed",
|
||||
)
|
||||
return _error_finding(
|
||||
f"`{name}` could not be analyzed: {format_for_user(e)}",
|
||||
)
|
||||
|
||||
@@ -33,6 +33,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("1_Deduplicator")
|
||||
require_feature_or_render_upgrade(FeatureFlag.DEDUPLICATOR)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -231,6 +233,8 @@ if uploaded is not None:
|
||||
|
||||
progress_bar.empty()
|
||||
st.session_state["result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Find Duplicates run", page="1_Deduplicator")
|
||||
st.session_state["review_decisions"] = {}
|
||||
# One-shot flag for the scroll snippet at the bottom of the
|
||||
# page. Force a rerun so the Preview / Options expanders see
|
||||
|
||||
@@ -35,6 +35,8 @@ from src.core.text_clean import (
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("2_Text_Cleaner")
|
||||
require_feature_or_render_upgrade(FeatureFlag.TEXT_CLEANER)
|
||||
|
||||
|
||||
@@ -218,6 +220,8 @@ if st.button("Clean Text", type="primary", use_container_width=True):
|
||||
st.error(str(e))
|
||||
st.stop()
|
||||
st.session_state["textclean_result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Clean Text run", page="2_Text_Cleaner")
|
||||
st.session_state["textclean_input_name"] = uploaded.name
|
||||
# One-shot flag picked up on the next pass to scroll the parent
|
||||
# document to the Results anchor (see scroll snippet below).
|
||||
|
||||
@@ -33,6 +33,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("3_Format_Standardizer")
|
||||
require_feature_or_render_upgrade(FeatureFlag.FORMAT_STANDARDIZER)
|
||||
|
||||
|
||||
@@ -537,6 +539,8 @@ if st.button(
|
||||
st.error(str(e))
|
||||
st.stop()
|
||||
st.session_state["fmtstd_result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Standardize Formats run", page="3_Format_Standardizer")
|
||||
st.session_state["fmtstd_input_name"] = uploaded.name
|
||||
# One-shot flag picked up on the next pass to scroll the parent
|
||||
# document to the Results anchor (see scroll snippet below).
|
||||
|
||||
@@ -34,6 +34,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("4_Missing_Values")
|
||||
require_feature_or_render_upgrade(FeatureFlag.MISSING_HANDLER)
|
||||
|
||||
|
||||
@@ -291,6 +293,8 @@ if st.button("Handle Missing Values", type="primary", use_container_width=True):
|
||||
st.error(format_for_user(e))
|
||||
st.stop()
|
||||
st.session_state["missing_result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Fix Missing Values run", page="4_Missing_Values")
|
||||
st.session_state["missing_input_name"] = uploaded.name
|
||||
st.session_state["missing_options"] = options.to_dict()
|
||||
# One-shot flag picked up on the next pass to scroll the parent
|
||||
|
||||
@@ -35,6 +35,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("5_Column_Mapper")
|
||||
require_feature_or_render_upgrade(FeatureFlag.COLUMN_MAPPER)
|
||||
|
||||
|
||||
@@ -338,6 +340,8 @@ if st.button("Apply Column Mapping", type="primary", use_container_width=True):
|
||||
st.error(format_for_user(e))
|
||||
st.stop()
|
||||
st.session_state["colmap_result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Map Columns run", page="5_Column_Mapper")
|
||||
st.session_state["colmap_input_name"] = uploaded.name
|
||||
st.session_state["colmap_options"] = options.to_dict()
|
||||
# One-shot flag picked up on the next pass to scroll the parent
|
||||
|
||||
@@ -22,6 +22,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("6_Outlier_Detector")
|
||||
require_feature_or_render_upgrade(FeatureFlag.OUTLIER_DETECTOR)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -22,6 +22,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("7_Multi_File_Merger")
|
||||
require_feature_or_render_upgrade(FeatureFlag.MULTI_FILE_MERGER)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -22,6 +22,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("8_Validator_Reporter")
|
||||
require_feature_or_render_upgrade(FeatureFlag.VALIDATOR_REPORTER)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -36,6 +36,8 @@ from src.license import FeatureFlag
|
||||
|
||||
hide_streamlit_chrome()
|
||||
render_sticky_footer()
|
||||
from src.audit import log_page_open
|
||||
log_page_open("9_Pipeline_Runner")
|
||||
require_feature_or_render_upgrade(FeatureFlag.PIPELINE_RUNNER)
|
||||
|
||||
|
||||
@@ -283,6 +285,8 @@ if st.button(
|
||||
|
||||
progress.progress(1.0, text="Done")
|
||||
st.session_state["pipeline_result"] = result
|
||||
from src.audit import log_event
|
||||
log_event("tool_run", "Automated Workflows run", page="9_Pipeline_Runner")
|
||||
st.session_state["pipeline_input_name"] = uploaded.name
|
||||
# One-shot flag picked up on the next pass to scroll the parent
|
||||
# document to the Results anchor (see scroll snippet at end of file).
|
||||
|
||||
Reference in New Issue
Block a user