New ``src/audit.py`` module records GUI actions to a per-session
JSONL file under ``~/.datatools/logs/`` (overridable via
``DATATOOLS_AUDIT_DIR``). The file is human-readable (one JSON
object per line, each with a ``message`` field) AND trivially
machine-parseable — the support flow is "client mails the file,
we read it and explain what went wrong."
Format example::
{"ts":"2026-05-17T05:30:00.123+00:00","level":"info","category":"session",
"session":"a1b2c3d4","message":"Session started",
"platform":"Windows 11","python":"3.14.0","user":"Michael Dombaugh",
"log_file":"C:\\Users\\Michael Dombaugh\\.datatools\\logs\\datatools-...jsonl"}
{"ts":"...","category":"upload","message":"Uploaded customers.csv",
"filename":"customers.csv","bytes":24813}
{"ts":"...","category":"analyze","message":"Analyzed customers.csv (3 findings)",
"filename":"customers.csv","findings":3,"rows":120,"cols":8}
{"ts":"...","category":"tool_run","message":"Clean Text run",
"page":"2_Text_Cleaner"}
{"ts":"...","category":"error","level":"error",
"message":"analyze(weird.csv): EmptyDataError: No columns to parse",
"filename":"weird.csv","outcome":"empty_after_repair"}
Public API:
- ``log_event(category, message, **extra)``
- ``log_session_start()`` — idempotent banner with platform info
- ``log_page_open(slug)`` — emit a ``nav`` event, deduplicated per
Streamlit session so reruns don't spam the log
- ``log_exception(where, exc, **extra)`` — convenience wrapper
- ``audit_log_path()`` / ``audit_log_dir()`` — for the UI
Wired in at:
- ``hide_streamlit_chrome``: stamps session start, mounts a small
"🩺 Diagnostics" expander in the sidebar with the log path and
an "Open log folder" button so the user can grab the file to
attach to a support email.
- Home page: ``upload`` event on every new file, ``upload`` event
on per-file remove, ``analyze`` event with file count when
Run-analysis fires.
- ``_run_analysis_on_upload``: ``analyze`` event with rows / cols /
findings count per file, plus ``error`` events on every caught
exception (empty upload, empty after repair, pandas EmptyDataError,
generic Exception).
- Every Ready tool page (1, 2, 3, 4, 5, 9): ``tool_run`` event
immediately after the primary action stashes its result.
- Every tool page (1-9): ``log_page_open(slug)`` on render — deduped
via session state so we don't get one event per Streamlit rerun.
Safety:
- ``log_event`` wraps every write in try/except. A broken audit
log must NOT crash the GUI.
- Non-JSON-serializable extras are ``str()``-coerced before writing.
- File CONTENTS are never logged. We capture filename, byte count,
and (in the analyzer) a 12-char sha1 fingerprint of the bytes so
the same file re-uploaded gets the same trace.
- License keys, session cookies, etc. are not logged.
- ``DATATOOLS_AUDIT_DIR`` env var lets tests redirect writes into a
tmp dir.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
257 lines
9.7 KiB
Python
257 lines
9.7 KiB
Python
"""Home-page renderer extracted into its own module.

This used to live inside ``src/gui/app.py`` as a local function. Pulling
it out into a side-effect-free module lets the ``back_to_home_link``
helper (in ``components/_legacy.py``) import the home callable to pass
into ``st.switch_page`` — without re-running ``app.py``'s navigation
setup, which would itself blow up because tool pages have a different
"main script" context that breaks the registry's relative ``pages/…``
paths.

Keep this module import-light: nothing that runs Streamlit commands
at module top level, nothing that triggers config loads. Just the
``_home_page`` callable.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import streamlit as st
|
|
|
|
|
|
class _StashedUpload:
    """In-memory stand-in for Streamlit's ``UploadedFile``.

    Entries restored from the session-state stash are wrapped in this
    class so ``_run_analysis_on_upload`` can consume them unchanged. Only
    the three members that path relies on are provided: ``.name``,
    ``.size`` and ``.getvalue()``.
    """

    __slots__ = ("name", "size", "_data")

    def __init__(self, name: str, data: bytes) -> None:
        """Remember *data* under *name*; ``size`` is derived from the payload."""
        self._data = data
        self.size = len(data)
        self.name = name

    def getvalue(self) -> bytes:
        """Return the stashed payload exactly as it was handed in."""
        return self._data
|
|
|
|
|
|
def _md_code_span(text: str) -> str:
    """Wrap *text* in a Markdown code span it cannot break out of.

    A plain single-backtick span is closed early by a backtick inside
    *text*; whatever follows is then parsed as ordinary Markdown (and as
    raw HTML when the caller renders with ``unsafe_allow_html=True``).
    Using a fence one backtick longer than the longest run in *text*
    keeps the whole string literal. Text without backticks is returned in
    the usual single-backtick form.
    """
    longest_run = current_run = 0
    for ch in text:
        current_run = current_run + 1 if ch == "`" else 0
        longest_run = max(longest_run, current_run)
    if longest_run == 0:
        return f"`{text}`"
    fence = "`" * (longest_run + 1)
    # The padding spaces keep a leading/trailing backtick in *text* from
    # merging with the fence.
    return f"{fence} {text} {fence}"


def _ingest_widget_files(new_files, home_uploads: dict) -> bool:
    """Merge files returned by the uploader widget into *home_uploads*.

    Each widget upload is ingested at most once per session. The ids of
    uploads already processed are kept in
    ``st.session_state["home_upload_seen_ids"]`` so that a file the user
    removed with the per-row button is NOT re-added on the next rerun just
    because the widget still holds it.

    A not-yet-seen upload whose name is already stashed replaces the old
    entry only when its bytes differ; in that case the cached findings and
    the singular ``home_uploaded_*`` keys for that name are dropped so
    nothing downstream keeps using the stale content.

    Returns ``True`` when *home_uploads* was modified.
    """
    from src.audit import log_event

    seen_ids: set = st.session_state.setdefault("home_upload_seen_ids", set())
    changed = False
    for f in new_files:
        # NOTE(review): relies on the uploaded-file object exposing a
        # per-upload ``file_id`` (``id`` on older Streamlit) — confirm
        # against the pinned Streamlit version. Falls back to (name, size)
        # when neither attribute exists.
        upload_id = getattr(f, "file_id", None) or getattr(f, "id", None)
        if upload_id is None:
            upload_id = (f.name, f.size)
        if upload_id in seen_ids:
            continue
        seen_ids.add(upload_id)

        data = f.getvalue()
        previous = home_uploads.get(f.name)
        if previous is not None and previous["bytes"] == data:
            # Same content is already stashed (e.g. the widget re-attached
            # after navigation) — nothing to do.
            continue

        home_uploads[f.name] = {
            "bytes": data,
            "size": f.size,
        }
        changed = True

        extra = {}
        if previous is not None:
            # Content changed under an existing name: invalidate anything
            # derived from the old bytes.
            st.session_state.get("home_findings_by_file", {}).pop(f.name, None)
            if st.session_state.get("home_uploaded_name") == f.name:
                st.session_state.pop("home_uploaded_name", None)
                st.session_state.pop("home_uploaded_size", None)
                st.session_state.pop("home_uploaded_bytes", None)
            extra["replaced"] = True

        log_event(
            "upload",
            f"Uploaded {f.name}",
            filename=f.name,
            bytes=f.size,
            **extra,
        )
    return changed


def _home_page() -> None:
    """Render the home page — multi-file upload + per-file analysis.

    Uploaded files live in ``st.session_state["home_uploads"]`` (a
    dict keyed by filename), NOT in the widget's transient state.
    Streamlit's ``st.file_uploader`` widget gets unmounted when the
    user navigates away to a tool page, and its ``UploadedFile``
    objects don't always re-attach on remount — so we capture the
    bytes into our own session-state stash on first sight and treat
    that stash as the source of truth for everything downstream
    (active-file pickup, analysis, findings rendering).

    Removing a file: per-row "Remove" buttons next to each uploaded
    filename. A removed file stays removed even while the uploader
    widget still lists it, because each widget upload is ingested only
    once (see ``_ingest_widget_files``). Clearing findings: the
    "Clear results" button only wipes the analysis cache, not the
    upload stash — the files persist until the user explicitly removes
    them.
    """
    from src.gui.components import hide_streamlit_chrome, render_findings_panel
    from src.gui.components._legacy import _run_analysis_on_upload
    from src.i18n import t

    st.set_page_config(
        page_title=t("home.page_title"),
        page_icon="🧹",
        layout="wide",
    )
    hide_streamlit_chrome()

    st.title(t("home.title"))
    st.caption(t("home.caption"))
    st.divider()

    st.markdown(f"### {t('upload.heading')}")
    st.caption(t("upload.intro_multi"))

    # Source of truth for uploaded files. dict[name -> {"bytes", "size"}].
    home_uploads: dict = st.session_state.setdefault("home_uploads", {})

    # File uploader — for ADDING new files only. On every render we
    # merge widget-returned files INTO home_uploads but never remove
    # via the widget. (Widget state can return ``[]`` after navigation,
    # which we deliberately don't treat as "user cleared their files".)
    new_files = st.file_uploader(
        t("upload.uploader_label_multi"),
        type=["csv", "tsv", "xlsx", "xls"],
        accept_multiple_files=True,
        key="home_upload",
        help=t("upload.uploader_help"),
    )
    if new_files and _ingest_widget_files(new_files, home_uploads):
        st.session_state["home_uploads"] = home_uploads

    # Persistent file list with per-file remove buttons. We render this
    # ourselves rather than trusting Streamlit's widget chrome because
    # the widget's "✕" only mutates widget-state, leaving home_uploads
    # out of sync.
    #
    #   1. Each button key is a sha1 hash of the filename so it is
    #      identifier-safe regardless of spaces / dots / unicode in the
    #      file name.
    #   2. Inside a single pass we collect WHICH file to remove (if any),
    #      then mutate state ONCE after the loop and rerun, so widget
    #      rendering is never interleaved with state changes.
    if home_uploads:
        import hashlib

        st.markdown("**Uploaded files**")
        to_remove: str | None = None
        for name, meta in home_uploads.items():
            digest = hashlib.sha1(
                name.encode("utf-8"), usedforsecurity=False,
            ).hexdigest()[:10]
            col_file, col_remove = st.columns([8, 1])
            # The filename is user-controlled and this call allows raw
            # HTML, so the name goes through a break-out-proof code span.
            col_file.markdown(
                f"📄 {_md_code_span(name)} "
                f"<span style='opacity:0.6'>"
                f"({meta['size']:,} bytes)</span>",
                unsafe_allow_html=True,
            )
            if col_remove.button(
                "Remove",
                key=f"_home_remove_{digest}",
                help=f"Remove {name}",
                type="secondary",
                use_container_width=True,
            ):
                to_remove = name

        if to_remove is not None:
            from src.audit import log_event

            log_event(
                "upload",
                f"Removed {to_remove}",
                filename=to_remove,
            )
            # The upload's id stays in ``home_upload_seen_ids`` on purpose:
            # the widget may still return this file on the next rerun and
            # it must not be re-ingested.
            del home_uploads[to_remove]
            # Drop any findings/results tied to the removed file.
            findings_by_file_drop = st.session_state.get(
                "home_findings_by_file", {}
            )
            findings_by_file_drop.pop(to_remove, None)
            st.session_state["home_uploads"] = home_uploads
            st.session_state["home_findings_by_file"] = findings_by_file_drop
            # If we just removed the active upload, also clear the
            # singular ``home_uploaded_*`` keys so tool pages don't
            # pick up stale bytes; the next render will repopulate
            # them from whatever file is now first.
            if st.session_state.get("home_uploaded_name") == to_remove:
                st.session_state.pop("home_uploaded_name", None)
                st.session_state.pop("home_uploaded_size", None)
                st.session_state.pop("home_uploaded_bytes", None)
            st.rerun()

    if not home_uploads:
        st.info(t("upload.empty_state"))
        return

    # Expose the first uploaded file via the singular ``home_uploaded_*``
    # session keys so tool pages reached via "Open <Tool>" still find an
    # active upload through ``pickup_or_upload``.
    first_name = next(iter(home_uploads))
    first_meta = home_uploads[first_name]
    if (
        st.session_state.get("home_uploaded_name") != first_name
        or st.session_state.get("home_uploaded_size") != first_meta["size"]
    ):
        st.session_state["home_uploaded_name"] = first_name
        st.session_state["home_uploaded_size"] = first_meta["size"]
        st.session_state["home_uploaded_bytes"] = first_meta["bytes"]

    # Findings cache — drop entries whose underlying file is no longer
    # in the stash (e.g. user just clicked "Remove").
    findings_by_file: dict = st.session_state.setdefault(
        "home_findings_by_file", {}
    )
    findings_by_file = {
        name: result for name, result in findings_by_file.items()
        if name in home_uploads
    }
    st.session_state["home_findings_by_file"] = findings_by_file

    # Files that are stashed but have no analysis result yet.
    pending = [name for name in home_uploads if name not in findings_by_file]

    col_run, col_clear, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button(
            t("upload.run_button"),
            type="primary",
            key="home_run_analysis",
            disabled=not pending,
            use_container_width=True,
        )
    with col_clear:
        clear_clicked = st.button(
            t("upload.clear_results"),
            key="home_clear_results",
            disabled=not findings_by_file,
            use_container_width=True,
        )

    if clear_clicked:
        st.session_state["home_findings_by_file"] = {}
        st.rerun()

    if run_clicked:
        from src.audit import log_event

        log_event(
            "analyze",
            f"Run analysis clicked on {len(pending)} file(s)",
            files=list(pending),
        )
        progress = st.progress(0.0, text=t("upload.scanning"))
        for i, name in enumerate(pending, start=1):
            stashed = _StashedUpload(name, home_uploads[name]["bytes"])
            findings_by_file[name] = _run_analysis_on_upload(stashed)
            progress.progress(i / len(pending), text=name)
        st.session_state["home_findings_by_file"] = findings_by_file
        progress.empty()
        st.rerun()

    if findings_by_file:
        st.divider()
        # Preserve the upload-stash order so the user sees results in
        # the same order they appear in the file list above.
        for name in home_uploads:
            if name not in findings_by_file:
                continue
            findings = findings_by_file[name]
            with st.container(border=True):
                if not findings:
                    st.markdown(f"### 📄 {name}")
                    st.success(t("findings.none"))
                else:
                    render_findings_panel(findings, header=f"📄 {name}")

    st.divider()
    st.caption(t("chrome.footer"))
|