fix(home): handle analysis errors defensively so they don't crash the whole page
Reported: uploading 13_non_latin_scripts.csv made the home page bubble a ``pandas.errors.EmptyDataError`` traceback up through the page chrome instead of surfacing it as a per-file error. In a multi-file analysis run, that kills every other file's results too, which is worse than the symptom itself.

Wrap ``_run_analysis_on_upload`` in proper error handling:

- Empty bytes (``getvalue() == b""``) short-circuit with a synthetic error Finding telling the user the upload was zero-byte and to re-upload.
- Empty ``repair.repaired_bytes`` (the file was all NULs / BOM / stripped to nothing) likewise surfaces as a synthetic Finding rather than reaching ``pd.read_csv``.
- ``pd.errors.EmptyDataError`` from pandas is caught and rendered as a Finding that names the file and its byte size, and suggests opening it in a text editor to verify that the header row uses the same delimiter as the data rows.
- Any other exception during read/analyze is caught and surfaced as a Finding via ``format_for_user``, so the user gets a clean message, not a Python traceback.

Each file in a multi-file run now stands alone: a bad file produces one red banner in its own card, and every other file is analyzed normally.

The 13_non_latin_scripts.csv corpus file is 249 bytes of valid UTF-8 on disk and parses cleanly under the same code path locally, so the user's specific symptom is likely a zero-byte upload (a browser / network / Python 3.14 + Streamlit edge case). The new ``empty_upload`` finding will name the byte count so they can confirm.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1336,33 +1336,81 @@ def upload_and_analyze_section() -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _run_analysis_on_upload(uploaded):
|
def _run_analysis_on_upload(uploaded):
|
||||||
"""Read the uploaded file with pre-parse repair, then analyze."""
|
"""Read the uploaded file with pre-parse repair, then analyze.
|
||||||
from src.core.analyze import analyze
|
|
||||||
|
Errors are caught and surfaced as a single synthetic ``Finding``
|
||||||
|
instead of bubbling a traceback up into the page chrome. A bad
|
||||||
|
file (empty bytes, unreadable encoding, pandas parse failure on
|
||||||
|
one of several uploaded files) should yield a clean red banner for
|
||||||
|
that file, not kill the whole multi-file analysis run.
|
||||||
|
"""
|
||||||
|
from src.core.analyze import Finding, analyze
|
||||||
|
from src.core.errors import format_for_user
|
||||||
from src.core.io import repair_bytes
|
from src.core.io import repair_bytes
|
||||||
|
|
||||||
name = uploaded.name
|
name = uploaded.name
|
||||||
data = uploaded.getvalue()
|
data = uploaded.getvalue()
|
||||||
suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
|
suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
|
||||||
|
|
||||||
if suffix in ("xlsx", "xls"):
|
def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
|
||||||
df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
|
return [Finding(
|
||||||
return analyze(df)
|
id=fid,
|
||||||
|
severity="error",
|
||||||
|
tool="",
|
||||||
|
count=1,
|
||||||
|
description=description,
|
||||||
|
confidence="high",
|
||||||
|
fix_action="",
|
||||||
|
)]
|
||||||
|
|
||||||
# CSV / TSV: run repair_bytes so the user sees csv_* findings.
|
if not data:
|
||||||
text_head = data[:4096].decode("utf-8", errors="replace")
|
return _error_finding(
|
||||||
delim = "\t" if suffix == "tsv" else ","
|
f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
|
||||||
if delim == ",":
|
f"may not have transferred correctly from your browser.",
|
||||||
for cand in ("\t", ";", "|"):
|
fid="empty_upload",
|
||||||
if text_head.count(cand) > text_head.count(",") * 1.5:
|
)
|
||||||
delim = cand
|
|
||||||
break
|
try:
|
||||||
repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
|
if suffix in ("xlsx", "xls"):
|
||||||
df = pd.read_csv(
|
df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
|
||||||
io.BytesIO(repair.repaired_bytes),
|
return analyze(df)
|
||||||
encoding="utf-8", delimiter=delim,
|
|
||||||
dtype=str, keep_default_na=False, on_bad_lines="warn",
|
# CSV / TSV: run repair_bytes so the user sees csv_* findings.
|
||||||
)
|
text_head = data[:4096].decode("utf-8", errors="replace")
|
||||||
return analyze(df, repair_result=repair)
|
delim = "\t" if suffix == "tsv" else ","
|
||||||
|
if delim == ",":
|
||||||
|
for cand in ("\t", ";", "|"):
|
||||||
|
if text_head.count(cand) > text_head.count(",") * 1.5:
|
||||||
|
delim = cand
|
||||||
|
break
|
||||||
|
repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
|
||||||
|
if not repair.repaired_bytes:
|
||||||
|
return _error_finding(
|
||||||
|
f"`{name}` is empty after pre-parse repair "
|
||||||
|
f"(original was {len(data)} bytes — likely all NUL "
|
||||||
|
f"bytes or stripped during a BOM/line-ending pass). "
|
||||||
|
f"Open the file in a text editor to confirm it has "
|
||||||
|
f"content.",
|
||||||
|
fid="empty_after_repair",
|
||||||
|
)
|
||||||
|
df = pd.read_csv(
|
||||||
|
io.BytesIO(repair.repaired_bytes),
|
||||||
|
encoding="utf-8", delimiter=delim,
|
||||||
|
dtype=str, keep_default_na=False, on_bad_lines="warn",
|
||||||
|
)
|
||||||
|
return analyze(df, repair_result=repair)
|
||||||
|
except pd.errors.EmptyDataError:
|
||||||
|
return _error_finding(
|
||||||
|
f"`{name}` could not be parsed — pandas reports no columns "
|
||||||
|
f"in the file. Original size was {len(data)} bytes. Open "
|
||||||
|
f"the file in a text editor to confirm the header row is "
|
||||||
|
f"present and uses the same delimiter as the data rows.",
|
||||||
|
fid="empty_after_repair",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return _error_finding(
|
||||||
|
f"`{name}` could not be analyzed: {format_for_user(e)}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def findings_count_for_tool(tool_id: str) -> int:
|
def findings_count_for_tool(tool_id: str) -> int:
|
||||||
|
|||||||
Reference in New Issue
Block a user