From ae9d4a2db5855ef9e94389d502217468b736f623 Mon Sep 17 00:00:00 2001
From: Michael
Date: Sat, 16 May 2026 21:22:10 +0000
Subject: [PATCH] fix(home): defensive analysis errors don't crash the whole page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported: uploading 13_non_latin_scripts.csv made the home page bubble a
``pandas.errors.EmptyDataError`` traceback up through the page chrome
instead of surfacing as a per-file error. In a multi-file analysis run,
that kills every other file's results too, which is worse than the
symptom itself.

Wrap ``_run_analysis_on_upload`` in proper error handling:

- Empty bytes (``getvalue() == b""``) short-circuit with a synthetic
  error Finding telling the user the upload was zero-byte and to
  re-upload.
- Empty ``repair.repaired_bytes`` (file was all NULs / BOM / stripped to
  nothing) likewise surfaces as a synthetic Finding rather than reaching
  pd.read_csv.
- ``pd.errors.EmptyDataError`` from pandas is caught and rendered as a
  Finding that names the file, its byte size, and suggests opening it in
  a text editor to verify the header row matches the data row delimiter.
- Any other exception during read/analyze is caught and surfaces as a
  Finding via ``format_for_user`` so the user gets a clean message, not a
  Python traceback.

Each file in a multi-file run now stands alone: a bad file produces one
red banner in its own card, every other file analyzes normally.

The 13_non_latin_scripts.csv corpus file is 249 bytes of valid UTF-8 on
disk and parses cleanly under the same code path locally — the user's
specific symptom is likely a zero-byte upload (browser / network /
Python 3.14 + Streamlit edge case). The new ``empty_upload`` finding
will name the byte count so they can confirm.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/gui/components/_legacy.py | 88 +++++++++++++++++++++++++++--------
 1 file changed, 68 insertions(+), 20 deletions(-)

diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py
index 5bf5707..8234517 100644
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -1336,33 +1336,81 @@ def upload_and_analyze_section() -> None:
 
 
 def _run_analysis_on_upload(uploaded):
-    """Read the uploaded file with pre-parse repair, then analyze."""
-    from src.core.analyze import analyze
+    """Read the uploaded file with pre-parse repair, then analyze.
+
+    Errors are caught and surfaced as a single synthetic ``Finding``
+    instead of bubbling a traceback up into the page chrome. A bad
+    file (empty bytes, unreadable encoding, pandas parse failure on
+    one of several uploaded files) should yield a clean red banner for
+    that file, not kill the whole multi-file analysis run.
+    """
+    from src.core.analyze import Finding, analyze
+    from src.core.errors import format_for_user
     from src.core.io import repair_bytes
 
     name = uploaded.name
     data = uploaded.getvalue()
     suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
 
-    if suffix in ("xlsx", "xls"):
-        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
-        return analyze(df)
+    def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
+        return [Finding(
+            id=fid,
+            severity="error",
+            tool="",
+            count=1,
+            description=description,
+            confidence="high",
+            fix_action="",
+        )]
 
-    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
-    text_head = data[:4096].decode("utf-8", errors="replace")
-    delim = "\t" if suffix == "tsv" else ","
-    if delim == ",":
-        for cand in ("\t", ";", "|"):
-            if text_head.count(cand) > text_head.count(",") * 1.5:
-                delim = cand
-                break
-    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
-    df = pd.read_csv(
-        io.BytesIO(repair.repaired_bytes),
-        encoding="utf-8", delimiter=delim,
-        dtype=str, keep_default_na=False, on_bad_lines="warn",
-    )
-    return analyze(df, repair_result=repair)
+    if not data:
+        return _error_finding(
+            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
+            f"may not have transferred correctly from your browser.",
+            fid="empty_upload",
+        )
+
+    try:
+        if suffix in ("xlsx", "xls"):
+            df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
+            return analyze(df)
+
+        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
+        text_head = data[:4096].decode("utf-8", errors="replace")
+        delim = "\t" if suffix == "tsv" else ","
+        if delim == ",":
+            for cand in ("\t", ";", "|"):
+                if text_head.count(cand) > text_head.count(",") * 1.5:
+                    delim = cand
+                    break
+        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
+        if not repair.repaired_bytes:
+            return _error_finding(
+                f"`{name}` is empty after pre-parse repair "
+                f"(original was {len(data)} bytes — likely all NUL "
+                f"bytes or stripped during a BOM/line-ending pass). "
+                f"Open the file in a text editor to confirm it has "
+                f"content.",
+                fid="empty_after_repair",
+            )
+        df = pd.read_csv(
+            io.BytesIO(repair.repaired_bytes),
+            encoding="utf-8", delimiter=delim,
+            dtype=str, keep_default_na=False, on_bad_lines="warn",
+        )
+        return analyze(df, repair_result=repair)
+    except pd.errors.EmptyDataError:
+        return _error_finding(
+            f"`{name}` could not be parsed — pandas reports no columns "
+            f"in the file. Original size was {len(data)} bytes. Open "
+            f"the file in a text editor to confirm the header row is "
+            f"present and uses the same delimiter as the data rows.",
+            fid="empty_data",
+        )
+    except Exception as e:
+        return _error_finding(
+            f"`{name}` could not be analyzed: {format_for_user(e)}",
+        )
 
 
 def findings_count_for_tool(tool_id: str) -> int: