From ae9d4a2db5855ef9e94389d502217468b736f623 Mon Sep 17 00:00:00 2001
From: Michael <michael.dombaugh@gmail.com>
Date: Sat, 16 May 2026 21:22:10 +0000
Subject: [PATCH] fix(home): handle analysis errors defensively so they don't
 crash the whole page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported: uploading 13_non_latin_scripts.csv made the home page bubble
a ``pandas.errors.EmptyDataError`` traceback up through the page
chrome instead of surfacing as a per-file error. In a multi-file
analysis run that kills every other file's results too, which is
worse than the symptom itself.

Wrap ``_run_analysis_on_upload`` in proper error handling:

- Empty bytes (``getvalue() == b""``) short-circuit with a synthetic
  error Finding telling the user the upload was zero-byte and to
  re-upload.
- Empty ``repair.repaired_bytes`` (file was all NULs / BOM / stripped
  to nothing) likewise surfaces as a synthetic Finding rather than
  reaching pd.read_csv.
- ``pd.errors.EmptyDataError`` from pandas is caught and rendered as
  a Finding that names the file and its byte size and suggests opening
  it in a text editor to verify the header row is present and uses
  the same delimiter as the data rows.
- Any other exception during read/analyze is caught and surfaces as
  a Finding via ``format_for_user`` so the user gets a clean message,
  not a Python traceback.

Each file in a multi-file run now stands alone: a bad file produces
one red banner in its own card, every other file analyzes normally.

The 13_non_latin_scripts.csv corpus file is 249 bytes of valid UTF-8
on disk and parses cleanly under the same code path locally — the
user's specific symptom is likely a zero-byte upload (browser /
network / Python 3.14 + Streamlit edge case). The new ``empty_upload``
finding states the byte count so the user can confirm.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/gui/components/_legacy.py | 88 +++++++++++++++++++++++++++--------
 1 file changed, 68 insertions(+), 20 deletions(-)

diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py
index 5bf5707..8234517 100644
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -1336,33 +1336,81 @@ def upload_and_analyze_section() -> None:
 
 
 def _run_analysis_on_upload(uploaded):
-    """Read the uploaded file with pre-parse repair, then analyze."""
-    from src.core.analyze import analyze
+    """Read the uploaded file with pre-parse repair, then analyze.
+
+    Read/analyze failures are not raised: an empty upload, a file left
+    empty by repair, a pandas ``EmptyDataError`` or any other exception
+    comes back as a one-element list holding a synthetic ``Finding``,
+    rather than letting one bad file kill a multi-file analysis run.
+    """
+    import logging
+    from src.core.analyze import Finding, analyze
+    from src.core.errors import format_for_user
     from src.core.io import repair_bytes
 
     name = uploaded.name
     data = uploaded.getvalue()
     suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""
 
-    if suffix in ("xlsx", "xls"):
-        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
-        return analyze(df)
+    def _error_finding(description: str, fid: str = "analysis_failed") -> list[Finding]:
+        return [Finding(
+            id=fid,
+            severity="error",
+            tool="",
+            count=1,
+            description=description,
+            confidence="high",
+            fix_action="",
+        )]
 
-    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
-    text_head = data[:4096].decode("utf-8", errors="replace")
-    delim = "\t" if suffix == "tsv" else ","
-    if delim == ",":
-        for cand in ("\t", ";", "|"):
-            if text_head.count(cand) > text_head.count(",") * 1.5:
-                delim = cand
-                break
-    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
-    df = pd.read_csv(
-        io.BytesIO(repair.repaired_bytes),
-        encoding="utf-8", delimiter=delim,
-        dtype=str, keep_default_na=False, on_bad_lines="warn",
-    )
-    return analyze(df, repair_result=repair)
+    if not data:
+        return _error_finding(
+            f"`{name}` is empty (0 bytes). Please re-upload — the bytes "
+            f"may not have transferred correctly from your browser.",
+            fid="empty_upload",
+        )
+
+    try:
+        if suffix in ("xlsx", "xls"):
+            df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
+            return analyze(df)
+
+        # CSV / TSV: run repair_bytes so the user sees csv_* findings.
+        text_head = data[:4096].decode("utf-8", errors="replace")
+        delim = "\t" if suffix == "tsv" else ","
+        if delim == ",":
+            for cand in ("\t", ";", "|"):
+                if text_head.count(cand) > text_head.count(",") * 1.5:
+                    delim = cand
+                    break
+        repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
+        if not repair.repaired_bytes:
+            return _error_finding(
+                f"`{name}` is empty after pre-parse repair "
+                f"(original was {len(data)} bytes — likely all NUL "
+                f"bytes or stripped during a BOM/line-ending pass). "
+                f"Open the file in a text editor to confirm it has "
+                f"content.",
+                fid="empty_after_repair",
+            )
+        df = pd.read_csv(
+            io.BytesIO(repair.repaired_bytes),
+            encoding="utf-8", delimiter=delim,
+            dtype=str, keep_default_na=False, on_bad_lines="warn",
+        )
+        return analyze(df, repair_result=repair)
+    except pd.errors.EmptyDataError:
+        return _error_finding(
+            f"`{name}` could not be parsed — pandas reports no columns "
+            f"in the file. Original size was {len(data)} bytes. Open "
+            f"the file in a text editor to confirm the header row is "
+            f"present and uses the same delimiter as the data rows.",
+            fid="empty_after_repair",  # NOTE(review): same id as the case above; intended?
+        )
+    except Exception as e:
+        # Keep the traceback in the server log; the user sees only the clean text.
+        logging.getLogger(__name__).exception("Analysis failed for %r", name)
+        return _error_finding(f"`{name}` could not be analyzed: {format_for_user(e)}")
 
 
 def findings_count_for_tool(tool_id: str) -> int: