From 7f959e6f4cbaef1cd4b7257793aa2d6896b894ad Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Sat, 9 May 2026 12:06:11 -0700 Subject: [PATCH] feat: prominent failure-reason block + heuristic in drawer (1.0.0-50) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a stage ends in failed/cancelled/unknown the drawer now shows a coloured "Reason" pill at the top of that stage's section. Three sources, in order of preference: 1. stage.error_text (the canonical source, when set) 2. job.error_text (backfilled in the drawer endpoint when the stage's own error_text is empty — catches orphan rows from hard crashes like the pre-busy-timeout DB-locked failures) 3. Heuristic: if log_text is tiny (<500 characters, just the START banner) AND no real badblocks progress was recorded, label as "Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running." This catches the fingerprint of a deploy-during-burn-in killing the SSH session. Otherwise: "No error message recorded." so there's never a blank where the operator expects to see why something broke. Red styling for failed, yellow for cancelled/unknown. Replaces the inline stage-error-line for terminal states; the existing stage-error-line still renders for non-terminal contexts. 
--- app/config.py | 2 +- app/routes/drives.py | 16 +++++++++++++++- app/static/app.css | 43 +++++++++++++++++++++++++++++++++++++++++++ app/static/app.js | 22 +++++++++++++++++++++- 4 files changed, 80 insertions(+), 3 deletions(-) diff --git a/app/config.py b/app/config.py index f9816f2..2b16c34 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-49" + app_version: str = "1.0.0-50" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate diff --git a/app/routes/drives.py b/app/routes/drives.py index 9031a52..977eef8 100644 --- a/app/routes/drives.py +++ b/app/routes/drives.py @@ -62,7 +62,21 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db) "FROM burnin_stages WHERE burnin_job_id=? ORDER BY id", (job_row["id"],), ) - job["stages"] = [dict(r) for r in await cur.fetchall()] + stages = [dict(r) for r in await cur.fetchall()] + # Backfill empty stage.error_text from the parent job's error_text + # for any stage that ended in a terminal state without recording + # an error of its own. This catches the orphan pattern from hard + # crashes (DB-locked, SSH disconnect, container restart) where + # the failure didn't get to write a per-stage explanation. 
+ job_err = job.get("error_text") + for s in stages: + if ( + s.get("state") in ("failed", "cancelled", "unknown") + and not s.get("error_text") + and job_err + ): + s["error_text"] = job_err + job["stages"] = stages burnin_job = job # SMART raw output from smart_tests table diff --git a/app/static/app.css b/app/static/app.css index 1b325eb..cee9114 100644 --- a/app/static/app.css +++ b/app/static/app.css @@ -2896,3 +2896,46 @@ th.sort-desc::after { opacity: 1; border-top: 5px solid var(--blue, #58a6ff); } + +/* ----------------------------------------------------------------------- + Stage "Reason" block — explains why a stage ended in a terminal + state. Replaces the old single-line stage-error-line for + failed/cancelled/unknown stages so the operator gets a clear, + prominent explanation at the top. +----------------------------------------------------------------------- */ +.stage-reason { + display: flex; + gap: 10px; + align-items: baseline; + padding: 8px 12px; + margin: 6px 0; + border-radius: 5px; + font-size: 12px; + border: 1px solid; +} +.stage-reason-failed { + background: var(--red-bg, color-mix(in srgb, var(--red) 12%, transparent)); + border-color: var(--red-bd, color-mix(in srgb, var(--red) 40%, transparent)); +} +.stage-reason-cancelled, +.stage-reason-unknown { + background: var(--yellow-bg, color-mix(in srgb, var(--yellow) 12%, transparent)); + border-color: var(--yellow-bd, color-mix(in srgb, var(--yellow) 40%, transparent)); +} +.stage-reason-label { + font-size: 10px; + text-transform: uppercase; + letter-spacing: .06em; + font-weight: 600; + color: var(--text-muted); + flex-shrink: 0; +} +.stage-reason-text { + flex: 1; + color: var(--text-strong, #f0f6fc); + line-height: 1.4; + word-wrap: break-word; +} +.stage-reason-failed .stage-reason-text { color: var(--red, #f85149); } +.stage-reason-cancelled .stage-reason-text, +.stage-reason-unknown .stage-reason-text { color: var(--yellow, #d29922); } diff --git a/app/static/app.js 
b/app/static/app.js index 7207da1..ec4f82a 100644 --- a/app/static/app.js +++ b/app/static/app.js @@ -1564,7 +1564,27 @@ html += '' + _drawerFmtDuration(s.started_at, s.finished_at) + ''; } html += ''; - if (s.error_text) { + // Prominent "Why it failed" block at the top of failed/cancelled/ + // unknown stages. Falls back to a heuristic when no error was + // recorded — e.g. a tiny log + no badblocks progress + terminal + // state means the stage was killed externally (SSH disconnect or + // container restart) before it could record an error. + if (s.state === 'failed' || s.state === 'cancelled' || s.state === 'unknown') { + var reason = s.error_text; + if (!reason) { + var logLen = (s.log_text || '').length; + var noBbProgress = !s.bb_phase || (s.bb_phase === 1 && (parseFloat(s.bb_phase_pct || 0) < 0.1)); + if (logLen < 500 && noBbProgress) { + reason = 'Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running.'; + } else { + reason = 'No error message recorded.'; + } + } + html += '
'; + html += 'Reason'; + html += '' + _esc(reason) + ''; + html += '
'; + } else if (s.error_text) { html += '
' + _esc(s.error_text) + '
'; } // Per-pattern meters for badblocks surface_validate, plus the