feat: prominent failure-reason block + heuristic in drawer (1.0.0-50)
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run
Security scan / mypy (push) Waiting to run

When a stage ends in failed/cancelled/unknown the drawer now shows
a coloured "Reason" pill at the top of that stage's section. Three
sources, in order of preference:

1. stage.error_text (the canonical source, when set)
2. job.error_text (backfilled in the drawer endpoint when stage's
   own is empty — catches orphan rows from hard crashes like the
   pre-busy-timeout DB-locked failures)
3. Heuristic: if log_text is tiny (<500 characters, just the START
   banner) AND no real badblocks progress was recorded, label as
   "Stopped without recording an error — likely cause: SSH
   connection drop or container restart while this stage was
   running." This catches the fingerprint of a deploy-during-burn-in
   killing the SSH session.

Otherwise: "No error message recorded." so there's never a blank
where the operator expects to see why something broke.

Red styling for failed, yellow for cancelled/unknown. Replaces the
inline stage-error-line for terminal states; the existing
stage-error-line still renders for non-terminal contexts.
This commit is contained in:
Brandon Walter 2026-05-09 12:06:11 -07:00
parent 28d046f42e
commit 7f959e6f4c
4 changed files with 80 additions and 3 deletions

View file

@ -86,7 +86,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers) ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint # Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-49" app_version: str = "1.0.0-50"
# ---- Authentication (1.0.0-22) ---- # ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate # session_secret: HMAC key for signing session cookies. Empty = generate

View file

@ -62,7 +62,21 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id", "FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
(job_row["id"],), (job_row["id"],),
) )
job["stages"] = [dict(r) for r in await cur.fetchall()] stages = [dict(r) for r in await cur.fetchall()]
# Backfill empty stage.error_text from the parent job's error_text
# for any stage that ended in a terminal state without recording
# an error of its own. This catches the orphan pattern from hard
# crashes (DB-locked, SSH disconnect, container restart) where
# the failure didn't get to write a per-stage explanation.
job_err = job.get("error_text")
for s in stages:
if (
s.get("state") in ("failed", "cancelled", "unknown")
and not s.get("error_text")
and job_err
):
s["error_text"] = job_err
job["stages"] = stages
burnin_job = job burnin_job = job
# SMART raw output from smart_tests table # SMART raw output from smart_tests table

View file

@ -2896,3 +2896,46 @@ th.sort-desc::after {
opacity: 1; opacity: 1;
border-top: 5px solid var(--blue, #58a6ff); border-top: 5px solid var(--blue, #58a6ff);
} }
/* -----------------------------------------------------------------------
Stage "Reason" block explains why a stage ended in a terminal
state. Replaces the old single-line stage-error-line for
failed/cancelled/unknown stages so the operator gets a clear,
prominent explanation at the top.
----------------------------------------------------------------------- */
/* Base box for the per-stage "Reason" block: a bordered, rounded flex row
   whose children are aligned on their text baseline. Only the border width
   and style are set here; the state modifier classes supply the colour. */
.stage-reason {
  margin: 6px 0;
  padding: 8px 12px;
  border: 1px solid;
  border-radius: 5px;
  display: flex;
  align-items: baseline;
  gap: 10px;
  font-size: 12px;
}
/* Failed stages: red-tinted background and border. --red-bg / --red-bd win
   when the theme defines them; otherwise the tint is derived from --red.
   The inner var(--red) carries a literal fallback so the derived colour
   still resolves when --red itself is not defined (without it the whole
   declaration would be invalid at computed-value time). */
.stage-reason-failed {
  background: var(--red-bg, color-mix(in srgb, var(--red, #f85149) 12%, transparent));
  border-color: var(--red-bd, color-mix(in srgb, var(--red, #f85149) 40%, transparent));
}
/* Cancelled and unknown stages share a yellow-tinted background and border.
   --yellow-bg / --yellow-bd win when the theme defines them; otherwise the
   tint is derived from --yellow. The inner var(--yellow) carries a literal
   fallback so the derived colour still resolves when --yellow itself is
   not defined. */
.stage-reason-cancelled,
.stage-reason-unknown {
  background: var(--yellow-bg, color-mix(in srgb, var(--yellow, #d29922) 12%, transparent));
  border-color: var(--yellow-bd, color-mix(in srgb, var(--yellow, #d29922) 40%, transparent));
}
/* Caption inside the reason block: small, muted, upper-cased and
   letter-spaced. flex-shrink: 0 keeps it at its natural width so it is
   never squeezed by its flex siblings. */
.stage-reason-label {
  flex-shrink: 0;
  color: var(--text-muted);
  font-size: 10px;
  font-weight: 600;
  letter-spacing: 0.06em;
  text-transform: uppercase;
}
/* Message text inside the reason block; takes the remaining row width.
   Flex items default to min-width: auto, which stops them shrinking below
   their min-content width, and break-word does not reduce that width. A
   long unbroken token (path, hash, stack frame) would therefore push the
   text outside the block. min-width: 0 lets the item shrink so the
   break-word rule can actually wrap it. */
.stage-reason-text {
  flex: 1;
  min-width: 0;
  color: var(--text-strong, #f0f6fc);
  line-height: 1.4;
  /* word-wrap is the legacy alias; overflow-wrap is the standard name. */
  word-wrap: break-word;
  overflow-wrap: break-word;
}
/* State-specific text colour: red for failed stages, yellow for
   cancelled and unknown ones. */
.stage-reason-failed .stage-reason-text {
  color: var(--red, #f85149);
}
.stage-reason-cancelled .stage-reason-text,
.stage-reason-unknown .stage-reason-text {
  color: var(--yellow, #d29922);
}

View file

@ -1564,7 +1564,27 @@
html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>'; html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>';
} }
html += '</div>'; html += '</div>';
if (s.error_text) { // Prominent "Why it failed" block at the top of failed/cancelled/
// unknown stages. Falls back to a heuristic when no error was
// recorded — e.g. a tiny log + no badblocks progress + terminal
// state means the stage was killed externally (SSH disconnect or
// container restart) before it could record an error.
if (s.state === 'failed' || s.state === 'cancelled' || s.state === 'unknown') {
var reason = s.error_text;
if (!reason) {
var logLen = (s.log_text || '').length;
var noBbProgress = !s.bb_phase || (s.bb_phase === 1 && (parseFloat(s.bb_phase_pct || 0) < 0.1));
if (logLen < 500 && noBbProgress) {
reason = 'Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running.';
} else {
reason = 'No error message recorded.';
}
}
html += '<div class="stage-reason stage-reason-' + _esc(s.state) + '">';
html += '<span class="stage-reason-label">Reason</span>';
html += '<span class="stage-reason-text">' + _esc(reason) + '</span>';
html += '</div>';
} else if (s.error_text) {
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>'; html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
} }
// Per-pattern meters for badblocks surface_validate, plus the // Per-pattern meters for badblocks surface_validate, plus the