feat: prominent failure-reason block + heuristic in drawer (1.0.0-50)
When a stage ends in the failed, cancelled, or unknown state, the drawer now shows a coloured "Reason" block at the top of that stage's section. The reason text comes from three sources, in order of preference: 1. stage.error_text (the canonical source, when set). 2. job.error_text (backfilled in the drawer endpoint when the stage's own error_text is empty and the job has one — catches orphan rows from hard crashes like the pre-busy-timeout DB-locked failures). 3. A heuristic: if log_text is tiny (under 500 characters, i.e. just the START banner) AND no real badblocks progress was recorded (bb_phase unset, or still in phase 1 at under 0.1%), the reason reads "Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running." This catches the fingerprint of a deploy-during-burn-in killing the SSH session. Otherwise the reason reads "No error message recorded." so there's never a blank where the operator expects to see why something broke. Red styling is used for failed, yellow for cancelled/unknown. The new block replaces the inline stage-error-line for stages in these terminal states; the existing stage-error-line still renders for stages in any other state that carry an error_text.
This commit is contained in:
parent
28d046f42e
commit
7f959e6f4c
4 changed files with 80 additions and 3 deletions
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-49"
|
||||
app_version: str = "1.0.0-50"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
|
|
@ -62,7 +62,21 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
|||
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
||||
(job_row["id"],),
|
||||
)
|
||||
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
||||
stages = [dict(r) for r in await cur.fetchall()]
|
||||
# Backfill empty stage.error_text from the parent job's error_text
|
||||
# for any stage that ended in a terminal state without recording
|
||||
# an error of its own. This catches the orphan pattern from hard
|
||||
# crashes (DB-locked, SSH disconnect, container restart) where
|
||||
# the failure didn't get to write a per-stage explanation.
|
||||
job_err = job.get("error_text")
|
||||
for s in stages:
|
||||
if (
|
||||
s.get("state") in ("failed", "cancelled", "unknown")
|
||||
and not s.get("error_text")
|
||||
and job_err
|
||||
):
|
||||
s["error_text"] = job_err
|
||||
job["stages"] = stages
|
||||
burnin_job = job
|
||||
|
||||
# SMART raw output from smart_tests table
|
||||
|
|
|
|||
|
|
@ -2896,3 +2896,46 @@ th.sort-desc::after {
|
|||
opacity: 1;
|
||||
border-top: 5px solid var(--blue, #58a6ff);
|
||||
}
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Stage "Reason" block — explains why a stage ended in a terminal
|
||||
state. Replaces the old single-line stage-error-line for
|
||||
failed/cancelled/unknown stages so the operator gets a clear,
|
||||
prominent explanation at the top.
|
||||
----------------------------------------------------------------------- */
|
||||
.stage-reason {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
align-items: baseline;
|
||||
padding: 8px 12px;
|
||||
margin: 6px 0;
|
||||
border-radius: 5px;
|
||||
font-size: 12px;
|
||||
border: 1px solid;
|
||||
}
|
||||
.stage-reason-failed {
|
||||
background: var(--red-bg, color-mix(in srgb, var(--red) 12%, transparent));
|
||||
border-color: var(--red-bd, color-mix(in srgb, var(--red) 40%, transparent));
|
||||
}
|
||||
.stage-reason-cancelled,
|
||||
.stage-reason-unknown {
|
||||
background: var(--yellow-bg, color-mix(in srgb, var(--yellow) 12%, transparent));
|
||||
border-color: var(--yellow-bd, color-mix(in srgb, var(--yellow) 40%, transparent));
|
||||
}
|
||||
.stage-reason-label {
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .06em;
|
||||
font-weight: 600;
|
||||
color: var(--text-muted);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.stage-reason-text {
|
||||
flex: 1;
|
||||
color: var(--text-strong, #f0f6fc);
|
||||
line-height: 1.4;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
.stage-reason-failed .stage-reason-text { color: var(--red, #f85149); }
|
||||
.stage-reason-cancelled .stage-reason-text,
|
||||
.stage-reason-unknown .stage-reason-text { color: var(--yellow, #d29922); }
|
||||
|
|
|
|||
|
|
@ -1564,7 +1564,27 @@
|
|||
html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>';
|
||||
}
|
||||
html += '</div>';
|
||||
if (s.error_text) {
|
||||
// Prominent "Reason" block at the top of failed/cancelled/
|
||||
// unknown stages. Falls back to a heuristic when no error was
|
||||
// recorded — e.g. a tiny log + no badblocks progress + terminal
|
||||
// state means the stage was killed externally (SSH disconnect or
|
||||
// container restart) before it could record an error.
|
||||
if (s.state === 'failed' || s.state === 'cancelled' || s.state === 'unknown') {
|
||||
var reason = s.error_text;
|
||||
if (!reason) {
|
||||
var logLen = (s.log_text || '').length;
|
||||
var noBbProgress = !s.bb_phase || (s.bb_phase === 1 && (parseFloat(s.bb_phase_pct || 0) < 0.1));
|
||||
if (logLen < 500 && noBbProgress) {
|
||||
reason = 'Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running.';
|
||||
} else {
|
||||
reason = 'No error message recorded.';
|
||||
}
|
||||
}
|
||||
html += '<div class="stage-reason stage-reason-' + _esc(s.state) + '">';
|
||||
html += '<span class="stage-reason-label">Reason</span>';
|
||||
html += '<span class="stage-reason-text">' + _esc(reason) + '</span>';
|
||||
html += '</div>';
|
||||
} else if (s.error_text) {
|
||||
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
||||
}
|
||||
// Per-pattern meters for badblocks surface_validate, plus the
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue