feat: prominent failure-reason block + heuristic in drawer (1.0.0-50)
When a stage ends in failed/cancelled/unknown, the drawer now shows a coloured "Reason" pill at the top of that stage's section. The reason text comes from three sources, in order of preference:
1. stage.error_text (the canonical source, when set).
2. job.error_text (backfilled in the drawer endpoint when the stage's own error_text is empty — catches orphan rows from hard crashes like the pre-busy-timeout DB-locked failures).
3. A heuristic: if log_text is tiny (<500 characters, just the START banner) AND no real badblocks progress was recorded, the stage is labelled "Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running." This catches the fingerprint of a deploy-during-burn-in killing the SSH session.
If none of these applies, the pill reads "No error message recorded." so there's never a blank where the operator expects to see why something broke. Red styling for failed, yellow for cancelled/unknown. Replaces the inline stage-error-line for terminal states; the existing stage-error-line still renders for non-terminal contexts.
This commit is contained in:
parent
28d046f42e
commit
7f959e6f4c
4 changed files with 80 additions and 3 deletions
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-49"
|
app_version: str = "1.0.0-50"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
|
|
@ -62,7 +62,21 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
||||||
(job_row["id"],),
|
(job_row["id"],),
|
||||||
)
|
)
|
||||||
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
stages = [dict(r) for r in await cur.fetchall()]
|
||||||
|
# Backfill empty stage.error_text from the parent job's error_text
|
||||||
|
# for any stage that ended in a terminal state without recording
|
||||||
|
# an error of its own. This catches the orphan pattern from hard
|
||||||
|
# crashes (DB-locked, SSH disconnect, container restart) where
|
||||||
|
# the failure didn't get to write a per-stage explanation.
|
||||||
|
job_err = job.get("error_text")
|
||||||
|
for s in stages:
|
||||||
|
if (
|
||||||
|
s.get("state") in ("failed", "cancelled", "unknown")
|
||||||
|
and not s.get("error_text")
|
||||||
|
and job_err
|
||||||
|
):
|
||||||
|
s["error_text"] = job_err
|
||||||
|
job["stages"] = stages
|
||||||
burnin_job = job
|
burnin_job = job
|
||||||
|
|
||||||
# SMART raw output from smart_tests table
|
# SMART raw output from smart_tests table
|
||||||
|
|
|
||||||
|
|
@ -2896,3 +2896,46 @@ th.sort-desc::after {
|
||||||
opacity: 1;
|
opacity: 1;
|
||||||
border-top: 5px solid var(--blue, #58a6ff);
|
border-top: 5px solid var(--blue, #58a6ff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Stage "Reason" block — explains why a stage ended in a terminal
|
||||||
|
state. Replaces the old single-line stage-error-line for
|
||||||
|
failed/cancelled/unknown stages so the operator gets a clear,
|
||||||
|
prominent explanation at the top.
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.stage-reason {
|
||||||
|
display: flex;
|
||||||
|
gap: 10px;
|
||||||
|
align-items: baseline;
|
||||||
|
padding: 8px 12px;
|
||||||
|
margin: 6px 0;
|
||||||
|
border-radius: 5px;
|
||||||
|
font-size: 12px;
|
||||||
|
border: 1px solid;
|
||||||
|
}
|
||||||
|
.stage-reason-failed {
|
||||||
|
background: var(--red-bg, color-mix(in srgb, var(--red) 12%, transparent));
|
||||||
|
border-color: var(--red-bd, color-mix(in srgb, var(--red) 40%, transparent));
|
||||||
|
}
|
||||||
|
.stage-reason-cancelled,
|
||||||
|
.stage-reason-unknown {
|
||||||
|
background: var(--yellow-bg, color-mix(in srgb, var(--yellow) 12%, transparent));
|
||||||
|
border-color: var(--yellow-bd, color-mix(in srgb, var(--yellow) 40%, transparent));
|
||||||
|
}
|
||||||
|
.stage-reason-label {
|
||||||
|
font-size: 10px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: .06em;
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--text-muted);
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.stage-reason-text {
|
||||||
|
flex: 1;
|
||||||
|
color: var(--text-strong, #f0f6fc);
|
||||||
|
line-height: 1.4;
|
||||||
|
word-wrap: break-word;
|
||||||
|
}
|
||||||
|
.stage-reason-failed .stage-reason-text { color: var(--red, #f85149); }
|
||||||
|
.stage-reason-cancelled .stage-reason-text,
|
||||||
|
.stage-reason-unknown .stage-reason-text { color: var(--yellow, #d29922); }
|
||||||
|
|
|
||||||
|
|
@ -1564,7 +1564,27 @@
|
||||||
html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>';
|
html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>';
|
||||||
}
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
if (s.error_text) {
|
// Prominent "Why it failed" block at the top of failed/cancelled/
|
||||||
|
// unknown stages. Falls back to a heuristic when no error was
|
||||||
|
// recorded — e.g. a tiny log + no badblocks progress + terminal
|
||||||
|
// state means the stage was killed externally (SSH disconnect or
|
||||||
|
// container restart) before it could record an error.
|
||||||
|
if (s.state === 'failed' || s.state === 'cancelled' || s.state === 'unknown') {
|
||||||
|
var reason = s.error_text;
|
||||||
|
if (!reason) {
|
||||||
|
var logLen = (s.log_text || '').length;
|
||||||
|
var noBbProgress = !s.bb_phase || (s.bb_phase === 1 && (parseFloat(s.bb_phase_pct || 0) < 0.1));
|
||||||
|
if (logLen < 500 && noBbProgress) {
|
||||||
|
reason = 'Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running.';
|
||||||
|
} else {
|
||||||
|
reason = 'No error message recorded.';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
html += '<div class="stage-reason stage-reason-' + _esc(s.state) + '">';
|
||||||
|
html += '<span class="stage-reason-label">Reason</span>';
|
||||||
|
html += '<span class="stage-reason-text">' + _esc(reason) + '</span>';
|
||||||
|
html += '</div>';
|
||||||
|
} else if (s.error_text) {
|
||||||
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
||||||
}
|
}
|
||||||
// Per-pattern meters for badblocks surface_validate, plus the
|
// Per-pattern meters for badblocks surface_validate, plus the
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue