fix: SMART overlay shows terminal states + reconciles orphans (1.0.0-49)
The Long SMART column showed "—" while the Burn-In column showed
"FAILED (LONG SMART)" — a clear contradiction. There were two causes:
1. The overlay query in _drives_helpers only fetched SMART stage
data for burn-ins in ('running','queued') state. Failed/passed/
cancelled jobs got their stage data filtered out, so the SMART
columns went blank when you most wanted to see them. Removed
the job-state filter so burn-ins in every state are overlaid, and
added 'aborted' to the stage states the overlay query returns.
2. A pre-busy-timeout `database is locked` failure mode (sdj job 5
from Mar 2026) left long_smart stage rows recorded as state=
'running' even though the parent job ended in state='failed'.
The overlay now translates that orphan state at render time:
if the parent job is failed/cancelled/unknown but the stage is
still 'running', display the stage with the parent's terminal
state (a parent in state 'unknown' is shown as 'failed') so the
column matches the Burn-In column.
The translation is purely display-time; no DB writes. When a stage
is displayed as failed and its own error_text is empty, error_text
falls back to the parent job's error_text, so the operator sees what
actually broke.
This commit is contained in:
parent
f5c6b85402
commit
28d046f42e
2 changed files with 23 additions and 10 deletions
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-48"
|
app_version: str = "1.0.0-49"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
|
|
@ -147,11 +147,12 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
||||||
|
|
||||||
# For burn-ins that include SMART stages, fetch those stages so we can
|
# For burn-ins that include SMART stages, fetch those stages so we can
|
||||||
# mirror their progress/result in the Short/Long SMART columns.
|
# mirror their progress/result in the Short/Long SMART columns.
|
||||||
|
# We include burn-ins in ANY state — including failed/passed/cancelled —
|
||||||
|
# so the SMART columns don't go blank when the burn-in finishes. Without
|
||||||
|
# this, "FAILED (LONG SMART)" appears in the Burn-In column while the
|
||||||
|
# Long SMART column shows "—", which contradicts itself.
|
||||||
bi_smart_stages: dict[int, dict[str, dict]] = {} # job_id -> {stage_name: row}
|
bi_smart_stages: dict[int, dict[str, dict]] = {} # job_id -> {stage_name: row}
|
||||||
bi_ids_with_smart = [
|
bi_ids_with_smart = [bi["id"] for bi in burnin_by_drive.values()]
|
||||||
bi["id"] for bi in burnin_by_drive.values()
|
|
||||||
if bi["state"] in ("running", "queued")
|
|
||||||
]
|
|
||||||
if bi_ids_with_smart:
|
if bi_ids_with_smart:
|
||||||
placeholders = ",".join("?" * len(bi_ids_with_smart))
|
placeholders = ",".join("?" * len(bi_ids_with_smart))
|
||||||
# placeholders is purely structural ("?,?,?"); IDs themselves are
|
# placeholders is purely structural ("?,?,?"); IDs themselves are
|
||||||
|
|
@ -163,7 +164,7 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
||||||
"FROM burnin_stages bs "
|
"FROM burnin_stages bs "
|
||||||
"WHERE bs.burnin_job_id IN (" + placeholders + ") "
|
"WHERE bs.burnin_job_id IN (" + placeholders + ") "
|
||||||
" AND bs.stage_name IN ('short_smart', 'long_smart') "
|
" AND bs.stage_name IN ('short_smart', 'long_smart') "
|
||||||
" AND bs.state IN ('running', 'passed', 'failed')"
|
" AND bs.state IN ('running', 'passed', 'failed', 'aborted')"
|
||||||
)
|
)
|
||||||
cur = await db.execute(sql, bi_ids_with_smart)
|
cur = await db.execute(sql, bi_ids_with_smart)
|
||||||
for r in await cur.fetchall():
|
for r in await cur.fetchall():
|
||||||
|
|
@ -185,14 +186,26 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
||||||
if existing.get("state") not in (None, "idle"):
|
if existing.get("state") not in (None, "idle"):
|
||||||
continue
|
continue
|
||||||
pct = stage["percent"] or 0
|
pct = stage["percent"] or 0
|
||||||
|
stage_state = stage["state"]
|
||||||
|
# If the parent burn-in ended in failure but this SMART
|
||||||
|
# stage is still recorded as "running", that's an
|
||||||
|
# orphaned stage row from a hard crash (e.g. the old
|
||||||
|
# `database is locked` failure mode). Surface as failed
|
||||||
|
# so the column matches the Burn-In column.
|
||||||
|
if stage_state == "running" and bi.get("state") in (
|
||||||
|
"failed", "cancelled", "unknown"
|
||||||
|
):
|
||||||
|
stage_state = bi["state"] if bi["state"] != "unknown" else "failed"
|
||||||
d[target] = {
|
d[target] = {
|
||||||
"state": stage["state"],
|
"state": stage_state,
|
||||||
"percent": pct if stage["state"] == "running" else (100 if stage["state"] == "passed" else 0),
|
"percent": pct if stage_state == "running" else (100 if stage_state == "passed" else 0),
|
||||||
"eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage["state"] == "running" else None,
|
"eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage_state == "running" else None,
|
||||||
"eta_timestamp": None,
|
"eta_timestamp": None,
|
||||||
"started_at": stage["started_at"],
|
"started_at": stage["started_at"],
|
||||||
"finished_at": stage["finished_at"],
|
"finished_at": stage["finished_at"],
|
||||||
"error_text": stage["error_text"],
|
"error_text": stage["error_text"] or (
|
||||||
|
bi.get("error_text") if stage_state == "failed" else None
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
drives.append(d)
|
drives.append(d)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue