From 28d046f42e4ba359f4e06985e89640d85336e76c Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Sat, 9 May 2026 11:46:45 -0700 Subject: [PATCH] fix: SMART overlay shows terminal states + reconciles orphans (1.0.0-49) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Long SMART column showed "—" while the Burn-In column showed "FAILED (LONG SMART)" — a clear contradiction. Two reasons: 1. The overlay query in _drives_helpers only fetched SMART stage data for burn-ins in ('running','queued') state. Failed/passed/cancelled jobs got their stage data filtered out, so the SMART columns went blank when you most wanted to see them. Removed the state filter so burn-ins in any state are overlaid. 2. A pre-busy-timeout `database is locked` failure mode (sdj job 5 from Mar 2026) left long_smart stage rows recorded as state='running' even though the parent job ended in state='failed'. The overlay now translates that orphan state at render time: if the parent job is failed/cancelled/unknown but the stage is still 'running', display the stage with the parent's terminal state (or as failed when the parent is 'unknown') so the column matches the Burn-In column. The translation is purely display-time; no DB writes. error_text falls back to the parent job's error_text when the stage's own is NULL and the stage is displayed as failed, so the operator sees what actually broke. --- app/config.py | 2 +- app/routes/_drives_helpers.py | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/app/config.py b/app/config.py index 318507e..f9816f2 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-48" + app_version: str = "1.0.0-49" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies.
Empty = generate diff --git a/app/routes/_drives_helpers.py b/app/routes/_drives_helpers.py index 2bf5cf8..1bbf0f4 100644 --- a/app/routes/_drives_helpers.py +++ b/app/routes/_drives_helpers.py @@ -147,11 +147,12 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]: # For burn-ins that include SMART stages, fetch those stages so we can # mirror their progress/result in the Short/Long SMART columns. + # We include burn-ins in ANY state — including failed/passed/cancelled — + # so the SMART columns don't go blank when the burn-in finishes. Without + # this, "FAILED (LONG SMART)" appears in the Burn-In column while the + # Long SMART column shows "—", which contradicts itself. bi_smart_stages: dict[int, dict[str, dict]] = {} # job_id -> {stage_name: row} - bi_ids_with_smart = [ - bi["id"] for bi in burnin_by_drive.values() - if bi["state"] in ("running", "queued") - ] + bi_ids_with_smart = [bi["id"] for bi in burnin_by_drive.values()] if bi_ids_with_smart: placeholders = ",".join("?" * len(bi_ids_with_smart)) # placeholders is purely structural ("?,?,?"); IDs themselves are @@ -163,7 +164,7 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]: "FROM burnin_stages bs " "WHERE bs.burnin_job_id IN (" + placeholders + ") " " AND bs.stage_name IN ('short_smart', 'long_smart') " - " AND bs.state IN ('running', 'passed', 'failed')" + " AND bs.state IN ('running', 'passed', 'failed', 'aborted')" ) cur = await db.execute(sql, bi_ids_with_smart) for r in await cur.fetchall(): @@ -185,14 +186,26 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]: if existing.get("state") not in (None, "idle"): continue pct = stage["percent"] or 0 + stage_state = stage["state"] + # If the parent burn-in ended in failure but this SMART + # stage is still recorded as "running", that's an + # orphaned stage row from a hard crash (e.g. the old + # `database is locked` failure mode). 
Surface as failed + # so the column matches the Burn-In column. + if stage_state == "running" and bi.get("state") in ( + "failed", "cancelled", "unknown" + ): + stage_state = bi["state"] if bi["state"] != "unknown" else "failed" d[target] = { - "state": stage["state"], - "percent": pct if stage["state"] == "running" else (100 if stage["state"] == "passed" else 0), - "eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage["state"] == "running" else None, + "state": stage_state, + "percent": pct if stage_state == "running" else (100 if stage_state == "passed" else 0), + "eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage_state == "running" else None, "eta_timestamp": None, "started_at": stage["started_at"], "finished_at": stage["finished_at"], - "error_text": stage["error_text"], + "error_text": stage["error_text"] or ( + bi.get("error_text") if stage_state == "failed" else None + ), } drives.append(d)