From 1bc1b378abaaed205ea222690a8496232541ec7a Mon Sep 17 00:00:00 2001
From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com>
Date: Sat, 9 May 2026 12:32:46 -0700
Subject: [PATCH] fix: cancel-mid-stage marks job 'unknown' not 'failed'
 (1.0.0-51)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Container restarts (uvicorn shutdown / 'docker compose up -d') were
silently classifying running burn-ins as 'failed' with empty
error_text. Two reasons converged:

1. _stage_surface_validate_ssh caught asyncio.CancelledError at the
   stage level and returned False, *swallowing* the cancel signal.
2. _run_job's outer CancelledError handler then never fired, so
   was_cancelled stayed False and the job got marked 'failed' (the
   "burn-in itself failed" classification) instead of 'unknown'
   (the honest "we don't know whether it would have passed").

Fix:
- Stage now does best-effort kill of remote badblocks (shielded so
  loop shutdown doesn't interrupt the kill), appends an [ABORTED]
  marker to the log, and re-raises CancelledError. _execute_stages
  doesn't catch it (since Python 3.8, CancelledError derives from
  BaseException rather than Exception), so it propagates up to _run_job.
- _run_job's existing CancelledError handler now also reconciles
  any stage rows still recorded as 'running' by setting them to
  'unknown' with a clear error_text: "Task cancelled mid-run —
  likely container restart or shutdown". The job's error_text gets
  the same message so the drawer's Reason block has something
  specific to display, instead of falling back to the heuristic.

Burn-ins interrupted by a container restart will now show as yellow
"UNKNOWN" with the explicit cancel reason, matching the existing
behaviour of check_stuck_jobs() for stuck timeouts.
---
 app/burnin/__init__.py | 23 ++++++++++++++++++++++-
 app/burnin/stages.py   | 16 +++++++++++++++-
 app/config.py          |  2 +-
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/app/burnin/__init__.py b/app/burnin/__init__.py
index 14ea800..47657b4 100644
--- a/app/burnin/__init__.py
+++ b/app/burnin/__init__.py
@@ -411,12 +411,33 @@ async def _run_job(job_id: int) -> None:
             final_state = "unknown"
         else:
             final_state = "passed" if success else "failed"
+        # If the asyncio task was cancelled mid-stage (container shutdown,
+        # uvicorn reload, etc.), CancelledError propagates past
+        # _execute_stages, so any running stage row is still marked
+        # 'running' in the DB. Reconcile here: mark every still-running
+        # stage on this job as 'unknown' with its own _now() finished_at,
+        # and stamp a default error_text so the drawer's Reason block has
+        # something concrete to show. Use a write that's idempotent under
+        # repeat (only touches rows still 'running').
+        cancel_err = (
+            "Task cancelled mid-run — likely container restart or shutdown"
+            if was_cancelled else None
+        )
         async with _db() as db:
             await db.execute("PRAGMA journal_mode=WAL")
             await db.execute(
                 "UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
-                (final_state, 100 if success else None, _now(), error_text, job_id),
+                (final_state, 100 if success else None, _now(),
+                 error_text or cancel_err, job_id),
             )
+            if was_cancelled:
+                await db.execute(
+                    """UPDATE burnin_stages
+                       SET state='unknown', finished_at=?,
+                           error_text=COALESCE(error_text, ?)
+                       WHERE burnin_job_id=? AND state='running'""",
+                    (_now(), cancel_err, job_id),
+                )
             await db.execute(
                 """INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message)
                    VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""",
diff --git a/app/burnin/stages.py b/app/burnin/stages.py
index 6921632..1eaa25e 100644
--- a/app/burnin/stages.py
+++ b/app/burnin/stages.py
@@ -655,7 +655,21 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
         result["aborted"] = bad_blocks_total > settings.bad_block_threshold
 
     except asyncio.CancelledError:
-        return False
+        # Best-effort kill of the remote badblocks process before
+        # propagating the cancel. asyncio.shield() so the kill attempt
+        # itself isn't interrupted by ongoing loop shutdown. Then
+        # re-raise so _run_job marks the job 'unknown' (honest about
+        # the indeterminate outcome) instead of 'failed' (which
+        # implies the burn-in itself failed, which we don't know).
+        try:
+            await asyncio.shield(kill.kill_remote_process(job_id))
+        except Exception:
+            pass
+        await _append_stage_log(
+            job_id, "surface_validate",
+            "\n[ABORTED] task cancelled (likely container restart or shutdown)\n",
+        )
+        raise
     except Exception as exc:
         await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
         await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
diff --git a/app/config.py b/app/config.py
index 2b16c34..be8a38a 100644
--- a/app/config.py
+++ b/app/config.py
@@ -86,7 +86,7 @@ class Settings(BaseSettings):
     ssh_key: str = ""             # PEM private key content (paste full key including headers)
 
     # Application version — used by the /api/v1/updates/check endpoint
-    app_version: str = "1.0.0-50"
+    app_version: str = "1.0.0-51"
 
     # ---- Authentication (1.0.0-22) ----
     # session_secret: HMAC key for signing session cookies. Empty = generate