From 1bc1b378abaaed205ea222690a8496232541ec7a Mon Sep 17 00:00:00 2001
From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com>
Date: Sat, 9 May 2026 12:32:46 -0700
Subject: [PATCH] fix: cancel-mid-stage marks job 'unknown' not 'failed' (1.0.0-51)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Container restarts (uvicorn shutdown / 'docker compose up -d') were
silently classifying running burn-ins as 'failed' with empty error_text.
Two reasons converged:

1. _stage_surface_validate_ssh caught asyncio.CancelledError at the
   stage level and returned False, *swallowing* the cancel signal.
2. _run_job's outer CancelledError handler then never fired, so
   was_cancelled stayed False and the job got marked 'failed' (the
   "burn-in itself failed" classification) instead of 'unknown' (the
   honest "we don't know whether it would have passed").

Fix:

- Stage now does best-effort kill of remote badblocks (shielded so loop
  shutdown doesn't interrupt the kill), appends an [ABORTED] marker to
  the log, and re-raises CancelledError. _execute_stages doesn't catch
  it (since Python 3.8, asyncio.CancelledError derives from
  BaseException rather than Exception) so it propagates up to _run_job.
- _run_job's existing CancelledError handler now also reconciles any
  stage rows still recorded as 'running' by setting them to 'unknown'
  with a clear error_text: "Task cancelled mid-run — likely container
  restart or shutdown". The job's error_text gets the same message so
  the drawer's Reason block has something specific to display, instead
  of falling back to the heuristic.

Future container restarts on running burn-ins will now show as yellow
"UNKNOWN" with the explicit cancel reason, matching the existing
behaviour of check_stuck_jobs() for stuck timeouts.
--- app/burnin/__init__.py | 23 ++++++++++++++++++++++- app/burnin/stages.py | 16 +++++++++++++++- app/config.py | 2 +- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/app/burnin/__init__.py b/app/burnin/__init__.py index 14ea800..47657b4 100644 --- a/app/burnin/__init__.py +++ b/app/burnin/__init__.py @@ -411,12 +411,33 @@ async def _run_job(job_id: int) -> None: final_state = "unknown" else: final_state = "passed" if success else "failed" + # If the asyncio task was cancelled mid-stage (container shutdown, + # uvicorn reload, etc.), CancelledError propagates past + # _execute_stages, so any running stage row is still marked + # 'running' in the DB. Reconcile here: mark every still-running + # stage on this job as 'unknown' with the parent's finished_at, + # and stamp a default error_text so the drawer's Reason block has + # something concrete to show. Use a write that's idempotent under + # repeat (only touches rows still 'running'). + cancel_err = ( + "Task cancelled mid-run — likely container restart or shutdown" + if was_cancelled else None + ) async with _db() as db: await db.execute("PRAGMA journal_mode=WAL") await db.execute( "UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?", - (final_state, 100 if success else None, _now(), error_text, job_id), + (final_state, 100 if success else None, _now(), + error_text or cancel_err, job_id), ) + if was_cancelled: + await db.execute( + """UPDATE burnin_stages + SET state='unknown', finished_at=?, + error_text=COALESCE(error_text, ?) + WHERE burnin_job_id=? 
AND state='running'""", + (_now(), cancel_err, job_id), + ) await db.execute( """INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message) VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""", diff --git a/app/burnin/stages.py b/app/burnin/stages.py index 6921632..1eaa25e 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -655,7 +655,21 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) result["aborted"] = bad_blocks_total > settings.bad_block_threshold except asyncio.CancelledError: - return False + # Best-effort kill of the remote badblocks process before + # propagating the cancel. asyncio.shield() so the kill attempt + # itself isn't interrupted by ongoing loop shutdown. Then + # re-raise so _run_job marks the job 'unknown' (honest about + # the indeterminate outcome) instead of 'failed' (which + # implies the burn-in itself failed, which we don't know). + try: + await asyncio.shield(kill.kill_remote_process(job_id)) + except Exception: + pass + await _append_stage_log( + job_id, "surface_validate", + "\n[ABORTED] task cancelled (likely container restart or shutdown)\n", + ) + raise except Exception as exc: await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n") await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}") diff --git a/app/config.py b/app/config.py index 2b16c34..be8a38a 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-50" + app_version: str = "1.0.0-51" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate