diff --git a/app/burnin/__init__.py b/app/burnin/__init__.py index 14ea800..47657b4 100644 --- a/app/burnin/__init__.py +++ b/app/burnin/__init__.py @@ -411,12 +411,33 @@ async def _run_job(job_id: int) -> None: final_state = "unknown" else: final_state = "passed" if success else "failed" + # If the asyncio task was cancelled mid-stage (container shutdown, + # uvicorn reload, etc.), CancelledError propagates past + # _execute_stages, so any running stage row is still marked + # 'running' in the DB. Reconcile here: mark every still-running + # stage on this job as 'unknown' with the parent's finished_at, + # and stamp a default error_text so the drawer's Reason block has + # something concrete to show. Use a write that's idempotent under + # repeat (only touches rows still 'running'). + cancel_err = ( + "Task cancelled mid-run — likely container restart or shutdown" + if was_cancelled else None + ) async with _db() as db: await db.execute("PRAGMA journal_mode=WAL") await db.execute( "UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?", - (final_state, 100 if success else None, _now(), error_text, job_id), + (final_state, 100 if success else None, _now(), + error_text or cancel_err, job_id), ) + if was_cancelled: + await db.execute( + """UPDATE burnin_stages + SET state='unknown', finished_at=?, + error_text=COALESCE(error_text, ?) + WHERE burnin_job_id=? AND state='running'""", + (_now(), cancel_err, job_id), + ) await db.execute( """INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message) VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""", diff --git a/app/burnin/stages.py b/app/burnin/stages.py index 6921632..1eaa25e 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -655,7 +655,21 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) result["aborted"] = bad_blocks_total > settings.bad_block_threshold except asyncio.CancelledError: - return False + # Best-effort kill of the remote badblocks process before + # propagating the cancel. asyncio.shield() so the kill attempt + # itself isn't interrupted by ongoing loop shutdown. Then + # re-raise so _run_job marks the job 'unknown' (honest about + # the indeterminate outcome) instead of 'failed' (which + # implies the burn-in itself failed, which we don't know). + try: + await asyncio.shield(kill.kill_remote_process(job_id)) + except Exception: + pass + await _append_stage_log( + job_id, "surface_validate", + "\n[ABORTED] task cancelled (likely container restart or shutdown)\n", + ) + raise except Exception as exc: await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n") await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}") diff --git a/app/config.py b/app/config.py index 2b16c34..be8a38a 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-50" + app_version: str = "1.0.0-51" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate