fix: cancel-mid-stage marks job 'unknown' not 'failed' (1.0.0-51)
Container restarts (uvicorn shutdown / 'docker compose up -d') were silently classifying running burn-ins as 'failed' with empty error_text. Two causes converged:

1. _stage_surface_validate_ssh caught asyncio.CancelledError at the stage level and returned False, *swallowing* the cancel signal.
2. _run_job's outer CancelledError handler therefore never fired, so was_cancelled stayed False and the job was marked 'failed' (the "burn-in itself failed" classification) instead of 'unknown' (the honest "we don't know whether it would have passed").

Fix:

- The stage now makes a best-effort attempt to kill the remote badblocks process (shielded so loop shutdown doesn't interrupt the kill), appends an [ABORTED] marker to the log, and re-raises CancelledError. _execute_stages doesn't catch it (in Python 3.8+, CancelledError derives from BaseException, not Exception), so it propagates up to _run_job.
- _run_job's existing CancelledError handler now also reconciles any stage rows still recorded as 'running' by setting them to 'unknown' with a clear error_text: "Task cancelled mid-run — likely container restart or shutdown". The job's error_text gets the same message so the drawer's Reason block has something specific to display, instead of falling back to the heuristic.

Future container restarts during running burn-ins will now show as yellow "UNKNOWN" with the explicit cancel reason, matching the existing behaviour of check_stuck_jobs() for stuck timeouts.
This commit is contained in:
parent
7f959e6f4c
commit
1bc1b378ab
3 changed files with 38 additions and 3 deletions
|
|
@ -411,12 +411,33 @@ async def _run_job(job_id: int) -> None:
|
||||||
final_state = "unknown"
|
final_state = "unknown"
|
||||||
else:
|
else:
|
||||||
final_state = "passed" if success else "failed"
|
final_state = "passed" if success else "failed"
|
||||||
|
# If the asyncio task was cancelled mid-stage (container shutdown,
|
||||||
|
# uvicorn reload, etc.), CancelledError propagates past
|
||||||
|
# _execute_stages, so any running stage row is still marked
|
||||||
|
# 'running' in the DB. Reconcile here: mark every still-running
|
||||||
|
# stage on this job as 'unknown' with the parent's finished_at,
|
||||||
|
# and stamp a default error_text so the drawer's Reason block has
|
||||||
|
# something concrete to show. Use a write that's idempotent under
|
||||||
|
# repeat (only touches rows still 'running').
|
||||||
|
cancel_err = (
|
||||||
|
"Task cancelled mid-run — likely container restart or shutdown"
|
||||||
|
if was_cancelled else None
|
||||||
|
)
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
|
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
|
||||||
(final_state, 100 if success else None, _now(), error_text, job_id),
|
(final_state, 100 if success else None, _now(),
|
||||||
|
error_text or cancel_err, job_id),
|
||||||
)
|
)
|
||||||
|
if was_cancelled:
|
||||||
|
await db.execute(
|
||||||
|
"""UPDATE burnin_stages
|
||||||
|
SET state='unknown', finished_at=?,
|
||||||
|
error_text=COALESCE(error_text, ?)
|
||||||
|
WHERE burnin_job_id=? AND state='running'""",
|
||||||
|
(_now(), cancel_err, job_id),
|
||||||
|
)
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"""INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message)
|
"""INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message)
|
||||||
VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""",
|
VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""",
|
||||||
|
|
|
||||||
|
|
@ -655,7 +655,21 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
||||||
|
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
return False
|
# Best-effort kill of the remote badblocks process before
|
||||||
|
# propagating the cancel. asyncio.shield() so the kill attempt
|
||||||
|
# itself isn't interrupted by ongoing loop shutdown. Then
|
||||||
|
# re-raise so _run_job marks the job 'unknown' (honest about
|
||||||
|
# the indeterminate outcome) instead of 'failed' (which
|
||||||
|
# implies the burn-in itself failed, which we don't know).
|
||||||
|
try:
|
||||||
|
await asyncio.shield(kill.kill_remote_process(job_id))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, "surface_validate",
|
||||||
|
"\n[ABORTED] task cancelled (likely container restart or shutdown)\n",
|
||||||
|
)
|
||||||
|
raise
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
|
await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
|
||||||
await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
|
await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-50"
|
app_version: str = "1.0.0-51"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue