"""Remote process kill machinery.

asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
OpenSSH's sshd ignores by default — the remote process keeps running and
``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.

The fix: capture the remote PID at command launch (via the
``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
over a fresh SSH session when we need to abort. This module owns that
state and the kill helper.

Public surface (used by the rest of the burnin package):
    set_remote_pid(job_id, pid)   — call from the stage when launch succeeds
    clear_remote_pid(job_id)      — call from the cleanup callback
    get_remote_pid(job_id)        — read-only lookup; None when nothing is recorded
    kill_remote_process(job_id)   — fire-and-clear; safe to call repeatedly
"""

from __future__ import annotations

import asyncio
import logging

log = logging.getLogger(__name__)

# job_id -> remote PID. Module-level dict so it survives across the
# stage / task / __init__ split without needing to thread it through
# function signatures.
_remote_pids: dict[int, int] = {}

# Seconds allowed for the remote ``kill`` command itself once connected.
_KILL_RUN_TIMEOUT_S = 10

# Seconds allowed for the whole round trip (connect + run + close). The
# connect helper is defined elsewhere; bounding it here guarantees the
# cancel path cannot stall even if that helper has no timeout of its own.
_KILL_TOTAL_TIMEOUT_S = 30


def set_remote_pid(job_id: int, pid: int) -> None:
    """Record the remote PID captured by the running stage.

    A later call for the same ``job_id`` overwrites the earlier PID.
    """
    _remote_pids[job_id] = pid


def clear_remote_pid(job_id: int) -> None:
    """Drop the PID without trying to kill.

    Used by the task cleanup callback so a normally-completed job doesn't
    carry stale state. Unknown job ids are ignored.
    """
    _remote_pids.pop(job_id, None)


def get_remote_pid(job_id: int) -> int | None:
    """Return the PID recorded for ``job_id``, or ``None`` if there is none."""
    return _remote_pids.get(job_id)


async def _send_remote_kill(pid: int) -> None:
    """Open a fresh SSH session and send SIGKILL to ``pid`` on the remote."""
    # Local import to avoid pulling asyncssh into module load if
    # this helper is never used (tests, mock mode).
    from app import ssh_client

    async with await ssh_client._connect() as conn:
        await asyncio.wait_for(
            conn.run(
                # ``|| true`` + check=False: an already-dead process is
                # not an error for our purposes.
                f"kill -9 {pid} 2>/dev/null || true",
                check=False,
            ),
            timeout=_KILL_RUN_TIMEOUT_S,
        )


async def kill_remote_process(job_id: int) -> None:
    """Send ``kill -9`` to the remote PID associated with this job, if any.

    Idempotent — pops the PID before attempting the kill so a second call
    is a no-op. SSH connection failure (including a timeout while
    connecting, running, or closing) is logged but never raised: we'd
    rather best-effort-kill than block the cancel path.

    A recorded PID that is not a positive integer is refused with a
    warning instead of being sent to the remote shell: ``kill -9 0`` and
    ``kill -9 -1`` address whole process groups / every process the user
    owns, and a non-numeric value must never reach a shell command line.
    """
    raw_pid = _remote_pids.pop(job_id, None)
    if raw_pid is None:
        return

    # Coerce before interpolating into the shell command so only a plain
    # integer can ever appear in it.
    try:
        pid = int(raw_pid)
    except (TypeError, ValueError):
        pid = -1
    if pid <= 0:
        log.warning(
            "Refusing to remote-kill invalid PID %r for job %d",
            raw_pid, job_id,
        )
        return

    try:
        await asyncio.wait_for(
            _send_remote_kill(pid), timeout=_KILL_TOTAL_TIMEOUT_S
        )
    except Exception as exc:
        # Timeout errors stringify to "", so fall back to the class name
        # to keep the log line informative.
        reason = str(exc) or type(exc).__name__
        log.warning(
            "Failed to remote-kill PID %d for job %d: %s",
            pid, job_id, reason,
        )
    else:
        log.info("Remote-killed PID %d for job %d", pid, job_id)