# nas-burnin/app/burnin/kill.py

"""Remote process kill machinery.

asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
OpenSSH's sshd ignores by default — the remote process keeps running and
``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.
The fix: capture the remote PID at command launch (via the
``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
over a fresh SSH session when we need to abort. This module owns that
state and the kill helper.

Public surface (used by the rest of the burnin package):
    set_remote_pid(job_id, pid)    — call from the stage when launch succeeds
    clear_remote_pid(job_id)       — call from the cleanup callback
    kill_remote_process(job_id)    — fire-and-clear; safe to call repeatedly
"""

from __future__ import annotations

import asyncio
import logging

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)


# Registry of job_id -> remote PID. Kept as a module-level dict so it
# survives across the stage / task / __init__ split without needing to
# thread it through function signatures.
# Entries are written by set_remote_pid() and removed by
# clear_remote_pid() and kill_remote_process().
_remote_pids: dict[int, int] = {}


def set_remote_pid(job_id: int, pid: int) -> None:
    """Remember ``pid`` as the remote process that belongs to ``job_id``.

    Any PID already stored for the same job is overwritten.
    """
    _remote_pids.update({job_id: pid})


def clear_remote_pid(job_id: int) -> None:
    """Forget the PID stored for ``job_id`` without signalling the process.

    Meant for the task cleanup callback, so that a job which finished
    normally leaves no stale entry behind. A job with no stored PID is
    ignored.
    """
    if job_id in _remote_pids:
        del _remote_pids[job_id]


def get_remote_pid(job_id: int) -> int | None:
    """Return the PID recorded for ``job_id``, or ``None`` if there is none."""
    try:
        return _remote_pids[job_id]
    except KeyError:
        return None


async def kill_remote_process(job_id: int) -> None:
    """Send ``kill -9`` to the remote PID associated with this job, if any.

    Idempotent — the PID is popped from the registry before the kill is
    attempted, so a second call for the same job is a no-op.

    The stored value is validated before it is interpolated into the
    shell command: it must be an int (or an integer string) and strictly
    positive. ``kill -9 -1`` would signal every process the SSH user is
    allowed to signal, and other negative values address a process group
    rather than a single process, so such values are logged and dropped
    instead of being sent.

    Any ``Exception`` raised while connecting or running the command
    (including the 10-second timeout on the command) is logged as a
    warning and never re-raised — we'd rather best-effort-kill than
    block the cancel path.

    Args:
        job_id: Key under which the remote PID was recorded.
    """
    stored = _remote_pids.pop(job_id, None)
    if not stored:
        # Nothing recorded (or a zero/empty placeholder): silent no-op.
        return

    # Only ints and integer strings are acceptable; bool is excluded
    # explicitly because it is an int subclass (True would become PID 1).
    pid = None
    if isinstance(stored, (int, str)) and not isinstance(stored, bool):
        try:
            pid = int(stored)
        except ValueError:
            pid = None
    if pid is None or pid <= 0:
        log.warning(
            "Refusing to remote-kill invalid PID %r for job %s", stored, job_id,
        )
        return

    try:
        # Local import to avoid pulling asyncssh into module load if
        # this helper is never used (tests, mock mode).
        from app import ssh_client
        # NOTE(review): only the command is bounded by the timeout below;
        # the connect itself relies on whatever ssh_client._connect()
        # enforces — confirm it cannot hang indefinitely.
        async with await ssh_client._connect() as conn:
            await asyncio.wait_for(
                conn.run(
                    f"kill -9 {pid} 2>/dev/null || true", check=False,
                ),
                timeout=10,
            )
        log.info("Remote-killed PID %d for job %d", pid, job_id)
    except Exception as exc:
        # %r rather than %s: a timeout error has an empty str(), which
        # would leave the log line without any cause.
        log.warning(
            "Failed to remote-kill PID %d for job %d: %r", pid, job_id, exc,
        )