diff --git a/app/burnin.py b/app/burnin/__init__.py similarity index 87% rename from app/burnin.py rename to app/burnin/__init__.py index 743a33f..62e869c 100644 --- a/app/burnin.py +++ b/app/burnin/__init__.py @@ -72,11 +72,29 @@ _client: TrueNASClient | None = None # cancel_job / check_stuck_jobs can actually unwedge a stuck task. _active_tasks: dict[int, "asyncio.Task"] = {} -# Remote PID of any long-running SSH child process (currently only badblocks) -# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends -# a "signal" channel request that OpenSSH sshd ignores by default, leaving -# the remote process running and proc.wait() hanging forever. -_remote_pids: dict[int, int] = {} +# Remote-PID kill machinery + pool-drive unlock state both live in their +# own submodules. We re-export the names the rest of the app reaches for +# (and keep the _kill_remote_process / _is_unlocked aliases for callers +# that grew up before the split). +from . import kill as _kill # noqa: E402 +from . import unlock as _unlock # noqa: E402 + +_remote_pids = _kill._remote_pids +_unlock_grants = _unlock._unlock_grants + +PoolMemberError = _unlock.PoolMemberError +UNLOCK_TTL_SECONDS = _unlock.UNLOCK_TTL_SECONDS +BOOT_POOL_NAME = _unlock.BOOT_POOL_NAME +BOOT_POOL_CONFIRM_TOKEN = _unlock.BOOT_POOL_CONFIRM_TOKEN +EXPORTED_POOL_ROLE = _unlock.EXPORTED_POOL_ROLE +EXPORTED_CONFIRM_TOKEN = _unlock.EXPORTED_CONFIRM_TOKEN +MOUNTED_ROLE = _unlock.MOUNTED_ROLE +MOUNTED_CONFIRM_TOKEN = _unlock.MOUNTED_CONFIRM_TOKEN + +unlock_expiry = _unlock.unlock_expiry +grant_pool_unlock = _unlock.grant_pool_unlock +_is_unlocked = _unlock.is_unlocked # legacy private name +_kill_remote_process = _kill.kill_remote_process def _now() -> str: @@ -145,208 +163,8 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task": return task -async def _kill_remote_process(job_id: int) -> None: - """Send kill -9 to the remote PID associated with this job, if any. - - asyncssh's proc.kill() sends an SSH 'signal' channel request which - OpenSSH's sshd does not honor by default. Opening a fresh session and - running /bin/kill is the reliable way to actually terminate the process. - """ - pid = _remote_pids.pop(job_id, None) - if not pid: - return - try: - from app import ssh_client - async with await ssh_client._connect() as conn: - await asyncio.wait_for( - conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False), - timeout=10, - ) - log.info("Remote-killed PID %d for job %d", pid, job_id) - except Exception as exc: - log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc) - - -# --------------------------------------------------------------------------- -# Pool-drive unlock state -# --------------------------------------------------------------------------- -# -# Drives that ZFS reports as belonging to an active zpool (including the -# boot pool) are locked from burn-in until the operator explicitly unlocks -# them via POST /api/v1/drives/{id}/unlock. Grants live in memory only — -# a container restart wipes them, which is the right default for "this is -# very dangerous." TTL is bounded so an unlock you forgot about can't sit -# armed indefinitely. - -import time as _time -from dataclasses import dataclass - -UNLOCK_TTL_SECONDS = 600 # 10 minutes -BOOT_POOL_NAME = "boot-pool" -BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL" -EXPORTED_POOL_ROLE = "exported" -EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL" -MOUNTED_ROLE = "mounted" -MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM" - - -@dataclass -class _UnlockGrant: - """An operator-issued, time-bounded permission to burn-in a pool drive. - - The grant is BOUND to the (pool_name, pool_role) observed at unlock - time. If a subsequent poll reclassifies the drive — e.g. it was - "(exported)" when unlocked but is now in active pool "tank", or it - used to be a cache vdev and now shows as data — the grant is - invalidated. Otherwise the operator's "I confirm this exported drive - is decommissioned" judgement would silently authorise destruction - of a live pool. - """ - expiry: float - pool_name: str - pool_role: str | None - - -_unlock_grants: dict[int, _UnlockGrant] = {} - - -class PoolMemberError(Exception): - """Raised by start_job when a drive is in a zpool and not unlocked.""" - def __init__(self, drive_id: int, pool_name: str, pool_role: str | None): - self.drive_id = drive_id - self.pool_name = pool_name - self.pool_role = pool_role - is_boot = pool_name == BOOT_POOL_NAME - super().__init__( - f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} " - f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. " - f"Unlock required before burn-in." - ) - - -def _is_unlocked(drive_id: int, current_pool_name: str | None, - current_pool_role: str | None) -> bool: - """True iff a non-expired grant exists AND the drive's pool identity - matches what was observed at unlock time.""" - grant = _unlock_grants.get(drive_id) - if grant is None: - return False - if _time.time() >= grant.expiry: - _unlock_grants.pop(drive_id, None) - return False - if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: - # Pool identity changed since unlock — drive may now belong to a - # different (or live) pool. Invalidate the grant; operator must - # re-unlock with eyes-open against the current state. - _unlock_grants.pop(drive_id, None) - log.warning( - "Invalidating unlock grant for drive_id=%d: pool changed from " - "(%s, %s) to (%s, %s)", - drive_id, grant.pool_name, grant.pool_role, - current_pool_name, current_pool_role, - ) - return False - return True - - -def unlock_expiry(drive_id: int, current_pool_name: str | None, - current_pool_role: str | None) -> float | None: - """Return the absolute expiry of an active grant, or None. - - Same identity-binding semantics as _is_unlocked: a grant whose stored - pool identity no longer matches the current row is treated as expired - and reaped. This is what the dashboard reads to decide whether to show - the unlocked-Burn-In affordance vs the locked-Unlock affordance. - """ - grant = _unlock_grants.get(drive_id) - if grant is None: - return None - if _time.time() >= grant.expiry: - _unlock_grants.pop(drive_id, None) - return None - if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: - _unlock_grants.pop(drive_id, None) - return None - return grant.expiry - - -async def grant_pool_unlock(drive_id: int, confirm_token: str, - operator: str, reason: str) -> float: - """Validate confirmation token + reason and grant a time-limited unlock. - - Raises ValueError on bad confirm_token, missing reason, or drive not - actually in a pool. Returns the unix expiry timestamp on success. - """ - if not reason or len(reason.strip()) < 5: - raise ValueError("A reason of at least 5 characters is required.") - if not operator or not operator.strip(): - raise ValueError("Operator name is required.") - - async with _db() as db: - db.row_factory = aiosqlite.Row - cur = await db.execute( - "SELECT pool_name, pool_role, devname FROM drives WHERE id=?", - (drive_id,), - ) - row = await cur.fetchone() - if not row: - raise ValueError("Drive not found.") - pool_name = row["pool_name"] - pool_role = row["pool_role"] - if not pool_name: - raise ValueError( - "This drive is not part of any pool — no unlock needed." - ) - - # Boot-pool / exported / mounted-fs all get dedicated, harder-to- - # fat-finger tokens. Active data pools just need their pool name - # typed. - if pool_name == BOOT_POOL_NAME: - expected = BOOT_POOL_CONFIRM_TOKEN - elif pool_role == EXPORTED_POOL_ROLE: - expected = EXPORTED_CONFIRM_TOKEN - elif pool_role == MOUNTED_ROLE: - expected = MOUNTED_CONFIRM_TOKEN - else: - expected = pool_name - if (confirm_token or "").strip() != expected: - raise ValueError("Confirmation token does not match.") - - if pool_name == BOOT_POOL_NAME: - evt = "boot_pool_drive_unlocked" - elif pool_role == EXPORTED_POOL_ROLE: - evt = "exported_pool_drive_unlocked" - elif pool_role == MOUNTED_ROLE: - evt = "mounted_drive_unlocked" - else: - evt = "pool_drive_unlocked" - await db.execute( - """INSERT INTO audit_events - (event_type, drive_id, burnin_job_id, operator, message) - VALUES (?,?,?,?,?)""", - (evt, drive_id, None, operator.strip(), - f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"), - ) - await db.commit() - - # Arm the in-memory grant ONLY after the audit row is durable. If the - # commit above raises, we exit without writing _unlock_grants — no - # unaudited active unlocks. The grant is bound to the (pool_name, - # pool_role) we observed under the open transaction so a later poll - # that reclassifies the drive invalidates it (see _is_unlocked). - expiry = _time.time() + UNLOCK_TTL_SECONDS - _unlock_grants[drive_id] = _UnlockGrant( - expiry=expiry, - pool_name=pool_name, - pool_role=pool_role, - ) - - log.warning( - "Pool-drive unlock granted: drive_id=%d pool=%s role=%s " - "operator=%s reason=%r", - drive_id, pool_name, pool_role, operator, reason, - ) - return expiry +# _kill_remote_process is re-exported above from .kill — the original +# definition was extracted to app/burnin/kill.py in 1.0.0-30. # --------------------------------------------------------------------------- diff --git a/app/burnin/kill.py b/app/burnin/kill.py new file mode 100644 index 0000000..f5e038e --- /dev/null +++ b/app/burnin/kill.py @@ -0,0 +1,71 @@ +"""Remote process kill machinery. + +asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that +OpenSSH's sshd ignores by default — the remote process keeps running and +``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot. +The fix: capture the remote PID at command launch (via the +``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 `` +over a fresh SSH session when we need to abort. This module owns that +state and the kill helper. + +Public surface (used by the rest of the burnin package): + set_remote_pid(job_id, pid) — call from the stage when launch succeeds + clear_remote_pid(job_id) — call from the cleanup callback + kill_remote_process(job_id) — fire-and-clear; safe to call repeatedly +""" + +from __future__ import annotations + +import asyncio +import logging + +log = logging.getLogger(__name__) + + +# job_id -> remote PID. Module-level dict so it survives across the +# stage / task / __init__ split without needing to thread it through +# function signatures. +_remote_pids: dict[int, int] = {} + + +def set_remote_pid(job_id: int, pid: int) -> None: + """Record the remote PID captured by the running stage.""" + _remote_pids[job_id] = pid + + +def clear_remote_pid(job_id: int) -> None: + """Drop the PID without trying to kill — used by the task cleanup + callback so a normally-completed job doesn't carry stale state.""" + _remote_pids.pop(job_id, None) + + +def get_remote_pid(job_id: int) -> int | None: + return _remote_pids.get(job_id) + + +async def kill_remote_process(job_id: int) -> None: + """Send kill -9 to the remote PID associated with this job, if any. + + Idempotent — pops the PID before attempting the kill so a second + call is a no-op. SSH connection failure is logged but never raised + (we'd rather best-effort-kill than block the cancel path). + """ + pid = _remote_pids.pop(job_id, None) + if not pid: + return + try: + # Local import to avoid pulling asyncssh into module load if + # this helper is never used (tests, mock mode). + from app import ssh_client + async with await ssh_client._connect() as conn: + await asyncio.wait_for( + conn.run( + f"kill -9 {pid} 2>/dev/null || true", check=False, + ), + timeout=10, + ) + log.info("Remote-killed PID %d for job %d", pid, job_id) + except Exception as exc: + log.warning( + "Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc, + ) diff --git a/app/burnin/unlock.py b/app/burnin/unlock.py new file mode 100644 index 0000000..da2116b --- /dev/null +++ b/app/burnin/unlock.py @@ -0,0 +1,209 @@ +"""Pool-drive unlock state. + +Drives that ZFS reports as belonging to an active zpool (including the +boot pool), drives carrying ZFS labels from a previously-imported pool +("exported"), and drives with a non-ZFS mount somewhere ("mounted") are +all locked from burn-in until the operator explicitly unlocks them via +``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a +container restart wipes them, which is the right default for "this is +very dangerous." TTL is bounded so an unlock you forgot about can't sit +armed indefinitely. + +Each lock kind has its own confirm token to make the override +deliberate; see grant_pool_unlock for the matching logic. + +Public surface: + is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool + unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None + grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float + PoolMemberError — raised by start_job + UNLOCK_TTL_SECONDS — for the unlock endpoint response + BOOT_POOL_NAME / *_TOKEN consts — for the UI / audit +""" + +from __future__ import annotations + +import logging +import time as _time +from dataclasses import dataclass + +import aiosqlite + +from app.config import settings + +log = logging.getLogger(__name__) + + +UNLOCK_TTL_SECONDS = 600 # 10 minutes +BOOT_POOL_NAME = "boot-pool" +BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL" +EXPORTED_POOL_ROLE = "exported" +EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL" +MOUNTED_ROLE = "mounted" +MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM" + + +@dataclass +class _UnlockGrant: + """An operator-issued, time-bounded permission to burn-in a pool drive. + + The grant is BOUND to the (pool_name, pool_role) observed at unlock + time. If a subsequent poll reclassifies the drive — e.g. it was + "(exported)" when unlocked but is now in active pool "tank", or it + used to be a cache vdev and now shows as data — the grant is + invalidated. Otherwise the operator's "I confirm this exported drive + is decommissioned" judgement would silently authorise destruction + of a live pool. + """ + expiry: float + pool_name: str + pool_role: str | None + + +_unlock_grants: dict[int, _UnlockGrant] = {} + + +class PoolMemberError(Exception): + """Raised by start_job when a drive is in a zpool and not unlocked.""" + def __init__(self, drive_id: int, pool_name: str, pool_role: str | None): + self.drive_id = drive_id + self.pool_name = pool_name + self.pool_role = pool_role + is_boot = pool_name == BOOT_POOL_NAME + super().__init__( + f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} " + f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. " + f"Unlock required before burn-in." + ) + + +def is_unlocked(drive_id: int, current_pool_name: str | None, + current_pool_role: str | None) -> bool: + """True iff a non-expired grant exists AND the drive's pool identity + matches what was observed at unlock time.""" + grant = _unlock_grants.get(drive_id) + if grant is None: + return False + if _time.time() >= grant.expiry: + _unlock_grants.pop(drive_id, None) + return False + if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: + # Pool identity changed since unlock — drive may now belong to a + # different (or live) pool. Invalidate the grant; operator must + # re-unlock with eyes-open against the current state. + _unlock_grants.pop(drive_id, None) + log.warning( + "Invalidating unlock grant for drive_id=%d: pool changed from " + "(%s, %s) to (%s, %s)", + drive_id, grant.pool_name, grant.pool_role, + current_pool_name, current_pool_role, + ) + return False + return True + + +def unlock_expiry(drive_id: int, current_pool_name: str | None, + current_pool_role: str | None) -> float | None: + """Return the absolute expiry of an active grant, or None. + + Same identity-binding semantics as is_unlocked: a grant whose stored + pool identity no longer matches the current row is treated as expired + and reaped. This is what the dashboard reads to decide whether to show + the unlocked-Burn-In affordance vs the locked-Unlock affordance. + """ + grant = _unlock_grants.get(drive_id) + if grant is None: + return None + if _time.time() >= grant.expiry: + _unlock_grants.pop(drive_id, None) + return None + if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: + _unlock_grants.pop(drive_id, None) + return None + return grant.expiry + + +def invalidate_grant(drive_id: int) -> None: + """Drop a grant unconditionally — used by start_job when a fresh + SSH-side pool check shows the drive's identity has shifted.""" + _unlock_grants.pop(drive_id, None) + + +async def grant_pool_unlock(drive_id: int, confirm_token: str, + operator: str, reason: str) -> float: + """Validate confirmation token + reason and grant a time-limited unlock. + + Raises ValueError on bad confirm_token, missing reason, or drive not + actually in a pool. Returns the unix expiry timestamp on success. + """ + if not reason or len(reason.strip()) < 5: + raise ValueError("A reason of at least 5 characters is required.") + if not operator or not operator.strip(): + raise ValueError("Operator name is required.") + + async with aiosqlite.connect(settings.db_path) as db: + db.row_factory = aiosqlite.Row + await db.execute("PRAGMA busy_timeout=10000") + cur = await db.execute( + "SELECT pool_name, pool_role, devname FROM drives WHERE id=?", + (drive_id,), + ) + row = await cur.fetchone() + if not row: + raise ValueError("Drive not found.") + pool_name = row["pool_name"] + pool_role = row["pool_role"] + if not pool_name: + raise ValueError( + "This drive is not part of any pool — no unlock needed." + ) + + # Boot-pool / exported / mounted-fs all get dedicated, harder-to- + # fat-finger tokens. Active data pools just need their pool name + # typed. + if pool_name == BOOT_POOL_NAME: + expected = BOOT_POOL_CONFIRM_TOKEN + elif pool_role == EXPORTED_POOL_ROLE: + expected = EXPORTED_CONFIRM_TOKEN + elif pool_role == MOUNTED_ROLE: + expected = MOUNTED_CONFIRM_TOKEN + else: + expected = pool_name + if (confirm_token or "").strip() != expected: + raise ValueError("Confirmation token does not match.") + + if pool_name == BOOT_POOL_NAME: + evt = "boot_pool_drive_unlocked" + elif pool_role == EXPORTED_POOL_ROLE: + evt = "exported_pool_drive_unlocked" + elif pool_role == MOUNTED_ROLE: + evt = "mounted_drive_unlocked" + else: + evt = "pool_drive_unlocked" + await db.execute( + """INSERT INTO audit_events + (event_type, drive_id, burnin_job_id, operator, message) + VALUES (?,?,?,?,?)""", + (evt, drive_id, None, operator.strip(), + f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"), + ) + await db.commit() + + # Arm the in-memory grant ONLY after the audit row is durable. If the + # commit above raises, we exit without writing _unlock_grants — no + # unaudited active unlocks. The grant is bound to the (pool_name, + # pool_role) we observed under the open transaction so a later poll + # that reclassifies the drive invalidates it (see is_unlocked). + expiry = _time.time() + UNLOCK_TTL_SECONDS + _unlock_grants[drive_id] = _UnlockGrant( + expiry=expiry, + pool_name=pool_name, + pool_role=pool_role, + ) + + log.warning( + "Pool-drive unlock granted: drive_id=%d pool=%s role=%s " + "operator=%s reason=%r", + drive_id, pool_name, pool_role, operator, reason, + ) + return expiry diff --git a/app/config.py b/app/config.py index 6016134..90bd3bd 100644 --- a/app/config.py +++ b/app/config.py @@ -83,7 +83,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-29" + app_version: str = "1.0.0-30" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate diff --git a/tests/test_unlock_flow.py b/tests/test_unlock_flow.py index 626a3fd..789bb39 100644 --- a/tests/test_unlock_flow.py +++ b/tests/test_unlock_flow.py @@ -169,15 +169,18 @@ class TestUnlockFlow(unittest.IsolatedAsyncioTestCase): async def test_expired_grant_returns_false(self): from app import burnin - # Drop TTL to 0 so the grant is born expired. - original = burnin.UNLOCK_TTL_SECONDS - burnin.UNLOCK_TTL_SECONDS = 0 + from app.burnin import unlock as _unlock + # Drop TTL to 0 so the grant is born expired. Monkey-patch the + # real source-of-truth in app.burnin.unlock — the alias on the + # package root is bound at import time and won't propagate back. + original = _unlock.UNLOCK_TTL_SECONDS + _unlock.UNLOCK_TTL_SECONDS = 0 try: await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") self.assertFalse(burnin._is_unlocked(1, "tank", "data")) self.assertNotIn(1, burnin._unlock_grants) finally: - burnin.UNLOCK_TTL_SECONDS = original + _unlock.UNLOCK_TTL_SECONDS = original # ----- audit commit ordering (Codex finding #3) -----