refactor: split burnin.py into a package — extract unlock + kill (1.0.0-30)

First slice of the planned tech-debt cleanup. burnin.py was 1667 lines and growing; staged extraction gives smaller diffs to review and a clear bisect target if anything regresses. Mechanical move only — no behaviour change. The two extracted modules: * app/burnin/unlock.py — _UnlockGrant, _unlock_grants, PoolMemberError, is_unlocked / unlock_expiry / grant_pool_unlock, plus the four *_TOKEN constants and UNLOCK_TTL_SECONDS. Owns its module-level state; opens its own DB connection in grant_pool_unlock so it doesn't depend on the parent package's _db() helper. * app/burnin/kill.py — _remote_pids dict and the kill_remote_process / set_remote_pid / clear_remote_pid / get_remote_pid helpers. Pulled out of __init__.py so the asyncssh-ignores-signals workaround lives next to the state it operates on. app/burnin/__init__.py re-exports every public symbol the rest of the app imports — `from app import burnin; burnin.start_job(...)`, `burnin.PoolMemberError`, `burnin.UNLOCK_TTL_SECONDS`, etc. all keep working unchanged. Internal aliases `_remote_pids` and `_unlock_grants` on the package root point at the SAME dict objects in the submodules, so existing in-package mutations (set in stages, cleared in cleanup callbacks) work without rewrite. Test fix: tests/test_unlock_flow.py:test_expired_grant_returns_false monkey-patches UNLOCK_TTL_SECONDS. The package-root alias is bound at import time and won't propagate back to the submodule's read site, so the test now patches `app.burnin.unlock.UNLOCK_TTL_SECONDS` directly. Verification: 44/44 unit tests pass in container; /health 200; container boots clean. routes.py, mailer.py, poller.py untouched — the public API is identical. Future: extract stages, task, _common in subsequent versions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:44:28 -04:00 · 2026-05-03 00:44:28 -04:00 · 9cbae44495
commit 9cbae44495
parent 6c20e57fd8
5 changed files with 313 additions and 212 deletions
--- a/app/burnin/init.py
+++ b/app/burnin/init.py
@ -72,11 +72,29 @@ _client: TrueNASClient | None = None
 # cancel_job / check_stuck_jobs can actually unwedge a stuck task.
 _active_tasks: dict[int, "asyncio.Task"] = {}
-# Remote PID of any long-running SSH child process (currently only badblocks)
+# Remote-PID kill machinery + pool-drive unlock state both live in their
-# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends
+# own submodules. We re-export the names the rest of the app reaches for
-# a "signal" channel request that OpenSSH sshd ignores by default, leaving
+# (and keep the _kill_remote_process / _is_unlocked aliases for callers
-# the remote process running and proc.wait() hanging forever.
+# that grew up before the split).
-_remote_pids: dict[int, int] = {}
+from . import kill as _kill                              # noqa: E402
 from . import unlock as _unlock                          # noqa: E402
 _remote_pids = _kill._remote_pids
 _unlock_grants = _unlock._unlock_grants
 PoolMemberError = _unlock.PoolMemberError
 UNLOCK_TTL_SECONDS = _unlock.UNLOCK_TTL_SECONDS
 BOOT_POOL_NAME = _unlock.BOOT_POOL_NAME
 BOOT_POOL_CONFIRM_TOKEN = _unlock.BOOT_POOL_CONFIRM_TOKEN
 EXPORTED_POOL_ROLE = _unlock.EXPORTED_POOL_ROLE
 EXPORTED_CONFIRM_TOKEN = _unlock.EXPORTED_CONFIRM_TOKEN
 MOUNTED_ROLE = _unlock.MOUNTED_ROLE
 MOUNTED_CONFIRM_TOKEN = _unlock.MOUNTED_CONFIRM_TOKEN
 unlock_expiry = _unlock.unlock_expiry
 grant_pool_unlock = _unlock.grant_pool_unlock
 _is_unlocked = _unlock.is_unlocked        # legacy private name
 _kill_remote_process = _kill.kill_remote_process
 def _now() -> str:
@ -145,208 +163,8 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
    return task
-async def _kill_remote_process(job_id: int) -> None:
+# _kill_remote_process is re-exported above from .kill — the original
-    """Send kill -9 to the remote PID associated with this job, if any.
+# definition was extracted to app/burnin/kill.py in 1.0.0-30.
    asyncssh's proc.kill() sends an SSH 'signal' channel request which
    OpenSSH's sshd does not honor by default. Opening a fresh session and
    running /bin/kill is the reliable way to actually terminate the process.
    """
    pid = _remote_pids.pop(job_id, None)
    if not pid:
        return
    try:
        from app import ssh_client
        async with await ssh_client._connect() as conn:
            await asyncio.wait_for(
                conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False),
                timeout=10,
            )
        log.info("Remote-killed PID %d for job %d", pid, job_id)
    except Exception as exc:
        log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc)
 # ---------------------------------------------------------------------------
 # Pool-drive unlock state
 # ---------------------------------------------------------------------------
 #
 # Drives that ZFS reports as belonging to an active zpool (including the
 # boot pool) are locked from burn-in until the operator explicitly unlocks
 # them via POST /api/v1/drives/{id}/unlock. Grants live in memory only —
 # a container restart wipes them, which is the right default for "this is
 # very dangerous." TTL is bounded so an unlock you forgot about can't sit
 # armed indefinitely.
 import time as _time
 from dataclasses import dataclass
 UNLOCK_TTL_SECONDS = 600  # 10 minutes
 BOOT_POOL_NAME = "boot-pool"
 BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
 EXPORTED_POOL_ROLE = "exported"
 EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
 MOUNTED_ROLE = "mounted"
 MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
@dataclass
 class _UnlockGrant:
    """An operator-issued, time-bounded permission to burn-in a pool drive.
    The grant is BOUND to the (pool_name, pool_role) observed at unlock
    time. If a subsequent poll reclassifies the drive — e.g. it was
    "(exported)" when unlocked but is now in active pool "tank", or it
    used to be a cache vdev and now shows as data — the grant is
    invalidated. Otherwise the operator's "I confirm this exported drive
    is decommissioned" judgement would silently authorise destruction
    of a live pool.
    """
    expiry: float
    pool_name: str
    pool_role: str | None
 _unlock_grants: dict[int, _UnlockGrant] = {}
 class PoolMemberError(Exception):
    """Raised by start_job when a drive is in a zpool and not unlocked."""
    def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
        self.drive_id = drive_id
        self.pool_name = pool_name
        self.pool_role = pool_role
        is_boot = pool_name == BOOT_POOL_NAME
        super().__init__(
            f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
            f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
            f"Unlock required before burn-in."
        )
 def _is_unlocked(drive_id: int, current_pool_name: str | None,
                 current_pool_role: str | None) -> bool:
    """True iff a non-expired grant exists AND the drive's pool identity
    matches what was observed at unlock time."""
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return False
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return False
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        # Pool identity changed since unlock — drive may now belong to a
        # different (or live) pool. Invalidate the grant; operator must
        # re-unlock with eyes-open against the current state.
        _unlock_grants.pop(drive_id, None)
        log.warning(
            "Invalidating unlock grant for drive_id=%d: pool changed from "
            "(%s, %s) to (%s, %s)",
            drive_id, grant.pool_name, grant.pool_role,
            current_pool_name, current_pool_role,
        )
        return False
    return True
 def unlock_expiry(drive_id: int, current_pool_name: str | None,
                  current_pool_role: str | None) -> float | None:
    """Return the absolute expiry of an active grant, or None.
    Same identity-binding semantics as _is_unlocked: a grant whose stored
    pool identity no longer matches the current row is treated as expired
    and reaped. This is what the dashboard reads to decide whether to show
    the unlocked-Burn-In affordance vs the locked-Unlock affordance.
    """
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return None
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return None
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        _unlock_grants.pop(drive_id, None)
        return None
    return grant.expiry
 async def grant_pool_unlock(drive_id: int, confirm_token: str,
                            operator: str, reason: str) -> float:
    """Validate confirmation token + reason and grant a time-limited unlock.
    Raises ValueError on bad confirm_token, missing reason, or drive not
    actually in a pool. Returns the unix expiry timestamp on success.
    """
    if not reason or len(reason.strip()) < 5:
        raise ValueError("A reason of at least 5 characters is required.")
    if not operator or not operator.strip():
        raise ValueError("Operator name is required.")
    async with _db() as db:
        db.row_factory = aiosqlite.Row
        cur = await db.execute(
            "SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
            (drive_id,),
        )
        row = await cur.fetchone()
        if not row:
            raise ValueError("Drive not found.")
        pool_name = row["pool_name"]
        pool_role = row["pool_role"]
        if not pool_name:
            raise ValueError(
                "This drive is not part of any pool — no unlock needed."
            )
        # Boot-pool / exported / mounted-fs all get dedicated, harder-to-
        # fat-finger tokens. Active data pools just need their pool name
        # typed.
        if pool_name == BOOT_POOL_NAME:
            expected = BOOT_POOL_CONFIRM_TOKEN
        elif pool_role == EXPORTED_POOL_ROLE:
            expected = EXPORTED_CONFIRM_TOKEN
        elif pool_role == MOUNTED_ROLE:
            expected = MOUNTED_CONFIRM_TOKEN
        else:
            expected = pool_name
        if (confirm_token or "").strip() != expected:
            raise ValueError("Confirmation token does not match.")
        if pool_name == BOOT_POOL_NAME:
            evt = "boot_pool_drive_unlocked"
        elif pool_role == EXPORTED_POOL_ROLE:
            evt = "exported_pool_drive_unlocked"
        elif pool_role == MOUNTED_ROLE:
            evt = "mounted_drive_unlocked"
        else:
            evt = "pool_drive_unlocked"
        await db.execute(
            """INSERT INTO audit_events
                   (event_type, drive_id, burnin_job_id, operator, message)
               VALUES (?,?,?,?,?)""",
            (evt, drive_id, None, operator.strip(),
             f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
        )
        await db.commit()
    # Arm the in-memory grant ONLY after the audit row is durable. If the
    # commit above raises, we exit without writing _unlock_grants — no
    # unaudited active unlocks. The grant is bound to the (pool_name,
    # pool_role) we observed under the open transaction so a later poll
    # that reclassifies the drive invalidates it (see _is_unlocked).
    expiry = _time.time() + UNLOCK_TTL_SECONDS
    _unlock_grants[drive_id] = _UnlockGrant(
        expiry=expiry,
        pool_name=pool_name,
        pool_role=pool_role,
    )
    log.warning(
        "Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
        "operator=%s reason=%r",
        drive_id, pool_name, pool_role, operator, reason,
    )
    return expiry
 # ---------------------------------------------------------------------------
--- a/app/burnin/kill.py
+++ b/app/burnin/kill.py
@ -0,0 +1,71 @@
 """Remote process kill machinery.
 asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
 OpenSSH's sshd ignores by default — the remote process keeps running and
 ``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.
 The fix: capture the remote PID at command launch (via the
 ``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
 over a fresh SSH session when we need to abort. This module owns that
 state and the kill helper.
 Public surface (used by the rest of the burnin package):
    set_remote_pid(job_id, pid)    — call from the stage when launch succeeds
    clear_remote_pid(job_id)       — call from the cleanup callback
    kill_remote_process(job_id)    — fire-and-clear; safe to call repeatedly
 """
 from __future__ import annotations
 import asyncio
 import logging
 log = logging.getLogger(__name__)
 # job_id -> remote PID. Module-level dict so it survives across the
 # stage / task / __init__ split without needing to thread it through
 # function signatures.
 _remote_pids: dict[int, int] = {}
 def set_remote_pid(job_id: int, pid: int) -> None:
    """Record the remote PID captured by the running stage."""
    _remote_pids[job_id] = pid
 def clear_remote_pid(job_id: int) -> None:
    """Drop the PID without trying to kill — used by the task cleanup
    callback so a normally-completed job doesn't carry stale state."""
    _remote_pids.pop(job_id, None)
 def get_remote_pid(job_id: int) -> int | None:
    return _remote_pids.get(job_id)
 async def kill_remote_process(job_id: int) -> None:
    """Send kill -9 to the remote PID associated with this job, if any.
    Idempotent — pops the PID before attempting the kill so a second
    call is a no-op. SSH connection failure is logged but never raised
    (we'd rather best-effort-kill than block the cancel path).
    """
    pid = _remote_pids.pop(job_id, None)
    if not pid:
        return
    try:
        # Local import to avoid pulling asyncssh into module load if
        # this helper is never used (tests, mock mode).
        from app import ssh_client
        async with await ssh_client._connect() as conn:
            await asyncio.wait_for(
                conn.run(
                    f"kill -9 {pid} 2>/dev/null || true", check=False,
                ),
                timeout=10,
            )
        log.info("Remote-killed PID %d for job %d", pid, job_id)
    except Exception as exc:
        log.warning(
            "Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc,
        )
--- a/app/burnin/unlock.py
+++ b/app/burnin/unlock.py
@ -0,0 +1,209 @@
 """Pool-drive unlock state.
 Drives that ZFS reports as belonging to an active zpool (including the
 boot pool), drives carrying ZFS labels from a previously-imported pool
 ("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
 all locked from burn-in until the operator explicitly unlocks them via
 ``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a
 container restart wipes them, which is the right default for "this is
 very dangerous." TTL is bounded so an unlock you forgot about can't sit
 armed indefinitely.
 Each lock kind has its own confirm token to make the override
 deliberate; see grant_pool_unlock for the matching logic.
 Public surface:
    is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
    unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
    grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
    PoolMemberError                  — raised by start_job
    UNLOCK_TTL_SECONDS               — for the unlock endpoint response
    BOOT_POOL_NAME / *_TOKEN consts  — for the UI / audit
 """
 from __future__ import annotations
 import logging
 import time as _time
 from dataclasses import dataclass
 import aiosqlite
 from app.config import settings
 log = logging.getLogger(__name__)
 UNLOCK_TTL_SECONDS = 600  # 10 minutes
 BOOT_POOL_NAME = "boot-pool"
 BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
 EXPORTED_POOL_ROLE = "exported"
 EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
 MOUNTED_ROLE = "mounted"
 MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
@dataclass
 class _UnlockGrant:
    """An operator-issued, time-bounded permission to burn-in a pool drive.
    The grant is BOUND to the (pool_name, pool_role) observed at unlock
    time. If a subsequent poll reclassifies the drive — e.g. it was
    "(exported)" when unlocked but is now in active pool "tank", or it
    used to be a cache vdev and now shows as data — the grant is
    invalidated. Otherwise the operator's "I confirm this exported drive
    is decommissioned" judgement would silently authorise destruction
    of a live pool.
    """
    expiry: float
    pool_name: str
    pool_role: str | None
 _unlock_grants: dict[int, _UnlockGrant] = {}
 class PoolMemberError(Exception):
    """Raised by start_job when a drive is in a zpool and not unlocked."""
    def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
        self.drive_id = drive_id
        self.pool_name = pool_name
        self.pool_role = pool_role
        is_boot = pool_name == BOOT_POOL_NAME
        super().__init__(
            f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
            f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
            f"Unlock required before burn-in."
        )
 def is_unlocked(drive_id: int, current_pool_name: str | None,
                current_pool_role: str | None) -> bool:
    """True iff a non-expired grant exists AND the drive's pool identity
    matches what was observed at unlock time."""
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return False
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return False
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        # Pool identity changed since unlock — drive may now belong to a
        # different (or live) pool. Invalidate the grant; operator must
        # re-unlock with eyes-open against the current state.
        _unlock_grants.pop(drive_id, None)
        log.warning(
            "Invalidating unlock grant for drive_id=%d: pool changed from "
            "(%s, %s) to (%s, %s)",
            drive_id, grant.pool_name, grant.pool_role,
            current_pool_name, current_pool_role,
        )
        return False
    return True
 def unlock_expiry(drive_id: int, current_pool_name: str | None,
                  current_pool_role: str | None) -> float | None:
    """Return the absolute expiry of an active grant, or None.
    Same identity-binding semantics as is_unlocked: a grant whose stored
    pool identity no longer matches the current row is treated as expired
    and reaped. This is what the dashboard reads to decide whether to show
    the unlocked-Burn-In affordance vs the locked-Unlock affordance.
    """
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return None
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return None
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        _unlock_grants.pop(drive_id, None)
        return None
    return grant.expiry
 def invalidate_grant(drive_id: int) -> None:
    """Drop a grant unconditionally — used by start_job when a fresh
    SSH-side pool check shows the drive's identity has shifted."""
    _unlock_grants.pop(drive_id, None)
 async def grant_pool_unlock(drive_id: int, confirm_token: str,
                            operator: str, reason: str) -> float:
    """Validate confirmation token + reason and grant a time-limited unlock.
    Raises ValueError on bad confirm_token, missing reason, or drive not
    actually in a pool. Returns the unix expiry timestamp on success.
    """
    if not reason or len(reason.strip()) < 5:
        raise ValueError("A reason of at least 5 characters is required.")
    if not operator or not operator.strip():
        raise ValueError("Operator name is required.")
    async with aiosqlite.connect(settings.db_path) as db:
        db.row_factory = aiosqlite.Row
        await db.execute("PRAGMA busy_timeout=10000")
        cur = await db.execute(
            "SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
            (drive_id,),
        )
        row = await cur.fetchone()
        if not row:
            raise ValueError("Drive not found.")
        pool_name = row["pool_name"]
        pool_role = row["pool_role"]
        if not pool_name:
            raise ValueError(
                "This drive is not part of any pool — no unlock needed."
            )
        # Boot-pool / exported / mounted-fs all get dedicated, harder-to-
        # fat-finger tokens. Active data pools just need their pool name
        # typed.
        if pool_name == BOOT_POOL_NAME:
            expected = BOOT_POOL_CONFIRM_TOKEN
        elif pool_role == EXPORTED_POOL_ROLE:
            expected = EXPORTED_CONFIRM_TOKEN
        elif pool_role == MOUNTED_ROLE:
            expected = MOUNTED_CONFIRM_TOKEN
        else:
            expected = pool_name
        if (confirm_token or "").strip() != expected:
            raise ValueError("Confirmation token does not match.")
        if pool_name == BOOT_POOL_NAME:
            evt = "boot_pool_drive_unlocked"
        elif pool_role == EXPORTED_POOL_ROLE:
            evt = "exported_pool_drive_unlocked"
        elif pool_role == MOUNTED_ROLE:
            evt = "mounted_drive_unlocked"
        else:
            evt = "pool_drive_unlocked"
        await db.execute(
            """INSERT INTO audit_events
                   (event_type, drive_id, burnin_job_id, operator, message)
               VALUES (?,?,?,?,?)""",
            (evt, drive_id, None, operator.strip(),
             f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
        )
        await db.commit()
    # Arm the in-memory grant ONLY after the audit row is durable. If the
    # commit above raises, we exit without writing _unlock_grants — no
    # unaudited active unlocks. The grant is bound to the (pool_name,
    # pool_role) we observed under the open transaction so a later poll
    # that reclassifies the drive invalidates it (see is_unlocked).
    expiry = _time.time() + UNLOCK_TTL_SECONDS
    _unlock_grants[drive_id] = _UnlockGrant(
        expiry=expiry,
        pool_name=pool_name,
        pool_role=pool_role,
    )
    log.warning(
        "Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
        "operator=%s reason=%r",
        drive_id, pool_name, pool_role, operator, reason,
    )
    return expiry
--- a/app/config.py
+++ b/app/config.py
@ -83,7 +83,7 @@ class Settings(BaseSettings):
    ssh_key: str = ""             # PEM private key content (paste full key including headers)
    # Application version — used by the /api/v1/updates/check endpoint
-    app_version: str = "1.0.0-29"
+    app_version: str = "1.0.0-30"
    # ---- Authentication (1.0.0-22) ----
    # session_secret: HMAC key for signing session cookies. Empty = generate
--- a/tests/test_unlock_flow.py
+++ b/tests/test_unlock_flow.py
@ -169,15 +169,18 @@ class TestUnlockFlow(unittest.IsolatedAsyncioTestCase):
    async def test_expired_grant_returns_false(self):
        from app import burnin
-        # Drop TTL to 0 so the grant is born expired.
+        from app.burnin import unlock as _unlock
-        original = burnin.UNLOCK_TTL_SECONDS
+        # Drop TTL to 0 so the grant is born expired. Monkey-patch the
-        burnin.UNLOCK_TTL_SECONDS = 0
+        # real source-of-truth in app.burnin.unlock — the alias on the
        # package root is bound at import time and won't propagate back.
        original = _unlock.UNLOCK_TTL_SECONDS
        _unlock.UNLOCK_TTL_SECONDS = 0
        try:
            await burnin.grant_pool_unlock(1, "tank", "op", "valid reason")
            self.assertFalse(burnin._is_unlocked(1, "tank", "data"))
            self.assertNotIn(1, burnin._unlock_grants)
        finally:
-            burnin.UNLOCK_TTL_SECONDS = original
+            _unlock.UNLOCK_TTL_SECONDS = original
    # ----- audit commit ordering (Codex finding #3) -----