nas-burnin/app/burnin/unlock.py

"""Pool-drive unlock state.

Drives that ZFS reports as belonging to an active zpool (including the
boot pool), drives carrying ZFS labels from a previously-imported pool
("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
all locked from burn-in until the operator explicitly unlocks them via
``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a
container restart wipes them, which is the right default for "this is
very dangerous." TTL is bounded so an unlock you forgot about can't sit
armed indefinitely.

Each lock kind has its own confirm token to make the override
deliberate; see grant_pool_unlock for the matching logic.

Public surface:
    is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
    unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
    invalidate_grant(drive_id) -> None
    grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
                                     — coroutine; must be awaited
    PoolMemberError                  — raised by start_job
    UNLOCK_TTL_SECONDS               — for the unlock endpoint response
    BOOT_POOL_NAME / *_TOKEN consts  — for the UI / audit
"""

from __future__ import annotations

import logging
import time as _time
from dataclasses import dataclass

import aiosqlite

from app.config import settings

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# Lifetime of an unlock grant in seconds. grant_pool_unlock adds this to the
# current wall-clock time to compute the grant's absolute expiry.
UNLOCK_TTL_SECONDS = 600  # 10 minutes
# A drive whose pool_name equals this value is treated as a boot-pool member;
# unlocking it requires typing BOOT_POOL_CONFIRM_TOKEN exactly.
BOOT_POOL_NAME = "boot-pool"
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
# pool_role value that selects the "exported" lock kind; unlocking requires
# EXPORTED_CONFIRM_TOKEN.
EXPORTED_POOL_ROLE = "exported"
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
# pool_role value that selects the "mounted" lock kind; unlocking requires
# MOUNTED_CONFIRM_TOKEN.
MOUNTED_ROLE = "mounted"
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"


@dataclass
class _UnlockGrant:
    """An operator-issued, time-bounded permission to burn-in a pool drive.

    The grant is BOUND to the (pool_name, pool_role) observed at unlock
    time. If a subsequent poll reclassifies the drive — e.g. it was
    "(exported)" when unlocked but is now in active pool "tank", or it
    used to be a cache vdev and now shows as data — the grant is
    invalidated. Otherwise the operator's "I confirm this exported drive
    is decommissioned" judgement would silently authorise destruction
    of a live pool.
    """
    expiry: float
    pool_name: str
    pool_role: str | None


# Active unlock grants keyed by drive id. Held in process memory only.
# Written by grant_pool_unlock; entries are removed by is_unlocked and
# unlock_expiry (on expiry or pool-identity mismatch) and by invalidate_grant.
_unlock_grants: dict[int, _UnlockGrant] = {}


class PoolMemberError(Exception):
    """Raised by start_job when a drive is in a zpool and not unlocked."""
    def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
        self.drive_id = drive_id
        self.pool_name = pool_name
        self.pool_role = pool_role
        is_boot = pool_name == BOOT_POOL_NAME
        super().__init__(
            f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
            f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
            f"Unlock required before burn-in."
        )


def is_unlocked(drive_id: int, current_pool_name: str | None,
                current_pool_role: str | None) -> bool:
    """True iff a non-expired grant exists AND the drive's pool identity
    matches what was observed at unlock time."""
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return False
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return False
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        # Pool identity changed since unlock — drive may now belong to a
        # different (or live) pool. Invalidate the grant; operator must
        # re-unlock with eyes-open against the current state.
        _unlock_grants.pop(drive_id, None)
        log.warning(
            "Invalidating unlock grant for drive_id=%d: pool changed from "
            "(%s, %s) to (%s, %s)",
            drive_id, grant.pool_name, grant.pool_role,
            current_pool_name, current_pool_role,
        )
        return False
    return True


def unlock_expiry(drive_id: int, current_pool_name: str | None,
                  current_pool_role: str | None) -> float | None:
    """Return the absolute expiry of an active grant, or None.

    Same identity-binding semantics as is_unlocked: a grant whose stored
    pool identity no longer matches the current row is treated as expired
    and reaped. This is what the dashboard reads to decide whether to show
    the unlocked-Burn-In affordance vs the locked-Unlock affordance.
    """
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return None
    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return None
    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        _unlock_grants.pop(drive_id, None)
        return None
    return grant.expiry


def invalidate_grant(drive_id: int) -> None:
    """Remove any grant held for ``drive_id``, whatever its state.

    Used by start_job when a fresh SSH-side pool check shows the drive's
    identity has shifted. Does nothing if the drive has no grant.
    """
    if drive_id in _unlock_grants:
        del _unlock_grants[drive_id]


async def grant_pool_unlock(drive_id: int, confirm_token: str,
                            operator: str, reason: str) -> float:
    """Validate confirmation token + reason and grant a time-limited unlock.

    Raises ValueError on bad confirm_token, missing reason, or drive not
    actually in a pool. Returns the unix expiry timestamp on success.
    """
    if not reason or len(reason.strip()) < 5:
        raise ValueError("A reason of at least 5 characters is required.")
    if not operator or not operator.strip():
        raise ValueError("Operator name is required.")

    async with aiosqlite.connect(settings.db_path) as db:
        db.row_factory = aiosqlite.Row
        await db.execute("PRAGMA busy_timeout=10000")
        cur = await db.execute(
            "SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
            (drive_id,),
        )
        row = await cur.fetchone()
        if not row:
            raise ValueError("Drive not found.")
        pool_name = row["pool_name"]
        pool_role = row["pool_role"]
        if not pool_name:
            raise ValueError(
                "This drive is not part of any pool — no unlock needed."
            )

        # Boot-pool / exported / mounted-fs all get dedicated, harder-to-
        # fat-finger tokens. Active data pools just need their pool name
        # typed.
        if pool_name == BOOT_POOL_NAME:
            expected = BOOT_POOL_CONFIRM_TOKEN
        elif pool_role == EXPORTED_POOL_ROLE:
            expected = EXPORTED_CONFIRM_TOKEN
        elif pool_role == MOUNTED_ROLE:
            expected = MOUNTED_CONFIRM_TOKEN
        else:
            expected = pool_name
        if (confirm_token or "").strip() != expected:
            raise ValueError("Confirmation token does not match.")

        if pool_name == BOOT_POOL_NAME:
            evt = "boot_pool_drive_unlocked"
        elif pool_role == EXPORTED_POOL_ROLE:
            evt = "exported_pool_drive_unlocked"
        elif pool_role == MOUNTED_ROLE:
            evt = "mounted_drive_unlocked"
        else:
            evt = "pool_drive_unlocked"
        await db.execute(
            """INSERT INTO audit_events
                   (event_type, drive_id, burnin_job_id, operator, message)
               VALUES (?,?,?,?,?)""",
            (evt, drive_id, None, operator.strip(),
             f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
        )
        await db.commit()

    # Arm the in-memory grant ONLY after the audit row is durable. If the
    # commit above raises, we exit without writing _unlock_grants — no
    # unaudited active unlocks. The grant is bound to the (pool_name,
    # pool_role) we observed under the open transaction so a later poll
    # that reclassifies the drive invalidates it (see is_unlocked).
    expiry = _time.time() + UNLOCK_TTL_SECONDS
    _unlock_grants[drive_id] = _UnlockGrant(
        expiry=expiry,
        pool_name=pool_name,
        pool_role=pool_role,
    )

    log.warning(
        "Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
        "operator=%s reason=%r",
        drive_id, pool_name, pool_role, operator, reason,
    )
    return expiry