First slice of the planned tech-debt cleanup. burnin.py was 1667 lines and growing; staged extraction gives smaller diffs to review and a clear bisect target if anything regresses. Mechanical move only — no behaviour change. The two extracted modules: * app/burnin/unlock.py — _UnlockGrant, _unlock_grants, PoolMemberError, is_unlocked / unlock_expiry / grant_pool_unlock, plus the four *_TOKEN constants and UNLOCK_TTL_SECONDS. Owns its module-level state; opens its own DB connection in grant_pool_unlock so it doesn't depend on the parent package's _db() helper. * app/burnin/kill.py — _remote_pids dict and the kill_remote_process / set_remote_pid / clear_remote_pid / get_remote_pid helpers. Pulled out of __init__.py so the asyncssh-ignores-signals workaround lives next to the state it operates on. app/burnin/__init__.py re-exports every public symbol the rest of the app imports — `from app import burnin; burnin.start_job(...)`, `burnin.PoolMemberError`, `burnin.UNLOCK_TTL_SECONDS`, etc. all keep working unchanged. Internal aliases `_remote_pids` and `_unlock_grants` on the package root point at the SAME dict objects in the submodules, so existing in-package mutations (set in stages, cleared in cleanup callbacks) work without rewrite. Test fix: tests/test_unlock_flow.py:test_expired_grant_returns_false monkey-patches UNLOCK_TTL_SECONDS. The package-root alias is bound at import time and won't propagate back to the submodule's read site, so the test now patches `app.burnin.unlock.UNLOCK_TTL_SECONDS` directly. Verification: 44/44 unit tests pass in container; /health 200; container boots clean. routes.py, mailer.py, poller.py untouched — the public API is identical. Future: extract stages, task, _common in subsequent versions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
209 lines
8 KiB
Python
209 lines
8 KiB
Python
"""Pool-drive unlock state.
|
|
|
|
Drives that ZFS reports as belonging to an active zpool (including the
|
|
boot pool), drives carrying ZFS labels from a previously-imported pool
|
|
("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
|
|
all locked from burn-in until the operator explicitly unlocks them via
|
|
``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a
|
|
container restart wipes them, which is the right default for "this is
|
|
very dangerous." TTL is bounded so an unlock you forgot about can't sit
|
|
armed indefinitely.
|
|
|
|
Each lock kind has its own confirm token to make the override
|
|
deliberate; see grant_pool_unlock for the matching logic.
|
|
|
|
Public surface:
|
|
is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
|
|
unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
|
|
grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
|
|
PoolMemberError — raised by start_job
|
|
UNLOCK_TTL_SECONDS — for the unlock endpoint response
|
|
BOOT_POOL_NAME / *_TOKEN consts — for the UI / audit
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time as _time
|
|
from dataclasses import dataclass
|
|
|
|
import aiosqlite
|
|
|
|
from app.config import settings
|
|
|
|
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


# How long an unlock grant stays armed, in seconds (10 minutes).
UNLOCK_TTL_SECONDS = 10 * 60

# The boot pool is recognised by its pool name and has a dedicated
# confirmation phrase.
BOOT_POOL_NAME = "boot-pool"
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"

# Drives whose pool role is "exported" have their own confirmation phrase.
EXPORTED_POOL_ROLE = "exported"
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"

# Drives whose pool role is "mounted" have their own confirmation phrase.
MOUNTED_ROLE = "mounted"
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
|
|
|
|
|
|
@dataclass
class _UnlockGrant:
    """Time-limited operator permission to burn-in a locked drive.

    Besides its expiry, a grant records the (pool_name, pool_role) pair
    the drive had when the unlock was issued. The grant only remains
    valid while the drive still reports that same pair. Should a later
    poll reclassify the drive — for instance it was "(exported)" at
    unlock time but has since joined the active pool "tank", or a cache
    vdev now shows up as data — the grant must be discarded. Without
    that binding, an operator's "this exported drive is decommissioned"
    confirmation could silently authorise wiping a live pool.
    """

    # Absolute unix timestamp after which the grant no longer applies.
    expiry: float
    # Pool name recorded for the drive when the grant was issued.
    pool_name: str
    # Pool role recorded for the drive when the grant was issued (may be None).
    pool_role: str | None
|
|
|
|
|
|
# In-memory registry of active unlock grants, keyed by drive id. It is
# never persisted, so it starts empty every time the module is loaded.
_unlock_grants: dict[int, _UnlockGrant] = {}
|
|
|
|
|
|
class PoolMemberError(Exception):
    """Signals that a drive belongs to a pool and has not been unlocked.

    Per the module docstring this is raised by start_job. The drive id,
    pool name and pool role are kept as attributes so handlers can
    inspect them; the exception text is a human-readable summary.
    """

    def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
        self.drive_id = drive_id
        self.pool_name = pool_name
        self.pool_role = pool_role

        # The boot pool is called out in capitals in the message.
        if pool_name == BOOT_POOL_NAME:
            pool_kind = "BOOT POOL"
        else:
            pool_kind = "pool"

        # The role is appended in parentheses only when one is set.
        role_suffix = " (" + pool_role + ")" if pool_role else ""

        message = (
            f"Drive is part of {pool_kind} "
            f"'{pool_name}'{role_suffix}. "
            "Unlock required before burn-in."
        )
        super().__init__(message)
|
|
|
|
|
|
def is_unlocked(drive_id: int, current_pool_name: str | None,
                current_pool_role: str | None) -> bool:
    """Report whether the drive currently holds a usable unlock grant.

    A grant is usable only if it has not expired and the pool name and
    role stored on it equal ``current_pool_name`` and
    ``current_pool_role``. A grant that fails either check is removed
    from the registry as a side effect; an identity mismatch is also
    logged as a warning.

    Returns:
        True when a matching, unexpired grant exists; False otherwise.
    """
    entry = _unlock_grants.get(drive_id)
    if entry is None:
        return False

    # Expired grants are reaped on first sight.
    if entry.expiry <= _time.time():
        _unlock_grants.pop(drive_id, None)
        return False

    granted_for = (entry.pool_name, entry.pool_role)
    seen_now = (current_pool_name, current_pool_role)
    if granted_for == seen_now:
        return True

    # The pool identity differs from what the operator confirmed — the
    # drive may now belong to a different (or live) pool. Drop the grant
    # so the operator has to unlock again against the current state.
    _unlock_grants.pop(drive_id, None)
    log.warning(
        "Invalidating unlock grant for drive_id=%d: pool changed from "
        "(%s, %s) to (%s, %s)",
        drive_id, entry.pool_name, entry.pool_role,
        current_pool_name, current_pool_role,
    )
    return False
|
|
|
|
|
|
def unlock_expiry(drive_id: int, current_pool_name: str | None,
                  current_pool_role: str | None) -> float | None:
    """Return the absolute expiry of an active grant, or None.

    Same identity-binding semantics as is_unlocked: a grant whose stored
    pool identity no longer matches the current row is treated as expired
    and reaped. This is what the dashboard reads to decide whether to show
    the unlocked-Burn-In affordance vs the locked-Unlock affordance.

    Args:
        drive_id: Key of the drive in the grant registry.
        current_pool_name: Pool name the drive reports right now.
        current_pool_role: Pool role the drive reports right now.

    Returns:
        The grant's expiry timestamp while it is still valid, else None.
        Expired or identity-mismatched grants are removed as a side effect.
    """
    grant = _unlock_grants.get(drive_id)
    if grant is None:
        return None

    if _time.time() >= grant.expiry:
        _unlock_grants.pop(drive_id, None)
        return None

    if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
        _unlock_grants.pop(drive_id, None)
        # Log the invalidation exactly as is_unlocked does. Whichever of
        # the two functions sees the mismatch first removes the grant, so
        # without this warning a grant reaped here would leave no trace.
        log.warning(
            "Invalidating unlock grant for drive_id=%d: pool changed from "
            "(%s, %s) to (%s, %s)",
            drive_id, grant.pool_name, grant.pool_role,
            current_pool_name, current_pool_role,
        )
        return None

    return grant.expiry
|
|
|
|
|
|
def invalidate_grant(drive_id: int) -> None:
    """Remove any grant stored for ``drive_id``, with no conditions.

    Intended for start_job, for the case where a fresh SSH-side pool
    check shows the drive's identity has shifted. Does nothing when the
    drive has no grant.
    """
    # pop() with a default, so a missing key is not an error.
    _unlock_grants.pop(drive_id, None)
|
|
|
|
|
|
async def grant_pool_unlock(drive_id: int, confirm_token: str,
                            operator: str, reason: str) -> float:
    """Check the operator's confirmation and arm a time-limited unlock.

    The drive's pool name and role are read from the ``drives`` table,
    the confirmation token is compared with the one required for that
    kind of lock, an ``audit_events`` row is written and committed, and
    only then is the in-memory grant stored.

    Args:
        drive_id: Primary key of the drive row.
        confirm_token: Phrase typed by the operator; compared after
            stripping surrounding whitespace.
        operator: Name recorded in the audit row; must not be blank.
        reason: Free-text justification; at least 5 characters once
            stripped.

    Returns:
        The unix timestamp at which the new grant expires.

    Raises:
        ValueError: if the reason is too short, the operator is blank,
            the drive does not exist, the drive has no pool name, or the
            confirmation token does not match.
    """
    if len((reason or "").strip()) < 5:
        raise ValueError("A reason of at least 5 characters is required.")
    if not (operator or "").strip():
        raise ValueError("Operator name is required.")

    async with aiosqlite.connect(settings.db_path) as conn:
        conn.row_factory = aiosqlite.Row
        await conn.execute("PRAGMA busy_timeout=10000")
        cursor = await conn.execute(
            "SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
            (drive_id,),
        )
        drive = await cursor.fetchone()
        if not drive:
            raise ValueError("Drive not found.")

        pool_name = drive["pool_name"]
        pool_role = drive["pool_role"]
        if not pool_name:
            raise ValueError(
                "This drive is not part of any pool — no unlock needed."
            )

        # Pick the required token and the audit event type together.
        # Boot-pool, exported and mounted drives each demand a dedicated,
        # harder-to-fat-finger phrase; any other pool drive just needs
        # its pool name typed. The boot-pool test goes first, so it wins
        # over the role checks.
        if pool_name == BOOT_POOL_NAME:
            expected = BOOT_POOL_CONFIRM_TOKEN
            evt = "boot_pool_drive_unlocked"
        elif pool_role == EXPORTED_POOL_ROLE:
            expected = EXPORTED_CONFIRM_TOKEN
            evt = "exported_pool_drive_unlocked"
        elif pool_role == MOUNTED_ROLE:
            expected = MOUNTED_CONFIRM_TOKEN
            evt = "mounted_drive_unlocked"
        else:
            expected = pool_name
            evt = "pool_drive_unlocked"

        if (confirm_token or "").strip() != expected:
            raise ValueError("Confirmation token does not match.")

        audit_message = (
            f"Unlocked {pool_name} drive {drive['devname']} for burn-in: {reason.strip()}"
        )
        await conn.execute(
            """INSERT INTO audit_events
               (event_type, drive_id, burnin_job_id, operator, message)
               VALUES (?,?,?,?,?)""",
            (evt, drive_id, None, operator.strip(), audit_message),
        )
        await conn.commit()

    # The grant is armed only once the audit row has been committed: if
    # the commit raises, control never reaches this point, so there is
    # never an active unlock without an audit record. The grant carries
    # the (pool_name, pool_role) read above, so a later poll that
    # reclassifies the drive invalidates it (see is_unlocked).
    expiry = _time.time() + UNLOCK_TTL_SECONDS
    _unlock_grants[drive_id] = _UnlockGrant(
        expiry=expiry,
        pool_name=pool_name,
        pool_role=pool_role,
    )

    log.warning(
        "Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
        "operator=%s reason=%r",
        drive_id, pool_name, pool_role, operator, reason,
    )
    return expiry
|