refactor: split burnin.py into a package — extract unlock + kill (1.0.0-30)
First slice of the planned tech-debt cleanup. burnin.py was 1667 lines and growing; staged extraction gives smaller diffs to review and a clear bisect target if anything regresses. Mechanical move only — no behaviour change. The two extracted modules: * app/burnin/unlock.py — _UnlockGrant, _unlock_grants, PoolMemberError, is_unlocked / unlock_expiry / grant_pool_unlock, plus the four *_TOKEN constants and UNLOCK_TTL_SECONDS. Owns its module-level state; opens its own DB connection in grant_pool_unlock so it doesn't depend on the parent package's _db() helper. * app/burnin/kill.py — _remote_pids dict and the kill_remote_process / set_remote_pid / clear_remote_pid / get_remote_pid helpers. Pulled out of __init__.py so the asyncssh-ignores-signals workaround lives next to the state it operates on. app/burnin/__init__.py re-exports every public symbol the rest of the app imports — `from app import burnin; burnin.start_job(...)`, `burnin.PoolMemberError`, `burnin.UNLOCK_TTL_SECONDS`, etc. all keep working unchanged. Internal aliases `_remote_pids` and `_unlock_grants` on the package root point at the SAME dict objects in the submodules, so existing in-package mutations (set in stages, cleared in cleanup callbacks) work without rewrite. Test fix: tests/test_unlock_flow.py:test_expired_grant_returns_false monkey-patches UNLOCK_TTL_SECONDS. The package-root alias is bound at import time and won't propagate back to the submodule's read site, so the test now patches `app.burnin.unlock.UNLOCK_TTL_SECONDS` directly. Verification: 44/44 unit tests pass in container; /health 200; container boots clean. routes.py, mailer.py, poller.py untouched — the public API is identical. Future: extract stages, task, _common in subsequent versions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6c20e57fd8
commit
9cbae44495
5 changed files with 313 additions and 212 deletions
|
|
@ -72,11 +72,29 @@ _client: TrueNASClient | None = None
|
||||||
# cancel_job / check_stuck_jobs can actually unwedge a stuck task.
|
# cancel_job / check_stuck_jobs can actually unwedge a stuck task.
|
||||||
_active_tasks: dict[int, "asyncio.Task"] = {}
|
_active_tasks: dict[int, "asyncio.Task"] = {}
|
||||||
|
|
||||||
# Remote PID of any long-running SSH child process (currently only badblocks)
|
# Remote-PID kill machinery + pool-drive unlock state both live in their
|
||||||
# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends
|
# own submodules. We re-export the names the rest of the app reaches for
|
||||||
# a "signal" channel request that OpenSSH sshd ignores by default, leaving
|
# (and keep the _kill_remote_process / _is_unlocked aliases for callers
|
||||||
# the remote process running and proc.wait() hanging forever.
|
# that grew up before the split).
|
||||||
_remote_pids: dict[int, int] = {}
|
from . import kill as _kill # noqa: E402
|
||||||
|
from . import unlock as _unlock # noqa: E402
|
||||||
|
|
||||||
|
_remote_pids = _kill._remote_pids
|
||||||
|
_unlock_grants = _unlock._unlock_grants
|
||||||
|
|
||||||
|
PoolMemberError = _unlock.PoolMemberError
|
||||||
|
UNLOCK_TTL_SECONDS = _unlock.UNLOCK_TTL_SECONDS
|
||||||
|
BOOT_POOL_NAME = _unlock.BOOT_POOL_NAME
|
||||||
|
BOOT_POOL_CONFIRM_TOKEN = _unlock.BOOT_POOL_CONFIRM_TOKEN
|
||||||
|
EXPORTED_POOL_ROLE = _unlock.EXPORTED_POOL_ROLE
|
||||||
|
EXPORTED_CONFIRM_TOKEN = _unlock.EXPORTED_CONFIRM_TOKEN
|
||||||
|
MOUNTED_ROLE = _unlock.MOUNTED_ROLE
|
||||||
|
MOUNTED_CONFIRM_TOKEN = _unlock.MOUNTED_CONFIRM_TOKEN
|
||||||
|
|
||||||
|
unlock_expiry = _unlock.unlock_expiry
|
||||||
|
grant_pool_unlock = _unlock.grant_pool_unlock
|
||||||
|
_is_unlocked = _unlock.is_unlocked # legacy private name
|
||||||
|
_kill_remote_process = _kill.kill_remote_process
|
||||||
|
|
||||||
|
|
||||||
def _now() -> str:
|
def _now() -> str:
|
||||||
|
|
@ -145,208 +163,8 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
|
||||||
return task
|
return task
|
||||||
|
|
||||||
|
|
||||||
async def _kill_remote_process(job_id: int) -> None:
|
# _kill_remote_process is re-exported above from .kill — the original
|
||||||
"""Send kill -9 to the remote PID associated with this job, if any.
|
# definition was extracted to app/burnin/kill.py in 1.0.0-30.
|
||||||
|
|
||||||
asyncssh's proc.kill() sends an SSH 'signal' channel request which
|
|
||||||
OpenSSH's sshd does not honor by default. Opening a fresh session and
|
|
||||||
running /bin/kill is the reliable way to actually terminate the process.
|
|
||||||
"""
|
|
||||||
pid = _remote_pids.pop(job_id, None)
|
|
||||||
if not pid:
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
from app import ssh_client
|
|
||||||
async with await ssh_client._connect() as conn:
|
|
||||||
await asyncio.wait_for(
|
|
||||||
conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False),
|
|
||||||
timeout=10,
|
|
||||||
)
|
|
||||||
log.info("Remote-killed PID %d for job %d", pid, job_id)
|
|
||||||
except Exception as exc:
|
|
||||||
log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Pool-drive unlock state
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
#
|
|
||||||
# Drives that ZFS reports as belonging to an active zpool (including the
|
|
||||||
# boot pool) are locked from burn-in until the operator explicitly unlocks
|
|
||||||
# them via POST /api/v1/drives/{id}/unlock. Grants live in memory only —
|
|
||||||
# a container restart wipes them, which is the right default for "this is
|
|
||||||
# very dangerous." TTL is bounded so an unlock you forgot about can't sit
|
|
||||||
# armed indefinitely.
|
|
||||||
|
|
||||||
import time as _time
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
UNLOCK_TTL_SECONDS = 600 # 10 minutes
|
|
||||||
BOOT_POOL_NAME = "boot-pool"
|
|
||||||
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
|
|
||||||
EXPORTED_POOL_ROLE = "exported"
|
|
||||||
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
|
|
||||||
MOUNTED_ROLE = "mounted"
|
|
||||||
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class _UnlockGrant:
|
|
||||||
"""An operator-issued, time-bounded permission to burn-in a pool drive.
|
|
||||||
|
|
||||||
The grant is BOUND to the (pool_name, pool_role) observed at unlock
|
|
||||||
time. If a subsequent poll reclassifies the drive — e.g. it was
|
|
||||||
"(exported)" when unlocked but is now in active pool "tank", or it
|
|
||||||
used to be a cache vdev and now shows as data — the grant is
|
|
||||||
invalidated. Otherwise the operator's "I confirm this exported drive
|
|
||||||
is decommissioned" judgement would silently authorise destruction
|
|
||||||
of a live pool.
|
|
||||||
"""
|
|
||||||
expiry: float
|
|
||||||
pool_name: str
|
|
||||||
pool_role: str | None
|
|
||||||
|
|
||||||
|
|
||||||
_unlock_grants: dict[int, _UnlockGrant] = {}
|
|
||||||
|
|
||||||
|
|
||||||
class PoolMemberError(Exception):
|
|
||||||
"""Raised by start_job when a drive is in a zpool and not unlocked."""
|
|
||||||
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
|
|
||||||
self.drive_id = drive_id
|
|
||||||
self.pool_name = pool_name
|
|
||||||
self.pool_role = pool_role
|
|
||||||
is_boot = pool_name == BOOT_POOL_NAME
|
|
||||||
super().__init__(
|
|
||||||
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
|
|
||||||
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
|
|
||||||
f"Unlock required before burn-in."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_unlocked(drive_id: int, current_pool_name: str | None,
|
|
||||||
current_pool_role: str | None) -> bool:
|
|
||||||
"""True iff a non-expired grant exists AND the drive's pool identity
|
|
||||||
matches what was observed at unlock time."""
|
|
||||||
grant = _unlock_grants.get(drive_id)
|
|
||||||
if grant is None:
|
|
||||||
return False
|
|
||||||
if _time.time() >= grant.expiry:
|
|
||||||
_unlock_grants.pop(drive_id, None)
|
|
||||||
return False
|
|
||||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
|
||||||
# Pool identity changed since unlock — drive may now belong to a
|
|
||||||
# different (or live) pool. Invalidate the grant; operator must
|
|
||||||
# re-unlock with eyes-open against the current state.
|
|
||||||
_unlock_grants.pop(drive_id, None)
|
|
||||||
log.warning(
|
|
||||||
"Invalidating unlock grant for drive_id=%d: pool changed from "
|
|
||||||
"(%s, %s) to (%s, %s)",
|
|
||||||
drive_id, grant.pool_name, grant.pool_role,
|
|
||||||
current_pool_name, current_pool_role,
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def unlock_expiry(drive_id: int, current_pool_name: str | None,
|
|
||||||
current_pool_role: str | None) -> float | None:
|
|
||||||
"""Return the absolute expiry of an active grant, or None.
|
|
||||||
|
|
||||||
Same identity-binding semantics as _is_unlocked: a grant whose stored
|
|
||||||
pool identity no longer matches the current row is treated as expired
|
|
||||||
and reaped. This is what the dashboard reads to decide whether to show
|
|
||||||
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
|
|
||||||
"""
|
|
||||||
grant = _unlock_grants.get(drive_id)
|
|
||||||
if grant is None:
|
|
||||||
return None
|
|
||||||
if _time.time() >= grant.expiry:
|
|
||||||
_unlock_grants.pop(drive_id, None)
|
|
||||||
return None
|
|
||||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
|
||||||
_unlock_grants.pop(drive_id, None)
|
|
||||||
return None
|
|
||||||
return grant.expiry
|
|
||||||
|
|
||||||
|
|
||||||
async def grant_pool_unlock(drive_id: int, confirm_token: str,
|
|
||||||
operator: str, reason: str) -> float:
|
|
||||||
"""Validate confirmation token + reason and grant a time-limited unlock.
|
|
||||||
|
|
||||||
Raises ValueError on bad confirm_token, missing reason, or drive not
|
|
||||||
actually in a pool. Returns the unix expiry timestamp on success.
|
|
||||||
"""
|
|
||||||
if not reason or len(reason.strip()) < 5:
|
|
||||||
raise ValueError("A reason of at least 5 characters is required.")
|
|
||||||
if not operator or not operator.strip():
|
|
||||||
raise ValueError("Operator name is required.")
|
|
||||||
|
|
||||||
async with _db() as db:
|
|
||||||
db.row_factory = aiosqlite.Row
|
|
||||||
cur = await db.execute(
|
|
||||||
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
|
|
||||||
(drive_id,),
|
|
||||||
)
|
|
||||||
row = await cur.fetchone()
|
|
||||||
if not row:
|
|
||||||
raise ValueError("Drive not found.")
|
|
||||||
pool_name = row["pool_name"]
|
|
||||||
pool_role = row["pool_role"]
|
|
||||||
if not pool_name:
|
|
||||||
raise ValueError(
|
|
||||||
"This drive is not part of any pool — no unlock needed."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
|
|
||||||
# fat-finger tokens. Active data pools just need their pool name
|
|
||||||
# typed.
|
|
||||||
if pool_name == BOOT_POOL_NAME:
|
|
||||||
expected = BOOT_POOL_CONFIRM_TOKEN
|
|
||||||
elif pool_role == EXPORTED_POOL_ROLE:
|
|
||||||
expected = EXPORTED_CONFIRM_TOKEN
|
|
||||||
elif pool_role == MOUNTED_ROLE:
|
|
||||||
expected = MOUNTED_CONFIRM_TOKEN
|
|
||||||
else:
|
|
||||||
expected = pool_name
|
|
||||||
if (confirm_token or "").strip() != expected:
|
|
||||||
raise ValueError("Confirmation token does not match.")
|
|
||||||
|
|
||||||
if pool_name == BOOT_POOL_NAME:
|
|
||||||
evt = "boot_pool_drive_unlocked"
|
|
||||||
elif pool_role == EXPORTED_POOL_ROLE:
|
|
||||||
evt = "exported_pool_drive_unlocked"
|
|
||||||
elif pool_role == MOUNTED_ROLE:
|
|
||||||
evt = "mounted_drive_unlocked"
|
|
||||||
else:
|
|
||||||
evt = "pool_drive_unlocked"
|
|
||||||
await db.execute(
|
|
||||||
"""INSERT INTO audit_events
|
|
||||||
(event_type, drive_id, burnin_job_id, operator, message)
|
|
||||||
VALUES (?,?,?,?,?)""",
|
|
||||||
(evt, drive_id, None, operator.strip(),
|
|
||||||
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
|
|
||||||
)
|
|
||||||
await db.commit()
|
|
||||||
|
|
||||||
# Arm the in-memory grant ONLY after the audit row is durable. If the
|
|
||||||
# commit above raises, we exit without writing _unlock_grants — no
|
|
||||||
# unaudited active unlocks. The grant is bound to the (pool_name,
|
|
||||||
# pool_role) we observed under the open transaction so a later poll
|
|
||||||
# that reclassifies the drive invalidates it (see _is_unlocked).
|
|
||||||
expiry = _time.time() + UNLOCK_TTL_SECONDS
|
|
||||||
_unlock_grants[drive_id] = _UnlockGrant(
|
|
||||||
expiry=expiry,
|
|
||||||
pool_name=pool_name,
|
|
||||||
pool_role=pool_role,
|
|
||||||
)
|
|
||||||
|
|
||||||
log.warning(
|
|
||||||
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
|
|
||||||
"operator=%s reason=%r",
|
|
||||||
drive_id, pool_name, pool_role, operator, reason,
|
|
||||||
)
|
|
||||||
return expiry
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
71
app/burnin/kill.py
Normal file
71
app/burnin/kill.py
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
"""Remote process kill machinery.
|
||||||
|
|
||||||
|
asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
|
||||||
|
OpenSSH's sshd ignores by default — the remote process keeps running and
|
||||||
|
``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.
|
||||||
|
The fix: capture the remote PID at command launch (via the
|
||||||
|
``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
|
||||||
|
over a fresh SSH session when we need to abort. This module owns that
|
||||||
|
state and the kill helper.
|
||||||
|
|
||||||
|
Public surface (used by the rest of the burnin package):
|
||||||
|
set_remote_pid(job_id, pid) — call from the stage when launch succeeds
|
||||||
|
clear_remote_pid(job_id) — call from the cleanup callback
|
||||||
|
kill_remote_process(job_id) — fire-and-clear; safe to call repeatedly
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# job_id -> remote PID. Module-level dict so it survives across the
|
||||||
|
# stage / task / __init__ split without needing to thread it through
|
||||||
|
# function signatures.
|
||||||
|
_remote_pids: dict[int, int] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def set_remote_pid(job_id: int, pid: int) -> None:
|
||||||
|
"""Record the remote PID captured by the running stage."""
|
||||||
|
_remote_pids[job_id] = pid
|
||||||
|
|
||||||
|
|
||||||
|
def clear_remote_pid(job_id: int) -> None:
|
||||||
|
"""Drop the PID without trying to kill — used by the task cleanup
|
||||||
|
callback so a normally-completed job doesn't carry stale state."""
|
||||||
|
_remote_pids.pop(job_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_remote_pid(job_id: int) -> int | None:
|
||||||
|
return _remote_pids.get(job_id)
|
||||||
|
|
||||||
|
|
||||||
|
async def kill_remote_process(job_id: int) -> None:
|
||||||
|
"""Send kill -9 to the remote PID associated with this job, if any.
|
||||||
|
|
||||||
|
Idempotent — pops the PID before attempting the kill so a second
|
||||||
|
call is a no-op. SSH connection failure is logged but never raised
|
||||||
|
(we'd rather best-effort-kill than block the cancel path).
|
||||||
|
"""
|
||||||
|
pid = _remote_pids.pop(job_id, None)
|
||||||
|
if not pid:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
# Local import to avoid pulling asyncssh into module load if
|
||||||
|
# this helper is never used (tests, mock mode).
|
||||||
|
from app import ssh_client
|
||||||
|
async with await ssh_client._connect() as conn:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
conn.run(
|
||||||
|
f"kill -9 {pid} 2>/dev/null || true", check=False,
|
||||||
|
),
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
log.info("Remote-killed PID %d for job %d", pid, job_id)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning(
|
||||||
|
"Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc,
|
||||||
|
)
|
||||||
209
app/burnin/unlock.py
Normal file
209
app/burnin/unlock.py
Normal file
|
|
@ -0,0 +1,209 @@
|
||||||
|
"""Pool-drive unlock state.
|
||||||
|
|
||||||
|
Drives that ZFS reports as belonging to an active zpool (including the
|
||||||
|
boot pool), drives carrying ZFS labels from a previously-imported pool
|
||||||
|
("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
|
||||||
|
all locked from burn-in until the operator explicitly unlocks them via
|
||||||
|
``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a
|
||||||
|
container restart wipes them, which is the right default for "this is
|
||||||
|
very dangerous." TTL is bounded so an unlock you forgot about can't sit
|
||||||
|
armed indefinitely.
|
||||||
|
|
||||||
|
Each lock kind has its own confirm token to make the override
|
||||||
|
deliberate; see grant_pool_unlock for the matching logic.
|
||||||
|
|
||||||
|
Public surface:
|
||||||
|
is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
|
||||||
|
unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
|
||||||
|
grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
|
||||||
|
PoolMemberError — raised by start_job
|
||||||
|
UNLOCK_TTL_SECONDS — for the unlock endpoint response
|
||||||
|
BOOT_POOL_NAME / *_TOKEN consts — for the UI / audit
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time as _time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import aiosqlite
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
UNLOCK_TTL_SECONDS = 600 # 10 minutes
|
||||||
|
BOOT_POOL_NAME = "boot-pool"
|
||||||
|
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
|
||||||
|
EXPORTED_POOL_ROLE = "exported"
|
||||||
|
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
|
||||||
|
MOUNTED_ROLE = "mounted"
|
||||||
|
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _UnlockGrant:
|
||||||
|
"""An operator-issued, time-bounded permission to burn-in a pool drive.
|
||||||
|
|
||||||
|
The grant is BOUND to the (pool_name, pool_role) observed at unlock
|
||||||
|
time. If a subsequent poll reclassifies the drive — e.g. it was
|
||||||
|
"(exported)" when unlocked but is now in active pool "tank", or it
|
||||||
|
used to be a cache vdev and now shows as data — the grant is
|
||||||
|
invalidated. Otherwise the operator's "I confirm this exported drive
|
||||||
|
is decommissioned" judgement would silently authorise destruction
|
||||||
|
of a live pool.
|
||||||
|
"""
|
||||||
|
expiry: float
|
||||||
|
pool_name: str
|
||||||
|
pool_role: str | None
|
||||||
|
|
||||||
|
|
||||||
|
_unlock_grants: dict[int, _UnlockGrant] = {}
|
||||||
|
|
||||||
|
|
||||||
|
class PoolMemberError(Exception):
|
||||||
|
"""Raised by start_job when a drive is in a zpool and not unlocked."""
|
||||||
|
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
|
||||||
|
self.drive_id = drive_id
|
||||||
|
self.pool_name = pool_name
|
||||||
|
self.pool_role = pool_role
|
||||||
|
is_boot = pool_name == BOOT_POOL_NAME
|
||||||
|
super().__init__(
|
||||||
|
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
|
||||||
|
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
|
||||||
|
f"Unlock required before burn-in."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def is_unlocked(drive_id: int, current_pool_name: str | None,
|
||||||
|
current_pool_role: str | None) -> bool:
|
||||||
|
"""True iff a non-expired grant exists AND the drive's pool identity
|
||||||
|
matches what was observed at unlock time."""
|
||||||
|
grant = _unlock_grants.get(drive_id)
|
||||||
|
if grant is None:
|
||||||
|
return False
|
||||||
|
if _time.time() >= grant.expiry:
|
||||||
|
_unlock_grants.pop(drive_id, None)
|
||||||
|
return False
|
||||||
|
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||||
|
# Pool identity changed since unlock — drive may now belong to a
|
||||||
|
# different (or live) pool. Invalidate the grant; operator must
|
||||||
|
# re-unlock with eyes-open against the current state.
|
||||||
|
_unlock_grants.pop(drive_id, None)
|
||||||
|
log.warning(
|
||||||
|
"Invalidating unlock grant for drive_id=%d: pool changed from "
|
||||||
|
"(%s, %s) to (%s, %s)",
|
||||||
|
drive_id, grant.pool_name, grant.pool_role,
|
||||||
|
current_pool_name, current_pool_role,
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def unlock_expiry(drive_id: int, current_pool_name: str | None,
|
||||||
|
current_pool_role: str | None) -> float | None:
|
||||||
|
"""Return the absolute expiry of an active grant, or None.
|
||||||
|
|
||||||
|
Same identity-binding semantics as is_unlocked: a grant whose stored
|
||||||
|
pool identity no longer matches the current row is treated as expired
|
||||||
|
and reaped. This is what the dashboard reads to decide whether to show
|
||||||
|
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
|
||||||
|
"""
|
||||||
|
grant = _unlock_grants.get(drive_id)
|
||||||
|
if grant is None:
|
||||||
|
return None
|
||||||
|
if _time.time() >= grant.expiry:
|
||||||
|
_unlock_grants.pop(drive_id, None)
|
||||||
|
return None
|
||||||
|
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||||
|
_unlock_grants.pop(drive_id, None)
|
||||||
|
return None
|
||||||
|
return grant.expiry
|
||||||
|
|
||||||
|
|
||||||
|
def invalidate_grant(drive_id: int) -> None:
|
||||||
|
"""Drop a grant unconditionally — used by start_job when a fresh
|
||||||
|
SSH-side pool check shows the drive's identity has shifted."""
|
||||||
|
_unlock_grants.pop(drive_id, None)
|
||||||
|
|
||||||
|
|
||||||
|
async def grant_pool_unlock(drive_id: int, confirm_token: str,
|
||||||
|
operator: str, reason: str) -> float:
|
||||||
|
"""Validate confirmation token + reason and grant a time-limited unlock.
|
||||||
|
|
||||||
|
Raises ValueError on bad confirm_token, missing reason, or drive not
|
||||||
|
actually in a pool. Returns the unix expiry timestamp on success.
|
||||||
|
"""
|
||||||
|
if not reason or len(reason.strip()) < 5:
|
||||||
|
raise ValueError("A reason of at least 5 characters is required.")
|
||||||
|
if not operator or not operator.strip():
|
||||||
|
raise ValueError("Operator name is required.")
|
||||||
|
|
||||||
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
|
db.row_factory = aiosqlite.Row
|
||||||
|
await db.execute("PRAGMA busy_timeout=10000")
|
||||||
|
cur = await db.execute(
|
||||||
|
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
row = await cur.fetchone()
|
||||||
|
if not row:
|
||||||
|
raise ValueError("Drive not found.")
|
||||||
|
pool_name = row["pool_name"]
|
||||||
|
pool_role = row["pool_role"]
|
||||||
|
if not pool_name:
|
||||||
|
raise ValueError(
|
||||||
|
"This drive is not part of any pool — no unlock needed."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
|
||||||
|
# fat-finger tokens. Active data pools just need their pool name
|
||||||
|
# typed.
|
||||||
|
if pool_name == BOOT_POOL_NAME:
|
||||||
|
expected = BOOT_POOL_CONFIRM_TOKEN
|
||||||
|
elif pool_role == EXPORTED_POOL_ROLE:
|
||||||
|
expected = EXPORTED_CONFIRM_TOKEN
|
||||||
|
elif pool_role == MOUNTED_ROLE:
|
||||||
|
expected = MOUNTED_CONFIRM_TOKEN
|
||||||
|
else:
|
||||||
|
expected = pool_name
|
||||||
|
if (confirm_token or "").strip() != expected:
|
||||||
|
raise ValueError("Confirmation token does not match.")
|
||||||
|
|
||||||
|
if pool_name == BOOT_POOL_NAME:
|
||||||
|
evt = "boot_pool_drive_unlocked"
|
||||||
|
elif pool_role == EXPORTED_POOL_ROLE:
|
||||||
|
evt = "exported_pool_drive_unlocked"
|
||||||
|
elif pool_role == MOUNTED_ROLE:
|
||||||
|
evt = "mounted_drive_unlocked"
|
||||||
|
else:
|
||||||
|
evt = "pool_drive_unlocked"
|
||||||
|
await db.execute(
|
||||||
|
"""INSERT INTO audit_events
|
||||||
|
(event_type, drive_id, burnin_job_id, operator, message)
|
||||||
|
VALUES (?,?,?,?,?)""",
|
||||||
|
(evt, drive_id, None, operator.strip(),
|
||||||
|
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
# Arm the in-memory grant ONLY after the audit row is durable. If the
|
||||||
|
# commit above raises, we exit without writing _unlock_grants — no
|
||||||
|
# unaudited active unlocks. The grant is bound to the (pool_name,
|
||||||
|
# pool_role) we observed under the open transaction so a later poll
|
||||||
|
# that reclassifies the drive invalidates it (see is_unlocked).
|
||||||
|
expiry = _time.time() + UNLOCK_TTL_SECONDS
|
||||||
|
_unlock_grants[drive_id] = _UnlockGrant(
|
||||||
|
expiry=expiry,
|
||||||
|
pool_name=pool_name,
|
||||||
|
pool_role=pool_role,
|
||||||
|
)
|
||||||
|
|
||||||
|
log.warning(
|
||||||
|
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
|
||||||
|
"operator=%s reason=%r",
|
||||||
|
drive_id, pool_name, pool_role, operator, reason,
|
||||||
|
)
|
||||||
|
return expiry
|
||||||
|
|
@ -83,7 +83,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-29"
|
app_version: str = "1.0.0-30"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
|
|
@ -169,15 +169,18 @@ class TestUnlockFlow(unittest.IsolatedAsyncioTestCase):
|
||||||
|
|
||||||
async def test_expired_grant_returns_false(self):
|
async def test_expired_grant_returns_false(self):
|
||||||
from app import burnin
|
from app import burnin
|
||||||
# Drop TTL to 0 so the grant is born expired.
|
from app.burnin import unlock as _unlock
|
||||||
original = burnin.UNLOCK_TTL_SECONDS
|
# Drop TTL to 0 so the grant is born expired. Monkey-patch the
|
||||||
burnin.UNLOCK_TTL_SECONDS = 0
|
# real source-of-truth in app.burnin.unlock — the alias on the
|
||||||
|
# package root is bound at import time and won't propagate back.
|
||||||
|
original = _unlock.UNLOCK_TTL_SECONDS
|
||||||
|
_unlock.UNLOCK_TTL_SECONDS = 0
|
||||||
try:
|
try:
|
||||||
await burnin.grant_pool_unlock(1, "tank", "op", "valid reason")
|
await burnin.grant_pool_unlock(1, "tank", "op", "valid reason")
|
||||||
self.assertFalse(burnin._is_unlocked(1, "tank", "data"))
|
self.assertFalse(burnin._is_unlocked(1, "tank", "data"))
|
||||||
self.assertNotIn(1, burnin._unlock_grants)
|
self.assertNotIn(1, burnin._unlock_grants)
|
||||||
finally:
|
finally:
|
||||||
burnin.UNLOCK_TTL_SECONDS = original
|
_unlock.UNLOCK_TTL_SECONDS = original
|
||||||
|
|
||||||
# ----- audit commit ordering (Codex finding #3) -----
|
# ----- audit commit ordering (Codex finding #3) -----
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue