refactor: split burnin.py into a package — extract unlock + kill (1.0.0-30)
First slice of the planned tech-debt cleanup. burnin.py was 1667 lines and growing; staged extraction gives smaller diffs to review and a clear bisect target if anything regresses. Mechanical move only — no behaviour change. The two extracted modules: * app/burnin/unlock.py — _UnlockGrant, _unlock_grants, PoolMemberError, is_unlocked / unlock_expiry / grant_pool_unlock, plus the four *_TOKEN constants and UNLOCK_TTL_SECONDS. Owns its module-level state; opens its own DB connection in grant_pool_unlock so it doesn't depend on the parent package's _db() helper. * app/burnin/kill.py — _remote_pids dict and the kill_remote_process / set_remote_pid / clear_remote_pid / get_remote_pid helpers. Pulled out of __init__.py so the asyncssh-ignores-signals workaround lives next to the state it operates on. app/burnin/__init__.py re-exports every public symbol the rest of the app imports — `from app import burnin; burnin.start_job(...)`, `burnin.PoolMemberError`, `burnin.UNLOCK_TTL_SECONDS`, etc. all keep working unchanged. Internal aliases `_remote_pids` and `_unlock_grants` on the package root point at the SAME dict objects in the submodules, so existing in-package mutations (set in stages, cleared in cleanup callbacks) work without rewrite. Test fix: tests/test_unlock_flow.py:test_expired_grant_returns_false monkey-patches UNLOCK_TTL_SECONDS. The package-root alias is bound at import time and won't propagate back to the submodule's read site, so the test now patches `app.burnin.unlock.UNLOCK_TTL_SECONDS` directly. Verification: 44/44 unit tests pass in container; /health 200; container boots clean. routes.py, mailer.py, poller.py untouched — the public API is identical. Future: extract stages, task, _common in subsequent versions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6c20e57fd8
commit
9cbae44495
5 changed files with 313 additions and 212 deletions
|
|
@ -72,11 +72,29 @@ _client: TrueNASClient | None = None
|
|||
# cancel_job / check_stuck_jobs can actually unwedge a stuck task.
|
||||
_active_tasks: dict[int, "asyncio.Task"] = {}
|
||||
|
||||
# Remote PID of any long-running SSH child process (currently only badblocks)
|
||||
# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends
|
||||
# a "signal" channel request that OpenSSH sshd ignores by default, leaving
|
||||
# the remote process running and proc.wait() hanging forever.
|
||||
_remote_pids: dict[int, int] = {}
|
||||
# Remote-PID kill machinery + pool-drive unlock state both live in their
|
||||
# own submodules. We re-export the names the rest of the app reaches for
|
||||
# (and keep the _kill_remote_process / _is_unlocked aliases for callers
|
||||
# that grew up before the split).
|
||||
from . import kill as _kill # noqa: E402
|
||||
from . import unlock as _unlock # noqa: E402
|
||||
|
||||
_remote_pids = _kill._remote_pids
|
||||
_unlock_grants = _unlock._unlock_grants
|
||||
|
||||
PoolMemberError = _unlock.PoolMemberError
|
||||
UNLOCK_TTL_SECONDS = _unlock.UNLOCK_TTL_SECONDS
|
||||
BOOT_POOL_NAME = _unlock.BOOT_POOL_NAME
|
||||
BOOT_POOL_CONFIRM_TOKEN = _unlock.BOOT_POOL_CONFIRM_TOKEN
|
||||
EXPORTED_POOL_ROLE = _unlock.EXPORTED_POOL_ROLE
|
||||
EXPORTED_CONFIRM_TOKEN = _unlock.EXPORTED_CONFIRM_TOKEN
|
||||
MOUNTED_ROLE = _unlock.MOUNTED_ROLE
|
||||
MOUNTED_CONFIRM_TOKEN = _unlock.MOUNTED_CONFIRM_TOKEN
|
||||
|
||||
unlock_expiry = _unlock.unlock_expiry
|
||||
grant_pool_unlock = _unlock.grant_pool_unlock
|
||||
_is_unlocked = _unlock.is_unlocked # legacy private name
|
||||
_kill_remote_process = _kill.kill_remote_process
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
|
|
@ -145,208 +163,8 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
|
|||
return task
|
||||
|
||||
|
||||
async def _kill_remote_process(job_id: int) -> None:
|
||||
"""Send kill -9 to the remote PID associated with this job, if any.
|
||||
|
||||
asyncssh's proc.kill() sends an SSH 'signal' channel request which
|
||||
OpenSSH's sshd does not honor by default. Opening a fresh session and
|
||||
running /bin/kill is the reliable way to actually terminate the process.
|
||||
"""
|
||||
pid = _remote_pids.pop(job_id, None)
|
||||
if not pid:
|
||||
return
|
||||
try:
|
||||
from app import ssh_client
|
||||
async with await ssh_client._connect() as conn:
|
||||
await asyncio.wait_for(
|
||||
conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False),
|
||||
timeout=10,
|
||||
)
|
||||
log.info("Remote-killed PID %d for job %d", pid, job_id)
|
||||
except Exception as exc:
|
||||
log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pool-drive unlock state
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Drives that ZFS reports as belonging to an active zpool (including the
|
||||
# boot pool) are locked from burn-in until the operator explicitly unlocks
|
||||
# them via POST /api/v1/drives/{id}/unlock. Grants live in memory only —
|
||||
# a container restart wipes them, which is the right default for "this is
|
||||
# very dangerous." TTL is bounded so an unlock you forgot about can't sit
|
||||
# armed indefinitely.
|
||||
|
||||
import time as _time
|
||||
from dataclasses import dataclass
|
||||
|
||||
UNLOCK_TTL_SECONDS = 600 # 10 minutes
|
||||
BOOT_POOL_NAME = "boot-pool"
|
||||
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
|
||||
EXPORTED_POOL_ROLE = "exported"
|
||||
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
|
||||
MOUNTED_ROLE = "mounted"
|
||||
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class _UnlockGrant:
|
||||
"""An operator-issued, time-bounded permission to burn-in a pool drive.
|
||||
|
||||
The grant is BOUND to the (pool_name, pool_role) observed at unlock
|
||||
time. If a subsequent poll reclassifies the drive — e.g. it was
|
||||
"(exported)" when unlocked but is now in active pool "tank", or it
|
||||
used to be a cache vdev and now shows as data — the grant is
|
||||
invalidated. Otherwise the operator's "I confirm this exported drive
|
||||
is decommissioned" judgement would silently authorise destruction
|
||||
of a live pool.
|
||||
"""
|
||||
expiry: float
|
||||
pool_name: str
|
||||
pool_role: str | None
|
||||
|
||||
|
||||
_unlock_grants: dict[int, _UnlockGrant] = {}
|
||||
|
||||
|
||||
class PoolMemberError(Exception):
|
||||
"""Raised by start_job when a drive is in a zpool and not unlocked."""
|
||||
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
|
||||
self.drive_id = drive_id
|
||||
self.pool_name = pool_name
|
||||
self.pool_role = pool_role
|
||||
is_boot = pool_name == BOOT_POOL_NAME
|
||||
super().__init__(
|
||||
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
|
||||
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
|
||||
f"Unlock required before burn-in."
|
||||
)
|
||||
|
||||
|
||||
def _is_unlocked(drive_id: int, current_pool_name: str | None,
|
||||
current_pool_role: str | None) -> bool:
|
||||
"""True iff a non-expired grant exists AND the drive's pool identity
|
||||
matches what was observed at unlock time."""
|
||||
grant = _unlock_grants.get(drive_id)
|
||||
if grant is None:
|
||||
return False
|
||||
if _time.time() >= grant.expiry:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return False
|
||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||
# Pool identity changed since unlock — drive may now belong to a
|
||||
# different (or live) pool. Invalidate the grant; operator must
|
||||
# re-unlock with eyes-open against the current state.
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
log.warning(
|
||||
"Invalidating unlock grant for drive_id=%d: pool changed from "
|
||||
"(%s, %s) to (%s, %s)",
|
||||
drive_id, grant.pool_name, grant.pool_role,
|
||||
current_pool_name, current_pool_role,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def unlock_expiry(drive_id: int, current_pool_name: str | None,
|
||||
current_pool_role: str | None) -> float | None:
|
||||
"""Return the absolute expiry of an active grant, or None.
|
||||
|
||||
Same identity-binding semantics as _is_unlocked: a grant whose stored
|
||||
pool identity no longer matches the current row is treated as expired
|
||||
and reaped. This is what the dashboard reads to decide whether to show
|
||||
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
|
||||
"""
|
||||
grant = _unlock_grants.get(drive_id)
|
||||
if grant is None:
|
||||
return None
|
||||
if _time.time() >= grant.expiry:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return None
|
||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return None
|
||||
return grant.expiry
|
||||
|
||||
|
||||
async def grant_pool_unlock(drive_id: int, confirm_token: str,
|
||||
operator: str, reason: str) -> float:
|
||||
"""Validate confirmation token + reason and grant a time-limited unlock.
|
||||
|
||||
Raises ValueError on bad confirm_token, missing reason, or drive not
|
||||
actually in a pool. Returns the unix expiry timestamp on success.
|
||||
"""
|
||||
if not reason or len(reason.strip()) < 5:
|
||||
raise ValueError("A reason of at least 5 characters is required.")
|
||||
if not operator or not operator.strip():
|
||||
raise ValueError("Operator name is required.")
|
||||
|
||||
async with _db() as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
cur = await db.execute(
|
||||
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
|
||||
(drive_id,),
|
||||
)
|
||||
row = await cur.fetchone()
|
||||
if not row:
|
||||
raise ValueError("Drive not found.")
|
||||
pool_name = row["pool_name"]
|
||||
pool_role = row["pool_role"]
|
||||
if not pool_name:
|
||||
raise ValueError(
|
||||
"This drive is not part of any pool — no unlock needed."
|
||||
)
|
||||
|
||||
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
|
||||
# fat-finger tokens. Active data pools just need their pool name
|
||||
# typed.
|
||||
if pool_name == BOOT_POOL_NAME:
|
||||
expected = BOOT_POOL_CONFIRM_TOKEN
|
||||
elif pool_role == EXPORTED_POOL_ROLE:
|
||||
expected = EXPORTED_CONFIRM_TOKEN
|
||||
elif pool_role == MOUNTED_ROLE:
|
||||
expected = MOUNTED_CONFIRM_TOKEN
|
||||
else:
|
||||
expected = pool_name
|
||||
if (confirm_token or "").strip() != expected:
|
||||
raise ValueError("Confirmation token does not match.")
|
||||
|
||||
if pool_name == BOOT_POOL_NAME:
|
||||
evt = "boot_pool_drive_unlocked"
|
||||
elif pool_role == EXPORTED_POOL_ROLE:
|
||||
evt = "exported_pool_drive_unlocked"
|
||||
elif pool_role == MOUNTED_ROLE:
|
||||
evt = "mounted_drive_unlocked"
|
||||
else:
|
||||
evt = "pool_drive_unlocked"
|
||||
await db.execute(
|
||||
"""INSERT INTO audit_events
|
||||
(event_type, drive_id, burnin_job_id, operator, message)
|
||||
VALUES (?,?,?,?,?)""",
|
||||
(evt, drive_id, None, operator.strip(),
|
||||
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
# Arm the in-memory grant ONLY after the audit row is durable. If the
|
||||
# commit above raises, we exit without writing _unlock_grants — no
|
||||
# unaudited active unlocks. The grant is bound to the (pool_name,
|
||||
# pool_role) we observed under the open transaction so a later poll
|
||||
# that reclassifies the drive invalidates it (see _is_unlocked).
|
||||
expiry = _time.time() + UNLOCK_TTL_SECONDS
|
||||
_unlock_grants[drive_id] = _UnlockGrant(
|
||||
expiry=expiry,
|
||||
pool_name=pool_name,
|
||||
pool_role=pool_role,
|
||||
)
|
||||
|
||||
log.warning(
|
||||
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
|
||||
"operator=%s reason=%r",
|
||||
drive_id, pool_name, pool_role, operator, reason,
|
||||
)
|
||||
return expiry
|
||||
# _kill_remote_process is re-exported above from .kill — the original
|
||||
# definition was extracted to app/burnin/kill.py in 1.0.0-30.
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
71
app/burnin/kill.py
Normal file
71
app/burnin/kill.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""Remote process kill machinery.
|
||||
|
||||
asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
|
||||
OpenSSH's sshd ignores by default — the remote process keeps running and
|
||||
``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.
|
||||
The fix: capture the remote PID at command launch (via the
|
||||
``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
|
||||
over a fresh SSH session when we need to abort. This module owns that
|
||||
state and the kill helper.
|
||||
|
||||
Public surface (used by the rest of the burnin package):
|
||||
set_remote_pid(job_id, pid) — call from the stage when launch succeeds
|
||||
clear_remote_pid(job_id) — call from the cleanup callback
|
||||
kill_remote_process(job_id) — fire-and-clear; safe to call repeatedly
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# job_id -> remote PID. Module-level dict so it survives across the
|
||||
# stage / task / __init__ split without needing to thread it through
|
||||
# function signatures.
|
||||
_remote_pids: dict[int, int] = {}
|
||||
|
||||
|
||||
def set_remote_pid(job_id: int, pid: int) -> None:
|
||||
"""Record the remote PID captured by the running stage."""
|
||||
_remote_pids[job_id] = pid
|
||||
|
||||
|
||||
def clear_remote_pid(job_id: int) -> None:
|
||||
"""Drop the PID without trying to kill — used by the task cleanup
|
||||
callback so a normally-completed job doesn't carry stale state."""
|
||||
_remote_pids.pop(job_id, None)
|
||||
|
||||
|
||||
def get_remote_pid(job_id: int) -> int | None:
|
||||
return _remote_pids.get(job_id)
|
||||
|
||||
|
||||
async def kill_remote_process(job_id: int) -> None:
|
||||
"""Send kill -9 to the remote PID associated with this job, if any.
|
||||
|
||||
Idempotent — pops the PID before attempting the kill so a second
|
||||
call is a no-op. SSH connection failure is logged but never raised
|
||||
(we'd rather best-effort-kill than block the cancel path).
|
||||
"""
|
||||
pid = _remote_pids.pop(job_id, None)
|
||||
if not pid:
|
||||
return
|
||||
try:
|
||||
# Local import to avoid pulling asyncssh into module load if
|
||||
# this helper is never used (tests, mock mode).
|
||||
from app import ssh_client
|
||||
async with await ssh_client._connect() as conn:
|
||||
await asyncio.wait_for(
|
||||
conn.run(
|
||||
f"kill -9 {pid} 2>/dev/null || true", check=False,
|
||||
),
|
||||
timeout=10,
|
||||
)
|
||||
log.info("Remote-killed PID %d for job %d", pid, job_id)
|
||||
except Exception as exc:
|
||||
log.warning(
|
||||
"Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc,
|
||||
)
|
||||
209
app/burnin/unlock.py
Normal file
209
app/burnin/unlock.py
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
"""Pool-drive unlock state.
|
||||
|
||||
Drives that ZFS reports as belonging to an active zpool (including the
|
||||
boot pool), drives carrying ZFS labels from a previously-imported pool
|
||||
("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
|
||||
all locked from burn-in until the operator explicitly unlocks them via
|
||||
``POST /api/v1/drives/{id}/unlock``. Grants live in memory only — a
|
||||
container restart wipes them, which is the right default for "this is
|
||||
very dangerous." TTL is bounded so an unlock you forgot about can't sit
|
||||
armed indefinitely.
|
||||
|
||||
Each lock kind has its own confirm token to make the override
|
||||
deliberate; see grant_pool_unlock for the matching logic.
|
||||
|
||||
Public surface:
|
||||
is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
|
||||
unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
|
||||
grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
|
||||
PoolMemberError — raised by start_job
|
||||
UNLOCK_TTL_SECONDS — for the unlock endpoint response
|
||||
BOOT_POOL_NAME / *_TOKEN consts — for the UI / audit
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time as _time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import aiosqlite
|
||||
|
||||
from app.config import settings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
UNLOCK_TTL_SECONDS = 600 # 10 minutes
|
||||
BOOT_POOL_NAME = "boot-pool"
|
||||
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
|
||||
EXPORTED_POOL_ROLE = "exported"
|
||||
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
|
||||
MOUNTED_ROLE = "mounted"
|
||||
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
|
||||
|
||||
|
||||
@dataclass
|
||||
class _UnlockGrant:
|
||||
"""An operator-issued, time-bounded permission to burn-in a pool drive.
|
||||
|
||||
The grant is BOUND to the (pool_name, pool_role) observed at unlock
|
||||
time. If a subsequent poll reclassifies the drive — e.g. it was
|
||||
"(exported)" when unlocked but is now in active pool "tank", or it
|
||||
used to be a cache vdev and now shows as data — the grant is
|
||||
invalidated. Otherwise the operator's "I confirm this exported drive
|
||||
is decommissioned" judgement would silently authorise destruction
|
||||
of a live pool.
|
||||
"""
|
||||
expiry: float
|
||||
pool_name: str
|
||||
pool_role: str | None
|
||||
|
||||
|
||||
_unlock_grants: dict[int, _UnlockGrant] = {}
|
||||
|
||||
|
||||
class PoolMemberError(Exception):
|
||||
"""Raised by start_job when a drive is in a zpool and not unlocked."""
|
||||
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
|
||||
self.drive_id = drive_id
|
||||
self.pool_name = pool_name
|
||||
self.pool_role = pool_role
|
||||
is_boot = pool_name == BOOT_POOL_NAME
|
||||
super().__init__(
|
||||
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
|
||||
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
|
||||
f"Unlock required before burn-in."
|
||||
)
|
||||
|
||||
|
||||
def is_unlocked(drive_id: int, current_pool_name: str | None,
|
||||
current_pool_role: str | None) -> bool:
|
||||
"""True iff a non-expired grant exists AND the drive's pool identity
|
||||
matches what was observed at unlock time."""
|
||||
grant = _unlock_grants.get(drive_id)
|
||||
if grant is None:
|
||||
return False
|
||||
if _time.time() >= grant.expiry:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return False
|
||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||
# Pool identity changed since unlock — drive may now belong to a
|
||||
# different (or live) pool. Invalidate the grant; operator must
|
||||
# re-unlock with eyes-open against the current state.
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
log.warning(
|
||||
"Invalidating unlock grant for drive_id=%d: pool changed from "
|
||||
"(%s, %s) to (%s, %s)",
|
||||
drive_id, grant.pool_name, grant.pool_role,
|
||||
current_pool_name, current_pool_role,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def unlock_expiry(drive_id: int, current_pool_name: str | None,
|
||||
current_pool_role: str | None) -> float | None:
|
||||
"""Return the absolute expiry of an active grant, or None.
|
||||
|
||||
Same identity-binding semantics as is_unlocked: a grant whose stored
|
||||
pool identity no longer matches the current row is treated as expired
|
||||
and reaped. This is what the dashboard reads to decide whether to show
|
||||
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
|
||||
"""
|
||||
grant = _unlock_grants.get(drive_id)
|
||||
if grant is None:
|
||||
return None
|
||||
if _time.time() >= grant.expiry:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return None
|
||||
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
return None
|
||||
return grant.expiry
|
||||
|
||||
|
||||
def invalidate_grant(drive_id: int) -> None:
|
||||
"""Drop a grant unconditionally — used by start_job when a fresh
|
||||
SSH-side pool check shows the drive's identity has shifted."""
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
|
||||
|
||||
async def grant_pool_unlock(drive_id: int, confirm_token: str,
|
||||
operator: str, reason: str) -> float:
|
||||
"""Validate confirmation token + reason and grant a time-limited unlock.
|
||||
|
||||
Raises ValueError on bad confirm_token, missing reason, or drive not
|
||||
actually in a pool. Returns the unix expiry timestamp on success.
|
||||
"""
|
||||
if not reason or len(reason.strip()) < 5:
|
||||
raise ValueError("A reason of at least 5 characters is required.")
|
||||
if not operator or not operator.strip():
|
||||
raise ValueError("Operator name is required.")
|
||||
|
||||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA busy_timeout=10000")
|
||||
cur = await db.execute(
|
||||
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
|
||||
(drive_id,),
|
||||
)
|
||||
row = await cur.fetchone()
|
||||
if not row:
|
||||
raise ValueError("Drive not found.")
|
||||
pool_name = row["pool_name"]
|
||||
pool_role = row["pool_role"]
|
||||
if not pool_name:
|
||||
raise ValueError(
|
||||
"This drive is not part of any pool — no unlock needed."
|
||||
)
|
||||
|
||||
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
|
||||
# fat-finger tokens. Active data pools just need their pool name
|
||||
# typed.
|
||||
if pool_name == BOOT_POOL_NAME:
|
||||
expected = BOOT_POOL_CONFIRM_TOKEN
|
||||
elif pool_role == EXPORTED_POOL_ROLE:
|
||||
expected = EXPORTED_CONFIRM_TOKEN
|
||||
elif pool_role == MOUNTED_ROLE:
|
||||
expected = MOUNTED_CONFIRM_TOKEN
|
||||
else:
|
||||
expected = pool_name
|
||||
if (confirm_token or "").strip() != expected:
|
||||
raise ValueError("Confirmation token does not match.")
|
||||
|
||||
if pool_name == BOOT_POOL_NAME:
|
||||
evt = "boot_pool_drive_unlocked"
|
||||
elif pool_role == EXPORTED_POOL_ROLE:
|
||||
evt = "exported_pool_drive_unlocked"
|
||||
elif pool_role == MOUNTED_ROLE:
|
||||
evt = "mounted_drive_unlocked"
|
||||
else:
|
||||
evt = "pool_drive_unlocked"
|
||||
await db.execute(
|
||||
"""INSERT INTO audit_events
|
||||
(event_type, drive_id, burnin_job_id, operator, message)
|
||||
VALUES (?,?,?,?,?)""",
|
||||
(evt, drive_id, None, operator.strip(),
|
||||
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
|
||||
)
|
||||
await db.commit()
|
||||
|
||||
# Arm the in-memory grant ONLY after the audit row is durable. If the
|
||||
# commit above raises, we exit without writing _unlock_grants — no
|
||||
# unaudited active unlocks. The grant is bound to the (pool_name,
|
||||
# pool_role) we observed under the open transaction so a later poll
|
||||
# that reclassifies the drive invalidates it (see is_unlocked).
|
||||
expiry = _time.time() + UNLOCK_TTL_SECONDS
|
||||
_unlock_grants[drive_id] = _UnlockGrant(
|
||||
expiry=expiry,
|
||||
pool_name=pool_name,
|
||||
pool_role=pool_role,
|
||||
)
|
||||
|
||||
log.warning(
|
||||
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
|
||||
"operator=%s reason=%r",
|
||||
drive_id, pool_name, pool_role, operator, reason,
|
||||
)
|
||||
return expiry
|
||||
|
|
@ -83,7 +83,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-29"
|
||||
app_version: str = "1.0.0-30"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
|
|
@ -169,15 +169,18 @@ class TestUnlockFlow(unittest.IsolatedAsyncioTestCase):
|
|||
|
||||
async def test_expired_grant_returns_false(self):
|
||||
from app import burnin
|
||||
# Drop TTL to 0 so the grant is born expired.
|
||||
original = burnin.UNLOCK_TTL_SECONDS
|
||||
burnin.UNLOCK_TTL_SECONDS = 0
|
||||
from app.burnin import unlock as _unlock
|
||||
# Drop TTL to 0 so the grant is born expired. Monkey-patch the
|
||||
# real source-of-truth in app.burnin.unlock — the alias on the
|
||||
# package root is bound at import time and won't propagate back.
|
||||
original = _unlock.UNLOCK_TTL_SECONDS
|
||||
_unlock.UNLOCK_TTL_SECONDS = 0
|
||||
try:
|
||||
await burnin.grant_pool_unlock(1, "tank", "op", "valid reason")
|
||||
self.assertFalse(burnin._is_unlocked(1, "tank", "data"))
|
||||
self.assertNotIn(1, burnin._unlock_grants)
|
||||
finally:
|
||||
burnin.UNLOCK_TTL_SECONDS = original
|
||||
_unlock.UNLOCK_TTL_SECONDS = original
|
||||
|
||||
# ----- audit commit ordering (Codex finding #3) -----
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue