refactor: split burnin.py into a package — extract unlock + kill (1.0.0-30)
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run

First slice of the planned tech-debt cleanup. burnin.py was 1667 lines
and growing; staged extraction gives smaller diffs to review and a
clear bisect target if anything regresses.

Mechanical move only — no behaviour change. The two extracted modules:

* app/burnin/unlock.py — _UnlockGrant, _unlock_grants, PoolMemberError,
  is_unlocked / unlock_expiry / grant_pool_unlock, plus the four
  *_TOKEN constants and UNLOCK_TTL_SECONDS. Owns its module-level
  state; opens its own DB connection in grant_pool_unlock so it
  doesn't depend on the parent package's _db() helper.

* app/burnin/kill.py — _remote_pids dict and the kill_remote_process /
  set_remote_pid / clear_remote_pid / get_remote_pid helpers. Pulled
  out of __init__.py so the asyncssh-ignores-signals workaround lives
  next to the state it operates on.

app/burnin/__init__.py re-exports every public symbol the rest of the
app imports — `from app import burnin; burnin.start_job(...)`,
`burnin.PoolMemberError`, `burnin.UNLOCK_TTL_SECONDS`, etc. all keep
working unchanged. Internal aliases `_remote_pids` and `_unlock_grants`
on the package root point at the SAME dict objects in the submodules,
so existing in-package mutations (set in stages, cleared in cleanup
callbacks) work without rewrite.

Test fix: tests/test_unlock_flow.py:test_expired_grant_returns_false
monkey-patches UNLOCK_TTL_SECONDS. The package-root alias is bound at
import time and won't propagate back to the submodule's read site, so
the test now patches `app.burnin.unlock.UNLOCK_TTL_SECONDS` directly.

Verification: 44/44 unit tests pass in container; /health 200;
container boots clean. routes.py, mailer.py, poller.py untouched —
the public API is identical.

Future: extract stages, task, _common in subsequent versions.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Brandon Walter 2026-05-03 00:44:28 -04:00
parent 6c20e57fd8
commit 9cbae44495
5 changed files with 313 additions and 212 deletions

View file

@ -72,11 +72,29 @@ _client: TrueNASClient | None = None
# cancel_job / check_stuck_jobs can actually unwedge a stuck task. # cancel_job / check_stuck_jobs can actually unwedge a stuck task.
_active_tasks: dict[int, "asyncio.Task"] = {} _active_tasks: dict[int, "asyncio.Task"] = {}
# Remote PID of any long-running SSH child process (currently only badblocks) # Remote-PID kill machinery + pool-drive unlock state both live in their
# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends # own submodules. We re-export the names the rest of the app reaches for
# a "signal" channel request that OpenSSH sshd ignores by default, leaving # (and keep the _kill_remote_process / _is_unlocked aliases for callers
# the remote process running and proc.wait() hanging forever. # that grew up before the split).
_remote_pids: dict[int, int] = {} from . import kill as _kill # noqa: E402
from . import unlock as _unlock # noqa: E402
_remote_pids = _kill._remote_pids
_unlock_grants = _unlock._unlock_grants
PoolMemberError = _unlock.PoolMemberError
UNLOCK_TTL_SECONDS = _unlock.UNLOCK_TTL_SECONDS
BOOT_POOL_NAME = _unlock.BOOT_POOL_NAME
BOOT_POOL_CONFIRM_TOKEN = _unlock.BOOT_POOL_CONFIRM_TOKEN
EXPORTED_POOL_ROLE = _unlock.EXPORTED_POOL_ROLE
EXPORTED_CONFIRM_TOKEN = _unlock.EXPORTED_CONFIRM_TOKEN
MOUNTED_ROLE = _unlock.MOUNTED_ROLE
MOUNTED_CONFIRM_TOKEN = _unlock.MOUNTED_CONFIRM_TOKEN
unlock_expiry = _unlock.unlock_expiry
grant_pool_unlock = _unlock.grant_pool_unlock
_is_unlocked = _unlock.is_unlocked # legacy private name
_kill_remote_process = _kill.kill_remote_process
def _now() -> str: def _now() -> str:
@ -145,208 +163,8 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
return task return task
async def _kill_remote_process(job_id: int) -> None: # _kill_remote_process is re-exported above from .kill — the original
"""Send kill -9 to the remote PID associated with this job, if any. # definition was extracted to app/burnin/kill.py in 1.0.0-30.
asyncssh's proc.kill() sends an SSH 'signal' channel request which
OpenSSH's sshd does not honor by default. Opening a fresh session and
running /bin/kill is the reliable way to actually terminate the process.
"""
pid = _remote_pids.pop(job_id, None)
if not pid:
return
try:
from app import ssh_client
async with await ssh_client._connect() as conn:
await asyncio.wait_for(
conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False),
timeout=10,
)
log.info("Remote-killed PID %d for job %d", pid, job_id)
except Exception as exc:
log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc)
# ---------------------------------------------------------------------------
# Pool-drive unlock state
# ---------------------------------------------------------------------------
#
# Drives that ZFS reports as belonging to an active zpool (including the
# boot pool) are locked from burn-in until the operator explicitly unlocks
# them via POST /api/v1/drives/{id}/unlock. Grants live in memory only —
# a container restart wipes them, which is the right default for "this is
# very dangerous." TTL is bounded so an unlock you forgot about can't sit
# armed indefinitely.
import time as _time
from dataclasses import dataclass
UNLOCK_TTL_SECONDS = 600 # 10 minutes
BOOT_POOL_NAME = "boot-pool"
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
EXPORTED_POOL_ROLE = "exported"
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
MOUNTED_ROLE = "mounted"
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
@dataclass
class _UnlockGrant:
"""An operator-issued, time-bounded permission to burn-in a pool drive.
The grant is BOUND to the (pool_name, pool_role) observed at unlock
time. If a subsequent poll reclassifies the drive e.g. it was
"(exported)" when unlocked but is now in active pool "tank", or it
used to be a cache vdev and now shows as data the grant is
invalidated. Otherwise the operator's "I confirm this exported drive
is decommissioned" judgement would silently authorise destruction
of a live pool.
"""
expiry: float
pool_name: str
pool_role: str | None
_unlock_grants: dict[int, _UnlockGrant] = {}
class PoolMemberError(Exception):
"""Raised by start_job when a drive is in a zpool and not unlocked."""
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
self.drive_id = drive_id
self.pool_name = pool_name
self.pool_role = pool_role
is_boot = pool_name == BOOT_POOL_NAME
super().__init__(
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
f"Unlock required before burn-in."
)
def _is_unlocked(drive_id: int, current_pool_name: str | None,
current_pool_role: str | None) -> bool:
"""True iff a non-expired grant exists AND the drive's pool identity
matches what was observed at unlock time."""
grant = _unlock_grants.get(drive_id)
if grant is None:
return False
if _time.time() >= grant.expiry:
_unlock_grants.pop(drive_id, None)
return False
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
# Pool identity changed since unlock — drive may now belong to a
# different (or live) pool. Invalidate the grant; operator must
# re-unlock with eyes-open against the current state.
_unlock_grants.pop(drive_id, None)
log.warning(
"Invalidating unlock grant for drive_id=%d: pool changed from "
"(%s, %s) to (%s, %s)",
drive_id, grant.pool_name, grant.pool_role,
current_pool_name, current_pool_role,
)
return False
return True
def unlock_expiry(drive_id: int, current_pool_name: str | None,
current_pool_role: str | None) -> float | None:
"""Return the absolute expiry of an active grant, or None.
Same identity-binding semantics as _is_unlocked: a grant whose stored
pool identity no longer matches the current row is treated as expired
and reaped. This is what the dashboard reads to decide whether to show
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
"""
grant = _unlock_grants.get(drive_id)
if grant is None:
return None
if _time.time() >= grant.expiry:
_unlock_grants.pop(drive_id, None)
return None
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
_unlock_grants.pop(drive_id, None)
return None
return grant.expiry
async def grant_pool_unlock(drive_id: int, confirm_token: str,
operator: str, reason: str) -> float:
"""Validate confirmation token + reason and grant a time-limited unlock.
Raises ValueError on bad confirm_token, missing reason, or drive not
actually in a pool. Returns the unix expiry timestamp on success.
"""
if not reason or len(reason.strip()) < 5:
raise ValueError("A reason of at least 5 characters is required.")
if not operator or not operator.strip():
raise ValueError("Operator name is required.")
async with _db() as db:
db.row_factory = aiosqlite.Row
cur = await db.execute(
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
(drive_id,),
)
row = await cur.fetchone()
if not row:
raise ValueError("Drive not found.")
pool_name = row["pool_name"]
pool_role = row["pool_role"]
if not pool_name:
raise ValueError(
"This drive is not part of any pool — no unlock needed."
)
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
# fat-finger tokens. Active data pools just need their pool name
# typed.
if pool_name == BOOT_POOL_NAME:
expected = BOOT_POOL_CONFIRM_TOKEN
elif pool_role == EXPORTED_POOL_ROLE:
expected = EXPORTED_CONFIRM_TOKEN
elif pool_role == MOUNTED_ROLE:
expected = MOUNTED_CONFIRM_TOKEN
else:
expected = pool_name
if (confirm_token or "").strip() != expected:
raise ValueError("Confirmation token does not match.")
if pool_name == BOOT_POOL_NAME:
evt = "boot_pool_drive_unlocked"
elif pool_role == EXPORTED_POOL_ROLE:
evt = "exported_pool_drive_unlocked"
elif pool_role == MOUNTED_ROLE:
evt = "mounted_drive_unlocked"
else:
evt = "pool_drive_unlocked"
await db.execute(
"""INSERT INTO audit_events
(event_type, drive_id, burnin_job_id, operator, message)
VALUES (?,?,?,?,?)""",
(evt, drive_id, None, operator.strip(),
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
)
await db.commit()
# Arm the in-memory grant ONLY after the audit row is durable. If the
# commit above raises, we exit without writing _unlock_grants — no
# unaudited active unlocks. The grant is bound to the (pool_name,
# pool_role) we observed under the open transaction so a later poll
# that reclassifies the drive invalidates it (see _is_unlocked).
expiry = _time.time() + UNLOCK_TTL_SECONDS
_unlock_grants[drive_id] = _UnlockGrant(
expiry=expiry,
pool_name=pool_name,
pool_role=pool_role,
)
log.warning(
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
"operator=%s reason=%r",
drive_id, pool_name, pool_role, operator, reason,
)
return expiry
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

71
app/burnin/kill.py Normal file
View file

@ -0,0 +1,71 @@
"""Remote process kill machinery.
asyncssh's ``proc.kill()`` sends an SSH "signal" channel request that
OpenSSH's sshd ignores by default — the remote process keeps running and
``proc.wait()`` hangs forever, pinning the asyncio.Semaphore slot.
The fix: capture the remote PID at command launch (via the
``sh -c 'echo PID:$$; exec ...'`` wrapper) and issue ``kill -9 <pid>``
over a fresh SSH session when we need to abort. This module owns that
state and the kill helper.
Public surface (used by the rest of the burnin package):
set_remote_pid(job_id, pid) call from the stage when launch succeeds
clear_remote_pid(job_id) call from the cleanup callback
kill_remote_process(job_id) fire-and-clear; safe to call repeatedly
"""
from __future__ import annotations
import asyncio
import logging
log = logging.getLogger(__name__)
# job_id -> remote PID. Module-level dict so it survives across the
# stage / task / __init__ split without needing to thread it through
# function signatures.
_remote_pids: dict[int, int] = {}
def set_remote_pid(job_id: int, pid: int) -> None:
"""Record the remote PID captured by the running stage."""
_remote_pids[job_id] = pid
def clear_remote_pid(job_id: int) -> None:
"""Drop the PID without trying to kill — used by the task cleanup
callback so a normally-completed job doesn't carry stale state."""
_remote_pids.pop(job_id, None)
def get_remote_pid(job_id: int) -> int | None:
return _remote_pids.get(job_id)
async def kill_remote_process(job_id: int) -> None:
"""Send kill -9 to the remote PID associated with this job, if any.
Idempotent pops the PID before attempting the kill so a second
call is a no-op. SSH connection failure is logged but never raised
(we'd rather best-effort-kill than block the cancel path).
"""
pid = _remote_pids.pop(job_id, None)
if not pid:
return
try:
# Local import to avoid pulling asyncssh into module load if
# this helper is never used (tests, mock mode).
from app import ssh_client
async with await ssh_client._connect() as conn:
await asyncio.wait_for(
conn.run(
f"kill -9 {pid} 2>/dev/null || true", check=False,
),
timeout=10,
)
log.info("Remote-killed PID %d for job %d", pid, job_id)
except Exception as exc:
log.warning(
"Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc,
)

209
app/burnin/unlock.py Normal file
View file

@ -0,0 +1,209 @@
"""Pool-drive unlock state.
Drives that ZFS reports as belonging to an active zpool (including the
boot pool), drives carrying ZFS labels from a previously-imported pool
("exported"), and drives with a non-ZFS mount somewhere ("mounted") are
all locked from burn-in until the operator explicitly unlocks them via
``POST /api/v1/drives/{id}/unlock``. Grants live in memory only a
container restart wipes them, which is the right default for "this is
very dangerous." TTL is bounded so an unlock you forgot about can't sit
armed indefinitely.
Each lock kind has its own confirm token to make the override
deliberate; see grant_pool_unlock for the matching logic.
Public surface:
is_unlocked(drive_id, current_pool_name, current_pool_role) -> bool
unlock_expiry(drive_id, current_pool_name, current_pool_role) -> float|None
grant_pool_unlock(drive_id, confirm_token, operator, reason) -> float
PoolMemberError raised by start_job
UNLOCK_TTL_SECONDS for the unlock endpoint response
BOOT_POOL_NAME / *_TOKEN consts for the UI / audit
"""
from __future__ import annotations
import logging
import time as _time
from dataclasses import dataclass
import aiosqlite
from app.config import settings
log = logging.getLogger(__name__)
UNLOCK_TTL_SECONDS = 600 # 10 minutes
BOOT_POOL_NAME = "boot-pool"
BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL"
EXPORTED_POOL_ROLE = "exported"
EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL"
MOUNTED_ROLE = "mounted"
MOUNTED_CONFIRM_TOKEN = "DESTROY MOUNTED FILESYSTEM"
@dataclass
class _UnlockGrant:
"""An operator-issued, time-bounded permission to burn-in a pool drive.
The grant is BOUND to the (pool_name, pool_role) observed at unlock
time. If a subsequent poll reclassifies the drive e.g. it was
"(exported)" when unlocked but is now in active pool "tank", or it
used to be a cache vdev and now shows as data the grant is
invalidated. Otherwise the operator's "I confirm this exported drive
is decommissioned" judgement would silently authorise destruction
of a live pool.
"""
expiry: float
pool_name: str
pool_role: str | None
_unlock_grants: dict[int, _UnlockGrant] = {}
class PoolMemberError(Exception):
"""Raised by start_job when a drive is in a zpool and not unlocked."""
def __init__(self, drive_id: int, pool_name: str, pool_role: str | None):
self.drive_id = drive_id
self.pool_name = pool_name
self.pool_role = pool_role
is_boot = pool_name == BOOT_POOL_NAME
super().__init__(
f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} "
f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. "
f"Unlock required before burn-in."
)
def is_unlocked(drive_id: int, current_pool_name: str | None,
current_pool_role: str | None) -> bool:
"""True iff a non-expired grant exists AND the drive's pool identity
matches what was observed at unlock time."""
grant = _unlock_grants.get(drive_id)
if grant is None:
return False
if _time.time() >= grant.expiry:
_unlock_grants.pop(drive_id, None)
return False
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
# Pool identity changed since unlock — drive may now belong to a
# different (or live) pool. Invalidate the grant; operator must
# re-unlock with eyes-open against the current state.
_unlock_grants.pop(drive_id, None)
log.warning(
"Invalidating unlock grant for drive_id=%d: pool changed from "
"(%s, %s) to (%s, %s)",
drive_id, grant.pool_name, grant.pool_role,
current_pool_name, current_pool_role,
)
return False
return True
def unlock_expiry(drive_id: int, current_pool_name: str | None,
current_pool_role: str | None) -> float | None:
"""Return the absolute expiry of an active grant, or None.
Same identity-binding semantics as is_unlocked: a grant whose stored
pool identity no longer matches the current row is treated as expired
and reaped. This is what the dashboard reads to decide whether to show
the unlocked-Burn-In affordance vs the locked-Unlock affordance.
"""
grant = _unlock_grants.get(drive_id)
if grant is None:
return None
if _time.time() >= grant.expiry:
_unlock_grants.pop(drive_id, None)
return None
if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role:
_unlock_grants.pop(drive_id, None)
return None
return grant.expiry
def invalidate_grant(drive_id: int) -> None:
"""Drop a grant unconditionally — used by start_job when a fresh
SSH-side pool check shows the drive's identity has shifted."""
_unlock_grants.pop(drive_id, None)
async def grant_pool_unlock(drive_id: int, confirm_token: str,
operator: str, reason: str) -> float:
"""Validate confirmation token + reason and grant a time-limited unlock.
Raises ValueError on bad confirm_token, missing reason, or drive not
actually in a pool. Returns the unix expiry timestamp on success.
"""
if not reason or len(reason.strip()) < 5:
raise ValueError("A reason of at least 5 characters is required.")
if not operator or not operator.strip():
raise ValueError("Operator name is required.")
async with aiosqlite.connect(settings.db_path) as db:
db.row_factory = aiosqlite.Row
await db.execute("PRAGMA busy_timeout=10000")
cur = await db.execute(
"SELECT pool_name, pool_role, devname FROM drives WHERE id=?",
(drive_id,),
)
row = await cur.fetchone()
if not row:
raise ValueError("Drive not found.")
pool_name = row["pool_name"]
pool_role = row["pool_role"]
if not pool_name:
raise ValueError(
"This drive is not part of any pool — no unlock needed."
)
# Boot-pool / exported / mounted-fs all get dedicated, harder-to-
# fat-finger tokens. Active data pools just need their pool name
# typed.
if pool_name == BOOT_POOL_NAME:
expected = BOOT_POOL_CONFIRM_TOKEN
elif pool_role == EXPORTED_POOL_ROLE:
expected = EXPORTED_CONFIRM_TOKEN
elif pool_role == MOUNTED_ROLE:
expected = MOUNTED_CONFIRM_TOKEN
else:
expected = pool_name
if (confirm_token or "").strip() != expected:
raise ValueError("Confirmation token does not match.")
if pool_name == BOOT_POOL_NAME:
evt = "boot_pool_drive_unlocked"
elif pool_role == EXPORTED_POOL_ROLE:
evt = "exported_pool_drive_unlocked"
elif pool_role == MOUNTED_ROLE:
evt = "mounted_drive_unlocked"
else:
evt = "pool_drive_unlocked"
await db.execute(
"""INSERT INTO audit_events
(event_type, drive_id, burnin_job_id, operator, message)
VALUES (?,?,?,?,?)""",
(evt, drive_id, None, operator.strip(),
f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"),
)
await db.commit()
# Arm the in-memory grant ONLY after the audit row is durable. If the
# commit above raises, we exit without writing _unlock_grants — no
# unaudited active unlocks. The grant is bound to the (pool_name,
# pool_role) we observed under the open transaction so a later poll
# that reclassifies the drive invalidates it (see is_unlocked).
expiry = _time.time() + UNLOCK_TTL_SECONDS
_unlock_grants[drive_id] = _UnlockGrant(
expiry=expiry,
pool_name=pool_name,
pool_role=pool_role,
)
log.warning(
"Pool-drive unlock granted: drive_id=%d pool=%s role=%s "
"operator=%s reason=%r",
drive_id, pool_name, pool_role, operator, reason,
)
return expiry

View file

@ -83,7 +83,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers) ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint # Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-29" app_version: str = "1.0.0-30"
# ---- Authentication (1.0.0-22) ---- # ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate # session_secret: HMAC key for signing session cookies. Empty = generate

View file

@ -169,15 +169,18 @@ class TestUnlockFlow(unittest.IsolatedAsyncioTestCase):
async def test_expired_grant_returns_false(self): async def test_expired_grant_returns_false(self):
from app import burnin from app import burnin
# Drop TTL to 0 so the grant is born expired. from app.burnin import unlock as _unlock
original = burnin.UNLOCK_TTL_SECONDS # Drop TTL to 0 so the grant is born expired. Monkey-patch the
burnin.UNLOCK_TTL_SECONDS = 0 # real source-of-truth in app.burnin.unlock — the alias on the
# package root is bound at import time and won't propagate back.
original = _unlock.UNLOCK_TTL_SECONDS
_unlock.UNLOCK_TTL_SECONDS = 0
try: try:
await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") await burnin.grant_pool_unlock(1, "tank", "op", "valid reason")
self.assertFalse(burnin._is_unlocked(1, "tank", "data")) self.assertFalse(burnin._is_unlocked(1, "tank", "data"))
self.assertNotIn(1, burnin._unlock_grants) self.assertNotIn(1, burnin._unlock_grants)
finally: finally:
burnin.UNLOCK_TTL_SECONDS = original _unlock.UNLOCK_TTL_SECONDS = original
# ----- audit commit ordering (Codex finding #3) ----- # ----- audit commit ordering (Codex finding #3) -----