diff --git a/app/burnin.py b/app/burnin.py index d1cb87a..0a1ec13 100644 --- a/app/burnin.py +++ b/app/burnin.py @@ -19,6 +19,7 @@ Cancellation: import asyncio import logging import time +from contextlib import asynccontextmanager from datetime import datetime, timezone import aiosqlite @@ -66,14 +67,29 @@ POLL_INTERVAL = 5.0 # seconds between progress checks during active stages _semaphore: asyncio.Semaphore | None = None _client: TrueNASClient | None = None +# Live job tracking — keeps a strong reference to every _run_job task so it +# isn't garbage-collected (asyncio.create_task only keeps a weak ref) and so +# cancel_job / check_stuck_jobs can actually unwedge a stuck task. +_active_tasks: dict[int, "asyncio.Task"] = {} + +# Remote PID of any long-running SSH child process (currently only badblocks) +# so we can kill it via a fresh SSH session — proc.kill() over asyncssh sends +# a "signal" channel request that OpenSSH sshd ignores by default, leaving +# the remote process running and proc.wait() hanging forever. +_remote_pids: dict[int, int] = {} + def _now() -> str: return datetime.now(timezone.utc).isoformat() -def _db(): - """Open a fresh WAL-mode connection. Caller must use 'async with'.""" - return aiosqlite.connect(settings.db_path) +@asynccontextmanager +async def _db(): + """Open a WAL-mode connection with busy_timeout so writers wait for the lock + instead of immediately raising 'database is locked' under contention.""" + async with aiosqlite.connect(settings.db_path) as db: + await db.execute("PRAGMA busy_timeout=10000") + yield db # --------------------------------------------------------------------------- @@ -104,11 +120,228 @@ async def init(client: TrueNASClient) -> None: await db.commit() for job_id in queued: - asyncio.create_task(_run_job(job_id)) + _spawn_run_job(job_id) log.info("Burn-in orchestrator ready (max_concurrent=%d)", settings.max_parallel_burnins) +def _spawn_run_job(job_id: int) -> "asyncio.Task": + """Schedule a _run_job task and keep a strong reference to it. + + Plain asyncio.create_task() only leaves a weak reference behind, so the + task can be GC'd before it ever runs. Storing it in _active_tasks also + lets cancel_job / check_stuck_jobs cancel it directly. + """ + task = asyncio.create_task(_run_job(job_id)) + _active_tasks[job_id] = task + + def _cleanup(t: "asyncio.Task") -> None: + # Remove only if it's still us — avoid clobbering a re-enqueued task. + if _active_tasks.get(job_id) is t: + _active_tasks.pop(job_id, None) + _remote_pids.pop(job_id, None) + + task.add_done_callback(_cleanup) + return task + + +async def _kill_remote_process(job_id: int) -> None: + """Send kill -9 to the remote PID associated with this job, if any. + + asyncssh's proc.kill() sends an SSH 'signal' channel request which + OpenSSH's sshd does not honor by default. Opening a fresh session and + running /bin/kill is the reliable way to actually terminate the process. + """ + pid = _remote_pids.pop(job_id, None) + if not pid: + return + try: + from app import ssh_client + async with await ssh_client._connect() as conn: + await asyncio.wait_for( + conn.run(f"kill -9 {pid} 2>/dev/null || true", check=False), + timeout=10, + ) + log.info("Remote-killed PID %d for job %d", pid, job_id) + except Exception as exc: + log.warning("Failed to remote-kill PID %d for job %d: %s", pid, job_id, exc) + + +# --------------------------------------------------------------------------- +# Pool-drive unlock state +# --------------------------------------------------------------------------- +# +# Drives that ZFS reports as belonging to an active zpool (including the +# boot pool) are locked from burn-in until the operator explicitly unlocks +# them via POST /api/v1/drives/{id}/unlock. Grants live in memory only — +# a container restart wipes them, which is the right default for "this is +# very dangerous." TTL is bounded so an unlock you forgot about can't sit +# armed indefinitely. + +import time as _time +from dataclasses import dataclass + +UNLOCK_TTL_SECONDS = 600 # 10 minutes +BOOT_POOL_NAME = "boot-pool" +BOOT_POOL_CONFIRM_TOKEN = "DESTROY BOOT POOL" +EXPORTED_POOL_ROLE = "exported" +EXPORTED_CONFIRM_TOKEN = "DESTROY EXPORTED POOL" + + +@dataclass +class _UnlockGrant: + """An operator-issued, time-bounded permission to burn-in a pool drive. + + The grant is BOUND to the (pool_name, pool_role) observed at unlock + time. If a subsequent poll reclassifies the drive — e.g. it was + "(exported)" when unlocked but is now in active pool "tank", or it + used to be a cache vdev and now shows as data — the grant is + invalidated. Otherwise the operator's "I confirm this exported drive + is decommissioned" judgement would silently authorise destruction + of a live pool. + """ + expiry: float + pool_name: str + pool_role: str | None + + +_unlock_grants: dict[int, _UnlockGrant] = {} + + +class PoolMemberError(Exception): + """Raised by start_job when a drive is in a zpool and not unlocked.""" + def __init__(self, drive_id: int, pool_name: str, pool_role: str | None): + self.drive_id = drive_id + self.pool_name = pool_name + self.pool_role = pool_role + is_boot = pool_name == BOOT_POOL_NAME + super().__init__( + f"Drive is part of {'BOOT POOL' if is_boot else 'pool'} " + f"'{pool_name}'{' (' + pool_role + ')' if pool_role else ''}. " + f"Unlock required before burn-in." + ) + + +def _is_unlocked(drive_id: int, current_pool_name: str | None, + current_pool_role: str | None) -> bool: + """True iff a non-expired grant exists AND the drive's pool identity + matches what was observed at unlock time.""" + grant = _unlock_grants.get(drive_id) + if grant is None: + return False + if _time.time() >= grant.expiry: + _unlock_grants.pop(drive_id, None) + return False + if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: + # Pool identity changed since unlock — drive may now belong to a + # different (or live) pool. Invalidate the grant; operator must + # re-unlock with eyes-open against the current state. + _unlock_grants.pop(drive_id, None) + log.warning( + "Invalidating unlock grant for drive_id=%d: pool changed from " + "(%s, %s) to (%s, %s)", + drive_id, grant.pool_name, grant.pool_role, + current_pool_name, current_pool_role, + ) + return False + return True + + +def unlock_expiry(drive_id: int, current_pool_name: str | None, + current_pool_role: str | None) -> float | None: + """Return the absolute expiry of an active grant, or None. + + Same identity-binding semantics as _is_unlocked: a grant whose stored + pool identity no longer matches the current row is treated as expired + and reaped. This is what the dashboard reads to decide whether to show + the unlocked-Burn-In affordance vs the locked-Unlock affordance. + """ + grant = _unlock_grants.get(drive_id) + if grant is None: + return None + if _time.time() >= grant.expiry: + _unlock_grants.pop(drive_id, None) + return None + if grant.pool_name != current_pool_name or grant.pool_role != current_pool_role: + _unlock_grants.pop(drive_id, None) + return None + return grant.expiry + + +async def grant_pool_unlock(drive_id: int, confirm_token: str, + operator: str, reason: str) -> float: + """Validate confirmation token + reason and grant a time-limited unlock. + + Raises ValueError on bad confirm_token, missing reason, or drive not + actually in a pool. Returns the unix expiry timestamp on success. + """ + if not reason or len(reason.strip()) < 5: + raise ValueError("A reason of at least 5 characters is required.") + if not operator or not operator.strip(): + raise ValueError("Operator name is required.") + + async with _db() as db: + db.row_factory = aiosqlite.Row + cur = await db.execute( + "SELECT pool_name, pool_role, devname FROM drives WHERE id=?", + (drive_id,), + ) + row = await cur.fetchone() + if not row: + raise ValueError("Drive not found.") + pool_name = row["pool_name"] + pool_role = row["pool_role"] + if not pool_name: + raise ValueError( + "This drive is not part of any pool — no unlock needed." + ) + + # Boot-pool and exported pools both get dedicated, harder-to-fat- + # finger tokens. Active data pools just need their pool name typed. + if pool_name == BOOT_POOL_NAME: + expected = BOOT_POOL_CONFIRM_TOKEN + elif pool_role == EXPORTED_POOL_ROLE: + expected = EXPORTED_CONFIRM_TOKEN + else: + expected = pool_name + if (confirm_token or "").strip() != expected: + raise ValueError("Confirmation token does not match.") + + if pool_name == BOOT_POOL_NAME: + evt = "boot_pool_drive_unlocked" + elif pool_role == EXPORTED_POOL_ROLE: + evt = "exported_pool_drive_unlocked" + else: + evt = "pool_drive_unlocked" + await db.execute( + """INSERT INTO audit_events + (event_type, drive_id, burnin_job_id, operator, message) + VALUES (?,?,?,?,?)""", + (evt, drive_id, None, operator.strip(), + f"Unlocked {pool_name} drive {row['devname']} for burn-in: {reason.strip()}"), + ) + await db.commit() + + # Arm the in-memory grant ONLY after the audit row is durable. If the + # commit above raises, we exit without writing _unlock_grants — no + # unaudited active unlocks. The grant is bound to the (pool_name, + # pool_role) we observed under the open transaction so a later poll + # that reclassifies the drive invalidates it (see _is_unlocked). + expiry = _time.time() + UNLOCK_TTL_SECONDS + _unlock_grants[drive_id] = _UnlockGrant( + expiry=expiry, + pool_name=pool_name, + pool_role=pool_role, + ) + + log.warning( + "Pool-drive unlock granted: drive_id=%d pool=%s role=%s " + "operator=%s reason=%r", + drive_id, pool_name, pool_role, operator, reason, + ) + return expiry + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -142,13 +375,35 @@ async def start_job(drive_id: int, profile: str, operator: str, if (await cur.fetchone())[0] > 0: raise ValueError("Drive already has an active burn-in job") - # Create job + # Pool-membership gate: locked unless the operator explicitly + # unlocked this drive via /api/v1/drives/{id}/unlock recently. + # _is_unlocked also checks that the grant's stored (pool_name, + # pool_role) still matches the live row — a grant issued for an + # exported drive doesn't carry over if the drive turns out to be + # in an active pool on the next poll. cur = await db.execute( - """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) - VALUES (?,?,?,?,?,?) RETURNING id""", - (drive_id, profile, "queued", 0, operator, now), + "SELECT pool_name, pool_role FROM drives WHERE id=?", (drive_id,) ) - job_id = (await cur.fetchone())["id"] + drow = await cur.fetchone() + if drow and drow["pool_name"] and not _is_unlocked( + drive_id, drow["pool_name"], drow["pool_role"] + ): + raise PoolMemberError(drive_id, drow["pool_name"], drow["pool_role"]) + + # Create job. The partial unique index uniq_active_burnin_per_drive + # (database.py) is the actual race-stopper here: if two concurrent + # /api/v1/burnin/start calls both pass the SELECT-COUNT check above, + # only one INSERT can win; the loser raises IntegrityError, which + # we surface with the same ValueError as the inline duplicate check. + try: + cur = await db.execute( + """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) + VALUES (?,?,?,?,?,?) RETURNING id""", + (drive_id, profile, "queued", 0, operator, now), + ) + job_id = (await cur.fetchone())["id"] + except aiosqlite.IntegrityError: + raise ValueError("Drive already has an active burn-in job") # Create stage rows in the desired execution order for stage_name in stages: @@ -164,7 +419,7 @@ async def start_job(drive_id: int, profile: str, operator: str, ) await db.commit() - asyncio.create_task(_run_job(job_id)) + _spawn_run_job(job_id) log.info("Burn-in job %d queued (drive_id=%d profile=%s operator=%s)", job_id, drive_id, profile, operator) return job_id @@ -198,6 +453,13 @@ async def cancel_job(job_id: int, operator: str) -> bool: ) await db.commit() + # Kill the remote child process FIRST (so proc.wait() in the running task + # can return), then cancel the task so any other awaits unblock. + await _kill_remote_process(job_id) + task = _active_tasks.get(job_id) + if task and not task.done(): + task.cancel() + log.info("Burn-in job %d cancelled by %s", job_id, operator) return True @@ -206,10 +468,45 @@ async def cancel_job(job_id: int, operator: str) -> bool: # Job runner # --------------------------------------------------------------------------- +async def _thermal_gate_ok() -> bool: + """True if it's thermally safe to start a new burn-in. + Checks the peak temperature of drives currently under active burn-in. + """ + try: + async with _db() as db: + cur = await db.execute(""" + SELECT MAX(d.temperature_c) + FROM drives d + JOIN burnin_jobs bj ON bj.drive_id = d.id + WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL + """) + row = await cur.fetchone() + max_temp = row[0] if row and row[0] is not None else None + return max_temp is None or max_temp < settings.temp_warn_c + except Exception: + return True # Never block on error + + async def _run_job(job_id: int) -> None: """Acquire semaphore slot, execute all stages, persist final state.""" assert _semaphore is not None, "burnin.init() not called" + # Adaptive thermal gate: wait before competing for a slot if running drives + # are already at or above the warning threshold. This prevents layering a + # new burn-in on top of a thermally-stressed system. Gives up after 3 min + # and proceeds anyway so jobs don't queue indefinitely. + for _attempt in range(18): # 18 × 10 s = 3 min max + if await _thermal_gate_ok(): + break + if _attempt == 0: + log.info( + "Thermal gate: job %d waiting — running drive temps at or above %d°C", + job_id, settings.temp_warn_c, + ) + await asyncio.sleep(10) + else: + log.warning("Thermal gate timed out for job %d — proceeding anyway", job_id) + async with _semaphore: if await _is_cancelled(job_id): return @@ -254,18 +551,35 @@ async def _run_job(job_id: int) -> None: success = False error_text = None + was_cancelled = False try: success = await _execute_stages(job_id, job_stages, devname, drive_id) except asyncio.CancelledError: - pass + was_cancelled = True except Exception as exc: error_text = str(exc) log.exception("Burn-in raised exception", extra={"job_id": job_id, "devname": devname}) - if await _is_cancelled(job_id): + # If the job has already moved to a terminal state — by cancel_job + # ('cancelled') or check_stuck_jobs ('unknown') — leave it alone. The + # task may have been cancelled mid-stage; finalizing as 'failed' would + # clobber that audit-meaningful terminal state. + async with _db() as db: + cur = await db.execute("SELECT state FROM burnin_jobs WHERE id=?", (job_id,)) + cur_row = await cur.fetchone() + if cur_row and cur_row[0] != "running": return - final_state = "passed" if success else "failed" + # Cancellation arriving here means the asyncio task was cancelled + # by something other than cancel_job/check_stuck_jobs (shutdown, + # uvicorn reload, future code paths). The DB still says 'running', + # so we have to write *some* terminal state, but classifying the + # interrupted job as 'failed' would lie — we don't actually know + # whether the underlying SMART/badblocks work passed or not. + if was_cancelled: + final_state = "unknown" + else: + final_state = "passed" if success else "failed" async with _db() as db: await db.execute("PRAGMA journal_mode=WAL") await db.execute( @@ -464,6 +778,14 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage # Brief pause to let the test register in smartctl output await asyncio.sleep(3) + # Throttle log_text appends — every poll on a multi-hour long_smart bloated + # log_text to 50+ MB and triggered SQLite "database is locked" because each + # COALESCE-then-append rewrites the whole column. Append every ~60s, on the + # first poll, and on any state change. + LOG_EVERY_N_POLLS = 12 + poll_count = 0 + last_state: str | None = None + # Poll until complete while True: if await _is_cancelled(job_id): @@ -482,7 +804,11 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage await _append_stage_log(job_id, stage_name, f"[poll error] {exc}\n") continue - await _append_stage_log(job_id, stage_name, progress["output"] + "\n---\n") + poll_count += 1 + state_changed = progress["state"] != last_state + last_state = progress["state"] + if poll_count == 1 or poll_count % LOG_EVERY_N_POLLS == 0 or state_changed: + await _append_stage_log(job_id, stage_name, progress["output"] + "\n---\n") if progress["state"] == "running": pct = max(0, 100 - progress["percent_remaining"]) @@ -519,15 +845,39 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage # "unknown" → keep polling +async def _badblocks_available() -> bool: + """Check if badblocks is installed on the remote host (Linux/SCALE only).""" + from app import ssh_client + try: + async with await ssh_client._connect() as conn: + result = await conn.run("which badblocks", check=False) + return result.returncode == 0 + except Exception: + return False + + async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool: """ - Surface validation stage. - SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}. - Mock mode: simulated timed progress (no real I/O). + Surface validation stage — auto-routes to the right implementation: + + 1. SSH configured + badblocks available (TrueNAS SCALE / Linux): + → runs badblocks -wsv -b 4096 -p 1 /dev/{devname} directly over SSH. + 2. SSH configured + badblocks NOT available (TrueNAS CORE / FreeBSD): + → uses TrueNAS REST API disk.wipe FULL job + post-wipe SMART check. + 3. No SSH: + → simulated timed progress (dev/mock mode). """ from app import ssh_client if ssh_client.is_configured(): - return await _stage_surface_validate_ssh(job_id, devname, drive_id) + if await _badblocks_available(): + return await _stage_surface_validate_ssh(job_id, devname, drive_id) + # TrueNAS CORE/FreeBSD: badblocks not available — use native wipe API + await _append_stage_log( + job_id, "surface_validate", + "[INFO] badblocks not found on host (TrueNAS CORE/FreeBSD) — " + "using TrueNAS disk.wipe API (FULL write pass).\n\n" + ) + return await _stage_surface_validate_truenas(job_id, devname, drive_id) return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds) @@ -537,8 +887,11 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) await _append_stage_log( job_id, "surface_validate", - f"[START] badblocks -wsv -b 4096 -p 1 /dev/{devname}\n" - f"[NOTE] This is a DESTRUCTIVE write test. All data on /dev/{devname} will be overwritten.\n\n" + f"[START] badblocks -wsv -b {settings.surface_validate_block_size} " + f"-c {settings.surface_validate_block_buffer} " + f"-p {settings.surface_validate_passes} /dev/{devname}\n" + f"[NOTE] This is a DESTRUCTIVE write test. " + f"All data on /dev/{devname} will be overwritten.\n\n" ) def _is_cancelled_sync() -> bool: @@ -580,14 +933,50 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) output_lines: list[str] = [] async with await ssh_client._connect() as conn: - cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}" + # Wrap in `sh -c 'echo PID:$$; exec ...'` so we get the remote + # PID on the first stdout line. asyncssh's proc.kill() sends an + # SSH signal request that OpenSSH's sshd ignores by default, so + # we need the PID to issue an out-of-band `kill -9` over a fresh + # session when we want to abort. + # + # Block geometry is operator-tunable (Settings → Burn-in): + # -b N block size in bytes (settings.surface_validate_block_size) + # -c N blocks held per IO (settings.surface_validate_block_buffer) + # -p N pass count (settings.surface_validate_passes) + # Defaults preserve original behavior (-b 4096 -c 64 -p 1). + bb_args = ( + f"-wsv " + f"-b {settings.surface_validate_block_size} " + f"-c {settings.surface_validate_block_buffer} " + f"-p {settings.surface_validate_passes}" + ) + cmd = ( + f"sh -c 'echo PID:$$; exec badblocks {bb_args} /dev/{devname}'" + ) async with conn.create_process(cmd) as proc: import re as _re + pid_seen = False + async def _drain(stream, is_stderr: bool): - nonlocal bad_blocks_total + nonlocal bad_blocks_total, pid_seen async for raw in stream: line = raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace") + + # First stdout line is "PID:" from the wrapping shell. + # Capture it and don't append it to the user-visible log. + if not is_stderr and not pid_seen and line.startswith("PID:"): + pid_seen = True + try: + _remote_pids[job_id] = int(line[4:].strip()) + log.info( + "Captured remote PID %d for job %d (badblocks)", + _remote_pids[job_id], job_id, + ) + except ValueError: + pass + continue + output_lines.append(line) if is_stderr: @@ -610,7 +999,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # Abort on bad block threshold if bad_blocks_total > settings.bad_block_threshold: - proc.kill() + await _kill_remote_process(job_id) output_lines.append( f"\n[ABORTED] {bad_blocks_total} bad block(s) exceeded " f"threshold ({settings.bad_block_threshold})\n" @@ -618,7 +1007,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) return if await _is_cancelled(job_id): - proc.kill() + await _kill_remote_process(job_id) return await asyncio.gather( @@ -626,7 +1015,17 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) _drain(proc.stderr, True), return_exceptions=True, ) - await proc.wait() + # Bound proc.wait so a remote process that ignored our kill + # signal (or that we never managed to kill) can't pin this + # task in the semaphore forever. Closing the connection on + # exit will deliver SIGPIPE to the remote on its next write. + try: + await asyncio.wait_for(proc.wait(), timeout=15) + except asyncio.TimeoutError: + log.warning( + "proc.wait() timed out for job %d — abandoning channel", + job_id, + ) # Flush remaining output remainder = "".join(output_lines) @@ -655,6 +1054,116 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) return True +async def _stage_surface_validate_truenas(job_id: int, devname: str, drive_id: int) -> bool: + """ + Surface validation via TrueNAS CORE disk.wipe REST API. + Used on FreeBSD (TrueNAS CORE) where badblocks is unavailable. + + Sends a FULL write-zero pass across the entire disk, polls progress, + then runs a post-wipe SMART attribute check to catch reallocated sectors. + """ + from app import ssh_client + + await _append_stage_log( + job_id, "surface_validate", + f"[START] TrueNAS disk.wipe FULL — {devname}\n" + f"[NOTE] DESTRUCTIVE: all data on {devname} will be overwritten.\n\n" + ) + + # Start the wipe job + try: + tn_job_id = await _client.wipe_disk(devname, "FULL") + except Exception as exc: + await _set_stage_error(job_id, "surface_validate", f"Failed to start disk.wipe: {exc}") + return False + + await _append_stage_log( + job_id, "surface_validate", + f"[JOB] TrueNAS wipe job started (job_id={tn_job_id})\n" + ) + + # Poll until complete + log_flush_counter = 0 + while True: + if await _is_cancelled(job_id): + try: + await _client.abort_job(tn_job_id) + except Exception: + pass + return False + + await asyncio.sleep(POLL_INTERVAL) + + try: + job = await _client.get_job(tn_job_id) + except Exception as exc: + log.warning("Wipe job poll failed: %s", exc, extra={"job_id": job_id}) + await _append_stage_log(job_id, "surface_validate", f"[poll error] {exc}\n") + continue + + if not job: + await _set_stage_error(job_id, "surface_validate", f"Wipe job {tn_job_id} not found") + return False + + state = job.get("state", "") + pct = int(job.get("progress", {}).get("percent", 0) or 0) + desc = job.get("progress", {}).get("description", "") + + await _update_stage_percent(job_id, "surface_validate", min(pct, 99)) + await _recalculate_progress(job_id) + _push_update() + + # Log progress description every ~5 polls to avoid DB spam + log_flush_counter += 1 + if desc and log_flush_counter % 5 == 0: + await _append_stage_log(job_id, "surface_validate", f"[{pct}%] {desc}\n") + + if state == "SUCCESS": + await _update_stage_percent(job_id, "surface_validate", 100) + await _append_stage_log( + job_id, "surface_validate", + f"\n[DONE] Wipe job {tn_job_id} completed successfully.\n" + ) + # Post-wipe SMART check — catch any sectors that failed under write stress + if ssh_client.is_configured() and drive_id is not None: + await _append_stage_log( + job_id, "surface_validate", + "[CHECK] Running post-wipe SMART attribute check...\n" + ) + try: + attrs = await ssh_client.get_smart_attributes(devname) + await _store_smart_attrs(drive_id, attrs) + if attrs["failures"]: + error = "Post-wipe SMART check: " + "; ".join(attrs["failures"]) + await _set_stage_error(job_id, "surface_validate", error) + return False + if attrs["warnings"]: + await _append_stage_log( + job_id, "surface_validate", + "[WARNING] " + "; ".join(attrs["warnings"]) + "\n" + ) + await _append_stage_log( + job_id, "surface_validate", + f"[CHECK] SMART health: {attrs['health']} — no critical attributes.\n" + ) + except Exception as exc: + log.warning("Post-wipe SMART check failed: %s", exc) + await _append_stage_log( + job_id, "surface_validate", + f"[WARN] Post-wipe SMART check failed (non-fatal): {exc}\n" + ) + return True + + elif state in ("FAILED", "ABORTED", "ERROR"): + error_msg = job.get("error") or f"Disk wipe failed (state={state})" + await _set_stage_error( + job_id, "surface_validate", + f"TrueNAS disk.wipe FAILED: {error_msg}" + ) + return False + # RUNNING or WAITING — keep polling + + async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool: """Simulate a timed stage with progress updates (mock / dev mode).""" start = time.monotonic() @@ -681,21 +1190,47 @@ async def _stage_final_check(job_id: int, devname: str, drive_id: int | None = N Verify drive passed all tests. SSH mode: run smartctl -a and check critical attributes. Mock mode: check SMART health field in DB. + + A transient SSH connectivity failure here must NOT invalidate a prior + multi-day surface_validate. Retry SSH-only failures, then soft-pass. """ await asyncio.sleep(1) from app import ssh_client + + def _ssh_only(failures: list[str]) -> bool: + return bool(failures) and all(f.startswith("SSH error:") for f in failures) + if ssh_client.is_configured() and drive_id is not None: try: attrs = await ssh_client.get_smart_attributes(devname) + for attempt in range(2): + if not _ssh_only(attrs.get("failures") or []): + break + log.warning( + "final_check SSH unreachable (attempt %d/3); retrying in 30s", + attempt + 1, + extra={"job_id": job_id, "devname": devname}, + ) + await asyncio.sleep(30) + attrs = await ssh_client.get_smart_attributes(devname) + + failures = attrs.get("failures") or [] + if _ssh_only(failures): + log.warning( + "final_check soft-pass: SSH unreachable after retries; prior stages stand", + extra={"job_id": job_id, "devname": devname, "ssh_error": failures}, + ) + return True + await _store_smart_attrs(drive_id, attrs) - if attrs["health"] == "FAILED" or attrs["failures"]: - failures = attrs["failures"] or [f"SMART health: {attrs['health']}"] + if attrs["health"] == "FAILED" or failures: + msg = failures or [f"SMART health: {attrs['health']}"] await _set_stage_error(job_id, "final_check", - "Final check failed: " + "; ".join(failures)) + "Final check failed: " + "; ".join(msg)) return False return True except Exception as exc: - log.warning("SSH final_check failed, falling back to DB check: %s", exc) + log.warning("SSH final_check raised, falling back to DB check: %s", exc) # DB check (mock mode fallback) async with _db() as db: @@ -942,6 +1477,11 @@ async def check_stuck_jobs() -> None: "UPDATE burnin_jobs SET state='unknown', finished_at=? WHERE id=?", (now, job_id), ) + await db.execute( + """UPDATE burnin_stages SET state='unknown', finished_at=? + WHERE burnin_job_id=? AND state='running'""", + (now, job_id), + ) await db.execute( """INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message) VALUES (?,?,?,?,?)""", @@ -951,5 +1491,16 @@ async def check_stuck_jobs() -> None: await db.commit() + # Actually unstick the running tasks so they release their semaphore slot. + # Without this the DB state becomes 'unknown' but the asyncio task keeps + # holding the slot forever — which is the bug that left subsequent jobs + # permanently 'queued' until container restart. + for row in stuck: + job_id = row[0] + await _kill_remote_process(job_id) + task = _active_tasks.get(job_id) + if task and not task.done(): + task.cancel() + _push_update() log.warning("Marked %d stuck job(s) as unknown", len(stuck)) diff --git a/app/config.py b/app/config.py index 7c1f3cf..6250bde 100644 --- a/app/config.py +++ b/app/config.py @@ -58,6 +58,21 @@ class Settings(BaseSettings): # Bad-block tolerance — surface_validate fails if bad blocks exceed this bad_block_threshold: int = 0 + # Surface-validate (badblocks) tunables — defaults match the Spearfoot + # disk-burnin.sh community script's recommended geometry for large HDDs. + # block_size : -b in bytes; aligned to AF (4 KiB) sectors. Bumping + # to 8192 roughly halves badblocks runtime on multi-TB + # drives at the cost of ~2x RAM in the test buffer. + # block_buffer : -c blocks held in memory per IO. 64 = badblocks + # default. Higher values = larger buffer, faster IO, + # more RAM (block_size * block_buffer bytes per pass). + # passes : -p value. 1 = repeat until one consecutive clean + # scan (current behavior). 2-3 for paranoid burn-in + # that re-confirms after finding errors. + surface_validate_block_size: int = 4096 + surface_validate_block_buffer: int = 64 + surface_validate_passes: int = 1 + # SSH credentials for direct TrueNAS command execution (Stage 7) # When ssh_host is set, burn-in stages use SSH for smartctl/badblocks instead of REST API. # Leave ssh_host empty to use the mock/REST API (development mode). @@ -68,7 +83,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-7" + app_version: str = "1.0.0-21" settings = Settings() diff --git a/app/database.py b/app/database.py index 477b60a..ce02a64 100644 --- a/app/database.py +++ b/app/database.py @@ -89,6 +89,16 @@ _MIGRATIONS = [ "ALTER TABLE smart_tests ADD COLUMN raw_output TEXT", # Stage 8: track last reset time so dashboard burn-in col clears after reset "ALTER TABLE drives ADD COLUMN last_reset_at TEXT", + # 1.0.0-15: pool-membership lock + "ALTER TABLE drives ADD COLUMN pool_name TEXT", + "ALTER TABLE drives ADD COLUMN pool_role TEXT", + "ALTER TABLE drives ADD COLUMN pool_seen_at TEXT", + # 1.0.0-19: enforce one active burn-in per drive at the storage layer. + # Closes the read-then-insert race in burnin.start_job — without this, + # two concurrent /api/v1/burnin/start requests for the same drive could + # both observe zero active jobs and both insert queued rows. + """CREATE UNIQUE INDEX IF NOT EXISTS uniq_active_burnin_per_drive + ON burnin_jobs (drive_id) WHERE state IN ('queued', 'running')""", ] diff --git a/app/mailer.py b/app/mailer.py index 7800d60..642cc8e 100644 --- a/app/mailer.py +++ b/app/mailer.py @@ -5,6 +5,7 @@ Disabled when SMTP_HOST is not set. """ import asyncio +import html import logging import smtplib import ssl @@ -109,17 +110,61 @@ def _drive_rows_html(drives: list[dict]) -> str: return "\n".join(rows) -def _build_html(drives: list[dict], generated_at: str) -> str: +def _build_unlock_banner_html(events: list[dict]) -> str: + """Banner listing every pool-drive unlock granted in the last 24h. + + Every interpolated DB field is run through html.escape — operator and + reason are free-text from the unlock modal and otherwise inject into + the email body verbatim. + """ + if not events: + return "" + rows = [] + for e in events: + evt = e.get("event_type") or "" + is_boot = evt == "boot_pool_drive_unlocked" + is_exported = evt == "exported_pool_drive_unlocked" + kind = ( + "BOOT POOL" if is_boot + else "EXPORTED ZFS" if is_exported + else "pool" + ) + when = html.escape((e.get("created_at") or "")[:19]) + operator = html.escape(e.get("operator") or "?") + devname = html.escape(e.get("devname") or "?") + # `message` already includes pool name, devname, and the operator's + # reason — surface it verbatim so the audit trail is faithful. + message = html.escape(e.get("message") or "") + rows.append( + f"
  • {when} · " + f"{operator} unlocked a {kind} drive " + f"({devname}): " + f"{message}
  • " + ) + return f""" +
    +
    + ⚠ {len(events)} pool-drive unlock(s) in the last 24h +
    + +
    """ + + +def _build_html(drives: list[dict], generated_at: str, + unlock_events: list[dict] | None = None) -> str: total = len(drives) failed_drives = [d for d in drives if d.get("smart_health") == "FAILED"] running_burnin = [d for d in drives if (d.get("burnin") or {}).get("state") == "running"] passed_burnin = [d for d in drives if (d.get("burnin") or {}).get("state") == "passed"] - # Alert banner - alert_html = "" + # Alert banners (unlock events first — the audit-grade signal) + alert_html = _build_unlock_banner_html(unlock_events or []) if failed_drives: names = ", ".join(d["devname"] for d in failed_drives) - alert_html = f""" + alert_html += f"""
    ⚠ SMART health FAILED on {len(failed_drives)} drive(s): {names}
    """ @@ -287,6 +332,36 @@ async def _fetch_report_data() -> list[dict]: return await _fetch_drives_for_template(db) +async def _fetch_unlock_events_24h() -> list[dict]: + """Return pool-drive unlock audit events from the last 24 hours. + + These are operator overrides of the pool-membership lock — every entry + represents a deliberate decision to risk a pool, so the daily report + surfaces them as an audit-grade banner. + """ + async with aiosqlite.connect(settings.db_path) as db: + db.row_factory = aiosqlite.Row + await db.execute("PRAGMA journal_mode=WAL") + # julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format + # we write from Python; comparing the raw string against + # datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS') + # produces subtle off-by-up-to-a-day errors because of the + # 'T' vs ' ' separator and the '+00:00' suffix. + cur = await db.execute(""" + SELECT ae.event_type, ae.operator, ae.message, ae.created_at, + d.devname, d.pool_name, d.pool_role + FROM audit_events ae + LEFT JOIN drives d ON d.id = ae.drive_id + WHERE ae.event_type IN ( + 'pool_drive_unlocked', + 'boot_pool_drive_unlocked', + 'exported_pool_drive_unlocked') + AND julianday(ae.created_at) >= julianday('now', '-1 day') + ORDER BY ae.created_at DESC + """) + return [dict(r) for r in await cur.fetchall()] + + # --------------------------------------------------------------------------- # Scheduler # --------------------------------------------------------------------------- @@ -411,9 +486,16 @@ async def test_smtp_connection() -> dict: async def send_report_now() -> None: """Send a report immediately (used by on-demand API endpoint).""" drives = await _fetch_report_data() + unlock_events = await _fetch_unlock_events_24h() now_str = datetime.now().strftime("%Y-%m-%d %H:%M") - html = _build_html(drives, now_str) - subject = f"Burn-In Report — {datetime.now().strftime('%Y-%m-%d')} ({len(drives)} drives)" + html = _build_html(drives, now_str, unlock_events) + suffix = "" + if unlock_events: + suffix = f" — {len(unlock_events)} pool unlock(s)" + subject = ( + f"Burn-In Report — {datetime.now().strftime('%Y-%m-%d')} " + f"({len(drives)} drives){suffix}" + ) await asyncio.to_thread(_send_email, subject, html) diff --git a/app/models.py b/app/models.py index 1748c7b..d9195a4 100644 --- a/app/models.py +++ b/app/models.py @@ -97,8 +97,17 @@ class DriveResponse(BaseModel): smart_long: SmartTestState notes: str | None = None location: str | None = None + pool_name: str | None = None + pool_role: str | None = None + pool_unlocked_until: float | None = None # unix epoch; null = locked class UpdateDriveRequest(BaseModel): notes: str | None = None location: str | None = None + + +class UnlockPoolDriveRequest(BaseModel): + confirm_token: str + operator: str + reason: str diff --git a/app/poller.py b/app/poller.py index 67f625c..4c013d7 100644 --- a/app/poller.py +++ b/app/poller.py @@ -20,13 +20,15 @@ from app.truenas import TrueNASClient log = logging.getLogger(__name__) -# Shared state read by the /health endpoint +# Shared state read by the /health endpoint and dashboard template _state: dict[str, Any] = { "last_poll_at": None, "last_error": None, "healthy": False, "drives_seen": 0, "consecutive_failures": 0, + "system_temps": {}, # {"cpu_c": int|None, "pch_c": int|None} + "thermal_pressure": "ok", # "ok" | "warn" | "crit" — based on running burn-in drive temps } # SSE subscriber queues — notified after each successful poll @@ -87,18 +89,60 @@ def _map_history_state(status: str) -> str: # DB helpers # --------------------------------------------------------------------------- -async def _upsert_drive(db: aiosqlite.Connection, disk: dict, now: str) -> int: - await db.execute( +async def _upsert_drive(db: aiosqlite.Connection, disk: dict, now: str, + pool_info: dict | None = None, + update_pool: bool = True) -> int: + """Insert/update a drive row. + + pool_info: {"pool": str, "role": str} if this drive is currently in a + zpool, else None. None values clear pool columns so a removed-from-pool + drive doesn't stay locked. + + update_pool: when False, pool columns are preserved on conflict and + initialised to NULL on insert. Callers pass False on detection failure + so a transient SSH outage doesn't silently unlock every drive. + """ + pool_name = pool_info["pool"] if pool_info else None + pool_role = pool_info["role"] if pool_info else None + pool_seen_at = now if pool_info else None + + if update_pool: + update_clause = """ + devname = excluded.devname, + serial = excluded.serial, + model = excluded.model, + size_bytes = excluded.size_bytes, + temperature_c = excluded.temperature_c, + smart_health = excluded.smart_health, + last_seen_at = excluded.last_seen_at, + last_polled_at = excluded.last_polled_at, + pool_name = excluded.pool_name, + pool_role = excluded.pool_role, + pool_seen_at = excluded.pool_seen_at """ - INSERT INTO drives - (truenas_disk_id, devname, serial, model, size_bytes, - temperature_c, smart_health, last_seen_at, last_polled_at) - VALUES (?,?,?,?,?,?,?,?,?) - ON CONFLICT(truenas_disk_id) DO UPDATE SET + else: + # Preserve pool_name / pool_role / pool_seen_at — detection failed + # this cycle, so we have no fresh data and must not overwrite. + update_clause = """ + devname = excluded.devname, + serial = excluded.serial, + model = excluded.model, + size_bytes = excluded.size_bytes, temperature_c = excluded.temperature_c, smart_health = excluded.smart_health, last_seen_at = excluded.last_seen_at, last_polled_at = excluded.last_polled_at + """ + + await db.execute( + f""" + INSERT INTO drives + (truenas_disk_id, devname, serial, model, size_bytes, + temperature_c, smart_health, last_seen_at, last_polled_at, + pool_name, pool_role, pool_seen_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?) + ON CONFLICT(truenas_disk_id) DO UPDATE SET + {update_clause} """, ( disk["identifier"], @@ -110,6 +154,9 @@ async def _upsert_drive(db: aiosqlite.Connection, disk: dict, now: str) -> int: disk.get("smart_health", "UNKNOWN"), now, now, + pool_name, + pool_role, + pool_seen_at, ), ) cur = await db.execute( @@ -208,6 +255,67 @@ async def _sync_history( # Poll cycle # --------------------------------------------------------------------------- +async def _poll_smart_via_ssh(db: aiosqlite.Connection, now: str) -> None: + """ + Poll progress for SMART tests started via SSH (truenas_job_id IS NULL). + Used on TrueNAS SCALE 25.10+ where the REST smart/test API no longer exists. + """ + from app import ssh_client + if not ssh_client.is_configured(): + return + + cur = await db.execute( + """SELECT st.id, st.test_type, st.drive_id, d.devname, st.started_at + FROM smart_tests st + JOIN drives d ON d.id = st.drive_id + WHERE st.state = 'running' AND st.truenas_job_id IS NULL""" + ) + rows = await cur.fetchall() + if not rows: + return + + for row in rows: + test_id, ttype, drive_id, devname, started_at = row[0], row[1], row[2], row[3], row[4] + try: + progress = await ssh_client.poll_smart_progress(devname) + except Exception as exc: + log.warning("SSH SMART poll failed for %s: %s", devname, exc) + continue + + state = progress["state"] + pct_remaining = progress.get("percent_remaining") # None = not yet in output + raw_output = progress.get("output", "") + + if state == "running": + # pct_remaining=None means smartctl output doesn't have the % line yet + # (test just started) — keep percent at 0 rather than jumping to 100 + if pct_remaining is None: + pct = 0 + else: + pct = max(0, 100 - pct_remaining) + eta = _eta_from_progress(pct, started_at) + await db.execute( + "UPDATE smart_tests SET percent=?, eta_at=?, raw_output=? WHERE id=?", + (pct, eta, raw_output, test_id), + ) + elif state == "passed": + await db.execute( + "UPDATE smart_tests SET state='passed', percent=100, finished_at=?, raw_output=? WHERE id=?", + (now, raw_output, test_id), + ) + log.info("SSH SMART %s passed on %s", ttype, devname) + elif state == "failed": + await db.execute( + "UPDATE smart_tests SET state='failed', percent=0, finished_at=?, " + "error_text=?, raw_output=? WHERE id=?", + (now, f"SMART {ttype.upper()} test failed", raw_output, test_id), + ) + log.warning("SSH SMART %s FAILED on %s", ttype, devname) + # state == "unknown" → keep polling, no update + + await db.commit() + + async def poll_cycle(client: TrueNASClient) -> int: """Run one full poll. Returns number of drives seen.""" now = _now() @@ -215,6 +323,88 @@ async def poll_cycle(client: TrueNASClient) -> int: disks = await client.get_disks() running_jobs = await client.get_smart_jobs(state="RUNNING") + # Fetch temperatures via SCALE-specific endpoint. + # CORE doesn't have this endpoint — silently skip on any error. + try: + temps = await client.get_disk_temperatures() + except Exception: + temps = {} + + # Inject temperature into each disk dict (SCALE 25.10 has no temp in /disk) + for disk in disks: + devname = disk.get("devname", "") + t = temps.get(devname) + if t is not None: + disk["temperature"] = int(round(t)) + + # SMART health — TrueNAS /api/v2.0/disk doesn't expose smart_health, + # so without this every drive defaults to UNKNOWN forever (only burn-in + # stages used to populate it). Run `smartctl -H` over a single SSH + # session for every drive every Nth cycle. Cache between cycles via + # _state so the dashboard always renders the most recent answer. + SMART_HEALTH_EVERY_N_CYCLES = 5 # ~1 minute at default 12s interval + _state.setdefault("smart_health_cache", {}) + cycle_n = _state.get("cycle", 0) + 1 + _state["cycle"] = cycle_n + try: + from app import ssh_client as _ssh + if _ssh.is_configured() and (cycle_n % SMART_HEALTH_EVERY_N_CYCLES == 1): + health_map = await _ssh.get_smart_health_map( + [d["devname"] for d in disks if d.get("devname")] + ) + if health_map is not None: + _state["smart_health_cache"] = health_map + except Exception as exc: + log.warning("smart_health refresh failed: %s", exc) + health_cache = _state.get("smart_health_cache") or {} + for disk in disks: + devname = disk.get("devname", "") + h = health_cache.get(devname) + if h: + disk["smart_health"] = h + + # Pool membership map — drives in any zpool are locked from burn-in. + # ssh_client returns None on failure (distinct from {} which means "no + # pools"). If EITHER detection call fails we fail-closed: leave + # pool_name / pool_role columns alone so previously-locked drives stay + # locked, and previously-unlocked drives stay unlocked, until detection + # recovers. Treating a transient SSH blip as "no pool members" would + # silently unlock every drive on the next poll. + detection_ok = True + pool_map: dict = {} + zfs_member_set: set = set() + try: + from app import ssh_client as _ssh + if _ssh.is_configured(): + pm = await _ssh.get_pool_membership() + zs = await _ssh.get_zfs_member_drives() + if pm is None or zs is None: + detection_ok = False + else: + pool_map = pm + zfs_member_set = zs + # SSH unconfigured (mock/dev mode) — detection_ok stays True with + # empty maps, so dev mode never artificially locks drives. + except Exception: + detection_ok = False + + if not detection_ok: + log.warning( + "Pool detection failed this cycle — preserving existing " + "pool_name/pool_role columns. Locked drives stay locked, " + "unlocked drives stay unlocked, until SSH recovers." + ) + + if detection_ok: + # Drives carrying ZFS labels but not in any active pool are + # "exported" — same hazard as an active pool member, so lock them + # too. We don't know the original pool name without + # `zpool import`-style scanning (slow + blocks); display + # "(exported)" and use a special token. + for devname in zfs_member_set: + if devname not in pool_map: + pool_map[devname] = {"pool": "(exported)", "role": "exported"} + # Index running jobs by (devname, test_type) active: dict[tuple[str, str], dict] = {} for job in running_jobs: @@ -233,7 +423,11 @@ async def poll_cycle(client: TrueNASClient) -> int: for disk in disks: devname = disk["devname"] - drive_id = await _upsert_drive(db, disk, now) + drive_id = await _upsert_drive( + db, disk, now, + pool_map.get(devname) if detection_ok else None, + update_pool=detection_ok, + ) for ttype in ("short", "long"): if (devname, ttype) in active: @@ -243,6 +437,9 @@ async def poll_cycle(client: TrueNASClient) -> int: await db.commit() + # SSH SMART polling — for tests started via smartctl (no TrueNAS REST job) + await _poll_smart_via_ssh(db, now) + return len(disks) @@ -263,6 +460,39 @@ async def run(client: TrueNASClient) -> None: _state["drives_seen"] = count _state["consecutive_failures"] = 0 log.debug("Poll OK", extra={"drives": count}) + + # System sensor temps via SSH (non-fatal) + from app import ssh_client as _ssh + if _ssh.is_configured(): + try: + _state["system_temps"] = await _ssh.get_system_sensors() + except Exception: + pass + + # Thermal pressure: max temp of drives currently under burn-in + try: + async with aiosqlite.connect(settings.db_path) as _tdb: + _tdb.row_factory = aiosqlite.Row + await _tdb.execute("PRAGMA journal_mode=WAL") + _cur = await _tdb.execute(""" + SELECT MAX(d.temperature_c) + FROM drives d + JOIN burnin_jobs bj ON bj.drive_id = d.id + WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL + """) + _row = await _cur.fetchone() + _max_t = _row[0] if _row and _row[0] is not None else None + if _max_t is None: + _state["thermal_pressure"] = "ok" + elif _max_t >= settings.temp_crit_c: + _state["thermal_pressure"] = "crit" + elif _max_t >= settings.temp_warn_c: + _state["thermal_pressure"] = "warn" + else: + _state["thermal_pressure"] = "ok" + except Exception: + _state["thermal_pressure"] = "ok" + _notify_subscribers() # Check for stuck jobs every 5 cycles (~1 min at default 12s interval) diff --git a/app/routes.py b/app/routes.py index ddff766..1cfadc2 100644 --- a/app/routes.py +++ b/app/routes.py @@ -15,7 +15,8 @@ from app.database import get_db from app.models import ( BurninJobResponse, BurninStageResponse, CancelBurninRequest, DriveResponse, - SmartTestState, StartBurninRequest, UpdateDriveRequest, + SmartTestState, StartBurninRequest, UnlockPoolDriveRequest, + UpdateDriveRequest, ) from app.renderer import templates @@ -48,6 +49,22 @@ def _is_stale(last_polled_at: str) -> bool: return True +def _compute_eta_seconds(started_at: str | None, percent: int) -> int | None: + """Linear ETA extrapolation from started_at and percent complete.""" + if not started_at or percent <= 0: + return None + try: + start = datetime.fromisoformat(started_at) + if start.tzinfo is None: + start = start.replace(tzinfo=timezone.utc) + elapsed = (datetime.now(timezone.utc) - start).total_seconds() + total_est = elapsed / (percent / 100) + remaining = max(0, int(total_est - elapsed)) + return remaining + except Exception: + return None + + def _build_smart(row: aiosqlite.Row, prefix: str) -> SmartTestState: eta_at = row[f"{prefix}_eta_at"] return SmartTestState( @@ -76,6 +93,11 @@ def _row_to_drive(row: aiosqlite.Row) -> DriveResponse: smart_long=_build_smart(row, "long"), notes=row["notes"], location=row["location"], + pool_name=row["pool_name"], + pool_role=row["pool_role"], + pool_unlocked_until=burnin.unlock_expiry( + row["id"], row["pool_name"], row["pool_role"], + ), ) @@ -96,7 +118,7 @@ _DRIVES_QUERY = """ SELECT d.id, d.devname, d.serial, d.model, d.size_bytes, d.temperature_c, d.smart_health, d.last_polled_at, - d.notes, d.location, + d.notes, d.location, d.pool_name, d.pool_role, s.state AS short_state, s.percent AS short_percent, s.started_at AS short_started_at, @@ -112,6 +134,7 @@ _DRIVES_QUERY = """ FROM drives d LEFT JOIN smart_tests s ON s.drive_id = d.id AND s.test_type = 'short' LEFT JOIN smart_tests l ON l.drive_id = d.id AND l.test_type = 'long' + WHERE d.last_seen_at >= datetime('now', '-7 days') {where} ORDER BY d.devname """ @@ -138,11 +161,55 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]: cur = await db.execute(_DRIVES_QUERY.format(where="")) rows = await cur.fetchall() burnin_by_drive = await _fetch_burnin_by_drive(db) + + # For burn-ins that include SMART stages, fetch those stages so we can + # mirror their progress/result in the Short/Long SMART columns. + # This covers both running stages (showing live progress) and completed + # stages (showing passed/failed after the burn-in moves to the next stage). + bi_smart_stages: dict[int, dict[str, dict]] = {} # job_id -> {stage_name: row} + bi_ids_with_smart = [ + bi["id"] for bi in burnin_by_drive.values() + if bi["state"] in ("running", "queued") + ] + if bi_ids_with_smart: + placeholders = ",".join("?" * len(bi_ids_with_smart)) + cur = await db.execute(f""" + SELECT bs.burnin_job_id, bs.stage_name, bs.state, bs.percent, + bs.started_at, bs.finished_at, bs.error_text + FROM burnin_stages bs + WHERE bs.burnin_job_id IN ({placeholders}) + AND bs.stage_name IN ('short_smart', 'long_smart') + AND bs.state IN ('running', 'passed', 'failed') + """, bi_ids_with_smart) + for r in await cur.fetchall(): + bi_smart_stages.setdefault(r["burnin_job_id"], {})[r["stage_name"]] = dict(r) + drives = [] for row in rows: d = _row_to_drive(row).model_dump() d["status"] = _compute_status(d) - d["burnin"] = burnin_by_drive.get(d["id"]) + bi = burnin_by_drive.get(d["id"]) + d["burnin"] = bi + + # Overlay burn-in SMART stage progress/results onto the SMART columns + if bi and bi["id"] in bi_smart_stages: + for stage_name, stage in bi_smart_stages[bi["id"]].items(): + target = "smart_short" if stage_name == "short_smart" else "smart_long" + # Only overlay if the standalone SMART column is idle/empty + existing = d.get(target) or {} + if existing.get("state") not in (None, "idle"): + continue + pct = stage["percent"] or 0 + d[target] = { + "state": stage["state"], + "percent": pct if stage["state"] == "running" else (100 if stage["state"] == "passed" else 0), + "eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage["state"] == "running" else None, + "eta_timestamp": None, + "started_at": stage["started_at"], + "finished_at": stage["finished_at"], + "error_text": stage["error_text"], + } + drives.append(d) return drives @@ -170,7 +237,7 @@ def _stale_context(poller_state: dict) -> dict: async def dashboard(request: Request, db: aiosqlite.Connection = Depends(get_db)): drives = await _fetch_drives_for_template(db) ps = poller.get_state() - return templates.TemplateResponse("dashboard.html", { + return templates.TemplateResponse(request, "dashboard.html", { "request": request, "drives": drives, "poller": ps, @@ -218,6 +285,18 @@ async def sse_drives(request: Request): yield {"event": "drives-update", "data": html} + # Push system sensor state so JS can update temp chips live + ps = poller.get_state() + yield { + "event": "system-sensors", + "data": json.dumps({ + "system_temps": ps.get("system_temps", {}), + "thermal_pressure": ps.get("thermal_pressure", "ok"), + "temp_warn_c": settings.temp_warn_c, + "temp_crit_c": settings.temp_crit_c, + }), + } + # Push browser notification event if this was a job completion if alert: yield {"event": "job-alert", "data": json.dumps(alert)} @@ -258,7 +337,7 @@ async def list_drives(db: aiosqlite.Connection = Depends(get_db)): @router.get("/api/v1/drives/{drive_id}/drawer") async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)): """Data for the log drawer — latest burn-in job + stages, SMART tests, audit events.""" - cur = await db.execute(_DRIVES_QUERY.format(where="WHERE d.id = ?"), (drive_id,)) + cur = await db.execute(_DRIVES_QUERY.format(where="AND d.id = ?"), (drive_id,)) row = await cur.fetchone() if not row: raise HTTPException(status_code=404, detail="Drive not found") @@ -339,7 +418,7 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db) @router.get("/api/v1/drives/{drive_id}", response_model=DriveResponse) async def get_drive(drive_id: int, db: aiosqlite.Connection = Depends(get_db)): cur = await db.execute( - _DRIVES_QUERY.format(where="WHERE d.id = ?"), (drive_id,) + _DRIVES_QUERY.format(where="AND d.id = ?"), (drive_id,) ) row = await cur.fetchone() if not row: @@ -353,9 +432,13 @@ async def smart_start( body: dict, db: aiosqlite.Connection = Depends(get_db), ): - """Start a standalone SHORT or LONG SMART test on a single drive.""" - from app.truenas import TrueNASClient - from app import burnin as _burnin + """Start a standalone SHORT or LONG SMART test on a single drive. + + Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+ + where the REST smart/test endpoint no longer exists. + Falls back to TrueNAS REST API for older versions. + """ + from app import burnin as _burnin, ssh_client test_type = (body.get("type") or "").upper() if test_type not in ("SHORT", "LONG"): @@ -367,17 +450,42 @@ async def smart_start( raise HTTPException(status_code=404, detail="Drive not found") devname = row[0] - # Use the shared TrueNAS client held by the burnin module - client = _burnin._client - if client is None: - raise HTTPException(status_code=503, detail="TrueNAS client not ready") + now = datetime.now(timezone.utc).isoformat() + ttype_lower = test_type.lower() - try: - tn_job_id = await client.start_smart_test([devname], test_type) - except Exception as exc: - raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}") + if ssh_client.is_configured(): + # SSH path — works on TrueNAS SCALE 25.10+ and CORE + try: + output = await ssh_client.start_smart_test(devname, test_type) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"SSH error: {exc}") - return {"job_id": tn_job_id, "devname": devname, "type": test_type} + # Mark as running in DB (truenas_job_id=NULL signals SSH-managed test) + # Store smartctl start output as proof the test was initiated + await db.execute( + """INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output) + VALUES (?,?,?,?,?,?) + ON CONFLICT(drive_id, test_type) DO UPDATE SET + state='running', percent=0, truenas_job_id=NULL, + started_at=excluded.started_at, finished_at=NULL, error_text=NULL, + raw_output=excluded.raw_output""", + (drive_id, ttype_lower, "running", 0, now, output), + ) + await db.commit() + from app import poller as _poller + _poller._notify_subscribers() + return {"devname": devname, "type": test_type, "message": output[:200]} + + else: + # REST path — older TrueNAS CORE / SCALE versions + client = _burnin._client + if client is None: + raise HTTPException(status_code=503, detail="TrueNAS client not ready") + try: + tn_job_id = await client.start_smart_test([devname], test_type) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}") + return {"job_id": tn_job_id, "devname": devname, "type": test_type} @router.post("/api/v1/drives/{drive_id}/smart/cancel") @@ -403,28 +511,37 @@ async def smart_cancel( if client is None: raise HTTPException(status_code=503, detail="TrueNAS client not ready") - # Find the running TrueNAS job for this drive/test-type - try: - jobs = await client.get_smart_jobs() - tn_job_id = None - for j in jobs: - if j.get("state") != "RUNNING": - continue - args = j.get("arguments", []) - if not args or not isinstance(args[0], dict): - continue - if devname in args[0].get("disks", []): - tn_job_id = j["id"] - break + from app import ssh_client - if tn_job_id is None: - raise HTTPException(status_code=404, detail="No running SMART test found for this drive") + if ssh_client.is_configured(): + # SSH path — abort via smartctl -X + try: + await ssh_client.abort_smart_test(devname) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}") + else: + # REST path — find TrueNAS job and abort it + try: + jobs = await client.get_smart_jobs() + tn_job_id = None + for j in jobs: + if j.get("state") != "RUNNING": + continue + args = j.get("arguments", []) + if not args or not isinstance(args[0], dict): + continue + if devname in args[0].get("disks", []): + tn_job_id = j["id"] + break - await client.abort_job(tn_job_id) - except HTTPException: - raise - except Exception as exc: - raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}") + if tn_job_id is None: + raise HTTPException(status_code=404, detail="No running SMART test found for this drive") + + await client.abort_job(tn_job_id) + except HTTPException: + raise + except Exception as exc: + raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}") # Update local DB state now = datetime.now(timezone.utc).isoformat() @@ -479,13 +596,35 @@ async def burnin_start(req: StartBurninRequest): drive_id, req.profile, req.operator, stage_order=req.stage_order ) results.append({"drive_id": drive_id, "job_id": job_id}) + except burnin.PoolMemberError as exc: + errors.append({ + "drive_id": drive_id, + "error": str(exc), + "pool_name": exc.pool_name, + "pool_role": exc.pool_role, + "pool_locked": True, + }) except ValueError as exc: errors.append({"drive_id": drive_id, "error": str(exc)}) if errors and not results: - raise HTTPException(status_code=409, detail=errors[0]["error"]) + # Surface the first error's structured fields so the UI can render + # an unlock affordance instead of a generic toast. + raise HTTPException(status_code=409, detail=errors[0]) return {"queued": results, "errors": errors} +@router.post("/api/v1/drives/{drive_id}/unlock") +async def unlock_pool_drive(drive_id: int, req: UnlockPoolDriveRequest): + try: + expiry = await burnin.grant_pool_unlock( + drive_id, req.confirm_token, req.operator, req.reason, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + return {"unlocked": True, "expires_at": expiry, + "ttl_seconds": burnin.UNLOCK_TTL_SECONDS} + + @router.post("/api/v1/burnin/{job_id}/cancel") async def burnin_cancel(job_id: int, req: CancelBurninRequest): ok = await burnin.cancel_job(job_id, req.operator) @@ -562,7 +701,7 @@ async def history_list( jobs = [dict(r) for r in rows] ps = poller.get_state() - return templates.TemplateResponse("history.html", { + return templates.TemplateResponse(request, "history.html", { "request": request, "jobs": jobs, "active_state": state, @@ -612,7 +751,7 @@ async def history_detail( job["stages"] = [dict(r) for r in await cur.fetchall()] ps = poller.get_state() - return templates.TemplateResponse("job_detail.html", { + return templates.TemplateResponse(request, "job_detail.html", { "request": request, "job": job, "poller": ps, @@ -791,7 +930,7 @@ async def audit_log( cur = await db.execute(_AUDIT_QUERY) rows = [dict(r) for r in await cur.fetchall()] ps = poller.get_state() - return templates.TemplateResponse("audit.html", { + return templates.TemplateResponse(request, "audit.html", { "request": request, "events": rows, "event_colors": _AUDIT_EVENT_COLORS, @@ -887,7 +1026,7 @@ async def stats_page( drives_total = (await cur.fetchone())[0] ps = poller.get_state() - return templates.TemplateResponse("stats.html", { + return templates.TemplateResponse(request, "stats.html", { "request": request, "overall": overall, "by_model": by_model, @@ -931,6 +1070,9 @@ async def settings_page( "temp_warn_c": settings.temp_warn_c, "temp_crit_c": settings.temp_crit_c, "bad_block_threshold": settings.bad_block_threshold, + "surface_validate_block_size": settings.surface_validate_block_size, + "surface_validate_block_buffer": settings.surface_validate_block_buffer, + "surface_validate_passes": settings.surface_validate_passes, # SSH credentials (take effect immediately — each SSH call reads live settings) "ssh_host": settings.ssh_host, "ssh_port": settings.ssh_port, @@ -948,7 +1090,7 @@ async def settings_page( from app import ssh_client as _ssh ps = poller.get_state() - return templates.TemplateResponse("settings.html", { + return templates.TemplateResponse(request, "settings.html", { "request": request, "editable": editable, "smtp_enabled": bool(settings.smtp_host), @@ -1069,7 +1211,7 @@ async def history_print( """, (job_id,)) job["stages"] = [dict(r) for r in await cur.fetchall()] - return templates.TemplateResponse("job_print.html", { + return templates.TemplateResponse(request, "job_print.html", { "request": request, "job": job, }) diff --git a/app/settings_store.py b/app/settings_store.py index ace86df..f1bbac8 100644 --- a/app/settings_store.py +++ b/app/settings_store.py @@ -38,6 +38,9 @@ _EDITABLE: dict[str, type] = { "temp_warn_c": int, "temp_crit_c": int, "bad_block_threshold": int, + "surface_validate_block_size": int, + "surface_validate_block_buffer": int, + "surface_validate_passes": int, # SSH credentials — take effect immediately (each connection reads live settings) "ssh_host": str, "ssh_port": int, @@ -96,6 +99,26 @@ def _apply(data: dict) -> None: if key == "bad_block_threshold" and int(val) < 0: log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring") continue + if key == "surface_validate_block_size": + # badblocks accepts any positive int but in practice the + # useful range is 512..1048576 and it should be a power of 2. + v = int(val) + if v < 512 or v > 1048576 or (v & (v - 1)) != 0: + log.warning( + "settings_store: surface_validate_block_size must be " + "a power of 2 between 512 and 1048576 — ignoring %r", val + ) + continue + if key == "surface_validate_block_buffer" and not (1 <= int(val) <= 4096): + log.warning( + "settings_store: surface_validate_block_buffer must be 1..4096 — ignoring" + ) + continue + if key == "surface_validate_passes" and not (0 <= int(val) <= 16): + log.warning( + "settings_store: surface_validate_passes must be 0..16 — ignoring" + ) + continue if key == "ssh_port" and not (1 <= int(val) <= 65535): log.warning("settings_store: ssh_port out of range — ignoring") continue diff --git a/app/ssh_client.py b/app/ssh_client.py index 183c650..f84ca9c 100644 --- a/app/ssh_client.py +++ b/app/ssh_client.py @@ -38,15 +38,26 @@ SMART_ATTRS: dict[int, tuple[str, bool]] = { # --------------------------------------------------------------------------- def is_configured() -> bool: - """Returns True when SSH credentials are present and usable.""" + """Returns True when SSH host + at least one auth method is available.""" + import os from app.config import settings - return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key)) + if not settings.ssh_host: + return False + has_creds = bool( + settings.ssh_key + or settings.ssh_password + or os.path.exists(os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)) + ) + return has_creds # --------------------------------------------------------------------------- # Low-level connection # --------------------------------------------------------------------------- +_MOUNTED_KEY_PATH = "/run/secrets/ssh_key" + + async def _connect(): """Open a single-use SSH connection. Caller must use `async with`.""" import asyncssh @@ -59,9 +70,17 @@ async def _connect(): "known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false) } if settings.ssh_key: + # Key material provided via env var (base case) kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)] - if settings.ssh_password: + elif settings.ssh_password: kwargs["password"] = settings.ssh_password + else: + # Fall back to mounted key file (preferred for production — no key in env vars) + import os + key_path = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH) + if os.path.exists(key_path): + kwargs["client_keys"] = [key_path] + # If nothing is configured, asyncssh will attempt agent/default key lookup return asyncssh.connect(**kwargs) @@ -228,6 +247,294 @@ async def run_badblocks( } +def _parse_zpool_list_output(stdout: str) -> dict: + """Pure parser for `zpool list -vHP` stdout. Exposed for unit tests. + + See get_pool_membership() for output semantics. This function never + raises — malformed lines are silently skipped. + """ + import re as _re + + def _strip_partition(name: str) -> str: + m = _re.match(r"^(nvme\d+n\d+)", name) + if m: + return m.group(1) + m = _re.match(r"^(sd[a-z]+)", name) + if m: + return m.group(1) + return name + + SECTION_MARKERS = {"cache", "log", "logs", "spare", "spares", + "special", "dedup"} + SECTION_NORMALIZE = {"logs": "log", "spares": "spare"} + + out: dict = {} + current_pool: str | None = None + current_role: str = "data" + + for raw in stdout.splitlines(): + if not raw.strip(): + continue + depth = 0 + while depth < len(raw) and raw[depth] == "\t": + depth += 1 + first = raw[depth:].split("\t", 1)[0].strip() + + if depth == 0: + current_pool = first + current_role = "data" + continue + + if depth == 1: + if first in SECTION_MARKERS: + current_role = SECTION_NORMALIZE.get(first, first) + continue + if first.startswith(("mirror", "raidz", "draid")): + continue + if first.startswith("/dev/") and current_pool: + dn = _strip_partition(first[len("/dev/"):]) + out[dn] = {"pool": current_pool, "role": current_role} + continue + + if first.startswith("/dev/") and current_pool: + dn = _strip_partition(first[len("/dev/"):]) + out[dn] = {"pool": current_pool, "role": current_role} + + return out + + +def _parse_lsblk_zfs_output(stdout: str) -> set: + """Pure parser for `lsblk -no NAME,FSTYPE -l` stdout. Returns base + devnames carrying ZFS labels (whole-disk OR via partition). Exposed + for unit tests.""" + import re as _re + out: set = set() + for line in stdout.splitlines(): + parts = line.split() + if len(parts) < 2: + continue + name, fstype = parts[0], parts[1] + if fstype != "zfs_member": + continue + if name.startswith("nvme"): + m = _re.match(r"^(nvme\d+n\d+)", name) + if m: + out.add(m.group(1)) + else: + m = _re.match(r"^(sd[a-z]+)", name) + if m: + out.add(m.group(1)) + return out + + +async def get_pool_membership() -> dict | None: + """Return {devname: {"pool": str, "role": str}} for every drive in any zpool. + + Parses `zpool list -vHP` output. Tab-indent depth tells us structure: + depth 0 pool name line + depth 1 vdev type line (mirror-N, raidz*N, draid*) OR section + marker (cache/log/spare/special/dedup/logs) OR a single-disk + vdev that is itself a /dev/... entry + depth 2 device line within a vdev — '/dev/sdX', '/dev/nvmeXnY', etc. + may have a partition suffix that we strip back to the + base devname so it matches what TrueNAS reports. + Roles: data | cache | log | spare | special | dedup + Returns: + - {} when the SSH call succeeded and there are genuinely no pools + - None on any failure (SSH down, parse error, non-zero exit, no + stdout). Callers MUST treat None differently from {}: an + empty dict is "definitely no pool members," None is "we + couldn't tell." Treating None as "no pool members" is a + fail-open security regression. + """ + import re as _re + if not is_configured(): + return {} + cmd = "zpool list -vHP 2>/dev/null" + try: + async with await _connect() as conn: + r = await conn.run(cmd, check=False) + if r.returncode != 0: + return None + except Exception: + return None + if not r.stdout: + # rc==0 with empty output = host has no pools. (`zpool list -H` + # returns no rows when zero pools are imported.) That's a real + # answer, not a failure. + return {} + return _parse_zpool_list_output(r.stdout) + + +async def get_smart_health_map(devnames: list[str]) -> dict | None: + """Return {devname: 'PASSED'|'FAILED'|'UNKNOWN'} for every devname. + + Runs `smartctl -H` for each disk in a single SSH session — much faster + than one connection per disk. Returns None on any SSH failure so the + poller can fall back to the previously-stored health value rather than + silently overwriting everything as 'UNKNOWN'. + + `smartctl -H` is the cheap SMART self-assessment lookup (no full + attribute scan) — milliseconds per drive. The output format is stable: + SMART overall-health self-assessment test result: PASSED + SMART overall-health self-assessment test result: FAILED! + For drives that don't support the command at all, smartctl exits + non-zero and we record UNKNOWN for that device specifically. + """ + if not is_configured() or not devnames: + return {} if devnames else None + # Build one shell pipeline that prefixes each result with "@@DEVNAME@@" + # so we can split the combined stdout deterministically. + parts = [] + for d in devnames: + # Reject anything that doesn't look like a basic devname so we + # never inject shell metacharacters into the remote command. + if not d.replace("nvme", "").replace("n", "").replace("p", "").replace("sd", "").isalnum(): + continue + parts.append(f"echo '@@{d}@@'; smartctl -H /dev/{d} 2>&1; echo '@@END@@'") + if not parts: + return {} + cmd = "; ".join(parts) + try: + async with await _connect() as conn: + r = await asyncio.wait_for(conn.run(cmd, check=False), timeout=30) + except Exception: + return None + if not r.stdout: + return None + return _parse_smart_health_batch(r.stdout) + + +def _parse_smart_health_batch(stdout: str) -> dict: + """Pure parser for the batched smartctl -H output. Exposed for tests.""" + result: dict[str, str] = {} + current: str | None = None + buf: list[str] = [] + + def _flush(): + if current is None: + return + text = "\n".join(buf) + if "PASSED" in text: + result[current] = "PASSED" + elif "FAILED" in text or "FAILURE" in text: + result[current] = "FAILED" + else: + result[current] = "UNKNOWN" + + for raw in stdout.splitlines(): + line = raw.strip() + if line.startswith("@@") and line.endswith("@@"): + inner = line[2:-2] + if inner == "END": + _flush() + current = None + buf = [] + else: + _flush() + current = inner + buf = [] + else: + buf.append(line) + _flush() + return result + + +async def get_zfs_member_drives() -> set | None: + """Return devnames of every drive whose partitions carry a ZFS label. + + Combined with get_pool_membership(): a drive in this set but NOT in the + active-pool map carries ZFS data from a previously-imported pool that + was exported (or imported on a different system). We treat those as + locked too — wiping them would silently destroy a pool. + + Returns: + - set() when lsblk succeeded and no drives carry ZFS labels + - None on any failure. Same fail-closed semantics as + get_pool_membership() — callers must NOT treat None as + "no exported drives," that's a security regression. + """ + if not is_configured(): + return set() + cmd = "lsblk -no NAME,FSTYPE -l 2>/dev/null" + try: + async with await _connect() as conn: + r = await conn.run(cmd, check=False) + if r.returncode != 0: + return None + except Exception: + return None + if not r.stdout: + # lsblk with rc==0 and no output is impossible on a normal Linux + # host; treat as failure rather than "no drives at all." + return None + return _parse_lsblk_zfs_output(r.stdout) + + +async def get_system_sensors() -> dict: + """ + Run `sensors -j` on TrueNAS and extract system-level temperatures. + Returns {"cpu_c": int|None, "pch_c": int|None}. + cpu_c = CPU package temp (coretemp chip) + pch_c = PCH/chipset temp (pch_* chip) — proxy for storage I/O lane thermals + Falls back gracefully if SSH is not configured or lm-sensors is unavailable. + """ + if not is_configured(): + return {} + try: + async with await _connect() as conn: + result = await conn.run("sensors -j 2>/dev/null", check=False) + output = result.stdout.strip() + if not output: + return {} + return _parse_sensors_json(output) + except Exception as exc: + log.debug("get_system_sensors failed: %s", exc) + return {} + + +def _parse_sensors_json(output: str) -> dict: + import json as _json + try: + data = _json.loads(output) + except Exception: + return {} + + cpu_c: int | None = None + pch_c: int | None = None + + for chip_name, chip_data in data.items(): + if not isinstance(chip_data, dict): + continue + + # CPU package temp — coretemp chip, "Package id N" sensor + if chip_name.startswith("coretemp") and cpu_c is None: + for sensor_name, sensor_vals in chip_data.items(): + if not isinstance(sensor_vals, dict): + continue + if "package" in sensor_name.lower(): + for k, v in sensor_vals.items(): + if k.endswith("_input") and isinstance(v, (int, float)): + cpu_c = int(round(v)) + break + if cpu_c is not None: + break + + # PCH / chipset temp — manages PCIe lanes including HBA / storage I/O + elif chip_name.startswith("pch_") and pch_c is None: + for sensor_name, sensor_vals in chip_data.items(): + if not isinstance(sensor_vals, dict): + continue + for k, v in sensor_vals.items(): + if k.endswith("_input") and isinstance(v, (int, float)): + pch_c = int(round(v)) + break + if pch_c is not None: + break + + return {"cpu_c": cpu_c, "pch_c": pch_c} + + # --------------------------------------------------------------------------- # Parsers # --------------------------------------------------------------------------- @@ -275,7 +582,7 @@ def _parse_smartctl(output: str) -> dict: def _parse_smart_progress(output: str) -> dict: state = "unknown" - percent_remaining = 0 + percent_remaining = None # None = "in progress but no % line parsed yet" lower = output.lower() diff --git a/app/static/app.css b/app/static/app.css index 94dafc4..094f58f 100644 --- a/app/static/app.css +++ b/app/static/app.css @@ -281,7 +281,11 @@ tr:hover td { .col-size { min-width: 70px; text-align: right; } .col-temp { min-width: 75px; text-align: right; } .col-health { min-width: 85px; } -.col-smart { min-width: 150px; } +.col-smart { min-width: 95px; } +/* Tighter horizontal padding on the SMART columns — they hold short + pills ("Passed"/"—") or a progress bar, so the default 14px gutter + wastes space on 13" laptops. */ +th.col-smart, td.col-smart { padding-left: 6px; padding-right: 6px; } .col-actions { min-width: 170px; } /* ----------------------------------------------------------------------- @@ -1076,6 +1080,56 @@ a.stat-card:hover { .stat-passed .stat-value { color: var(--green); } .stat-idle .stat-value { color: var(--text-muted); } +/* Vertical separator between drive-count cards and sensor chips */ +.stats-bar-sep { + width: 1px; + height: 36px; + background: var(--border); + align-self: center; + flex-shrink: 0; +} + +/* Compact sensor chip — CPU / PCH / Thermal */ +.stat-sensor { + background: var(--bg-card); + border: 1px solid var(--border); + border-radius: 8px; + padding: 6px 12px; + text-align: center; + min-width: 52px; + display: flex; + flex-direction: column; + gap: 2px; +} + +.stat-sensor-val { + font-size: 16px; + font-weight: 700; + font-variant-numeric: tabular-nums; + line-height: 1.1; +} + +.stat-sensor-label { + font-size: 9px; + text-transform: uppercase; + letter-spacing: 0.08em; + color: var(--text-muted); + line-height: 1.2; +} + +/* Thermal pressure states */ +.stat-sensor-thermal-warn { + border-color: var(--yellow-bd); + background: var(--yellow-bg); +} +.stat-sensor-thermal-warn .stat-sensor-val { color: var(--yellow); } + +.stat-sensor-thermal-crit { + border-color: var(--red-bd); + background: var(--red-bg); +} +.stat-sensor-thermal-crit .stat-sensor-val { color: var(--red); } + /* ----------------------------------------------------------------------- Batch action bar (inside filter-bar) ----------------------------------------------------------------------- */ @@ -2372,6 +2426,85 @@ tr.drawer-row-active { color: var(--yellow); } +/* ----------------------------------------------------------------------- + Pool-membership lock indicators +----------------------------------------------------------------------- */ +.pool-lock-icon { + display: inline-block; + margin-right: 4px; + font-size: 12px; + color: var(--yellow); + vertical-align: baseline; +} +.pool-lock-icon.pool-lock-boot { + color: var(--red, #e25555); +} +.pool-pill { + display: inline-block; + margin-top: 3px; + padding: 1px 7px; + font-size: 10.5px; + font-weight: 600; + letter-spacing: 0.3px; + text-transform: uppercase; + border-radius: 4px; + background: color-mix(in srgb, var(--yellow) 14%, transparent); + color: var(--yellow); + border: 1px solid color-mix(in srgb, var(--yellow) 35%, transparent); +} +.pool-pill.pool-pill-boot { + background: color-mix(in srgb, var(--red, #e25555) 16%, transparent); + color: var(--red, #e25555); + border-color: color-mix(in srgb, var(--red, #e25555) 45%, transparent); +} +.pool-pill.pool-pill-exported { + background: color-mix(in srgb, #e07a3f 16%, transparent); + color: #e07a3f; + border-color: color-mix(in srgb, #e07a3f 45%, transparent); +} +.pool-lock-icon.pool-lock-exported { + color: #e07a3f; +} +.btn-unlock { + background: transparent; + border: 1px solid color-mix(in srgb, var(--yellow) 50%, transparent); + color: var(--yellow); + border-radius: 5px; + padding: 3px 9px; + font-size: 12px; + cursor: pointer; + transition: background .15s, color .15s, border-color .15s; +} +.btn-unlock:hover { + background: color-mix(in srgb, var(--yellow) 14%, transparent); +} +.btn-unlock-boot { + border-color: color-mix(in srgb, var(--red, #e25555) 55%, transparent); + color: var(--red, #e25555); +} +.btn-unlock-boot:hover { + background: color-mix(in srgb, var(--red, #e25555) 14%, transparent); +} +.btn-unlock-exported { + border-color: color-mix(in srgb, #e07a3f 55%, transparent); + color: #e07a3f; +} +.btn-unlock-exported:hover { + background: color-mix(in srgb, #e07a3f 14%, transparent); +} +.unlock-countdown { + margin-left: 4px; + font-size: 11px; + color: var(--green, #39c179); + font-variant-numeric: tabular-nums; +} +.unlock-countdown-expired { + color: var(--yellow); +} +.modal.modal-danger { + border-top: 3px solid var(--red, #e25555); +} + /* ----------------------------------------------------------------------- Parallel burn-in inline warning ----------------------------------------------------------------------- */ @@ -2409,41 +2542,3 @@ tr.drawer-row-active { font-variant-numeric: tabular-nums; } -/* ----------------------------------------------------------------------- - Live Terminal drawer panel (xterm.js) ------------------------------------------------------------------------ */ -.drawer-panel-terminal { - padding: 0 !important; - overflow: hidden !important; - position: relative; - background: #0d1117; -} - -/* Let xterm fill the full panel height */ -.drawer-panel-terminal .xterm { - height: 100%; -} -.drawer-panel-terminal .xterm-viewport { - overflow-y: auto !important; -} - -/* Reconnect bar — floats over the terminal when disconnected */ -.term-reconnect-bar { - position: absolute; - bottom: 12px; - right: 12px; - z-index: 20; - display: flex; - align-items: center; - gap: 8px; - background: rgba(13,17,23,0.85); - border: 1px solid var(--border); - border-radius: 6px; - padding: 6px 10px; - font-size: 12px; - color: var(--text-muted); -} -.term-reconnect-bar .btn-secondary { - padding: 3px 10px; - font-size: 11px; -} diff --git a/app/static/app.js b/app/static/app.js index ca59f68..c840936 100644 --- a/app/static/app.js +++ b/app/static/app.js @@ -68,6 +68,7 @@ applyFilter(activeFilter); restoreCheckboxes(); initElapsedTimers(); + initUnlockCountdowns(); initLocationEdits(); if (_drawerDriveId) { _drawerHighlightRow(_drawerDriveId); @@ -135,14 +136,59 @@ if (nb) nb.style.display = 'none'; } - // Handle job-alert SSE events for browser notifications + // Handle SSE events document.addEventListener('htmx:sseMessage', function (e) { - if (!e.detail || e.detail.type !== 'job-alert') return; - try { - handleJobAlert(JSON.parse(e.detail.data)); - } catch (_) {} + if (!e.detail) return; + if (e.detail.type === 'job-alert') { + try { handleJobAlert(JSON.parse(e.detail.data)); } catch (_) {} + } else if (e.detail.type === 'system-sensors') { + try { handleSystemSensors(JSON.parse(e.detail.data)); } catch (_) {} + } }); + function handleSystemSensors(data) { + var st = data.system_temps || {}; + var tp = data.thermal_pressure || 'ok'; + var warn = data.temp_warn_c || 46; + var crit = data.temp_crit_c || 55; + + function tempClass(c) { + if (c == null) return ''; + return c >= crit ? 'temp-hot' : c >= warn ? 'temp-warm' : 'temp-cool'; + } + + // CPU chip + var cpuChip = document.getElementById('sensor-cpu'); + var cpuVal = document.getElementById('sensor-cpu-val'); + if (cpuVal && st.cpu_c != null) { + if (cpuChip) cpuChip.hidden = false; + cpuVal.textContent = st.cpu_c + '°'; + cpuVal.className = 'stat-sensor-val ' + tempClass(st.cpu_c); + } + + // PCH chip + var pchChip = document.getElementById('sensor-pch'); + var pchVal = document.getElementById('sensor-pch-val'); + if (pchVal && st.pch_c != null) { + if (pchChip) pchChip.hidden = false; + pchVal.textContent = st.pch_c + '°'; + pchVal.className = 'stat-sensor-val ' + tempClass(st.pch_c); + } + + // Thermal pressure chip + var tChip = document.getElementById('sensor-thermal'); + var tVal = document.getElementById('sensor-thermal-val'); + if (tChip && tVal) { + if (tp === 'warn' || tp === 'crit') { + tChip.hidden = false; + tChip.className = 'stat-sensor stat-sensor-thermal stat-sensor-thermal-' + tp; + tVal.textContent = tp === 'warn' ? 'WARM' : 'HOT'; + } else { + tChip.hidden = true; + } + } + } + function handleJobAlert(data) { var isPass = data.state === 'passed'; var icon = isPass ? '✓' : '✕'; @@ -203,6 +249,41 @@ initElapsedTimers(); + // Live countdown for pool-drive unlock TTL — runs once per second; ticker + // self-stops when no .unlock-countdown spans remain on the page. + var _unlockTickInterval = null; + function tickUnlockCountdowns() { + var spans = document.querySelectorAll('.unlock-countdown[data-expires]'); + if (spans.length === 0) { + if (_unlockTickInterval) { + clearInterval(_unlockTickInterval); + _unlockTickInterval = null; + } + return; + } + var nowSec = Date.now() / 1000; + spans.forEach(function (el) { + var exp = parseFloat(el.dataset.expires); + if (!exp || isNaN(exp)) return; + var rem = Math.max(0, exp - nowSec); + if (rem <= 0) { + el.textContent = 'expired'; + el.className = 'unlock-countdown unlock-countdown-expired'; + return; + } + var m = Math.floor(rem / 60); + var s = Math.floor(rem % 60); + el.textContent = '\u{1F513} ' + m + ':' + (s < 10 ? '0' : '') + s; + }); + } + function initUnlockCountdowns() { + if (_unlockTickInterval) return; + if (document.querySelectorAll('.unlock-countdown[data-expires]').length === 0) return; + _unlockTickInterval = setInterval(tickUnlockCountdowns, 1000); + tickUnlockCountdowns(); + } + initUnlockCountdowns(); + // ----------------------------------------------------------------------- // Inline location / notes edit // ----------------------------------------------------------------------- @@ -538,7 +619,16 @@ var data = await resp.json(); if (!resp.ok) { - showToast(data.detail || 'Failed to start burn-in', 'error'); + // detail may be the structured pool-locked object {drive_id, + // pool_name, pool_role, pool_locked: true, error: "..."}. + // The user already opened the start modal, so the unlock TTL must + // have just expired between modal-open and submit. Auto-flip to + // the unlock modal for that drive. + if (_handlePoolLockedError(data.detail)) { + closeModal(); + return; + } + showToast(_extractErrorMessage(data.detail) || 'Failed to start burn-in', 'error'); return; } @@ -549,6 +639,161 @@ } } + // Helpers shared between single-drive and batch start error paths. + // Backend returns either a string (legacy errors) or, for pool-locked + // drives, an object: {drive_id, error, pool_name, pool_role, pool_locked}. + function _extractErrorMessage(detail) { + if (!detail) return null; + if (typeof detail === 'string') return detail; + if (typeof detail === 'object' && detail.error) return detail.error; + return null; + } + // Returns true if it handled a pool-locked error by opening the unlock + // modal for the offending drive. Caller should bail out. + function _handlePoolLockedError(detail) { + if (!detail || typeof detail !== 'object' || !detail.pool_locked) return false; + var driveId = detail.drive_id; + if (driveId == null) return false; + var btn = document.querySelector('.btn-unlock[data-drive-id="' + driveId + '"]'); + if (btn) { + // openUnlockModal closes any other open modals as a side effect of + // calling its own close handlers; we still need to close the + // start/batch modal explicitly in the caller, since openUnlockModal + // doesn't know which one is open. + openUnlockModal(btn); + return true; + } + // Unlock button not in the DOM (drive row may have refreshed). + // Surface a descriptive toast instead of [object Object]. + showToast( + (detail.error || 'Drive is pool-locked') + + ' Reload the page and click Unlock on the drive row.', + 'error', + ); + return true; + } + + // ----------------------------------------------------------------------- + // Pool-drive Unlock modal + // ----------------------------------------------------------------------- + + var unlockDriveId = null; + var unlockExpectedToken = null; + + function openUnlockModal(btn) { + unlockDriveId = btn.dataset.driveId; + var poolName = btn.dataset.poolName || ''; + var poolRole = btn.dataset.poolRole || 'data'; + var isBoot = btn.dataset.isBootPool === '1'; + var isExported = btn.dataset.isExported === '1'; + if (isBoot) unlockExpectedToken = 'DESTROY BOOT POOL'; + else if (isExported) unlockExpectedToken = 'DESTROY EXPORTED POOL'; + else unlockExpectedToken = poolName; + + document.getElementById('unlock-devname').textContent = btn.dataset.devname || '—'; + document.getElementById('unlock-model').textContent = btn.dataset.model || '—'; + document.getElementById('unlock-serial').textContent = btn.dataset.serial || '—'; + document.getElementById('unlock-size').textContent = btn.dataset.size || '—'; + + var chip = document.getElementById('unlock-pool-chip'); + if (isExported) { + chip.textContent = 'exported ZFS'; + chip.className = 'chip chip-aborted'; + } else { + chip.textContent = poolName + ' · ' + poolRole; + chip.className = 'chip ' + (isBoot ? 'chip-failed' : 'chip-aborted'); + } + + var titleEl = document.getElementById('unlock-modal-title'); + var warnTitle = document.getElementById('unlock-warning-title'); + var warnBody = document.getElementById('unlock-warning-body'); + if (isBoot) { + titleEl.textContent = 'Unlock BOOT POOL drive'; + warnTitle.textContent = 'This is a TrueNAS BOOT drive.'; + warnBody.textContent = + 'Running burn-in on this drive will destroy the operating system on it. ' + + 'If this drive is half of a mirrored boot pool, the system will continue running on the other mirror, ' + + 'but you must already have a replacement plan. Proceeding without one bricks the host.'; + } else if (isExported) { + titleEl.textContent = 'Unlock drive with EXPORTED ZFS data'; + warnTitle.textContent = 'This drive carries ZFS data from a previously-imported pool.'; + warnBody.textContent = + "TrueNAS isn't using this pool right now, but the drive still holds the labels and data. " + + 'Burning it in will silently destroy whatever pool that data belongs to — including ' + + 'pools that another system may be relying on. Confirm you have already evacuated or ' + + 'reassigned the pool before continuing.'; + } else { + titleEl.textContent = 'Unlock pool drive'; + warnTitle.textContent = "This drive belongs to zpool '" + poolName + "'."; + warnBody.textContent = + 'Running a destructive burn-in stage will overwrite all data on this drive ' + + 'and almost certainly destroy the pool. Only proceed if you have already ' + + 'removed this drive from the pool, or if you are intentionally decommissioning the pool.'; + } + document.getElementById('unlock-confirm-token').textContent = unlockExpectedToken; + document.getElementById('unlock-confirm-hint').textContent = 'Expected: ' + unlockExpectedToken; + + document.getElementById('unlock-confirm-input').value = ''; + document.getElementById('unlock-reason-input').value = ''; + var savedOp = localStorage.getItem('burnin_operator') || ''; + document.getElementById('unlock-operator-input').value = savedOp; + validateUnlockModal(); + + document.getElementById('unlock-modal').removeAttribute('hidden'); + setTimeout(function () { + document.getElementById('unlock-operator-input').focus(); + }, 50); + } + + function closeUnlockModal() { + document.getElementById('unlock-modal').setAttribute('hidden', ''); + unlockDriveId = null; + unlockExpectedToken = null; + } + + function validateUnlockModal() { + var op = (document.getElementById('unlock-operator-input').value || '').trim(); + var rsn = (document.getElementById('unlock-reason-input').value || '').trim(); + var tok = (document.getElementById('unlock-confirm-input').value || '').trim(); + var ok = op.length > 0 && rsn.length >= 5 && tok === unlockExpectedToken; + document.getElementById('unlock-modal-submit-btn').disabled = !ok; + } + + async function submitUnlock() { + var op = (document.getElementById('unlock-operator-input').value || '').trim(); + var rsn = (document.getElementById('unlock-reason-input').value || '').trim(); + var tok = (document.getElementById('unlock-confirm-input').value || '').trim(); + localStorage.setItem('burnin_operator', op); + + var btn = document.getElementById('unlock-modal-submit-btn'); + btn.disabled = true; + + try { + var resp = await fetch('/api/v1/drives/' + unlockDriveId + '/unlock', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + confirm_token: tok, + operator: op, + reason: rsn, + }), + }); + var data = await resp.json(); + if (!resp.ok) { + showToast(data.detail || 'Unlock failed', 'error'); + btn.disabled = false; + return; + } + closeUnlockModal(); + showToast('Unlocked for 10 minutes — start burn-in now to use it.', 'success'); + // Force a drive list refresh so the row flips from Unlock → Burn-In + if (typeof refreshDrives === 'function') refreshDrives(); + } catch (err) { + showToast('Network error', 'error'); + btn.disabled = false; + } + } + // ----------------------------------------------------------------------- // Batch Burn-In // ----------------------------------------------------------------------- @@ -729,7 +974,11 @@ }); var data = await resp.json(); if (!resp.ok) { - showToast(data.detail || 'Failed to queue batch', 'error'); + if (_handlePoolLockedError(data.detail)) { + closeBatchModal(); + return; + } + showToast(_extractErrorMessage(data.detail) || 'Failed to queue batch', 'error'); if (btn) btn.disabled = false; return; } @@ -737,11 +986,18 @@ closeBatchModal(); checkedDriveIds.clear(); updateBatchBar(); - var queued = (data.queued || []).length; - var errors = (data.errors || []).length; - var msg = queued + ' burn-in(s) queued'; - if (errors) msg += ', ' + errors + ' skipped (already active)'; - showToast(msg, errors && !queued ? 'error' : 'success'); + var queued = (data.queued || []).length; + var allErrors = data.errors || []; + var poolLocked = allErrors.filter(function (e) { return e && e.pool_locked; }); + var alreadyActive = allErrors.length - poolLocked.length; + + var parts = [queued + ' burn-in(s) queued']; + if (alreadyActive) parts.push(alreadyActive + ' skipped (already active)'); + if (poolLocked.length) { + parts.push(poolLocked.length + ' pool-locked (use Unlock on each row)'); + } + var tone = (queued === 0 && allErrors.length) ? 'error' : 'success'; + showToast(parts.join(', '), tone); } catch (err) { showToast('Network error', 'error'); if (btn) btn.disabled = false; @@ -792,6 +1048,10 @@ var cancelSmartBtn = e.target.closest('.btn-cancel-smart'); if (cancelSmartBtn && !cancelSmartBtn.disabled) { cancelSmartTest(cancelSmartBtn); return; } + // Pool-drive unlock button (single drive) + var unlockBtn = e.target.closest('.btn-unlock'); + if (unlockBtn && !unlockBtn.disabled) { openUnlockModal(unlockBtn); return; } + // Burn-in start button (single drive) var startBtn = e.target.closest('.btn-start'); if (startBtn && !startBtn.disabled) { openModal(startBtn); return; } @@ -820,6 +1080,14 @@ return; } + // Unlock modal + if (e.target.closest('#unlock-modal-close-btn') || e.target.closest('#unlock-modal-cancel-btn')) { + closeUnlockModal(); + return; + } + if (e.target.id === 'unlock-modal') { closeUnlockModal(); return; } + if (e.target.id === 'unlock-modal-submit-btn') { submitUnlock(); return; } + // Batch modal close if (e.target.closest('#batch-modal-close-btn') || e.target.closest('#batch-modal-cancel-btn')) { closeBatchModal(); @@ -837,11 +1105,15 @@ document.addEventListener('input', function (e) { var id = e.target.id; + if (id === 'unlock-operator-input' || id === 'unlock-reason-input' || + id === 'unlock-confirm-input') validateUnlockModal(); if (id === 'operator-input' || id === 'confirm-serial') validateModal(); }); document.addEventListener('keydown', function (e) { if (e.key === 'Escape') { + var uModal = document.getElementById('unlock-modal'); + if (uModal && !uModal.hidden) { closeUnlockModal(); return; } var modal = document.getElementById('start-modal'); if (modal && !modal.hidden) { closeModal(); return; } var bModal = document.getElementById('batch-modal'); @@ -1117,14 +1389,6 @@ document.querySelectorAll('.drawer-panel').forEach(function (p) { p.classList.toggle('active', p.id === 'drawer-panel-' + _drawerTab); }); - // Terminal tab: init/fit on activation; hide autoscroll (N/A for terminal) - var asl = document.querySelector('.autoscroll-label'); - if (_drawerTab === 'terminal') { - if (asl) asl.style.visibility = 'hidden'; - openTerminalTab(); - } else { - if (asl) asl.style.visibility = ''; - } }); // Close button @@ -1149,155 +1413,4 @@ }).catch(function () { showToast('Network error', 'error'); }); }); - // ----------------------------------------------------------------------- - // Live Terminal (xterm.js + SSH WebSocket) - // ----------------------------------------------------------------------- - - var _xtermReady = false; // xterm.js + FitAddon libraries loaded - var _terminal = null; // xterm.js Terminal instance - var _termFit = null; // FitAddon instance - var _termWs = null; // active WebSocket (null = disconnected) - - function _loadXtermLibs(cb) { - var link = document.createElement('link'); - link.rel = 'stylesheet'; - link.href = 'https://cdn.jsdelivr.net/npm/xterm@5.3.0/css/xterm.css'; - document.head.appendChild(link); - - var s1 = document.createElement('script'); - s1.src = 'https://cdn.jsdelivr.net/npm/xterm@5.3.0/lib/xterm.js'; - s1.onload = function () { - var s2 = document.createElement('script'); - s2.src = 'https://cdn.jsdelivr.net/npm/xterm-addon-fit@0.8.0/lib/xterm-addon-fit.js'; - s2.onload = cb; - document.head.appendChild(s2); - }; - document.head.appendChild(s1); - } - - function openTerminalTab() { - var panel = document.getElementById('drawer-panel-terminal'); - if (!panel) return; - - if (!_xtermReady) { - panel.innerHTML = '
    Loading terminal\u2026
    '; - _loadXtermLibs(function () { - _xtermReady = true; - _termInit(panel); - }); - return; - } - - if (!_terminal) { - _termInit(panel); - return; - } - - // Already initialised — refit to current panel dimensions - setTimeout(function () { - if (_termFit) try { _termFit.fit(); } catch (_) {} - }, 30); - } - - function _termInit(panel) { - panel.innerHTML = ''; - - var term = new Terminal({ - cursorBlink: true, - fontSize: 13, - fontFamily: '"SF Mono","Fira Code",Consolas,"DejaVu Sans Mono",monospace', - theme: { - background: '#0d1117', - foreground: '#e6edf3', - cursor: '#58a6ff', - cursorAccent: '#0d1117', - selectionBackground: 'rgba(88,166,255,0.25)', - black: '#484f58', red: '#ff7b72', green: '#3fb950', yellow: '#d29922', - blue: '#58a6ff', magenta: '#bc8cff', cyan: '#39c5cf', white: '#b1bac4', - brightBlack: '#6e7681', brightRed: '#ffa198', brightGreen: '#56d364', - brightYellow: '#e3b341', brightBlue: '#79c0ff', brightMagenta: '#d2a8ff', - brightCyan: '#56d4dd', brightWhite: '#f0f6fc', - }, - scrollback: 2000, - allowProposedApi: true, - }); - - var fit = new FitAddon.FitAddon(); - term.loadAddon(fit); - term.open(panel); - - _terminal = term; - _termFit = fit; - - // Initial fit after the panel is visible - setTimeout(function () { - if (_termFit) try { _termFit.fit(); } catch (_) {} - }, 30); - - // Forward all keystrokes → SSH (onData registered once here) - term.onData(function (data) { - if (_termWs && _termWs.readyState === 1) { - _termWs.send(new TextEncoder().encode(data)); - } - }); - - // Refit + notify server on resize - new ResizeObserver(function () { - if (!_termFit) return; - try { _termFit.fit(); } catch (_) {} - if (_termWs && _termWs.readyState === 1 && _terminal) { - _termWs.send(JSON.stringify({ type: 'resize', cols: _terminal.cols, rows: _terminal.rows })); - } - }).observe(panel); - - _termConnect(); - } - - function _termConnect() { - if (_termWs && _termWs.readyState <= 1) return; // already open or connecting - - var proto = location.protocol === 'https:' ? 'wss:' : 'ws:'; - var ws = new WebSocket(proto + '//' + location.host + '/ws/terminal'); - ws.binaryType = 'arraybuffer'; - _termWs = ws; - - ws.onopen = function () { - _termHideReconnect(); - if (_terminal && ws.readyState === 1) { - ws.send(JSON.stringify({ type: 'resize', cols: _terminal.cols, rows: _terminal.rows })); - } - }; - - ws.onmessage = function (e) { - if (!_terminal) return; - _terminal.write(e.data instanceof ArrayBuffer ? new Uint8Array(e.data) : e.data); - }; - - ws.onclose = function () { - if (_terminal) _terminal.write('\r\n\x1b[33m\u2500\u2500 disconnected \u2500\u2500\x1b[0m\r\n'); - _termShowReconnect(); - }; - - ws.onerror = function () { /* onclose fires too */ }; - } - - function _termShowReconnect() { - var panel = document.getElementById('drawer-panel-terminal'); - if (!panel || panel.querySelector('.term-reconnect-bar')) return; - var bar = document.createElement('div'); - bar.className = 'term-reconnect-bar'; - bar.innerHTML = 'Connection closed' - + ''; - bar.querySelector('button').onclick = function () { - bar.remove(); - _termConnect(); - }; - panel.appendChild(bar); - } - - function _termHideReconnect() { - var bar = document.querySelector('.term-reconnect-bar'); - if (bar) bar.remove(); - } - }()); diff --git a/app/templates/components/drives_table.html b/app/templates/components/drives_table.html index b9c7de2..77ea2ff 100644 --- a/app/templates/components/drives_table.html +++ b/app/templates/components/drives_table.html @@ -80,11 +80,14 @@ {%- set bi_active = drive.burnin and drive.burnin.state in ('queued', 'running') %} {%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %} {%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %} - {%- set selectable = not bi_active and not short_busy and not long_busy %} + {%- set pool_locked = drive.pool_name and not drive.pool_unlocked_until %} + {%- set is_boot_pool = drive.pool_name == 'boot-pool' %} + {%- set is_exported = drive.pool_role == 'exported' %} + {%- set selectable = not bi_active and not short_busy and not long_busy and not pool_locked %} {%- set bi_done = drive.burnin and drive.burnin.state in ('passed', 'failed', 'cancelled', 'unknown') %} {%- set smart_done = (drive.smart_short and drive.smart_short.state in ('passed','failed','aborted')) or (drive.smart_long and drive.smart_long.state in ('passed','failed','aborted')) %} - {%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy %} + {%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy and not pool_locked %} {%- if selectable %} @@ -92,8 +95,18 @@ {%- endif %} - {{ drive.devname }} + + {%- if drive.pool_name -%} + 🔒 + {%- endif -%} + {{ drive.devname }} + {{ drive.model or "Unknown" }} + {%- if drive.pool_name %} + {% if is_exported %}exported ZFS{% else %}{{ drive.pool_name }} · {{ drive.pool_role or 'data' }}{% endif %} + {%- endif %} {%- if drive.location %} Long {%- endif %} + {%- if pool_locked %} + + + {%- else %} + title="Start Burn-In{% if drive.pool_name %} (UNLOCKED — pool drive){% endif %}">Burn-In{% if drive.pool_name %} 🔓{% endif %} {%- if can_reset %} {%- endif %} {%- endif %} + {%- endif %} diff --git a/app/templates/components/modal_unlock.html b/app/templates/components/modal_unlock.html new file mode 100644 index 0000000..5d6c761 --- /dev/null +++ b/app/templates/components/modal_unlock.html @@ -0,0 +1,69 @@ + diff --git a/app/templates/dashboard.html b/app/templates/dashboard.html index dd7ec38..b9c9cdf 100644 --- a/app/templates/dashboard.html +++ b/app/templates/dashboard.html @@ -5,8 +5,9 @@ {% block content %} {% include "components/modal_start.html" %} {% include "components/modal_batch.html" %} +{% include "components/modal_unlock.html" %} - +
    {{ drives | length }} @@ -28,6 +29,33 @@ 0 Idle
    + + {%- set st = poller.system_temps if (poller and poller.system_temps) else {} %} + {%- if st.get('cpu_c') is not none or st.get('pch_c') is not none %} +
    + {%- if st.get('cpu_c') is not none %} +
    + {{ st.get('cpu_c') }}° + CPU +
    + {%- endif %} + {%- if st.get('pch_c') is not none %} +
    + {{ st.get('pch_c') }}° + PCH +
    + {%- endif %} + {%- endif %} + + {%- set tp = poller.thermal_pressure if poller else 'ok' %} +
    + + {%- if tp == 'warn' %}WARM{%- elif tp == 'crit' %}HOT{%- else %}OK{%- endif %} + + Thermal +
    @@ -83,7 +111,6 @@ -
    {% endblock %} diff --git a/app/templates/settings.html b/app/templates/settings.html index 2578e06..17671aa 100644 --- a/app/templates/settings.html +++ b/app/templates/settings.html @@ -248,6 +248,30 @@ type="number" min="0" max="9999" value="{{ editable.bad_block_threshold }}"> Max bad blocks before surface validate fails (Stage 7) + +
    + + + badblocks -b. 4096 (default) is conservative; 8192 is faster on multi-TB HDDs (~2x RAM, ~half the runtime). Power of 2. +
    + +
    + + + badblocks -c. 64 (default) matches the upstream tool. Buffer = block_size × this many blocks per IO. +
    + +
    + + + badblocks -p. 1 = repeat until one consecutive clean scan (default). 2-3 for paranoid burn-in that re-confirms after errors. +
    diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_pool_parser.py b/tests/test_pool_parser.py new file mode 100644 index 0000000..95e0947 --- /dev/null +++ b/tests/test_pool_parser.py @@ -0,0 +1,283 @@ +"""Unit tests for the zpool-list and lsblk parsers in ssh_client. + +These cover the structural cases that drive the pool-membership lock: +mirror/raidz/draid container vdevs, single-disk vdevs at depth 1, the +flattened-indentation behaviour of `zpool list -vHP` on TrueNAS, partition +suffix stripping for NVMe and SCSI, and the cache/log/spare/special +section markers (including plural variants). + +Run with: python -m unittest discover tests/ -v +""" + +import unittest + +from app.ssh_client import ( + _parse_zpool_list_output, + _parse_lsblk_zfs_output, + _parse_smart_health_batch, +) + + +class TestParseZpoolList(unittest.TestCase): + + def test_empty_output_returns_empty(self): + self.assertEqual(_parse_zpool_list_output(""), {}) + + def test_single_pool_with_mirror(self): + # TrueNAS-flattened output: pool at depth 0, vdev type and devices + # all at depth 1. + out = _parse_zpool_list_output( + "boot-pool\t232G\t8.4G\t224G\t-\t-\t17%\t3%\t1.00x\tONLINE\t-\n" + "\tmirror-0\t232G\t8.4G\t224G\t-\t-\t17%\t3.6%\t-\tONLINE\n" + "\t/dev/nvme0n1p3\t232G\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdd3\t232G\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out, { + "nvme0n1": {"pool": "boot-pool", "role": "data"}, + "sdd": {"pool": "boot-pool", "role": "data"}, + }) + + def test_raidz2_pool(self): + out = _parse_zpool_list_output( + "tank\t127T\t4.5T\t122T\t-\t-\t0%\t3%\t1.00x\tONLINE\t-\n" + "\traidz2-0\t127T\t4.5T\t122T\t-\t-\t0%\t3%\t-\tONLINE\n" + "\t/dev/sdc\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sde\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdf\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(set(out.keys()), {"sdc", "sde", "sdf"}) + for v in out.values(): + self.assertEqual(v, {"pool": "tank", "role": "data"}) + + def test_draid_pool(self): + out = _parse_zpool_list_output( + "warm\t100T\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tdraid2:8d:10c:1s-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdg\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdh\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["sdg"], {"pool": "warm", "role": "data"}) + self.assertEqual(out["sdh"], {"pool": "warm", "role": "data"}) + + def test_single_disk_vdev_at_depth_1(self): + # No mirror/raidz wrapper — a `/dev/...` line itself sits at depth 1. + out = _parse_zpool_list_output( + "scratch\t1T\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\t/dev/sdi\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out, {"sdi": {"pool": "scratch", "role": "data"}}) + + def test_section_markers_switch_role(self): + # cache / log / spare / special / dedup all at depth 1; subsequent + # /dev/... lines (also at depth 1) inherit that role. + out = _parse_zpool_list_output( + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tmirror-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdb\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\tcache\n" + "\t/dev/nvme1n1\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\tlog\n" + "\t/dev/nvme2n1\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\tspare\n" + "\t/dev/sdz\t-\t-\t-\t-\t-\t-\t-\t-\tAVAIL\n" + ) + self.assertEqual(out["sda"], {"pool": "tank", "role": "data"}) + self.assertEqual(out["sdb"], {"pool": "tank", "role": "data"}) + self.assertEqual(out["nvme1n1"], {"pool": "tank", "role": "cache"}) + self.assertEqual(out["nvme2n1"], {"pool": "tank", "role": "log"}) + self.assertEqual(out["sdz"], {"pool": "tank", "role": "spare"}) + + def test_section_markers_plurals_normalize(self): + # ZFS sometimes emits 'logs'/'spares' instead of 'log'/'spare'. + out = _parse_zpool_list_output( + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tlogs\n" + "\t/dev/nvme0n1\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\tspares\n" + "\t/dev/sdz\t-\t-\t-\t-\t-\t-\t-\t-\tAVAIL\n" + ) + self.assertEqual(out["nvme0n1"]["role"], "log") + self.assertEqual(out["sdz"]["role"], "spare") + + def test_special_and_dedup_section(self): + out = _parse_zpool_list_output( + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tspecial\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\tdedup\n" + "\t/dev/sdb\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["sda"]["role"], "special") + self.assertEqual(out["sdb"]["role"], "dedup") + + def test_partition_suffix_stripped(self): + out = _parse_zpool_list_output( + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tmirror-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sda3\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/nvme0n1p3\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertIn("sda", out) + self.assertNotIn("sda3", out) + self.assertIn("nvme0n1", out) + self.assertNotIn("nvme0n1p3", out) + + def test_long_scsi_devname(self): + # Past sdz: sdaa, sdab, ... + out = _parse_zpool_list_output( + "big\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\traidz3-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdaa\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdab1\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["sdaa"]["pool"], "big") + self.assertEqual(out["sdab"]["pool"], "big") # partition stripped + + def test_pool_name_with_dashes_dots_underscores(self): + out = _parse_zpool_list_output( + "my-cool_pool.v2\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["sda"]["pool"], "my-cool_pool.v2") + + def test_multiple_pools(self): + out = _parse_zpool_list_output( + "boot-pool\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tmirror-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/nvme0n1p3\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdd3\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\traidz2-0\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "\t/dev/sdb\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["nvme0n1"]["pool"], "boot-pool") + self.assertEqual(out["sdd"]["pool"], "boot-pool") + self.assertEqual(out["sda"]["pool"], "tank") + self.assertEqual(out["sdb"]["pool"], "tank") + + def test_pool_role_resets_between_pools(self): + # Section marker in pool A must not carry into pool B. + out = _parse_zpool_list_output( + "a\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\tcache\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + "b\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\t/dev/sdb\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out["sda"]["role"], "cache") + self.assertEqual(out["sdb"]["role"], "data") + + def test_blank_lines_skipped(self): + out = _parse_zpool_list_output( + "\n" + "tank\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\t-\n" + "\n" + "\t/dev/sda\t-\t-\t-\t-\t-\t-\t-\t-\tONLINE\n" + ) + self.assertEqual(out, {"sda": {"pool": "tank", "role": "data"}}) + + +class TestParseLsblkZfs(unittest.TestCase): + + def test_empty_returns_empty_set(self): + self.assertEqual(_parse_lsblk_zfs_output(""), set()) + + def test_partition_zfs_member(self): + # Typical TrueNAS layout: zpool members are partitions. + out = _parse_lsblk_zfs_output( + "sda \n" + "sda1 \n" + "sda3 zfs_member\n" + "sdb \n" + "sdb3 zfs_member\n" + ) + self.assertEqual(out, {"sda", "sdb"}) + + def test_whole_disk_zfs_member(self): + # Some configurations put zfs_member on the whole disk. + out = _parse_lsblk_zfs_output( + "sdc zfs_member\n" + ) + self.assertEqual(out, {"sdc"}) + + def test_nvme_partitioned_and_whole(self): + out = _parse_lsblk_zfs_output( + "nvme0n1 \n" + "nvme0n1p3 zfs_member\n" + "nvme1n1 zfs_member\n" + ) + self.assertEqual(out, {"nvme0n1", "nvme1n1"}) + + def test_non_zfs_fstypes_ignored(self): + out = _parse_lsblk_zfs_output( + "sda1 ext4\n" + "sda2 swap\n" + "sdb1 btrfs\n" + ) + self.assertEqual(out, set()) + + def test_long_scsi_devnames(self): + out = _parse_lsblk_zfs_output( + "sdaa zfs_member\n" + "sdab1 zfs_member\n" + ) + self.assertEqual(out, {"sdaa", "sdab"}) + + def test_short_lines_skipped(self): + out = _parse_lsblk_zfs_output( + "sda\n" + "\n" + "sdb1 zfs_member\n" + ) + self.assertEqual(out, {"sdb"}) + + +class TestParseSmartHealthBatch(unittest.TestCase): + + def test_passed_drive(self): + out = _parse_smart_health_batch( + "@@sda@@\n" + "smartctl 7.4 2023-08-01 r5530 [x86_64-linux-6.6]\n" + "SMART overall-health self-assessment test result: PASSED\n" + "@@END@@\n" + ) + self.assertEqual(out, {"sda": "PASSED"}) + + def test_failed_drive(self): + out = _parse_smart_health_batch( + "@@sdb@@\n" + "SMART overall-health self-assessment test result: FAILED!\n" + "@@END@@\n" + ) + self.assertEqual(out, {"sdb": "FAILED"}) + + def test_unknown_when_no_marker(self): + out = _parse_smart_health_batch( + "@@sdc@@\n" + "/dev/sdc: Unknown USB bridge\n" + "@@END@@\n" + ) + self.assertEqual(out, {"sdc": "UNKNOWN"}) + + def test_multiple_drives_mixed_states(self): + out = _parse_smart_health_batch( + "@@sda@@\n" + "SMART overall-health self-assessment test result: PASSED\n" + "@@END@@\n" + "@@sdb@@\n" + "SMART overall-health self-assessment test result: FAILED!\n" + "@@END@@\n" + "@@nvme0n1@@\n" + "SMART overall-health self-assessment test result: PASSED\n" + "@@END@@\n" + ) + self.assertEqual(out, {"sda": "PASSED", "sdb": "FAILED", "nvme0n1": "PASSED"}) + + def test_empty_returns_empty(self): + self.assertEqual(_parse_smart_health_batch(""), {}) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_unlock_flow.py b/tests/test_unlock_flow.py new file mode 100644 index 0000000..626a3fd --- /dev/null +++ b/tests/test_unlock_flow.py @@ -0,0 +1,303 @@ +"""Unit tests for the pool-drive unlock state machine in burnin.py. + +Covers: token validation per pool kind, identity-binding (grant +invalidated when pool_name/pool_role changes), TTL expiry, the +audit-commit-then-arm ordering (a failing audit insert leaves no +in-memory grant), and the unique-active-burnin partial index that +prevents duplicate queued rows for the same drive. + +Uses an in-memory SQLite DB and monkeypatches app.config.settings.db_path. +No SSH, no network, no FastAPI. + +Run with: python -m unittest discover tests/ -v +""" + +import os +import tempfile +import time +import unittest + +import aiosqlite + + +async def _setup_temp_db() -> str: + """Create a temp SQLite file, point app.config at it, init schema. + Async-callable from IsolatedAsyncioTestCase.asyncSetUp.""" + fd, path = tempfile.mkstemp(suffix=".db") + os.close(fd) + from app.config import settings + settings.db_path = path + + from app.database import init_db + await init_db() + # Seed pool drives so unlock_flow tests have something to grant on. + async with aiosqlite.connect(path) as db: + await db.execute(""" + INSERT INTO drives + (truenas_disk_id, devname, serial, model, size_bytes, + temperature_c, smart_health, last_seen_at, last_polled_at, + pool_name, pool_role, pool_seen_at) + VALUES ('test-id-1', 'sda', 'TESTSER1', 'TestModel', 1000, + 30, 'PASSED', '2026-05-02T00:00:00+00:00', + '2026-05-02T00:00:00+00:00', + 'tank', 'data', '2026-05-02T00:00:00+00:00') + """) + await db.execute(""" + INSERT INTO drives + (truenas_disk_id, devname, serial, model, size_bytes, + temperature_c, smart_health, last_seen_at, last_polled_at, + pool_name, pool_role, pool_seen_at) + VALUES ('test-id-2', 'sdb', 'TESTSER2', 'TestModel', 1000, + 30, 'PASSED', '2026-05-02T00:00:00+00:00', + '2026-05-02T00:00:00+00:00', + 'boot-pool', 'data', '2026-05-02T00:00:00+00:00') + """) + await db.execute(""" + INSERT INTO drives + (truenas_disk_id, devname, serial, model, size_bytes, + temperature_c, smart_health, last_seen_at, last_polled_at, + pool_name, pool_role, pool_seen_at) + VALUES ('test-id-3', 'sdc', 'TESTSER3', 'TestModel', 1000, + 30, 'PASSED', '2026-05-02T00:00:00+00:00', + '2026-05-02T00:00:00+00:00', + '(exported)', 'exported', '2026-05-02T00:00:00+00:00') + """) + await db.commit() + return path + + +class TestUnlockFlow(unittest.IsolatedAsyncioTestCase): + + async def asyncSetUp(self): + self.db_path = await _setup_temp_db() + # Reset module state so previous test runs don't bleed in. + from app import burnin + burnin._unlock_grants.clear() + + async def asyncTearDown(self): + try: + os.unlink(self.db_path) + except OSError: + pass + + # ----- token validation per pool kind ----- + + async def test_active_pool_token_is_pool_name(self): + from app import burnin + # Drive 1 = tank/data + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(1, "wrong", "op", "valid reason") + expiry = await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + self.assertGreater(expiry, time.time()) + + async def test_boot_pool_token_is_destroy_phrase(self): + from app import burnin + # Drive 2 = boot-pool — typing the pool name must NOT work. + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(2, "boot-pool", "op", "valid reason") + expiry = await burnin.grant_pool_unlock( + 2, "DESTROY BOOT POOL", "op", "valid reason" + ) + self.assertGreater(expiry, time.time()) + + async def test_exported_token_is_destroy_phrase(self): + from app import burnin + # Drive 3 = (exported)/exported + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(3, "(exported)", "op", "valid reason") + expiry = await burnin.grant_pool_unlock( + 3, "DESTROY EXPORTED POOL", "op", "valid reason" + ) + self.assertGreater(expiry, time.time()) + + # ----- input validation ----- + + async def test_empty_reason_rejected(self): + from app import burnin + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(1, "tank", "op", "") + + async def test_short_reason_rejected(self): + from app import burnin + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(1, "tank", "op", "hi") + + async def test_empty_operator_rejected(self): + from app import burnin + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(1, "tank", "", "valid reason") + + async def test_unknown_drive_rejected(self): + from app import burnin + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(99999, "anything", "op", "valid reason") + + async def test_drive_not_in_pool_rejected(self): + from app import burnin + # Manually clear pool fields on drive 1 + async with aiosqlite.connect(self.db_path) as db: + await db.execute("UPDATE drives SET pool_name=NULL, pool_role=NULL WHERE id=1") + await db.commit() + with self.assertRaises(ValueError): + await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + + # ----- identity binding (Codex finding #2) ----- + + async def test_grant_invalidated_when_pool_name_changes(self): + from app import burnin + await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + # Operator's grant references tank/data; pool detection now reports tank2. + self.assertTrue(burnin._is_unlocked(1, "tank", "data")) + self.assertFalse(burnin._is_unlocked(1, "tank2", "data")) + # And the side effect: the grant is reaped, not just temporarily denied. + self.assertNotIn(1, burnin._unlock_grants) + + async def test_grant_invalidated_when_pool_role_changes(self): + from app import burnin + await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + # Same pool, different role (data -> cache). + self.assertFalse(burnin._is_unlocked(1, "tank", "cache")) + self.assertNotIn(1, burnin._unlock_grants) + + async def test_unlock_expiry_returns_none_for_mismatched_identity(self): + from app import burnin + await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + self.assertIsNotNone(burnin.unlock_expiry(1, "tank", "data")) + self.assertIsNone(burnin.unlock_expiry(1, "tank2", "data")) + + # ----- TTL expiry ----- + + async def test_expired_grant_returns_false(self): + from app import burnin + # Drop TTL to 0 so the grant is born expired. + original = burnin.UNLOCK_TTL_SECONDS + burnin.UNLOCK_TTL_SECONDS = 0 + try: + await burnin.grant_pool_unlock(1, "tank", "op", "valid reason") + self.assertFalse(burnin._is_unlocked(1, "tank", "data")) + self.assertNotIn(1, burnin._unlock_grants) + finally: + burnin.UNLOCK_TTL_SECONDS = original + + # ----- audit commit ordering (Codex finding #3) ----- + + async def test_audit_event_recorded_for_active_pool(self): + from app import burnin + await burnin.grant_pool_unlock(1, "tank", "alice", "swapping out drive") + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + cur = await db.execute( + "SELECT event_type, operator, message FROM audit_events " + "WHERE drive_id=? ORDER BY id DESC LIMIT 1", (1,) + ) + row = await cur.fetchone() + self.assertEqual(row["event_type"], "pool_drive_unlocked") + self.assertEqual(row["operator"], "alice") + self.assertIn("swapping out drive", row["message"]) + + async def test_audit_event_for_boot_pool_uses_distinct_type(self): + from app import burnin + await burnin.grant_pool_unlock( + 2, "DESTROY BOOT POOL", "alice", "replacing failed mirror" + ) + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + cur = await db.execute( + "SELECT event_type FROM audit_events WHERE drive_id=? ORDER BY id DESC LIMIT 1", + (2,), + ) + row = await cur.fetchone() + self.assertEqual(row["event_type"], "boot_pool_drive_unlocked") + + async def test_audit_event_for_exported_uses_distinct_type(self): + from app import burnin + await burnin.grant_pool_unlock( + 3, "DESTROY EXPORTED POOL", "alice", "decommissioned pool" + ) + async with aiosqlite.connect(self.db_path) as db: + db.row_factory = aiosqlite.Row + cur = await db.execute( + "SELECT event_type FROM audit_events WHERE drive_id=? ORDER BY id DESC LIMIT 1", + (3,), + ) + row = await cur.fetchone() + self.assertEqual(row["event_type"], "exported_pool_drive_unlocked") + + async def test_failed_token_does_not_record_audit_event(self): + from app import burnin + try: + await burnin.grant_pool_unlock(1, "wrong-token", "op", "valid reason") + except ValueError: + pass + async with aiosqlite.connect(self.db_path) as db: + cur = await db.execute( + "SELECT COUNT(*) FROM audit_events WHERE drive_id=?", (1,) + ) + self.assertEqual((await cur.fetchone())[0], 0) + # And no in-memory grant was armed. + self.assertNotIn(1, burnin._unlock_grants) + + +class TestActiveJobUniqueIndex(unittest.IsolatedAsyncioTestCase): + """Codex finding #4 — the partial unique index on burnin_jobs(drive_id) + WHERE state IN ('queued','running') must reject a second active row even + when two requests pass the SELECT-COUNT check concurrently.""" + + async def asyncSetUp(self): + self.db_path = await _setup_temp_db() + from app import burnin + burnin._unlock_grants.clear() + # Need to clear the pool field on drive 1 so unlock isn't required + # for these race tests. + async with aiosqlite.connect(self.db_path) as db: + await db.execute("UPDATE drives SET pool_name=NULL, pool_role=NULL WHERE id=1") + await db.commit() + # Burnin orchestrator init for the semaphore + from app import burnin as b + import asyncio as _a + b._semaphore = _a.Semaphore(4) + + async def asyncTearDown(self): + try: + os.unlink(self.db_path) + except OSError: + pass + + async def test_index_blocks_second_active_insert(self): + # Insert a queued row by hand, then try a second one — index fires. + async with aiosqlite.connect(self.db_path) as db: + await db.execute( + """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) + VALUES (?,?,?,?,?,?)""", + (1, "surface", "queued", 0, "op", "2026-05-02T00:00:00+00:00"), + ) + await db.commit() + with self.assertRaises(aiosqlite.IntegrityError): + await db.execute( + """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) + VALUES (?,?,?,?,?,?)""", + (1, "surface", "queued", 0, "op", "2026-05-02T00:00:01+00:00"), + ) + await db.commit() + + async def test_index_allows_terminal_state_then_new_job(self): + # passed/failed/cancelled/unknown rows must not block a fresh queue. + async with aiosqlite.connect(self.db_path) as db: + for state in ("passed", "failed", "cancelled", "unknown"): + await db.execute( + """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) + VALUES (?,?,?,?,?,?)""", + (1, "surface", state, 100, "op", "2026-05-02T00:00:00+00:00"), + ) + await db.commit() + # Should succeed — no other queued/running row exists. + await db.execute( + """INSERT INTO burnin_jobs (drive_id, profile, state, percent, operator, created_at) + VALUES (?,?,?,?,?,?)""", + (1, "surface", "queued", 0, "op", "2026-05-02T00:00:00+00:00"), + ) + await db.commit() + + +if __name__ == "__main__": + unittest.main()