truenas-burnin/claude-sandbox/truenas-burnin/app/truenas.py
echoparkbaby 3e0000528f TrueNAS Burn-In Dashboard v0.9.0 — Live mode, thermal monitoring, adaptive concurrency
Go live against real TrueNAS SCALE 25.10:
- Remove mock-truenas dependency; mount SSH key as Docker secret
- Filter expired disk records from /api/v2.0/disk (expiretime field)
- Route all SMART operations through SSH (SCALE 25.10 removed REST smart/test endpoint)
- Poll drive temperatures via POST /api/v2.0/disk/temperatures (SCALE-specific)
- Store raw smartctl output in smart_tests.raw_output for proof of test execution
- Fix percent-remaining=0 false jump to 100% on test start
- Fix terminal WebSocket: add mounted key file fallback (/run/secrets/ssh_key)
- Fix WebSocket support: uvicorn → uvicorn[standard] (installs websockets)

HBA/system sensor temps on dashboard:
- SSH to TrueNAS and run sensors -j each poll cycle
- Parse coretemp (CPU package) and pch_* (PCH/chipset — storage I/O proxy)
- Render as compact chips in stats bar, color-coded green/yellow/red
- Live updates via new SSE system-sensors event every 12s

Adaptive concurrency signal:
- Thermal pressure indicator in stats bar: hidden when OK, WARM/HOT when running
  burn-in drives hit temp_warn_c / temp_crit_c thresholds
- Thermal gate in burn-in queue: jobs wait up to 3 min before acquiring semaphore
  slot if running drives are already at warning temp; times out and proceeds

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 06:33:36 -05:00

164 lines
5.3 KiB
Python

import asyncio
import logging
from collections.abc import Callable, Coroutine
from typing import Any, TypeVar
import httpx
from app.config import settings
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Generic result type: lets _with_retry's return annotation mirror whatever
# the factory's coroutine produces.
T = TypeVar("T")
# Exceptions that are safe to retry (transient network issues).
# _with_retry catches exactly this tuple; any other exception type is not
# caught there and propagates to the caller on the first attempt.
_RETRYABLE = (
httpx.ConnectError,
httpx.TimeoutException,
httpx.RemoteProtocolError,
httpx.ReadError,
)
async def _with_retry(
    factory: Callable[[], Coroutine[Any, Any, T]],
    label: str,
    max_attempts: int = 3,
) -> T:
    """
    Call factory() to get a fresh coroutine and await it, retrying with
    exponential backoff on transient failures.

    A factory (not a bare coroutine) is required so each attempt gets a
    new coroutine object — an already-awaited coroutine cannot be reused.

    Args:
        factory: Zero-argument callable returning a new awaitable per call.
        label: Short name of the operation, used only in the warning log.
        max_attempts: Total number of attempts (not retries); must be >= 1.

    Returns:
        Whatever the awaited coroutine returns.

    Raises:
        ValueError: If max_attempts is less than 1.
        Exception: The last exception from _RETRYABLE once all attempts are
            used up; any non-retryable exception propagates immediately.
    """
    # Without this guard the loop below would never run and the function
    # would silently return None without ever calling factory().
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    backoff = 1.0  # seconds; doubled after every failed attempt
    for attempt in range(1, max_attempts + 1):
        try:
            return await factory()
        except _RETRYABLE as exc:
            if attempt == max_attempts:
                # Out of attempts: surface the original error unchanged.
                raise
            log.warning(
                "TrueNAS %s transient error (attempt %d/%d): %s — retrying in %.0fs",
                label, attempt, max_attempts, exc, backoff,
            )
            await asyncio.sleep(backoff)
            backoff *= 2

    # Every loop iteration either returns or raises, so this is never reached;
    # it exists to make the "-> T" contract explicit.
    raise AssertionError("unreachable: retry loop exited without a result")
class TrueNASClient:
    """Async client for the TrueNAS REST API (``/api/v2.0``).

    Read-style requests are wrapped in ``_with_retry`` so transient network
    errors are retried; requests that start or abort a job are sent once.
    Every request method calls ``raise_for_status()`` on the response.
    """

    def __init__(self) -> None:
        auth_headers = {"Authorization": f"Bearer {settings.truenas_api_key}"}
        self._client = httpx.AsyncClient(
            base_url=settings.truenas_base_url,
            headers=auth_headers,
            verify=settings.truenas_verify_tls,
            timeout=10.0,
        )

    @staticmethod
    def _checked_json(response: Any) -> Any:
        """Raise on an HTTP error status, otherwise return the decoded JSON body."""
        response.raise_for_status()
        return response.json()

    async def close(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def get_disks(self) -> list[dict]:
        """Return the disk records that have no ``expiretime`` set."""

        def request():
            return self._client.get("/api/v2.0/disk")

        every_record = self._checked_json(await _with_retry(request, "get_disks"))
        # TrueNAS keeps historical entries for removed disks and marks them
        # with expiretime; only currently-present drives are returned.
        present = [rec for rec in every_record if not rec.get("expiretime")]
        dropped = len(every_record) - len(present)
        if dropped > 0:
            log.debug("get_disks: filtered %d expired record(s)", dropped)
        return present

    async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
        """List ``smart.test`` jobs, optionally restricted to one job state."""
        query: dict = {"method": "smart.test"}
        if state:
            query["state"] = state

        def request():
            return self._client.get("/api/v2.0/core/get_jobs", params=query)

        return self._checked_json(await _with_retry(request, "get_smart_jobs"))

    async def get_smart_results(self, devname: str) -> list[dict]:
        """Fetch the SMART test results reported for ``devname``."""

        def request():
            return self._client.get(f"/api/v2.0/smart/test/results/{devname}")

        reply = await _with_retry(request, f"get_smart_results({devname})")
        return self._checked_json(reply)

    async def start_smart_test(self, disks: list[str], test_type: str) -> int:
        """Start a SMART test and return the decoded response body.

        Sent exactly once, never retried: a duplicate start would launch a
        second job.
        """
        payload = {"disks": disks, "type": test_type}
        reply = await self._client.post("/api/v2.0/smart/test", json=payload)
        return self._checked_json(reply)

    async def abort_job(self, job_id: int) -> None:
        """Abort a TrueNAS job. Best-effort cancel, so it is not retried."""
        reply = await self._client.post(
            "/api/v2.0/core/job_abort",
            json={"id": job_id},
        )
        reply.raise_for_status()

    async def get_system_info(self) -> dict:
        """Return the body of ``GET /api/v2.0/system/info``."""

        def request():
            return self._client.get("/api/v2.0/system/info")

        return self._checked_json(await _with_retry(request, "get_system_info"))

    async def get_disk_temperatures(self) -> dict[str, float | None]:
        """
        Returns {devname: celsius | None}.

        Uses POST /api/v2.0/disk/temperatures — available on TrueNAS SCALE 25.10+.
        CORE compatibility: raises on 404/405, caller should catch and skip.
        """

        def request():
            return self._client.post("/api/v2.0/disk/temperatures", json={})

        reply = await _with_retry(request, "get_disk_temperatures")
        return self._checked_json(reply)

    async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
        """
        Start a disk wipe job and return the TrueNAS job ID.

        Sent exactly once, never retried: a duplicate start would launch a
        second wipe.

        mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros),
              "FULL_RANDOM" (write random)
        devname: basename only, e.g. "ada0" (not "/dev/ada0")
        """
        payload = {"dev": devname, "mode": mode}
        reply = await self._client.post("/api/v2.0/disk/wipe", json=payload)
        return self._checked_json(reply)

    async def get_job(self, job_id: int) -> dict | None:
        """
        Fetch a single TrueNAS job by ID.

        Returns the job dict, or None if not found.
        """
        import json as _json

        # The filter is serialised once; it is identical for every attempt.
        id_filter = _json.dumps([["id", "=", job_id]])

        def request():
            return self._client.get(
                "/api/v2.0/core/get_jobs",
                params={"filters": id_filter},
            )

        matches = self._checked_json(await _with_retry(request, f"get_job({job_id})"))
        return matches[0] if isinstance(matches, list) and matches else None