Compare commits
No commits in common. "main" and "25d4622aa4cb877de079a3bdf5825b20359701ca" have entirely different histories.
main
...
25d4622aa4
39 changed files with 676 additions and 78 deletions
|
|
@ -209,7 +209,7 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
|
||||||
| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
|
| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
|
||||||
| `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this |
|
| `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this |
|
||||||
| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) |
|
| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) |
|
||||||
| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge |
|
| `APP_VERSION` | `1.0.0-8` | Displayed in header version badge |
|
||||||
| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
|
| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
|
||||||
| `SSH_PORT` | `22` | TrueNAS SSH port |
|
| `SSH_PORT` | `22` | TrueNAS SSH port |
|
||||||
| `SSH_USER` | `root` | TrueNAS SSH username |
|
| `SSH_USER` | `root` | TrueNAS SSH username |
|
||||||
|
|
@ -206,10 +206,45 @@ async def cancel_job(job_id: int, operator: str) -> bool:
|
||||||
# Job runner
|
# Job runner
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _thermal_gate_ok() -> bool:
    """Return True if it is thermally safe to start a new burn-in.

    Looks up the highest ``temperature_c`` among drives that have a
    ``burnin_jobs`` row in state ``'running'`` and compares it with
    ``settings.temp_warn_c``.

    Returns:
        True when no running drive reports a temperature, or when the peak
        is strictly below the warning threshold. Also True when the check
        itself fails: the gate deliberately fails open so a database problem
        never blocks a job from starting.
    """
    try:
        async with _db() as db:
            cur = await db.execute("""
                SELECT MAX(d.temperature_c)
                FROM drives d
                JOIN burnin_jobs bj ON bj.drive_id = d.id
                WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
            """)
            row = await cur.fetchone()
            max_temp = row[0] if row and row[0] is not None else None
            return max_temp is None or max_temp < settings.temp_warn_c
    except Exception as exc:
        # Never block on error, but leave a trace so a broken gate is visible.
        log.warning("Thermal gate check failed (%s) — treating as OK", exc)
        return True
|
||||||
|
|
||||||
|
|
||||||
async def _run_job(job_id: int) -> None:
|
async def _run_job(job_id: int) -> None:
|
||||||
"""Acquire semaphore slot, execute all stages, persist final state."""
|
"""Acquire semaphore slot, execute all stages, persist final state."""
|
||||||
assert _semaphore is not None, "burnin.init() not called"
|
assert _semaphore is not None, "burnin.init() not called"
|
||||||
|
|
||||||
|
# Adaptive thermal gate: wait before competing for a slot if running drives
|
||||||
|
# are already at or above the warning threshold. This prevents layering a
|
||||||
|
# new burn-in on top of a thermally-stressed system. Gives up after 3 min
|
||||||
|
# and proceeds anyway so jobs don't queue indefinitely.
|
||||||
|
for _attempt in range(18): # 18 × 10 s = 3 min max
|
||||||
|
if await _thermal_gate_ok():
|
||||||
|
break
|
||||||
|
if _attempt == 0:
|
||||||
|
log.info(
|
||||||
|
"Thermal gate: job %d waiting — running drive temps at or above %d°C",
|
||||||
|
job_id, settings.temp_warn_c,
|
||||||
|
)
|
||||||
|
await asyncio.sleep(10)
|
||||||
|
else:
|
||||||
|
log.warning("Thermal gate timed out for job %d — proceeding anyway", job_id)
|
||||||
|
|
||||||
async with _semaphore:
|
async with _semaphore:
|
||||||
if await _is_cancelled(job_id):
|
if await _is_cancelled(job_id):
|
||||||
return
|
return
|
||||||
|
|
@ -519,15 +554,39 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage
|
||||||
# "unknown" → keep polling
|
# "unknown" → keep polling
|
||||||
|
|
||||||
|
|
||||||
|
async def _badblocks_available() -> bool:
    """Check whether ``badblocks`` is installed on the remote host.

    Opens a single-use SSH connection and runs ``which badblocks``.

    Returns:
        True only when the command exits with status 0. False when the
        binary is missing *or* when the probe itself fails (connection or
        authentication error). The failure case is logged so it can be told
        apart from a host that genuinely lacks badblocks.
    """
    from app import ssh_client

    try:
        async with await ssh_client._connect() as conn:
            result = await conn.run("which badblocks", check=False)
            return result.returncode == 0
    except Exception as exc:
        # The caller treats False as "use the disk.wipe fallback", so record
        # why the probe failed instead of swallowing it.
        log.warning("badblocks availability probe failed: %s", exc)
        return False
|
||||||
|
|
||||||
|
|
||||||
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation stage — picks the implementation that fits the host:

    * No SSH configured:
        simulated timed progress (dev/mock mode).
    * SSH configured and badblocks available:
        delegates to the SSH badblocks implementation.
    * SSH configured but badblocks missing:
        logs an [INFO] note and delegates to the TrueNAS disk.wipe
        implementation.

    Returns the boolean result of whichever implementation ran.
    """
    from app import ssh_client

    # Mock/dev mode: nothing to talk to, just simulate the stage duration.
    if not ssh_client.is_configured():
        return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)

    if await _badblocks_available():
        return await _stage_surface_validate_ssh(job_id, devname, drive_id)

    # badblocks not available on the host — use the native wipe API instead.
    fallback_note = (
        "[INFO] badblocks not found on host (TrueNAS CORE/FreeBSD) — "
        "using TrueNAS disk.wipe API (FULL write pass).\n\n"
    )
    await _append_stage_log(job_id, "surface_validate", fallback_note)
    return await _stage_surface_validate_truenas(job_id, devname, drive_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -655,6 +714,116 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_surface_validate_truenas(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation via the TrueNAS disk.wipe REST API.

    Used when badblocks is unavailable on the host. Starts a FULL wipe job
    through ``_client.wipe_disk``, polls it every ``POLL_INTERVAL`` seconds,
    and on success runs a post-wipe SMART attribute check over SSH to catch
    sectors that failed under write stress.

    Args:
        job_id: Burn-in job whose ``surface_validate`` stage is updated.
        devname: Device name passed to the wipe API and to the SMART check.
        drive_id: Drive row that receives the post-wipe SMART attributes;
            the SMART check is skipped when this is None.

    Returns:
        True when the wipe job reports SUCCESS and the SMART check finds no
        failures (a SMART check that errors out is logged and treated as
        non-fatal). False when the wipe cannot be started, the job is
        cancelled, the job disappears or fails, or SMART reports failures.
    """
    from app import ssh_client

    await _append_stage_log(
        job_id, "surface_validate",
        f"[START] TrueNAS disk.wipe FULL — {devname}\n"
        f"[NOTE] DESTRUCTIVE: all data on {devname} will be overwritten.\n\n"
    )

    # Start the wipe job
    try:
        tn_job_id = await _client.wipe_disk(devname, "FULL")
    except Exception as exc:
        await _set_stage_error(job_id, "surface_validate", f"Failed to start disk.wipe: {exc}")
        return False

    await _append_stage_log(
        job_id, "surface_validate",
        f"[JOB] TrueNAS wipe job started (job_id={tn_job_id})\n"
    )

    # Poll until complete
    log_flush_counter = 0
    while True:
        if await _is_cancelled(job_id):
            try:
                await _client.abort_job(tn_job_id)
            except Exception as exc:
                # Best effort: the burn-in job is cancelled either way, but a
                # wipe that keeps running on the NAS should be visible in logs.
                log.warning(
                    "Failed to abort TrueNAS wipe job %s: %s", tn_job_id, exc,
                    extra={"job_id": job_id},
                )
            return False

        await asyncio.sleep(POLL_INTERVAL)

        try:
            job = await _client.get_job(tn_job_id)
        except Exception as exc:
            log.warning("Wipe job poll failed: %s", exc, extra={"job_id": job_id})
            await _append_stage_log(job_id, "surface_validate", f"[poll error] {exc}\n")
            continue

        if not job:
            await _set_stage_error(job_id, "surface_validate", f"Wipe job {tn_job_id} not found")
            return False

        state = job.get("state", "")
        # "progress" may be present but null; treat that like an empty dict.
        progress = job.get("progress") or {}
        pct = int(progress.get("percent", 0) or 0)
        desc = progress.get("description", "")

        # Hold at 99 until SUCCESS is confirmed; never report below 0.
        await _update_stage_percent(job_id, "surface_validate", max(0, min(pct, 99)))
        await _recalculate_progress(job_id)
        _push_update()

        # Log progress description every ~5 polls to avoid DB spam
        log_flush_counter += 1
        if desc and log_flush_counter % 5 == 0:
            await _append_stage_log(job_id, "surface_validate", f"[{pct}%] {desc}\n")

        if state == "SUCCESS":
            await _update_stage_percent(job_id, "surface_validate", 100)
            await _append_stage_log(
                job_id, "surface_validate",
                f"\n[DONE] Wipe job {tn_job_id} completed successfully.\n"
            )
            # Post-wipe SMART check — catch any sectors that failed under write stress
            if ssh_client.is_configured() and drive_id is not None:
                await _append_stage_log(
                    job_id, "surface_validate",
                    "[CHECK] Running post-wipe SMART attribute check...\n"
                )
                try:
                    attrs = await ssh_client.get_smart_attributes(devname)
                    await _store_smart_attrs(drive_id, attrs)
                    if attrs["failures"]:
                        error = "Post-wipe SMART check: " + "; ".join(attrs["failures"])
                        await _set_stage_error(job_id, "surface_validate", error)
                        return False
                    if attrs["warnings"]:
                        await _append_stage_log(
                            job_id, "surface_validate",
                            "[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
                        )
                    await _append_stage_log(
                        job_id, "surface_validate",
                        f"[CHECK] SMART health: {attrs['health']} — no critical attributes.\n"
                    )
                except Exception as exc:
                    # A SMART check that cannot run does not fail the stage.
                    log.warning("Post-wipe SMART check failed: %s", exc)
                    await _append_stage_log(
                        job_id, "surface_validate",
                        f"[WARN] Post-wipe SMART check failed (non-fatal): {exc}\n"
                    )
            return True

        elif state in ("FAILED", "ABORTED", "ERROR"):
            error_msg = job.get("error") or f"Disk wipe failed (state={state})"
            await _set_stage_error(
                job_id, "surface_validate",
                f"TrueNAS disk.wipe FAILED: {error_msg}"
            )
            return False
        # RUNNING or WAITING — keep polling
|
||||||
|
|
||||||
|
|
||||||
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
||||||
"""Simulate a timed stage with progress updates (mock / dev mode)."""
|
"""Simulate a timed stage with progress updates (mock / dev mode)."""
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
|
|
@ -68,7 +68,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-7"
|
app_version: str = "1.0.0-8"
|
||||||
|
|
||||||
|
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
|
|
@ -20,13 +20,15 @@ from app.truenas import TrueNASClient
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Shared state read by the /health endpoint
|
# Shared state read by the /health endpoint and dashboard template
|
||||||
_state: dict[str, Any] = {
|
_state: dict[str, Any] = {
|
||||||
"last_poll_at": None,
|
"last_poll_at": None,
|
||||||
"last_error": None,
|
"last_error": None,
|
||||||
"healthy": False,
|
"healthy": False,
|
||||||
"drives_seen": 0,
|
"drives_seen": 0,
|
||||||
"consecutive_failures": 0,
|
"consecutive_failures": 0,
|
||||||
|
"system_temps": {}, # {"cpu_c": int|None, "pch_c": int|None}
|
||||||
|
"thermal_pressure": "ok", # "ok" | "warn" | "crit" — based on running burn-in drive temps
|
||||||
}
|
}
|
||||||
|
|
||||||
# SSE subscriber queues — notified after each successful poll
|
# SSE subscriber queues — notified after each successful poll
|
||||||
|
|
@ -208,6 +210,67 @@ async def _sync_history(
|
||||||
# Poll cycle
|
# Poll cycle
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _poll_smart_via_ssh(db: aiosqlite.Connection, now: str) -> None:
    """
    Poll progress for SMART tests started via SSH (truenas_job_id IS NULL).

    For every ``smart_tests`` row in state ``'running'`` without a TrueNAS
    job id, asks ``ssh_client.poll_smart_progress`` for the drive's status
    and updates the row:

    * ``running`` — store percent, ETA and raw output.
    * ``passed``  — mark passed at 100 % with ``finished_at = now``.
    * ``failed``  — mark failed with an error text and ``finished_at = now``.
    * anything else (e.g. ``unknown``) — leave the row untouched.

    Does nothing when SSH is not configured or no such tests exist. A drive
    whose SSH poll raises is logged and skipped; the others still update.

    Args:
        db: Open database connection; committed once after all rows.
        now: Timestamp string written to ``finished_at``.
    """
    from app import ssh_client

    if not ssh_client.is_configured():
        return

    cur = await db.execute(
        """SELECT st.id, st.test_type, st.drive_id, d.devname, st.started_at
           FROM smart_tests st
           JOIN drives d ON d.id = st.drive_id
           WHERE st.state = 'running' AND st.truenas_job_id IS NULL"""
    )
    rows = await cur.fetchall()
    if not rows:
        return

    for row in rows:
        test_id, ttype, devname, started_at = row[0], row[1], row[3], row[4]
        try:
            progress = await ssh_client.poll_smart_progress(devname)
        except Exception as exc:
            log.warning("SSH SMART poll failed for %s: %s", devname, exc)
            continue

        # A result without a state is handled like "unknown": keep polling.
        state = progress.get("state", "unknown")
        pct_remaining = progress.get("percent_remaining")  # None = not yet in output
        raw_output = progress.get("output", "")

        if state == "running":
            # pct_remaining=None means smartctl output doesn't have the % line yet
            # (test just started) — keep percent at 0 rather than jumping to 100
            if pct_remaining is None:
                pct = 0
            else:
                # Clamp both ways so an out-of-range value cannot leave 0..100.
                pct = min(100, max(0, 100 - pct_remaining))
            eta = _eta_from_progress(pct, started_at)
            await db.execute(
                "UPDATE smart_tests SET percent=?, eta_at=?, raw_output=? WHERE id=?",
                (pct, eta, raw_output, test_id),
            )
        elif state == "passed":
            await db.execute(
                "UPDATE smart_tests SET state='passed', percent=100, finished_at=?, raw_output=? WHERE id=?",
                (now, raw_output, test_id),
            )
            log.info("SSH SMART %s passed on %s", ttype, devname)
        elif state == "failed":
            await db.execute(
                "UPDATE smart_tests SET state='failed', percent=0, finished_at=?, "
                "error_text=?, raw_output=? WHERE id=?",
                (now, f"SMART {ttype.upper()} test failed", raw_output, test_id),
            )
            log.warning("SSH SMART %s FAILED on %s", ttype, devname)
        # state == "unknown" → keep polling, no update

    await db.commit()
|
||||||
|
|
||||||
|
|
||||||
async def poll_cycle(client: TrueNASClient) -> int:
|
async def poll_cycle(client: TrueNASClient) -> int:
|
||||||
"""Run one full poll. Returns number of drives seen."""
|
"""Run one full poll. Returns number of drives seen."""
|
||||||
now = _now()
|
now = _now()
|
||||||
|
|
@ -215,6 +278,20 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
||||||
disks = await client.get_disks()
|
disks = await client.get_disks()
|
||||||
running_jobs = await client.get_smart_jobs(state="RUNNING")
|
running_jobs = await client.get_smart_jobs(state="RUNNING")
|
||||||
|
|
||||||
|
# Fetch temperatures via SCALE-specific endpoint.
|
||||||
|
# CORE doesn't have this endpoint — silently skip on any error.
|
||||||
|
try:
|
||||||
|
temps = await client.get_disk_temperatures()
|
||||||
|
except Exception:
|
||||||
|
temps = {}
|
||||||
|
|
||||||
|
# Inject temperature into each disk dict (SCALE 25.10 has no temp in /disk)
|
||||||
|
for disk in disks:
|
||||||
|
devname = disk.get("devname", "")
|
||||||
|
t = temps.get(devname)
|
||||||
|
if t is not None:
|
||||||
|
disk["temperature"] = int(round(t))
|
||||||
|
|
||||||
# Index running jobs by (devname, test_type)
|
# Index running jobs by (devname, test_type)
|
||||||
active: dict[tuple[str, str], dict] = {}
|
active: dict[tuple[str, str], dict] = {}
|
||||||
for job in running_jobs:
|
for job in running_jobs:
|
||||||
|
|
@ -243,6 +320,9 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
# SSH SMART polling — for tests started via smartctl (no TrueNAS REST job)
|
||||||
|
await _poll_smart_via_ssh(db, now)
|
||||||
|
|
||||||
return len(disks)
|
return len(disks)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -263,6 +343,39 @@ async def run(client: TrueNASClient) -> None:
|
||||||
_state["drives_seen"] = count
|
_state["drives_seen"] = count
|
||||||
_state["consecutive_failures"] = 0
|
_state["consecutive_failures"] = 0
|
||||||
log.debug("Poll OK", extra={"drives": count})
|
log.debug("Poll OK", extra={"drives": count})
|
||||||
|
|
||||||
|
# System sensor temps via SSH (non-fatal)
|
||||||
|
from app import ssh_client as _ssh
|
||||||
|
if _ssh.is_configured():
|
||||||
|
try:
|
||||||
|
_state["system_temps"] = await _ssh.get_system_sensors()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Thermal pressure: max temp of drives currently under burn-in
|
||||||
|
try:
|
||||||
|
async with aiosqlite.connect(settings.db_path) as _tdb:
|
||||||
|
_tdb.row_factory = aiosqlite.Row
|
||||||
|
await _tdb.execute("PRAGMA journal_mode=WAL")
|
||||||
|
_cur = await _tdb.execute("""
|
||||||
|
SELECT MAX(d.temperature_c)
|
||||||
|
FROM drives d
|
||||||
|
JOIN burnin_jobs bj ON bj.drive_id = d.id
|
||||||
|
WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
|
||||||
|
""")
|
||||||
|
_row = await _cur.fetchone()
|
||||||
|
_max_t = _row[0] if _row and _row[0] is not None else None
|
||||||
|
if _max_t is None:
|
||||||
|
_state["thermal_pressure"] = "ok"
|
||||||
|
elif _max_t >= settings.temp_crit_c:
|
||||||
|
_state["thermal_pressure"] = "crit"
|
||||||
|
elif _max_t >= settings.temp_warn_c:
|
||||||
|
_state["thermal_pressure"] = "warn"
|
||||||
|
else:
|
||||||
|
_state["thermal_pressure"] = "ok"
|
||||||
|
except Exception:
|
||||||
|
_state["thermal_pressure"] = "ok"
|
||||||
|
|
||||||
_notify_subscribers()
|
_notify_subscribers()
|
||||||
|
|
||||||
# Check for stuck jobs every 5 cycles (~1 min at default 12s interval)
|
# Check for stuck jobs every 5 cycles (~1 min at default 12s interval)
|
||||||
|
|
@ -218,6 +218,18 @@ async def sse_drives(request: Request):
|
||||||
|
|
||||||
yield {"event": "drives-update", "data": html}
|
yield {"event": "drives-update", "data": html}
|
||||||
|
|
||||||
|
# Push system sensor state so JS can update temp chips live
|
||||||
|
ps = poller.get_state()
|
||||||
|
yield {
|
||||||
|
"event": "system-sensors",
|
||||||
|
"data": json.dumps({
|
||||||
|
"system_temps": ps.get("system_temps", {}),
|
||||||
|
"thermal_pressure": ps.get("thermal_pressure", "ok"),
|
||||||
|
"temp_warn_c": settings.temp_warn_c,
|
||||||
|
"temp_crit_c": settings.temp_crit_c,
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
|
||||||
# Push browser notification event if this was a job completion
|
# Push browser notification event if this was a job completion
|
||||||
if alert:
|
if alert:
|
||||||
yield {"event": "job-alert", "data": json.dumps(alert)}
|
yield {"event": "job-alert", "data": json.dumps(alert)}
|
||||||
|
|
@ -353,9 +365,13 @@ async def smart_start(
|
||||||
body: dict,
|
body: dict,
|
||||||
db: aiosqlite.Connection = Depends(get_db),
|
db: aiosqlite.Connection = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""Start a standalone SHORT or LONG SMART test on a single drive."""
|
"""Start a standalone SHORT or LONG SMART test on a single drive.
|
||||||
from app.truenas import TrueNASClient
|
|
||||||
from app import burnin as _burnin
|
Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+
|
||||||
|
where the REST smart/test endpoint no longer exists.
|
||||||
|
Falls back to TrueNAS REST API for older versions.
|
||||||
|
"""
|
||||||
|
from app import burnin as _burnin, ssh_client
|
||||||
|
|
||||||
test_type = (body.get("type") or "").upper()
|
test_type = (body.get("type") or "").upper()
|
||||||
if test_type not in ("SHORT", "LONG"):
|
if test_type not in ("SHORT", "LONG"):
|
||||||
|
|
@ -367,17 +383,42 @@ async def smart_start(
|
||||||
raise HTTPException(status_code=404, detail="Drive not found")
|
raise HTTPException(status_code=404, detail="Drive not found")
|
||||||
devname = row[0]
|
devname = row[0]
|
||||||
|
|
||||||
# Use the shared TrueNAS client held by the burnin module
|
now = datetime.now(timezone.utc).isoformat()
|
||||||
client = _burnin._client
|
ttype_lower = test_type.lower()
|
||||||
if client is None:
|
|
||||||
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
|
||||||
|
|
||||||
try:
|
if ssh_client.is_configured():
|
||||||
tn_job_id = await client.start_smart_test([devname], test_type)
|
# SSH path — works on TrueNAS SCALE 25.10+ and CORE
|
||||||
except Exception as exc:
|
try:
|
||||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
output = await ssh_client.start_smart_test(devname, test_type)
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=502, detail=f"SSH error: {exc}")
|
||||||
|
|
||||||
return {"job_id": tn_job_id, "devname": devname, "type": test_type}
|
# Mark as running in DB (truenas_job_id=NULL signals SSH-managed test)
|
||||||
|
# Store smartctl start output as proof the test was initiated
|
||||||
|
await db.execute(
|
||||||
|
"""INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output)
|
||||||
|
VALUES (?,?,?,?,?,?)
|
||||||
|
ON CONFLICT(drive_id, test_type) DO UPDATE SET
|
||||||
|
state='running', percent=0, truenas_job_id=NULL,
|
||||||
|
started_at=excluded.started_at, finished_at=NULL, error_text=NULL,
|
||||||
|
raw_output=excluded.raw_output""",
|
||||||
|
(drive_id, ttype_lower, "running", 0, now, output),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
from app import poller as _poller
|
||||||
|
_poller._notify_subscribers()
|
||||||
|
return {"devname": devname, "type": test_type, "message": output[:200]}
|
||||||
|
|
||||||
|
else:
|
||||||
|
# REST path — older TrueNAS CORE / SCALE versions
|
||||||
|
client = _burnin._client
|
||||||
|
if client is None:
|
||||||
|
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
||||||
|
try:
|
||||||
|
tn_job_id = await client.start_smart_test([devname], test_type)
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||||
|
return {"job_id": tn_job_id, "devname": devname, "type": test_type}
|
||||||
|
|
||||||
|
|
||||||
@router.post("/api/v1/drives/{drive_id}/smart/cancel")
|
@router.post("/api/v1/drives/{drive_id}/smart/cancel")
|
||||||
|
|
@ -403,28 +444,37 @@ async def smart_cancel(
|
||||||
if client is None:
|
if client is None:
|
||||||
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
||||||
|
|
||||||
# Find the running TrueNAS job for this drive/test-type
|
from app import ssh_client
|
||||||
try:
|
|
||||||
jobs = await client.get_smart_jobs()
|
|
||||||
tn_job_id = None
|
|
||||||
for j in jobs:
|
|
||||||
if j.get("state") != "RUNNING":
|
|
||||||
continue
|
|
||||||
args = j.get("arguments", [])
|
|
||||||
if not args or not isinstance(args[0], dict):
|
|
||||||
continue
|
|
||||||
if devname in args[0].get("disks", []):
|
|
||||||
tn_job_id = j["id"]
|
|
||||||
break
|
|
||||||
|
|
||||||
if tn_job_id is None:
|
if ssh_client.is_configured():
|
||||||
raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
|
# SSH path — abort via smartctl -X
|
||||||
|
try:
|
||||||
|
await ssh_client.abort_smart_test(devname)
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}")
|
||||||
|
else:
|
||||||
|
# REST path — find TrueNAS job and abort it
|
||||||
|
try:
|
||||||
|
jobs = await client.get_smart_jobs()
|
||||||
|
tn_job_id = None
|
||||||
|
for j in jobs:
|
||||||
|
if j.get("state") != "RUNNING":
|
||||||
|
continue
|
||||||
|
args = j.get("arguments", [])
|
||||||
|
if not args or not isinstance(args[0], dict):
|
||||||
|
continue
|
||||||
|
if devname in args[0].get("disks", []):
|
||||||
|
tn_job_id = j["id"]
|
||||||
|
break
|
||||||
|
|
||||||
await client.abort_job(tn_job_id)
|
if tn_job_id is None:
|
||||||
except HTTPException:
|
raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
|
||||||
raise
|
|
||||||
except Exception as exc:
|
await client.abort_job(tn_job_id)
|
||||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||||
|
|
||||||
# Update local DB state
|
# Update local DB state
|
||||||
now = datetime.now(timezone.utc).isoformat()
|
now = datetime.now(timezone.utc).isoformat()
|
||||||
|
|
@ -38,15 +38,26 @@ SMART_ATTRS: dict[int, tuple[str, bool]] = {
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def is_configured() -> bool:
    """Report whether SSH can be used: a host plus at least one auth source.

    Auth sources, checked in order: inline key material (``settings.ssh_key``),
    a password (``settings.ssh_password``), or a key file that exists at the
    path named by the ``SSH_KEY_FILE`` environment variable (falling back to
    ``_MOUNTED_KEY_PATH``).
    """
    import os

    from app.config import settings

    # Without a host there is nothing to connect to, whatever the credentials.
    if not settings.ssh_host:
        return False

    if settings.ssh_key or settings.ssh_password:
        return True

    key_file = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)
    return os.path.exists(key_file)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Low-level connection
|
# Low-level connection
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_MOUNTED_KEY_PATH = "/run/secrets/ssh_key"
|
||||||
|
|
||||||
|
|
||||||
async def _connect():
|
async def _connect():
|
||||||
"""Open a single-use SSH connection. Caller must use `async with`."""
|
"""Open a single-use SSH connection. Caller must use `async with`."""
|
||||||
import asyncssh
|
import asyncssh
|
||||||
|
|
@ -59,9 +70,17 @@ async def _connect():
|
||||||
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
|
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
|
||||||
}
|
}
|
||||||
if settings.ssh_key:
|
if settings.ssh_key:
|
||||||
|
# Key material provided via env var (base case)
|
||||||
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
|
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
|
||||||
if settings.ssh_password:
|
elif settings.ssh_password:
|
||||||
kwargs["password"] = settings.ssh_password
|
kwargs["password"] = settings.ssh_password
|
||||||
|
else:
|
||||||
|
# Fall back to mounted key file (preferred for production — no key in env vars)
|
||||||
|
import os
|
||||||
|
key_path = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)
|
||||||
|
if os.path.exists(key_path):
|
||||||
|
kwargs["client_keys"] = [key_path]
|
||||||
|
# If nothing is configured, asyncssh will attempt agent/default key lookup
|
||||||
|
|
||||||
return asyncssh.connect(**kwargs)
|
return asyncssh.connect(**kwargs)
|
||||||
|
|
||||||
|
|
@ -228,6 +247,70 @@ async def run_badblocks(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def get_system_sensors() -> dict:
    """
    Run `sensors -j` on the remote host and extract system-level temperatures.

    Returns:
        ``{"cpu_c": int|None, "pch_c": int|None}`` when the command produced
        output that parsed as JSON, where
          cpu_c = CPU package temp (coretemp chip)
          pch_c = PCH/chipset temp (pch_* chip)
        An empty dict ``{}`` when SSH is not configured, the command printed
        nothing (stderr is discarded), the output is not valid JSON, or any
        error occurs. Errors are logged at debug level and never raised.
    """
    if not is_configured():
        return {}
    try:
        async with await _connect() as conn:
            result = await conn.run("sensors -j 2>/dev/null", check=False)
        # Parse after the connection is closed; stdout may be None when the
        # command produced no output stream at all.
        output = (result.stdout or "").strip()
        if not output:
            return {}
        return _parse_sensors_json(output)
    except Exception as exc:
        log.debug("get_system_sensors failed: %s", exc)
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _first_input_reading(sensor_vals: dict) -> "int | None":
    """Return the first finite numeric ``*_input`` value in a sensor entry, rounded to int."""
    import math

    for key, value in sensor_vals.items():
        if not key.endswith("_input"):
            continue
        # bool is a subclass of int; a JSON true/false is not a temperature.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue
        # json.loads accepts NaN/Infinity, which round() cannot convert.
        if not math.isfinite(value):
            continue
        return int(round(value))
    return None


def _parse_sensors_json(output: str) -> dict:
    """Extract CPU package and PCH temperatures from `sensors -j` output.

    Args:
        output: JSON text mapping chip names to dicts of sensor entries.

    Returns:
        ``{"cpu_c": int|None, "pch_c": int|None}``. ``cpu_c`` comes from the
        first chip whose name starts with ``coretemp`` and has a sensor whose
        name contains "package"; ``pch_c`` from the first chip whose name
        starts with ``pch_``. Either is None when no matching reading exists.
        Returns ``{}`` when *output* is not valid JSON or its top level is
        not an object.
    """
    import json as _json

    try:
        data = _json.loads(output)
    except (ValueError, TypeError):
        return {}
    if not isinstance(data, dict):
        return {}

    cpu_c = None
    pch_c = None

    for chip_name, chip_data in data.items():
        if not isinstance(chip_data, dict):
            continue

        # CPU package temp — coretemp chip, "Package id N" sensor
        if chip_name.startswith("coretemp") and cpu_c is None:
            for sensor_name, sensor_vals in chip_data.items():
                if not isinstance(sensor_vals, dict):
                    continue
                if "package" not in sensor_name.lower():
                    continue
                cpu_c = _first_input_reading(sensor_vals)
                if cpu_c is not None:
                    break

        # PCH / chipset temp — first sensor on the chip with a usable reading
        elif chip_name.startswith("pch_") and pch_c is None:
            for sensor_vals in chip_data.values():
                if not isinstance(sensor_vals, dict):
                    continue
                pch_c = _first_input_reading(sensor_vals)
                if pch_c is not None:
                    break

    return {"cpu_c": cpu_c, "pch_c": pch_c}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Parsers
|
# Parsers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -275,7 +358,7 @@ def _parse_smartctl(output: str) -> dict:
|
||||||
|
|
||||||
def _parse_smart_progress(output: str) -> dict:
|
def _parse_smart_progress(output: str) -> dict:
|
||||||
state = "unknown"
|
state = "unknown"
|
||||||
percent_remaining = 0
|
percent_remaining = None # None = "in progress but no % line parsed yet"
|
||||||
|
|
||||||
lower = output.lower()
|
lower = output.lower()
|
||||||
|
|
||||||
|
|
@ -1076,6 +1076,56 @@ a.stat-card:hover {
|
||||||
.stat-passed .stat-value { color: var(--green); }
|
.stat-passed .stat-value { color: var(--green); }
|
||||||
.stat-idle .stat-value { color: var(--text-muted); }
|
.stat-idle .stat-value { color: var(--text-muted); }
|
||||||
|
|
||||||
|
/* Vertical separator between drive-count cards and sensor chips */
|
||||||
|
.stats-bar-sep {
|
||||||
|
width: 1px;
|
||||||
|
height: 36px;
|
||||||
|
background: var(--border);
|
||||||
|
align-self: center;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compact sensor chip — CPU / PCH / Thermal */
|
||||||
|
.stat-sensor {
|
||||||
|
background: var(--bg-card);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 6px 12px;
|
||||||
|
text-align: center;
|
||||||
|
min-width: 52px;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-sensor-val {
|
||||||
|
font-size: 16px;
|
||||||
|
font-weight: 700;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
line-height: 1.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.stat-sensor-label {
|
||||||
|
font-size: 9px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.08em;
|
||||||
|
color: var(--text-muted);
|
||||||
|
line-height: 1.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Thermal pressure states */
|
||||||
|
.stat-sensor-thermal-warn {
|
||||||
|
border-color: var(--yellow-bd);
|
||||||
|
background: var(--yellow-bg);
|
||||||
|
}
|
||||||
|
.stat-sensor-thermal-warn .stat-sensor-val { color: var(--yellow); }
|
||||||
|
|
||||||
|
.stat-sensor-thermal-crit {
|
||||||
|
border-color: var(--red-bd);
|
||||||
|
background: var(--red-bg);
|
||||||
|
}
|
||||||
|
.stat-sensor-thermal-crit .stat-sensor-val { color: var(--red); }
|
||||||
|
|
||||||
/* -----------------------------------------------------------------------
|
/* -----------------------------------------------------------------------
|
||||||
Batch action bar (inside filter-bar)
|
Batch action bar (inside filter-bar)
|
||||||
----------------------------------------------------------------------- */
|
----------------------------------------------------------------------- */
|
||||||
|
|
@ -135,14 +135,59 @@
|
||||||
if (nb) nb.style.display = 'none';
|
if (nb) nb.style.display = 'none';
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle job-alert SSE events for browser notifications
|
// Handle SSE events
|
||||||
document.addEventListener('htmx:sseMessage', function (e) {
|
document.addEventListener('htmx:sseMessage', function (e) {
|
||||||
if (!e.detail || e.detail.type !== 'job-alert') return;
|
if (!e.detail) return;
|
||||||
try {
|
if (e.detail.type === 'job-alert') {
|
||||||
handleJobAlert(JSON.parse(e.detail.data));
|
try { handleJobAlert(JSON.parse(e.detail.data)); } catch (_) {}
|
||||||
} catch (_) {}
|
} else if (e.detail.type === 'system-sensors') {
|
||||||
|
try { handleSystemSensors(JSON.parse(e.detail.data)); } catch (_) {}
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
function handleSystemSensors(data) {
  // Apply one 'system-sensors' SSE payload to the stats-bar sensor chips.
  // Reads data.system_temps ({cpu_c, pch_c}), data.thermal_pressure and the
  // warn/crit thresholds, falling back to 46 / 55 when they are missing.
  var temps = data.system_temps || {};
  var pressure = data.thermal_pressure || 'ok';
  var warnAt = data.temp_warn_c || 46;
  var critAt = data.temp_crit_c || 55;

  // Map a temperature to its colour class; null/undefined gets no class.
  function tempClass(c) {
    if (c == null) return '';
    if (c >= critAt) return 'temp-hot';
    if (c >= warnAt) return 'temp-warm';
    return 'temp-cool';
  }

  // Reveal a temperature chip and write its value. Does nothing when the
  // value element is absent or no reading was supplied.
  function paintChip(chipId, valueId, celsius) {
    var chip = document.getElementById(chipId);
    var value = document.getElementById(valueId);
    if (!value || celsius == null) return;
    if (chip) chip.hidden = false;
    value.textContent = celsius + '°';
    value.className = 'stat-sensor-val ' + tempClass(celsius);
  }

  paintChip('sensor-cpu', 'sensor-cpu-val', temps.cpu_c);
  paintChip('sensor-pch', 'sensor-pch-val', temps.pch_c);

  // Thermal pressure chip: visible only in the 'warn' or 'crit' state.
  var thermalChip = document.getElementById('sensor-thermal');
  var thermalVal = document.getElementById('sensor-thermal-val');
  if (!thermalChip || !thermalVal) return;

  if (pressure !== 'warn' && pressure !== 'crit') {
    thermalChip.hidden = true;
    return;
  }
  thermalChip.hidden = false;
  thermalChip.className = 'stat-sensor stat-sensor-thermal stat-sensor-thermal-' + pressure;
  thermalVal.textContent = pressure === 'warn' ? 'WARM' : 'HOT';
}
|
||||||
|
|
||||||
function handleJobAlert(data) {
|
function handleJobAlert(data) {
|
||||||
var isPass = data.state === 'passed';
|
var isPass = data.state === 'passed';
|
||||||
var icon = isPass ? '✓' : '✕';
|
var icon = isPass ? '✓' : '✕';
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
{% include "components/modal_start.html" %}
|
{% include "components/modal_start.html" %}
|
||||||
{% include "components/modal_batch.html" %}
|
{% include "components/modal_batch.html" %}
|
||||||
|
|
||||||
<!-- Stats bar — counts are updated live by app.js updateCounts() -->
|
<!-- Stats bar — drive counts updated live by app.js updateCounts(); sensor chips updated by SSE system-sensors event -->
|
||||||
<div class="stats-bar">
|
<div class="stats-bar">
|
||||||
<div class="stat-card" data-stat-filter="all">
|
<div class="stat-card" data-stat-filter="all">
|
||||||
<span class="stat-value" id="stat-all">{{ drives | length }}</span>
|
<span class="stat-value" id="stat-all">{{ drives | length }}</span>
|
||||||
|
|
@ -28,6 +28,33 @@
|
||||||
<span class="stat-value" id="stat-idle">0</span>
|
<span class="stat-value" id="stat-idle">0</span>
|
||||||
<span class="stat-label">Idle</span>
|
<span class="stat-label">Idle</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{%- set st = poller.system_temps if (poller and poller.system_temps) else {} %}
|
||||||
|
{%- if st.get('cpu_c') is not none or st.get('pch_c') is not none %}
|
||||||
|
<div class="stats-bar-sep"></div>
|
||||||
|
{%- if st.get('cpu_c') is not none %}
|
||||||
|
<div class="stat-sensor" id="sensor-cpu">
|
||||||
|
<span class="stat-sensor-val {{ st.get('cpu_c') | temp_class }}" id="sensor-cpu-val">{{ st.get('cpu_c') }}°</span>
|
||||||
|
<span class="stat-sensor-label">CPU</span>
|
||||||
|
</div>
|
||||||
|
{%- endif %}
|
||||||
|
{%- if st.get('pch_c') is not none %}
|
||||||
|
<div class="stat-sensor" id="sensor-pch">
|
||||||
|
<span class="stat-sensor-val {{ st.get('pch_c') | temp_class }}" id="sensor-pch-val">{{ st.get('pch_c') }}°</span>
|
||||||
|
<span class="stat-sensor-label">PCH</span>
|
||||||
|
</div>
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
|
||||||
|
{%- set tp = poller.thermal_pressure if poller else 'ok' %}
|
||||||
|
<div class="stat-sensor stat-sensor-thermal stat-sensor-thermal-{{ tp }}"
|
||||||
|
id="sensor-thermal"
|
||||||
|
{% if not tp or tp == 'ok' %}hidden{% endif %}>
|
||||||
|
<span class="stat-sensor-val" id="sensor-thermal-val">
|
||||||
|
{%- if tp == 'warn' %}WARM{%- elif tp == 'crit' %}HOT{%- else %}OK{%- endif %}
|
||||||
|
</span>
|
||||||
|
<span class="stat-sensor-label">Thermal</span>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Failed drive banner — shown/hidden by JS when failed count > 0 -->
|
<!-- Failed drive banner — shown/hidden by JS when failed count > 0 -->
|
||||||
|
|
@ -50,12 +50,19 @@ async def handle(ws: WebSocket) -> None:
|
||||||
elif settings.ssh_password:
|
elif settings.ssh_password:
|
||||||
connect_kw["password"] = settings.ssh_password
|
connect_kw["password"] = settings.ssh_password
|
||||||
else:
|
else:
|
||||||
await _send(ws,
|
# Fall back to mounted key file (same logic as ssh_client._connect)
|
||||||
b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
|
import os
|
||||||
b"Set a password or private key in Settings.\r\n"
|
from app import ssh_client as _sc
|
||||||
)
|
key_path = os.environ.get("SSH_KEY_FILE", _sc._MOUNTED_KEY_PATH)
|
||||||
await ws.close(1008)
|
if os.path.exists(key_path):
|
||||||
return
|
connect_kw["client_keys"] = [key_path]
|
||||||
|
else:
|
||||||
|
await _send(ws,
|
||||||
|
b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
|
||||||
|
b"Set a password or private key in Settings.\r\n"
|
||||||
|
)
|
||||||
|
await ws.close(1008)
|
||||||
|
return
|
||||||
|
|
||||||
await _send(ws,
|
await _send(ws,
|
||||||
f"\r\n\x1b[36mConnecting to {settings.ssh_host}\u2026\x1b[0m\r\n".encode()
|
f"\r\n\x1b[36mConnecting to {settings.ssh_host}\u2026\x1b[0m\r\n".encode()
|
||||||
|
|
@ -65,7 +65,13 @@ class TrueNASClient:
|
||||||
"get_disks",
|
"get_disks",
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()
|
disks = r.json()
|
||||||
|
# Filter out expired records — TrueNAS keeps historical entries for removed
|
||||||
|
# disks with expiretime set. Only return currently-present drives.
|
||||||
|
active = [d for d in disks if not d.get("expiretime")]
|
||||||
|
if len(active) < len(disks):
|
||||||
|
log.debug("get_disks: filtered %d expired record(s)", len(disks) - len(active))
|
||||||
|
return active
|
||||||
|
|
||||||
async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
|
async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
|
||||||
params: dict = {"method": "smart.test"}
|
params: dict = {"method": "smart.test"}
|
||||||
|
|
@ -110,3 +116,49 @@ class TrueNASClient:
|
||||||
)
|
)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
async def get_disk_temperatures(self) -> dict[str, float | None]:
    """
    Fetch per-disk temperatures as {devname: celsius | None}.

    Issues POST /api/v2.0/disk/temperatures (available on TrueNAS SCALE 25.10+)
    through the shared retry wrapper. On CORE the endpoint answers 404/405 and
    raise_for_status() raises; the caller is expected to catch that and skip.
    """
    def _request():
        # Build a fresh request on every retry attempt.
        return self._client.post("/api/v2.0/disk/temperatures", json={})

    response = await _with_retry(_request, "get_disk_temperatures")
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
    """
    Launch a disk wipe job and return the TrueNAS job ID.

    Deliberately NOT retried: a repeated start would launch a second wipe.

    devname: basename only, e.g. "ada0" (not "/dev/ada0")
    mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros),
          "FULL_RANDOM" (write random)
    """
    payload = {"dev": devname, "mode": mode}
    response = await self._client.post("/api/v2.0/disk/wipe", json=payload)
    response.raise_for_status()
    return response.json()
|
||||||
|
|
||||||
|
async def get_job(self, job_id: int) -> dict | None:
    """
    Look up one TrueNAS job by its ID.

    Queries /api/v2.0/core/get_jobs with an ``id = job_id`` filter via the
    retry wrapper. Returns the first matching job dict, or None when the
    response is not a non-empty list.
    """
    import json as _json

    def _fetch():
        # The filter is JSON-encoded into the query string on each attempt.
        return self._client.get(
            "/api/v2.0/core/get_jobs",
            params={"filters": _json.dumps([["id", "=", job_id]])},
        )

    response = await _with_retry(_fetch, f"get_job({job_id})")
    response.raise_for_status()
    matches = response.json()
    if not isinstance(matches, list) or not matches:
        return None
    return matches[0]
|
||||||
23
claude-sandbox/truenas-burnin/docker-compose.yml
Normal file
23
claude-sandbox/truenas-burnin/docker-compose.yml
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
services:
|
||||||
|
# mock-truenas is kept for local dev — not started in production
|
||||||
|
# To use mock mode: docker compose --profile mock up
|
||||||
|
# mock-truenas:
|
||||||
|
# build: ./mock-truenas
|
||||||
|
# container_name: mock-truenas
|
||||||
|
# ports:
|
||||||
|
# - "8000:8000"
|
||||||
|
# profiles: [mock]
|
||||||
|
# restart: unless-stopped
|
||||||
|
|
||||||
|
app:
|
||||||
|
build: .
|
||||||
|
container_name: truenas-burnin
|
||||||
|
ports:
|
||||||
|
- "8084:8084"
|
||||||
|
env_file: .env
|
||||||
|
volumes:
|
||||||
|
- ./data:/data
|
||||||
|
- ./app/templates:/opt/app/app/templates
|
||||||
|
- ./app/static:/opt/app/app/static
|
||||||
|
- /home/brandon/.ssh/id_ed25519:/run/secrets/ssh_key:ro
|
||||||
|
restart: unless-stopped
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
fastapi
|
fastapi
|
||||||
uvicorn
|
uvicorn[standard]
|
||||||
aiosqlite
|
aiosqlite
|
||||||
httpx
|
httpx
|
||||||
pydantic-settings
|
pydantic-settings
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
services:
|
|
||||||
mock-truenas:
|
|
||||||
build: ./mock-truenas
|
|
||||||
container_name: mock-truenas
|
|
||||||
ports:
|
|
||||||
- "8000:8000"
|
|
||||||
restart: unless-stopped
|
|
||||||
|
|
||||||
app:
|
|
||||||
build: .
|
|
||||||
container_name: truenas-burnin
|
|
||||||
ports:
|
|
||||||
- "8084:8084"
|
|
||||||
env_file: .env
|
|
||||||
volumes:
|
|
||||||
- ./data:/data
|
|
||||||
- ./app/templates:/opt/app/app/templates
|
|
||||||
- ./app/static:/opt/app/app/static
|
|
||||||
depends_on:
|
|
||||||
- mock-truenas
|
|
||||||
restart: unless-stopped
|
|
||||||
Loading…
Add table
Reference in a new issue