Compare commits
No commits in common. "25d4622aa4cb877de079a3bdf5825b20359701ca" and "main" have entirely different histories.
25d4622aa4
...
main
39 changed files with 78 additions and 676 deletions
|
|
@ -209,7 +209,7 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
|
|||
| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
|
||||
| `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this |
|
||||
| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) |
|
||||
| `APP_VERSION` | `1.0.0-8` | Displayed in header version badge |
|
||||
| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge |
|
||||
| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
|
||||
| `SSH_PORT` | `22` | TrueNAS SSH port |
|
||||
| `SSH_USER` | `root` | TrueNAS SSH username |
|
||||
|
|
@ -206,45 +206,10 @@ async def cancel_job(job_id: int, operator: str) -> bool:
|
|||
# Job runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _thermal_gate_ok() -> bool:
    """Return True if it is thermally safe to start a new burn-in.

    Looks up the highest recorded temperature among drives that currently
    have a burn-in job in the 'running' state and compares it against
    ``settings.temp_warn_c``.

    Returns:
        True when no running drive has a recorded temperature, when the
        hottest running drive is below the warning threshold, or when the
        lookup itself fails. The gate is best-effort: an error here must
        never block a job from starting.
    """
    try:
        async with _db() as db:
            cur = await db.execute("""
                SELECT MAX(d.temperature_c)
                FROM drives d
                JOIN burnin_jobs bj ON bj.drive_id = d.id
                WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
            """)
            row = await cur.fetchone()
            # MAX() over zero rows yields a single NULL row, so guard both cases.
            max_temp = row[0] if row and row[0] is not None else None
            return max_temp is None or max_temp < settings.temp_warn_c
    except Exception as exc:
        # Fail open, but leave a trace so a broken gate is diagnosable.
        log.warning("Thermal gate check failed, not blocking: %s", exc)
        return True
|
||||
|
||||
|
||||
async def _run_job(job_id: int) -> None:
|
||||
"""Acquire semaphore slot, execute all stages, persist final state."""
|
||||
assert _semaphore is not None, "burnin.init() not called"
|
||||
|
||||
# Adaptive thermal gate: wait before competing for a slot if running drives
|
||||
# are already at or above the warning threshold. This prevents layering a
|
||||
# new burn-in on top of a thermally-stressed system. Gives up after 3 min
|
||||
# and proceeds anyway so jobs don't queue indefinitely.
|
||||
for _attempt in range(18): # 18 × 10 s = 3 min max
|
||||
if await _thermal_gate_ok():
|
||||
break
|
||||
if _attempt == 0:
|
||||
log.info(
|
||||
"Thermal gate: job %d waiting — running drive temps at or above %d°C",
|
||||
job_id, settings.temp_warn_c,
|
||||
)
|
||||
await asyncio.sleep(10)
|
||||
else:
|
||||
log.warning("Thermal gate timed out for job %d — proceeding anyway", job_id)
|
||||
|
||||
async with _semaphore:
|
||||
if await _is_cancelled(job_id):
|
||||
return
|
||||
|
|
@ -554,39 +519,15 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage
|
|||
# "unknown" → keep polling
|
||||
|
||||
|
||||
async def _badblocks_available() -> bool:
    """Check if badblocks is installed on the remote host (Linux/SCALE only).

    Opens a single-use SSH connection and runs ``which badblocks``; a zero
    exit status means the binary is on the remote PATH.

    Returns:
        True when the probe command exits with status 0. False when it exits
        non-zero or when the SSH connection/command fails for any reason.
    """
    from app import ssh_client

    try:
        async with await ssh_client._connect() as conn:
            result = await conn.run("which badblocks", check=False)
            return result.returncode == 0
    except Exception as exc:
        # A connection failure is indistinguishable from "not installed" for
        # the caller, so record why the probe came back negative.
        log.warning("badblocks availability probe failed: %s", exc)
        return False
|
||||
|
||||
|
||||
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
|
||||
"""
|
||||
Surface validation stage — auto-routes to the right implementation:
|
||||
|
||||
1. SSH configured + badblocks available (TrueNAS SCALE / Linux):
|
||||
→ runs badblocks -wsv -b 4096 -p 1 /dev/{devname} directly over SSH.
|
||||
2. SSH configured + badblocks NOT available (TrueNAS CORE / FreeBSD):
|
||||
→ uses TrueNAS REST API disk.wipe FULL job + post-wipe SMART check.
|
||||
3. No SSH:
|
||||
→ simulated timed progress (dev/mock mode).
|
||||
Surface validation stage.
|
||||
SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}.
|
||||
Mock mode: simulated timed progress (no real I/O).
|
||||
"""
|
||||
from app import ssh_client
|
||||
if ssh_client.is_configured():
|
||||
if await _badblocks_available():
|
||||
return await _stage_surface_validate_ssh(job_id, devname, drive_id)
|
||||
# TrueNAS CORE/FreeBSD: badblocks not available — use native wipe API
|
||||
await _append_stage_log(
|
||||
job_id, "surface_validate",
|
||||
"[INFO] badblocks not found on host (TrueNAS CORE/FreeBSD) — "
|
||||
"using TrueNAS disk.wipe API (FULL write pass).\n\n"
|
||||
)
|
||||
return await _stage_surface_validate_truenas(job_id, devname, drive_id)
|
||||
return await _stage_surface_validate_ssh(job_id, devname, drive_id)
|
||||
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
|
||||
|
||||
|
||||
|
|
@ -714,116 +655,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
return True
|
||||
|
||||
|
||||
async def _stage_surface_validate_truenas(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation via TrueNAS CORE disk.wipe REST API.
    Used on FreeBSD (TrueNAS CORE) where badblocks is unavailable.

    Sends a FULL write-zero pass across the entire disk, polls progress,
    then runs a post-wipe SMART attribute check to catch reallocated sectors.

    Args:
        job_id: Burn-in job whose ``surface_validate`` stage is being run.
        devname: Device name passed to the wipe API and to the SMART check.
        drive_id: Drive row the post-wipe SMART attributes are stored against.

    Returns:
        True when the wipe job reaches SUCCESS and the post-wipe SMART check
        reports no failures (or could not be run). False when the wipe could
        not be started, the job disappeared, ended in FAILED/ABORTED/ERROR,
        the burn-in job was cancelled, or the SMART check reported failures.
    """
    from app import ssh_client

    await _append_stage_log(
        job_id, "surface_validate",
        f"[START] TrueNAS disk.wipe FULL — {devname}\n"
        f"[NOTE] DESTRUCTIVE: all data on {devname} will be overwritten.\n\n"
    )

    # Start the wipe job
    try:
        tn_job_id = await _client.wipe_disk(devname, "FULL")
    except Exception as exc:
        await _set_stage_error(job_id, "surface_validate", f"Failed to start disk.wipe: {exc}")
        return False

    await _append_stage_log(
        job_id, "surface_validate",
        f"[JOB] TrueNAS wipe job started (job_id={tn_job_id})\n"
    )

    # Poll until complete
    log_flush_counter = 0
    while True:
        if await _is_cancelled(job_id):
            try:
                await _client.abort_job(tn_job_id)
            except Exception as exc:
                # Cancellation still proceeds, but the remote wipe may keep
                # running, so make the failed abort visible.
                log.warning(
                    "Failed to abort wipe job %s: %s", tn_job_id, exc,
                    extra={"job_id": job_id},
                )
            return False

        await asyncio.sleep(POLL_INTERVAL)

        try:
            job = await _client.get_job(tn_job_id)
        except Exception as exc:
            # Transient poll errors are logged and retried on the next tick.
            log.warning("Wipe job poll failed: %s", exc, extra={"job_id": job_id})
            await _append_stage_log(job_id, "surface_validate", f"[poll error] {exc}\n")
            continue

        if not job:
            await _set_stage_error(job_id, "surface_validate", f"Wipe job {tn_job_id} not found")
            return False

        state = job.get("state", "")
        # "progress" may be present but null; treat that like an empty dict.
        progress = job.get("progress") or {}
        pct = int(progress.get("percent", 0) or 0)
        desc = progress.get("description", "")

        # Hold at 99% until the job actually reports SUCCESS.
        await _update_stage_percent(job_id, "surface_validate", min(pct, 99))
        await _recalculate_progress(job_id)
        _push_update()

        # Log progress description every ~5 polls to avoid DB spam
        log_flush_counter += 1
        if desc and log_flush_counter % 5 == 0:
            await _append_stage_log(job_id, "surface_validate", f"[{pct}%] {desc}\n")

        if state == "SUCCESS":
            await _update_stage_percent(job_id, "surface_validate", 100)
            await _append_stage_log(
                job_id, "surface_validate",
                f"\n[DONE] Wipe job {tn_job_id} completed successfully.\n"
            )
            # Post-wipe SMART check — catch any sectors that failed under write stress
            if ssh_client.is_configured() and drive_id is not None:
                return await _post_wipe_smart_check(job_id, devname, drive_id)
            return True

        elif state in ("FAILED", "ABORTED", "ERROR"):
            error_msg = job.get("error") or f"Disk wipe failed (state={state})"
            await _set_stage_error(
                job_id, "surface_validate",
                f"TrueNAS disk.wipe FAILED: {error_msg}"
            )
            return False
        # RUNNING or WAITING — keep polling


async def _post_wipe_smart_check(job_id: int, devname: str, drive_id: int) -> bool:
    """Read SMART attributes after a wipe and record the outcome in the stage log.

    Returns False only when the attributes were read and report failures (the
    stage error is set here). A check that cannot run is logged as non-fatal
    and the function returns True.
    """
    from app import ssh_client

    await _append_stage_log(
        job_id, "surface_validate",
        "[CHECK] Running post-wipe SMART attribute check...\n"
    )
    try:
        attrs = await ssh_client.get_smart_attributes(devname)
        await _store_smart_attrs(drive_id, attrs)
        if attrs["failures"]:
            error = "Post-wipe SMART check: " + "; ".join(attrs["failures"])
            await _set_stage_error(job_id, "surface_validate", error)
            return False
        if attrs["warnings"]:
            await _append_stage_log(
                job_id, "surface_validate",
                "[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
            )
        await _append_stage_log(
            job_id, "surface_validate",
            f"[CHECK] SMART health: {attrs['health']} — no critical attributes.\n"
        )
    except Exception as exc:
        log.warning("Post-wipe SMART check failed: %s", exc)
        await _append_stage_log(
            job_id, "surface_validate",
            f"[WARN] Post-wipe SMART check failed (non-fatal): {exc}\n"
        )
    return True
|
||||
|
||||
|
||||
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
||||
"""Simulate a timed stage with progress updates (mock / dev mode)."""
|
||||
start = time.monotonic()
|
||||
|
|
@ -68,7 +68,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-8"
|
||||
app_version: str = "1.0.0-7"
|
||||
|
||||
|
||||
settings = Settings()
|
||||
|
|
@ -20,15 +20,13 @@ from app.truenas import TrueNASClient
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Shared state read by the /health endpoint and dashboard template
|
||||
# Shared state read by the /health endpoint
|
||||
_state: dict[str, Any] = {
|
||||
"last_poll_at": None,
|
||||
"last_error": None,
|
||||
"healthy": False,
|
||||
"drives_seen": 0,
|
||||
"consecutive_failures": 0,
|
||||
"system_temps": {}, # {"cpu_c": int|None, "pch_c": int|None}
|
||||
"thermal_pressure": "ok", # "ok" | "warn" | "crit" — based on running burn-in drive temps
|
||||
}
|
||||
|
||||
# SSE subscriber queues — notified after each successful poll
|
||||
|
|
@ -210,67 +208,6 @@ async def _sync_history(
|
|||
# Poll cycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _poll_smart_via_ssh(db: aiosqlite.Connection, now: str) -> None:
    """
    Refresh progress for SMART tests that were started over SSH.

    Such tests are the rows in ``smart_tests`` that are 'running' with a NULL
    ``truenas_job_id``. Used on TrueNAS SCALE 25.10+ where the REST smart/test
    API no longer exists. Does nothing when SSH is not configured or when no
    such rows exist.

    Args:
        db: Open database connection; updates are committed before returning.
        now: Timestamp string written to ``finished_at`` for finished tests.
    """
    from app import ssh_client

    if not ssh_client.is_configured():
        return

    cursor = await db.execute(
        """SELECT st.id, st.test_type, st.drive_id, d.devname, st.started_at
           FROM smart_tests st
           JOIN drives d ON d.id = st.drive_id
           WHERE st.state = 'running' AND st.truenas_job_id IS NULL"""
    )
    pending = await cursor.fetchall()
    if not pending:
        return

    for record in pending:
        test_id, ttype, devname, started_at = record[0], record[1], record[3], record[4]

        try:
            progress = await ssh_client.poll_smart_progress(devname)
        except Exception as exc:
            # One unreachable drive must not stop the others being polled.
            log.warning("SSH SMART poll failed for %s: %s", devname, exc)
            continue

        state = progress["state"]
        remaining = progress.get("percent_remaining")  # None = not yet in output
        raw_output = progress.get("output", "")

        if state == "passed":
            await db.execute(
                "UPDATE smart_tests SET state='passed', percent=100, finished_at=?, raw_output=? WHERE id=?",
                (now, raw_output, test_id),
            )
            log.info("SSH SMART %s passed on %s", ttype, devname)
        elif state == "failed":
            await db.execute(
                "UPDATE smart_tests SET state='failed', percent=0, finished_at=?, "
                "error_text=?, raw_output=? WHERE id=?",
                (now, f"SMART {ttype.upper()} test failed", raw_output, test_id),
            )
            log.warning("SSH SMART %s FAILED on %s", ttype, devname)
        elif state == "running":
            # No percentage line yet means the test has only just started:
            # report 0% rather than jumping to 100%.
            pct = 0 if remaining is None else max(0, 100 - remaining)
            eta = _eta_from_progress(pct, started_at)
            await db.execute(
                "UPDATE smart_tests SET percent=?, eta_at=?, raw_output=? WHERE id=?",
                (pct, eta, raw_output, test_id),
            )
        # any other state ("unknown") → keep polling, no update

    await db.commit()
|
||||
|
||||
|
||||
async def poll_cycle(client: TrueNASClient) -> int:
|
||||
"""Run one full poll. Returns number of drives seen."""
|
||||
now = _now()
|
||||
|
|
@ -278,20 +215,6 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
|||
disks = await client.get_disks()
|
||||
running_jobs = await client.get_smart_jobs(state="RUNNING")
|
||||
|
||||
# Fetch temperatures via SCALE-specific endpoint.
|
||||
# CORE doesn't have this endpoint — silently skip on any error.
|
||||
try:
|
||||
temps = await client.get_disk_temperatures()
|
||||
except Exception:
|
||||
temps = {}
|
||||
|
||||
# Inject temperature into each disk dict (SCALE 25.10 has no temp in /disk)
|
||||
for disk in disks:
|
||||
devname = disk.get("devname", "")
|
||||
t = temps.get(devname)
|
||||
if t is not None:
|
||||
disk["temperature"] = int(round(t))
|
||||
|
||||
# Index running jobs by (devname, test_type)
|
||||
active: dict[tuple[str, str], dict] = {}
|
||||
for job in running_jobs:
|
||||
|
|
@ -320,9 +243,6 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
|||
|
||||
await db.commit()
|
||||
|
||||
# SSH SMART polling — for tests started via smartctl (no TrueNAS REST job)
|
||||
await _poll_smart_via_ssh(db, now)
|
||||
|
||||
return len(disks)
|
||||
|
||||
|
||||
|
|
@ -343,39 +263,6 @@ async def run(client: TrueNASClient) -> None:
|
|||
_state["drives_seen"] = count
|
||||
_state["consecutive_failures"] = 0
|
||||
log.debug("Poll OK", extra={"drives": count})
|
||||
|
||||
# System sensor temps via SSH (non-fatal)
|
||||
from app import ssh_client as _ssh
|
||||
if _ssh.is_configured():
|
||||
try:
|
||||
_state["system_temps"] = await _ssh.get_system_sensors()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Thermal pressure: max temp of drives currently under burn-in
|
||||
try:
|
||||
async with aiosqlite.connect(settings.db_path) as _tdb:
|
||||
_tdb.row_factory = aiosqlite.Row
|
||||
await _tdb.execute("PRAGMA journal_mode=WAL")
|
||||
_cur = await _tdb.execute("""
|
||||
SELECT MAX(d.temperature_c)
|
||||
FROM drives d
|
||||
JOIN burnin_jobs bj ON bj.drive_id = d.id
|
||||
WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
|
||||
""")
|
||||
_row = await _cur.fetchone()
|
||||
_max_t = _row[0] if _row and _row[0] is not None else None
|
||||
if _max_t is None:
|
||||
_state["thermal_pressure"] = "ok"
|
||||
elif _max_t >= settings.temp_crit_c:
|
||||
_state["thermal_pressure"] = "crit"
|
||||
elif _max_t >= settings.temp_warn_c:
|
||||
_state["thermal_pressure"] = "warn"
|
||||
else:
|
||||
_state["thermal_pressure"] = "ok"
|
||||
except Exception:
|
||||
_state["thermal_pressure"] = "ok"
|
||||
|
||||
_notify_subscribers()
|
||||
|
||||
# Check for stuck jobs every 5 cycles (~1 min at default 12s interval)
|
||||
|
|
@ -218,18 +218,6 @@ async def sse_drives(request: Request):
|
|||
|
||||
yield {"event": "drives-update", "data": html}
|
||||
|
||||
# Push system sensor state so JS can update temp chips live
|
||||
ps = poller.get_state()
|
||||
yield {
|
||||
"event": "system-sensors",
|
||||
"data": json.dumps({
|
||||
"system_temps": ps.get("system_temps", {}),
|
||||
"thermal_pressure": ps.get("thermal_pressure", "ok"),
|
||||
"temp_warn_c": settings.temp_warn_c,
|
||||
"temp_crit_c": settings.temp_crit_c,
|
||||
}),
|
||||
}
|
||||
|
||||
# Push browser notification event if this was a job completion
|
||||
if alert:
|
||||
yield {"event": "job-alert", "data": json.dumps(alert)}
|
||||
|
|
@ -365,13 +353,9 @@ async def smart_start(
|
|||
body: dict,
|
||||
db: aiosqlite.Connection = Depends(get_db),
|
||||
):
|
||||
"""Start a standalone SHORT or LONG SMART test on a single drive.
|
||||
|
||||
Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+
|
||||
where the REST smart/test endpoint no longer exists.
|
||||
Falls back to TrueNAS REST API for older versions.
|
||||
"""
|
||||
from app import burnin as _burnin, ssh_client
|
||||
"""Start a standalone SHORT or LONG SMART test on a single drive."""
|
||||
from app.truenas import TrueNASClient
|
||||
from app import burnin as _burnin
|
||||
|
||||
test_type = (body.get("type") or "").upper()
|
||||
if test_type not in ("SHORT", "LONG"):
|
||||
|
|
@ -383,42 +367,17 @@ async def smart_start(
|
|||
raise HTTPException(status_code=404, detail="Drive not found")
|
||||
devname = row[0]
|
||||
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
ttype_lower = test_type.lower()
|
||||
# Use the shared TrueNAS client held by the burnin module
|
||||
client = _burnin._client
|
||||
if client is None:
|
||||
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
||||
|
||||
if ssh_client.is_configured():
|
||||
# SSH path — works on TrueNAS SCALE 25.10+ and CORE
|
||||
try:
|
||||
output = await ssh_client.start_smart_test(devname, test_type)
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"SSH error: {exc}")
|
||||
try:
|
||||
tn_job_id = await client.start_smart_test([devname], test_type)
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||
|
||||
# Mark as running in DB (truenas_job_id=NULL signals SSH-managed test)
|
||||
# Store smartctl start output as proof the test was initiated
|
||||
await db.execute(
|
||||
"""INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output)
|
||||
VALUES (?,?,?,?,?,?)
|
||||
ON CONFLICT(drive_id, test_type) DO UPDATE SET
|
||||
state='running', percent=0, truenas_job_id=NULL,
|
||||
started_at=excluded.started_at, finished_at=NULL, error_text=NULL,
|
||||
raw_output=excluded.raw_output""",
|
||||
(drive_id, ttype_lower, "running", 0, now, output),
|
||||
)
|
||||
await db.commit()
|
||||
from app import poller as _poller
|
||||
_poller._notify_subscribers()
|
||||
return {"devname": devname, "type": test_type, "message": output[:200]}
|
||||
|
||||
else:
|
||||
# REST path — older TrueNAS CORE / SCALE versions
|
||||
client = _burnin._client
|
||||
if client is None:
|
||||
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
||||
try:
|
||||
tn_job_id = await client.start_smart_test([devname], test_type)
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||
return {"job_id": tn_job_id, "devname": devname, "type": test_type}
|
||||
return {"job_id": tn_job_id, "devname": devname, "type": test_type}
|
||||
|
||||
|
||||
@router.post("/api/v1/drives/{drive_id}/smart/cancel")
|
||||
|
|
@ -444,37 +403,28 @@ async def smart_cancel(
|
|||
if client is None:
|
||||
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
|
||||
|
||||
from app import ssh_client
|
||||
# Find the running TrueNAS job for this drive/test-type
|
||||
try:
|
||||
jobs = await client.get_smart_jobs()
|
||||
tn_job_id = None
|
||||
for j in jobs:
|
||||
if j.get("state") != "RUNNING":
|
||||
continue
|
||||
args = j.get("arguments", [])
|
||||
if not args or not isinstance(args[0], dict):
|
||||
continue
|
||||
if devname in args[0].get("disks", []):
|
||||
tn_job_id = j["id"]
|
||||
break
|
||||
|
||||
if ssh_client.is_configured():
|
||||
# SSH path — abort via smartctl -X
|
||||
try:
|
||||
await ssh_client.abort_smart_test(devname)
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}")
|
||||
else:
|
||||
# REST path — find TrueNAS job and abort it
|
||||
try:
|
||||
jobs = await client.get_smart_jobs()
|
||||
tn_job_id = None
|
||||
for j in jobs:
|
||||
if j.get("state") != "RUNNING":
|
||||
continue
|
||||
args = j.get("arguments", [])
|
||||
if not args or not isinstance(args[0], dict):
|
||||
continue
|
||||
if devname in args[0].get("disks", []):
|
||||
tn_job_id = j["id"]
|
||||
break
|
||||
if tn_job_id is None:
|
||||
raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
|
||||
|
||||
if tn_job_id is None:
|
||||
raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
|
||||
|
||||
await client.abort_job(tn_job_id)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||
await client.abort_job(tn_job_id)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as exc:
|
||||
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
|
||||
|
||||
# Update local DB state
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
|
|
@ -38,26 +38,15 @@ SMART_ATTRS: dict[int, tuple[str, bool]] = {
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
def is_configured() -> bool:
|
||||
"""Returns True when SSH host + at least one auth method is available."""
|
||||
import os
|
||||
"""Returns True when SSH credentials are present and usable."""
|
||||
from app.config import settings
|
||||
if not settings.ssh_host:
|
||||
return False
|
||||
has_creds = bool(
|
||||
settings.ssh_key
|
||||
or settings.ssh_password
|
||||
or os.path.exists(os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH))
|
||||
)
|
||||
return has_creds
|
||||
return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Low-level connection
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_MOUNTED_KEY_PATH = "/run/secrets/ssh_key"
|
||||
|
||||
|
||||
async def _connect():
|
||||
"""Open a single-use SSH connection. Caller must use `async with`."""
|
||||
import asyncssh
|
||||
|
|
@ -70,17 +59,9 @@ async def _connect():
|
|||
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
|
||||
}
|
||||
if settings.ssh_key:
|
||||
# Key material provided via env var (base case)
|
||||
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
|
||||
elif settings.ssh_password:
|
||||
if settings.ssh_password:
|
||||
kwargs["password"] = settings.ssh_password
|
||||
else:
|
||||
# Fall back to mounted key file (preferred for production — no key in env vars)
|
||||
import os
|
||||
key_path = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)
|
||||
if os.path.exists(key_path):
|
||||
kwargs["client_keys"] = [key_path]
|
||||
# If nothing is configured, asyncssh will attempt agent/default key lookup
|
||||
|
||||
return asyncssh.connect(**kwargs)
|
||||
|
||||
|
|
@ -247,70 +228,6 @@ async def run_badblocks(
|
|||
}
|
||||
|
||||
|
||||
async def get_system_sensors() -> dict:
    """
    Fetch system-level temperatures by running `sensors -j` on TrueNAS over SSH.

    Returns {"cpu_c": int|None, "pch_c": int|None} as produced by
    `_parse_sensors_json`:
      cpu_c = CPU package temp (coretemp chip)
      pch_c = PCH/chipset temp (pch_* chip)

    Returns an empty dict when SSH is not configured, when the command
    prints nothing, or when the connection or command fails.
    """
    if is_configured():
        try:
            async with await _connect() as ssh:
                completed = await ssh.run("sensors -j 2>/dev/null", check=False)
                raw = completed.stdout.strip()
            return _parse_sensors_json(raw) if raw else {}
        except Exception as exc:
            # Sensor data is optional; a failure is only worth a debug line.
            log.debug("get_system_sensors failed: %s", exc)
    return {}
|
||||
|
||||
|
||||
def _parse_sensors_json(output: str) -> dict:
|
||||
import json as _json
|
||||
try:
|
||||
data = _json.loads(output)
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
cpu_c: int | None = None
|
||||
pch_c: int | None = None
|
||||
|
||||
for chip_name, chip_data in data.items():
|
||||
if not isinstance(chip_data, dict):
|
||||
continue
|
||||
|
||||
# CPU package temp — coretemp chip, "Package id N" sensor
|
||||
if chip_name.startswith("coretemp") and cpu_c is None:
|
||||
for sensor_name, sensor_vals in chip_data.items():
|
||||
if not isinstance(sensor_vals, dict):
|
||||
continue
|
||||
if "package" in sensor_name.lower():
|
||||
for k, v in sensor_vals.items():
|
||||
if k.endswith("_input") and isinstance(v, (int, float)):
|
||||
cpu_c = int(round(v))
|
||||
break
|
||||
if cpu_c is not None:
|
||||
break
|
||||
|
||||
# PCH / chipset temp — manages PCIe lanes including HBA / storage I/O
|
||||
elif chip_name.startswith("pch_") and pch_c is None:
|
||||
for sensor_name, sensor_vals in chip_data.items():
|
||||
if not isinstance(sensor_vals, dict):
|
||||
continue
|
||||
for k, v in sensor_vals.items():
|
||||
if k.endswith("_input") and isinstance(v, (int, float)):
|
||||
pch_c = int(round(v))
|
||||
break
|
||||
if pch_c is not None:
|
||||
break
|
||||
|
||||
return {"cpu_c": cpu_c, "pch_c": pch_c}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -358,7 +275,7 @@ def _parse_smartctl(output: str) -> dict:
|
|||
|
||||
def _parse_smart_progress(output: str) -> dict:
|
||||
state = "unknown"
|
||||
percent_remaining = None # None = "in progress but no % line parsed yet"
|
||||
percent_remaining = 0
|
||||
|
||||
lower = output.lower()
|
||||
|
||||
|
|
@ -1076,56 +1076,6 @@ a.stat-card:hover {
|
|||
.stat-passed .stat-value { color: var(--green); }
|
||||
.stat-idle .stat-value { color: var(--text-muted); }
|
||||
|
||||
/* Thin vertical divider placed between the drive-count cards and the sensor chips */
.stats-bar-sep {
  flex-shrink: 0;
  align-self: center;
  width: 1px;
  height: 36px;
  background: var(--border);
}

/* Sensor chip (CPU / PCH / Thermal): small card stacking a value over a label */
.stat-sensor {
  display: flex;
  flex-direction: column;
  gap: 2px;
  min-width: 52px;
  padding: 6px 12px;
  text-align: center;
  background: var(--bg-card);
  border: 1px solid var(--border);
  border-radius: 8px;
}

/* Value line of a sensor chip */
.stat-sensor-val {
  font-size: 16px;
  font-weight: 700;
  line-height: 1.1;
  font-variant-numeric: tabular-nums;
}

/* Caption line of a sensor chip */
.stat-sensor-label {
  font-size: 9px;
  line-height: 1.2;
  letter-spacing: 0.08em;
  text-transform: uppercase;
  color: var(--text-muted);
}

/* Thermal chip — warning state */
.stat-sensor-thermal-warn {
  background: var(--yellow-bg);
  border-color: var(--yellow-bd);
}
.stat-sensor-thermal-warn .stat-sensor-val { color: var(--yellow); }

/* Thermal chip — critical state */
.stat-sensor-thermal-crit {
  background: var(--red-bg);
  border-color: var(--red-bd);
}
.stat-sensor-thermal-crit .stat-sensor-val { color: var(--red); }
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Batch action bar (inside filter-bar)
|
||||
----------------------------------------------------------------------- */
|
||||
|
|
@ -135,59 +135,14 @@
|
|||
if (nb) nb.style.display = 'none';
|
||||
}
|
||||
|
||||
// Handle SSE events
|
||||
// Handle job-alert SSE events for browser notifications
|
||||
document.addEventListener('htmx:sseMessage', function (e) {
|
||||
if (!e.detail) return;
|
||||
if (e.detail.type === 'job-alert') {
|
||||
try { handleJobAlert(JSON.parse(e.detail.data)); } catch (_) {}
|
||||
} else if (e.detail.type === 'system-sensors') {
|
||||
try { handleSystemSensors(JSON.parse(e.detail.data)); } catch (_) {}
|
||||
}
|
||||
if (!e.detail || e.detail.type !== 'job-alert') return;
|
||||
try {
|
||||
handleJobAlert(JSON.parse(e.detail.data));
|
||||
} catch (_) {}
|
||||
});
|
||||
|
||||
/**
 * Apply a `system-sensors` SSE payload to the sensor chips in the stats bar.
 *
 * Reads `system_temps` ({cpu_c, pch_c}), `thermal_pressure` ('ok' | 'warn' |
 * 'crit') and the warn/crit thresholds from `data`. Missing thresholds fall
 * back to 46 and 55. Chips that are not present in the DOM are left alone.
 */
function handleSystemSensors(data) {
  var temps = data.system_temps || {};
  var pressure = data.thermal_pressure || 'ok';
  var warnAt = data.temp_warn_c || 46;
  var critAt = data.temp_crit_c || 55;

  // Map a temperature to its colour class; no reading means no class.
  function tempClass(c) {
    if (c == null) return '';
    if (c >= critAt) return 'temp-hot';
    if (c >= warnAt) return 'temp-warm';
    return 'temp-cool';
  }

  // Reveal a temperature chip and refresh its value. A missing reading
  // leaves the chip exactly as it was.
  function showTemp(chipId, valId, celsius) {
    var chip = document.getElementById(chipId);
    var val = document.getElementById(valId);
    if (!val || celsius == null) return;
    if (chip) chip.hidden = false;
    val.textContent = celsius + '°';
    val.className = 'stat-sensor-val ' + tempClass(celsius);
  }

  showTemp('sensor-cpu', 'sensor-cpu-val', temps.cpu_c);
  showTemp('sensor-pch', 'sensor-pch-val', temps.pch_c);

  // Thermal pressure chip: visible only while pressure is warn or crit.
  var thermalChip = document.getElementById('sensor-thermal');
  var thermalVal = document.getElementById('sensor-thermal-val');
  if (!thermalChip || !thermalVal) return;

  if (pressure === 'warn' || pressure === 'crit') {
    thermalChip.hidden = false;
    thermalChip.className = 'stat-sensor stat-sensor-thermal stat-sensor-thermal-' + pressure;
    thermalVal.textContent = pressure === 'warn' ? 'WARM' : 'HOT';
  } else {
    thermalChip.hidden = true;
  }
}
|
||||
|
||||
function handleJobAlert(data) {
|
||||
var isPass = data.state === 'passed';
|
||||
var icon = isPass ? '✓' : '✕';
|
||||
|
|
@ -6,7 +6,7 @@
|
|||
{% include "components/modal_start.html" %}
|
||||
{% include "components/modal_batch.html" %}
|
||||
|
||||
<!-- Stats bar — drive counts updated live by app.js updateCounts(); sensor chips updated by SSE system-sensors event -->
|
||||
<!-- Stats bar — counts are updated live by app.js updateCounts() -->
|
||||
<div class="stats-bar">
|
||||
<div class="stat-card" data-stat-filter="all">
|
||||
<span class="stat-value" id="stat-all">{{ drives | length }}</span>
|
||||
|
|
@ -28,33 +28,6 @@
|
|||
<span class="stat-value" id="stat-idle">0</span>
|
||||
<span class="stat-label">Idle</span>
|
||||
</div>
|
||||
|
||||
{%- set st = poller.system_temps if (poller and poller.system_temps) else {} %}
|
||||
{%- if st.get('cpu_c') is not none or st.get('pch_c') is not none %}
|
||||
<div class="stats-bar-sep"></div>
|
||||
{%- if st.get('cpu_c') is not none %}
|
||||
<div class="stat-sensor" id="sensor-cpu">
|
||||
<span class="stat-sensor-val {{ st.get('cpu_c') | temp_class }}" id="sensor-cpu-val">{{ st.get('cpu_c') }}°</span>
|
||||
<span class="stat-sensor-label">CPU</span>
|
||||
</div>
|
||||
{%- endif %}
|
||||
{%- if st.get('pch_c') is not none %}
|
||||
<div class="stat-sensor" id="sensor-pch">
|
||||
<span class="stat-sensor-val {{ st.get('pch_c') | temp_class }}" id="sensor-pch-val">{{ st.get('pch_c') }}°</span>
|
||||
<span class="stat-sensor-label">PCH</span>
|
||||
</div>
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{%- set tp = poller.thermal_pressure if poller else 'ok' %}
|
||||
<div class="stat-sensor stat-sensor-thermal stat-sensor-thermal-{{ tp }}"
|
||||
id="sensor-thermal"
|
||||
{% if not tp or tp == 'ok' %}hidden{% endif %}>
|
||||
<span class="stat-sensor-val" id="sensor-thermal-val">
|
||||
{%- if tp == 'warn' %}WARM{%- elif tp == 'crit' %}HOT{%- else %}OK{%- endif %}
|
||||
</span>
|
||||
<span class="stat-sensor-label">Thermal</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Failed drive banner — shown/hidden by JS when failed count > 0 -->
|
||||
|
|
@ -50,19 +50,12 @@ async def handle(ws: WebSocket) -> None:
|
|||
elif settings.ssh_password:
|
||||
connect_kw["password"] = settings.ssh_password
|
||||
else:
|
||||
# Fall back to mounted key file (same logic as ssh_client._connect)
|
||||
import os
|
||||
from app import ssh_client as _sc
|
||||
key_path = os.environ.get("SSH_KEY_FILE", _sc._MOUNTED_KEY_PATH)
|
||||
if os.path.exists(key_path):
|
||||
connect_kw["client_keys"] = [key_path]
|
||||
else:
|
||||
await _send(ws,
|
||||
b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
|
||||
b"Set a password or private key in Settings.\r\n"
|
||||
)
|
||||
await ws.close(1008)
|
||||
return
|
||||
await _send(ws,
|
||||
b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
|
||||
b"Set a password or private key in Settings.\r\n"
|
||||
)
|
||||
await ws.close(1008)
|
||||
return
|
||||
|
||||
await _send(ws,
|
||||
f"\r\n\x1b[36mConnecting to {settings.ssh_host}\u2026\x1b[0m\r\n".encode()
|
||||
|
|
@ -65,13 +65,7 @@ class TrueNASClient:
|
|||
"get_disks",
|
||||
)
|
||||
r.raise_for_status()
|
||||
disks = r.json()
|
||||
# Filter out expired records — TrueNAS keeps historical entries for removed
|
||||
# disks with expiretime set. Only return currently-present drives.
|
||||
active = [d for d in disks if not d.get("expiretime")]
|
||||
if len(active) < len(disks):
|
||||
log.debug("get_disks: filtered %d expired record(s)", len(disks) - len(active))
|
||||
return active
|
||||
return r.json()
|
||||
|
||||
async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
|
||||
params: dict = {"method": "smart.test"}
|
||||
|
|
@ -116,49 +110,3 @@ class TrueNASClient:
|
|||
)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
async def get_disk_temperatures(self) -> dict[str, float | None]:
|
||||
"""
|
||||
Returns {devname: celsius | None}.
|
||||
Uses POST /api/v2.0/disk/temperatures — available on TrueNAS SCALE 25.10+.
|
||||
CORE compatibility: raises on 404/405, caller should catch and skip.
|
||||
"""
|
||||
r = await _with_retry(
|
||||
lambda: self._client.post("/api/v2.0/disk/temperatures", json={}),
|
||||
"get_disk_temperatures",
|
||||
)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
|
||||
"""
|
||||
Start a disk wipe job. Not retried — duplicate starts would launch a second wipe.
|
||||
mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros), "FULL_RANDOM" (write random)
|
||||
devname: basename only, e.g. "ada0" (not "/dev/ada0")
|
||||
Returns the TrueNAS job ID.
|
||||
"""
|
||||
r = await self._client.post(
|
||||
"/api/v2.0/disk/wipe",
|
||||
json={"dev": devname, "mode": mode},
|
||||
)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
|
||||
async def get_job(self, job_id: int) -> dict | None:
|
||||
"""
|
||||
Fetch a single TrueNAS job by ID.
|
||||
Returns the job dict, or None if not found.
|
||||
"""
|
||||
import json as _json
|
||||
r = await _with_retry(
|
||||
lambda: self._client.get(
|
||||
"/api/v2.0/core/get_jobs",
|
||||
params={"filters": _json.dumps([["id", "=", job_id]])},
|
||||
),
|
||||
f"get_job({job_id})",
|
||||
)
|
||||
r.raise_for_status()
|
||||
jobs = r.json()
|
||||
if isinstance(jobs, list) and jobs:
|
||||
return jobs[0]
|
||||
return None
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
services:
|
||||
# mock-truenas is kept for local dev — not started in production
|
||||
# To use mock mode: docker compose --profile mock up
|
||||
# mock-truenas:
|
||||
# build: ./mock-truenas
|
||||
# container_name: mock-truenas
|
||||
# ports:
|
||||
# - "8000:8000"
|
||||
# profiles: [mock]
|
||||
# restart: unless-stopped
|
||||
|
||||
app:
|
||||
build: .
|
||||
container_name: truenas-burnin
|
||||
ports:
|
||||
- "8084:8084"
|
||||
env_file: .env
|
||||
volumes:
|
||||
- ./data:/data
|
||||
- ./app/templates:/opt/app/app/templates
|
||||
- ./app/static:/opt/app/app/static
|
||||
- /home/brandon/.ssh/id_ed25519:/run/secrets/ssh_key:ro
|
||||
restart: unless-stopped
|
||||
21
docker-compose.yml
Normal file
21
docker-compose.yml
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
services:
|
||||
mock-truenas:
|
||||
build: ./mock-truenas
|
||||
container_name: mock-truenas
|
||||
ports:
|
||||
- "8000:8000"
|
||||
restart: unless-stopped
|
||||
|
||||
app:
|
||||
build: .
|
||||
container_name: truenas-burnin
|
||||
ports:
|
||||
- "8084:8084"
|
||||
env_file: .env
|
||||
volumes:
|
||||
- ./data:/data
|
||||
- ./app/templates:/opt/app/app/templates
|
||||
- ./app/static:/opt/app/app/static
|
||||
depends_on:
|
||||
- mock-truenas
|
||||
restart: unless-stopped
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
fastapi
|
||||
uvicorn[standard]
|
||||
uvicorn
|
||||
aiosqlite
|
||||
httpx
|
||||
pydantic-settings
|
||||
Loading…
Add table
Reference in a new issue