39 changed files with 676 additions and 78 deletions
--- a/claude-sandbox/truenas-burnin/.env.example
+++ b/claude-sandbox/truenas-burnin/.env.example
--- a/claude-sandbox/truenas-burnin/.gitignore
+++ b/claude-sandbox/truenas-burnin/.gitignore
--- a/claude-sandbox/truenas-burnin/CLAUDE.md
+++ b/claude-sandbox/truenas-burnin/CLAUDE.md
@ -209,7 +209,7 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
 | `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
 | `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this |
 | `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) |
-| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge |
+| `APP_VERSION` | `1.0.0-8` | Displayed in header version badge |
 | `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
 | `SSH_PORT` | `22` | TrueNAS SSH port |
 | `SSH_USER` | `root` | TrueNAS SSH username |
--- a/claude-sandbox/truenas-burnin/Dockerfile
+++ b/claude-sandbox/truenas-burnin/Dockerfile
--- a/claude-sandbox/truenas-burnin/SPEC.md
+++ b/claude-sandbox/truenas-burnin/SPEC.md
--- a/claude-sandbox/truenas-burnin/app/init.py
+++ b/claude-sandbox/truenas-burnin/app/init.py
--- a/claude-sandbox/truenas-burnin/app/burnin.py
+++ b/claude-sandbox/truenas-burnin/app/burnin.py
@ -206,10 +206,45 @@ async def cancel_job(job_id: int, operator: str) -> bool:
 # Job runner
 # ---------------------------------------------------------------------------
 async def _thermal_gate_ok() -> bool:
    """Return True if it is thermally safe to start a new burn-in.

    Looks up the highest recorded temperature among drives that currently
    have a burn-in job in the 'running' state and compares it with
    ``settings.temp_warn_c``.

    Returns:
        True when no running drive has a temperature reading, or when the
        peak reading is strictly below the warning threshold.  Also True
        when the lookup itself fails: the gate deliberately fails open so a
        database problem never blocks jobs, but the failure is logged so the
        disabled gate is visible to operators.
    """
    try:
        async with _db() as db:
            cur = await db.execute("""
                SELECT MAX(d.temperature_c)
                FROM drives d
                JOIN burnin_jobs bj ON bj.drive_id = d.id
                WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
            """)
            row = await cur.fetchone()
            # MAX() over zero matching rows yields a single NULL value.
            max_temp = row[0] if row else None
        return max_temp is None or max_temp < settings.temp_warn_c
    except Exception:
        # Never block on error, but do not hide that the check was skipped.
        log.warning(
            "Thermal gate check failed — allowing job to proceed",
            exc_info=True,
        )
        return True
 async def _run_job(job_id: int) -> None:
    """Acquire semaphore slot, execute all stages, persist final state."""
    assert _semaphore is not None, "burnin.init() not called"
    # Adaptive thermal gate: wait before competing for a slot if running drives
    # are already at or above the warning threshold.  This prevents layering a
    # new burn-in on top of a thermally-stressed system.  Gives up after 3 min
    # and proceeds anyway so jobs don't queue indefinitely.
    for _attempt in range(18):  # 18 × 10 s = 3 min max
        if await _thermal_gate_ok():
            break
        if _attempt == 0:
            log.info(
                "Thermal gate: job %d waiting — running drive temps at or above %d°C",
                job_id, settings.temp_warn_c,
            )
        await asyncio.sleep(10)
    else:
        log.warning("Thermal gate timed out for job %d — proceeding anyway", job_id)
    async with _semaphore:
        if await _is_cancelled(job_id):
            return
@ -519,15 +554,39 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage
        # "unknown" → keep polling
 async def _badblocks_available() -> bool:
    """Check if badblocks is installed on the remote host (Linux/SCALE only).

    Opens a single-use SSH connection and runs ``which badblocks``.

    Returns:
        True when the command exits with status 0.  False when it exits
        non-zero, or when the SSH probe itself fails; the latter is logged
        because the caller treats False as "badblocks is not installed" and
        switches to the disk.wipe fallback.
    """
    from app import ssh_client
    try:
        async with await ssh_client._connect() as conn:
            result = await conn.run("which badblocks", check=False)
            return result.returncode == 0
    except Exception as exc:
        # A connection problem is not the same as a missing binary; keep the
        # best-effort False result but leave a trace of the real cause.
        log.warning("badblocks availability probe over SSH failed: %s", exc)
        return False
 async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
    """
-    Surface validation stage.
+    Surface validation stage — auto-routes to the right implementation:
-    SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}.
+
-    Mock mode: simulated timed progress (no real I/O).
+    1. SSH configured + badblocks available (TrueNAS SCALE / Linux):
       → runs badblocks -wsv -b 4096 -p 1 /dev/{devname} directly over SSH.
    2. SSH configured + badblocks NOT available (TrueNAS CORE / FreeBSD):
       → uses TrueNAS REST API disk.wipe FULL job + post-wipe SMART check.
    3. No SSH:
       → simulated timed progress (dev/mock mode).
    """
    from app import ssh_client
    if ssh_client.is_configured():
-        return await _stage_surface_validate_ssh(job_id, devname, drive_id)
+        if await _badblocks_available():
            return await _stage_surface_validate_ssh(job_id, devname, drive_id)
        # TrueNAS CORE/FreeBSD: badblocks not available — use native wipe API
        await _append_stage_log(
            job_id, "surface_validate",
            "[INFO] badblocks not found on host (TrueNAS CORE/FreeBSD) — "
            "using TrueNAS disk.wipe API (FULL write pass).\n\n"
        )
        return await _stage_surface_validate_truenas(job_id, devname, drive_id)
    return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
@ -655,6 +714,116 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
    return True
 async def _stage_surface_validate_truenas(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation via the TrueNAS disk.wipe REST API.

    Used on FreeBSD (TrueNAS CORE) where badblocks is unavailable.
    Starts a FULL wipe of the disk, polls the TrueNAS job every
    POLL_INTERVAL seconds while mirroring its progress into the stage
    record, then runs a post-wipe SMART attribute check to catch
    reallocated sectors.

    Args:
        job_id: Burn-in job whose "surface_validate" stage is being run.
        devname: Device name passed to disk.wipe and to the SMART check.
        drive_id: Drive the post-wipe SMART attributes are stored against;
            the SMART check is skipped when this is None.

    Returns:
        True when the wipe job reaches SUCCESS and the post-wipe SMART check
        reports no failures (or could not be run).  False when the job is
        cancelled, the wipe cannot be started, the TrueNAS job cannot be
        found or ends in FAILED/ABORTED/ERROR, or the SMART check reports
        failures.
    """
    from app import ssh_client
    stage = "surface_validate"
    await _append_stage_log(
        job_id, stage,
        f"[START] TrueNAS disk.wipe FULL — {devname}\n"
        f"[NOTE]  DESTRUCTIVE: all data on {devname} will be overwritten.\n\n"
    )
    # Start the wipe job
    try:
        tn_job_id = await _client.wipe_disk(devname, "FULL")
    except Exception as exc:
        await _set_stage_error(job_id, stage, f"Failed to start disk.wipe: {exc}")
        return False
    await _append_stage_log(
        job_id, stage,
        f"[JOB] TrueNAS wipe job started (job_id={tn_job_id})\n"
    )
    # Poll until complete
    log_flush_counter = 0
    while True:
        if await _is_cancelled(job_id):
            try:
                await _client.abort_job(tn_job_id)
            except Exception as exc:
                # Best effort: the burn-in job is cancelled either way, but a
                # failed abort means the wipe may still be running remotely.
                log.warning(
                    "Failed to abort wipe job %s: %s", tn_job_id, exc,
                    extra={"job_id": job_id},
                )
            return False
        await asyncio.sleep(POLL_INTERVAL)
        try:
            job = await _client.get_job(tn_job_id)
        except Exception as exc:
            log.warning("Wipe job poll failed: %s", exc, extra={"job_id": job_id})
            await _append_stage_log(job_id, stage, f"[poll error] {exc}\n")
            continue
        if not job:
            await _set_stage_error(job_id, stage, f"Wipe job {tn_job_id} not found")
            return False
        state = job.get("state", "")
        # "progress" may be present but null; treat that like an empty dict.
        progress = job.get("progress") or {}
        pct = int(progress.get("percent", 0) or 0)
        desc = progress.get("description", "")
        # Hold the stage at 99% until TrueNAS actually reports SUCCESS.
        await _update_stage_percent(job_id, stage, min(pct, 99))
        await _recalculate_progress(job_id)
        _push_update()
        # Log progress description every ~5 polls to avoid DB spam
        log_flush_counter += 1
        if desc and log_flush_counter % 5 == 0:
            await _append_stage_log(job_id, stage, f"[{pct}%] {desc}\n")
        if state == "SUCCESS":
            await _update_stage_percent(job_id, stage, 100)
            await _append_stage_log(
                job_id, stage,
                f"\n[DONE] Wipe job {tn_job_id} completed successfully.\n"
            )
            # Post-wipe SMART check — catch any sectors that failed under write stress
            if ssh_client.is_configured() and drive_id is not None:
                return await _post_wipe_smart_check(job_id, devname, drive_id)
            return True
        if state in ("FAILED", "ABORTED", "ERROR"):
            error_msg = job.get("error") or f"Disk wipe failed (state={state})"
            await _set_stage_error(
                job_id, stage,
                f"TrueNAS disk.wipe FAILED: {error_msg}"
            )
            return False
        # RUNNING or WAITING — keep polling


 async def _post_wipe_smart_check(job_id: int, devname: str, drive_id: int) -> bool:
    """Run the SMART attribute check that follows a successful disk wipe.

    Reads SMART attributes over SSH, stores them for the drive, and records
    the outcome in the "surface_validate" stage log.

    Returns:
        False only when the attributes report failures (the stage error is
        set in that case).  True otherwise, including when the check itself
        raises: that is logged and treated as non-fatal.
    """
    from app import ssh_client
    stage = "surface_validate"
    await _append_stage_log(
        job_id, stage,
        "[CHECK] Running post-wipe SMART attribute check...\n"
    )
    try:
        attrs = await ssh_client.get_smart_attributes(devname)
        await _store_smart_attrs(drive_id, attrs)
        if attrs["failures"]:
            error = "Post-wipe SMART check: " + "; ".join(attrs["failures"])
            await _set_stage_error(job_id, stage, error)
            return False
        if attrs["warnings"]:
            await _append_stage_log(
                job_id, stage,
                "[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
            )
        await _append_stage_log(
            job_id, stage,
            f"[CHECK] SMART health: {attrs['health']} — no critical attributes.\n"
        )
    except Exception as exc:
        log.warning("Post-wipe SMART check failed: %s", exc)
        await _append_stage_log(
            job_id, stage,
            f"[WARN] Post-wipe SMART check failed (non-fatal): {exc}\n"
        )
    return True
 async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
    """Simulate a timed stage with progress updates (mock / dev mode)."""
    start = time.monotonic()
--- a/claude-sandbox/truenas-burnin/app/config.py
+++ b/claude-sandbox/truenas-burnin/app/config.py
@ -68,7 +68,7 @@ class Settings(BaseSettings):
    ssh_key: str = ""             # PEM private key content (paste full key including headers)
    # Application version — used by the /api/v1/updates/check endpoint
-    app_version: str = "1.0.0-7"
+    app_version: str = "1.0.0-8"
 settings = Settings()
--- a/claude-sandbox/truenas-burnin/app/database.py
+++ b/claude-sandbox/truenas-burnin/app/database.py
--- a/claude-sandbox/truenas-burnin/app/logging_config.py
+++ b/claude-sandbox/truenas-burnin/app/logging_config.py
--- a/claude-sandbox/truenas-burnin/app/mailer.py
+++ b/claude-sandbox/truenas-burnin/app/mailer.py
--- a/claude-sandbox/truenas-burnin/app/main.py
+++ b/claude-sandbox/truenas-burnin/app/main.py
--- a/claude-sandbox/truenas-burnin/app/models.py
+++ b/claude-sandbox/truenas-burnin/app/models.py
--- a/claude-sandbox/truenas-burnin/app/notifier.py
+++ b/claude-sandbox/truenas-burnin/app/notifier.py
--- a/claude-sandbox/truenas-burnin/app/poller.py
+++ b/claude-sandbox/truenas-burnin/app/poller.py
@ -20,13 +20,15 @@ from app.truenas import TrueNASClient
 log = logging.getLogger(__name__)
-# Shared state read by the /health endpoint
+# Shared state read by the /health endpoint and dashboard template
 _state: dict[str, Any] = {
    "last_poll_at": None,
    "last_error": None,
    "healthy": False,
    "drives_seen": 0,           # written by run() after a successful poll
    "consecutive_failures": 0,  # run() resets this to 0 after a successful poll
    "system_temps": {},        # {"cpu_c": int|None, "pch_c": int|None}; refreshed by run() over SSH when configured
    "thermal_pressure": "ok",  # "ok" | "warn" | "crit" — based on running burn-in drive temps
 }
 # SSE subscriber queues — notified after each successful poll
@ -208,6 +210,67 @@ async def _sync_history(
 # Poll cycle
 # ---------------------------------------------------------------------------
 async def _poll_smart_via_ssh(db: aiosqlite.Connection, now: str) -> None:
    """
    Refresh smart_tests rows for SMART tests that were started over SSH.

    Such tests are recognised by state='running' with truenas_job_id IS NULL.
    Used on TrueNAS SCALE 25.10+ where the REST smart/test API no longer exists.

    Does nothing when SSH is not configured or no such test is running.
    A device whose poll raises is logged and skipped; the remaining devices
    are still processed.  All updates are committed once, after the loop.

    Args:
        db: Open database connection used for the query and the updates.
        now: Timestamp string written to finished_at for completed tests.
    """
    from app import ssh_client
    if not ssh_client.is_configured():
        return
    cursor = await db.execute(
        """SELECT st.id, st.test_type, st.drive_id, d.devname, st.started_at
           FROM smart_tests st
           JOIN drives d ON d.id = st.drive_id
           WHERE st.state = 'running' AND st.truenas_job_id IS NULL"""
    )
    pending = await cursor.fetchall()
    if not pending:
        return
    for record in pending:
        test_id, ttype, _drive_id, devname, started_at = record
        try:
            result = await ssh_client.poll_smart_progress(devname)
        except Exception as exc:
            log.warning("SSH SMART poll failed for %s: %s", devname, exc)
            continue
        status = result["state"]
        remaining = result.get("percent_remaining")  # None = not yet in output
        smartctl_out = result.get("output", "")
        if status == "passed":
            await db.execute(
                "UPDATE smart_tests SET state='passed', percent=100, finished_at=?, raw_output=? WHERE id=?",
                (now, smartctl_out, test_id),
            )
            log.info("SSH SMART %s passed on %s", ttype, devname)
        elif status == "failed":
            await db.execute(
                "UPDATE smart_tests SET state='failed', percent=0, finished_at=?, "
                "error_text=?, raw_output=? WHERE id=?",
                (now, f"SMART {ttype.upper()} test failed", smartctl_out, test_id),
            )
            log.warning("SSH SMART %s FAILED on %s", ttype, devname)
        elif status == "running":
            # No "% remaining" line yet means the test has only just started,
            # so report 0% done rather than jumping to 100%.
            done = 0 if remaining is None else max(0, 100 - remaining)
            eta = _eta_from_progress(done, started_at)
            await db.execute(
                "UPDATE smart_tests SET percent=?, eta_at=?, raw_output=? WHERE id=?",
                (done, eta, smartctl_out, test_id),
            )
        # any other status (e.g. "unknown") → keep polling, no update
    await db.commit()
 async def poll_cycle(client: TrueNASClient) -> int:
    """Run one full poll. Returns number of drives seen."""
    now = _now()
@ -215,6 +278,20 @@ async def poll_cycle(client: TrueNASClient) -> int:
    disks = await client.get_disks()
    running_jobs = await client.get_smart_jobs(state="RUNNING")
    # Fetch temperatures via SCALE-specific endpoint.
    # CORE doesn't have this endpoint — silently skip on any error.
    try:
        temps = await client.get_disk_temperatures()
    except Exception:
        temps = {}
    # Inject temperature into each disk dict (SCALE 25.10 has no temp in /disk)
    for disk in disks:
        devname = disk.get("devname", "")
        t = temps.get(devname)
        if t is not None:
            disk["temperature"] = int(round(t))
    # Index running jobs by (devname, test_type)
    active: dict[tuple[str, str], dict] = {}
    for job in running_jobs:
@ -243,6 +320,9 @@ async def poll_cycle(client: TrueNASClient) -> int:
        await db.commit()
        # SSH SMART polling — for tests started via smartctl (no TrueNAS REST job)
        await _poll_smart_via_ssh(db, now)
    return len(disks)
@ -263,6 +343,39 @@ async def run(client: TrueNASClient) -> None:
            _state["drives_seen"] = count
            _state["consecutive_failures"] = 0
            log.debug("Poll OK", extra={"drives": count})
            # System sensor temps via SSH (non-fatal)
            from app import ssh_client as _ssh
            if _ssh.is_configured():
                try:
                    _state["system_temps"] = await _ssh.get_system_sensors()
                except Exception:
                    pass
            # Thermal pressure: max temp of drives currently under burn-in
            try:
                async with aiosqlite.connect(settings.db_path) as _tdb:
                    _tdb.row_factory = aiosqlite.Row
                    await _tdb.execute("PRAGMA journal_mode=WAL")
                    _cur = await _tdb.execute("""
                        SELECT MAX(d.temperature_c)
                        FROM drives d
                        JOIN burnin_jobs bj ON bj.drive_id = d.id
                        WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
                    """)
                    _row = await _cur.fetchone()
                    _max_t = _row[0] if _row and _row[0] is not None else None
                if _max_t is None:
                    _state["thermal_pressure"] = "ok"
                elif _max_t >= settings.temp_crit_c:
                    _state["thermal_pressure"] = "crit"
                elif _max_t >= settings.temp_warn_c:
                    _state["thermal_pressure"] = "warn"
                else:
                    _state["thermal_pressure"] = "ok"
            except Exception:
                _state["thermal_pressure"] = "ok"
            _notify_subscribers()
            # Check for stuck jobs every 5 cycles (~1 min at default 12s interval)
--- a/claude-sandbox/truenas-burnin/app/renderer.py
+++ b/claude-sandbox/truenas-burnin/app/renderer.py
--- a/claude-sandbox/truenas-burnin/app/routes.py
+++ b/claude-sandbox/truenas-burnin/app/routes.py
@ -218,6 +218,18 @@ async def sse_drives(request: Request):
                yield {"event": "drives-update", "data": html}
                # Push system sensor state so JS can update temp chips live
                ps = poller.get_state()
                yield {
                    "event": "system-sensors",
                    "data": json.dumps({
                        "system_temps":    ps.get("system_temps", {}),
                        "thermal_pressure": ps.get("thermal_pressure", "ok"),
                        "temp_warn_c":     settings.temp_warn_c,
                        "temp_crit_c":     settings.temp_crit_c,
                    }),
                }
                # Push browser notification event if this was a job completion
                if alert:
                    yield {"event": "job-alert", "data": json.dumps(alert)}
@ -353,9 +365,13 @@ async def smart_start(
    body: dict,
    db: aiosqlite.Connection = Depends(get_db),
 ):
-    """Start a standalone SHORT or LONG SMART test on a single drive."""
+    """Start a standalone SHORT or LONG SMART test on a single drive.
-    from app.truenas import TrueNASClient
+
-    from app import burnin as _burnin
+    Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+
    where the REST smart/test endpoint no longer exists.
    Falls back to TrueNAS REST API for older versions.
    """
    from app import burnin as _burnin, ssh_client
    test_type = (body.get("type") or "").upper()
    if test_type not in ("SHORT", "LONG"):
@ -367,17 +383,42 @@ async def smart_start(
        raise HTTPException(status_code=404, detail="Drive not found")
    devname = row[0]
-    # Use the shared TrueNAS client held by the burnin module
+    now = datetime.now(timezone.utc).isoformat()
-    client = _burnin._client
+    ttype_lower = test_type.lower()
    if client is None:
        raise HTTPException(status_code=503, detail="TrueNAS client not ready")
-    try:
+    if ssh_client.is_configured():
-        tn_job_id = await client.start_smart_test([devname], test_type)
+        # SSH path — works on TrueNAS SCALE 25.10+ and CORE
-    except Exception as exc:
+        try:
-        raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
+            output = await ssh_client.start_smart_test(devname, test_type)
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"SSH error: {exc}")
-    return {"job_id": tn_job_id, "devname": devname, "type": test_type}
+        # Mark as running in DB (truenas_job_id=NULL signals SSH-managed test)
        # Store smartctl start output as proof the test was initiated
        await db.execute(
            """INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output)
               VALUES (?,?,?,?,?,?)
               ON CONFLICT(drive_id, test_type) DO UPDATE SET
                   state='running', percent=0, truenas_job_id=NULL,
                   started_at=excluded.started_at, finished_at=NULL, error_text=NULL,
                   raw_output=excluded.raw_output""",
            (drive_id, ttype_lower, "running", 0, now, output),
        )
        await db.commit()
        from app import poller as _poller
        _poller._notify_subscribers()
        return {"devname": devname, "type": test_type, "message": output[:200]}
    else:
        # REST path — older TrueNAS CORE / SCALE versions
        client = _burnin._client
        if client is None:
            raise HTTPException(status_code=503, detail="TrueNAS client not ready")
        try:
            tn_job_id = await client.start_smart_test([devname], test_type)
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
        return {"job_id": tn_job_id, "devname": devname, "type": test_type}
@router.post("/api/v1/drives/{drive_id}/smart/cancel")
@ -403,28 +444,37 @@ async def smart_cancel(
    if client is None:
        raise HTTPException(status_code=503, detail="TrueNAS client not ready")
-    # Find the running TrueNAS job for this drive/test-type
+    from app import ssh_client
    try:
        jobs = await client.get_smart_jobs()
        tn_job_id = None
        for j in jobs:
            if j.get("state") != "RUNNING":
                continue
            args = j.get("arguments", [])
            if not args or not isinstance(args[0], dict):
                continue
            if devname in args[0].get("disks", []):
                tn_job_id = j["id"]
                break
-        if tn_job_id is None:
+    if ssh_client.is_configured():
-            raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
+        # SSH path — abort via smartctl -X
        try:
            await ssh_client.abort_smart_test(devname)
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}")
    else:
        # REST path — find TrueNAS job and abort it
        try:
            jobs = await client.get_smart_jobs()
            tn_job_id = None
            for j in jobs:
                if j.get("state") != "RUNNING":
                    continue
                args = j.get("arguments", [])
                if not args or not isinstance(args[0], dict):
                    continue
                if devname in args[0].get("disks", []):
                    tn_job_id = j["id"]
                    break
-        await client.abort_job(tn_job_id)
+            if tn_job_id is None:
-    except HTTPException:
+                raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
-        raise
+
-    except Exception as exc:
+            await client.abort_job(tn_job_id)
-        raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
+        except HTTPException:
            raise
        except Exception as exc:
            raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
    # Update local DB state
    now = datetime.now(timezone.utc).isoformat()
--- a/claude-sandbox/truenas-burnin/app/settings_store.py
+++ b/claude-sandbox/truenas-burnin/app/settings_store.py
--- a/claude-sandbox/truenas-burnin/app/ssh_client.py
+++ b/claude-sandbox/truenas-burnin/app/ssh_client.py
@ -38,15 +38,26 @@ SMART_ATTRS: dict[int, tuple[str, bool]] = {
 # ---------------------------------------------------------------------------
 def is_configured() -> bool:
-    """Returns True when SSH credentials are present and usable."""
+    """Returns True when SSH host + at least one auth method is available."""
    import os
    from app.config import settings
-    return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key))
+    if not settings.ssh_host:
        return False
    has_creds = bool(
        settings.ssh_key
        or settings.ssh_password
        or os.path.exists(os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH))
    )
    return has_creds
 # ---------------------------------------------------------------------------
 # Low-level connection
 # ---------------------------------------------------------------------------
 _MOUNTED_KEY_PATH = "/run/secrets/ssh_key"
 async def _connect():
    """Open a single-use SSH connection. Caller must use `async with`."""
    import asyncssh
@ -59,9 +70,17 @@ async def _connect():
        "known_hosts": None,          # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
    }
    if settings.ssh_key:
        # Key material provided via env var (base case)
        kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
-    if settings.ssh_password:
+    elif settings.ssh_password:
        kwargs["password"] = settings.ssh_password
    else:
        # Fall back to mounted key file (preferred for production — no key in env vars)
        import os
        key_path = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)
        if os.path.exists(key_path):
            kwargs["client_keys"] = [key_path]
        # If nothing is configured, asyncssh will attempt agent/default key lookup
    return asyncssh.connect(**kwargs)
@ -228,6 +247,70 @@ async def run_badblocks(
    }
 async def get_system_sensors() -> dict:
    """
    Collect system-level temperatures by running `sensors -j` over SSH.

    On success the result is whatever _parse_sensors_json() produces for the
    command output, i.e. {"cpu_c": int|None, "pch_c": int|None}:
      cpu_c  = CPU package temp (coretemp chip)
      pch_c  = PCH/chipset temp (pch_* chip) — proxy for storage I/O lane thermals

    An empty dict is returned instead when SSH is not configured, when the
    command prints nothing, or when anything raises along the way (logged at
    debug level).
    """
    if is_configured():
        try:
            async with await _connect() as conn:
                completed = await conn.run("sensors -j 2>/dev/null", check=False)
                raw = completed.stdout.strip()
                return _parse_sensors_json(raw) if raw else {}
        except Exception as exc:
            log.debug("get_system_sensors failed: %s", exc)
    return {}
 def _parse_sensors_json(output: str) -> dict:
    import json as _json
    try:
        data = _json.loads(output)
    except Exception:
        return {}
    cpu_c: int | None = None
    pch_c: int | None = None
    for chip_name, chip_data in data.items():
        if not isinstance(chip_data, dict):
            continue
        # CPU package temp — coretemp chip, "Package id N" sensor
        if chip_name.startswith("coretemp") and cpu_c is None:
            for sensor_name, sensor_vals in chip_data.items():
                if not isinstance(sensor_vals, dict):
                    continue
                if "package" in sensor_name.lower():
                    for k, v in sensor_vals.items():
                        if k.endswith("_input") and isinstance(v, (int, float)):
                            cpu_c = int(round(v))
                            break
                if cpu_c is not None:
                    break
        # PCH / chipset temp — manages PCIe lanes including HBA / storage I/O
        elif chip_name.startswith("pch_") and pch_c is None:
            for sensor_name, sensor_vals in chip_data.items():
                if not isinstance(sensor_vals, dict):
                    continue
                for k, v in sensor_vals.items():
                    if k.endswith("_input") and isinstance(v, (int, float)):
                        pch_c = int(round(v))
                        break
                if pch_c is not None:
                    break
    return {"cpu_c": cpu_c, "pch_c": pch_c}
 # ---------------------------------------------------------------------------
 # Parsers
 # ---------------------------------------------------------------------------
@ -275,7 +358,7 @@ def _parse_smartctl(output: str) -> dict:
 def _parse_smart_progress(output: str) -> dict:
    state = "unknown"
-    percent_remaining = 0
+    percent_remaining = None  # None = "in progress but no % line parsed yet"
    lower = output.lower()
--- a/claude-sandbox/truenas-burnin/app/static/app.css
+++ b/claude-sandbox/truenas-burnin/app/static/app.css
@ -1076,6 +1076,56 @@ a.stat-card:hover {
 .stat-passed  .stat-value { color: var(--green); }
 .stat-idle    .stat-value { color: var(--text-muted); }
 /* Vertical separator between drive-count cards and sensor chips.
    align-self / flex-shrink assume the parent stats bar is a flex container. */
 .stats-bar-sep {
  width: 1px;
  height: 36px;
  background: var(--border);
  align-self: center;
  flex-shrink: 0;
 }
 /* Compact sensor chip — CPU / PCH / Thermal.
    Column flex layout stacks the value above its label. */
 .stat-sensor {
  background: var(--bg-card);
  border: 1px solid var(--border);
  border-radius: 8px;
  padding: 6px 12px;
  text-align: center;
  min-width: 52px;
  display: flex;
  flex-direction: column;
  gap: 2px;
 }
 /* Reading shown in the chip; tabular-nums gives every digit the same width */
 .stat-sensor-val {
  font-size: 16px;
  font-weight: 700;
  font-variant-numeric: tabular-nums;
  line-height: 1.1;
 }
 /* Small upper-case caption under the reading */
 .stat-sensor-label {
  font-size: 9px;
  text-transform: uppercase;
  letter-spacing: 0.08em;
  color: var(--text-muted);
  line-height: 1.2;
 }
 /* Thermal pressure states — the -warn / -crit suffix is the thermal_pressure
    value appended to "stat-sensor-thermal-" by the dashboard template and app.js */
 .stat-sensor-thermal-warn {
  border-color: var(--yellow-bd);
  background: var(--yellow-bg);
 }
 .stat-sensor-thermal-warn .stat-sensor-val { color: var(--yellow); }
 .stat-sensor-thermal-crit {
  border-color: var(--red-bd);
  background: var(--red-bg);
 }
 .stat-sensor-thermal-crit .stat-sensor-val { color: var(--red); }
 /* -----------------------------------------------------------------------
   Batch action bar (inside filter-bar)
 ----------------------------------------------------------------------- */
--- a/claude-sandbox/truenas-burnin/app/static/app.js
+++ b/claude-sandbox/truenas-burnin/app/static/app.js
@ -135,14 +135,59 @@
    if (nb) nb.style.display = 'none';
  }
-  // Handle job-alert SSE events for browser notifications
+  // Handle SSE events
  document.addEventListener('htmx:sseMessage', function (e) {
-    if (!e.detail || e.detail.type !== 'job-alert') return;
+    if (!e.detail) return;
-    try {
+    if (e.detail.type === 'job-alert') {
-      handleJobAlert(JSON.parse(e.detail.data));
+      try { handleJobAlert(JSON.parse(e.detail.data)); } catch (_) {}
-    } catch (_) {}
+    } else if (e.detail.type === 'system-sensors') {
      try { handleSystemSensors(JSON.parse(e.detail.data)); } catch (_) {}
    }
  });
  // Apply a "system-sensors" SSE payload to the CPU / PCH / Thermal chips.
  // Chips that are not present in the DOM are left alone; a CPU or PCH chip is
  // only touched when the payload carries a reading for it.
  function handleSystemSensors(data) {
    var temps    = data.system_temps  || {};
    var pressure = data.thermal_pressure || 'ok';
    var warnAt   = data.temp_warn_c   || 46;
    var critAt   = data.temp_crit_c   || 55;

    // Map a temperature to its colour class using the thresholds above.
    function tempClass(c) {
      if (c == null) return '';
      if (c >= critAt) return 'temp-hot';
      if (c >= warnAt) return 'temp-warm';
      return 'temp-cool';
    }

    // Reveal one temperature chip and write its reading, if both exist.
    function showReading(chipId, valId, reading) {
      var chip  = document.getElementById(chipId);
      var value = document.getElementById(valId);
      if (!value || reading == null) return;
      if (chip) chip.hidden = false;
      value.textContent = reading + '°';
      value.className   = 'stat-sensor-val ' + tempClass(reading);
    }

    showReading('sensor-cpu', 'sensor-cpu-val', temps.cpu_c);
    showReading('sensor-pch', 'sensor-pch-val', temps.pch_c);

    // Thermal pressure chip — visible only while pressure is warn or crit
    var thermalChip = document.getElementById('sensor-thermal');
    var thermalVal  = document.getElementById('sensor-thermal-val');
    if (!thermalChip || !thermalVal) return;
    if (pressure !== 'warn' && pressure !== 'crit') {
      thermalChip.hidden = true;
      return;
    }
    thermalChip.hidden = false;
    thermalChip.className = 'stat-sensor stat-sensor-thermal stat-sensor-thermal-' + pressure;
    thermalVal.textContent = pressure === 'warn' ? 'WARM' : 'HOT';
  }
  function handleJobAlert(data) {
    var isPass   = data.state === 'passed';
    var icon     = isPass ? '✓' : '✕';
--- a/claude-sandbox/truenas-burnin/app/templates/audit.html
+++ b/claude-sandbox/truenas-burnin/app/templates/audit.html
--- a/claude-sandbox/truenas-burnin/app/templates/components/drives_table.html
+++ b/claude-sandbox/truenas-burnin/app/templates/components/drives_table.html
--- a/claude-sandbox/truenas-burnin/app/templates/components/modal_batch.html
+++ b/claude-sandbox/truenas-burnin/app/templates/components/modal_batch.html
--- a/claude-sandbox/truenas-burnin/app/templates/components/modal_start.html
+++ b/claude-sandbox/truenas-burnin/app/templates/components/modal_start.html
--- a/claude-sandbox/truenas-burnin/app/templates/dashboard.html
+++ b/claude-sandbox/truenas-burnin/app/templates/dashboard.html
@ -6,7 +6,7 @@
 {% include "components/modal_start.html" %}
 {% include "components/modal_batch.html" %}
-<!-- Stats bar — counts are updated live by app.js updateCounts() -->
+<!-- Stats bar — drive counts updated live by app.js updateCounts(); sensor chips updated by SSE system-sensors event -->
 <div class="stats-bar">
  <div class="stat-card" data-stat-filter="all">
    <span class="stat-value" id="stat-all">{{ drives | length }}</span>
@ -28,6 +28,33 @@
    <span class="stat-value" id="stat-idle">0</span>
    <span class="stat-label">Idle</span>
  </div>
  {%- set st = poller.system_temps if (poller and poller.system_temps) else {} %}
  {%- if st.get('cpu_c') is not none or st.get('pch_c') is not none %}
  <div class="stats-bar-sep"></div>
  {%- if st.get('cpu_c') is not none %}
  <div class="stat-sensor" id="sensor-cpu">
    <span class="stat-sensor-val {{ st.get('cpu_c') | temp_class }}" id="sensor-cpu-val">{{ st.get('cpu_c') }}°</span>
    <span class="stat-sensor-label">CPU</span>
  </div>
  {%- endif %}
  {%- if st.get('pch_c') is not none %}
  <div class="stat-sensor" id="sensor-pch">
    <span class="stat-sensor-val {{ st.get('pch_c') | temp_class }}" id="sensor-pch-val">{{ st.get('pch_c') }}°</span>
    <span class="stat-sensor-label">PCH</span>
  </div>
  {%- endif %}
  {%- endif %}
  {%- set tp = poller.thermal_pressure if poller else 'ok' %}
  <div class="stat-sensor stat-sensor-thermal stat-sensor-thermal-{{ tp }}"
       id="sensor-thermal"
       {% if not tp or tp == 'ok' %}hidden{% endif %}>
    <span class="stat-sensor-val" id="sensor-thermal-val">
      {%- if tp == 'warn' %}WARM{%- elif tp == 'crit' %}HOT{%- else %}OK{%- endif %}
    </span>
    <span class="stat-sensor-label">Thermal</span>
  </div>
 </div>
 <!-- Failed drive banner — shown/hidden by JS when failed count > 0 -->
--- a/claude-sandbox/truenas-burnin/app/templates/history.html
+++ b/claude-sandbox/truenas-burnin/app/templates/history.html
--- a/claude-sandbox/truenas-burnin/app/templates/job_detail.html
+++ b/claude-sandbox/truenas-burnin/app/templates/job_detail.html
--- a/claude-sandbox/truenas-burnin/app/templates/job_print.html
+++ b/claude-sandbox/truenas-burnin/app/templates/job_print.html
--- a/claude-sandbox/truenas-burnin/app/templates/layout.html
+++ b/claude-sandbox/truenas-burnin/app/templates/layout.html
--- a/claude-sandbox/truenas-burnin/app/templates/settings.html
+++ b/claude-sandbox/truenas-burnin/app/templates/settings.html
--- a/claude-sandbox/truenas-burnin/app/templates/stats.html
+++ b/claude-sandbox/truenas-burnin/app/templates/stats.html
--- a/claude-sandbox/truenas-burnin/app/terminal.py
+++ b/claude-sandbox/truenas-burnin/app/terminal.py
@ -50,12 +50,19 @@ async def handle(ws: WebSocket) -> None:
    elif settings.ssh_password:
        connect_kw["password"] = settings.ssh_password
    else:
-        await _send(ws,
+        # Fall back to mounted key file (same logic as ssh_client._connect)
-            b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
+        import os
-            b"Set a password or private key in Settings.\r\n"
+        from app import ssh_client as _sc
-        )
+        key_path = os.environ.get("SSH_KEY_FILE", _sc._MOUNTED_KEY_PATH)
-        await ws.close(1008)
+        if os.path.exists(key_path):
-        return
+            connect_kw["client_keys"] = [key_path]
        else:
            await _send(ws,
                b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "
                b"Set a password or private key in Settings.\r\n"
            )
            await ws.close(1008)
            return
    await _send(ws,
        f"\r\n\x1b[36mConnecting to {settings.ssh_host}\u2026\x1b[0m\r\n".encode()
--- a/claude-sandbox/truenas-burnin/app/truenas.py
+++ b/claude-sandbox/truenas-burnin/app/truenas.py
@ -65,7 +65,13 @@ class TrueNASClient:
            "get_disks",
        )
        r.raise_for_status()
-        return r.json()
+        disks = r.json()
        # Filter out expired records — TrueNAS keeps historical entries for removed
        # disks with expiretime set. Only return currently-present drives.
        active = [d for d in disks if not d.get("expiretime")]
        if len(active) < len(disks):
            log.debug("get_disks: filtered %d expired record(s)", len(disks) - len(active))
        return active
    async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
        params: dict = {"method": "smart.test"}
@ -110,3 +116,49 @@ class TrueNASClient:
        )
        r.raise_for_status()
        return r.json()
    async def get_disk_temperatures(self) -> dict[str, float | None]:
        """
        Fetch the current temperature reading for every disk.
        Sends POST /api/v2.0/disk/temperatures with an empty JSON body, routed
        through _with_retry, and returns the decoded JSON response, documented
        as a {devname: celsius | None} mapping.
        The endpoint is available on TrueNAS SCALE 25.10+. On CORE it answers
        404/405, which surfaces here as an error from raise_for_status(); the
        caller should catch it and skip temperature collection.
        """
        def _request():
            # Builds a fresh request each time _with_retry invokes it.
            return self._client.post("/api/v2.0/disk/temperatures", json={})

        response = await _with_retry(_request, "get_disk_temperatures")
        response.raise_for_status()
        return response.json()
    async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
        """
        Kick off a wipe job for one disk and return the TrueNAS job ID.
        Deliberately NOT wrapped in a retry: re-sending the request after a
        partial failure could launch a second wipe on the same disk.
        devname: device basename only, e.g. "ada0" (not "/dev/ada0")
        mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros),
              or "FULL_RANDOM" (write random)
        Raises whatever raise_for_status() raises for an error response.
        """
        payload = {"dev": devname, "mode": mode}
        response = await self._client.post("/api/v2.0/disk/wipe", json=payload)
        response.raise_for_status()
        # The decoded response body is returned as-is; it is documented to be
        # the job ID.
        return response.json()
    async def get_job(self, job_id: int) -> dict | None:
        """
        Look up one TrueNAS job by its ID.
        Queries GET /api/v2.0/core/get_jobs with a JSON-encoded
        [["id", "=", job_id]] filter, routed through _with_retry.
        Returns the first job dict in the response, or None when the response
        is not a non-empty list (i.e. the job was not found).
        Raises whatever raise_for_status() raises for an error response.
        """
        import json as _json

        # Filter expression is serialised once; job_id does not change
        # between attempts.
        filters = _json.dumps([["id", "=", job_id]])

        def _request():
            return self._client.get(
                "/api/v2.0/core/get_jobs",
                params={"filters": filters},
            )

        response = await _with_retry(_request, f"get_job({job_id})")
        response.raise_for_status()
        payload = response.json()
        # Anything other than a non-empty list is treated as "no such job".
        if not isinstance(payload, list) or not payload:
            return None
        return payload[0]
--- a/claude-sandbox/truenas-burnin/docker-compose.yml
+++ b/claude-sandbox/truenas-burnin/docker-compose.yml
@ -0,0 +1,23 @@
 services:
  # mock-truenas is kept for local dev — not started in production
  # To use mock mode: uncomment the service below, then run
  #   docker compose --profile mock up
  # mock-truenas:
  #   build: ./mock-truenas
  #   container_name: mock-truenas
  #   ports:
  #     - "8000:8000"
  #   profiles: [mock]
  #   restart: unless-stopped
  # Burn-in web app; listens on 8084 and reads its settings from .env.
  app:
    build: .
    container_name: truenas-burnin
    ports:
      - "8084:8084"
    env_file: .env
    volumes:
      - ./data:/data
      # Templates and static assets are bind-mounted from the working tree.
      - ./app/templates:/opt/app/app/templates
      - ./app/static:/opt/app/app/static
      # Host SSH private key, mounted read-only at /run/secrets/ssh_key.
      # Set SSH_KEY_HOST_PATH (shell environment or the project .env) to point
      # at a different key on the host; the default keeps the original path.
      - "${SSH_KEY_HOST_PATH:-/home/brandon/.ssh/id_ed25519}:/run/secrets/ssh_key:ro"
    restart: unless-stopped
--- a/claude-sandbox/truenas-burnin/mock-truenas/Dockerfile
+++ b/claude-sandbox/truenas-burnin/mock-truenas/Dockerfile
--- a/claude-sandbox/truenas-burnin/mock-truenas/app.py
+++ b/claude-sandbox/truenas-burnin/mock-truenas/app.py
--- a/claude-sandbox/truenas-burnin/requirements.txt
+++ b/claude-sandbox/truenas-burnin/requirements.txt
@ -1,5 +1,5 @@
 fastapi
-uvicorn
+uvicorn[standard]
 aiosqlite
 httpx
 pydantic-settings
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,21 +0,0 @@
 services:
  mock-truenas:
    build: ./mock-truenas
    container_name: mock-truenas
    ports:
      - "8000:8000"
    restart: unless-stopped
  app:
    build: .
    container_name: truenas-burnin
    ports:
      - "8084:8084"
    env_file: .env
    volumes:
      - ./data:/data
      - ./app/templates:/opt/app/app/templates
      - ./app/static:/opt/app/app/static
    depends_on:
      - mock-truenas
    restart: unless-stopped