Compare commits

..

No commits in common. "main" and "25d4622aa4cb877de079a3bdf5825b20359701ca" have entirely different histories.

39 changed files with 676 additions and 78 deletions

View file

@ -209,7 +209,7 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) | | `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
| `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this | | `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails above this |
| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) | | `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) |
| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge | | `APP_VERSION` | `1.0.0-8` | Displayed in header version badge |
| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) | | `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
| `SSH_PORT` | `22` | TrueNAS SSH port | | `SSH_PORT` | `22` | TrueNAS SSH port |
| `SSH_USER` | `root` | TrueNAS SSH username | | `SSH_USER` | `root` | TrueNAS SSH username |

View file

@ -206,10 +206,45 @@ async def cancel_job(job_id: int, operator: str) -> bool:
# Job runner # Job runner
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def _thermal_gate_ok() -> bool:
    """True if it's thermally safe to start a new burn-in.

    Checks the peak temperature of drives currently under active burn-in.
    Returns True when no running drive reports a temperature, or when the
    hottest one is below the warning threshold.
    """
    try:
        async with _db() as db:
            # Peak temp across drives attached to a currently-running burn-in job.
            cur = await db.execute("""
                SELECT MAX(d.temperature_c)
                FROM drives d
                JOIN burnin_jobs bj ON bj.drive_id = d.id
                WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
            """)
            row = await cur.fetchone()
            # MAX() over an empty set yields a single NULL row — treat as "no data".
            max_temp = row[0] if row and row[0] is not None else None
            return max_temp is None or max_temp < settings.temp_warn_c
    except Exception:
        return True  # Never block on error — the gate is best-effort only
async def _run_job(job_id: int) -> None: async def _run_job(job_id: int) -> None:
"""Acquire semaphore slot, execute all stages, persist final state.""" """Acquire semaphore slot, execute all stages, persist final state."""
assert _semaphore is not None, "burnin.init() not called" assert _semaphore is not None, "burnin.init() not called"
# Adaptive thermal gate: wait before competing for a slot if running drives
# are already at or above the warning threshold. This prevents layering a
# new burn-in on top of a thermally-stressed system. Gives up after 3 min
# and proceeds anyway so jobs don't queue indefinitely.
for _attempt in range(18): # 18 × 10 s = 3 min max
if await _thermal_gate_ok():
break
if _attempt == 0:
log.info(
"Thermal gate: job %d waiting — running drive temps at or above %d°C",
job_id, settings.temp_warn_c,
)
await asyncio.sleep(10)
else:
log.warning("Thermal gate timed out for job %d — proceeding anyway", job_id)
async with _semaphore: async with _semaphore:
if await _is_cancelled(job_id): if await _is_cancelled(job_id):
return return
@ -519,15 +554,39 @@ async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage
# "unknown" → keep polling # "unknown" → keep polling
async def _badblocks_available() -> bool:
    """Probe the remote host for the badblocks binary (Linux/SCALE only).

    Returns False on any SSH/connection failure as well — callers treat
    "can't tell" the same as "not installed".
    """
    from app import ssh_client

    try:
        async with await ssh_client._connect() as conn:
            probe = await conn.run("which badblocks", check=False)
            return probe.returncode == 0
    except Exception:
        return False
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool: async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
""" """
Surface validation stage. Surface validation stage auto-routes to the right implementation:
SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}.
Mock mode: simulated timed progress (no real I/O). 1. SSH configured + badblocks available (TrueNAS SCALE / Linux):
runs badblocks -wsv -b 4096 -p 1 /dev/{devname} directly over SSH.
2. SSH configured + badblocks NOT available (TrueNAS CORE / FreeBSD):
uses TrueNAS REST API disk.wipe FULL job + post-wipe SMART check.
3. No SSH:
simulated timed progress (dev/mock mode).
""" """
from app import ssh_client from app import ssh_client
if ssh_client.is_configured(): if ssh_client.is_configured():
if await _badblocks_available():
return await _stage_surface_validate_ssh(job_id, devname, drive_id) return await _stage_surface_validate_ssh(job_id, devname, drive_id)
# TrueNAS CORE/FreeBSD: badblocks not available — use native wipe API
await _append_stage_log(
job_id, "surface_validate",
"[INFO] badblocks not found on host (TrueNAS CORE/FreeBSD) — "
"using TrueNAS disk.wipe API (FULL write pass).\n\n"
)
return await _stage_surface_validate_truenas(job_id, devname, drive_id)
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds) return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
@ -655,6 +714,116 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
return True return True
async def _stage_surface_validate_truenas(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation via TrueNAS CORE disk.wipe REST API.

    Used on FreeBSD (TrueNAS CORE) where badblocks is unavailable.
    Sends a FULL write-zero pass across the entire disk, polls progress,
    then runs a post-wipe SMART attribute check to catch reallocated sectors.

    Returns True on success; False on start failure, cancellation, wipe
    failure, or a failed post-wipe SMART check (stage error recorded first).
    """
    from app import ssh_client

    await _append_stage_log(
        job_id, "surface_validate",
        f"[START] TrueNAS disk.wipe FULL — {devname}\n"
        f"[NOTE] DESTRUCTIVE: all data on {devname} will be overwritten.\n\n"
    )

    # Start the wipe job
    try:
        tn_job_id = await _client.wipe_disk(devname, "FULL")
    except Exception as exc:
        await _set_stage_error(job_id, "surface_validate", f"Failed to start disk.wipe: {exc}")
        return False

    await _append_stage_log(
        job_id, "surface_validate",
        f"[JOB] TrueNAS wipe job started (job_id={tn_job_id})\n"
    )

    # Poll until complete
    log_flush_counter = 0
    while True:
        if await _is_cancelled(job_id):
            # Best-effort abort of the remote job; cancellation wins either way.
            try:
                await _client.abort_job(tn_job_id)
            except Exception:
                pass
            return False

        await asyncio.sleep(POLL_INTERVAL)

        try:
            job = await _client.get_job(tn_job_id)
        except Exception as exc:
            # Transient poll failure — log it and retry on the next interval.
            log.warning("Wipe job poll failed: %s", exc, extra={"job_id": job_id})
            await _append_stage_log(job_id, "surface_validate", f"[poll error] {exc}\n")
            continue

        if not job:
            await _set_stage_error(job_id, "surface_validate", f"Wipe job {tn_job_id} not found")
            return False

        state = job.get("state", "")
        # BUGFIX: "progress" can be present-but-null before the job starts
        # moving; `or {}` prevents AttributeError on None.get().
        progress = job.get("progress") or {}
        pct = int(progress.get("percent", 0) or 0)
        desc = progress.get("description", "")

        await _update_stage_percent(job_id, "surface_validate", min(pct, 99))
        await _recalculate_progress(job_id)
        _push_update()

        # Log progress description every ~5 polls to avoid DB spam
        log_flush_counter += 1
        if desc and log_flush_counter % 5 == 0:
            await _append_stage_log(job_id, "surface_validate", f"[{pct}%] {desc}\n")

        if state == "SUCCESS":
            await _update_stage_percent(job_id, "surface_validate", 100)
            await _append_stage_log(
                job_id, "surface_validate",
                f"\n[DONE] Wipe job {tn_job_id} completed successfully.\n"
            )
            # Post-wipe SMART check — catch any sectors that failed under write stress
            if ssh_client.is_configured() and drive_id is not None:
                await _append_stage_log(
                    job_id, "surface_validate",
                    "[CHECK] Running post-wipe SMART attribute check...\n"
                )
                try:
                    attrs = await ssh_client.get_smart_attributes(devname)
                    await _store_smart_attrs(drive_id, attrs)
                    if attrs["failures"]:
                        error = "Post-wipe SMART check: " + "; ".join(attrs["failures"])
                        await _set_stage_error(job_id, "surface_validate", error)
                        return False
                    if attrs["warnings"]:
                        await _append_stage_log(
                            job_id, "surface_validate",
                            "[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
                        )
                    await _append_stage_log(
                        job_id, "surface_validate",
                        f"[CHECK] SMART health: {attrs['health']} — no critical attributes.\n"
                    )
                except Exception as exc:
                    # SMART readback is supplementary — the wipe itself succeeded.
                    log.warning("Post-wipe SMART check failed: %s", exc)
                    await _append_stage_log(
                        job_id, "surface_validate",
                        f"[WARN] Post-wipe SMART check failed (non-fatal): {exc}\n"
                    )
            return True

        elif state in ("FAILED", "ABORTED", "ERROR"):
            error_msg = job.get("error") or f"Disk wipe failed (state={state})"
            await _set_stage_error(
                job_id, "surface_validate",
                f"TrueNAS disk.wipe FAILED: {error_msg}"
            )
            return False
        # RUNNING or WAITING — keep polling
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool: async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
"""Simulate a timed stage with progress updates (mock / dev mode).""" """Simulate a timed stage with progress updates (mock / dev mode)."""
start = time.monotonic() start = time.monotonic()

View file

@ -68,7 +68,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers) ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint # Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-7" app_version: str = "1.0.0-8"
settings = Settings() settings = Settings()

View file

@ -20,13 +20,15 @@ from app.truenas import TrueNASClient
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Shared state read by the /health endpoint # Shared state read by the /health endpoint and dashboard template
_state: dict[str, Any] = { _state: dict[str, Any] = {
"last_poll_at": None, "last_poll_at": None,
"last_error": None, "last_error": None,
"healthy": False, "healthy": False,
"drives_seen": 0, "drives_seen": 0,
"consecutive_failures": 0, "consecutive_failures": 0,
"system_temps": {}, # {"cpu_c": int|None, "pch_c": int|None}
"thermal_pressure": "ok", # "ok" | "warn" | "crit" — based on running burn-in drive temps
} }
# SSE subscriber queues — notified after each successful poll # SSE subscriber queues — notified after each successful poll
@ -208,6 +210,67 @@ async def _sync_history(
# Poll cycle # Poll cycle
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def _poll_smart_via_ssh(db: aiosqlite.Connection, now: str) -> None:
    """
    Poll progress for SMART tests started via SSH (truenas_job_id IS NULL).

    Used on TrueNAS SCALE 25.10+ where the REST smart/test API no longer exists.

    Args:
        db: open aiosqlite connection (caller owns its lifecycle; this
            function commits any row updates it makes).
        now: ISO timestamp written as finished_at for tests that completed.
    """
    from app import ssh_client

    # Nothing to do without SSH credentials.
    if not ssh_client.is_configured():
        return

    # SSH-managed tests are distinguished by a NULL truenas_job_id.
    cur = await db.execute(
        """SELECT st.id, st.test_type, st.drive_id, d.devname, st.started_at
           FROM smart_tests st
           JOIN drives d ON d.id = st.drive_id
           WHERE st.state = 'running' AND st.truenas_job_id IS NULL"""
    )
    rows = await cur.fetchall()
    if not rows:
        return

    for row in rows:
        test_id, ttype, drive_id, devname, started_at = row[0], row[1], row[2], row[3], row[4]
        try:
            progress = await ssh_client.poll_smart_progress(devname)
        except Exception as exc:
            # One unreachable drive must not stall polling of the others.
            log.warning("SSH SMART poll failed for %s: %s", devname, exc)
            continue

        state = progress["state"]
        pct_remaining = progress.get("percent_remaining")  # None = not yet in output
        raw_output = progress.get("output", "")

        if state == "running":
            # pct_remaining=None means smartctl output doesn't have the % line yet
            # (test just started) — keep percent at 0 rather than jumping to 100
            if pct_remaining is None:
                pct = 0
            else:
                pct = max(0, 100 - pct_remaining)
            eta = _eta_from_progress(pct, started_at)
            await db.execute(
                "UPDATE smart_tests SET percent=?, eta_at=?, raw_output=? WHERE id=?",
                (pct, eta, raw_output, test_id),
            )
        elif state == "passed":
            await db.execute(
                "UPDATE smart_tests SET state='passed', percent=100, finished_at=?, raw_output=? WHERE id=?",
                (now, raw_output, test_id),
            )
            log.info("SSH SMART %s passed on %s", ttype, devname)
        elif state == "failed":
            await db.execute(
                "UPDATE smart_tests SET state='failed', percent=0, finished_at=?, "
                "error_text=?, raw_output=? WHERE id=?",
                (now, f"SMART {ttype.upper()} test failed", raw_output, test_id),
            )
            log.warning("SSH SMART %s FAILED on %s", ttype, devname)
        # state == "unknown" → keep polling, no update

    await db.commit()
async def poll_cycle(client: TrueNASClient) -> int: async def poll_cycle(client: TrueNASClient) -> int:
"""Run one full poll. Returns number of drives seen.""" """Run one full poll. Returns number of drives seen."""
now = _now() now = _now()
@ -215,6 +278,20 @@ async def poll_cycle(client: TrueNASClient) -> int:
disks = await client.get_disks() disks = await client.get_disks()
running_jobs = await client.get_smart_jobs(state="RUNNING") running_jobs = await client.get_smart_jobs(state="RUNNING")
# Fetch temperatures via SCALE-specific endpoint.
# CORE doesn't have this endpoint — silently skip on any error.
try:
temps = await client.get_disk_temperatures()
except Exception:
temps = {}
# Inject temperature into each disk dict (SCALE 25.10 has no temp in /disk)
for disk in disks:
devname = disk.get("devname", "")
t = temps.get(devname)
if t is not None:
disk["temperature"] = int(round(t))
# Index running jobs by (devname, test_type) # Index running jobs by (devname, test_type)
active: dict[tuple[str, str], dict] = {} active: dict[tuple[str, str], dict] = {}
for job in running_jobs: for job in running_jobs:
@ -243,6 +320,9 @@ async def poll_cycle(client: TrueNASClient) -> int:
await db.commit() await db.commit()
# SSH SMART polling — for tests started via smartctl (no TrueNAS REST job)
await _poll_smart_via_ssh(db, now)
return len(disks) return len(disks)
@ -263,6 +343,39 @@ async def run(client: TrueNASClient) -> None:
_state["drives_seen"] = count _state["drives_seen"] = count
_state["consecutive_failures"] = 0 _state["consecutive_failures"] = 0
log.debug("Poll OK", extra={"drives": count}) log.debug("Poll OK", extra={"drives": count})
# System sensor temps via SSH (non-fatal)
from app import ssh_client as _ssh
if _ssh.is_configured():
try:
_state["system_temps"] = await _ssh.get_system_sensors()
except Exception:
pass
# Thermal pressure: max temp of drives currently under burn-in
try:
async with aiosqlite.connect(settings.db_path) as _tdb:
_tdb.row_factory = aiosqlite.Row
await _tdb.execute("PRAGMA journal_mode=WAL")
_cur = await _tdb.execute("""
SELECT MAX(d.temperature_c)
FROM drives d
JOIN burnin_jobs bj ON bj.drive_id = d.id
WHERE bj.state = 'running' AND d.temperature_c IS NOT NULL
""")
_row = await _cur.fetchone()
_max_t = _row[0] if _row and _row[0] is not None else None
if _max_t is None:
_state["thermal_pressure"] = "ok"
elif _max_t >= settings.temp_crit_c:
_state["thermal_pressure"] = "crit"
elif _max_t >= settings.temp_warn_c:
_state["thermal_pressure"] = "warn"
else:
_state["thermal_pressure"] = "ok"
except Exception:
_state["thermal_pressure"] = "ok"
_notify_subscribers() _notify_subscribers()
# Check for stuck jobs every 5 cycles (~1 min at default 12s interval) # Check for stuck jobs every 5 cycles (~1 min at default 12s interval)

View file

@ -218,6 +218,18 @@ async def sse_drives(request: Request):
yield {"event": "drives-update", "data": html} yield {"event": "drives-update", "data": html}
# Push system sensor state so JS can update temp chips live
ps = poller.get_state()
yield {
"event": "system-sensors",
"data": json.dumps({
"system_temps": ps.get("system_temps", {}),
"thermal_pressure": ps.get("thermal_pressure", "ok"),
"temp_warn_c": settings.temp_warn_c,
"temp_crit_c": settings.temp_crit_c,
}),
}
# Push browser notification event if this was a job completion # Push browser notification event if this was a job completion
if alert: if alert:
yield {"event": "job-alert", "data": json.dumps(alert)} yield {"event": "job-alert", "data": json.dumps(alert)}
@ -353,9 +365,13 @@ async def smart_start(
body: dict, body: dict,
db: aiosqlite.Connection = Depends(get_db), db: aiosqlite.Connection = Depends(get_db),
): ):
"""Start a standalone SHORT or LONG SMART test on a single drive.""" """Start a standalone SHORT or LONG SMART test on a single drive.
from app.truenas import TrueNASClient
from app import burnin as _burnin Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+
where the REST smart/test endpoint no longer exists.
Falls back to TrueNAS REST API for older versions.
"""
from app import burnin as _burnin, ssh_client
test_type = (body.get("type") or "").upper() test_type = (body.get("type") or "").upper()
if test_type not in ("SHORT", "LONG"): if test_type not in ("SHORT", "LONG"):
@ -367,16 +383,41 @@ async def smart_start(
raise HTTPException(status_code=404, detail="Drive not found") raise HTTPException(status_code=404, detail="Drive not found")
devname = row[0] devname = row[0]
# Use the shared TrueNAS client held by the burnin module now = datetime.now(timezone.utc).isoformat()
ttype_lower = test_type.lower()
if ssh_client.is_configured():
# SSH path — works on TrueNAS SCALE 25.10+ and CORE
try:
output = await ssh_client.start_smart_test(devname, test_type)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"SSH error: {exc}")
# Mark as running in DB (truenas_job_id=NULL signals SSH-managed test)
# Store smartctl start output as proof the test was initiated
await db.execute(
"""INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output)
VALUES (?,?,?,?,?,?)
ON CONFLICT(drive_id, test_type) DO UPDATE SET
state='running', percent=0, truenas_job_id=NULL,
started_at=excluded.started_at, finished_at=NULL, error_text=NULL,
raw_output=excluded.raw_output""",
(drive_id, ttype_lower, "running", 0, now, output),
)
await db.commit()
from app import poller as _poller
_poller._notify_subscribers()
return {"devname": devname, "type": test_type, "message": output[:200]}
else:
# REST path — older TrueNAS CORE / SCALE versions
client = _burnin._client client = _burnin._client
if client is None: if client is None:
raise HTTPException(status_code=503, detail="TrueNAS client not ready") raise HTTPException(status_code=503, detail="TrueNAS client not ready")
try: try:
tn_job_id = await client.start_smart_test([devname], test_type) tn_job_id = await client.start_smart_test([devname], test_type)
except Exception as exc: except Exception as exc:
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}") raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
return {"job_id": tn_job_id, "devname": devname, "type": test_type} return {"job_id": tn_job_id, "devname": devname, "type": test_type}
@ -403,7 +444,16 @@ async def smart_cancel(
if client is None: if client is None:
raise HTTPException(status_code=503, detail="TrueNAS client not ready") raise HTTPException(status_code=503, detail="TrueNAS client not ready")
# Find the running TrueNAS job for this drive/test-type from app import ssh_client
if ssh_client.is_configured():
# SSH path — abort via smartctl -X
try:
await ssh_client.abort_smart_test(devname)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}")
else:
# REST path — find TrueNAS job and abort it
try: try:
jobs = await client.get_smart_jobs() jobs = await client.get_smart_jobs()
tn_job_id = None tn_job_id = None

View file

@ -38,15 +38,26 @@ SMART_ATTRS: dict[int, tuple[str, bool]] = {
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def is_configured() -> bool: def is_configured() -> bool:
"""Returns True when SSH credentials are present and usable.""" """Returns True when SSH host + at least one auth method is available."""
import os
from app.config import settings from app.config import settings
return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key)) if not settings.ssh_host:
return False
has_creds = bool(
settings.ssh_key
or settings.ssh_password
or os.path.exists(os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH))
)
return has_creds
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Low-level connection # Low-level connection
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
_MOUNTED_KEY_PATH = "/run/secrets/ssh_key"
async def _connect(): async def _connect():
"""Open a single-use SSH connection. Caller must use `async with`.""" """Open a single-use SSH connection. Caller must use `async with`."""
import asyncssh import asyncssh
@ -59,9 +70,17 @@ async def _connect():
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false) "known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
} }
if settings.ssh_key: if settings.ssh_key:
# Key material provided via env var (base case)
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)] kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
if settings.ssh_password: elif settings.ssh_password:
kwargs["password"] = settings.ssh_password kwargs["password"] = settings.ssh_password
else:
# Fall back to mounted key file (preferred for production — no key in env vars)
import os
key_path = os.environ.get("SSH_KEY_FILE", _MOUNTED_KEY_PATH)
if os.path.exists(key_path):
kwargs["client_keys"] = [key_path]
# If nothing is configured, asyncssh will attempt agent/default key lookup
return asyncssh.connect(**kwargs) return asyncssh.connect(**kwargs)
@ -228,6 +247,70 @@ async def run_badblocks(
} }
async def get_system_sensors() -> dict:
    """
    Run `sensors -j` on TrueNAS and extract system-level temperatures.

    Returns {"cpu_c": int|None, "pch_c": int|None}; {} when SSH is not
    configured, the command emits nothing, or anything at all fails.

    cpu_c = CPU package temp (coretemp chip)
    pch_c = PCH/chipset temp (pch_* chip) proxy for storage I/O lane thermals
    """
    if not is_configured():
        return {}
    try:
        async with await _connect() as conn:
            result = await conn.run("sensors -j 2>/dev/null", check=False)
            raw = result.stdout.strip()
            return _parse_sensors_json(raw) if raw else {}
    except Exception as exc:
        # Non-fatal by design: missing lm-sensors or a flaky link just
        # means no system temps this cycle.
        log.debug("get_system_sensors failed: %s", exc)
        return {}
def _parse_sensors_json(output: str) -> dict:
import json as _json
try:
data = _json.loads(output)
except Exception:
return {}
cpu_c: int | None = None
pch_c: int | None = None
for chip_name, chip_data in data.items():
if not isinstance(chip_data, dict):
continue
# CPU package temp — coretemp chip, "Package id N" sensor
if chip_name.startswith("coretemp") and cpu_c is None:
for sensor_name, sensor_vals in chip_data.items():
if not isinstance(sensor_vals, dict):
continue
if "package" in sensor_name.lower():
for k, v in sensor_vals.items():
if k.endswith("_input") and isinstance(v, (int, float)):
cpu_c = int(round(v))
break
if cpu_c is not None:
break
# PCH / chipset temp — manages PCIe lanes including HBA / storage I/O
elif chip_name.startswith("pch_") and pch_c is None:
for sensor_name, sensor_vals in chip_data.items():
if not isinstance(sensor_vals, dict):
continue
for k, v in sensor_vals.items():
if k.endswith("_input") and isinstance(v, (int, float)):
pch_c = int(round(v))
break
if pch_c is not None:
break
return {"cpu_c": cpu_c, "pch_c": pch_c}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Parsers # Parsers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -275,7 +358,7 @@ def _parse_smartctl(output: str) -> dict:
def _parse_smart_progress(output: str) -> dict: def _parse_smart_progress(output: str) -> dict:
state = "unknown" state = "unknown"
percent_remaining = 0 percent_remaining = None # None = "in progress but no % line parsed yet"
lower = output.lower() lower = output.lower()

View file

@ -1076,6 +1076,56 @@ a.stat-card:hover {
.stat-passed .stat-value { color: var(--green); } .stat-passed .stat-value { color: var(--green); }
.stat-idle .stat-value { color: var(--text-muted); } .stat-idle .stat-value { color: var(--text-muted); }
/* Vertical separator between drive-count cards and sensor chips */
.stats-bar-sep {
width: 1px;
height: 36px;
background: var(--border);
align-self: center;
flex-shrink: 0;
}
/* Compact sensor chip — CPU / PCH / Thermal */
.stat-sensor {
background: var(--bg-card);
border: 1px solid var(--border);
border-radius: 8px;
padding: 6px 12px;
text-align: center;
min-width: 52px;
display: flex;
flex-direction: column;
gap: 2px;
}
.stat-sensor-val {
font-size: 16px;
font-weight: 700;
font-variant-numeric: tabular-nums;
line-height: 1.1;
}
.stat-sensor-label {
font-size: 9px;
text-transform: uppercase;
letter-spacing: 0.08em;
color: var(--text-muted);
line-height: 1.2;
}
/* Thermal pressure states */
.stat-sensor-thermal-warn {
border-color: var(--yellow-bd);
background: var(--yellow-bg);
}
.stat-sensor-thermal-warn .stat-sensor-val { color: var(--yellow); }
.stat-sensor-thermal-crit {
border-color: var(--red-bd);
background: var(--red-bg);
}
.stat-sensor-thermal-crit .stat-sensor-val { color: var(--red); }
/* ----------------------------------------------------------------------- /* -----------------------------------------------------------------------
Batch action bar (inside filter-bar) Batch action bar (inside filter-bar)
----------------------------------------------------------------------- */ ----------------------------------------------------------------------- */

View file

@ -135,14 +135,59 @@
if (nb) nb.style.display = 'none'; if (nb) nb.style.display = 'none';
} }
// Handle job-alert SSE events for browser notifications // Handle SSE events
document.addEventListener('htmx:sseMessage', function (e) { document.addEventListener('htmx:sseMessage', function (e) {
if (!e.detail || e.detail.type !== 'job-alert') return; if (!e.detail) return;
try { if (e.detail.type === 'job-alert') {
handleJobAlert(JSON.parse(e.detail.data)); try { handleJobAlert(JSON.parse(e.detail.data)); } catch (_) {}
} catch (_) {} } else if (e.detail.type === 'system-sensors') {
try { handleSystemSensors(JSON.parse(e.detail.data)); } catch (_) {}
}
}); });
function handleSystemSensors(data) {
  // Payload of the "system-sensors" SSE event: live system temps plus the
  // server-computed thermal pressure level and the configured thresholds.
  var st = data.system_temps || {};
  var tp = data.thermal_pressure || 'ok';
  var warn = data.temp_warn_c || 46;
  var crit = data.temp_crit_c || 55;

  // Map a temperature to its color class; null/undefined gets no class.
  function tempClass(c) {
    if (c == null) return '';
    return c >= crit ? 'temp-hot' : c >= warn ? 'temp-warm' : 'temp-cool';
  }

  // CPU chip — unhide and update when a reading is present.
  var cpuChip = document.getElementById('sensor-cpu');
  var cpuVal = document.getElementById('sensor-cpu-val');
  if (cpuVal && st.cpu_c != null) {
    if (cpuChip) cpuChip.hidden = false;
    cpuVal.textContent = st.cpu_c + '°';  // fixed mis-encoded degree sign
    cpuVal.className = 'stat-sensor-val ' + tempClass(st.cpu_c);
  }

  // PCH chip
  var pchChip = document.getElementById('sensor-pch');
  var pchVal = document.getElementById('sensor-pch-val');
  if (pchVal && st.pch_c != null) {
    if (pchChip) pchChip.hidden = false;
    pchVal.textContent = st.pch_c + '°';  // fixed mis-encoded degree sign
    pchVal.className = 'stat-sensor-val ' + tempClass(st.pch_c);
  }

  // Thermal pressure chip — only visible in warn/crit states.
  var tChip = document.getElementById('sensor-thermal');
  var tVal = document.getElementById('sensor-thermal-val');
  if (tChip && tVal) {
    if (tp === 'warn' || tp === 'crit') {
      tChip.hidden = false;
      tChip.className = 'stat-sensor stat-sensor-thermal stat-sensor-thermal-' + tp;
      tVal.textContent = tp === 'warn' ? 'WARM' : 'HOT';
    } else {
      tChip.hidden = true;
    }
  }
}
function handleJobAlert(data) { function handleJobAlert(data) {
var isPass = data.state === 'passed'; var isPass = data.state === 'passed';
var icon = isPass ? '✓' : '✕'; var icon = isPass ? '✓' : '✕';

View file

@ -6,7 +6,7 @@
{% include "components/modal_start.html" %} {% include "components/modal_start.html" %}
{% include "components/modal_batch.html" %} {% include "components/modal_batch.html" %}
<!-- Stats bar — counts are updated live by app.js updateCounts() --> <!-- Stats bar — drive counts updated live by app.js updateCounts(); sensor chips updated by SSE system-sensors event -->
<div class="stats-bar"> <div class="stats-bar">
<div class="stat-card" data-stat-filter="all"> <div class="stat-card" data-stat-filter="all">
<span class="stat-value" id="stat-all">{{ drives | length }}</span> <span class="stat-value" id="stat-all">{{ drives | length }}</span>
@ -28,6 +28,33 @@
<span class="stat-value" id="stat-idle">0</span> <span class="stat-value" id="stat-idle">0</span>
<span class="stat-label">Idle</span> <span class="stat-label">Idle</span>
</div> </div>
{%- set st = poller.system_temps if (poller and poller.system_temps) else {} %}
{%- if st.get('cpu_c') is not none or st.get('pch_c') is not none %}
<div class="stats-bar-sep"></div>
{%- if st.get('cpu_c') is not none %}
<div class="stat-sensor" id="sensor-cpu">
<span class="stat-sensor-val {{ st.get('cpu_c') | temp_class }}" id="sensor-cpu-val">{{ st.get('cpu_c') }}°</span>
<span class="stat-sensor-label">CPU</span>
</div>
{%- endif %}
{%- if st.get('pch_c') is not none %}
<div class="stat-sensor" id="sensor-pch">
<span class="stat-sensor-val {{ st.get('pch_c') | temp_class }}" id="sensor-pch-val">{{ st.get('pch_c') }}°</span>
<span class="stat-sensor-label">PCH</span>
</div>
{%- endif %}
{%- endif %}
{%- set tp = poller.thermal_pressure if poller else 'ok' %}
<div class="stat-sensor stat-sensor-thermal stat-sensor-thermal-{{ tp }}"
id="sensor-thermal"
{% if not tp or tp == 'ok' %}hidden{% endif %}>
<span class="stat-sensor-val" id="sensor-thermal-val">
{%- if tp == 'warn' %}WARM{%- elif tp == 'crit' %}HOT{%- else %}OK{%- endif %}
</span>
<span class="stat-sensor-label">Thermal</span>
</div>
</div> </div>
<!-- Failed drive banner — shown/hidden by JS when failed count > 0 --> <!-- Failed drive banner — shown/hidden by JS when failed count > 0 -->

View file

@ -49,6 +49,13 @@ async def handle(ws: WebSocket) -> None:
return return
elif settings.ssh_password: elif settings.ssh_password:
connect_kw["password"] = settings.ssh_password connect_kw["password"] = settings.ssh_password
else:
# Fall back to mounted key file (same logic as ssh_client._connect)
import os
from app import ssh_client as _sc
key_path = os.environ.get("SSH_KEY_FILE", _sc._MOUNTED_KEY_PATH)
if os.path.exists(key_path):
connect_kw["client_keys"] = [key_path]
else: else:
await _send(ws, await _send(ws,
b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m " b"\r\n\x1b[33mNo SSH credentials configured.\x1b[0m "

View file

@ -65,7 +65,13 @@ class TrueNASClient:
"get_disks", "get_disks",
) )
r.raise_for_status() r.raise_for_status()
return r.json() disks = r.json()
# Filter out expired records — TrueNAS keeps historical entries for removed
# disks with expiretime set. Only return currently-present drives.
active = [d for d in disks if not d.get("expiretime")]
if len(active) < len(disks):
log.debug("get_disks: filtered %d expired record(s)", len(disks) - len(active))
return active
async def get_smart_jobs(self, state: str | None = None) -> list[dict]: async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
params: dict = {"method": "smart.test"} params: dict = {"method": "smart.test"}
@ -110,3 +116,49 @@ class TrueNASClient:
) )
r.raise_for_status() r.raise_for_status()
return r.json() return r.json()
    async def get_disk_temperatures(self) -> dict[str, float | None]:
        """
        Returns {devname: celsius | None}.

        Uses POST /api/v2.0/disk/temperatures available on TrueNAS SCALE 25.10+.
        CORE compatibility: raises on 404/405, caller should catch and skip.
        """
        # Empty JSON body = "all disks"; transient errors retried by _with_retry.
        r = await _with_retry(
            lambda: self._client.post("/api/v2.0/disk/temperatures", json={}),
            "get_disk_temperatures",
        )
        r.raise_for_status()
        return r.json()
    async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
        """
        Start a disk wipe job. Not retried — duplicate starts would launch a second wipe.

        Args:
            devname: basename only, e.g. "ada0" (not "/dev/ada0")
            mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros),
                  "FULL_RANDOM" (write random)

        Returns the TrueNAS job ID.
        """
        # Deliberately bypasses _with_retry (see docstring).
        r = await self._client.post(
            "/api/v2.0/disk/wipe",
            json={"dev": devname, "mode": mode},
        )
        r.raise_for_status()
        return r.json()
    async def get_job(self, job_id: int) -> dict | None:
        """
        Fetch a single TrueNAS job by ID.

        Returns the job dict, or None if not found.
        """
        import json as _json
        # core/get_jobs takes its filter list as a JSON-encoded query param.
        r = await _with_retry(
            lambda: self._client.get(
                "/api/v2.0/core/get_jobs",
                params={"filters": _json.dumps([["id", "=", job_id]])},
            ),
            f"get_job({job_id})",
        )
        r.raise_for_status()
        jobs = r.json()
        # API returns a list of matches; a single-ID filter yields 0 or 1.
        if isinstance(jobs, list) and jobs:
            return jobs[0]
        return None

View file

@ -0,0 +1,23 @@
services:
  # mock-truenas is kept for local dev — not started in production
  # To use mock mode: docker compose --profile mock up
  # mock-truenas:
  #   build: ./mock-truenas
  #   container_name: mock-truenas
  #   ports:
  #     - "8000:8000"
  #   profiles: [mock]
  #   restart: unless-stopped

  app:
    build: .
    container_name: truenas-burnin
    ports:
      - "8084:8084"
    env_file: .env
    volumes:
      - ./data:/data
      # Templates/static bind-mounted for live edits without a rebuild
      - ./app/templates:/opt/app/app/templates
      - ./app/static:/opt/app/app/static
      # SSH key mounted read-only at the path the app's ssh_client falls back to
      # NOTE(review): hard-coded user home path — not portable across hosts; verify
      - /home/brandon/.ssh/id_ed25519:/run/secrets/ssh_key:ro
    restart: unless-stopped

View file

@ -1,5 +1,5 @@
fastapi fastapi
uvicorn uvicorn[standard]
aiosqlite aiosqlite
httpx httpx
pydantic-settings pydantic-settings

View file

@ -1,21 +0,0 @@
services:
mock-truenas:
build: ./mock-truenas
container_name: mock-truenas
ports:
- "8000:8000"
restart: unless-stopped
app:
build: .
container_name: truenas-burnin
ports:
- "8084:8084"
env_file: .env
volumes:
- ./data:/data
- ./app/templates:/opt/app/app/templates
- ./app/static:/opt/app/app/static
depends_on:
- mock-truenas
restart: unless-stopped