truenas-burnin/app/poller.py
Brandon Walter b73b5251ae Initial commit — TrueNAS Burn-In Dashboard v0.5.0
Full-stack burn-in orchestration dashboard (Stages 1–6d complete):
FastAPI backend, SQLite/WAL, SSE live dashboard, mock TrueNAS server,
SMTP/webhook notifications, batch burn-in, settings UI, audit log,
stats page, cancel SMART/burn-in, drag-to-reorder stages.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 00:08:29 -05:00

290 lines
9.2 KiB
Python

"""
Polling loop — fetches TrueNAS state every POLL_INTERVAL_SECONDS and
normalizes it into SQLite.
Design notes:
- Opens its own DB connection per cycle (WAL allows concurrent readers).
- Skips a cycle if TrueNAS is unreachable; marks poller unhealthy.
- Never overwrites a 'running' state with stale history.
"""
import asyncio
import logging
from datetime import datetime, timezone, timedelta
from typing import Any
import aiosqlite
from app.config import settings
from app.truenas import TrueNASClient
log = logging.getLogger(__name__)
# Shared state read by the /health endpoint
_state: dict[str, Any] = {
"last_poll_at": None,  # ISO-8601 UTC timestamp of the last successful poll
"last_error": None,  # str(exception) from the most recent failed cycle, else None
"healthy": False,  # True after a successful cycle; False after any failure
"drives_seen": 0,  # drive count returned by the last successful poll_cycle
"consecutive_failures": 0,  # reset to 0 on success; drives log escalation in run()
}
# SSE subscriber queues — notified after each successful poll
_subscribers: list[asyncio.Queue] = []
def get_state() -> dict:
    """Return a shallow snapshot of the poller's health state.

    Read by the /health endpoint; handing out a copy keeps callers from
    mutating the module-level ``_state`` dict.
    """
    snapshot = dict(_state)
    return snapshot
def subscribe() -> asyncio.Queue:
    """Register a new SSE listener and return its queue.

    The queue holds at most one pending payload: a client that falls
    behind simply misses updates instead of buffering them.
    """
    listener: asyncio.Queue = asyncio.Queue(maxsize=1)
    _subscribers.append(listener)
    return listener
def unsubscribe(q: asyncio.Queue) -> None:
    """Remove a listener queue; a queue that was never registered is ignored."""
    if q in _subscribers:
        _subscribers.remove(q)
def _notify_subscribers(alert: dict | None = None) -> None:
    """Push ``{"alert": alert}`` to every registered SSE queue.

    Iterates over a snapshot of the subscriber list so unsubscribe()
    during delivery is safe; a full queue means the client has not
    drained the previous payload, so this update is dropped for it.
    """
    message = {"alert": alert}
    for listener in list(_subscribers):
        try:
            listener.put_nowait(message)
        except asyncio.QueueFull:
            continue  # slow client — skip this update
def _now() -> str:
return datetime.now(timezone.utc).isoformat()
def _eta_from_progress(percent: int, started_iso: str | None) -> str | None:
"""Linear ETA extrapolation from elapsed time and percent complete."""
if not started_iso or percent <= 0:
return None
try:
start = datetime.fromisoformat(started_iso)
if start.tzinfo is None:
start = start.replace(tzinfo=timezone.utc)
elapsed = (datetime.now(timezone.utc) - start).total_seconds()
total_est = elapsed / (percent / 100)
remaining = max(0.0, total_est - elapsed)
return (datetime.now(timezone.utc) + timedelta(seconds=remaining)).isoformat()
except Exception:
return None
def _map_history_state(status: str) -> str:
return "passed" if "without error" in status.lower() else "failed"
# ---------------------------------------------------------------------------
# DB helpers
# ---------------------------------------------------------------------------
async def _upsert_drive(db: aiosqlite.Connection, disk: dict, now: str) -> int:
await db.execute(
"""
INSERT INTO drives
(truenas_disk_id, devname, serial, model, size_bytes,
temperature_c, smart_health, last_seen_at, last_polled_at)
VALUES (?,?,?,?,?,?,?,?,?)
ON CONFLICT(truenas_disk_id) DO UPDATE SET
temperature_c = excluded.temperature_c,
smart_health = excluded.smart_health,
last_seen_at = excluded.last_seen_at,
last_polled_at = excluded.last_polled_at
""",
(
disk["identifier"],
disk["devname"],
disk.get("serial"),
disk.get("model"),
disk.get("size"),
disk.get("temperature"),
disk.get("smart_health", "UNKNOWN"),
now,
now,
),
)
cur = await db.execute(
"SELECT id FROM drives WHERE truenas_disk_id = ?", (disk["identifier"],)
)
row = await cur.fetchone()
return row["id"]
async def _upsert_test(db: aiosqlite.Connection, drive_id: int, ttype: str, data: dict) -> None:
    """Insert or update the single smart_tests row for (drive_id, test_type).

    ``data`` keys: state (required), percent, truenas_job_id, started_at,
    eta_at, finished_at, error_text — optional keys fall back via .get().

    started_at uses COALESCE so a sync that passes None (e.g. a history
    result) does not erase the start time recorded while the job was
    running; every other column is overwritten with the incoming value.
    """
    await db.execute(
        """
INSERT INTO smart_tests
(drive_id, test_type, state, percent, truenas_job_id,
started_at, eta_at, finished_at, error_text)
VALUES (?,?,?,?,?,?,?,?,?)
ON CONFLICT(drive_id, test_type) DO UPDATE SET
state = excluded.state,
percent = excluded.percent,
truenas_job_id = excluded.truenas_job_id,
started_at = COALESCE(excluded.started_at, smart_tests.started_at),
eta_at = excluded.eta_at,
finished_at = excluded.finished_at,
error_text = excluded.error_text
""",
        (
            drive_id,
            ttype,
            data["state"],
            data.get("percent", 0),
            data.get("truenas_job_id"),
            data.get("started_at"),
            data.get("eta_at"),
            data.get("finished_at"),
            data.get("error_text"),
        ),
    )
async def _apply_running_job(
    db: aiosqlite.Connection, drive_id: int, ttype: str, job: dict
) -> None:
    """Record a currently-running SMART job for this drive and test type.

    Derives the ETA from the job's start time and percent complete, and
    clears any previous finished/error fields while the job is live.
    """
    percent_done = job["progress"]["percent"]
    started = job.get("time_started")
    record = {
        "state": "running",
        "percent": percent_done,
        "truenas_job_id": job["id"],
        "started_at": started,
        "eta_at": _eta_from_progress(percent_done, started),
        "finished_at": None,
        "error_text": None,
    }
    await _upsert_test(db, drive_id, ttype, record)
async def _sync_history(
    db: aiosqlite.Connection,
    client: TrueNASClient,
    drive_id: int,
    devname: str,
    ttype: str,
) -> None:
    """Pull the most recent completed test of this type from history.

    Only invoked when (drive, type) is absent from the active running-jobs
    index, so overwriting a stale 'running' row is safe — the job has
    either finished or was never started.
    """
    try:
        history = await client.get_smart_results(devname)
    except Exception:
        # Best-effort: a history fetch failure must not abort the poll cycle.
        return
    if not history:
        return
    for entry in history[0].get("tests", []):
        entry_is_short = "short" in entry.get("type", "").lower()
        if entry_is_short != (ttype == "short"):
            continue  # not the test type we're syncing
        result = _map_history_state(entry.get("status", ""))
        await _upsert_test(db, drive_id, ttype, {
            "state": result,
            "percent": 100 if result == "passed" else 0,
            "truenas_job_id": None,
            "started_at": None,
            "eta_at": None,
            "finished_at": None,
            "error_text": entry.get("status_verbose") if result == "failed" else None,
        })
        # History is newest-first; only the most recent result matters.
        break
# ---------------------------------------------------------------------------
# Poll cycle
# ---------------------------------------------------------------------------
async def poll_cycle(client: "TrueNASClient") -> int:
    """Run one full poll. Returns the number of drives seen.

    Fetches the disk list and RUNNING SMART jobs from TrueNAS, then for
    each drive upserts its row plus one smart_tests row per short/long
    type: 'running' when an active job covers it, otherwise the latest
    completed result is synced from history.

    Raises: propagates TrueNAS client / DB errors to the caller (run()
    counts them as a failed cycle).
    """
    now = _now()
    disks = await client.get_disks()
    running_jobs = await client.get_smart_jobs(state="RUNNING")
    # Index running jobs by (devname, test_type).
    # Fix: a single SMART job may target several disks; the old code
    # indexed only args["disks"][0], so additional disks in the same job
    # never showed as running. Index every disk listed in the job.
    active: dict[tuple[str, str], dict] = {}
    for job in running_jobs:
        try:
            args = job["arguments"][0]
            ttype = args["type"].lower()
            for devname in args["disks"]:
                active[(devname, ttype)] = job
        except (KeyError, IndexError, TypeError):
            pass  # malformed job payload — skip it rather than abort the cycle
    async with aiosqlite.connect(settings.db_path) as db:
        db.row_factory = aiosqlite.Row
        await db.execute("PRAGMA journal_mode=WAL")  # readers stay concurrent with this writer
        await db.execute("PRAGMA foreign_keys=ON")
        for disk in disks:
            devname = disk["devname"]
            drive_id = await _upsert_drive(db, disk, now)
            for ttype in ("short", "long"):
                if (devname, ttype) in active:
                    await _apply_running_job(db, drive_id, ttype, active[(devname, ttype)])
                else:
                    await _sync_history(db, client, drive_id, devname, ttype)
        await db.commit()
    return len(disks)
# ---------------------------------------------------------------------------
# Background loop
# ---------------------------------------------------------------------------
async def run(client: TrueNASClient) -> None:
    """Background poll loop: poll, update health state, notify, sleep.

    Runs forever.  A successful cycle refreshes ``_state`` and wakes SSE
    subscribers; a failed cycle increments the failure counter and marks
    the poller unhealthy.  Errors never terminate the loop.
    """
    log.info("Poller started", extra={"poll_interval": settings.poll_interval_seconds})
    cycle = 0
    while True:
        try:
            count = await poll_cycle(client)
            cycle += 1
            _state["last_poll_at"] = _now()
            _state["last_error"] = None
            _state["healthy"] = True
            _state["drives_seen"] = count
            _state["consecutive_failures"] = 0
            log.debug("Poll OK", extra={"drives": count})
            _notify_subscribers()
            # Check for stuck jobs every 5 cycles (~1 min at default 12s interval)
            if cycle % 5 == 0:
                try:
                    # NOTE(review): imported locally rather than at module
                    # top — presumably to break a circular import with
                    # app.burnin; confirm before hoisting.
                    from app import burnin as _burnin
                    await _burnin.check_stuck_jobs()
                except Exception as exc:
                    # Stuck-job check is auxiliary; log and keep polling.
                    log.error("Stuck-job check failed: %s", exc)
        except Exception as exc:
            # Any cycle failure marks the poller unhealthy but keeps looping.
            failures = _state["consecutive_failures"] + 1
            _state["consecutive_failures"] = failures
            _state["last_error"] = str(exc)
            _state["healthy"] = False
            if failures >= 5:
                # Escalate to CRITICAL after 5 straight failures.
                log.critical(
                    "Poller has failed %d consecutive times: %s",
                    failures, exc,
                    extra={"consecutive_failures": failures},
                )
            else:
                log.error("Poll failed: %s", exc, extra={"consecutive_failures": failures})
        await asyncio.sleep(settings.poll_interval_seconds)