# nas-burnin/app/truenas.py

import asyncio
import logging
from collections.abc import Callable, Coroutine
from typing import Any, TypeVar

import httpx

from app.config import settings

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Result type of the coroutine produced by the factory passed to _with_retry.
T = TypeVar("T")

# Exceptions that are safe to retry (transient network issues).
# _with_retry catches exactly these; any other exception propagates from it
# without a retry.
_RETRYABLE = (
    httpx.ConnectError,
    httpx.TimeoutException,
    httpx.RemoteProtocolError,
    httpx.ReadError,
)


async def _with_retry(
    factory: Callable[[], Coroutine[Any, Any, T]],
    label: str,
    max_attempts: int = 3,
) -> T:
    """
    Call factory() to get a fresh coroutine and await it, retrying with
    exponential backoff on transient failures.

    A factory (not a bare coroutine) is required so each attempt gets a
    new coroutine object — an already-awaited coroutine cannot be reused.

    Args:
        factory: Zero-argument callable returning a new coroutine per call.
        label: Operation name included in the retry warning log message.
        max_attempts: Total number of attempts (first try included);
            must be at least 1.

    Returns:
        The result of the first factory() coroutine that completes
        without raising.

    Raises:
        ValueError: If max_attempts is less than 1.
        Exception: The _RETRYABLE error from the final attempt is re-raised
            unchanged; any exception not in _RETRYABLE propagates
            immediately without a retry.
    """
    # Without this guard, max_attempts < 1 would skip the loop entirely and
    # surface as a misleading "unreachable" RuntimeError without ever having
    # called factory().
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    # Delay before the next attempt, in seconds: 1, 2, 4, ...
    backoff = 1.0
    for attempt in range(1, max_attempts + 1):
        try:
            return await factory()
        except _RETRYABLE as exc:
            if attempt == max_attempts:
                # Out of attempts: let the caller see the original error.
                raise
            log.warning(
                "TrueNAS %s transient error (attempt %d/%d): %s — retrying in %.0fs",
                label, attempt, max_attempts, exc, backoff,
            )
            await asyncio.sleep(backoff)
            backoff *= 2
    # Unreachable: max_attempts >= 1 is enforced above, so the loop either
    # returns on success or re-raises on the final attempt. The explicit
    # raise makes that obvious to type-checkers and to anyone reading
    # top-down without tracing the control flow.
    raise RuntimeError("unreachable: _with_retry exhausted without returning")


class TrueNASClient:
    def __init__(self) -> None:
        self._client = httpx.AsyncClient(
            base_url=settings.truenas_base_url,
            headers={"Authorization": f"Bearer {settings.truenas_api_key}"},
            verify=settings.truenas_verify_tls,
            timeout=10.0,
        )

    async def close(self) -> None:
        await self._client.aclose()

    async def get_disks(self) -> list[dict]:
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/disk"),
            "get_disks",
        )
        r.raise_for_status()
        disks = r.json()
        # Filter out expired records — TrueNAS keeps historical entries for removed
        # disks with expiretime set. Only return currently-present drives.
        active = [d for d in disks if not d.get("expiretime")]
        if len(active) < len(disks):
            log.debug("get_disks: filtered %d expired record(s)", len(disks) - len(active))
        return active

    async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
        params: dict = {"method": "smart.test"}
        if state:
            params["state"] = state
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/core/get_jobs", params=params),
            "get_smart_jobs",
        )
        r.raise_for_status()
        return r.json()

    async def get_smart_results(self, devname: str) -> list[dict]:
        r = await _with_retry(
            lambda: self._client.get(f"/api/v2.0/smart/test/results/{devname}"),
            f"get_smart_results({devname})",
        )
        r.raise_for_status()
        return r.json()

    async def start_smart_test(self, disks: list[str], test_type: str) -> int:
        """Start a SMART test. Not retried — a duplicate start would launch a second job."""
        r = await self._client.post(
            "/api/v2.0/smart/test",
            json={"disks": disks, "type": test_type},
        )
        r.raise_for_status()
        return r.json()

    async def abort_job(self, job_id: int) -> None:
        """Abort a TrueNAS job. Not retried — best-effort cancel."""
        r = await self._client.post(
            "/api/v2.0/core/job_abort",
            json={"id": job_id},
        )
        r.raise_for_status()

    async def get_system_info(self) -> dict:
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/system/info"),
            "get_system_info",
        )
        r.raise_for_status()
        return r.json()

    async def get_disk_temperatures(self) -> dict[str, float | None]:
        """
        Returns {devname: celsius | None}.
        Uses POST /api/v2.0/disk/temperatures — available on TrueNAS SCALE 25.10+.
        CORE compatibility: raises on 404/405, caller should catch and skip.
        """
        r = await _with_retry(
            lambda: self._client.post("/api/v2.0/disk/temperatures", json={}),
            "get_disk_temperatures",
        )
        r.raise_for_status()
        return r.json()

    async def wipe_disk(self, devname: str, mode: str = "FULL") -> int:
        """
        Start a disk wipe job. Not retried — duplicate starts would launch a second wipe.
        mode: "QUICK" (wipe MBR/partitions only), "FULL" (write zeros), "FULL_RANDOM" (write random)
        devname: basename only, e.g. "ada0" (not "/dev/ada0")
        Returns the TrueNAS job ID.
        """
        r = await self._client.post(
            "/api/v2.0/disk/wipe",
            json={"dev": devname, "mode": mode},
        )
        r.raise_for_status()
        return r.json()

    async def get_job(self, job_id: int) -> dict | None:
        """
        Fetch a single TrueNAS job by ID.
        Returns the job dict, or None if not found.
        """
        import json as _json
        r = await _with_retry(
            lambda: self._client.get(
                "/api/v2.0/core/get_jobs",
                params={"filters": _json.dumps([["id", "=", job_id]])},
            ),
            f"get_job({job_id})",
        )
        r.raise_for_status()
        jobs = r.json()
        if isinstance(jobs, list) and jobs:
            return jobs[0]
        return None