# truenas-burnin/app/truenas.py

import asyncio
import logging
from collections.abc import Callable, Coroutine
from typing import Any, TypeVar

import httpx

from app.config import settings

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

# Generic result type so _with_retry can return whatever the wrapped
# coroutine produces.
T = TypeVar("T")

# httpx exceptions treated as transient network failures and therefore safe
# to retry: failure to establish a connection, any timeout, a protocol
# violation by the remote side, and a failure while receiving data.
# Used as the exception tuple in _with_retry's `except` clause.
_RETRYABLE = (
    httpx.ConnectError,
    httpx.TimeoutException,
    httpx.RemoteProtocolError,
    httpx.ReadError,
)


async def _with_retry(
    factory: Callable[[], Coroutine[Any, Any, T]],
    label: str,
    max_attempts: int = 3,
) -> T:
    """
    Call factory() to get a fresh coroutine and await it, retrying with
    exponential backoff on transient failures.

    A factory (not a bare coroutine) is required so each attempt gets a
    new coroutine object — an already-awaited coroutine cannot be reused.
    """
    backoff = 1.0
    for attempt in range(1, max_attempts + 1):
        try:
            return await factory()
        except _RETRYABLE as exc:
            if attempt == max_attempts:
                raise
            log.warning(
                "TrueNAS %s transient error (attempt %d/%d): %s — retrying in %.0fs",
                label, attempt, max_attempts, exc, backoff,
            )
            await asyncio.sleep(backoff)
            backoff *= 2


class TrueNASClient:
    def __init__(self) -> None:
        self._client = httpx.AsyncClient(
            base_url=settings.truenas_base_url,
            headers={"Authorization": f"Bearer {settings.truenas_api_key}"},
            verify=settings.truenas_verify_tls,
            timeout=10.0,
        )

    async def close(self) -> None:
        await self._client.aclose()

    async def get_disks(self) -> list[dict]:
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/disk"),
            "get_disks",
        )
        r.raise_for_status()
        return r.json()

    async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
        params: dict = {"method": "smart.test"}
        if state:
            params["state"] = state
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/core/get_jobs", params=params),
            "get_smart_jobs",
        )
        r.raise_for_status()
        return r.json()

    async def get_smart_results(self, devname: str) -> list[dict]:
        r = await _with_retry(
            lambda: self._client.get(f"/api/v2.0/smart/test/results/{devname}"),
            f"get_smart_results({devname})",
        )
        r.raise_for_status()
        return r.json()

    async def start_smart_test(self, disks: list[str], test_type: str) -> int:
        """Start a SMART test. Not retried — a duplicate start would launch a second job."""
        r = await self._client.post(
            "/api/v2.0/smart/test",
            json={"disks": disks, "type": test_type},
        )
        r.raise_for_status()
        return r.json()

    async def abort_job(self, job_id: int) -> None:
        """Abort a TrueNAS job. Not retried — best-effort cancel."""
        r = await self._client.post(
            "/api/v2.0/core/job_abort",
            json={"id": job_id},
        )
        r.raise_for_status()

    async def get_system_info(self) -> dict:
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/system/info"),
            "get_system_info",
        )
        r.raise_for_status()
        return r.json()