"""Async HTTP client for the TrueNAS REST API (v2.0) with retry on transient errors."""

import asyncio
import logging
from collections.abc import Callable, Coroutine
from typing import Any, TypeVar

import httpx

from app.config import settings

log = logging.getLogger(__name__)

T = TypeVar("T")

# Exceptions that are safe to retry (transient network issues)
_RETRYABLE = (
    httpx.ConnectError,
    httpx.TimeoutException,
    httpx.RemoteProtocolError,
    httpx.ReadError,
)


async def _with_retry(
    factory: Callable[[], Coroutine[Any, Any, T]],
    label: str,
    max_attempts: int = 3,
) -> T:
    """Await ``factory()``, retrying with exponential backoff on transient errors.

    A factory (not a bare coroutine) is required so that each attempt gets a
    new coroutine object; an already-awaited coroutine cannot be reused.

    Args:
        factory: Zero-argument callable returning a fresh coroutine per call.
        label: Short name of the operation, used only in the warning log.
        max_attempts: Total number of attempts (not retries); must be >= 1.

    Returns:
        Whatever the coroutine produced by ``factory`` returns.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        Exception: The last exception from ``_RETRYABLE`` once attempts are
            exhausted; any other exception propagates immediately.
    """
    if max_attempts < 1:
        # Without this guard the loop below would never run and the function
        # would silently return None, violating the declared return type.
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    backoff = 1.0  # seconds; doubled after every failed attempt
    for attempt in range(1, max_attempts + 1):
        try:
            return await factory()
        except _RETRYABLE as exc:
            if attempt == max_attempts:
                # Out of attempts: surface the original error to the caller.
                raise
            log.warning(
                "TrueNAS %s transient error (attempt %d/%d): %s — retrying in %.0fs",
                label,
                attempt,
                max_attempts,
                exc,
                backoff,
            )
            await asyncio.sleep(backoff)
            backoff *= 2

    # Every iteration either returns or raises on the final attempt.
    raise AssertionError("unreachable: retry loop exits via return or raise")


class TrueNASClient:
    """Thin async wrapper around the TrueNAS v2.0 REST endpoints used by the app.

    Read-only GET requests go through ``_with_retry``; state-changing POST
    requests are sent exactly once. Every method calls ``raise_for_status()``,
    so an HTTP error status surfaces as ``httpx.HTTPStatusError``.

    The instance owns an ``httpx.AsyncClient``; call ``close()`` when done, or
    use the instance as an async context manager (``async with``).
    """

    def __init__(self) -> None:
        """Build the underlying HTTP client from the application settings."""
        self._client = httpx.AsyncClient(
            base_url=settings.truenas_base_url,
            headers={"Authorization": f"Bearer {settings.truenas_api_key}"},
            verify=settings.truenas_verify_tls,
            timeout=10.0,
        )

    async def __aenter__(self) -> "TrueNASClient":
        """Return self so the client can be used with ``async with``."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the underlying HTTP client when leaving the ``async with`` block."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    async def get_disks(self) -> list[dict]:
        """GET ``/api/v2.0/disk`` (retried) and return the decoded JSON body."""
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/disk"),
            "get_disks",
        )
        r.raise_for_status()
        return r.json()

    async def get_smart_jobs(self, state: str | None = None) -> list[dict]:
        """GET ``/api/v2.0/core/get_jobs`` filtered to ``method=smart.test`` (retried).

        Args:
            state: Optional value sent as the ``state`` query parameter; it is
                omitted from the request when falsy (``None`` or ``""``).

        Returns:
            The decoded JSON response body.
        """
        params: dict = {"method": "smart.test"}
        if state:
            params["state"] = state
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/core/get_jobs", params=params),
            "get_smart_jobs",
        )
        r.raise_for_status()
        return r.json()

    async def get_smart_results(self, devname: str) -> list[dict]:
        """GET ``/api/v2.0/smart/test/results/{devname}`` (retried).

        Args:
            devname: Inserted verbatim into the URL path.
                NOTE(review): presumably a device name such as "sda"; it is
                not URL-escaped here, so confirm callers never pass
                untrusted input.

        Returns:
            The decoded JSON response body.
        """
        r = await _with_retry(
            lambda: self._client.get(f"/api/v2.0/smart/test/results/{devname}"),
            f"get_smart_results({devname})",
        )
        r.raise_for_status()
        return r.json()

    async def start_smart_test(self, disks: list[str], test_type: str) -> int:
        """Start a SMART test. Not retried — a duplicate start would launch a second job.

        POSTs ``{"disks": disks, "type": test_type}`` to ``/api/v2.0/smart/test``.

        Returns:
            The decoded JSON response body (annotated ``int``; presumably the
            id of the started job — confirm against the TrueNAS API).
        """
        r = await self._client.post(
            "/api/v2.0/smart/test",
            json={"disks": disks, "type": test_type},
        )
        r.raise_for_status()
        return r.json()

    async def abort_job(self, job_id: int) -> None:
        """Abort a TrueNAS job. Not retried — best-effort cancel.

        POSTs ``{"id": job_id}`` to ``/api/v2.0/core/job_abort``.
        """
        r = await self._client.post(
            "/api/v2.0/core/job_abort",
            json={"id": job_id},
        )
        r.raise_for_status()

    async def get_system_info(self) -> dict:
        """GET ``/api/v2.0/system/info`` (retried) and return the decoded JSON body."""
        r = await _with_retry(
            lambda: self._client.get("/api/v2.0/system/info"),
            "get_system_info",
        )
        r.raise_for_status()
        return r.json()