nas-burnin/app/routes/drives.py
Brandon Walter 383258df97
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run
Security scan / mypy (push) Waiting to run
feat: phase caption + bad-block badge + per-pattern history (1.0.0-47)
Three additions to the surface_validate drawer block:

1. **Phase caption** below the meters: "Pattern 2 of 4 · Verify 0x55
   · 47% within phase". Pure JS — no schema change. Makes the
   visual grammar explicit without needing the operator to mentally
   map phase=4 to "verifying pattern 2".

2. **Bad-block badge** in the vitals row. Green at 0, red at >0.
   The number was already on the stage row but burying it in the
   log felt wrong — surfacing it next to temp/speed/ETA keeps it
   in eye-line during long runs.

3. **Per-pattern duration history** below the caption. New
   bb_phase_history JSON column (idempotent migration) maps
   {phase_num: ts}. Parser stamps the timestamp on every phase
   transition (and on stage entry for phase 1). Drawer diffs
   consecutive write-phase starts to derive "0xaa: 14h 22m"
   for completed patterns. Once one pattern is done you can
   predict the rest without leaving the drawer.

Persistence is idempotent: re-entry of the same phase keeps the
original timestamp so a transient parser reset doesn't blow away
history. JSON parse failures fail gracefully (no row rendered).
2026-05-08 23:23:02 -07:00

378 lines
14 KiB
Python

"""Drive endpoints — list, drawer, edit, SMART start/cancel, reset, unlock.
GET /api/v1/drives
GET /api/v1/drives/{id}/drawer
GET /api/v1/drives/{id}
PATCH /api/v1/drives/{id} — notes / location update
POST /api/v1/drives/{id}/smart/start
POST /api/v1/drives/{id}/smart/cancel
POST /api/v1/drives/{id}/reset
POST /api/v1/drives/{id}/unlock — pool-membership lock override
"""
from __future__ import annotations
import json as _json
from datetime import datetime, timezone
import aiosqlite
from fastapi import APIRouter, Depends, HTTPException, Request
from app import auth, burnin, poller
from app.database import get_db
from app.models import (
DriveResponse, UnlockPoolDriveRequest, UpdateDriveRequest,
)
from ._drives_helpers import _DRIVES_QUERY, _row_to_drive
from ._helpers import client_ip, operator_for
router = APIRouter()
@router.get("/api/v1/drives", response_model=list[DriveResponse])
async def list_drives(db: aiosqlite.Connection = Depends(get_db)):
cur = await db.execute(_DRIVES_QUERY.format(where=""))
rows = await cur.fetchall()
return [_row_to_drive(r) for r in rows]
@router.get("/api/v1/drives/{drive_id}/drawer")
async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)):
"""Data for the log drawer — latest burn-in job + stages, SMART tests, audit events."""
cur = await db.execute(_DRIVES_QUERY.format(where="AND d.id = ?"), (drive_id,))
row = await cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Drive not found")
drive = _row_to_drive(row)
# Latest burn-in job + its stages (include log_text and bad_blocks)
cur = await db.execute(
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
(drive_id,),
)
job_row = await cur.fetchone()
burnin_job = None
if job_row:
job = dict(job_row)
cur = await db.execute(
"SELECT id, stage_name, state, percent, started_at, finished_at, "
"duration_seconds, error_text, log_text, bad_blocks, "
"bb_phase, bb_phase_pct, bb_mbps, bb_phase_history "
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
(job_row["id"],),
)
job["stages"] = [dict(r) for r in await cur.fetchall()]
burnin_job = job
# SMART raw output from smart_tests table
cur = await db.execute(
"SELECT test_type, state, percent, started_at, finished_at, error_text, raw_output "
"FROM smart_tests WHERE drive_id=?",
(drive_id,),
)
smart_rows = {r["test_type"]: dict(r) for r in await cur.fetchall()}
# Cached SMART attributes (JSON blob on drives table)
smart_attrs = None
cur = await db.execute("SELECT smart_attrs FROM drives WHERE id=?", (drive_id,))
attrs_row = await cur.fetchone()
if attrs_row and attrs_row["smart_attrs"]:
try:
smart_attrs = _json.loads(attrs_row["smart_attrs"])
except Exception:
pass
# Last 50 audit events for this drive (newest first)
cur = await db.execute("""
SELECT id, event_type, operator, message, created_at
FROM audit_events
WHERE drive_id = ?
ORDER BY id DESC
LIMIT 50
""", (drive_id,))
events = [dict(r) for r in await cur.fetchall()]
def _smart_card(test_type: str) -> dict:
smart_obj = drive.smart_short if test_type == "short" else drive.smart_long
base = smart_obj.model_dump() if smart_obj else {}
row = smart_rows.get(test_type, {})
base["raw_output"] = row.get("raw_output")
return base
return {
"drive": {
"id": drive.id,
"devname": drive.devname,
"serial": drive.serial,
"model": drive.model,
"size_bytes": drive.size_bytes,
"temperature_c": drive.temperature_c,
},
"burnin": burnin_job,
"smart": {
"short": _smart_card("short"),
"long": _smart_card("long"),
"attrs": smart_attrs,
},
"events": events,
}
@router.get("/api/v1/drives/{drive_id}", response_model=DriveResponse)
async def get_drive(drive_id: int, db: aiosqlite.Connection = Depends(get_db)):
cur = await db.execute(
_DRIVES_QUERY.format(where="AND d.id = ?"), (drive_id,)
)
row = await cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Drive not found")
return _row_to_drive(row)
@router.post("/api/v1/drives/{drive_id}/smart/start")
async def smart_start(
drive_id: int,
request: Request,
body: dict,
db: aiosqlite.Connection = Depends(get_db),
):
"""Start a standalone SHORT or LONG SMART test on a single drive.
Uses SSH (smartctl) when configured — required for TrueNAS SCALE 25.10+
where the REST smart/test endpoint no longer exists.
Falls back to TrueNAS REST API for older versions.
"""
from app import ssh_client
test_type = (body.get("type") or "").upper()
if test_type not in ("SHORT", "LONG"):
raise HTTPException(status_code=422, detail="type must be SHORT or LONG")
cur = await db.execute("SELECT devname FROM drives WHERE id=?", (drive_id,))
row = await cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Drive not found")
devname = row[0]
operator = operator_for(request, body.get("operator"))
now = datetime.now(timezone.utc).isoformat()
ttype_lower = test_type.lower()
if ssh_client.is_configured():
# SSH path — works on TrueNAS SCALE 25.10+ and CORE
try:
output = await ssh_client.start_smart_test(devname, test_type)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"SSH error: {exc}")
# Mark as running in DB (truenas_job_id=NULL signals SSH-managed test)
# Store smartctl start output as proof the test was initiated
await db.execute(
"""INSERT INTO smart_tests (drive_id, test_type, state, percent, started_at, raw_output)
VALUES (?,?,?,?,?,?)
ON CONFLICT(drive_id, test_type) DO UPDATE SET
state='running', percent=0, truenas_job_id=NULL,
started_at=excluded.started_at, finished_at=NULL, error_text=NULL,
raw_output=excluded.raw_output""",
(drive_id, ttype_lower, "running", 0, now, output),
)
await db.execute(
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
VALUES (?,?,?,?)""",
("smart_test_start", drive_id, operator,
f"{test_type} SMART test started on {devname}"),
)
await db.commit()
poller._notify_subscribers()
return {"devname": devname, "type": test_type, "message": output[:200]}
else:
# REST path — older TrueNAS CORE / SCALE versions
client = burnin._client
if client is None:
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
try:
tn_job_id = await client.start_smart_test([devname], test_type)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
await db.execute(
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
VALUES (?,?,?,?)""",
("smart_test_start", drive_id, operator,
f"{test_type} SMART test started on {devname}"),
)
await db.commit()
return {"job_id": tn_job_id, "devname": devname, "type": test_type}
@router.post("/api/v1/drives/{drive_id}/smart/cancel")
async def smart_cancel(
drive_id: int,
request: Request,
body: dict,
db: aiosqlite.Connection = Depends(get_db),
):
"""Cancel a running standalone SMART test on a drive."""
test_type = (body.get("type") or "").lower()
if test_type not in ("short", "long"):
raise HTTPException(status_code=422, detail="type must be 'short' or 'long'")
cur = await db.execute("SELECT devname FROM drives WHERE id=?", (drive_id,))
row = await cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Drive not found")
devname = row[0]
operator = operator_for(request, body.get("operator"))
client = burnin._client
if client is None:
raise HTTPException(status_code=503, detail="TrueNAS client not ready")
from app import ssh_client
if ssh_client.is_configured():
# SSH path — abort via smartctl -X
try:
await ssh_client.abort_smart_test(devname)
except Exception as exc:
raise HTTPException(status_code=502, detail=f"SSH abort error: {exc}")
else:
# REST path — find TrueNAS job and abort it
try:
jobs = await client.get_smart_jobs()
tn_job_id = None
for j in jobs:
if j.get("state") != "RUNNING":
continue
args = j.get("arguments", [])
if not args or not isinstance(args[0], dict):
continue
if devname in args[0].get("disks", []):
tn_job_id = j["id"]
break
if tn_job_id is None:
raise HTTPException(status_code=404, detail="No running SMART test found for this drive")
await client.abort_job(tn_job_id)
except HTTPException:
raise
except Exception as exc:
raise HTTPException(status_code=502, detail=f"TrueNAS error: {exc}")
# Update local DB state
now = datetime.now(timezone.utc).isoformat()
await db.execute(
"UPDATE smart_tests SET state='aborted', finished_at=? WHERE drive_id=? AND test_type=? AND state='running'",
(now, drive_id, test_type),
)
await db.execute(
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
VALUES (?,?,?,?)""",
("smart_test_cancel", drive_id, operator,
f"{test_type.upper()} SMART test cancelled on {devname}"),
)
await db.commit()
return {"cancelled": True, "devname": devname, "type": test_type}
@router.patch("/api/v1/drives/{drive_id}")
async def update_drive(
drive_id: int,
req: UpdateDriveRequest,
db: aiosqlite.Connection = Depends(get_db),
):
cur = await db.execute("SELECT id FROM drives WHERE id=?", (drive_id,))
if not await cur.fetchone():
raise HTTPException(status_code=404, detail="Drive not found")
await db.execute(
"UPDATE drives SET notes=?, location=? WHERE id=?",
(req.notes, req.location, drive_id),
)
await db.commit()
return {"updated": True}
@router.post("/api/v1/drives/{drive_id}/reset")
async def reset_drive(
drive_id: int,
request: Request,
body: dict,
db: aiosqlite.Connection = Depends(get_db),
):
"""
Clear SMART test results for a drive so it shows as fresh.
Only allowed when no burn-in job is active (queued or running).
Preserves all job history — just resets the display state.
"""
cur = await db.execute("SELECT id FROM drives WHERE id=?", (drive_id,))
if not await cur.fetchone():
raise HTTPException(status_code=404, detail="Drive not found")
# Reject if any active burn-in
cur = await db.execute(
"SELECT COUNT(*) FROM burnin_jobs WHERE drive_id=? AND state IN ('queued','running')",
(drive_id,),
)
if (await cur.fetchone())[0] > 0:
raise HTTPException(status_code=409, detail="Cannot reset while a burn-in is active")
# Trust the logged-in user, not the body (the JS used to send a
# literal "operator" because window._operator was never set).
operator = operator_for(request, body.get("operator"))
# Reset SMART test state to idle
await db.execute(
"""UPDATE smart_tests SET state='idle', percent=0, started_at=NULL,
eta_at=NULL, finished_at=NULL, error_text=NULL, raw_output=NULL
WHERE drive_id=?""",
(drive_id,),
)
# Clear SMART attrs cache + stamp reset time (hides prior burn-in from dashboard)
now = datetime.now(timezone.utc).isoformat()
await db.execute(
"UPDATE drives SET smart_attrs=NULL, last_reset_at=? WHERE id=?",
(now, drive_id),
)
# Audit event
await db.execute(
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
VALUES (?,?,?,?)""",
("drive_reset", drive_id, operator, "Drive reset — SMART state cleared"),
)
await db.commit()
poller._notify_subscribers()
return {"reset": True}
@router.post("/api/v1/drives/{drive_id}/unlock")
async def unlock_pool_drive(drive_id: int, request: Request, req: UnlockPoolDriveRequest):
operator = operator_for(request, req.operator)
ip = client_ip(request)
# Rate-limit by drive AND by source IP. A typo on the confirm token
# is the common case so the threshold is loose, but a brute-force
# attempt to guess the token still hits the IP cap.
keys = (("drive", drive_id), ("ip", ip))
attempt = auth.unlock_limiter.register(*keys)
if attempt != "ok":
raise HTTPException(
status_code=429,
detail="Too many unlock attempts on this drive. Try again later.",
)
try:
expiry = await burnin.grant_pool_unlock(
drive_id, req.confirm_token, operator, req.reason,
)
except ValueError as exc:
raise HTTPException(status_code=400, detail=str(exc))
auth.unlock_limiter.clear(*keys)
# Read from the submodule, not the package-root snapshot alias —
# keeps tests that monkey-patch UNLOCK_TTL_SECONDS in
# app.burnin.unlock observable from the API response.
return {"unlocked": True, "expires_at": expiry,
"ttl_seconds": burnin.unlock.UNLOCK_TTL_SECONDS}