nas-burnin/app/routes/__init__.py
Brandon Walter ec636f8f3a
Some checks failed
Security scan / pip-audit (push) Has been cancelled
Security scan / bandit (push) Has been cancelled
Security scan / gitleaks (push) Has been cancelled
Security scan / mypy (push) Has been cancelled
fix: PRAGMA busy_timeout on every SQLite connection (1.0.0-60)
Jobs 60-63 ran healthy for 16h then all 4 died simultaneously with
'database is locked'. The burnin drain used _db() which set
busy_timeout=10000, but:

1. 10s was sometimes too short under heavy contention (4 burn-in
   drains writing every 5s + poller every 12s + retention scan +
   auth + lifespan = many concurrent writers).
2. OTHER aiosqlite.connect() sites (poller, retention, auth, mailer,
   routes/__init__'s SSE, burnin/__init__.py's various helpers,
   database.get_db) didn't set busy_timeout at all. Without it,
   SQLite raises 'database is locked' INSTANTLY on any contention,
   which forced concurrency back onto the drain's connection.

Fix:
- _db() busy_timeout 10000 → 60000 (60s; aggressive but right for
  this workload — brief contention spikes are normal and waiting
  beats failing).
- PRAGMA busy_timeout=60000 added on every aiosqlite.connect() site
  next to the existing PRAGMA calls. Applied via a small Python
  pass that preserves the original variable name (db / _tdb / src
  / dst etc.) and indentation.

Same restart sequence applied: rebuild container, reset 4 drives,
relaunch via loopback bypass. Jobs 64-67 are now running.

This is auto-restart #2 in 24h. Safety brake at 3.
2026-05-14 06:39:33 -04:00

166 lines
6.2 KiB
Python

import asyncio
import csv
import io
import json
from datetime import datetime, timezone
import aiosqlite
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from sse_starlette.sse import EventSourceResponse
from app import poller
from app.config import settings
from app.database import get_db
from app.models import (
BurninJobResponse, BurninStageResponse,
CancelBurninRequest, DriveResponse,
SmartTestState, StartBurninRequest, UnlockPoolDriveRequest,
UpdateDriveRequest,
)
from app.renderer import templates
# Helpers shared with the extracted sub-routers — keep the underscore-
# prefixed local names that existing in-file callers reach for.
from ._helpers import (
client_ip as _client_ip,
is_stale as _is_stale,
operator_for as _operator_for,
secret_status as _secret_status,
stale_context as _stale_context,
SECRET_FIELDS as _SECRET_FIELDS,
)
# Package-level router. The sub-routers imported further down are attached
# to it via include_router(), and the handlers defined in this file
# (dashboard, SSE) register on it directly with @router.get(...).
router = APIRouter()
# Sub-routers extracted as part of the routes/ package split (1.0.0-34).
# Their endpoints get registered against the same APIRouter, so the
# external `from app.routes import router` import in app/main.py keeps
# working unchanged. Future slices can extract more — drives, burnin,
# settings, history — using the same pattern.
#
# Absolute imports (`import app.routes.X as _Y`) instead of relative
# (`from . import X as _Y`) so we stay safe even if a future top-level
# `from app import X` is reintroduced here — `from app import auth`
# would bind `auth` on the `app.routes` package namespace and shadow
# any relative-submodule lookup. Absolute imports always resolve to
# `app.routes.X` regardless of what's already bound on the package.
import app.routes.auth as _auth_routes # noqa: E402
import app.routes.system as _system_routes # noqa: E402
import app.routes.history as _history_routes # noqa: E402
import app.routes.audit as _audit_routes # noqa: E402
import app.routes.stats as _stats_routes # noqa: E402
import app.routes.report as _report_routes # noqa: E402
import app.routes.settings as _settings_routes # noqa: E402
import app.routes.drives as _drives_routes # noqa: E402
import app.routes.burnin as _burnin_routes # noqa: E402
# Attach each extracted sub-router to the package-level router.
# Registration order follows the tuple order.
for _sub_routes in (
    _auth_routes,
    _system_routes,
    _history_routes,
    _audit_routes,
    _stats_routes,
    _report_routes,
    _settings_routes,
    _drives_routes,
    _burnin_routes,
):
    router.include_router(_sub_routes.router)
# Drop the loop variable so it does not linger on the package namespace.
del _sub_routes
# Drives helpers — re-exported for the dashboard + SSE handlers in this
# file AND for `from app.routes import _fetch_drives_for_template`
# from mailer.py (existing back-compat shim).
from ._drives_helpers import ( # noqa: E402
_DRIVES_QUERY, _row_to_drive, _build_smart, _compute_status,
_compute_eta_seconds, _eta_seconds,
_fetch_burnin_by_drive, _fetch_drives_for_template,
)
# _stale_context is now imported from ._helpers above.
# ---------------------------------------------------------------------------
# Dashboard
# ---------------------------------------------------------------------------
@router.get("/", response_class=HTMLResponse)
async def dashboard(request: Request, db: aiosqlite.Connection = Depends(get_db)):
    """Render the main dashboard page.

    Fetches the drive rows for the template over the injected database
    connection, takes a snapshot of the poller state, and renders
    ``dashboard.html`` with both, plus whatever keys ``_stale_context``
    derives from that snapshot.
    """
    drive_rows = await _fetch_drives_for_template(db)
    poller_state = poller.get_state()

    # Base keys first; the stale-context keys are merged last so they win
    # on any name clash, exactly like a trailing ``**`` unpack would.
    context = {
        "request": request,
        "drives": drive_rows,
        "poller": poller_state,
    }
    context.update(_stale_context(poller_state))

    return templates.TemplateResponse(request, "dashboard.html", context)
# ---------------------------------------------------------------------------
# SSE — live drive table updates
# ---------------------------------------------------------------------------
@router.get("/sse/drives")
async def sse_drives(request: Request):
    """Stream live dashboard updates to the browser as Server-Sent Events.

    Subscribes to the poller's notification queue and, for every
    notification received, emits in order:

    * ``drives-update`` -- freshly rendered ``components/drives_table.html``
    * ``system-sensors`` -- JSON with the system temps, thermal pressure and
      the configured warn/crit temperature thresholds
    * ``job-alert`` -- JSON-encoded alert, only when the payload is a dict
      carrying a truthy ``"alert"`` entry

    If no notification arrives within 25 seconds a ``keepalive`` event is
    sent instead. The stream stops once the client is seen to be
    disconnected; the poller subscription is released when the generator
    exits.

    Returns:
        An ``EventSourceResponse`` wrapping the event generator.
    """
    q = poller.subscribe()

    async def generate():
        try:
            while True:
                # Wait for next poll notification or keepalive timeout
                try:
                    payload = await asyncio.wait_for(q.get(), timeout=25.0)
                except asyncio.TimeoutError:
                    if await request.is_disconnected():
                        break
                    yield {"event": "keepalive", "data": ""}
                    continue

                if await request.is_disconnected():
                    break

                # Extract alert from payload (may be None for regular polls)
                alert = payload.get("alert") if isinstance(payload, dict) else None

                # Fetch fresh drive data. The connection is scoped to the
                # query only, so it is closed before we render and yield.
                async with aiosqlite.connect(settings.db_path) as db:
                    db.row_factory = aiosqlite.Row
                    # Set the busy timeout FIRST so it also covers the
                    # journal_mode statement, which can itself have to wait
                    # for a lock under write contention.
                    await db.execute("PRAGMA busy_timeout=60000")
                    await db.execute("PRAGMA journal_mode=WAL")
                    drives = await _fetch_drives_for_template(db)

                # Render fresh table HTML
                html = templates.env.get_template(
                    "components/drives_table.html"
                ).render(drives=drives)
                yield {"event": "drives-update", "data": html}

                # Push system sensor state so JS can update temp chips live
                ps = poller.get_state()
                yield {
                    "event": "system-sensors",
                    "data": json.dumps({
                        "system_temps": ps.get("system_temps", {}),
                        "thermal_pressure": ps.get("thermal_pressure", "ok"),
                        "temp_warn_c": settings.temp_warn_c,
                        "temp_crit_c": settings.temp_crit_c,
                    }),
                }

                # Push browser notification event if this was a job completion
                if alert:
                    yield {"event": "job-alert", "data": json.dumps(alert)}
        finally:
            # Always release the subscription, whether the loop ended
            # normally, the client went away, or an error propagated.
            poller.unsubscribe(q)

    return EventSourceResponse(generate())
# ---------------------------------------------------------------------------
# JSON API
# ---------------------------------------------------------------------------