Closes the four remaining items from the post-Codex hardening list. #1 Rate-limit unlock + change-password endpoints (1.0.0-33) * Generalised the existing login limiter into a reusable `_RateLimiter` class in app/auth.py. Atomic check-then-increment in synchronous code so a parallel asyncio burst can't slip past the threshold. * `unlock_limiter` (5 attempts in 10 min → 10 min lockout) gates POST /api/v1/drives/{id}/unlock per-drive AND per-source-IP. * `pwchange_limiter` (5 in 10 min → 15 min lockout) gates POST /api/v1/auth/change-password per-user AND per-IP. * Both clear on successful operation. The login limiter keeps its existing `register_login_attempt` / `clear_login_failures` facade names so external callers don't change. #3 mypy in security-scan (1.0.0-33) * Added a 4th tool to the daily scan + forge workflow. Runs in a throwaway python:3.12-slim container against the deploy dir, exit code is informational only (NOT included in the `TOTAL_EXIT` failure sum). Findings land in ~/security-scans/scan-YYYY-MM-DD/mypy.txt for ratchet-down work over time. * Forge job uses `continue-on-error: true` so it doesn't fail the workflow until the type-debt baseline is annotated down. #4 Lifecycle test coverage (1.0.0-33) * New tests/test_lifecycle.py with 15 cases: - TestCommonHelpers (7 tests): _start_stage, _finish_stage success/failure/error-preservation, _recalculate_progress weighted math, _is_cancelled, _append_stage_log. - TestStartCancelJob (4 tests): start_job inserts queued row + correct stage list, duplicate-active rejection, cancel marks state, cancel returns False on terminal-state jobs. - TestRateLimiter (4 tests): under-threshold ok, trips at threshold, clear removes both counter + lockout, separate keys don't interfere. * Total goes from 44 to 59 tests; closes the orchestration-path coverage gap Codex flagged. #2 Partial routes.py split (1.0.0-34) * routes.py → routes/ package. Same staged-extraction pattern as the burnin.py split. 
* routes/auth.py — login/logout/setup/change-password (170 LoC). * routes/system.py — /health, /ws/terminal, /api/v1/updates/check (136 LoC). * routes/_helpers.py — shared utilities used by both extracted modules and the still-monolithic remainder: client_ip, operator_for, is_stale, stale_context, secret_status, SECRET_FIELDS (97 LoC). * routes/__init__.py shrank from 1568 LoC to 1261. Future slices can extract drives, burnin, history, settings the same way. * GOTCHA recorded in commit body: `from app import auth` at the top of __init__.py binds `auth` as an attribute on the package namespace, so `from . import auth as _auth_routes` finds the OUTER module and yields `app.auth` instead of the submodule. Fix is `import app.routes.auth as _auth_routes` (absolute). This bit me once at deploy time; container failed to start with `module 'app.auth' has no attribute 'router'`. Verification: 59/59 tests pass (44 existing + 15 new); container boots clean at 1.0.0-34; /health 200 with all checks green; security scan still clean (mypy informational findings ignored from totals). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
136 lines
5 KiB
Python
136 lines
5 KiB
Python
"""System-level endpoints with no business-logic dependencies.

GET /health — readiness probe (DB write + poller + SSH)
GET /api/v1/updates/check — check Forgejo for newer release
WS /ws/terminal — xterm.js bridge to TrueNAS SSH PTY
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
import aiosqlite
|
|
from fastapi import APIRouter, Depends, WebSocket
|
|
from fastapi.responses import JSONResponse
|
|
|
|
from app import poller
|
|
from app.config import settings
|
|
from app.database import get_db
|
|
|
|
# Router that the system-level endpoints in this module (/health,
# /ws/terminal, /api/v1/updates/check) register themselves on.
# NOTE(review): presumably included by the routes package __init__ — confirm.
router = APIRouter()
|
|
|
|
|
|
@router.get("/health")
async def health(db: aiosqlite.Connection = Depends(get_db)):
    """Real readiness check, not just process-is-running.

    Runs three probes and reports each under ``checks``:

    * ``db``     — write/read round-trip through a TEMP table, plus the
      ``drives`` row count used for ``drives_tracked``.
    * ``poller`` — the poller reports itself healthy and, when a parseable
      ``last_poll_at`` timestamp exists, it is no older than three times
      ``settings.stale_threshold_seconds``.
    * ``ssh``    — ``ssh_client.test_connection()`` when SSH is configured;
      reported as ok/skipped otherwise.

    Args:
        db: Database connection injected via ``Depends(get_db)``.

    Returns:
        A ``JSONResponse`` with keys ``status`` ("ok" or "degraded"),
        ``checks``, ``drives_tracked``, ``poll_interval_s`` and ``version``.
        The status code is 200 when every check passed and 503 otherwise,
        so a proxy/orchestrator health probe can take the container out of
        rotation. ``drives_tracked`` is ``None`` when the count query fails.
    """
    # Imported lazily, as in the original, rather than at module import time.
    from app import ssh_client as _ssh

    checks: dict[str, dict] = {}

    # DB probe — exercise a write path rather than a read-only pragma.
    # NOTE(review): TEMP tables are stored in SQLite's separate temp
    # database, so this round-trip may not touch the main database file —
    # confirm it covers the read-only-mount / full-disk cases intended.
    try:
        await db.execute(
            "CREATE TEMP TABLE IF NOT EXISTS _hc (k INTEGER PRIMARY KEY, v TEXT)"
        )
        await db.execute(
            "INSERT OR REPLACE INTO _hc (k, v) VALUES (1, ?)",
            (datetime.now(timezone.utc).isoformat(),),
        )
        cur = await db.execute("SELECT v FROM _hc WHERE k=1")
        row = await cur.fetchone()
        await db.commit()
        checks["db"] = {"ok": bool(row)}
    except Exception as exc:
        checks["db"] = {"ok": False, "error": str(exc)}

    # Poller probe — age is computed only when a timestamp is present and
    # parseable; naive timestamps are interpreted as UTC.
    ps = poller.get_state()
    last = ps.get("last_poll_at")
    poll_age = None
    if last:
        try:
            t = datetime.fromisoformat(last)
            if t.tzinfo is None:
                t = t.replace(tzinfo=timezone.utc)
            poll_age = (datetime.now(timezone.utc) - t).total_seconds()
        except Exception:
            # Best-effort: an unparseable timestamp just means "age unknown".
            poll_age = None
    # An unknown age does not fail the check by itself; the poller's own
    # "healthy" flag still has to be truthy.
    poll_ok = ps.get("healthy") and (
        poll_age is None or poll_age <= settings.stale_threshold_seconds * 3
    )
    checks["poller"] = {
        "ok": bool(poll_ok),
        "last_error": ps.get("last_error"),
        "last_poll_at": last,
        "age_seconds": int(poll_age) if poll_age is not None else None,
    }

    # SSH probe — only when configured.
    # NOTE(review): the original comment describes this as a single
    # `sensors -j` call; that is not visible from here — confirm.
    if _ssh.is_configured():
        try:
            r = await _ssh.test_connection()
            checks["ssh"] = {"ok": bool(r.get("ok")),
                             "error": r.get("error")}
        except Exception as exc:
            checks["ssh"] = {"ok": False, "error": str(exc)}
    else:
        checks["ssh"] = {"ok": True, "skipped": True}

    # The drive count must not escape as an unhandled exception: a broken
    # database has to yield the structured 503 body, not a bare 500. A
    # failure here marks the db check as failed (keeping any earlier db
    # error, which is the more specific one).
    drives_tracked = None
    try:
        cur = await db.execute("SELECT COUNT(*) FROM drives")
        row = await cur.fetchone()
        drives_tracked = row[0] if row else 0
    except Exception as exc:
        if checks["db"]["ok"]:
            checks["db"] = {"ok": False, "error": str(exc)}

    status_ok = all(c["ok"] for c in checks.values())
    body = {
        "status": "ok" if status_ok else "degraded",
        "checks": checks,
        "drives_tracked": drives_tracked,
        "poll_interval_s": settings.poll_interval_seconds,
        "version": settings.app_version,
    }
    return JSONResponse(body, status_code=200 if status_ok else 503)
|
|
|
|
|
|
@router.websocket("/ws/terminal")
async def terminal_ws(websocket: WebSocket):
    """Hand the browser terminal's WebSocket over to the SSH PTY bridge.

    The whole session is delegated to ``app.terminal.handle``; this
    endpoint adds no logic of its own and discards the handler's result.

    Args:
        websocket: The incoming WebSocket connection from the xterm.js
            front end.
    """
    # NOTE(review): no authentication happens in this function; presumably
    # app.terminal.handle enforces it — confirm.
    # Resolved at call time rather than at module import, as before.
    from app import terminal as pty_bridge

    await pty_bridge.handle(websocket)
|
|
|
|
|
|
@router.get("/api/v1/updates/check")
async def check_updates():
    """Check Forgejo for the latest published release.

    Fetches the repository's ``releases/latest`` endpoint (8 second
    timeout) and compares its ``tag_name`` (leading "v" characters
    stripped) with ``settings.app_version``.

    Returns:
        A dict with keys ``current``, ``latest``, ``update_available`` and
        ``message``. This function never raises: HTTP errors, network
        failures and malformed payloads are all reported through
        ``message`` with ``update_available`` set to ``False``. A 200
        response with no usable ``tag_name`` is treated as "no release
        tag" (``latest`` is ``None``, no update available).
    """
    # Imported lazily, as in the original.
    import httpx

    current = settings.app_version
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            r = await client.get(
                "https://git.hellocomputer.xyz/api/v1/repos/brandon/truenas-burnin/releases/latest",
                headers={"Accept": "application/json"},
            )
            if r.status_code == 200:
                data = r.json()
                # Guard the payload shape: a non-object body or a null /
                # non-string tag_name used to raise AttributeError and was
                # then misreported as "Could not reach update server".
                tag = data.get("tag_name") if isinstance(data, dict) else None
                latest = tag.lstrip("v") if isinstance(tag, str) else ""
                # NOTE(review): any difference from the running version
                # counts as an update, even when the running build is the
                # newer one — confirm that is intended.
                up_to_date = not latest or latest == current
                return {
                    "current": current,
                    "latest": latest or None,
                    "update_available": not up_to_date,
                    "message": None,
                }
            elif r.status_code == 404:
                return {"current": current, "latest": None, "update_available": False,
                        "message": "No releases published yet"}
            else:
                return {"current": current, "latest": None, "update_available": False,
                        "message": f"Forgejo API returned {r.status_code}"}
    except Exception as exc:
        # Deliberate catch-all: the update check is best-effort and must
        # never surface as a server error to the UI.
        return {"current": current, "latest": None, "update_available": False,
                "message": f"Could not reach update server: {exc}"}
|