fix: address Codex review of burnin package split (1.0.0-32)
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run

Three LOW-severity findings from Codex's audit of the post-split
package, all small mechanical cleanups:

#1 routes.py:848 read burnin.UNLOCK_TTL_SECONDS — a snapshot alias
   bound at import time. After a test (or runtime) monkey-patches
   app.burnin.unlock.UNLOCK_TTL_SECONDS the API response would
   advertise the OLD value while grant_pool_unlock used the new one.
   Now reads burnin.unlock.UNLOCK_TTL_SECONDS directly so the API
   stays in sync with whatever the actual source-of-truth is.

#2 _stage_surface_validate_ssh() carried dead extraction scaffolding
   from when the badblocks logic was first inlined into burnin.py:
   _is_cancelled_sync (sync wrapper that does run_until_complete in
   a coroutine — would deadlock if ever called), last_logged_pct,
   on_progress, accumulated_lines, on_progress_async — none on any
   control-flow path. Plus result["output"] which was set but never
   read. All deleted; the inline _drain coroutines below already
   handle progress/log throttling correctly.

#3 The new module boundaries were leaking — root orchestration
   mutated _remote_pids and _unlock_grants directly even though
   kill.clear_remote_pid() and unlock.invalidate_grant() existed.
   Now using the helpers, so a future change to the storage shape
   only requires editing the owning module.

Bonus from Codex's check note: _get_client() now asserts
burnin._client is not None with a clear message instead of relying
on an obscure NoneType AttributeError if a stage is somehow called
before init().

Verified: 44/44 tests pass; container boots clean; /health 200.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Brandon Walter 2026-05-03 01:35:07 -04:00
parent 19c2c0dc0f
commit eb2a964171
4 changed files with 14 additions and 38 deletions

View file

@ -128,7 +128,7 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
# Remove only if it's still us — avoid clobbering a re-enqueued task.
if _active_tasks.get(job_id) is t:
_active_tasks.pop(job_id, None)
_remote_pids.pop(job_id, None)
_kill.clear_remote_pid(job_id)
task.add_done_callback(_cleanup)
return task
@ -205,7 +205,7 @@ async def start_job(drive_id: int, profile: str, operator: str,
# unlock grant (it was bound to stale identity) and
# refuse with a descriptive error so the operator
# knows to wait for the next poll cycle.
_unlock_grants.pop(drive_id, None)
_unlock.invalidate_grant(drive_id)
fresh_pool = fresh["pool"] if fresh else None
fresh_role = fresh["role"] if fresh else None
if fresh_pool:

View file

@ -40,6 +40,10 @@ def _get_client():
the package root for backward compat with routes.py which reaches
for ``burnin._client`` directly."""
from app import burnin
assert burnin._client is not None, (
"burnin._client is None — burnin.init() must be called before any "
"stage that reaches the TrueNAS REST API."
)
return burnin._client
@ -378,41 +382,11 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
f"All data on /dev/{devname} will be overwritten.\n\n"
)
def _is_cancelled_sync() -> bool:
# Synchronous version — we check the DB state flag set by cancel_job()
import asyncio
loop = asyncio.get_event_loop()
try:
return loop.run_until_complete(_is_cancelled(job_id))
except Exception:
return False
last_logged_pct = [-1]
def on_progress(pct: int, bad_blocks: int, line: str) -> None:
nonlocal last_logged_pct
# Write to log (fire-and-forget via asyncio.create_task from sync context)
# The log append is done in the async flush below
pass
accumulated_lines: list[str] = []
async def on_progress_async(pct: int, bad_blocks: int, line: str) -> None:
accumulated_lines.append(line)
# Flush to DB and update progress every ~25 lines to avoid excessive DB writes
if len(accumulated_lines) % 25 == 0:
await _append_stage_log(job_id, "surface_validate", "".join(accumulated_lines[-25:]))
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks)
await _update_stage_percent(job_id, "surface_validate", pct)
await _recalculate_progress(job_id)
_push_update()
if await _is_cancelled(job_id):
raise asyncio.CancelledError
# Run badblocks — we adapt the callback pattern to async by collecting then flushing
# Streaming + progress is handled by the inline _drain coroutines
# below; the in-loop _append_stage_log + _update_stage_percent calls
# take care of throttled DB writes. Result dict is just final tallies.
result = {"bad_blocks": 0, "output": "", "aborted": False}
try:
# The actual streaming; we handle progress via the accumulated_lines pattern
bad_blocks_total = 0
output_lines: list[str] = []
@ -520,7 +494,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
if tail:
await _append_stage_log(job_id, "surface_validate", tail)
result["bad_blocks"] = bad_blocks_total
result["output"] = "".join(output_lines) # in-memory only, not re-stored
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
except asyncio.CancelledError:

View file

@ -83,7 +83,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-31"
app_version: str = "1.0.0-32"
# ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate

View file

@ -845,7 +845,10 @@ async def unlock_pool_drive(drive_id: int, request: Request, req: UnlockPoolDriv
except ValueError as exc:
raise HTTPException(status_code=400, detail=str(exc))
return {"unlocked": True, "expires_at": expiry,
"ttl_seconds": burnin.UNLOCK_TTL_SECONDS}
# Read from the submodule, not the package-root snapshot
# alias — keeps tests that monkey-patch UNLOCK_TTL_SECONDS
# in app.burnin.unlock observable from the API response.
"ttl_seconds": burnin.unlock.UNLOCK_TTL_SECONDS}
@router.post("/api/v1/burnin/{job_id}/cancel")