fix: address Codex review of burnin package split (1.0.0-32)
Three LOW-severity findings from Codex's audit of the post-split package, all small mechanical cleanups: #1 routes.py:848 read burnin.UNLOCK_TTL_SECONDS — a snapshot alias bound at import time. After a test (or runtime) monkey-patches app.burnin.unlock.UNLOCK_TTL_SECONDS the API response would advertise the OLD value while grant_pool_unlock used the new one. Now reads burnin.unlock.UNLOCK_TTL_SECONDS directly so the API stays in sync with whatever the actual source-of-truth is. #2 _stage_surface_validate_ssh() carried dead extraction scaffolding from when the badblocks logic was first inlined into burnin.py: _is_cancelled_sync (sync wrapper that does run_until_complete in a coroutine — would deadlock if ever called), last_logged_pct, on_progress, accumulated_lines, on_progress_async — none on any control-flow path. Plus result["output"] which was set but never read. All deleted; the inline _drain coroutines below already handle progress/log throttling correctly. #3 The new module boundaries were leaking — root orchestration mutated _remote_pids and _unlock_grants directly even though kill.clear_remote_pid() and unlock.invalidate_grant() existed. Now using the helpers, so a future change to the storage shape only requires editing the owning module. Bonus from Codex's check note: _get_client() now asserts burnin._client is not None with a clear message instead of relying on an obscure NoneType AttributeError if a stage is somehow called before init(). Verified: 44/44 tests pass; container boots clean; /health 200. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
19c2c0dc0f
commit
eb2a964171
4 changed files with 14 additions and 38 deletions
|
|
@ -128,7 +128,7 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task":
|
|||
# Remove only if it's still us — avoid clobbering a re-enqueued task.
|
||||
if _active_tasks.get(job_id) is t:
|
||||
_active_tasks.pop(job_id, None)
|
||||
_remote_pids.pop(job_id, None)
|
||||
_kill.clear_remote_pid(job_id)
|
||||
|
||||
task.add_done_callback(_cleanup)
|
||||
return task
|
||||
|
|
@ -205,7 +205,7 @@ async def start_job(drive_id: int, profile: str, operator: str,
|
|||
# unlock grant (it was bound to stale identity) and
|
||||
# refuse with a descriptive error so the operator
|
||||
# knows to wait for the next poll cycle.
|
||||
_unlock_grants.pop(drive_id, None)
|
||||
_unlock.invalidate_grant(drive_id)
|
||||
fresh_pool = fresh["pool"] if fresh else None
|
||||
fresh_role = fresh["role"] if fresh else None
|
||||
if fresh_pool:
|
||||
|
|
|
|||
|
|
@ -40,6 +40,10 @@ def _get_client():
|
|||
the package root for backward compat with routes.py which reaches
|
||||
for ``burnin._client`` directly."""
|
||||
from app import burnin
|
||||
assert burnin._client is not None, (
|
||||
"burnin._client is None — burnin.init() must be called before any "
|
||||
"stage that reaches the TrueNAS REST API."
|
||||
)
|
||||
return burnin._client
|
||||
|
||||
|
||||
|
|
@ -378,41 +382,11 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
f"All data on /dev/{devname} will be overwritten.\n\n"
|
||||
)
|
||||
|
||||
def _is_cancelled_sync() -> bool:
|
||||
# Synchronous version — we check the DB state flag set by cancel_job()
|
||||
import asyncio
|
||||
loop = asyncio.get_event_loop()
|
||||
try:
|
||||
return loop.run_until_complete(_is_cancelled(job_id))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
last_logged_pct = [-1]
|
||||
|
||||
def on_progress(pct: int, bad_blocks: int, line: str) -> None:
|
||||
nonlocal last_logged_pct
|
||||
# Write to log (fire-and-forget via asyncio.create_task from sync context)
|
||||
# The log append is done in the async flush below
|
||||
pass
|
||||
|
||||
accumulated_lines: list[str] = []
|
||||
|
||||
async def on_progress_async(pct: int, bad_blocks: int, line: str) -> None:
|
||||
accumulated_lines.append(line)
|
||||
# Flush to DB and update progress every ~25 lines to avoid excessive DB writes
|
||||
if len(accumulated_lines) % 25 == 0:
|
||||
await _append_stage_log(job_id, "surface_validate", "".join(accumulated_lines[-25:]))
|
||||
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks)
|
||||
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||
await _recalculate_progress(job_id)
|
||||
_push_update()
|
||||
if await _is_cancelled(job_id):
|
||||
raise asyncio.CancelledError
|
||||
|
||||
# Run badblocks — we adapt the callback pattern to async by collecting then flushing
|
||||
# Streaming + progress is handled by the inline _drain coroutines
|
||||
# below; the in-loop _append_stage_log + _update_stage_percent calls
|
||||
# take care of throttled DB writes. Result dict is just final tallies.
|
||||
result = {"bad_blocks": 0, "output": "", "aborted": False}
|
||||
try:
|
||||
# The actual streaming; we handle progress via the accumulated_lines pattern
|
||||
bad_blocks_total = 0
|
||||
output_lines: list[str] = []
|
||||
|
||||
|
|
@ -520,7 +494,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
if tail:
|
||||
await _append_stage_log(job_id, "surface_validate", tail)
|
||||
result["bad_blocks"] = bad_blocks_total
|
||||
result["output"] = "".join(output_lines) # in-memory only, not re-stored
|
||||
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
||||
|
||||
except asyncio.CancelledError:
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-31"
|
||||
app_version: str = "1.0.0-32"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
|
|
@ -845,7 +845,10 @@ async def unlock_pool_drive(drive_id: int, request: Request, req: UnlockPoolDriv
|
|||
except ValueError as exc:
|
||||
raise HTTPException(status_code=400, detail=str(exc))
|
||||
return {"unlocked": True, "expires_at": expiry,
|
||||
"ttl_seconds": burnin.UNLOCK_TTL_SECONDS}
|
||||
# Read from the submodule, not the package-root snapshot
|
||||
# alias — keeps tests that monkey-patch UNLOCK_TTL_SECONDS
|
||||
# in app.burnin.unlock observable from the API response.
|
||||
"ttl_seconds": burnin.unlock.UNLOCK_TTL_SECONDS}
|
||||
|
||||
|
||||
@router.post("/api/v1/burnin/{job_id}/cancel")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue