From eb2a9641717bd1e4da4a0fb551df37458e486053 Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Sun, 3 May 2026 01:35:07 -0400 Subject: [PATCH] fix: address Codex review of burnin package split (1.0.0-32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three LOW-severity findings from Codex's audit of the post-split package, all small mechanical cleanups: #1 routes.py:848 read burnin.UNLOCK_TTL_SECONDS — a snapshot alias bound at import time. After a test (or runtime) monkey-patches app.burnin.unlock.UNLOCK_TTL_SECONDS the API response would advertise the OLD value while grant_pool_unlock used the new one. Now reads burnin.unlock.UNLOCK_TTL_SECONDS directly so the API stays in sync with whatever the actual source-of-truth is. #2 _stage_surface_validate_ssh() carried dead extraction scaffolding from when the badblocks logic was first inlined into burnin.py: _is_cancelled_sync (sync wrapper that does run_until_complete in a coroutine — would deadlock if ever called), last_logged_pct, on_progress, accumulated_lines, on_progress_async — none on any control-flow path. Plus result["output"] which was set but never read. All deleted; the inline _drain coroutines below already handle progress/log throttling correctly. #3 The new module boundaries were leaking — root orchestration mutated _remote_pids and _unlock_grants directly even though kill.clear_remote_pid() and unlock.invalidate_grant() existed. Now using the helpers, so a future change to the storage shape only requires editing the owning module. Bonus from Codex's check note: _get_client() now asserts burnin._client is not None with a clear message instead of relying on an obscure NoneType AttributeError if a stage is somehow called before init(). Verified: 44/44 tests pass; container boots clean; /health 200. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/burnin/__init__.py | 4 ++-- app/burnin/stages.py | 41 +++++++---------------------------------- app/config.py | 2 +- app/routes.py | 5 ++++- 4 files changed, 14 insertions(+), 38 deletions(-) diff --git a/app/burnin/__init__.py b/app/burnin/__init__.py index 681e5f0..14ea800 100644 --- a/app/burnin/__init__.py +++ b/app/burnin/__init__.py @@ -128,7 +128,7 @@ def _spawn_run_job(job_id: int) -> "asyncio.Task": # Remove only if it's still us — avoid clobbering a re-enqueued task. if _active_tasks.get(job_id) is t: _active_tasks.pop(job_id, None) - _remote_pids.pop(job_id, None) + _kill.clear_remote_pid(job_id) task.add_done_callback(_cleanup) return task @@ -205,7 +205,7 @@ async def start_job(drive_id: int, profile: str, operator: str, # unlock grant (it was bound to stale identity) and # refuse with a descriptive error so the operator # knows to wait for the next poll cycle. - _unlock_grants.pop(drive_id, None) + _unlock.invalidate_grant(drive_id) fresh_pool = fresh["pool"] if fresh else None fresh_role = fresh["role"] if fresh else None if fresh_pool: diff --git a/app/burnin/stages.py b/app/burnin/stages.py index e135e01..12b6a33 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -40,6 +40,10 @@ def _get_client(): the package root for backward compat with routes.py which reaches for ``burnin._client`` directly.""" from app import burnin + assert burnin._client is not None, ( + "burnin._client is None — burnin.init() must be called before any " + "stage that reaches the TrueNAS REST API." + ) return burnin._client @@ -378,41 +382,11 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) f"All data on /dev/{devname} will be overwritten.\n\n" ) - def _is_cancelled_sync() -> bool: - # Synchronous version — we check the DB state flag set by cancel_job() - import asyncio - loop = asyncio.get_event_loop() - try: - return loop.run_until_complete(_is_cancelled(job_id)) - except Exception: - return False - - last_logged_pct = [-1] - - def on_progress(pct: int, bad_blocks: int, line: str) -> None: - nonlocal last_logged_pct - # Write to log (fire-and-forget via asyncio.create_task from sync context) - # The log append is done in the async flush below - pass - - accumulated_lines: list[str] = [] - - async def on_progress_async(pct: int, bad_blocks: int, line: str) -> None: - accumulated_lines.append(line) - # Flush to DB and update progress every ~25 lines to avoid excessive DB writes - if len(accumulated_lines) % 25 == 0: - await _append_stage_log(job_id, "surface_validate", "".join(accumulated_lines[-25:])) - await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks) - await _update_stage_percent(job_id, "surface_validate", pct) - await _recalculate_progress(job_id) - _push_update() - if await _is_cancelled(job_id): - raise asyncio.CancelledError - - # Run badblocks — we adapt the callback pattern to async by collecting then flushing + # Streaming + progress is handled by the inline _drain coroutines + # below; the in-loop _append_stage_log + _update_stage_percent calls + # take care of throttled DB writes. Result dict is just final tallies. result = {"bad_blocks": 0, "output": "", "aborted": False} try: - # The actual streaming; we handle progress via the accumulated_lines pattern bad_blocks_total = 0 output_lines: list[str] = [] @@ -520,7 +494,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) if tail: await _append_stage_log(job_id, "surface_validate", tail) result["bad_blocks"] = bad_blocks_total - result["output"] = "".join(output_lines) # in-memory only, not re-stored result["aborted"] = bad_blocks_total > settings.bad_block_threshold except asyncio.CancelledError: diff --git a/app/config.py b/app/config.py index 3ee5f30..e6fc657 100644 --- a/app/config.py +++ b/app/config.py @@ -83,7 +83,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-31" + app_version: str = "1.0.0-32" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate diff --git a/app/routes.py b/app/routes.py index 858a2de..7a0e427 100644 --- a/app/routes.py +++ b/app/routes.py @@ -845,7 +845,10 @@ async def unlock_pool_drive(drive_id: int, request: Request, req: UnlockPoolDriv except ValueError as exc: raise HTTPException(status_code=400, detail=str(exc)) return {"unlocked": True, "expires_at": expiry, - "ttl_seconds": burnin.UNLOCK_TTL_SECONDS} + # Read from the submodule, not the package-root snapshot + # alias — keeps tests that monkey-patch UNLOCK_TTL_SECONDS + # in app.burnin.unlock observable from the API response. + "ttl_seconds": burnin.unlock.UNLOCK_TTL_SECONDS} @router.post("/api/v1/burnin/{job_id}/cancel")