From c5a41d0260e59e477798b10d49ce355a8a8c159e Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Sun, 10 May 2026 22:12:02 -0700 Subject: [PATCH] fix: throttle badblocks parser DB writes (1.0.0-52) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User reported sdb showing 134-day ETA. Investigation: badblocks processes all stuck in pipe_write wchan, iostat showing 0 throughput across all drives despite badblocks supposedly running. Root cause: each progress line was triggering 4-6 DB transactions (_update_stage_percent, _update_stage_bb_phase, _update_stage_bad_blocks, _update_stage_bb_mbps, _record_bb_phase_start, _recalculate_progress). With 4 concurrent burn-ins and sub-second progress lines, the asyncssh _drain couldn't keep up. Drain fell behind → asyncssh channel buffer filled → SSH window stopped advancing → sshd stopped reading from badblocks's stdout pipe → pipe filled → badblocks blocked on pipe_write() → no more disk I/O. That regression came in across 1.0.0-44 → 1.0.0-47 as I added each new persisted field. The previous per-line write path worked when there was only one DB call; it doesn't with five-plus. Fix: BB_DB_MIN_SECONDS=5 throttle on the DB-write path. The drain loop still consumes every progress line (so the pipe drains continuously), but commits to DB at most once every 5 seconds. Phase transitions always commit immediately (rare and important — they stamp bb_phase_history and advance the per-pattern meter). UI impact: minimal — drawer polls drives every ~12s anyway, so the displayed % was already at 12s resolution. The meter strip just won't sub-tick within a 5s window. DB load impact: 60-80x reduction during surface_validate. --- app/burnin/stages.py | 61 ++++++++++++++++++++++++++++---------------- app/config.py | 2 +- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/app/burnin/stages.py b/app/burnin/stages.py index 1eaa25e..a261765 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -31,6 +31,13 @@ class _BadblocksResult(TypedDict): # dashboard appears to "rewind" every ~2 hours when a new phase starts. _BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7} _BB_TOTAL_PHASES = 8 +# Throttle DB writes from the badblocks parser. Each progress line used +# to trigger 4-6 transactions; with 4 concurrent burn-ins emitting sub- +# second progress lines, the asyncssh drain couldn't keep up — the +# stdout pipe on TrueNAS filled, badblocks blocked on pipe_write, +# disk I/O effectively stopped. 5 seconds is fine for the UI (drawer +# polls every ~12s anyway) and cuts DB load 60-80x. +BB_DB_MIN_SECONDS = 5.0 import re as _re_pre # noqa: E402 @@ -513,7 +520,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # of the previous progress sample so we can compute MB/s # from the delta on each new sample. last_pct_sample: float = progress.overall_pct - last_pct_ts: float = time.monotonic() + last_db_write_ts: float = time.monotonic() # Seed bb_phase=1, bb_phase_pct=0 immediately so the # drawer's per-pattern meters have something to render @@ -557,12 +564,24 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # picks up the "Testing with pattern 0xXX" + "Reading # and comparing" headers (which advance the phase # counter), not just the percent-done lines. - prev = progress.overall_pct prev_phase = progress.phase progress.update(line) phase_changed = progress.phase != prev_phase - pct_changed = progress.overall_pct != prev - if phase_changed or pct_changed or _BB_PERCENT_RE.search(line): + + # Throttle DB writes. Each progress line used to + # trigger 4-6 transactions; with 4 concurrent + # burn-ins and sub-second progress lines that + # backs up the asyncssh drain → pipe fills → + # badblocks blocks on pipe_write → no disk I/O. + # Commit at most once every BB_DB_MIN_SECONDS, + # plus always on phase transitions (transitions + # are rare AND important: they stamp history, + # advance the meter strip). + now_ts = time.monotonic() + time_since_last_db = now_ts - last_db_write_ts + should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS + + if should_write: if phase_changed: # Stamp the moment we first see this # phase so the drawer can show @@ -578,29 +597,27 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) job_id, "surface_validate", progress.phase, progress.phase_pct, ) - await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total) - # Throughput: only on real percent advances - # (skip header-only ticks where pct didn't move). - # Skip phase transitions too — the per-phase - # pct resets and would produce a negative delta. + await _update_stage_bad_blocks( + job_id, "surface_validate", bad_blocks_total, + ) + # Throughput: compute since the last DB + # write, not since the last progress line. + # Skip phase transitions (per-phase pct + # resets would yield a negative delta). if ( drive_size_bytes - and pct_changed and not phase_changed and progress.overall_pct > last_pct_sample + and time_since_last_db >= 1.0 ): - now_ts = time.monotonic() - dt = now_ts - last_pct_ts - if dt >= 1.0: # avoid noisy sub-second samples - d_pct = progress.overall_pct - last_pct_sample - # Each 1% overall = drive_size / 800 bytes - bytes_done = (d_pct / 800.0) * drive_size_bytes - mbps = bytes_done / dt / 1_000_000 - await _update_stage_bb_mbps( - job_id, "surface_validate", mbps, - ) - last_pct_sample = progress.overall_pct - last_pct_ts = now_ts + d_pct = progress.overall_pct - last_pct_sample + bytes_done = (d_pct / 800.0) * drive_size_bytes + mbps = bytes_done / time_since_last_db / 1_000_000 + await _update_stage_bb_mbps( + job_id, "surface_validate", mbps, + ) + last_pct_sample = progress.overall_pct + last_db_write_ts = now_ts await _recalculate_progress(job_id) _push_update() else: diff --git a/app/config.py b/app/config.py index be8a38a..a60ff43 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-51" + app_version: str = "1.0.0-52" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate