diff --git a/app/burnin/stages.py b/app/burnin/stages.py index 1eaa25e..a261765 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -31,6 +31,13 @@ class _BadblocksResult(TypedDict): # dashboard appears to "rewind" every ~2 hours when a new phase starts. _BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7} _BB_TOTAL_PHASES = 8 +# Throttle DB writes from the badblocks parser. Each progress line used +# to trigger 4-6 transactions; with 4 concurrent burn-ins emitting sub- +# second progress lines, the asyncssh drain couldn't keep up — the +# stdout pipe on TrueNAS filled, badblocks blocked on pipe_write, +# disk I/O effectively stopped. 5 seconds is fine for the UI (drawer +# polls every ~12s anyway) and cuts DB load 60-80x. +BB_DB_MIN_SECONDS = 5.0 import re as _re_pre # noqa: E402 @@ -513,7 +520,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # of the previous progress sample so we can compute MB/s # from the delta on each new sample. last_pct_sample: float = progress.overall_pct - last_pct_ts: float = time.monotonic() + last_db_write_ts: float = time.monotonic() # Seed bb_phase=1, bb_phase_pct=0 immediately so the # drawer's per-pattern meters have something to render @@ -557,12 +564,24 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # picks up the "Testing with pattern 0xXX" + "Reading # and comparing" headers (which advance the phase # counter), not just the percent-done lines. - prev = progress.overall_pct prev_phase = progress.phase progress.update(line) phase_changed = progress.phase != prev_phase - pct_changed = progress.overall_pct != prev - if phase_changed or pct_changed or _BB_PERCENT_RE.search(line): + + # Throttle DB writes. Each progress line used to + # trigger 4-6 transactions; with 4 concurrent + # burn-ins and sub-second progress lines that + # backs up the asyncssh drain → pipe fills → + # badblocks blocks on pipe_write → no disk I/O. + # Commit at most once every BB_DB_MIN_SECONDS, + # plus always on phase transitions (transitions + # are rare AND important: they stamp history, + # advance the meter strip). + now_ts = time.monotonic() + time_since_last_db = now_ts - last_db_write_ts + should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS + + if should_write: if phase_changed: # Stamp the moment we first see this # phase so the drawer can show @@ -578,29 +597,27 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) job_id, "surface_validate", progress.phase, progress.phase_pct, ) - await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total) - # Throughput: only on real percent advances - # (skip header-only ticks where pct didn't move). - # Skip phase transitions too — the per-phase - # pct resets and would produce a negative delta. + await _update_stage_bad_blocks( + job_id, "surface_validate", bad_blocks_total, + ) + # Throughput: compute since the last DB + # write, not since the last progress line. + # Skip phase transitions (per-phase pct + # resets would yield a negative delta). if ( drive_size_bytes - and pct_changed and not phase_changed and progress.overall_pct > last_pct_sample + and time_since_last_db >= 1.0 ): - now_ts = time.monotonic() - dt = now_ts - last_pct_ts - if dt >= 1.0: # avoid noisy sub-second samples - d_pct = progress.overall_pct - last_pct_sample - # Each 1% overall = drive_size / 800 bytes - bytes_done = (d_pct / 800.0) * drive_size_bytes - mbps = bytes_done / dt / 1_000_000 - await _update_stage_bb_mbps( - job_id, "surface_validate", mbps, - ) - last_pct_sample = progress.overall_pct - last_pct_ts = now_ts + d_pct = progress.overall_pct - last_pct_sample + bytes_done = (d_pct / 800.0) * drive_size_bytes + mbps = bytes_done / time_since_last_db / 1_000_000 + await _update_stage_bb_mbps( + job_id, "surface_validate", mbps, + ) + last_pct_sample = progress.overall_pct + last_db_write_ts = now_ts await _recalculate_progress(job_id) _push_update() else: diff --git a/app/config.py b/app/config.py index be8a38a..a60ff43 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-51" + app_version: str = "1.0.0-52" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate