fix: throttle ALL drain-loop DB calls + drop progress noise from log (1.0.0-54)
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run
Security scan / mypy (push) Waiting to run

1.0.0-52 throttled the percent/bb_phase writes but missed:

- `_is_cancelled` ran a DB query on EVERY stderr line (sub-second
  cadence × 4 concurrent burn-ins = ~10+ DB connection opens/s)
- `_append_stage_log` ran every 20 output_lines (~once per second)
  doing a quadratic `log_text || ?` concat that gets multi-MB
  rewrites as the log grows
- `_recalculate_progress` + `_push_update` also fired per gated tick

Cumulative load kept the asyncssh drain coroutine too busy to
consume the SSH channel buffer; SSH window stalled; sshd stopped
reading the pipe; badblocks blocked on pipe_write with state=S
wchan=pipe_write. /sys/block sectors_written delta confirmed
0 disk I/O across all running drives despite 23h elapsed.

Fix:
1. Single throttle gate (BB_DB_MIN_SECONDS=5s) covers EVERY DB
   touch in the drain — cancel check, percent/phase/bb_count
   updates, throughput sample, log flush, recalc, SSE push.
   Phase transitions still bypass the throttle (rare + important).
2. Exclude "XX% done" lines from the log entirely. They were the
   dominant volume; meaningful content (pattern headers, errors,
   bad-block numbers) still gets logged via the throttled flush.
3. log_text concat still quadratic but the volume reduction makes
   it tractable — buffer to pending_log_chunks, flush on the gate.

Net effect: ~99% reduction in drain-loop DB load. asyncssh drain
keeps up; pipe drains; badblocks writes; disk goes brr.
This commit is contained in:
Brandon Walter 2026-05-11 22:07:39 -07:00
parent c906ab15f7
commit 149f2901b7
2 changed files with 93 additions and 70 deletions

View file

@ -521,6 +521,9 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
# from the delta on each new sample. # from the delta on each new sample.
last_pct_sample: float = progress.overall_pct last_pct_sample: float = progress.overall_pct
last_db_write_ts: float = time.monotonic() last_db_write_ts: float = time.monotonic()
# Lines accumulated since last log flush. Flushed in the
# throttled DB-write window (see BB_DB_MIN_SECONDS).
pending_log_chunks: list[str] = []
# Seed bb_phase=1, bb_phase_pct=0 immediately so the # Seed bb_phase=1, bb_phase_pct=0 immediately so the
# drawer's per-pattern meters have something to render # drawer's per-pattern meters have something to render
@ -557,35 +560,52 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
pass pass
continue continue
output_lines.append(line)
if is_stderr:
# Drive progress.update from EVERY stderr line so it # Drive progress.update from EVERY stderr line so it
# picks up the "Testing with pattern 0xXX" + "Reading # picks up the "Testing with pattern 0xXX" + "Reading
# and comparing" headers (which advance the phase # and comparing" headers (which advance the phase
# counter), not just the percent-done lines. # counter), not just the percent-done lines. CPU-
# bound regex work, no I/O, safe to do unconditionally.
prev_phase = progress.phase prev_phase = progress.phase
phase_changed = False
is_progress_line = False
if is_stderr:
progress.update(line) progress.update(line)
phase_changed = progress.phase != prev_phase phase_changed = progress.phase != prev_phase
is_progress_line = bool(_BB_PERCENT_RE.search(line))
else:
stripped = line.strip()
if stripped and stripped.isdigit():
bad_blocks_total += 1
# Throttle DB writes. Each progress line used to # Keep the "XX% done" lines OUT of output_lines.
# trigger 4-6 transactions; with 4 concurrent # They're the dominant volume (sub-second cadence
# burn-ins and sub-second progress lines that # for hours) and the log_text concat is quadratic
# backs up the asyncssh drain → pipe fills → # in length. Headers, errors, bad-block numbers,
# badblocks blocks on pipe_write → no disk I/O. # and other diagnostic output still get logged.
# Commit at most once every BB_DB_MIN_SECONDS, if not is_progress_line:
# plus always on phase transitions (transitions output_lines.append(line)
# are rare AND important: they stamp history,
# advance the meter strip). # Single throttle gate covering EVERY DB touch in
# this loop. Without this, _is_cancelled + percent
# writes + chunk-log appends collectively overwhelm
# the asyncio loop, the asyncssh drain falls behind,
# the SSH window stops advancing, sshd stops reading
# the pipe, badblocks blocks on pipe_write and no
# disk I/O happens. Symptom: badblocks pid in
# wchan=pipe_write with /sys/block sectors_written
# delta of 0.
now_ts = time.monotonic() now_ts = time.monotonic()
time_since_last_db = now_ts - last_db_write_ts time_since_last_db = now_ts - last_db_write_ts
should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS
if should_write: if should_write:
# 1) Cancellation check (was per-line)
if await _is_cancelled(job_id):
await kill.kill_remote_process(job_id)
return
# 2) Per-pattern progress + history
if phase_changed: if phase_changed:
# Stamp the moment we first see this
# phase so the drawer can show
# per-pattern elapsed times.
await _record_bb_phase_start( await _record_bb_phase_start(
job_id, "surface_validate", job_id, "surface_validate",
progress.phase, _now(), progress.phase, _now(),
@ -600,10 +620,9 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
await _update_stage_bad_blocks( await _update_stage_bad_blocks(
job_id, "surface_validate", bad_blocks_total, job_id, "surface_validate", bad_blocks_total,
) )
# Throughput: compute since the last DB
# write, not since the last progress line. # 3) Throughput. Skip phase transitions (per-
# Skip phase transitions (per-phase pct # phase pct resets would yield a negative delta).
# resets would yield a negative delta).
if ( if (
drive_size_bytes drive_size_bytes
and not phase_changed and not phase_changed
@ -616,21 +635,29 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
await _update_stage_bb_mbps( await _update_stage_bb_mbps(
job_id, "surface_validate", mbps, job_id, "surface_validate", mbps,
) )
# 4) Log flush. Chunk what's accumulated since
# the last write. log_text concat is quadratic
# so the volume reduction from skipping "% done"
# lines above matters MORE than the throttle.
if pending_log_chunks:
chunk = "".join(pending_log_chunks)
pending_log_chunks.clear()
await _append_stage_log(
job_id, "surface_validate", chunk,
)
last_pct_sample = progress.overall_pct last_pct_sample = progress.overall_pct
last_db_write_ts = now_ts last_db_write_ts = now_ts
await _recalculate_progress(job_id) await _recalculate_progress(job_id)
_push_update() _push_update()
else:
stripped = line.strip()
if stripped and stripped.isdigit():
bad_blocks_total += 1
# Append to DB log in chunks # Buffer non-progress lines for the next flush.
if len(output_lines) % 20 == 0: if not is_progress_line:
chunk = "".join(output_lines[-20:]) pending_log_chunks.append(line)
await _append_stage_log(job_id, "surface_validate", chunk)
# Abort on bad block threshold # Abort on bad block threshold (immediate, not
# throttled — finding bad blocks is urgent).
if bad_blocks_total > settings.bad_block_threshold: if bad_blocks_total > settings.bad_block_threshold:
await kill.kill_remote_process(job_id) await kill.kill_remote_process(job_id)
output_lines.append( output_lines.append(
@ -639,10 +666,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
) )
return return
if await _is_cancelled(job_id):
await kill.kill_remote_process(job_id)
return
await asyncio.gather( await asyncio.gather(
_drain(proc.stdout, False), _drain(proc.stdout, False),
_drain(proc.stderr, True), _drain(proc.stderr, True),

View file

@ -86,7 +86,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers) ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint # Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-53" app_version: str = "1.0.0-54"
# ---- Authentication (1.0.0-22) ---- # ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate # session_secret: HMAC key for signing session cookies. Empty = generate