fix: throttle ALL drain-loop DB calls + drop progress noise from log (1.0.0-54)
1.0.0-52 throttled the percent/bb_phase writes but missed: - `_is_cancelled` ran a DB query on EVERY stderr line (sub-second cadence × 4 concurrent burn-ins = ~10+ DB connection opens/s) - `_append_stage_log` ran every 20 output_lines (~once per second) doing a quadratic `log_text || ?` concat that gets multi-MB rewrites as the log grows - `_recalculate_progress` + `_push_update` also fired per gated tick Cumulative load kept the asyncssh drain coroutine too busy to consume the SSH channel buffer; SSH window stalled; sshd stopped reading the pipe; badblocks blocked on pipe_write with state=S wchan=pipe_write. /sys/block sectors_written delta confirmed 0 disk I/O across all running drives despite 23h elapsed. Fix: 1. Single throttle gate (BB_DB_MIN_SECONDS=5s) covers EVERY DB touch in the drain — cancel check, percent/phase/bb_count updates, throughput sample, log flush, recalc, SSE push. Phase transitions still bypass the throttle (rare + important). 2. Exclude "XX% done" lines from the log entirely. They were the dominant volume; meaningful content (pattern headers, errors, bad-block numbers) still gets logged via the throttled flush. 3. log_text concat still quadratic but the volume reduction makes it tractable — buffer to pending_log_chunks, flush on the gate. Net effect: ~99% reduction in drain-loop DB load. asyncssh drain keeps up; pipe drains; badblocks writes; disk goes brr.
This commit is contained in:
parent
c906ab15f7
commit
149f2901b7
2 changed files with 93 additions and 70 deletions
|
|
@ -521,6 +521,9 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
# from the delta on each new sample.
|
# from the delta on each new sample.
|
||||||
last_pct_sample: float = progress.overall_pct
|
last_pct_sample: float = progress.overall_pct
|
||||||
last_db_write_ts: float = time.monotonic()
|
last_db_write_ts: float = time.monotonic()
|
||||||
|
# Lines accumulated since last log flush. Flushed in the
|
||||||
|
# throttled DB-write window (see BB_DB_MIN_SECONDS).
|
||||||
|
pending_log_chunks: list[str] = []
|
||||||
|
|
||||||
# Seed bb_phase=1, bb_phase_pct=0 immediately so the
|
# Seed bb_phase=1, bb_phase_pct=0 immediately so the
|
||||||
# drawer's per-pattern meters have something to render
|
# drawer's per-pattern meters have something to render
|
||||||
|
|
@ -557,35 +560,52 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
pass
|
pass
|
||||||
continue
|
continue
|
||||||
|
|
||||||
output_lines.append(line)
|
|
||||||
|
|
||||||
if is_stderr:
|
|
||||||
# Drive progress.update from EVERY stderr line so it
|
# Drive progress.update from EVERY stderr line so it
|
||||||
# picks up the "Testing with pattern 0xXX" + "Reading
|
# picks up the "Testing with pattern 0xXX" + "Reading
|
||||||
# and comparing" headers (which advance the phase
|
# and comparing" headers (which advance the phase
|
||||||
# counter), not just the percent-done lines.
|
# counter), not just the percent-done lines. CPU-
|
||||||
|
# bound regex work, no I/O, safe to do unconditionally.
|
||||||
prev_phase = progress.phase
|
prev_phase = progress.phase
|
||||||
|
phase_changed = False
|
||||||
|
is_progress_line = False
|
||||||
|
if is_stderr:
|
||||||
progress.update(line)
|
progress.update(line)
|
||||||
phase_changed = progress.phase != prev_phase
|
phase_changed = progress.phase != prev_phase
|
||||||
|
is_progress_line = bool(_BB_PERCENT_RE.search(line))
|
||||||
|
else:
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and stripped.isdigit():
|
||||||
|
bad_blocks_total += 1
|
||||||
|
|
||||||
# Throttle DB writes. Each progress line used to
|
# Keep the "XX% done" lines OUT of output_lines.
|
||||||
# trigger 4-6 transactions; with 4 concurrent
|
# They're the dominant volume (sub-second cadence
|
||||||
# burn-ins and sub-second progress lines that
|
# for hours) and the log_text concat is quadratic
|
||||||
# backs up the asyncssh drain → pipe fills →
|
# in length. Headers, errors, bad-block numbers,
|
||||||
# badblocks blocks on pipe_write → no disk I/O.
|
# and other diagnostic output still get logged.
|
||||||
# Commit at most once every BB_DB_MIN_SECONDS,
|
if not is_progress_line:
|
||||||
# plus always on phase transitions (transitions
|
output_lines.append(line)
|
||||||
# are rare AND important: they stamp history,
|
|
||||||
# advance the meter strip).
|
# Single throttle gate covering EVERY DB touch in
|
||||||
|
# this loop. Without this, _is_cancelled + percent
|
||||||
|
# writes + chunk-log appends collectively overwhelm
|
||||||
|
# the asyncio loop, the asyncssh drain falls behind,
|
||||||
|
# the SSH window stops advancing, sshd stops reading
|
||||||
|
# the pipe, badblocks blocks on pipe_write and no
|
||||||
|
# disk I/O happens. Symptom: badblocks pid in
|
||||||
|
# wchan=pipe_write with /sys/block sectors_written
|
||||||
|
# delta of 0.
|
||||||
now_ts = time.monotonic()
|
now_ts = time.monotonic()
|
||||||
time_since_last_db = now_ts - last_db_write_ts
|
time_since_last_db = now_ts - last_db_write_ts
|
||||||
should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS
|
should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS
|
||||||
|
|
||||||
if should_write:
|
if should_write:
|
||||||
|
# 1) Cancellation check (was per-line)
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
await kill.kill_remote_process(job_id)
|
||||||
|
return
|
||||||
|
|
||||||
|
# 2) Per-pattern progress + history
|
||||||
if phase_changed:
|
if phase_changed:
|
||||||
# Stamp the moment we first see this
|
|
||||||
# phase so the drawer can show
|
|
||||||
# per-pattern elapsed times.
|
|
||||||
await _record_bb_phase_start(
|
await _record_bb_phase_start(
|
||||||
job_id, "surface_validate",
|
job_id, "surface_validate",
|
||||||
progress.phase, _now(),
|
progress.phase, _now(),
|
||||||
|
|
@ -600,10 +620,9 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
await _update_stage_bad_blocks(
|
await _update_stage_bad_blocks(
|
||||||
job_id, "surface_validate", bad_blocks_total,
|
job_id, "surface_validate", bad_blocks_total,
|
||||||
)
|
)
|
||||||
# Throughput: compute since the last DB
|
|
||||||
# write, not since the last progress line.
|
# 3) Throughput. Skip phase transitions (per-
|
||||||
# Skip phase transitions (per-phase pct
|
# phase pct resets would yield a negative delta).
|
||||||
# resets would yield a negative delta).
|
|
||||||
if (
|
if (
|
||||||
drive_size_bytes
|
drive_size_bytes
|
||||||
and not phase_changed
|
and not phase_changed
|
||||||
|
|
@ -616,21 +635,29 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
await _update_stage_bb_mbps(
|
await _update_stage_bb_mbps(
|
||||||
job_id, "surface_validate", mbps,
|
job_id, "surface_validate", mbps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 4) Log flush. Chunk what's accumulated since
|
||||||
|
# the last write. log_text concat is quadratic
|
||||||
|
# so the volume reduction from skipping "% done"
|
||||||
|
# lines above matters MORE than the throttle.
|
||||||
|
if pending_log_chunks:
|
||||||
|
chunk = "".join(pending_log_chunks)
|
||||||
|
pending_log_chunks.clear()
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, "surface_validate", chunk,
|
||||||
|
)
|
||||||
|
|
||||||
last_pct_sample = progress.overall_pct
|
last_pct_sample = progress.overall_pct
|
||||||
last_db_write_ts = now_ts
|
last_db_write_ts = now_ts
|
||||||
await _recalculate_progress(job_id)
|
await _recalculate_progress(job_id)
|
||||||
_push_update()
|
_push_update()
|
||||||
else:
|
|
||||||
stripped = line.strip()
|
|
||||||
if stripped and stripped.isdigit():
|
|
||||||
bad_blocks_total += 1
|
|
||||||
|
|
||||||
# Append to DB log in chunks
|
# Buffer non-progress lines for the next flush.
|
||||||
if len(output_lines) % 20 == 0:
|
if not is_progress_line:
|
||||||
chunk = "".join(output_lines[-20:])
|
pending_log_chunks.append(line)
|
||||||
await _append_stage_log(job_id, "surface_validate", chunk)
|
|
||||||
|
|
||||||
# Abort on bad block threshold
|
# Abort on bad block threshold (immediate, not
|
||||||
|
# throttled — finding bad blocks is urgent).
|
||||||
if bad_blocks_total > settings.bad_block_threshold:
|
if bad_blocks_total > settings.bad_block_threshold:
|
||||||
await kill.kill_remote_process(job_id)
|
await kill.kill_remote_process(job_id)
|
||||||
output_lines.append(
|
output_lines.append(
|
||||||
|
|
@ -639,10 +666,6 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if await _is_cancelled(job_id):
|
|
||||||
await kill.kill_remote_process(job_id)
|
|
||||||
return
|
|
||||||
|
|
||||||
await asyncio.gather(
|
await asyncio.gather(
|
||||||
_drain(proc.stdout, False),
|
_drain(proc.stdout, False),
|
||||||
_drain(proc.stderr, True),
|
_drain(proc.stderr, True),
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-53"
|
app_version: str = "1.0.0-54"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue