fix: throttle badblocks parser DB writes (1.0.0-52)
User reported sdb showing 134-day ETA. Investigation: badblocks processes all stuck in pipe_write wchan, iostat showing 0 throughput across all drives despite badblocks supposedly running. Root cause: each progress line was triggering 4-6 DB transactions (_update_stage_percent, _update_stage_bb_phase, _update_stage_bad_blocks, _update_stage_bb_mbps, _record_bb_phase_start, _recalculate_progress). With 4 concurrent burn-ins and sub-second progress lines, the asyncssh _drain couldn't keep up. Drain fell behind → asyncssh channel buffer filled → SSH window stopped advancing → sshd stopped reading from badblocks's stdout pipe → pipe filled → badblocks blocked on pipe_write() → no more disk I/O. That regression came in across 1.0.0-44 → 1.0.0-47 as I added each new persisted field. The previous per-line write path worked when there was only one DB call; it doesn't with five-plus. Fix: BB_DB_MIN_SECONDS=5 throttle on the DB-write path. The drain loop still consumes every progress line (so the pipe drains continuously), but commits to DB at most once every 5 seconds. Phase transitions always commit immediately (rare and important — they stamp bb_phase_history and advance the per-pattern meter). UI impact: minimal — drawer polls drives every ~12s anyway, so the displayed % was already at 12s resolution. The meter strip just won't sub-tick within a 5s window. DB load impact: 60-80x reduction during surface_validate.
This commit is contained in:
parent
2107981cf1
commit
c5a41d0260
2 changed files with 40 additions and 23 deletions
|
|
@ -31,6 +31,13 @@ class _BadblocksResult(TypedDict):
|
|||
# dashboard appears to "rewind" every ~2 hours when a new phase starts.
|
||||
_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7}
|
||||
_BB_TOTAL_PHASES = 8
|
||||
# Throttle DB writes from the badblocks parser. Each progress line used
|
||||
# to trigger 4-6 transactions; with 4 concurrent burn-ins emitting sub-
|
||||
# second progress lines, the asyncssh drain couldn't keep up — the
|
||||
# stdout pipe on TrueNAS filled, badblocks blocked on pipe_write,
|
||||
# disk I/O effectively stopped. 5 seconds is fine for the UI (drawer
|
||||
# polls every ~12s anyway) and cuts DB load 60-80x.
|
||||
BB_DB_MIN_SECONDS = 5.0
|
||||
|
||||
import re as _re_pre # noqa: E402
|
||||
|
||||
|
|
@ -513,7 +520,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
# of the previous progress sample so we can compute MB/s
|
||||
# from the delta on each new sample.
|
||||
last_pct_sample: float = progress.overall_pct
|
||||
last_pct_ts: float = time.monotonic()
|
||||
last_db_write_ts: float = time.monotonic()
|
||||
|
||||
# Seed bb_phase=1, bb_phase_pct=0 immediately so the
|
||||
# drawer's per-pattern meters have something to render
|
||||
|
|
@ -557,12 +564,24 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
# picks up the "Testing with pattern 0xXX" + "Reading
|
||||
# and comparing" headers (which advance the phase
|
||||
# counter), not just the percent-done lines.
|
||||
prev = progress.overall_pct
|
||||
prev_phase = progress.phase
|
||||
progress.update(line)
|
||||
phase_changed = progress.phase != prev_phase
|
||||
pct_changed = progress.overall_pct != prev
|
||||
if phase_changed or pct_changed or _BB_PERCENT_RE.search(line):
|
||||
|
||||
# Throttle DB writes. Each progress line used to
|
||||
# trigger 4-6 transactions; with 4 concurrent
|
||||
# burn-ins and sub-second progress lines that
|
||||
# backs up the asyncssh drain → pipe fills →
|
||||
# badblocks blocks on pipe_write → no disk I/O.
|
||||
# Commit at most once every BB_DB_MIN_SECONDS,
|
||||
# plus always on phase transitions (transitions
|
||||
# are rare AND important: they stamp history,
|
||||
# advance the meter strip).
|
||||
now_ts = time.monotonic()
|
||||
time_since_last_db = now_ts - last_db_write_ts
|
||||
should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS
|
||||
|
||||
if should_write:
|
||||
if phase_changed:
|
||||
# Stamp the moment we first see this
|
||||
# phase so the drawer can show
|
||||
|
|
@ -578,29 +597,27 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
job_id, "surface_validate",
|
||||
progress.phase, progress.phase_pct,
|
||||
)
|
||||
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||
# Throughput: only on real percent advances
|
||||
# (skip header-only ticks where pct didn't move).
|
||||
# Skip phase transitions too — the per-phase
|
||||
# pct resets and would produce a negative delta.
|
||||
await _update_stage_bad_blocks(
|
||||
job_id, "surface_validate", bad_blocks_total,
|
||||
)
|
||||
# Throughput: compute since the last DB
|
||||
# write, not since the last progress line.
|
||||
# Skip phase transitions (per-phase pct
|
||||
# resets would yield a negative delta).
|
||||
if (
|
||||
drive_size_bytes
|
||||
and pct_changed
|
||||
and not phase_changed
|
||||
and progress.overall_pct > last_pct_sample
|
||||
and time_since_last_db >= 1.0
|
||||
):
|
||||
now_ts = time.monotonic()
|
||||
dt = now_ts - last_pct_ts
|
||||
if dt >= 1.0: # avoid noisy sub-second samples
|
||||
d_pct = progress.overall_pct - last_pct_sample
|
||||
# Each 1% overall = drive_size / 800 bytes
|
||||
bytes_done = (d_pct / 800.0) * drive_size_bytes
|
||||
mbps = bytes_done / dt / 1_000_000
|
||||
mbps = bytes_done / time_since_last_db / 1_000_000
|
||||
await _update_stage_bb_mbps(
|
||||
job_id, "surface_validate", mbps,
|
||||
)
|
||||
last_pct_sample = progress.overall_pct
|
||||
last_pct_ts = now_ts
|
||||
last_db_write_ts = now_ts
|
||||
await _recalculate_progress(job_id)
|
||||
_push_update()
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-51"
|
||||
app_version: str = "1.0.0-52"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue