fix: badblocks progress tracks overall %, not per-phase (1.0.0-42)
`badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00), each with a write phase + a verify phase = 8 phases. The output's "XX% done" lines are per-phase, so the dashboard appeared to "rewind" every ~2 hours. Two drives racing each other could look 4× apart in displayed progress despite identical hardware — actually one was just further into a later phase. New _BadblocksProgress state machine watches for "Testing with pattern 0xXX" and "Reading and comparing" headers, advances the phase counter, and reports overall = ((phase-1) * 100 + phase_pct) / 8 clipped to 99. Pure state machine, no I/O. 7 new tests cover phase-header detection, boundary math, monotonicity across a synthetic stream, and the original "two drives at same per-phase % look identical" bug. Image rebuilt and tagged but NOT deployed to the running container — 4 surface-validate jobs are 20-95% through 14TB drives and a recreate would SIGHUP the remote badblocks processes. Deploy with `docker compose up -d` after the current batch finishes.
This commit is contained in:
parent
775251b993
commit
b406e3f315
3 changed files with 203 additions and 7 deletions
|
|
@ -25,6 +25,72 @@ class _BadblocksResult(TypedDict):
|
||||||
aborted: bool
|
aborted: bool
|
||||||
|
|
||||||
|
|
||||||
|
# `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00),
|
||||||
|
# each with a write phase followed by a read-back/verify phase = 8 phases.
|
||||||
|
# Per-phase percent comes back via `XX% done`; without translation, the
|
||||||
|
# dashboard appears to "rewind" every ~2 hours when a new phase starts.
|
||||||
|
_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7}
|
||||||
|
_BB_TOTAL_PHASES = 8
|
||||||
|
|
||||||
|
import re as _re_pre # noqa: E402
|
||||||
|
|
||||||
|
_BB_PATTERN_RE = _re_pre.compile(r"Testing with pattern\s+(0x[0-9a-fA-F]+)")
|
||||||
|
_BB_VERIFY_RE = _re_pre.compile(r"Reading and comparing")
|
||||||
|
_BB_PERCENT_RE = _re_pre.compile(r"([\d.]+)%\s+done")
|
||||||
|
|
||||||
|
|
||||||
|
class _BadblocksProgress:
|
||||||
|
"""Track which phase of `badblocks -w -p N` we're in so the
|
||||||
|
displayed percent maps to overall progress, not per-phase progress.
|
||||||
|
|
||||||
|
Pure state machine — no I/O. Feed it lines from the badblocks output
|
||||||
|
via :meth:`update`; read :attr:`overall_pct` after each call.
|
||||||
|
|
||||||
|
Behavior:
|
||||||
|
- Defaults to phase 1 (write 0xaa) before any header is seen.
|
||||||
|
- "Testing with pattern 0xXX" sets the phase to the write-phase index
|
||||||
|
for that pattern (1, 3, 5, or 7).
|
||||||
|
- "Reading and comparing" advances to the matching verify phase
|
||||||
|
(last_write_phase + 1).
|
||||||
|
- "XX% done" updates the in-phase percent.
|
||||||
|
- overall_pct = ((phase - 1) * 100 + phase_pct) / 8, clipped to 99
|
||||||
|
so we don't claim "100%" until the stage's success path explicitly
|
||||||
|
writes 100.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__slots__ = ("phase", "phase_pct", "_last_write_phase")
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.phase: int = 1
|
||||||
|
self.phase_pct: float = 0.0
|
||||||
|
self._last_write_phase: int = 1
|
||||||
|
|
||||||
|
def update(self, line: str) -> None:
|
||||||
|
m = _BB_PATTERN_RE.search(line)
|
||||||
|
if m:
|
||||||
|
p = m.group(1).lower()
|
||||||
|
if p in _BB_PATTERN_PHASE:
|
||||||
|
self.phase = _BB_PATTERN_PHASE[p]
|
||||||
|
self._last_write_phase = self.phase
|
||||||
|
self.phase_pct = 0.0
|
||||||
|
return
|
||||||
|
if _BB_VERIFY_RE.search(line):
|
||||||
|
self.phase = self._last_write_phase + 1
|
||||||
|
self.phase_pct = 0.0
|
||||||
|
return
|
||||||
|
m = _BB_PERCENT_RE.search(line)
|
||||||
|
if m:
|
||||||
|
try:
|
||||||
|
self.phase_pct = float(m.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@property
|
||||||
|
def overall_pct(self) -> int:
|
||||||
|
total = (self.phase - 1) * 100.0 + self.phase_pct
|
||||||
|
return min(99, int(total / _BB_TOTAL_PHASES))
|
||||||
|
|
||||||
|
|
||||||
def _build_badblocks_cmd(devname: str) -> str:
|
def _build_badblocks_cmd(devname: str) -> str:
|
||||||
"""Construct the wrapped badblocks command for a given device.
|
"""Construct the wrapped badblocks command for a given device.
|
||||||
|
|
||||||
|
|
@ -425,9 +491,8 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
#
|
#
|
||||||
cmd = _build_badblocks_cmd(devname)
|
cmd = _build_badblocks_cmd(devname)
|
||||||
async with conn.create_process(cmd) as proc:
|
async with conn.create_process(cmd) as proc:
|
||||||
import re as _re
|
|
||||||
|
|
||||||
pid_seen = False
|
pid_seen = False
|
||||||
|
progress = _BadblocksProgress()
|
||||||
|
|
||||||
async def _drain(stream, is_stderr: bool):
|
async def _drain(stream, is_stderr: bool):
|
||||||
nonlocal bad_blocks_total, pid_seen
|
nonlocal bad_blocks_total, pid_seen
|
||||||
|
|
@ -451,10 +516,16 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
||||||
output_lines.append(line)
|
output_lines.append(line)
|
||||||
|
|
||||||
if is_stderr:
|
if is_stderr:
|
||||||
m = _re.search(r"([\d.]+)%\s+done", line)
|
# Drive progress.update from EVERY stderr line so it
|
||||||
if m:
|
# picks up the "Testing with pattern 0xXX" + "Reading
|
||||||
pct = min(99, int(float(m.group(1))))
|
# and comparing" headers (which advance the phase
|
||||||
await _update_stage_percent(job_id, "surface_validate", pct)
|
# counter), not just the percent-done lines.
|
||||||
|
prev = progress.overall_pct
|
||||||
|
progress.update(line)
|
||||||
|
if progress.overall_pct != prev or _BB_PERCENT_RE.search(line):
|
||||||
|
await _update_stage_percent(
|
||||||
|
job_id, "surface_validate", progress.overall_pct,
|
||||||
|
)
|
||||||
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||||
await _recalculate_progress(job_id)
|
await _recalculate_progress(job_id)
|
||||||
_push_update()
|
_push_update()
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-41"
|
app_version: str = "1.0.0-42"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
125
tests/test_badblocks_progress.py
Normal file
125
tests/test_badblocks_progress.py
Normal file
|
|
@ -0,0 +1,125 @@
|
||||||
|
"""Verifies _BadblocksProgress translates per-phase badblocks output
|
||||||
|
into a monotonic 0-99% overall progress.
|
||||||
|
|
||||||
|
`badblocks -w` cycles through 4 patterns × {write, verify} = 8 phases.
|
||||||
|
Each phase prints "XX% done" relative to its own 0-100 range. Without
|
||||||
|
this translation the dashboard appeared to "rewind" every ~2 hours
|
||||||
|
when a new phase started — and two drives racing each other could
|
||||||
|
look 4× apart in displayed progress despite identical hardware.
|
||||||
|
|
||||||
|
Run inside the container image so app deps are present.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from app.burnin.stages import _BadblocksProgress
|
||||||
|
|
||||||
|
|
||||||
|
class TestBadblocksProgress(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_default_phase_one(self):
|
||||||
|
"""Before any header, treat as start of pattern-1 write."""
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
self.assertEqual(p.phase, 1)
|
||||||
|
self.assertEqual(p.overall_pct, 0)
|
||||||
|
|
||||||
|
def test_pattern_headers_set_phase(self):
|
||||||
|
"""0xaa→1, 0x55→3, 0xff→5, 0x00→7 (write phases)."""
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
for header, want in [
|
||||||
|
("Testing with pattern 0xaa: ", 1),
|
||||||
|
("Testing with pattern 0x55: ", 3),
|
||||||
|
("Testing with pattern 0xff: ", 5),
|
||||||
|
("Testing with pattern 0x00: ", 7),
|
||||||
|
]:
|
||||||
|
p.update(header)
|
||||||
|
self.assertEqual(p.phase, want, f"after {header!r}")
|
||||||
|
|
||||||
|
def test_verify_advances_to_next_phase(self):
|
||||||
|
"""`Reading and comparing` after `Testing with pattern 0x55`
|
||||||
|
(phase 3) advances to phase 4."""
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
p.update("Testing with pattern 0x55: 100.00% done")
|
||||||
|
self.assertEqual(p.phase, 3)
|
||||||
|
p.update("Reading and comparing: 0.00% done")
|
||||||
|
self.assertEqual(p.phase, 4)
|
||||||
|
|
||||||
|
def test_overall_pct_at_phase_boundaries(self):
|
||||||
|
"""Verify the math at each phase boundary: phase N at 100% =
|
||||||
|
N * 12.5% overall (clipped to 99 at the end)."""
|
||||||
|
cases = [
|
||||||
|
(1, 0.0, 0), # start of run
|
||||||
|
(1, 100.0, 12), # 100/800 = 12.5
|
||||||
|
(2, 100.0, 25), # 200/800
|
||||||
|
(4, 100.0, 50), # 400/800
|
||||||
|
(7, 100.0, 87), # 700/800
|
||||||
|
(8, 100.0, 99), # 800/800 → clipped to 99
|
||||||
|
]
|
||||||
|
for phase, phase_pct, want in cases:
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
p.phase = phase
|
||||||
|
p.phase_pct = phase_pct
|
||||||
|
self.assertEqual(
|
||||||
|
p.overall_pct, want,
|
||||||
|
f"phase={phase} phase_pct={phase_pct}",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_realistic_sequence(self):
|
||||||
|
"""End-to-end: feed a synthetic badblocks output stream and
|
||||||
|
check the overall percent stays monotonically non-decreasing."""
|
||||||
|
lines = [
|
||||||
|
"Testing with pattern 0xaa: ",
|
||||||
|
"10.00% done, 1:00:00 elapsed. (0/0/0 errors)",
|
||||||
|
"50.00% done, 5:00:00 elapsed. (0/0/0 errors)",
|
||||||
|
"99.99% done, 10:00:00 elapsed. (0/0/0 errors)",
|
||||||
|
"Reading and comparing: ",
|
||||||
|
"0.00% done, 10:00:01 elapsed. (0/0/0 errors)",
|
||||||
|
"50.00% done, 12:30:00 elapsed. (0/0/0 errors)",
|
||||||
|
"Testing with pattern 0x55: ",
|
||||||
|
"0.00% done, 15:00:00 elapsed. (0/0/0 errors)",
|
||||||
|
"50.00% done, 17:30:00 elapsed. (0/0/0 errors)",
|
||||||
|
]
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
seen = []
|
||||||
|
for line in lines:
|
||||||
|
p.update(line)
|
||||||
|
seen.append(p.overall_pct)
|
||||||
|
self.assertEqual(
|
||||||
|
seen, sorted(seen),
|
||||||
|
f"progress went backwards: {seen}",
|
||||||
|
)
|
||||||
|
# Sanity: by the time we're halfway through pattern-2 write
|
||||||
|
# (phase 3, 50%), we should report ((3-1)*100 + 50) / 8 = 31%.
|
||||||
|
self.assertEqual(seen[-1], 31)
|
||||||
|
|
||||||
|
def test_drives_at_different_phases_show_different_overall(self):
|
||||||
|
"""The original bug: two drives at the same per-phase 60%
|
||||||
|
but different phases used to look identical (both '60%').
|
||||||
|
Now they correctly diverge."""
|
||||||
|
slow = _BadblocksProgress()
|
||||||
|
slow.update("Testing with pattern 0xaa: ")
|
||||||
|
slow.update("60.00% done")
|
||||||
|
|
||||||
|
fast = _BadblocksProgress()
|
||||||
|
fast.update("Testing with pattern 0xaa: ")
|
||||||
|
fast.update("99.99% done")
|
||||||
|
fast.update("Reading and comparing: ")
|
||||||
|
fast.update("60.00% done")
|
||||||
|
|
||||||
|
# slow: 60/800 = 7%; fast: (1*100 + 60)/800 = 20%
|
||||||
|
self.assertEqual(slow.overall_pct, 7)
|
||||||
|
self.assertEqual(fast.overall_pct, 20)
|
||||||
|
|
||||||
|
def test_unknown_pattern_does_not_crash(self):
|
||||||
|
"""An unrecognized pattern (e.g. badblocks future versions or
|
||||||
|
custom patterns) just leaves phase unchanged."""
|
||||||
|
p = _BadblocksProgress()
|
||||||
|
p.update("Testing with pattern 0xab: ")
|
||||||
|
# phase stays at the default 1
|
||||||
|
self.assertEqual(p.phase, 1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Loading…
Add table
Reference in a new issue