fix: badblocks progress tracks overall %, not per-phase (1.0.0-42)
`badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00), each with a write phase + a verify phase = 8 phases. The output's "XX% done" lines are per-phase, so the dashboard appeared to "rewind" every ~2 hours. Two drives racing each other could look 4× apart in displayed progress despite identical hardware — actually one was just further into a later phase. New _BadblocksProgress state machine watches for "Testing with pattern 0xXX" and "Reading and comparing" headers, advances the phase counter, and reports overall = ((phase-1) * 100 + phase_pct) / 8 clipped to 99. Pure state machine, no I/O. 7 new tests cover phase-header detection, boundary math, monotonicity across a synthetic stream, and the original "two drives at same per-phase % look identical" bug. Image rebuilt and tagged but NOT deployed to the running container — 4 surface-validate jobs are 20-95% through 14TB drives and a recreate would SIGHUP the remote badblocks processes. Deploy with `docker compose up -d` after the current batch finishes.
This commit is contained in:
parent
775251b993
commit
b406e3f315
3 changed files with 203 additions and 7 deletions
|
|
@ -25,6 +25,72 @@ class _BadblocksResult(TypedDict):
|
|||
aborted: bool
|
||||
|
||||
|
||||
# `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00),
|
||||
# each with a write phase followed by a read-back/verify phase = 8 phases.
|
||||
# Per-phase percent comes back via `XX% done`; without translation, the
|
||||
# dashboard appears to "rewind" every ~2 hours when a new phase starts.
|
||||
_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7}
|
||||
_BB_TOTAL_PHASES = 8
|
||||
|
||||
import re as _re_pre # noqa: E402
|
||||
|
||||
_BB_PATTERN_RE = _re_pre.compile(r"Testing with pattern\s+(0x[0-9a-fA-F]+)")
|
||||
_BB_VERIFY_RE = _re_pre.compile(r"Reading and comparing")
|
||||
_BB_PERCENT_RE = _re_pre.compile(r"([\d.]+)%\s+done")
|
||||
|
||||
|
||||
class _BadblocksProgress:
|
||||
"""Track which phase of `badblocks -w -p N` we're in so the
|
||||
displayed percent maps to overall progress, not per-phase progress.
|
||||
|
||||
Pure state machine — no I/O. Feed it lines from the badblocks output
|
||||
via :meth:`update`; read :attr:`overall_pct` after each call.
|
||||
|
||||
Behavior:
|
||||
- Defaults to phase 1 (write 0xaa) before any header is seen.
|
||||
- "Testing with pattern 0xXX" sets the phase to the write-phase index
|
||||
for that pattern (1, 3, 5, or 7).
|
||||
- "Reading and comparing" advances to the matching verify phase
|
||||
(last_write_phase + 1).
|
||||
- "XX% done" updates the in-phase percent.
|
||||
- overall_pct = ((phase - 1) * 100 + phase_pct) / 8, clipped to 99
|
||||
so we don't claim "100%" until the stage's success path explicitly
|
||||
writes 100.
|
||||
"""
|
||||
|
||||
__slots__ = ("phase", "phase_pct", "_last_write_phase")
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.phase: int = 1
|
||||
self.phase_pct: float = 0.0
|
||||
self._last_write_phase: int = 1
|
||||
|
||||
def update(self, line: str) -> None:
|
||||
m = _BB_PATTERN_RE.search(line)
|
||||
if m:
|
||||
p = m.group(1).lower()
|
||||
if p in _BB_PATTERN_PHASE:
|
||||
self.phase = _BB_PATTERN_PHASE[p]
|
||||
self._last_write_phase = self.phase
|
||||
self.phase_pct = 0.0
|
||||
return
|
||||
if _BB_VERIFY_RE.search(line):
|
||||
self.phase = self._last_write_phase + 1
|
||||
self.phase_pct = 0.0
|
||||
return
|
||||
m = _BB_PERCENT_RE.search(line)
|
||||
if m:
|
||||
try:
|
||||
self.phase_pct = float(m.group(1))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
@property
|
||||
def overall_pct(self) -> int:
|
||||
total = (self.phase - 1) * 100.0 + self.phase_pct
|
||||
return min(99, int(total / _BB_TOTAL_PHASES))
|
||||
|
||||
|
||||
def _build_badblocks_cmd(devname: str) -> str:
|
||||
"""Construct the wrapped badblocks command for a given device.
|
||||
|
||||
|
|
@ -425,9 +491,8 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
#
|
||||
cmd = _build_badblocks_cmd(devname)
|
||||
async with conn.create_process(cmd) as proc:
|
||||
import re as _re
|
||||
|
||||
pid_seen = False
|
||||
progress = _BadblocksProgress()
|
||||
|
||||
async def _drain(stream, is_stderr: bool):
|
||||
nonlocal bad_blocks_total, pid_seen
|
||||
|
|
@ -451,10 +516,16 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
output_lines.append(line)
|
||||
|
||||
if is_stderr:
|
||||
m = _re.search(r"([\d.]+)%\s+done", line)
|
||||
if m:
|
||||
pct = min(99, int(float(m.group(1))))
|
||||
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||
# Drive progress.update from EVERY stderr line so it
|
||||
# picks up the "Testing with pattern 0xXX" + "Reading
|
||||
# and comparing" headers (which advance the phase
|
||||
# counter), not just the percent-done lines.
|
||||
prev = progress.overall_pct
|
||||
progress.update(line)
|
||||
if progress.overall_pct != prev or _BB_PERCENT_RE.search(line):
|
||||
await _update_stage_percent(
|
||||
job_id, "surface_validate", progress.overall_pct,
|
||||
)
|
||||
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||
await _recalculate_progress(job_id)
|
||||
_push_update()
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-41"
|
||||
app_version: str = "1.0.0-42"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
125
tests/test_badblocks_progress.py
Normal file
125
tests/test_badblocks_progress.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""Verifies _BadblocksProgress translates per-phase badblocks output
|
||||
into a monotonic 0-99% overall progress.
|
||||
|
||||
`badblocks -w` cycles through 4 patterns × {write, verify} = 8 phases.
|
||||
Each phase prints "XX% done" relative to its own 0-100 range. Without
|
||||
this translation the dashboard appeared to "rewind" every ~2 hours
|
||||
when a new phase started — and two drives racing each other could
|
||||
look 4× apart in displayed progress despite identical hardware.
|
||||
|
||||
Run inside the container image so app deps are present.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from app.burnin.stages import _BadblocksProgress
|
||||
|
||||
|
||||
class TestBadblocksProgress(unittest.TestCase):
|
||||
|
||||
def test_default_phase_one(self):
|
||||
"""Before any header, treat as start of pattern-1 write."""
|
||||
p = _BadblocksProgress()
|
||||
self.assertEqual(p.phase, 1)
|
||||
self.assertEqual(p.overall_pct, 0)
|
||||
|
||||
def test_pattern_headers_set_phase(self):
|
||||
"""0xaa→1, 0x55→3, 0xff→5, 0x00→7 (write phases)."""
|
||||
p = _BadblocksProgress()
|
||||
for header, want in [
|
||||
("Testing with pattern 0xaa: ", 1),
|
||||
("Testing with pattern 0x55: ", 3),
|
||||
("Testing with pattern 0xff: ", 5),
|
||||
("Testing with pattern 0x00: ", 7),
|
||||
]:
|
||||
p.update(header)
|
||||
self.assertEqual(p.phase, want, f"after {header!r}")
|
||||
|
||||
def test_verify_advances_to_next_phase(self):
|
||||
"""`Reading and comparing` after `Testing with pattern 0x55`
|
||||
(phase 3) advances to phase 4."""
|
||||
p = _BadblocksProgress()
|
||||
p.update("Testing with pattern 0x55: 100.00% done")
|
||||
self.assertEqual(p.phase, 3)
|
||||
p.update("Reading and comparing: 0.00% done")
|
||||
self.assertEqual(p.phase, 4)
|
||||
|
||||
def test_overall_pct_at_phase_boundaries(self):
|
||||
"""Verify the math at each phase boundary: phase N at 100% =
|
||||
N * 12.5% overall (clipped to 99 at the end)."""
|
||||
cases = [
|
||||
(1, 0.0, 0), # start of run
|
||||
(1, 100.0, 12), # 100/800 = 12.5
|
||||
(2, 100.0, 25), # 200/800
|
||||
(4, 100.0, 50), # 400/800
|
||||
(7, 100.0, 87), # 700/800
|
||||
(8, 100.0, 99), # 800/800 → clipped to 99
|
||||
]
|
||||
for phase, phase_pct, want in cases:
|
||||
p = _BadblocksProgress()
|
||||
p.phase = phase
|
||||
p.phase_pct = phase_pct
|
||||
self.assertEqual(
|
||||
p.overall_pct, want,
|
||||
f"phase={phase} phase_pct={phase_pct}",
|
||||
)
|
||||
|
||||
def test_realistic_sequence(self):
|
||||
"""End-to-end: feed a synthetic badblocks output stream and
|
||||
check the overall percent stays monotonically non-decreasing."""
|
||||
lines = [
|
||||
"Testing with pattern 0xaa: ",
|
||||
"10.00% done, 1:00:00 elapsed. (0/0/0 errors)",
|
||||
"50.00% done, 5:00:00 elapsed. (0/0/0 errors)",
|
||||
"99.99% done, 10:00:00 elapsed. (0/0/0 errors)",
|
||||
"Reading and comparing: ",
|
||||
"0.00% done, 10:00:01 elapsed. (0/0/0 errors)",
|
||||
"50.00% done, 12:30:00 elapsed. (0/0/0 errors)",
|
||||
"Testing with pattern 0x55: ",
|
||||
"0.00% done, 15:00:00 elapsed. (0/0/0 errors)",
|
||||
"50.00% done, 17:30:00 elapsed. (0/0/0 errors)",
|
||||
]
|
||||
p = _BadblocksProgress()
|
||||
seen = []
|
||||
for line in lines:
|
||||
p.update(line)
|
||||
seen.append(p.overall_pct)
|
||||
self.assertEqual(
|
||||
seen, sorted(seen),
|
||||
f"progress went backwards: {seen}",
|
||||
)
|
||||
# Sanity: by the time we're halfway through pattern-2 write
|
||||
# (phase 3, 50%), we should report ((3-1)*100 + 50) / 8 = 31%.
|
||||
self.assertEqual(seen[-1], 31)
|
||||
|
||||
def test_drives_at_different_phases_show_different_overall(self):
|
||||
"""The original bug: two drives at the same per-phase 60%
|
||||
but different phases used to look identical (both '60%').
|
||||
Now they correctly diverge."""
|
||||
slow = _BadblocksProgress()
|
||||
slow.update("Testing with pattern 0xaa: ")
|
||||
slow.update("60.00% done")
|
||||
|
||||
fast = _BadblocksProgress()
|
||||
fast.update("Testing with pattern 0xaa: ")
|
||||
fast.update("99.99% done")
|
||||
fast.update("Reading and comparing: ")
|
||||
fast.update("60.00% done")
|
||||
|
||||
# slow: 60/800 = 7%; fast: (1*100 + 60)/800 = 20%
|
||||
self.assertEqual(slow.overall_pct, 7)
|
||||
self.assertEqual(fast.overall_pct, 20)
|
||||
|
||||
def test_unknown_pattern_does_not_crash(self):
|
||||
"""An unrecognized pattern (e.g. badblocks future versions or
|
||||
custom patterns) just leaves phase unchanged."""
|
||||
p = _BadblocksProgress()
|
||||
p.update("Testing with pattern 0xab: ")
|
||||
# phase stays at the default 1
|
||||
self.assertEqual(p.phase, 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Loading…
Add table
Reference in a new issue