fix: badblocks progress tracks overall %, not per-phase (1.0.0-42)
Some checks failed
Security scan / pip-audit (push) Has been cancelled
Security scan / bandit (push) Has been cancelled
Security scan / gitleaks (push) Has been cancelled
Security scan / mypy (push) Has been cancelled

`badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00),
each with a write phase + a verify phase = 8 phases. The output's
"XX% done" lines are per-phase, so the dashboard appeared to "rewind"
every ~2 hours. Two drives racing each other could look 4× apart in
displayed progress despite identical hardware — actually one was
just further into a later phase.

New _BadblocksProgress state machine watches for "Testing with
pattern 0xXX" and "Reading and comparing" headers, advances the
phase counter, and reports overall = ((phase-1) * 100 + phase_pct) / 8
clipped to 99. Pure state machine, no I/O.

7 new tests cover phase-header detection, boundary math, monotonicity
across a synthetic stream, and the original "two drives at same
per-phase % look identical" bug.

Image rebuilt and tagged but NOT deployed to the running container —
4 surface-validate jobs are 20-95% through 14TB drives and a recreate
would SIGHUP the remote badblocks processes. Deploy with
`docker compose up -d` after the current batch finishes.
This commit is contained in:
Brandon Walter 2026-05-05 07:26:23 -07:00
parent 775251b993
commit b406e3f315
3 changed files with 203 additions and 7 deletions

View file

@ -25,6 +25,72 @@ class _BadblocksResult(TypedDict):
aborted: bool
# `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00),
# each with a write phase followed by a read-back/verify phase = 8 phases.
# Per-phase percent comes back via `XX% done`; without translation, the
# dashboard appears to "rewind" every ~2 hours when a new phase starts.
_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7}
_BB_TOTAL_PHASES = 8
import re as _re_pre # noqa: E402
_BB_PATTERN_RE = _re_pre.compile(r"Testing with pattern\s+(0x[0-9a-fA-F]+)")
_BB_VERIFY_RE = _re_pre.compile(r"Reading and comparing")
_BB_PERCENT_RE = _re_pre.compile(r"([\d.]+)%\s+done")
class _BadblocksProgress:
"""Track which phase of `badblocks -w -p N` we're in so the
displayed percent maps to overall progress, not per-phase progress.
Pure state machine no I/O. Feed it lines from the badblocks output
via :meth:`update`; read :attr:`overall_pct` after each call.
Behavior:
- Defaults to phase 1 (write 0xaa) before any header is seen.
- "Testing with pattern 0xXX" sets the phase to the write-phase index
for that pattern (1, 3, 5, or 7).
- "Reading and comparing" advances to the matching verify phase
(last_write_phase + 1).
- "XX% done" updates the in-phase percent.
- overall_pct = ((phase - 1) * 100 + phase_pct) / 8, clipped to 99
so we don't claim "100%" until the stage's success path explicitly
writes 100.
"""
__slots__ = ("phase", "phase_pct", "_last_write_phase")
def __init__(self) -> None:
self.phase: int = 1
self.phase_pct: float = 0.0
self._last_write_phase: int = 1
def update(self, line: str) -> None:
m = _BB_PATTERN_RE.search(line)
if m:
p = m.group(1).lower()
if p in _BB_PATTERN_PHASE:
self.phase = _BB_PATTERN_PHASE[p]
self._last_write_phase = self.phase
self.phase_pct = 0.0
return
if _BB_VERIFY_RE.search(line):
self.phase = self._last_write_phase + 1
self.phase_pct = 0.0
return
m = _BB_PERCENT_RE.search(line)
if m:
try:
self.phase_pct = float(m.group(1))
except ValueError:
pass
@property
def overall_pct(self) -> int:
total = (self.phase - 1) * 100.0 + self.phase_pct
return min(99, int(total / _BB_TOTAL_PHASES))
def _build_badblocks_cmd(devname: str) -> str:
"""Construct the wrapped badblocks command for a given device.
@ -425,9 +491,8 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
#
cmd = _build_badblocks_cmd(devname)
async with conn.create_process(cmd) as proc:
import re as _re
pid_seen = False
progress = _BadblocksProgress()
async def _drain(stream, is_stderr: bool):
nonlocal bad_blocks_total, pid_seen
@ -451,10 +516,16 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
output_lines.append(line)
if is_stderr:
m = _re.search(r"([\d.]+)%\s+done", line)
if m:
pct = min(99, int(float(m.group(1))))
await _update_stage_percent(job_id, "surface_validate", pct)
# Drive progress.update from EVERY stderr line so it
# picks up the "Testing with pattern 0xXX" + "Reading
# and comparing" headers (which advance the phase
# counter), not just the percent-done lines.
prev = progress.overall_pct
progress.update(line)
if progress.overall_pct != prev or _BB_PERCENT_RE.search(line):
await _update_stage_percent(
job_id, "surface_validate", progress.overall_pct,
)
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
await _recalculate_progress(job_id)
_push_update()

View file

@ -83,7 +83,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-41"
app_version: str = "1.0.0-42"
# ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate

View file

@ -0,0 +1,125 @@
"""Verifies _BadblocksProgress translates per-phase badblocks output
into a monotonic 0-99% overall progress.
`badblocks -w` cycles through 4 patterns × {write, verify} = 8 phases.
Each phase prints "XX% done" relative to its own 0-100 range. Without
this translation the dashboard appeared to "rewind" every ~2 hours
when a new phase started and two drives racing each other could
look 4× apart in displayed progress despite identical hardware.
Run inside the container image so app deps are present.
"""
from __future__ import annotations
import unittest
from app.burnin.stages import _BadblocksProgress
class TestBadblocksProgress(unittest.TestCase):
def test_default_phase_one(self):
"""Before any header, treat as start of pattern-1 write."""
p = _BadblocksProgress()
self.assertEqual(p.phase, 1)
self.assertEqual(p.overall_pct, 0)
def test_pattern_headers_set_phase(self):
"""0xaa→1, 0x55→3, 0xff→5, 0x00→7 (write phases)."""
p = _BadblocksProgress()
for header, want in [
("Testing with pattern 0xaa: ", 1),
("Testing with pattern 0x55: ", 3),
("Testing with pattern 0xff: ", 5),
("Testing with pattern 0x00: ", 7),
]:
p.update(header)
self.assertEqual(p.phase, want, f"after {header!r}")
def test_verify_advances_to_next_phase(self):
"""`Reading and comparing` after `Testing with pattern 0x55`
(phase 3) advances to phase 4."""
p = _BadblocksProgress()
p.update("Testing with pattern 0x55: 100.00% done")
self.assertEqual(p.phase, 3)
p.update("Reading and comparing: 0.00% done")
self.assertEqual(p.phase, 4)
def test_overall_pct_at_phase_boundaries(self):
"""Verify the math at each phase boundary: phase N at 100% =
N * 12.5% overall (clipped to 99 at the end)."""
cases = [
(1, 0.0, 0), # start of run
(1, 100.0, 12), # 100/800 = 12.5
(2, 100.0, 25), # 200/800
(4, 100.0, 50), # 400/800
(7, 100.0, 87), # 700/800
(8, 100.0, 99), # 800/800 → clipped to 99
]
for phase, phase_pct, want in cases:
p = _BadblocksProgress()
p.phase = phase
p.phase_pct = phase_pct
self.assertEqual(
p.overall_pct, want,
f"phase={phase} phase_pct={phase_pct}",
)
def test_realistic_sequence(self):
"""End-to-end: feed a synthetic badblocks output stream and
check the overall percent stays monotonically non-decreasing."""
lines = [
"Testing with pattern 0xaa: ",
"10.00% done, 1:00:00 elapsed. (0/0/0 errors)",
"50.00% done, 5:00:00 elapsed. (0/0/0 errors)",
"99.99% done, 10:00:00 elapsed. (0/0/0 errors)",
"Reading and comparing: ",
"0.00% done, 10:00:01 elapsed. (0/0/0 errors)",
"50.00% done, 12:30:00 elapsed. (0/0/0 errors)",
"Testing with pattern 0x55: ",
"0.00% done, 15:00:00 elapsed. (0/0/0 errors)",
"50.00% done, 17:30:00 elapsed. (0/0/0 errors)",
]
p = _BadblocksProgress()
seen = []
for line in lines:
p.update(line)
seen.append(p.overall_pct)
self.assertEqual(
seen, sorted(seen),
f"progress went backwards: {seen}",
)
# Sanity: by the time we're halfway through pattern-2 write
# (phase 3, 50%), we should report ((3-1)*100 + 50) / 8 = 31%.
self.assertEqual(seen[-1], 31)
def test_drives_at_different_phases_show_different_overall(self):
"""The original bug: two drives at the same per-phase 60%
but different phases used to look identical (both '60%').
Now they correctly diverge."""
slow = _BadblocksProgress()
slow.update("Testing with pattern 0xaa: ")
slow.update("60.00% done")
fast = _BadblocksProgress()
fast.update("Testing with pattern 0xaa: ")
fast.update("99.99% done")
fast.update("Reading and comparing: ")
fast.update("60.00% done")
# slow: 60/800 = 7%; fast: (1*100 + 60)/800 = 20%
self.assertEqual(slow.overall_pct, 7)
self.assertEqual(fast.overall_pct, 20)
def test_unknown_pattern_does_not_crash(self):
"""An unrecognized pattern (e.g. badblocks future versions or
custom patterns) just leaves phase unchanged."""
p = _BadblocksProgress()
p.update("Testing with pattern 0xab: ")
# phase stays at the default 1
self.assertEqual(p.phase, 1)
if __name__ == "__main__":
unittest.main()