From b406e3f315e53a39179db9bcafbd5f492cb73282 Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Tue, 5 May 2026 07:26:23 -0700 Subject: [PATCH] fix: badblocks progress tracks overall %, not per-phase (1.0.0-42) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00), each with a write phase + a verify phase = 8 phases. The output's "XX% done" lines are per-phase, so the dashboard appeared to "rewind" every ~2 hours. Two drives racing each other could look 4× apart in displayed progress despite identical hardware — actually one was just further into a later phase. New _BadblocksProgress state machine watches for "Testing with pattern 0xXX" and "Reading and comparing" headers, advances the phase counter, and reports overall = ((phase-1) * 100 + phase_pct) / 8 clipped to 99. Pure state machine, no I/O. 7 new tests cover phase-header detection, boundary math, monotonicity across a synthetic stream, and the original "two drives at same per-phase % look identical" bug. Image rebuilt and tagged but NOT deployed to the running container — 4 surface-validate jobs are 20-95% through 14TB drives and a recreate would SIGHUP the remote badblocks processes. Deploy with `docker compose up -d` after the current batch finishes. --- app/burnin/stages.py | 83 ++++++++++++++++++-- app/config.py | 2 +- tests/test_badblocks_progress.py | 125 +++++++++++++++++++++++++++++++ 3 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 tests/test_badblocks_progress.py diff --git a/app/burnin/stages.py b/app/burnin/stages.py index beca66f..bfa8119 100644 --- a/app/burnin/stages.py +++ b/app/burnin/stages.py @@ -25,6 +25,72 @@ class _BadblocksResult(TypedDict): aborted: bool +# `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00), +# each with a write phase followed by a read-back/verify phase = 8 phases. +# Per-phase percent comes back via `XX% done`; without translation, the +# dashboard appears to "rewind" every ~2 hours when a new phase starts. +_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7} +_BB_TOTAL_PHASES = 8 + +import re as _re_pre # noqa: E402 + +_BB_PATTERN_RE = _re_pre.compile(r"Testing with pattern\s+(0x[0-9a-fA-F]+)") +_BB_VERIFY_RE = _re_pre.compile(r"Reading and comparing") +_BB_PERCENT_RE = _re_pre.compile(r"([\d.]+)%\s+done") + + +class _BadblocksProgress: + """Track which phase of `badblocks -w -p N` we're in so the + displayed percent maps to overall progress, not per-phase progress. + + Pure state machine — no I/O. Feed it lines from the badblocks output + via :meth:`update`; read :attr:`overall_pct` after each call. + + Behavior: + - Defaults to phase 1 (write 0xaa) before any header is seen. + - "Testing with pattern 0xXX" sets the phase to the write-phase index + for that pattern (1, 3, 5, or 7). + - "Reading and comparing" advances to the matching verify phase + (last_write_phase + 1). + - "XX% done" updates the in-phase percent. + - overall_pct = ((phase - 1) * 100 + phase_pct) / 8, clipped to 99 + so we don't claim "100%" until the stage's success path explicitly + writes 100. + """ + + __slots__ = ("phase", "phase_pct", "_last_write_phase") + + def __init__(self) -> None: + self.phase: int = 1 + self.phase_pct: float = 0.0 + self._last_write_phase: int = 1 + + def update(self, line: str) -> None: + m = _BB_PATTERN_RE.search(line) + if m: + p = m.group(1).lower() + if p in _BB_PATTERN_PHASE: + self.phase = _BB_PATTERN_PHASE[p] + self._last_write_phase = self.phase + self.phase_pct = 0.0 + return + if _BB_VERIFY_RE.search(line): + self.phase = self._last_write_phase + 1 + self.phase_pct = 0.0 + return + m = _BB_PERCENT_RE.search(line) + if m: + try: + self.phase_pct = float(m.group(1)) + except ValueError: + pass + + @property + def overall_pct(self) -> int: + total = (self.phase - 1) * 100.0 + self.phase_pct + return min(99, int(total / _BB_TOTAL_PHASES)) + + def _build_badblocks_cmd(devname: str) -> str: """Construct the wrapped badblocks command for a given device. @@ -425,9 +491,8 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) # cmd = _build_badblocks_cmd(devname) async with conn.create_process(cmd) as proc: - import re as _re - pid_seen = False + progress = _BadblocksProgress() async def _drain(stream, is_stderr: bool): nonlocal bad_blocks_total, pid_seen @@ -451,10 +516,16 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) output_lines.append(line) if is_stderr: - m = _re.search(r"([\d.]+)%\s+done", line) - if m: - pct = min(99, int(float(m.group(1)))) - await _update_stage_percent(job_id, "surface_validate", pct) + # Drive progress.update from EVERY stderr line so it + # picks up the "Testing with pattern 0xXX" + "Reading + # and comparing" headers (which advance the phase + # counter), not just the percent-done lines. + prev = progress.overall_pct + progress.update(line) + if progress.overall_pct != prev or _BB_PERCENT_RE.search(line): + await _update_stage_percent( + job_id, "surface_validate", progress.overall_pct, + ) await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total) await _recalculate_progress(job_id) _push_update() diff --git a/app/config.py b/app/config.py index 621dbee..eaa43c9 100644 --- a/app/config.py +++ b/app/config.py @@ -83,7 +83,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-41" + app_version: str = "1.0.0-42" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate diff --git a/tests/test_badblocks_progress.py b/tests/test_badblocks_progress.py new file mode 100644 index 0000000..79a2eae --- /dev/null +++ b/tests/test_badblocks_progress.py @@ -0,0 +1,125 @@ +"""Verifies _BadblocksProgress translates per-phase badblocks output +into a monotonic 0-99% overall progress. + +`badblocks -w` cycles through 4 patterns × {write, verify} = 8 phases. +Each phase prints "XX% done" relative to its own 0-100 range. Without +this translation the dashboard appeared to "rewind" every ~2 hours +when a new phase started — and two drives racing each other could +look 4× apart in displayed progress despite identical hardware. + +Run inside the container image so app deps are present. +""" + +from __future__ import annotations + +import unittest + +from app.burnin.stages import _BadblocksProgress + + +class TestBadblocksProgress(unittest.TestCase): + + def test_default_phase_one(self): + """Before any header, treat as start of pattern-1 write.""" + p = _BadblocksProgress() + self.assertEqual(p.phase, 1) + self.assertEqual(p.overall_pct, 0) + + def test_pattern_headers_set_phase(self): + """0xaa→1, 0x55→3, 0xff→5, 0x00→7 (write phases).""" + p = _BadblocksProgress() + for header, want in [ + ("Testing with pattern 0xaa: ", 1), + ("Testing with pattern 0x55: ", 3), + ("Testing with pattern 0xff: ", 5), + ("Testing with pattern 0x00: ", 7), + ]: + p.update(header) + self.assertEqual(p.phase, want, f"after {header!r}") + + def test_verify_advances_to_next_phase(self): + """`Reading and comparing` after `Testing with pattern 0x55` + (phase 3) advances to phase 4.""" + p = _BadblocksProgress() + p.update("Testing with pattern 0x55: 100.00% done") + self.assertEqual(p.phase, 3) + p.update("Reading and comparing: 0.00% done") + self.assertEqual(p.phase, 4) + + def test_overall_pct_at_phase_boundaries(self): + """Verify the math at each phase boundary: phase N at 100% = + N * 12.5% overall (clipped to 99 at the end).""" + cases = [ + (1, 0.0, 0), # start of run + (1, 100.0, 12), # 100/800 = 12.5 + (2, 100.0, 25), # 200/800 + (4, 100.0, 50), # 400/800 + (7, 100.0, 87), # 700/800 + (8, 100.0, 99), # 800/800 → clipped to 99 + ] + for phase, phase_pct, want in cases: + p = _BadblocksProgress() + p.phase = phase + p.phase_pct = phase_pct + self.assertEqual( + p.overall_pct, want, + f"phase={phase} phase_pct={phase_pct}", + ) + + def test_realistic_sequence(self): + """End-to-end: feed a synthetic badblocks output stream and + check the overall percent stays monotonically non-decreasing.""" + lines = [ + "Testing with pattern 0xaa: ", + "10.00% done, 1:00:00 elapsed. (0/0/0 errors)", + "50.00% done, 5:00:00 elapsed. (0/0/0 errors)", + "99.99% done, 10:00:00 elapsed. (0/0/0 errors)", + "Reading and comparing: ", + "0.00% done, 10:00:01 elapsed. (0/0/0 errors)", + "50.00% done, 12:30:00 elapsed. (0/0/0 errors)", + "Testing with pattern 0x55: ", + "0.00% done, 15:00:00 elapsed. (0/0/0 errors)", + "50.00% done, 17:30:00 elapsed. (0/0/0 errors)", + ] + p = _BadblocksProgress() + seen = [] + for line in lines: + p.update(line) + seen.append(p.overall_pct) + self.assertEqual( + seen, sorted(seen), + f"progress went backwards: {seen}", + ) + # Sanity: by the time we're halfway through pattern-2 write + # (phase 3, 50%), we should report ((3-1)*100 + 50) / 8 = 31%. + self.assertEqual(seen[-1], 31) + + def test_drives_at_different_phases_show_different_overall(self): + """The original bug: two drives at the same per-phase 60% + but different phases used to look identical (both '60%'). + Now they correctly diverge.""" + slow = _BadblocksProgress() + slow.update("Testing with pattern 0xaa: ") + slow.update("60.00% done") + + fast = _BadblocksProgress() + fast.update("Testing with pattern 0xaa: ") + fast.update("99.99% done") + fast.update("Reading and comparing: ") + fast.update("60.00% done") + + # slow: 60/800 = 7%; fast: (1*100 + 60)/800 = 20% + self.assertEqual(slow.overall_pct, 7) + self.assertEqual(fast.overall_pct, 20) + + def test_unknown_pattern_does_not_crash(self): + """An unrecognized pattern (e.g. badblocks future versions or + custom patterns) just leaves phase unchanged.""" + p = _BadblocksProgress() + p.update("Testing with pattern 0xab: ") + # phase stays at the default 1 + self.assertEqual(p.phase, 1) + + +if __name__ == "__main__": + unittest.main()