From 4922b19a9fd2d1bffc0723d51110af1c12fe21ca Mon Sep 17 00:00:00 2001
From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com>
Date: Fri, 8 May 2026 13:23:05 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20stuck=5Fjob=5Fhours=20default=2024=20?=
 =?UTF-8?q?=E2=86=92=20168=20(7=20days)=20(1.0.0-43)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A user with 4× 14 TB WD HDDs running -w surface_validate had all
4 jobs marked 'unknown' at exactly 24h+1min. The stuck-job detector
fired on legitimate work: badblocks at a block size of 8192 needs
~5+ days on a 14 TB drive to complete all 4 patterns × 2 phases.

168h covers a full -w pass on 14 TB+ HDDs with margin. Anyone
running small, fast SSDs who wants faster detection can lower the
value in Settings → Burn-in.

README warning replaced — no longer instructs users to bump the
threshold before starting big-drive burn-ins, since the default
now handles that case.

Settings UI already accepts up to 168 via the input's max=168
attribute, so no template change needed.
---
 README.md     | 14 ++++++++------
 app/config.py |  7 +++++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5335f17..4c0da7b 100644
--- a/README.md
+++ b/README.md
@@ -83,11 +83,12 @@ runtime roughly in half at ~2× RAM cost — matches the upstream
 
 ### Watch out
 
-- **Stuck-job timeout** — `stuck_job_hours` (default 24) marks any job
-  past that threshold as `unknown` and kills the remote process. If
-  you're burning in 14 TB drives with default block size, raise this to
-  **48** in Settings before starting, or you'll get false positives near
-  the end of surface_validate.
+- **Stuck-job timeout** — `stuck_job_hours` (default 168 = 7 days)
+  marks any job past that threshold as `unknown` and kills the remote
+  process. The default covers `-w` surface_validate on 14 TB+ HDDs with
+  margin. If you're running small, fast SSDs and want faster detection
+  of genuinely stuck jobs, lower it. (Earlier versions defaulted to 24h,
+  which caused false positives on multi-TB drives.)
 - **Thermal gate** — if drives currently under burn-in hit the
   temperature warning threshold, new jobs wait up to 3 minutes before
   acquiring a slot. Increase `temp_warn_c` if your chassis runs hot but
@@ -144,7 +145,8 @@ All settings live under `/settings` (header link). Key knobs:
 - **`surface_validate_block_size` / `_block_buffer` / `_passes`** —
   badblocks `-b` / `-c` / `-p`. Defaults preserve original behaviour;
   tune for speed vs paranoia.
-- **`stuck_job_hours`** (default 24) — raise for big drives.
+- **`stuck_job_hours`** (default 168 = 7 days) — covers 14 TB+ HDDs;
+  lower it for faster detection on small, fast drives.
 - **`temp_warn_c` / `temp_crit_c`** — thermal gating thresholds.
 - **`bad_block_threshold`** (default 0) — number of bad blocks
   surface_validate tolerates before failing the stage.
diff --git a/app/config.py b/app/config.py
index eaa43c9..ef97840 100644
--- a/app/config.py
+++ b/app/config.py
@@ -49,7 +49,10 @@ class Settings(BaseSettings):
     webhook_url: str = ""
 
     # Stuck-job detection: jobs running longer than this are marked 'unknown'
-    stuck_job_hours: int = 24
+    # and the remote badblocks/smartctl is killed. 168h (7 days) covers a
+    # full -w surface_validate on a 14 TB+ HDD with margin. The older default
+    # of 24h caused false positives on multi-TB drives almost every time.
+    stuck_job_hours: int = 168
 
     # Temperature thresholds (°C) — drives table colouring + precheck gate
     temp_warn_c: int = 46   # orange warning
@@ -83,7 +86,7 @@ class Settings(BaseSettings):
     ssh_key: str = ""             # PEM private key content (paste full key including headers)
 
     # Application version — used by the /api/v1/updates/check endpoint
-    app_version: str = "1.0.0-42"
+    app_version: str = "1.0.0-43"
 
     # ---- Authentication (1.0.0-22) ----
     # session_secret: HMAC key for signing session cookies. Empty = generate