Stage 7: SSH architecture, SMART attribute monitoring, drive reset, and polish
SSH (app/ssh_client.py — new):
- asyncssh-based client: start_smart_test, poll_smart_progress, abort_smart_test,
get_smart_attributes, run_badblocks with streaming progress callbacks
- SMART attribute table: monitors attrs 5/10/188/197/198/199 for warn/fail thresholds
- Falls back to REST API / mock simulation when ssh_host is not configured
Burn-in stages updated (burnin.py):
- _stage_smart_test: SSH path polls smartctl -a, stores raw output + parsed attributes
- _stage_surface_validate: SSH path streams badblocks, counts bad blocks vs configurable threshold
- _stage_final_check: SSH path checks smartctl attributes; DB fallback for mock mode
- New DB helpers: _append_stage_log, _update_stage_bad_blocks, _store_smart_attrs,
_store_smart_raw_output
Database (database.py):
- Migrations: burnin_stages.log_text, burnin_stages.bad_blocks,
drives.smart_attrs (JSON), smart_tests.raw_output
Settings (config.py + settings_store.py):
- ssh_host, ssh_port, ssh_user, ssh_password, ssh_key — all runtime-editable
- SSH section in Settings UI with Test SSH Connection button
Webhook (notifier.py):
- Added bad_blocks and timestamp fields to payload per SPEC
Drive reset (routes.py + drives_table.html):
- POST /api/v1/drives/{id}/reset — clears SMART state, smart_attrs; audit logged
- Reset button visible on drives with completed test state (no active burn-in)
Log drawer (app.js):
- Burn-In tab: shows raw stage log_text (SSH output) with bad block highlighting
- SMART tab: shows SMART attribute table with warn/fail colouring + raw smartctl output
Polish:
- Version badge (v1.0.0-6d) in header via Jinja2 global
- Parallel burn-in warning when max_parallel_burnins > 8 in Settings
- Stats page: avg duration by drive size + failure breakdown by stage
- settings.html: SSH section with key textarea, parallel warn div
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4ab54d7ed8
commit
2dff58bd52
15 changed files with 1141 additions and 44 deletions
315
app/burnin.py
315
app/burnin.py
|
|
@ -303,6 +303,16 @@ async def _run_job(job_id: int) -> None:
|
||||||
)
|
)
|
||||||
job_row = await cur2.fetchone()
|
job_row = await cur2.fetchone()
|
||||||
if job_row:
|
if job_row:
|
||||||
|
# Get bad_blocks count from surface_validate stage if present
|
||||||
|
bad_blocks = 0
|
||||||
|
async with _db() as db3:
|
||||||
|
cur3 = await db3.execute(
|
||||||
|
"SELECT bad_blocks FROM burnin_stages WHERE burnin_job_id=? AND stage_name='surface_validate'",
|
||||||
|
(job_id,)
|
||||||
|
)
|
||||||
|
bb_row = await cur3.fetchone()
|
||||||
|
if bb_row and bb_row[0]:
|
||||||
|
bad_blocks = bb_row[0]
|
||||||
asyncio.create_task(notifier.notify_job_complete(
|
asyncio.create_task(notifier.notify_job_complete(
|
||||||
job_id=job_id,
|
job_id=job_id,
|
||||||
devname=devname,
|
devname=devname,
|
||||||
|
|
@ -312,6 +322,7 @@ async def _run_job(job_id: int) -> None:
|
||||||
profile=job_row["profile"],
|
profile=job_row["profile"],
|
||||||
operator=job_row["operator"],
|
operator=job_row["operator"],
|
||||||
error_text=error_text,
|
error_text=error_text,
|
||||||
|
bad_blocks=bad_blocks,
|
||||||
))
|
))
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.error("Failed to schedule notifications: %s", exc)
|
log.error("Failed to schedule notifications: %s", exc)
|
||||||
|
|
@ -352,15 +363,15 @@ async def _dispatch_stage(job_id: int, stage_name: str, devname: str, drive_id:
|
||||||
if stage_name == "precheck":
|
if stage_name == "precheck":
|
||||||
return await _stage_precheck(job_id, drive_id)
|
return await _stage_precheck(job_id, drive_id)
|
||||||
elif stage_name == "short_smart":
|
elif stage_name == "short_smart":
|
||||||
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart")
|
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart", drive_id)
|
||||||
elif stage_name == "long_smart":
|
elif stage_name == "long_smart":
|
||||||
return await _stage_smart_test(job_id, devname, "LONG", "long_smart")
|
return await _stage_smart_test(job_id, devname, "LONG", "long_smart", drive_id)
|
||||||
elif stage_name == "surface_validate":
|
elif stage_name == "surface_validate":
|
||||||
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
|
return await _stage_surface_validate(job_id, devname, drive_id)
|
||||||
elif stage_name == "io_validate":
|
elif stage_name == "io_validate":
|
||||||
return await _stage_timed_simulate(job_id, "io_validate", settings.io_validate_seconds)
|
return await _stage_timed_simulate(job_id, "io_validate", settings.io_validate_seconds)
|
||||||
elif stage_name == "final_check":
|
elif stage_name == "final_check":
|
||||||
return await _stage_final_check(job_id, devname)
|
return await _stage_final_check(job_id, devname, drive_id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -393,8 +404,17 @@ async def _stage_precheck(job_id: int, drive_id: int) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
|
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str,
|
||||||
"""Start a TrueNAS SMART test and poll until complete."""
|
drive_id: int | None = None) -> bool:
|
||||||
|
"""Start a SMART test. Uses SSH if configured, TrueNAS REST API otherwise."""
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured():
|
||||||
|
return await _stage_smart_test_ssh(job_id, devname, test_type, stage_name, drive_id)
|
||||||
|
return await _stage_smart_test_api(job_id, devname, test_type, stage_name)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_smart_test_api(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
|
||||||
|
"""TrueNAS REST API path for SMART test (mock / dev mode)."""
|
||||||
tn_job_id = await _client.start_smart_test([devname], test_type)
|
tn_job_id = await _client.start_smart_test([devname], test_type)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -428,8 +448,215 @@ async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_nam
|
||||||
await asyncio.sleep(POLL_INTERVAL)
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage_name: str,
|
||||||
|
drive_id: int | None) -> bool:
|
||||||
|
"""SSH path for SMART test — runs smartctl directly on TrueNAS."""
|
||||||
|
from app import ssh_client
|
||||||
|
|
||||||
|
# Start the test
|
||||||
|
try:
|
||||||
|
startup = await ssh_client.start_smart_test(devname, test_type)
|
||||||
|
await _append_stage_log(job_id, stage_name, startup + "\n")
|
||||||
|
except Exception as exc:
|
||||||
|
await _set_stage_error(job_id, stage_name, f"Failed to start SMART test via SSH: {exc}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Brief pause to let the test register in smartctl output
|
||||||
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
|
# Poll until complete
|
||||||
|
while True:
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
try:
|
||||||
|
await ssh_client.abort_smart_test(devname)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
try:
|
||||||
|
progress = await ssh_client.poll_smart_progress(devname)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("SSH SMART poll failed: %s", exc, extra={"job_id": job_id})
|
||||||
|
await _append_stage_log(job_id, stage_name, f"[poll error] {exc}\n")
|
||||||
|
continue
|
||||||
|
|
||||||
|
await _append_stage_log(job_id, stage_name, progress["output"] + "\n---\n")
|
||||||
|
|
||||||
|
if progress["state"] == "running":
|
||||||
|
pct = max(0, 100 - progress["percent_remaining"])
|
||||||
|
await _update_stage_percent(job_id, stage_name, pct)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
|
||||||
|
elif progress["state"] == "passed":
|
||||||
|
await _update_stage_percent(job_id, stage_name, 100)
|
||||||
|
# Run attribute check
|
||||||
|
if drive_id is not None:
|
||||||
|
try:
|
||||||
|
attrs = await ssh_client.get_smart_attributes(devname)
|
||||||
|
await _store_smart_attrs(drive_id, attrs)
|
||||||
|
await _store_smart_raw_output(drive_id, test_type, attrs["raw_output"])
|
||||||
|
if attrs["failures"]:
|
||||||
|
error = "SMART attribute failures: " + "; ".join(attrs["failures"])
|
||||||
|
await _set_stage_error(job_id, stage_name, error)
|
||||||
|
return False
|
||||||
|
if attrs["warnings"]:
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, stage_name,
|
||||||
|
"[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Failed to retrieve SMART attributes: %s", exc)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif progress["state"] == "failed":
|
||||||
|
await _set_stage_error(job_id, stage_name, f"SMART {test_type} test failed")
|
||||||
|
return False
|
||||||
|
# "unknown" → keep polling
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
|
||||||
|
"""
|
||||||
|
Surface validation stage.
|
||||||
|
SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}.
|
||||||
|
Mock mode: simulated timed progress (no real I/O).
|
||||||
|
"""
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured():
|
||||||
|
return await _stage_surface_validate_ssh(job_id, devname, drive_id)
|
||||||
|
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) -> bool:
|
||||||
|
"""Run badblocks over SSH, streaming output to stage log."""
|
||||||
|
from app import ssh_client
|
||||||
|
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, "surface_validate",
|
||||||
|
f"[START] badblocks -wsv -b 4096 -p 1 /dev/{devname}\n"
|
||||||
|
f"[NOTE] This is a DESTRUCTIVE write test. All data on /dev/{devname} will be overwritten.\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _is_cancelled_sync() -> bool:
|
||||||
|
# Synchronous version — we check the DB state flag set by cancel_job()
|
||||||
|
import asyncio
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
try:
|
||||||
|
return loop.run_until_complete(_is_cancelled(job_id))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
last_logged_pct = [-1]
|
||||||
|
|
||||||
|
def on_progress(pct: int, bad_blocks: int, line: str) -> None:
|
||||||
|
nonlocal last_logged_pct
|
||||||
|
# Write to log (fire-and-forget via asyncio.create_task from sync context)
|
||||||
|
# The log append is done in the async flush below
|
||||||
|
pass
|
||||||
|
|
||||||
|
accumulated_lines: list[str] = []
|
||||||
|
|
||||||
|
async def on_progress_async(pct: int, bad_blocks: int, line: str) -> None:
|
||||||
|
accumulated_lines.append(line)
|
||||||
|
# Flush to DB and update progress every ~25 lines to avoid excessive DB writes
|
||||||
|
if len(accumulated_lines) % 25 == 0:
|
||||||
|
await _append_stage_log(job_id, "surface_validate", "".join(accumulated_lines[-25:]))
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks)
|
||||||
|
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
raise asyncio.CancelledError
|
||||||
|
|
||||||
|
# Run badblocks — we adapt the callback pattern to async by collecting then flushing
|
||||||
|
result = {"bad_blocks": 0, "output": "", "aborted": False}
|
||||||
|
try:
|
||||||
|
# The actual streaming; we handle progress via the accumulated_lines pattern
|
||||||
|
bad_blocks_total = 0
|
||||||
|
output_lines: list[str] = []
|
||||||
|
|
||||||
|
async with await ssh_client._connect() as conn:
|
||||||
|
cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
|
||||||
|
async with conn.create_process(cmd) as proc:
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
async def _drain(stream, is_stderr: bool):
|
||||||
|
nonlocal bad_blocks_total
|
||||||
|
async for raw in stream:
|
||||||
|
line = raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
|
||||||
|
output_lines.append(line)
|
||||||
|
|
||||||
|
if is_stderr:
|
||||||
|
m = _re.search(r"([\d.]+)%\s+done", line)
|
||||||
|
if m:
|
||||||
|
pct = min(99, int(float(m.group(1))))
|
||||||
|
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
else:
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and stripped.isdigit():
|
||||||
|
bad_blocks_total += 1
|
||||||
|
|
||||||
|
# Append to DB log in chunks
|
||||||
|
if len(output_lines) % 20 == 0:
|
||||||
|
chunk = "".join(output_lines[-20:])
|
||||||
|
await _append_stage_log(job_id, "surface_validate", chunk)
|
||||||
|
|
||||||
|
# Abort on bad block threshold
|
||||||
|
if bad_blocks_total > settings.bad_block_threshold:
|
||||||
|
proc.kill()
|
||||||
|
output_lines.append(
|
||||||
|
f"\n[ABORTED] {bad_blocks_total} bad block(s) exceeded "
|
||||||
|
f"threshold ({settings.bad_block_threshold})\n"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
proc.kill()
|
||||||
|
return
|
||||||
|
|
||||||
|
await asyncio.gather(
|
||||||
|
_drain(proc.stdout, False),
|
||||||
|
_drain(proc.stderr, True),
|
||||||
|
return_exceptions=True,
|
||||||
|
)
|
||||||
|
await proc.wait()
|
||||||
|
|
||||||
|
# Flush remaining output
|
||||||
|
remainder = "".join(output_lines)
|
||||||
|
await _append_stage_log(job_id, "surface_validate", remainder)
|
||||||
|
result["bad_blocks"] = bad_blocks_total
|
||||||
|
result["output"] = remainder
|
||||||
|
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
return False
|
||||||
|
except Exception as exc:
|
||||||
|
await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
|
||||||
|
await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", result["bad_blocks"])
|
||||||
|
|
||||||
|
if result["aborted"] or result["bad_blocks"] > settings.bad_block_threshold:
|
||||||
|
await _set_stage_error(
|
||||||
|
job_id, "surface_validate",
|
||||||
|
f"Surface validate FAILED: {result['bad_blocks']} bad block(s) found "
|
||||||
|
f"(threshold: {settings.bad_block_threshold})"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
||||||
"""Simulate a timed stage (surface validation / IO validation) with progress updates."""
|
"""Simulate a timed stage with progress updates (mock / dev mode)."""
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -449,9 +676,28 @@ async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds:
|
||||||
await asyncio.sleep(POLL_INTERVAL)
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
async def _stage_final_check(job_id: int, devname: str) -> bool:
|
async def _stage_final_check(job_id: int, devname: str, drive_id: int | None = None) -> bool:
|
||||||
"""Verify drive passed all tests by checking current SMART health in DB."""
|
"""
|
||||||
|
Verify drive passed all tests.
|
||||||
|
SSH mode: run smartctl -a and check critical attributes.
|
||||||
|
Mock mode: check SMART health field in DB.
|
||||||
|
"""
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured() and drive_id is not None:
|
||||||
|
try:
|
||||||
|
attrs = await ssh_client.get_smart_attributes(devname)
|
||||||
|
await _store_smart_attrs(drive_id, attrs)
|
||||||
|
if attrs["health"] == "FAILED" or attrs["failures"]:
|
||||||
|
failures = attrs["failures"] or [f"SMART health: {attrs['health']}"]
|
||||||
|
await _set_stage_error(job_id, "final_check",
|
||||||
|
"Final check failed: " + "; ".join(failures))
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("SSH final_check failed, falling back to DB check: %s", exc)
|
||||||
|
|
||||||
|
# DB check (mock mode fallback)
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT smart_health FROM drives WHERE devname=?", (devname,)
|
"SELECT smart_health FROM drives WHERE devname=?", (devname,)
|
||||||
|
|
@ -549,6 +795,57 @@ async def _cancel_stage(job_id: int, stage_name: str) -> None:
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _append_stage_log(job_id: int, stage_name: str, text: str) -> None:
|
||||||
|
"""Append text to the log_text column of a burnin_stages row."""
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"""UPDATE burnin_stages
|
||||||
|
SET log_text = COALESCE(log_text, '') || ?
|
||||||
|
WHERE burnin_job_id=? AND stage_name=?""",
|
||||||
|
(text, job_id, stage_name),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _update_stage_bad_blocks(job_id: int, stage_name: str, count: int) -> None:
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"UPDATE burnin_stages SET bad_blocks=? WHERE burnin_job_id=? AND stage_name=?",
|
||||||
|
(count, job_id, stage_name),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_smart_attrs(drive_id: int, attrs: dict) -> None:
|
||||||
|
"""Persist latest SMART attribute dict to drives.smart_attrs (JSON)."""
|
||||||
|
import json
|
||||||
|
# Convert int keys to str for JSON serialisation
|
||||||
|
serialisable = {str(k): v for k, v in attrs.get("attributes", {}).items()}
|
||||||
|
blob = json.dumps({
|
||||||
|
"health": attrs.get("health", "UNKNOWN"),
|
||||||
|
"attrs": serialisable,
|
||||||
|
"warnings": attrs.get("warnings", []),
|
||||||
|
"failures": attrs.get("failures", []),
|
||||||
|
})
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("UPDATE drives SET smart_attrs=? WHERE id=?", (blob, drive_id))
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_smart_raw_output(drive_id: int, test_type: str, raw: str) -> None:
|
||||||
|
"""Store raw smartctl output in smart_tests.raw_output."""
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"UPDATE smart_tests SET raw_output=? WHERE drive_id=? AND test_type=?",
|
||||||
|
(raw, drive_id, test_type.lower()),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
async def _set_stage_error(job_id: int, stage_name: str, error_text: str) -> None:
|
async def _set_stage_error(job_id: int, stage_name: str, error_text: str) -> None:
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
|
|
||||||
|
|
@ -56,9 +56,17 @@ class Settings(BaseSettings):
|
||||||
temp_crit_c: int = 55 # red critical (precheck refuses to start above this)
|
temp_crit_c: int = 55 # red critical (precheck refuses to start above this)
|
||||||
|
|
||||||
# Bad-block tolerance — surface_validate fails if bad blocks exceed this
|
# Bad-block tolerance — surface_validate fails if bad blocks exceed this
|
||||||
# (applies to real badblocks in Stage 7; ignored by mock simulation)
|
|
||||||
bad_block_threshold: int = 0
|
bad_block_threshold: int = 0
|
||||||
|
|
||||||
|
# SSH credentials for direct TrueNAS command execution (Stage 7)
|
||||||
|
# When ssh_host is set, burn-in stages use SSH for smartctl/badblocks instead of REST API.
|
||||||
|
# Leave ssh_host empty to use the mock/REST API (development mode).
|
||||||
|
ssh_host: str = ""
|
||||||
|
ssh_port: int = 22
|
||||||
|
ssh_user: str = "root" # TrueNAS CORE default is root
|
||||||
|
ssh_password: str = "" # Password auth (leave blank if using key)
|
||||||
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-6d"
|
app_version: str = "1.0.0-6d"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,11 @@ CREATE INDEX IF NOT EXISTS idx_audit_events_job ON audit_events(burnin_job_id)
|
||||||
_MIGRATIONS = [
|
_MIGRATIONS = [
|
||||||
"ALTER TABLE drives ADD COLUMN notes TEXT",
|
"ALTER TABLE drives ADD COLUMN notes TEXT",
|
||||||
"ALTER TABLE drives ADD COLUMN location TEXT",
|
"ALTER TABLE drives ADD COLUMN location TEXT",
|
||||||
|
# Stage 7: SSH command output + SMART attribute storage
|
||||||
|
"ALTER TABLE burnin_stages ADD COLUMN log_text TEXT",
|
||||||
|
"ALTER TABLE burnin_stages ADD COLUMN bad_blocks INTEGER DEFAULT 0",
|
||||||
|
"ALTER TABLE drives ADD COLUMN smart_attrs TEXT",
|
||||||
|
"ALTER TABLE smart_tests ADD COLUMN raw_output TEXT",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,10 @@ async def notify_job_complete(
|
||||||
profile: str,
|
profile: str,
|
||||||
operator: str,
|
operator: str,
|
||||||
error_text: str | None,
|
error_text: str | None,
|
||||||
|
bad_blocks: int = 0,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Fire all configured notifications for a completed burn-in job."""
|
"""Fire all configured notifications for a completed burn-in job."""
|
||||||
|
from datetime import datetime, timezone
|
||||||
tasks = []
|
tasks = []
|
||||||
|
|
||||||
if settings.webhook_url:
|
if settings.webhook_url:
|
||||||
|
|
@ -38,6 +40,8 @@ async def notify_job_complete(
|
||||||
"profile": profile,
|
"profile": profile,
|
||||||
"operator": operator,
|
"operator": operator,
|
||||||
"error_text": error_text,
|
"error_text": error_text,
|
||||||
|
"bad_blocks": bad_blocks,
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
}))
|
}))
|
||||||
|
|
||||||
if settings.smtp_host:
|
if settings.smtp_host:
|
||||||
|
|
|
||||||
|
|
@ -126,7 +126,7 @@ def _format_elapsed(iso: str | None) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
# Register
|
# Register filters
|
||||||
templates.env.filters["format_bytes"] = _format_bytes
|
templates.env.filters["format_bytes"] = _format_bytes
|
||||||
templates.env.filters["format_eta"] = _format_eta
|
templates.env.filters["format_eta"] = _format_eta
|
||||||
templates.env.filters["temp_class"] = _temp_class
|
templates.env.filters["temp_class"] = _temp_class
|
||||||
|
|
@ -135,3 +135,7 @@ templates.env.filters["format_dt_full"] = _format_dt_full
|
||||||
templates.env.filters["format_duration"] = _format_duration
|
templates.env.filters["format_duration"] = _format_duration
|
||||||
templates.env.filters["format_elapsed"] = _format_elapsed
|
templates.env.filters["format_elapsed"] = _format_elapsed
|
||||||
templates.env.globals["drive_status"] = _drive_status
|
templates.env.globals["drive_status"] = _drive_status
|
||||||
|
|
||||||
|
|
||||||
|
from app.config import settings as _settings
|
||||||
|
templates.env.globals["app_version"] = _settings.app_version
|
||||||
|
|
|
||||||
135
app/routes.py
135
app/routes.py
|
|
@ -258,7 +258,7 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
raise HTTPException(status_code=404, detail="Drive not found")
|
raise HTTPException(status_code=404, detail="Drive not found")
|
||||||
drive = _row_to_drive(row)
|
drive = _row_to_drive(row)
|
||||||
|
|
||||||
# Latest burn-in job + its stages
|
# Latest burn-in job + its stages (include log_text and bad_blocks)
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
|
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
|
||||||
(drive_id,),
|
(drive_id,),
|
||||||
|
|
@ -268,12 +268,33 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
if job_row:
|
if job_row:
|
||||||
job = dict(job_row)
|
job = dict(job_row)
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT * FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
"SELECT id, stage_name, state, percent, started_at, finished_at, "
|
||||||
|
"duration_seconds, error_text, log_text, bad_blocks "
|
||||||
|
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
||||||
(job_row["id"],),
|
(job_row["id"],),
|
||||||
)
|
)
|
||||||
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
||||||
burnin = job
|
burnin = job
|
||||||
|
|
||||||
|
# SMART raw output from smart_tests table
|
||||||
|
cur = await db.execute(
|
||||||
|
"SELECT test_type, state, percent, started_at, finished_at, error_text, raw_output "
|
||||||
|
"FROM smart_tests WHERE drive_id=?",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
smart_rows = {r["test_type"]: dict(r) for r in await cur.fetchall()}
|
||||||
|
|
||||||
|
# Cached SMART attributes (JSON blob on drives table)
|
||||||
|
import json as _json
|
||||||
|
smart_attrs = None
|
||||||
|
cur = await db.execute("SELECT smart_attrs FROM drives WHERE id=?", (drive_id,))
|
||||||
|
attrs_row = await cur.fetchone()
|
||||||
|
if attrs_row and attrs_row["smart_attrs"]:
|
||||||
|
try:
|
||||||
|
smart_attrs = _json.loads(attrs_row["smart_attrs"])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Last 50 audit events for this drive (newest first)
|
# Last 50 audit events for this drive (newest first)
|
||||||
cur = await db.execute("""
|
cur = await db.execute("""
|
||||||
SELECT id, event_type, operator, message, created_at
|
SELECT id, event_type, operator, message, created_at
|
||||||
|
|
@ -284,6 +305,13 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
""", (drive_id,))
|
""", (drive_id,))
|
||||||
events = [dict(r) for r in await cur.fetchall()]
|
events = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
def _smart_card(test_type: str) -> dict:
|
||||||
|
smart_obj = drive.smart_short if test_type == "short" else drive.smart_long
|
||||||
|
base = smart_obj.model_dump() if smart_obj else {}
|
||||||
|
row = smart_rows.get(test_type, {})
|
||||||
|
base["raw_output"] = row.get("raw_output")
|
||||||
|
return base
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"drive": {
|
"drive": {
|
||||||
"id": drive.id,
|
"id": drive.id,
|
||||||
|
|
@ -294,8 +322,9 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
},
|
},
|
||||||
"burnin": burnin,
|
"burnin": burnin,
|
||||||
"smart": {
|
"smart": {
|
||||||
"short": drive.smart_short.model_dump() if drive.smart_short else None,
|
"short": _smart_card("short"),
|
||||||
"long": drive.smart_long.model_dump() if drive.smart_long else None,
|
"long": _smart_card("long"),
|
||||||
|
"attrs": smart_attrs,
|
||||||
},
|
},
|
||||||
"events": events,
|
"events": events,
|
||||||
}
|
}
|
||||||
|
|
@ -672,6 +701,53 @@ async def update_drive(
|
||||||
return {"updated": True}
|
return {"updated": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/api/v1/drives/{drive_id}/reset")
|
||||||
|
async def reset_drive(
|
||||||
|
drive_id: int,
|
||||||
|
body: dict,
|
||||||
|
db: aiosqlite.Connection = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Clear SMART test results for a drive so it shows as fresh.
|
||||||
|
Only allowed when no burn-in job is active (queued or running).
|
||||||
|
Preserves all job history — just resets the display state.
|
||||||
|
"""
|
||||||
|
cur = await db.execute("SELECT id FROM drives WHERE id=?", (drive_id,))
|
||||||
|
if not await cur.fetchone():
|
||||||
|
raise HTTPException(status_code=404, detail="Drive not found")
|
||||||
|
|
||||||
|
# Reject if any active burn-in
|
||||||
|
cur = await db.execute(
|
||||||
|
"SELECT COUNT(*) FROM burnin_jobs WHERE drive_id=? AND state IN ('queued','running')",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
if (await cur.fetchone())[0] > 0:
|
||||||
|
raise HTTPException(status_code=409, detail="Cannot reset while a burn-in is active")
|
||||||
|
|
||||||
|
operator = body.get("operator", "operator")
|
||||||
|
|
||||||
|
# Reset SMART test state to idle
|
||||||
|
await db.execute(
|
||||||
|
"""UPDATE smart_tests SET state='idle', percent=0, started_at=NULL,
|
||||||
|
eta_at=NULL, finished_at=NULL, error_text=NULL, raw_output=NULL
|
||||||
|
WHERE drive_id=?""",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
# Clear cached SMART attributes
|
||||||
|
await db.execute("UPDATE drives SET smart_attrs=NULL WHERE id=?", (drive_id,))
|
||||||
|
|
||||||
|
# Audit event
|
||||||
|
await db.execute(
|
||||||
|
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
|
||||||
|
VALUES (?,?,?,?)""",
|
||||||
|
("drive_reset", drive_id, operator, "Drive reset — SMART state cleared"),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
poller._notify_subscribers()
|
||||||
|
return {"reset": True}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Audit log page
|
# Audit log page
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -766,6 +842,36 @@ async def stats_page(
|
||||||
""")
|
""")
|
||||||
by_day = [dict(r) for r in await cur.fetchall()]
|
by_day = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
# Average test duration by drive size (rounded to nearest TB)
|
||||||
|
cur = await db.execute("""
|
||||||
|
SELECT
|
||||||
|
CAST(ROUND(CAST(d.size_bytes AS REAL) / 1e12) AS INTEGER) AS size_tb,
|
||||||
|
COUNT(*) AS total,
|
||||||
|
ROUND(AVG(
|
||||||
|
(julianday(bj.finished_at) - julianday(bj.started_at)) * 86400 / 3600.0
|
||||||
|
), 1) AS avg_hours
|
||||||
|
FROM burnin_jobs bj
|
||||||
|
JOIN drives d ON d.id = bj.drive_id
|
||||||
|
WHERE bj.state IN ('passed', 'failed')
|
||||||
|
AND bj.started_at IS NOT NULL
|
||||||
|
AND bj.finished_at IS NOT NULL
|
||||||
|
GROUP BY size_tb
|
||||||
|
ORDER BY size_tb
|
||||||
|
""")
|
||||||
|
by_size = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
# Failure breakdown by stage (which stage caused the failure)
|
||||||
|
cur = await db.execute("""
|
||||||
|
SELECT
|
||||||
|
COALESCE(bj.stage_name, 'unknown') AS failed_stage,
|
||||||
|
COUNT(*) AS count
|
||||||
|
FROM burnin_jobs bj
|
||||||
|
WHERE bj.state = 'failed'
|
||||||
|
GROUP BY failed_stage
|
||||||
|
ORDER BY count DESC
|
||||||
|
""")
|
||||||
|
by_failure_stage = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
# Drives tracked
|
# Drives tracked
|
||||||
cur = await db.execute("SELECT COUNT(*) FROM drives")
|
cur = await db.execute("SELECT COUNT(*) FROM drives")
|
||||||
drives_total = (await cur.fetchone())[0]
|
drives_total = (await cur.fetchone())[0]
|
||||||
|
|
@ -776,6 +882,8 @@ async def stats_page(
|
||||||
"overall": overall,
|
"overall": overall,
|
||||||
"by_model": by_model,
|
"by_model": by_model,
|
||||||
"by_day": by_day,
|
"by_day": by_day,
|
||||||
|
"by_size": by_size,
|
||||||
|
"by_failure_stage": by_failure_stage,
|
||||||
"drives_total": drives_total,
|
"drives_total": drives_total,
|
||||||
"poller": ps,
|
"poller": ps,
|
||||||
**_stale_context(ps),
|
**_stale_context(ps),
|
||||||
|
|
@ -813,6 +921,11 @@ async def settings_page(
|
||||||
"temp_warn_c": settings.temp_warn_c,
|
"temp_warn_c": settings.temp_warn_c,
|
||||||
"temp_crit_c": settings.temp_crit_c,
|
"temp_crit_c": settings.temp_crit_c,
|
||||||
"bad_block_threshold": settings.bad_block_threshold,
|
"bad_block_threshold": settings.bad_block_threshold,
|
||||||
|
# SSH credentials (take effect immediately — each SSH call reads live settings)
|
||||||
|
"ssh_host": settings.ssh_host,
|
||||||
|
"ssh_port": settings.ssh_port,
|
||||||
|
"ssh_user": settings.ssh_user,
|
||||||
|
# Note: ssh_password and ssh_key intentionally omitted from display (sensitive)
|
||||||
# System settings (restart required to fully apply)
|
# System settings (restart required to fully apply)
|
||||||
"truenas_base_url": settings.truenas_base_url,
|
"truenas_base_url": settings.truenas_base_url,
|
||||||
"truenas_verify_tls": settings.truenas_verify_tls,
|
"truenas_verify_tls": settings.truenas_verify_tls,
|
||||||
|
|
@ -823,11 +936,13 @@ async def settings_page(
|
||||||
# Note: truenas_api_key intentionally omitted from display (sensitive)
|
# Note: truenas_api_key intentionally omitted from display (sensitive)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
from app import ssh_client as _ssh
|
||||||
ps = poller.get_state()
|
ps = poller.get_state()
|
||||||
return templates.TemplateResponse("settings.html", {
|
return templates.TemplateResponse("settings.html", {
|
||||||
"request": request,
|
"request": request,
|
||||||
"editable": editable,
|
"editable": editable,
|
||||||
"smtp_enabled": bool(settings.smtp_host),
|
"smtp_enabled": bool(settings.smtp_host),
|
||||||
|
"ssh_configured": _ssh.is_configured(),
|
||||||
"app_version": settings.app_version,
|
"app_version": settings.app_version,
|
||||||
"poller": ps,
|
"poller": ps,
|
||||||
**_stale_context(ps),
|
**_stale_context(ps),
|
||||||
|
|
@ -838,7 +953,7 @@ async def settings_page(
|
||||||
async def save_settings(body: dict):
|
async def save_settings(body: dict):
|
||||||
"""Save editable runtime settings. Secrets are only updated if non-empty."""
|
"""Save editable runtime settings. Secrets are only updated if non-empty."""
|
||||||
# Don't overwrite secrets if client sent empty string
|
# Don't overwrite secrets if client sent empty string
|
||||||
for secret_field in ("smtp_password", "truenas_api_key"):
|
for secret_field in ("smtp_password", "truenas_api_key", "ssh_password", "ssh_key"):
|
||||||
if secret_field in body and body[secret_field] == "":
|
if secret_field in body and body[secret_field] == "":
|
||||||
del body[secret_field]
|
del body[secret_field]
|
||||||
|
|
||||||
|
|
@ -859,6 +974,16 @@ async def test_smtp():
|
||||||
return {"ok": True}
|
return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/api/v1/settings/test-ssh")
|
||||||
|
async def test_ssh():
|
||||||
|
"""Test the current SSH configuration."""
|
||||||
|
from app import ssh_client
|
||||||
|
result = await ssh_client.test_connection()
|
||||||
|
if not result["ok"]:
|
||||||
|
raise HTTPException(status_code=502, detail=result.get("error", "Connection failed"))
|
||||||
|
return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
@router.get("/api/v1/updates/check")
|
@router.get("/api/v1/updates/check")
|
||||||
async def check_updates():
|
async def check_updates():
|
||||||
"""Check for a newer release on Forgejo."""
|
"""Check for a newer release on Forgejo."""
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,12 @@ _EDITABLE: dict[str, type] = {
|
||||||
"temp_warn_c": int,
|
"temp_warn_c": int,
|
||||||
"temp_crit_c": int,
|
"temp_crit_c": int,
|
||||||
"bad_block_threshold": int,
|
"bad_block_threshold": int,
|
||||||
|
# SSH credentials — take effect immediately (each connection reads live settings)
|
||||||
|
"ssh_host": str,
|
||||||
|
"ssh_port": int,
|
||||||
|
"ssh_user": str,
|
||||||
|
"ssh_password": str,
|
||||||
|
"ssh_key": str,
|
||||||
# System settings — saved to JSON; require container restart to fully apply
|
# System settings — saved to JSON; require container restart to fully apply
|
||||||
"truenas_base_url": str,
|
"truenas_base_url": str,
|
||||||
"truenas_api_key": str,
|
"truenas_api_key": str,
|
||||||
|
|
@ -90,6 +96,9 @@ def _apply(data: dict) -> None:
|
||||||
if key == "bad_block_threshold" and int(val) < 0:
|
if key == "bad_block_threshold" and int(val) < 0:
|
||||||
log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring")
|
log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring")
|
||||||
continue
|
continue
|
||||||
|
if key == "ssh_port" and not (1 <= int(val) <= 65535):
|
||||||
|
log.warning("settings_store: ssh_port out of range — ignoring")
|
||||||
|
continue
|
||||||
setattr(settings, key, val)
|
setattr(settings, key, val)
|
||||||
except (ValueError, TypeError) as exc:
|
except (ValueError, TypeError) as exc:
|
||||||
log.warning("settings_store: invalid value for %s: %s", key, exc)
|
log.warning("settings_store: invalid value for %s: %s", key, exc)
|
||||||
|
|
|
||||||
303
app/ssh_client.py
Normal file
303
app/ssh_client.py
Normal file
|
|
@ -0,0 +1,303 @@
|
||||||
|
"""
|
||||||
|
SSH client for direct TrueNAS command execution (Stage 7).
|
||||||
|
|
||||||
|
When ssh_host is configured, burn-in stages use SSH to run smartctl and
|
||||||
|
badblocks directly on the TrueNAS host instead of going through the REST API.
|
||||||
|
Falls back to REST API / simulation when SSH is not configured (dev/mock mode).
|
||||||
|
|
||||||
|
TrueNAS CORE (FreeBSD) device paths: /dev/ada0, /dev/da0, etc.
|
||||||
|
TrueNAS SCALE (Linux) device paths: /dev/sda, /dev/sdb, etc.
|
||||||
|
The devname from the TrueNAS API is used as-is in /dev/{devname}.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Monitored SMART attributes
|
||||||
|
# True → any non-zero raw value is a hard failure (drive rejected)
|
||||||
|
# False → non-zero is a warning (flagged but test continues)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
SMART_ATTRS: dict[int, tuple[str, bool]] = {
|
||||||
|
5: ("Reallocated_Sector_Ct", True), # reallocation = FAIL
|
||||||
|
10: ("Spin_Retry_Count", False), # mechanical stress = WARN
|
||||||
|
188: ("Command_Timeout", False), # drive not responding = WARN
|
||||||
|
197: ("Current_Pending_Sector", True), # pending reallocation = FAIL
|
||||||
|
198: ("Offline_Uncorrectable", True), # unrecoverable read error = FAIL
|
||||||
|
199: ("UDMA_CRC_Error_Count", False), # cable/controller issue = WARN
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration check
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def is_configured() -> bool:
|
||||||
|
"""Returns True when SSH credentials are present and usable."""
|
||||||
|
from app.config import settings
|
||||||
|
return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Low-level connection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _connect():
|
||||||
|
"""Open a single-use SSH connection. Caller must use `async with`."""
|
||||||
|
import asyncssh
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
kwargs: dict = {
|
||||||
|
"host": settings.ssh_host,
|
||||||
|
"port": settings.ssh_port,
|
||||||
|
"username": settings.ssh_user,
|
||||||
|
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
|
||||||
|
}
|
||||||
|
if settings.ssh_key:
|
||||||
|
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
|
||||||
|
if settings.ssh_password:
|
||||||
|
kwargs["password"] = settings.ssh_password
|
||||||
|
|
||||||
|
return asyncssh.connect(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def test_connection() -> dict:
|
||||||
|
"""Test SSH connectivity. Returns {"ok": True} or {"ok": False, "error": str}."""
|
||||||
|
if not is_configured():
|
||||||
|
return {"ok": False, "error": "SSH not configured (ssh_host is empty)"}
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run("echo ok", check=False)
|
||||||
|
if "ok" in result.stdout:
|
||||||
|
return {"ok": True}
|
||||||
|
return {"ok": False, "error": result.stderr.strip() or "unexpected output"}
|
||||||
|
except Exception as exc:
|
||||||
|
return {"ok": False, "error": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
async def get_smart_attributes(devname: str) -> dict:
|
||||||
|
"""
|
||||||
|
Run `smartctl -a /dev/{devname}` and parse the output.
|
||||||
|
Returns:
|
||||||
|
health: str — "PASSED" | "FAILED" | "UNKNOWN"
|
||||||
|
raw_output: str — full smartctl output
|
||||||
|
attributes: dict[int, {"name": str, "raw": int}]
|
||||||
|
warnings: list[str] — attribute names with non-zero raw (non-critical)
|
||||||
|
failures: list[str] — attribute names with non-zero raw (critical)
|
||||||
|
"""
|
||||||
|
cmd = f"smartctl -a /dev/{devname}"
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
return _parse_smartctl(output)
|
||||||
|
except Exception as exc:
|
||||||
|
return {
|
||||||
|
"health": "UNKNOWN",
|
||||||
|
"raw_output": str(exc),
|
||||||
|
"attributes": {},
|
||||||
|
"warnings": [],
|
||||||
|
"failures": [f"SSH error: {exc}"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def start_smart_test(devname: str, test_type: str) -> str:
|
||||||
|
"""
|
||||||
|
Run `smartctl -t short|long /dev/{devname}`.
|
||||||
|
Returns raw output. Raises RuntimeError on unrecoverable failure.
|
||||||
|
test_type: "SHORT" or "LONG"
|
||||||
|
"""
|
||||||
|
arg = "short" if test_type.upper() == "SHORT" else "long"
|
||||||
|
cmd = f"smartctl -t {arg} /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
# smartctl exits 0 or 4 when the test is successfully started on most drives
|
||||||
|
started = ("Testing has begun" in output or
|
||||||
|
"test has begun" in output.lower() or
|
||||||
|
result.returncode in (0, 4))
|
||||||
|
if not started:
|
||||||
|
raise RuntimeError(f"smartctl returned exit {result.returncode}: {output[:400]}")
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def poll_smart_progress(devname: str) -> dict:
|
||||||
|
"""
|
||||||
|
Run `smartctl -a /dev/{devname}` and extract self-test status.
|
||||||
|
Returns:
|
||||||
|
state: "running" | "passed" | "failed" | "unknown"
|
||||||
|
percent_remaining: int (0 = complete when state != "running")
|
||||||
|
output: str
|
||||||
|
"""
|
||||||
|
cmd = f"smartctl -a /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
return _parse_smart_progress(output)
|
||||||
|
|
||||||
|
|
||||||
|
async def abort_smart_test(devname: str) -> None:
|
||||||
|
"""Send `smartctl -X /dev/{devname}` to abort an in-progress test."""
|
||||||
|
cmd = f"smartctl -X /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
await conn.run(cmd, check=False)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_badblocks(
|
||||||
|
devname: str,
|
||||||
|
on_progress: Callable[[int, int, str], None],
|
||||||
|
cancelled_fn: Callable[[], bool] | None = None,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Run `badblocks -wsv -b 4096 -p 1 /dev/{devname}` and stream output.
|
||||||
|
|
||||||
|
on_progress(percent, bad_blocks, line) is called for each line of output.
|
||||||
|
cancelled_fn() is polled to support mid-test cancellation.
|
||||||
|
|
||||||
|
Returns: {"bad_blocks": int, "output": str, "aborted": bool}
|
||||||
|
"""
|
||||||
|
from app.config import settings
|
||||||
|
cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
|
||||||
|
lines: list[str] = []
|
||||||
|
bad_blocks = 0
|
||||||
|
aborted = False
|
||||||
|
last_pct = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
async with conn.create_process(cmd) as proc:
|
||||||
|
# badblocks writes progress to stderr, bad block numbers to stdout
|
||||||
|
async def _read_stream(stream, is_stderr: bool):
|
||||||
|
nonlocal bad_blocks, last_pct, aborted
|
||||||
|
async for raw_line in stream:
|
||||||
|
line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8", errors="replace")
|
||||||
|
lines.append(line)
|
||||||
|
|
||||||
|
if is_stderr:
|
||||||
|
m = re.search(r"([\d.]+)%\s+done", line)
|
||||||
|
if m:
|
||||||
|
last_pct = min(99, int(float(m.group(1))))
|
||||||
|
else:
|
||||||
|
# Each non-empty stdout line during badblocks is a bad block number
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and stripped.isdigit():
|
||||||
|
bad_blocks += 1
|
||||||
|
|
||||||
|
on_progress(last_pct, bad_blocks, line)
|
||||||
|
|
||||||
|
# Abort if threshold exceeded
|
||||||
|
if bad_blocks > settings.bad_block_threshold:
|
||||||
|
aborted = True
|
||||||
|
proc.kill()
|
||||||
|
lines.append(
|
||||||
|
f"\n[ABORTED] Bad block count ({bad_blocks}) exceeded "
|
||||||
|
f"threshold ({settings.bad_block_threshold})\n"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Abort on cancellation
|
||||||
|
if cancelled_fn and cancelled_fn():
|
||||||
|
aborted = True
|
||||||
|
proc.kill()
|
||||||
|
return
|
||||||
|
|
||||||
|
stdout_task = asyncio.create_task(_read_stream(proc.stdout, False))
|
||||||
|
stderr_task = asyncio.create_task(_read_stream(proc.stderr, True))
|
||||||
|
await asyncio.gather(stdout_task, stderr_task, return_exceptions=True)
|
||||||
|
await proc.wait()
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
lines.append(f"\n[SSH error] {exc}\n")
|
||||||
|
|
||||||
|
if not aborted:
|
||||||
|
last_pct = 100
|
||||||
|
|
||||||
|
return {
|
||||||
|
"bad_blocks": bad_blocks,
|
||||||
|
"output": "".join(lines),
|
||||||
|
"aborted": aborted,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parsers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _parse_smartctl(output: str) -> dict:
|
||||||
|
health = "UNKNOWN"
|
||||||
|
attributes: dict[int, dict] = {}
|
||||||
|
warnings: list[str] = []
|
||||||
|
failures: list[str] = []
|
||||||
|
|
||||||
|
m = re.search(r"self-assessment test result:\s+(\w+)", output, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
health = m.group(1).upper()
|
||||||
|
|
||||||
|
# Attribute table: ID# NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||||
|
for line in output.splitlines():
|
||||||
|
am = re.match(
|
||||||
|
r"\s*(\d+)\s+(\S+)\s+\S+\s+\d+\s+\d+\s+\d+\s+\S+\s+\S+\s+\S+\s+(\d+)",
|
||||||
|
line,
|
||||||
|
)
|
||||||
|
if not am:
|
||||||
|
continue
|
||||||
|
attr_id = int(am.group(1))
|
||||||
|
attr_name = am.group(2)
|
||||||
|
raw_val = int(am.group(3))
|
||||||
|
attributes[attr_id] = {"name": attr_name, "raw": raw_val}
|
||||||
|
|
||||||
|
if attr_id in SMART_ATTRS:
|
||||||
|
_, is_critical = SMART_ATTRS[attr_id]
|
||||||
|
if raw_val > 0:
|
||||||
|
msg = f"{attr_name} = {raw_val}"
|
||||||
|
if is_critical:
|
||||||
|
failures.append(msg)
|
||||||
|
else:
|
||||||
|
warnings.append(msg)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"health": health,
|
||||||
|
"raw_output": output,
|
||||||
|
"attributes": attributes,
|
||||||
|
"warnings": warnings,
|
||||||
|
"failures": failures,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_smart_progress(output: str) -> dict:
|
||||||
|
state = "unknown"
|
||||||
|
percent_remaining = 0
|
||||||
|
|
||||||
|
lower = output.lower()
|
||||||
|
|
||||||
|
if "self-test routine in progress" in lower or "self-test routine in progress" in output:
|
||||||
|
state = "running"
|
||||||
|
m = re.search(r"(\d+)%\s+of\s+test\s+remaining", output, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
percent_remaining = int(m.group(1))
|
||||||
|
elif "completed without error" in lower:
|
||||||
|
state = "passed"
|
||||||
|
elif (
|
||||||
|
"completed: read failure" in lower
|
||||||
|
or "completed: write failure" in lower
|
||||||
|
or "aborted by host" in lower
|
||||||
|
or ("completed" in lower and "failure" in lower)
|
||||||
|
):
|
||||||
|
state = "failed"
|
||||||
|
elif "in progress" in lower:
|
||||||
|
state = "running"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"state": state,
|
||||||
|
"percent_remaining": percent_remaining,
|
||||||
|
"output": output,
|
||||||
|
}
|
||||||
|
|
@ -2283,3 +2283,125 @@ tr.drawer-row-active {
|
||||||
.drawer-smart-grid { grid-template-columns: 1fr; }
|
.drawer-smart-grid { grid-template-columns: 1fr; }
|
||||||
.drawer-drive-meta { display: none; }
|
.drawer-drive-meta { display: none; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Stage raw log output (SSH mode)
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.stage-log {
|
||||||
|
font-family: "SF Mono", "Consolas", "Monaco", monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: var(--text-muted);
|
||||||
|
background: var(--bg);
|
||||||
|
border-left: 2px solid var(--border);
|
||||||
|
margin: 6px 0 2px 28px;
|
||||||
|
padding: 6px 10px;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-all;
|
||||||
|
max-height: 200px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.stage-log .log-bad-block {
|
||||||
|
color: var(--red);
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.stage-log .log-warn {
|
||||||
|
color: var(--yellow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
SMART attributes table in drawer
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.smart-attrs {
|
||||||
|
margin-top: 12px;
|
||||||
|
border-top: 1px solid var(--border);
|
||||||
|
padding-top: 10px;
|
||||||
|
}
|
||||||
|
.smart-attrs-title {
|
||||||
|
font-size: 11px;
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--text-muted);
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: .05em;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
.smart-attr-row {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
padding: 3px 0;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid color-mix(in srgb, var(--border) 50%, transparent);
|
||||||
|
}
|
||||||
|
.smart-attr-row:last-child { border-bottom: none; }
|
||||||
|
.smart-attr-name { color: var(--text-muted); }
|
||||||
|
.smart-attr-val { font-family: "SF Mono", monospace; font-size: 12px; }
|
||||||
|
.smart-attr-val.attr-ok { color: var(--green); }
|
||||||
|
.smart-attr-val.attr-warn { color: var(--yellow); font-weight: 600; }
|
||||||
|
.smart-attr-val.attr-fail { color: var(--red); font-weight: 600; }
|
||||||
|
.smart-attr-raw-output {
|
||||||
|
font-family: "SF Mono", "Consolas", monospace;
|
||||||
|
font-size: 10.5px;
|
||||||
|
line-height: 1.45;
|
||||||
|
color: var(--text-muted);
|
||||||
|
background: var(--bg);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 4px;
|
||||||
|
padding: 8px 10px;
|
||||||
|
margin-top: 10px;
|
||||||
|
white-space: pre;
|
||||||
|
overflow: auto;
|
||||||
|
max-height: 240px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Reset button
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.btn-reset {
|
||||||
|
background: transparent;
|
||||||
|
border: 1px solid color-mix(in srgb, var(--text-muted) 40%, transparent);
|
||||||
|
color: var(--text-muted);
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 3px 8px;
|
||||||
|
font-size: 12px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: border-color .15s, color .15s;
|
||||||
|
}
|
||||||
|
.btn-reset:hover {
|
||||||
|
border-color: var(--yellow);
|
||||||
|
color: var(--yellow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Parallel burn-in inline warning
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.sf-inline-warn {
|
||||||
|
background: color-mix(in srgb, var(--yellow) 12%, transparent);
|
||||||
|
border: 1px solid color-mix(in srgb, var(--yellow) 40%, transparent);
|
||||||
|
border-radius: 5px;
|
||||||
|
color: var(--yellow);
|
||||||
|
font-size: 12px;
|
||||||
|
padding: 7px 10px;
|
||||||
|
margin: 4px 0 8px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
SSH textarea
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.sf-textarea {
|
||||||
|
resize: vertical;
|
||||||
|
min-height: 90px;
|
||||||
|
font-family: "SF Mono", "Consolas", monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Version badge in header
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.header-version {
|
||||||
|
font-size: 11px;
|
||||||
|
color: var(--text-muted);
|
||||||
|
opacity: .6;
|
||||||
|
padding: 0 2px;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -957,8 +957,18 @@
|
||||||
if (s.error_text) {
|
if (s.error_text) {
|
||||||
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
||||||
}
|
}
|
||||||
|
// Raw SSH log output (if available)
|
||||||
|
if (s.log_text) {
|
||||||
|
var logHtml = _esc(s.log_text)
|
||||||
|
.replace(/^(\d+)\s*$/gm, '<span class="log-bad-block">$1 ← BAD BLOCK</span>')
|
||||||
|
.replace(/\[WARNING\][^\n]*/g, '<span class="log-warn">$&</span>');
|
||||||
|
html += '<pre class="stage-log">' + logHtml + '</pre>';
|
||||||
|
}
|
||||||
|
// Bad block count badge
|
||||||
|
if (s.bad_blocks && s.bad_blocks > 0) {
|
||||||
|
html += '<div class="stage-error-line">' + s.bad_blocks + ' bad block(s) found</div>';
|
||||||
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
html += '<div class="drawer-empty">No stage data yet.</div>';
|
html += '<div class="drawer-empty">No stage data yet.</div>';
|
||||||
}
|
}
|
||||||
|
|
@ -973,6 +983,10 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Monitored SMART attributes for inline colouring
|
||||||
|
var _SMART_CRITICAL = {5: true, 197: true, 198: true};
|
||||||
|
var _SMART_WARN = {10: true, 188: true, 199: true};
|
||||||
|
|
||||||
function _drawerRenderSmart(smart) {
|
function _drawerRenderSmart(smart) {
|
||||||
var panel = document.getElementById('drawer-panel-smart');
|
var panel = document.getElementById('drawer-panel-smart');
|
||||||
if (!panel) return;
|
if (!panel) return;
|
||||||
|
|
@ -994,10 +1008,41 @@
|
||||||
if (t.started_at) html += '<div class="smart-detail">Started: ' + _drawerFmtDt(t.started_at) + '</div>';
|
if (t.started_at) html += '<div class="smart-detail">Started: ' + _drawerFmtDt(t.started_at) + '</div>';
|
||||||
if (t.finished_at) html += '<div class="smart-detail">Finished: ' + _drawerFmtDt(t.finished_at) + '</div>';
|
if (t.finished_at) html += '<div class="smart-detail">Finished: ' + _drawerFmtDt(t.finished_at) + '</div>';
|
||||||
if (t.error_text) html += '<div class="stage-error-line">' + _esc(t.error_text) + '</div>';
|
if (t.error_text) html += '<div class="stage-error-line">' + _esc(t.error_text) + '</div>';
|
||||||
|
// Raw smartctl output (SSH mode)
|
||||||
|
if (t.raw_output) {
|
||||||
|
html += '<pre class="smart-attr-raw-output">' + _esc(t.raw_output) + '</pre>';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
});
|
});
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
|
|
||||||
|
// SMART attribute table (from SSH attribute parse)
|
||||||
|
var attrs = smart && smart.attrs;
|
||||||
|
if (attrs) {
|
||||||
|
html += '<div class="smart-attrs">';
|
||||||
|
html += '<div class="smart-attrs-title">SMART Attributes</div>';
|
||||||
|
if (attrs.failures && attrs.failures.length) {
|
||||||
|
html += '<div class="stage-error-line" style="margin-bottom:6px">✕ Failures: ' + _esc(attrs.failures.join('; ')) + '</div>';
|
||||||
|
}
|
||||||
|
if (attrs.warnings && attrs.warnings.length) {
|
||||||
|
html += '<div class="stage-error-line" style="color:var(--yellow);margin-bottom:6px">⚠ Warnings: ' + _esc(attrs.warnings.join('; ')) + '</div>';
|
||||||
|
}
|
||||||
|
var attrMap = attrs.attrs || {};
|
||||||
|
var monitoredIds = [5, 10, 188, 197, 198, 199];
|
||||||
|
monitoredIds.forEach(function (id) {
|
||||||
|
var entry = attrMap[String(id)];
|
||||||
|
if (!entry) return;
|
||||||
|
var raw = entry.raw;
|
||||||
|
var cls = raw > 0 ? (_SMART_CRITICAL[id] ? 'attr-fail' : 'attr-warn') : 'attr-ok';
|
||||||
|
html += '<div class="smart-attr-row">';
|
||||||
|
html += '<span class="smart-attr-name">' + id + ' ' + _esc(entry.name) + '</span>';
|
||||||
|
html += '<span class="smart-attr-val ' + cls + '">' + raw + '</span>';
|
||||||
|
html += '</div>';
|
||||||
|
});
|
||||||
|
html += '</div>';
|
||||||
|
}
|
||||||
|
|
||||||
panel.innerHTML = html;
|
panel.innerHTML = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1078,4 +1123,21 @@
|
||||||
if (e.target.closest('#drawer-close-btn')) closeDrawer();
|
if (e.target.closest('#drawer-close-btn')) closeDrawer();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Reset button — clears SMART state for a drive
|
||||||
|
document.addEventListener('click', function (e) {
|
||||||
|
var btn = e.target.closest('.btn-reset');
|
||||||
|
if (!btn) return;
|
||||||
|
var driveId = btn.dataset.driveId;
|
||||||
|
if (!driveId) return;
|
||||||
|
var operator = (window._operator || 'operator');
|
||||||
|
fetch('/api/v1/drives/' + driveId + '/reset', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ operator: operator }),
|
||||||
|
}).then(function (r) {
|
||||||
|
if (!r.ok) return r.json().then(function (d) { showToast(d.detail || 'Reset failed', 'error'); });
|
||||||
|
showToast('Drive reset — state cleared', 'success');
|
||||||
|
}).catch(function () { showToast('Network error', 'error'); });
|
||||||
|
});
|
||||||
|
|
||||||
}());
|
}());
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,10 @@
|
||||||
{%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %}
|
{%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %}
|
||||||
{%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %}
|
{%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %}
|
||||||
{%- set selectable = not bi_active and not short_busy and not long_busy %}
|
{%- set selectable = not bi_active and not short_busy and not long_busy %}
|
||||||
|
{%- set bi_done = drive.burnin and drive.burnin.state in ('passed', 'failed', 'cancelled', 'unknown') %}
|
||||||
|
{%- set smart_done = (drive.smart_short and drive.smart_short.state in ('passed','failed','aborted'))
|
||||||
|
or (drive.smart_long and drive.smart_long.state in ('passed','failed','aborted')) %}
|
||||||
|
{%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy %}
|
||||||
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
|
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
|
||||||
<td class="col-check">
|
<td class="col-check">
|
||||||
{%- if selectable %}
|
{%- if selectable %}
|
||||||
|
|
@ -160,6 +164,12 @@
|
||||||
data-health="{{ drive.smart_health }}"
|
data-health="{{ drive.smart_health }}"
|
||||||
{% if short_busy or long_busy %}disabled{% endif %}
|
{% if short_busy or long_busy %}disabled{% endif %}
|
||||||
title="Start Burn-In">Burn-In</button>
|
title="Start Burn-In">Burn-In</button>
|
||||||
|
<!-- Reset — clears SMART state so drive can be re-tested from scratch -->
|
||||||
|
{%- if can_reset %}
|
||||||
|
<button class="btn-action btn-reset"
|
||||||
|
data-drive-id="{{ drive.id }}"
|
||||||
|
title="Reset SMART state — clears test results so drive shows as fresh">Reset</button>
|
||||||
|
{%- endif %}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,7 @@
|
||||||
<a class="header-link" href="/audit">Audit</a>
|
<a class="header-link" href="/audit">Audit</a>
|
||||||
<a class="header-link" href="/settings">Settings</a>
|
<a class="header-link" href="/settings">Settings</a>
|
||||||
<a class="header-link" href="/docs" target="_blank" rel="noopener">API</a>
|
<a class="header-link" href="/docs" target="_blank" rel="noopener">API</a>
|
||||||
|
<span class="header-version">v{{ app_version if app_version is defined else '—' }}</span>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,57 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- SSH -->
|
||||||
|
<div class="settings-card">
|
||||||
|
<div class="settings-card-header">
|
||||||
|
<span class="settings-card-title">SSH (TrueNAS Direct)</span>
|
||||||
|
{% if ssh_configured %}
|
||||||
|
<span class="chip chip-passed" style="font-size:10px">Configured</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="chip chip-unknown" style="font-size:10px">Not configured — using REST API / mock</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<p class="sf-hint" style="margin-bottom:8px">
|
||||||
|
When configured, burn-in stages run smartctl and badblocks directly on TrueNAS over SSH,
|
||||||
|
enabling SMART attribute monitoring and real bad-block detection. Leave Host empty to use
|
||||||
|
the TrueNAS REST API (mock / dev mode).
|
||||||
|
</p>
|
||||||
|
<div class="sf-fields">
|
||||||
|
|
||||||
|
<div class="sf-full sf-row-test" style="margin-bottom:4px">
|
||||||
|
<button type="button" id="test-ssh-btn" class="btn-secondary">Test SSH Connection</button>
|
||||||
|
<span id="ssh-test-result" class="settings-test-result" style="display:none"></span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<label for="ssh_host">Host / IP</label>
|
||||||
|
<input class="sf-input" id="ssh_host" name="ssh_host" type="text"
|
||||||
|
value="{{ editable.ssh_host }}" placeholder="10.0.0.x (same as TrueNAS IP)">
|
||||||
|
|
||||||
|
<label for="ssh_port">Port</label>
|
||||||
|
<input class="sf-input sf-input-xs" id="ssh_port" name="ssh_port"
|
||||||
|
type="number" min="1" max="65535" value="{{ editable.ssh_port }}" style="width:70px">
|
||||||
|
|
||||||
|
<label for="ssh_user">Username</label>
|
||||||
|
<input class="sf-input" id="ssh_user" name="ssh_user" type="text"
|
||||||
|
value="{{ editable.ssh_user }}" placeholder="root">
|
||||||
|
|
||||||
|
<label for="ssh_password">Password</label>
|
||||||
|
<input class="sf-input" id="ssh_password" name="ssh_password" type="password"
|
||||||
|
placeholder="leave blank to keep existing" autocomplete="new-password">
|
||||||
|
|
||||||
|
<label for="ssh_key">Private Key</label>
|
||||||
|
<div>
|
||||||
|
<textarea class="sf-input sf-textarea" id="ssh_key" name="ssh_key"
|
||||||
|
rows="6" placeholder="Paste PEM private key here (-----BEGIN ... KEY-----). Leave blank to keep existing." autocomplete="off"></textarea>
|
||||||
|
<span class="sf-hint" style="margin-top:3px">
|
||||||
|
Either password or key auth. Key takes precedence if both are set.
|
||||||
|
Key is stored in <code>/data/settings_overrides.json</code> — restrict filesystem access to that file.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
</div><!-- /left col -->
|
</div><!-- /left col -->
|
||||||
|
|
||||||
<!-- RIGHT column: Notifications + Behavior -->
|
<!-- RIGHT column: Notifications + Behavior -->
|
||||||
|
|
@ -159,9 +210,14 @@
|
||||||
<div class="sf-row">
|
<div class="sf-row">
|
||||||
<label class="sf-label" for="max_parallel_burnins">Max Parallel Burn-Ins</label>
|
<label class="sf-label" for="max_parallel_burnins">Max Parallel Burn-Ins</label>
|
||||||
<input class="sf-input sf-input-xs" id="max_parallel_burnins" name="max_parallel_burnins"
|
<input class="sf-input sf-input-xs" id="max_parallel_burnins" name="max_parallel_burnins"
|
||||||
type="number" min="1" max="16" value="{{ editable.max_parallel_burnins }}">
|
type="number" min="1" max="60" value="{{ editable.max_parallel_burnins }}">
|
||||||
<span class="sf-hint">How many jobs can run at the same time</span>
|
<span class="sf-hint">How many jobs can run at the same time</span>
|
||||||
</div>
|
</div>
|
||||||
|
<div id="parallel-warn" class="sf-inline-warn"
|
||||||
|
{% if editable.max_parallel_burnins <= 8 %}style="display:none"{% endif %}>
|
||||||
|
⚠ Running many simultaneous surface scans may saturate your storage controller
|
||||||
|
and produce unreliable results. Recommended: 2–4.
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="sf-row">
|
<div class="sf-row">
|
||||||
<label class="sf-label" for="stuck_job_hours">Stuck Job Threshold (hours)</label>
|
<label class="sf-label" for="stuck_job_hours">Stuck Job Threshold (hours)</label>
|
||||||
|
|
@ -348,6 +404,36 @@
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Parallel burn-in warning — reveal the inline notice whenever the
// configured value exceeds the recommended ceiling of 8 jobs.
var parallelField = document.getElementById('max_parallel_burnins');
var parallelNotice = document.getElementById('parallel-warn');
if (parallelField && parallelNotice) {
  parallelField.addEventListener('input', function () {
    var jobs = parseInt(parallelField.value, 10);
    // NaN (empty field) compares false against 8, so the warning stays hidden.
    parallelNotice.style.display = jobs > 8 ? '' : 'none';
  });
}
|
||||||
|
|
||||||
|
// Test SSH — POSTs to the test endpoint and shows the outcome inline.
var sshBtn = document.getElementById('test-ssh-btn');
var sshResult = document.getElementById('ssh-test-result');
if (sshBtn) {
  sshBtn.addEventListener('click', async function () {
    sshBtn.disabled = true;
    sshBtn.textContent = 'Testing…';
    sshResult.style.display = 'none';
    try {
      var resp = await fetch('/api/v1/settings/test-ssh', { method: 'POST' });
      // Parse the body defensively: a proxy or framework error page may not
      // be JSON, and resp.json() rejecting would otherwise fall into the
      // catch below and mislabel a server-side failure as a network error.
      var data = {};
      try {
        data = await resp.json();
      } catch (parseErr) {
        // Non-JSON body — fall through with an empty payload.
      }
      showResult(sshResult, resp.ok, resp.ok ? 'Connection OK' : (data.detail || 'Failed'));
    } catch (e) {
      showResult(sshResult, false, 'Network error');
    } finally {
      sshBtn.disabled = false;
      sshBtn.textContent = 'Test SSH Connection';
    }
  });
}
|
||||||
|
|
||||||
// Check for Updates
|
// Check for Updates
|
||||||
var updBtn = document.getElementById('check-updates-btn');
|
var updBtn = document.getElementById('check-updates-btn');
|
||||||
var updResult = document.getElementById('update-result');
|
var updResult = document.getElementById('update-result');
|
||||||
|
|
|
||||||
|
|
@ -119,5 +119,65 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="stats-grid" style="margin-top:24px">
|
||||||
|
|
||||||
|
<!-- Average duration by drive size -->
|
||||||
|
<div class="stats-section">
|
||||||
|
<h2 class="section-title">Avg. Test Duration by Drive Size</h2>
|
||||||
|
{% if by_size %}
|
||||||
|
<div class="table-wrap" style="max-height:none">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Size</th>
|
||||||
|
<th style="text-align:right">Jobs</th>
|
||||||
|
<th style="text-align:right">Avg Duration</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for s in by_size %}
|
||||||
|
<tr>
|
||||||
|
<td style="font-weight:500;color:var(--text-strong)">{{ s.size_tb }} TB</td>
|
||||||
|
<td class="mono text-muted" style="text-align:right">{{ s.total }}</td>
|
||||||
|
<td class="mono" style="text-align:right;color:var(--text-strong)">{{ s.avg_hours }}h</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No completed jobs yet.</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Failure breakdown by stage -->
|
||||||
|
<div class="stats-section">
|
||||||
|
<h2 class="section-title">Failures by Stage</h2>
|
||||||
|
{% if by_failure_stage %}
|
||||||
|
<div class="table-wrap" style="max-height:none">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Stage</th>
|
||||||
|
<th style="text-align:right">Count</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for f in by_failure_stage %}
|
||||||
|
<tr>
|
||||||
|
<td style="font-weight:500;color:var(--red)">{{ f.failed_stage | replace('_',' ') | title }}</td>
|
||||||
|
<td class="mono" style="text-align:right;color:var(--red)">{{ f.count }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No failures recorded.</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,4 @@ httpx
|
||||||
pydantic-settings
|
pydantic-settings
|
||||||
jinja2
|
jinja2
|
||||||
sse-starlette
|
sse-starlette
|
||||||
|
asyncssh
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue