Stage 7: SSH architecture, SMART attribute monitoring, drive reset, and polish

SSH (app/ssh_client.py — new):
- asyncssh-based client: start_smart_test, poll_smart_progress, abort_smart_test,
  get_smart_attributes, run_badblocks with streaming progress callbacks
- SMART attribute table: monitors attrs 5/10/188/197/198/199 for warn/fail thresholds
- Falls back to REST API / mock simulation when ssh_host is not configured

Burn-in stages updated (burnin.py):
- _stage_smart_test: SSH path polls smartctl -a, stores raw output + parsed attributes
- _stage_surface_validate: SSH path streams badblocks, counts bad blocks vs configurable threshold
- _stage_final_check: SSH path checks smartctl attributes; DB fallback for mock mode
- New DB helpers: _append_stage_log, _update_stage_bad_blocks, _store_smart_attrs,
  _store_smart_raw_output

Database (database.py):
- Migrations: burnin_stages.log_text, burnin_stages.bad_blocks,
  drives.smart_attrs (JSON), smart_tests.raw_output

Settings (config.py + settings_store.py):
- ssh_host, ssh_port, ssh_user, ssh_password, ssh_key — all runtime-editable
- SSH section in Settings UI with Test SSH Connection button

Webhook (notifier.py):
- Added bad_blocks and timestamp fields to payload per SPEC

Drive reset (routes.py + drives_table.html):
- POST /api/v1/drives/{id}/reset — clears SMART state, smart_attrs; audit logged
- Reset button visible on drives with completed test state (no active burn-in)

Log drawer (app.js):
- Burn-In tab: shows raw stage log_text (SSH output) with bad block highlighting
- SMART tab: shows SMART attribute table with warn/fail colouring + raw smartctl output

Polish:
- Version badge (v1.0.0-6d) in header via Jinja2 global
- Parallel burn-in warning when max_parallel_burnins > 8 in Settings
- Stats page: avg duration by drive size + failure breakdown by stage
- settings.html: SSH section with key textarea, parallel warn div

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Brandon Walter 2026-02-24 08:09:30 -05:00
parent 4ab54d7ed8
commit 2dff58bd52
15 changed files with 1141 additions and 44 deletions

View file

@ -303,6 +303,16 @@ async def _run_job(job_id: int) -> None:
)
job_row = await cur2.fetchone()
if job_row:
# Get bad_blocks count from surface_validate stage if present
bad_blocks = 0
async with _db() as db3:
cur3 = await db3.execute(
"SELECT bad_blocks FROM burnin_stages WHERE burnin_job_id=? AND stage_name='surface_validate'",
(job_id,)
)
bb_row = await cur3.fetchone()
if bb_row and bb_row[0]:
bad_blocks = bb_row[0]
asyncio.create_task(notifier.notify_job_complete(
job_id=job_id,
devname=devname,
@ -312,6 +322,7 @@ async def _run_job(job_id: int) -> None:
profile=job_row["profile"],
operator=job_row["operator"],
error_text=error_text,
bad_blocks=bad_blocks,
))
except Exception as exc:
log.error("Failed to schedule notifications: %s", exc)
@ -352,15 +363,15 @@ async def _dispatch_stage(job_id: int, stage_name: str, devname: str, drive_id:
if stage_name == "precheck":
return await _stage_precheck(job_id, drive_id)
elif stage_name == "short_smart":
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart")
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart", drive_id)
elif stage_name == "long_smart":
return await _stage_smart_test(job_id, devname, "LONG", "long_smart")
return await _stage_smart_test(job_id, devname, "LONG", "long_smart", drive_id)
elif stage_name == "surface_validate":
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
return await _stage_surface_validate(job_id, devname, drive_id)
elif stage_name == "io_validate":
return await _stage_timed_simulate(job_id, "io_validate", settings.io_validate_seconds)
elif stage_name == "final_check":
return await _stage_final_check(job_id, devname)
return await _stage_final_check(job_id, devname, drive_id)
return True
@ -393,8 +404,17 @@ async def _stage_precheck(job_id: int, drive_id: int) -> bool:
return True
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
"""Start a TrueNAS SMART test and poll until complete."""
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str,
drive_id: int | None = None) -> bool:
"""Start a SMART test. Uses SSH if configured, TrueNAS REST API otherwise."""
from app import ssh_client
if ssh_client.is_configured():
return await _stage_smart_test_ssh(job_id, devname, test_type, stage_name, drive_id)
return await _stage_smart_test_api(job_id, devname, test_type, stage_name)
async def _stage_smart_test_api(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
"""TrueNAS REST API path for SMART test (mock / dev mode)."""
tn_job_id = await _client.start_smart_test([devname], test_type)
while True:
@ -428,8 +448,215 @@ async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_nam
await asyncio.sleep(POLL_INTERVAL)
async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage_name: str,
                                drive_id: int | None) -> bool:
    """SSH path for SMART test — runs smartctl directly on TrueNAS.

    Starts the self-test, then polls `smartctl -a` every POLL_INTERVAL seconds
    until the drive reports passed/failed, honouring job cancellation between
    polls. On pass, fetches and stores the SMART attribute table (when
    drive_id is known) and fails the stage if any critical attribute fired.
    Returns True when the stage passed, False otherwise.
    """
    from app import ssh_client
    # Start the test
    try:
        startup = await ssh_client.start_smart_test(devname, test_type)
        await _append_stage_log(job_id, stage_name, startup + "\n")
    except Exception as exc:
        await _set_stage_error(job_id, stage_name, f"Failed to start SMART test via SSH: {exc}")
        return False
    # Brief pause to let the test register in smartctl output
    await asyncio.sleep(3)
    # Poll until complete
    while True:
        if await _is_cancelled(job_id):
            # Best-effort abort on the drive; the cancellation itself already happened.
            try:
                await ssh_client.abort_smart_test(devname)
            except Exception:
                pass
            return False
        await asyncio.sleep(POLL_INTERVAL)
        try:
            progress = await ssh_client.poll_smart_progress(devname)
        except Exception as exc:
            # Transient SSH failure — log it and keep polling rather than failing the stage.
            log.warning("SSH SMART poll failed: %s", exc, extra={"job_id": job_id})
            await _append_stage_log(job_id, stage_name, f"[poll error] {exc}\n")
            continue
        await _append_stage_log(job_id, stage_name, progress["output"] + "\n---\n")
        if progress["state"] == "running":
            # smartctl reports percent *remaining*; convert to percent complete.
            pct = max(0, 100 - progress["percent_remaining"])
            await _update_stage_percent(job_id, stage_name, pct)
            await _recalculate_progress(job_id)
            _push_update()
        elif progress["state"] == "passed":
            await _update_stage_percent(job_id, stage_name, 100)
            # Run attribute check
            if drive_id is not None:
                try:
                    attrs = await ssh_client.get_smart_attributes(devname)
                    await _store_smart_attrs(drive_id, attrs)
                    await _store_smart_raw_output(drive_id, test_type, attrs["raw_output"])
                    if attrs["failures"]:
                        # A critical attribute (e.g. reallocated sectors) is non-zero: fail the stage.
                        error = "SMART attribute failures: " + "; ".join(attrs["failures"])
                        await _set_stage_error(job_id, stage_name, error)
                        return False
                    if attrs["warnings"]:
                        # Non-critical attributes are logged but do not fail the stage.
                        await _append_stage_log(
                            job_id, stage_name,
                            "[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
                        )
                except Exception as exc:
                    # Attribute retrieval is best-effort; the self-test itself still passed.
                    log.warning("Failed to retrieve SMART attributes: %s", exc)
            await _recalculate_progress(job_id)
            _push_update()
            return True
        elif progress["state"] == "failed":
            await _set_stage_error(job_id, stage_name, f"SMART {test_type} test failed")
            return False
        # "unknown" → keep polling
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
    """
    Surface validation stage dispatcher.

    SSH mode: destructive badblocks pass on the real device.
    Mock mode: simulated timed progress (no real I/O).
    """
    from app import ssh_client
    if not ssh_client.is_configured():
        # Development / mock mode: just tick a timer.
        return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
    return await _stage_surface_validate_ssh(job_id, devname, drive_id)
async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) -> bool:
    """Run badblocks over SSH, streaming output to the stage log.

    Executes `badblocks -wsv -b 4096 -p 1 /dev/{devname}` (a DESTRUCTIVE write
    test), parses percent-done lines from stderr and bad-block numbers from
    stdout, and fails the stage when the bad-block count exceeds
    settings.bad_block_threshold. Job cancellation is checked per output line
    so a cancel takes effect mid-run, not only at end of stream.

    Fixes over the previous revision:
      * removed dead helpers (_is_cancelled_sync called run_until_complete
        inside a running event loop and would always raise; on_progress /
        on_progress_async / accumulated_lines were never invoked)
      * the stage log no longer receives the output twice (previously chunks
        were flushed every 20 lines AND the full buffer was appended again at
        the end) — only the un-flushed tail is written on completion
      * cancellation is checked inside the read loop instead of after EOF
    """
    from app import ssh_client
    await _append_stage_log(
        job_id, "surface_validate",
        f"[START] badblocks -wsv -b 4096 -p 1 /dev/{devname}\n"
        f"[NOTE] This is a DESTRUCTIVE write test. All data on /dev/{devname} will be overwritten.\n\n"
    )
    bad_blocks_total = 0
    output_lines: list[str] = []
    flushed = 0              # number of output_lines already written to the stage log
    threshold_exceeded = False
    cancelled = False
    try:
        async with await ssh_client._connect() as conn:
            cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
            async with conn.create_process(cmd) as proc:
                import re as _re

                async def _drain(stream, is_stderr: bool) -> None:
                    """Consume one output stream, updating progress and bad-block state."""
                    nonlocal bad_blocks_total, flushed, threshold_exceeded, cancelled
                    async for raw in stream:
                        line = raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
                        output_lines.append(line)
                        if is_stderr:
                            # badblocks reports progress like "12.34% done" on stderr
                            m = _re.search(r"([\d.]+)%\s+done", line)
                            if m:
                                pct = min(99, int(float(m.group(1))))
                                await _update_stage_percent(job_id, "surface_validate", pct)
                                await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
                                await _recalculate_progress(job_id)
                                _push_update()
                        else:
                            # Each non-empty stdout line during badblocks is a bad block number
                            stripped = line.strip()
                            if stripped and stripped.isdigit():
                                bad_blocks_total += 1
                        # Append to DB log in chunks to avoid excessive writes
                        if len(output_lines) - flushed >= 20:
                            chunk = "".join(output_lines[flushed:])
                            flushed = len(output_lines)
                            await _append_stage_log(job_id, "surface_validate", chunk)
                        # Abort on bad block threshold
                        if bad_blocks_total > settings.bad_block_threshold:
                            threshold_exceeded = True
                            proc.kill()
                            output_lines.append(
                                f"\n[ABORTED] {bad_blocks_total} bad block(s) exceeded "
                                f"threshold ({settings.bad_block_threshold})\n"
                            )
                            return
                        # Abort promptly on job cancellation (checked per line, not after EOF)
                        if await _is_cancelled(job_id):
                            cancelled = True
                            proc.kill()
                            return

                await asyncio.gather(
                    _drain(proc.stdout, False),
                    _drain(proc.stderr, True),
                    return_exceptions=True,
                )
                await proc.wait()
    except asyncio.CancelledError:
        return False
    except Exception as exc:
        await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
        await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
        return False
    # Flush only the tail that hasn't been written yet (avoids duplicating the log)
    remainder = "".join(output_lines[flushed:])
    if remainder:
        await _append_stage_log(job_id, "surface_validate", remainder)
    await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
    if cancelled:
        return False
    if threshold_exceeded or bad_blocks_total > settings.bad_block_threshold:
        await _set_stage_error(
            job_id, "surface_validate",
            f"Surface validate FAILED: {bad_blocks_total} bad block(s) found "
            f"(threshold: {settings.bad_block_threshold})"
        )
        return False
    return True
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
"""Simulate a timed stage (surface validation / IO validation) with progress updates."""
"""Simulate a timed stage with progress updates (mock / dev mode)."""
start = time.monotonic()
while True:
@ -449,9 +676,28 @@ async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds:
await asyncio.sleep(POLL_INTERVAL)
async def _stage_final_check(job_id: int, devname: str) -> bool:
"""Verify drive passed all tests by checking current SMART health in DB."""
async def _stage_final_check(job_id: int, devname: str, drive_id: int | None = None) -> bool:
"""
Verify drive passed all tests.
SSH mode: run smartctl -a and check critical attributes.
Mock mode: check SMART health field in DB.
"""
await asyncio.sleep(1)
from app import ssh_client
if ssh_client.is_configured() and drive_id is not None:
try:
attrs = await ssh_client.get_smart_attributes(devname)
await _store_smart_attrs(drive_id, attrs)
if attrs["health"] == "FAILED" or attrs["failures"]:
failures = attrs["failures"] or [f"SMART health: {attrs['health']}"]
await _set_stage_error(job_id, "final_check",
"Final check failed: " + "; ".join(failures))
return False
return True
except Exception as exc:
log.warning("SSH final_check failed, falling back to DB check: %s", exc)
# DB check (mock mode fallback)
async with _db() as db:
cur = await db.execute(
"SELECT smart_health FROM drives WHERE devname=?", (devname,)
@ -549,6 +795,57 @@ async def _cancel_stage(job_id: int, stage_name: str) -> None:
await db.commit()
async def _append_stage_log(job_id: int, stage_name: str, text: str) -> None:
    """Concatenate *text* onto the stage row's log_text column (NULL-safe)."""
    params = (text, job_id, stage_name)
    async with _db() as db:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute(
            """UPDATE burnin_stages
               SET log_text = COALESCE(log_text, '') || ?
               WHERE burnin_job_id=? AND stage_name=?""",
            params,
        )
        await db.commit()
async def _update_stage_bad_blocks(job_id: int, stage_name: str, count: int) -> None:
    """Persist the running bad-block count for one burn-in stage row."""
    query = "UPDATE burnin_stages SET bad_blocks=? WHERE burnin_job_id=? AND stage_name=?"
    async with _db() as db:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute(query, (count, job_id, stage_name))
        await db.commit()
async def _store_smart_attrs(drive_id: int, attrs: dict) -> None:
    """Persist latest SMART attribute dict to drives.smart_attrs (JSON)."""
    import json
    # Stringify the int attribute IDs so the blob round-trips through JSON cleanly
    payload = {
        "health": attrs.get("health", "UNKNOWN"),
        "attrs": {str(k): v for k, v in attrs.get("attributes", {}).items()},
        "warnings": attrs.get("warnings", []),
        "failures": attrs.get("failures", []),
    }
    blob = json.dumps(payload)
    async with _db() as db:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute("UPDATE drives SET smart_attrs=? WHERE id=?", (blob, drive_id))
        await db.commit()
async def _store_smart_raw_output(drive_id: int, test_type: str, raw: str) -> None:
    """Store raw smartctl output in smart_tests.raw_output.

    test_type arrives upper-cased from the burn-in stages ("SHORT"/"LONG")
    but smart_tests rows are keyed lower-case, hence the .lower().
    """
    params = (raw, drive_id, test_type.lower())
    async with _db() as db:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute(
            "UPDATE smart_tests SET raw_output=? WHERE drive_id=? AND test_type=?",
            params,
        )
        await db.commit()
async def _set_stage_error(job_id: int, stage_name: str, error_text: str) -> None:
async with _db() as db:
await db.execute("PRAGMA journal_mode=WAL")

View file

@ -56,9 +56,17 @@ class Settings(BaseSettings):
temp_crit_c: int = 55 # red critical (precheck refuses to start above this)
# Bad-block tolerance — surface_validate fails if bad blocks exceed this
# (applies to real badblocks in Stage 7; ignored by mock simulation)
bad_block_threshold: int = 0
# SSH credentials for direct TrueNAS command execution (Stage 7)
# When ssh_host is set, burn-in stages use SSH for smartctl/badblocks instead of REST API.
# Leave ssh_host empty to use the mock/REST API (development mode).
ssh_host: str = ""
ssh_port: int = 22
ssh_user: str = "root" # TrueNAS CORE default is root
ssh_password: str = "" # Password auth (leave blank if using key)
ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-6d"

View file

@ -82,6 +82,11 @@ CREATE INDEX IF NOT EXISTS idx_audit_events_job ON audit_events(burnin_job_id)
_MIGRATIONS = [
"ALTER TABLE drives ADD COLUMN notes TEXT",
"ALTER TABLE drives ADD COLUMN location TEXT",
# Stage 7: SSH command output + SMART attribute storage
"ALTER TABLE burnin_stages ADD COLUMN log_text TEXT",
"ALTER TABLE burnin_stages ADD COLUMN bad_blocks INTEGER DEFAULT 0",
"ALTER TABLE drives ADD COLUMN smart_attrs TEXT",
"ALTER TABLE smart_tests ADD COLUMN raw_output TEXT",
]

View file

@ -23,8 +23,10 @@ async def notify_job_complete(
profile: str,
operator: str,
error_text: str | None,
bad_blocks: int = 0,
) -> None:
"""Fire all configured notifications for a completed burn-in job."""
from datetime import datetime, timezone
tasks = []
if settings.webhook_url:
@ -38,6 +40,8 @@ async def notify_job_complete(
"profile": profile,
"operator": operator,
"error_text": error_text,
"bad_blocks": bad_blocks,
"timestamp": datetime.now(timezone.utc).isoformat(),
}))
if settings.smtp_host:

View file

@ -126,7 +126,7 @@ def _format_elapsed(iso: str | None) -> str:
return ""
# Register
# Register filters
templates.env.filters["format_bytes"] = _format_bytes
templates.env.filters["format_eta"] = _format_eta
templates.env.filters["temp_class"] = _temp_class
@ -135,3 +135,7 @@ templates.env.filters["format_dt_full"] = _format_dt_full
templates.env.filters["format_duration"] = _format_duration
templates.env.filters["format_elapsed"] = _format_elapsed
templates.env.globals["drive_status"] = _drive_status
from app.config import settings as _settings
templates.env.globals["app_version"] = _settings.app_version

View file

@ -258,7 +258,7 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
raise HTTPException(status_code=404, detail="Drive not found")
drive = _row_to_drive(row)
# Latest burn-in job + its stages
# Latest burn-in job + its stages (include log_text and bad_blocks)
cur = await db.execute(
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
(drive_id,),
@ -268,12 +268,33 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
if job_row:
job = dict(job_row)
cur = await db.execute(
"SELECT * FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
"SELECT id, stage_name, state, percent, started_at, finished_at, "
"duration_seconds, error_text, log_text, bad_blocks "
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
(job_row["id"],),
)
job["stages"] = [dict(r) for r in await cur.fetchall()]
burnin = job
# SMART raw output from smart_tests table
cur = await db.execute(
"SELECT test_type, state, percent, started_at, finished_at, error_text, raw_output "
"FROM smart_tests WHERE drive_id=?",
(drive_id,),
)
smart_rows = {r["test_type"]: dict(r) for r in await cur.fetchall()}
# Cached SMART attributes (JSON blob on drives table)
import json as _json
smart_attrs = None
cur = await db.execute("SELECT smart_attrs FROM drives WHERE id=?", (drive_id,))
attrs_row = await cur.fetchone()
if attrs_row and attrs_row["smart_attrs"]:
try:
smart_attrs = _json.loads(attrs_row["smart_attrs"])
except Exception:
pass
# Last 50 audit events for this drive (newest first)
cur = await db.execute("""
SELECT id, event_type, operator, message, created_at
@ -284,6 +305,13 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
""", (drive_id,))
events = [dict(r) for r in await cur.fetchall()]
def _smart_card(test_type: str) -> dict:
smart_obj = drive.smart_short if test_type == "short" else drive.smart_long
base = smart_obj.model_dump() if smart_obj else {}
row = smart_rows.get(test_type, {})
base["raw_output"] = row.get("raw_output")
return base
return {
"drive": {
"id": drive.id,
@ -294,8 +322,9 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
},
"burnin": burnin,
"smart": {
"short": drive.smart_short.model_dump() if drive.smart_short else None,
"long": drive.smart_long.model_dump() if drive.smart_long else None,
"short": _smart_card("short"),
"long": _smart_card("long"),
"attrs": smart_attrs,
},
"events": events,
}
@ -672,6 +701,53 @@ async def update_drive(
return {"updated": True}
@router.post("/api/v1/drives/{drive_id}/reset")
async def reset_drive(
    drive_id: int,
    body: dict,
    db: aiosqlite.Connection = Depends(get_db),
):
    """
    Clear SMART test results for a drive so it shows as fresh.

    Only allowed when no burn-in job is active (queued or running).
    All job history is preserved — this only resets the display state.
    """
    # 404 on unknown drive
    cur = await db.execute("SELECT id FROM drives WHERE id=?", (drive_id,))
    if await cur.fetchone() is None:
        raise HTTPException(status_code=404, detail="Drive not found")
    # Reject if any active burn-in
    cur = await db.execute(
        "SELECT COUNT(*) FROM burnin_jobs WHERE drive_id=? AND state IN ('queued','running')",
        (drive_id,),
    )
    active_jobs = (await cur.fetchone())[0]
    if active_jobs > 0:
        raise HTTPException(status_code=409, detail="Cannot reset while a burn-in is active")
    operator = body.get("operator", "operator")
    # Reset SMART test state to idle
    await db.execute(
        """UPDATE smart_tests SET state='idle', percent=0, started_at=NULL,
           eta_at=NULL, finished_at=NULL, error_text=NULL, raw_output=NULL
           WHERE drive_id=?""",
        (drive_id,),
    )
    # Clear cached SMART attributes
    await db.execute("UPDATE drives SET smart_attrs=NULL WHERE id=?", (drive_id,))
    # Audit event
    await db.execute(
        """INSERT INTO audit_events (event_type, drive_id, operator, message)
           VALUES (?,?,?,?)""",
        ("drive_reset", drive_id, operator, "Drive reset — SMART state cleared"),
    )
    await db.commit()
    poller._notify_subscribers()
    return {"reset": True}
# ---------------------------------------------------------------------------
# Audit log page
# ---------------------------------------------------------------------------
@ -766,6 +842,36 @@ async def stats_page(
""")
by_day = [dict(r) for r in await cur.fetchall()]
# Average test duration by drive size (rounded to nearest TB)
cur = await db.execute("""
SELECT
CAST(ROUND(CAST(d.size_bytes AS REAL) / 1e12) AS INTEGER) AS size_tb,
COUNT(*) AS total,
ROUND(AVG(
(julianday(bj.finished_at) - julianday(bj.started_at)) * 86400 / 3600.0
), 1) AS avg_hours
FROM burnin_jobs bj
JOIN drives d ON d.id = bj.drive_id
WHERE bj.state IN ('passed', 'failed')
AND bj.started_at IS NOT NULL
AND bj.finished_at IS NOT NULL
GROUP BY size_tb
ORDER BY size_tb
""")
by_size = [dict(r) for r in await cur.fetchall()]
# Failure breakdown by stage (which stage caused the failure)
cur = await db.execute("""
SELECT
COALESCE(bj.stage_name, 'unknown') AS failed_stage,
COUNT(*) AS count
FROM burnin_jobs bj
WHERE bj.state = 'failed'
GROUP BY failed_stage
ORDER BY count DESC
""")
by_failure_stage = [dict(r) for r in await cur.fetchall()]
# Drives tracked
cur = await db.execute("SELECT COUNT(*) FROM drives")
drives_total = (await cur.fetchone())[0]
@ -776,6 +882,8 @@ async def stats_page(
"overall": overall,
"by_model": by_model,
"by_day": by_day,
"by_size": by_size,
"by_failure_stage": by_failure_stage,
"drives_total": drives_total,
"poller": ps,
**_stale_context(ps),
@ -813,6 +921,11 @@ async def settings_page(
"temp_warn_c": settings.temp_warn_c,
"temp_crit_c": settings.temp_crit_c,
"bad_block_threshold": settings.bad_block_threshold,
# SSH credentials (take effect immediately — each SSH call reads live settings)
"ssh_host": settings.ssh_host,
"ssh_port": settings.ssh_port,
"ssh_user": settings.ssh_user,
# Note: ssh_password and ssh_key intentionally omitted from display (sensitive)
# System settings (restart required to fully apply)
"truenas_base_url": settings.truenas_base_url,
"truenas_verify_tls": settings.truenas_verify_tls,
@ -823,11 +936,13 @@ async def settings_page(
# Note: truenas_api_key intentionally omitted from display (sensitive)
}
from app import ssh_client as _ssh
ps = poller.get_state()
return templates.TemplateResponse("settings.html", {
"request": request,
"editable": editable,
"smtp_enabled": bool(settings.smtp_host),
"ssh_configured": _ssh.is_configured(),
"app_version": settings.app_version,
"poller": ps,
**_stale_context(ps),
@ -838,7 +953,7 @@ async def settings_page(
async def save_settings(body: dict):
"""Save editable runtime settings. Secrets are only updated if non-empty."""
# Don't overwrite secrets if client sent empty string
for secret_field in ("smtp_password", "truenas_api_key"):
for secret_field in ("smtp_password", "truenas_api_key", "ssh_password", "ssh_key"):
if secret_field in body and body[secret_field] == "":
del body[secret_field]
@ -859,6 +974,16 @@ async def test_smtp():
return {"ok": True}
@router.post("/api/v1/settings/test-ssh")
async def test_ssh():
"""Test the current SSH configuration."""
from app import ssh_client
result = await ssh_client.test_connection()
if not result["ok"]:
raise HTTPException(status_code=502, detail=result.get("error", "Connection failed"))
return {"ok": True}
@router.get("/api/v1/updates/check")
async def check_updates():
"""Check for a newer release on Forgejo."""

View file

@ -38,6 +38,12 @@ _EDITABLE: dict[str, type] = {
"temp_warn_c": int,
"temp_crit_c": int,
"bad_block_threshold": int,
# SSH credentials — take effect immediately (each connection reads live settings)
"ssh_host": str,
"ssh_port": int,
"ssh_user": str,
"ssh_password": str,
"ssh_key": str,
# System settings — saved to JSON; require container restart to fully apply
"truenas_base_url": str,
"truenas_api_key": str,
@ -90,6 +96,9 @@ def _apply(data: dict) -> None:
if key == "bad_block_threshold" and int(val) < 0:
log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring")
continue
if key == "ssh_port" and not (1 <= int(val) <= 65535):
log.warning("settings_store: ssh_port out of range — ignoring")
continue
setattr(settings, key, val)
except (ValueError, TypeError) as exc:
log.warning("settings_store: invalid value for %s: %s", key, exc)

303
app/ssh_client.py Normal file
View file

@ -0,0 +1,303 @@
"""
SSH client for direct TrueNAS command execution (Stage 7).
When ssh_host is configured, burn-in stages use SSH to run smartctl and
badblocks directly on the TrueNAS host instead of going through the REST API.
Falls back to REST API / simulation when SSH is not configured (dev/mock mode).
TrueNAS CORE (FreeBSD) device paths: /dev/ada0, /dev/da0, etc.
TrueNAS SCALE (Linux) device paths: /dev/sda, /dev/sdb, etc.
The devname from the TrueNAS API is used as-is in /dev/{devname}.
"""
import asyncio
import logging
import re
from typing import Callable
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Monitored SMART attributes
# True → any non-zero raw value is a hard failure (drive rejected)
# False → non-zero is a warning (flagged but test continues)
# ---------------------------------------------------------------------------
SMART_ATTRS: dict[int, tuple[str, bool]] = {
5: ("Reallocated_Sector_Ct", True), # reallocation = FAIL
10: ("Spin_Retry_Count", False), # mechanical stress = WARN
188: ("Command_Timeout", False), # drive not responding = WARN
197: ("Current_Pending_Sector", True), # pending reallocation = FAIL
198: ("Offline_Uncorrectable", True), # unrecoverable read error = FAIL
199: ("UDMA_CRC_Error_Count", False), # cable/controller issue = WARN
}
# ---------------------------------------------------------------------------
# Configuration check
# ---------------------------------------------------------------------------
def is_configured() -> bool:
    """Returns True when SSH credentials are present and usable."""
    from app.config import settings
    # A host alone is not enough — we need at least one auth method.
    has_auth = bool(settings.ssh_password) or bool(settings.ssh_key)
    return bool(settings.ssh_host) and has_auth
# ---------------------------------------------------------------------------
# Low-level connection
# ---------------------------------------------------------------------------
async def _connect():
    """Open a single-use SSH connection. Caller must use `async with`."""
    import asyncssh
    from app.config import settings
    # Host-key verification is disabled deliberately — same spirit as
    # TRUENAS_VERIFY_TLS=false in the REST path.
    options: dict = dict(
        host=settings.ssh_host,
        port=settings.ssh_port,
        username=settings.ssh_user,
        known_hosts=None,
    )
    if settings.ssh_key:
        options["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
    if settings.ssh_password:
        options["password"] = settings.ssh_password
    return asyncssh.connect(**options)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def test_connection() -> dict:
    """Test SSH connectivity. Returns {"ok": True} or {"ok": False, "error": str}."""
    if not is_configured():
        return {"ok": False, "error": "SSH not configured (ssh_host is empty)"}
    try:
        async with await _connect() as conn:
            result = await conn.run("echo ok", check=False)
    except Exception as exc:
        return {"ok": False, "error": str(exc)}
    if "ok" in result.stdout:
        return {"ok": True}
    return {"ok": False, "error": result.stderr.strip() or "unexpected output"}
async def get_smart_attributes(devname: str) -> dict:
    """
    Run `smartctl -a /dev/{devname}` and parse the output.

    Returns:
        health: str          "PASSED" | "FAILED" | "UNKNOWN"
        raw_output: str      full smartctl output
        attributes: dict[int, {"name": str, "raw": int}]
        warnings: list[str]  monitored non-critical attrs with non-zero raw
        failures: list[str]  monitored critical attrs with non-zero raw

    Raises:
        Any connection/transport error from the underlying SSH call.
        Previously such errors were swallowed and reported as a SMART
        "failure", which made a transient network problem look like a failing
        drive and left burnin.py's documented fall-back-to-DB path on
        exception unreachable. Both call sites already wrap this call in
        try/except, so propagating is safe and restores that behavior.
    """
    cmd = f"smartctl -a /dev/{devname}"
    async with await _connect() as conn:
        result = await conn.run(cmd, check=False)
    # smartctl writes some diagnostics to stderr; keep everything for the log
    output = result.stdout + result.stderr
    return _parse_smartctl(output)
async def start_smart_test(devname: str, test_type: str) -> str:
    """
    Kick off a drive self-test via `smartctl -t short|long /dev/{devname}`.

    test_type: "SHORT" or "LONG"
    Returns the raw smartctl output; raises RuntimeError when the test
    could not be started.
    """
    arg = "short" if test_type.upper() == "SHORT" else "long"
    cmd = f"smartctl -t {arg} /dev/{devname}"
    async with await _connect() as conn:
        result = await conn.run(cmd, check=False)
    output = result.stdout + result.stderr
    # smartctl exits 0 or 4 when the test is successfully started on most drives
    if ("Testing has begun" in output
            or "test has begun" in output.lower()
            or result.returncode in (0, 4)):
        return output
    raise RuntimeError(f"smartctl returned exit {result.returncode}: {output[:400]}")
async def poll_smart_progress(devname: str) -> dict:
    """
    Query self-test status via `smartctl -a /dev/{devname}`.

    Returns:
        state: "running" | "passed" | "failed" | "unknown"
        percent_remaining: int (0 = complete when state != "running")
        output: str
    """
    cmd = f"smartctl -a /dev/{devname}"
    async with await _connect() as conn:
        result = await conn.run(cmd, check=False)
    combined = result.stdout + result.stderr
    return _parse_smart_progress(combined)
async def abort_smart_test(devname: str) -> None:
    """Send `smartctl -X /dev/{devname}` to abort an in-progress test."""
    async with await _connect() as conn:
        # Fire-and-forget: smartctl's exit code is not meaningful here.
        await conn.run(f"smartctl -X /dev/{devname}", check=False)
async def run_badblocks(
    devname: str,
    on_progress: Callable[[int, int, str], None],
    cancelled_fn: Callable[[], bool] | None = None,
) -> dict:
    """
    Run `badblocks -wsv -b 4096 -p 1 /dev/{devname}` and stream output.

    on_progress(percent, bad_blocks, line) is called for every line of output
    (progress percent parsed from stderr, bad-block numbers counted from
    stdout). cancelled_fn() is polled once per line to support mid-test
    cancellation; the badblocks process is killed when it returns True or
    when the count exceeds settings.bad_block_threshold.

    Returns: {"bad_blocks": int, "output": str, "aborted": bool}
    SSH/transport errors are appended to the output rather than raised, so the
    caller always receives the partial log.
    """
    from app.config import settings
    cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
    lines: list[str] = []
    bad_blocks = 0
    aborted = False
    last_pct = 0
    try:
        async with await _connect() as conn:
            async with conn.create_process(cmd) as proc:
                # badblocks writes progress to stderr, bad block numbers to stdout
                async def _read_stream(stream, is_stderr: bool):
                    nonlocal bad_blocks, last_pct, aborted
                    async for raw_line in stream:
                        line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8", errors="replace")
                        lines.append(line)
                        if is_stderr:
                            # Progress lines look like "12.34% done"; cap at 99 until completion
                            m = re.search(r"([\d.]+)%\s+done", line)
                            if m:
                                last_pct = min(99, int(float(m.group(1))))
                        else:
                            # Each non-empty stdout line during badblocks is a bad block number
                            stripped = line.strip()
                            if stripped and stripped.isdigit():
                                bad_blocks += 1
                        on_progress(last_pct, bad_blocks, line)
                        # Abort if threshold exceeded
                        if bad_blocks > settings.bad_block_threshold:
                            aborted = True
                            proc.kill()
                            lines.append(
                                f"\n[ABORTED] Bad block count ({bad_blocks}) exceeded "
                                f"threshold ({settings.bad_block_threshold})\n"
                            )
                            return
                        # Abort on cancellation
                        if cancelled_fn and cancelled_fn():
                            aborted = True
                            proc.kill()
                            return

                stdout_task = asyncio.create_task(_read_stream(proc.stdout, False))
                stderr_task = asyncio.create_task(_read_stream(proc.stderr, True))
                await asyncio.gather(stdout_task, stderr_task, return_exceptions=True)
                await proc.wait()
    except Exception as exc:
        lines.append(f"\n[SSH error] {exc}\n")
    # NOTE: a previous revision set last_pct = 100 here, but nothing reads
    # last_pct after the streams close — removed as dead code.
    return {
        "bad_blocks": bad_blocks,
        "output": "".join(lines),
        "aborted": aborted,
    }
# ---------------------------------------------------------------------------
# Parsers
# ---------------------------------------------------------------------------
def _parse_smartctl(output: str) -> dict:
    """
    Parse full `smartctl -a` output into a health verdict plus attribute table.

    Returns a dict with keys: health, raw_output, attributes (id -> {name, raw}),
    warnings, failures. Monitored attributes (SMART_ATTRS) with a non-zero raw
    value are reported as failures when critical, otherwise as warnings.
    """
    result: dict = {
        "health": "UNKNOWN",
        "raw_output": output,
        "attributes": {},
        "warnings": [],
        "failures": [],
    }
    health_m = re.search(r"self-assessment test result:\s+(\w+)", output, re.IGNORECASE)
    if health_m:
        result["health"] = health_m.group(1).upper()
    # Attribute table: ID# NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
    row_re = re.compile(
        r"\s*(\d+)\s+(\S+)\s+\S+\s+\d+\s+\d+\s+\d+\s+\S+\s+\S+\s+\S+\s+(\d+)"
    )
    for text_line in output.splitlines():
        row = row_re.match(text_line)
        if row is None:
            continue
        attr_id = int(row.group(1))
        attr_name = row.group(2)
        raw_val = int(row.group(3))
        result["attributes"][attr_id] = {"name": attr_name, "raw": raw_val}
        monitored = SMART_ATTRS.get(attr_id)
        if monitored is not None and raw_val > 0:
            _, is_critical = monitored
            bucket = result["failures"] if is_critical else result["warnings"]
            bucket.append(f"{attr_name} = {raw_val}")
    return result
def _parse_smart_progress(output: str) -> dict:
state = "unknown"
percent_remaining = 0
lower = output.lower()
if "self-test routine in progress" in lower or "self-test routine in progress" in output:
state = "running"
m = re.search(r"(\d+)%\s+of\s+test\s+remaining", output, re.IGNORECASE)
if m:
percent_remaining = int(m.group(1))
elif "completed without error" in lower:
state = "passed"
elif (
"completed: read failure" in lower
or "completed: write failure" in lower
or "aborted by host" in lower
or ("completed" in lower and "failure" in lower)
):
state = "failed"
elif "in progress" in lower:
state = "running"
return {
"state": state,
"percent_remaining": percent_remaining,
"output": output,
}

View file

@ -2283,3 +2283,125 @@ tr.drawer-row-active {
.drawer-smart-grid { grid-template-columns: 1fr; }
.drawer-drive-meta { display: none; }
}
/* -----------------------------------------------------------------------
   Stage raw log output (SSH mode)
   ----------------------------------------------------------------------- */
.stage-log {
    font-family: "SF Mono", "Consolas", "Monaco", monospace;
    font-size: 11px;
    line-height: 1.5;
    color: var(--text-muted);
    background: var(--bg);
    border-left: 2px solid var(--border);
    margin: 6px 0 2px 28px;
    padding: 6px 10px;
    white-space: pre-wrap;
    word-break: break-all;
    max-height: 200px; /* long badblocks logs scroll instead of growing the drawer */
    overflow-y: auto;
}
/* Bad block numbers highlighted by app.js log rendering */
.stage-log .log-bad-block {
    color: var(--red);
    font-weight: 600;
}
.stage-log .log-warn {
    color: var(--yellow);
}
/* -----------------------------------------------------------------------
   SMART attributes table in drawer
   ----------------------------------------------------------------------- */
.smart-attrs {
    margin-top: 12px;
    border-top: 1px solid var(--border);
    padding-top: 10px;
}
.smart-attrs-title {
    font-size: 11px;
    font-weight: 600;
    color: var(--text-muted);
    text-transform: uppercase;
    letter-spacing: .05em;
    margin-bottom: 6px;
}
.smart-attr-row {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 3px 0;
    border-bottom: 1px solid color-mix(in srgb, var(--border) 50%, transparent);
    font-size: 12px;
}
.smart-attr-row:last-child { border-bottom: none; }
.smart-attr-name { color: var(--text-muted); }
.smart-attr-val { font-family: "SF Mono", monospace; font-size: 12px; }
/* ok/warn/fail colouring mirrors the warn/fail attribute thresholds */
.smart-attr-val.attr-ok { color: var(--green); }
.smart-attr-val.attr-warn { color: var(--yellow); font-weight: 600; }
.smart-attr-val.attr-fail { color: var(--red); font-weight: 600; }
.smart-attr-raw-output {
    font-family: "SF Mono", "Consolas", monospace;
    font-size: 10.5px;
    line-height: 1.45;
    color: var(--text-muted);
    background: var(--bg);
    border: 1px solid var(--border);
    border-radius: 4px;
    padding: 8px 10px;
    margin-top: 10px;
    white-space: pre; /* smartctl tables rely on column alignment */
    overflow: auto;
    max-height: 240px;
}
/* -----------------------------------------------------------------------
   Reset button
   ----------------------------------------------------------------------- */
.btn-reset {
    background: transparent;
    border: 1px solid color-mix(in srgb, var(--text-muted) 40%, transparent);
    color: var(--text-muted);
    border-radius: 5px;
    padding: 3px 8px;
    font-size: 12px;
    cursor: pointer;
    transition: border-color .15s, color .15s;
}
.btn-reset:hover {
    border-color: var(--yellow);
    color: var(--yellow);
}
/* -----------------------------------------------------------------------
   Parallel burn-in inline warning
   ----------------------------------------------------------------------- */
.sf-inline-warn {
    background: color-mix(in srgb, var(--yellow) 12%, transparent);
    border: 1px solid color-mix(in srgb, var(--yellow) 40%, transparent);
    border-radius: 5px;
    color: var(--yellow);
    font-size: 12px;
    padding: 7px 10px;
    margin: 4px 0 8px 0;
}
/* -----------------------------------------------------------------------
   SSH textarea (private-key paste field in Settings)
   ----------------------------------------------------------------------- */
.sf-textarea {
    resize: vertical;
    min-height: 90px;
    font-family: "SF Mono", "Consolas", monospace;
    font-size: 11px;
}
/* -----------------------------------------------------------------------
   Version badge in header
   ----------------------------------------------------------------------- */
.header-version {
    font-size: 11px;
    color: var(--text-muted);
    opacity: .6;
    padding: 0 2px;
    font-variant-numeric: tabular-nums;
}

View file

@ -957,8 +957,18 @@
if (s.error_text) {
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
}
// Raw SSH log output (if available)
if (s.log_text) {
var logHtml = _esc(s.log_text)
.replace(/^(\d+)\s*$/gm, '<span class="log-bad-block">$1 ← BAD BLOCK</span>')
.replace(/\[WARNING\][^\n]*/g, '<span class="log-warn">$&</span>');
html += '<pre class="stage-log">' + logHtml + '</pre>';
}
// Bad block count badge
if (s.bad_blocks && s.bad_blocks > 0) {
html += '<div class="stage-error-line">' + s.bad_blocks + ' bad block(s) found</div>';
}
html += '</div>';
});
} else {
html += '<div class="drawer-empty">No stage data yet.</div>';
}
@ -973,6 +983,10 @@
}
}
// Monitored SMART attributes for inline colouring
var _SMART_CRITICAL = {5: true, 197: true, 198: true};
var _SMART_WARN = {10: true, 188: true, 199: true};
function _drawerRenderSmart(smart) {
var panel = document.getElementById('drawer-panel-smart');
if (!panel) return;
@ -994,10 +1008,41 @@
if (t.started_at) html += '<div class="smart-detail">Started: ' + _drawerFmtDt(t.started_at) + '</div>';
if (t.finished_at) html += '<div class="smart-detail">Finished: ' + _drawerFmtDt(t.finished_at) + '</div>';
if (t.error_text) html += '<div class="stage-error-line">' + _esc(t.error_text) + '</div>';
// Raw smartctl output (SSH mode)
if (t.raw_output) {
html += '<pre class="smart-attr-raw-output">' + _esc(t.raw_output) + '</pre>';
}
}
html += '</div>';
});
html += '</div>';
// SMART attribute table (from SSH attribute parse)
var attrs = smart && smart.attrs;
if (attrs) {
html += '<div class="smart-attrs">';
html += '<div class="smart-attrs-title">SMART Attributes</div>';
if (attrs.failures && attrs.failures.length) {
html += '<div class="stage-error-line" style="margin-bottom:6px">✕ Failures: ' + _esc(attrs.failures.join('; ')) + '</div>';
}
if (attrs.warnings && attrs.warnings.length) {
html += '<div class="stage-error-line" style="color:var(--yellow);margin-bottom:6px">⚠ Warnings: ' + _esc(attrs.warnings.join('; ')) + '</div>';
}
var attrMap = attrs.attrs || {};
var monitoredIds = [5, 10, 188, 197, 198, 199];
monitoredIds.forEach(function (id) {
var entry = attrMap[String(id)];
if (!entry) return;
var raw = entry.raw;
var cls = raw > 0 ? (_SMART_CRITICAL[id] ? 'attr-fail' : 'attr-warn') : 'attr-ok';
html += '<div class="smart-attr-row">';
html += '<span class="smart-attr-name">' + id + ' ' + _esc(entry.name) + '</span>';
html += '<span class="smart-attr-val ' + cls + '">' + raw + '</span>';
html += '</div>';
});
html += '</div>';
}
panel.innerHTML = html;
}
@ -1078,4 +1123,21 @@
if (e.target.closest('#drawer-close-btn')) closeDrawer();
});
// Reset button — clears SMART state for a drive
document.addEventListener('click', function (e) {
    var resetBtn = e.target.closest('.btn-reset');
    if (!resetBtn) return;
    var driveId = resetBtn.dataset.driveId;
    if (!driveId) return;
    fetch('/api/v1/drives/' + driveId + '/reset', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ operator: window._operator || 'operator' }),
    }).then(function (resp) {
        if (!resp.ok) {
            // Surface the API's error detail when the reset is rejected.
            return resp.json().then(function (body) {
                showToast(body.detail || 'Reset failed', 'error');
            });
        }
        showToast('Drive reset — state cleared', 'success');
    }).catch(function () {
        showToast('Network error', 'error');
    });
});
}());

View file

@ -81,6 +81,10 @@
{%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %}
{%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %}
{%- set selectable = not bi_active and not short_busy and not long_busy %}
{%- set bi_done = drive.burnin and drive.burnin.state in ('passed', 'failed', 'cancelled', 'unknown') %}
{%- set smart_done = (drive.smart_short and drive.smart_short.state in ('passed','failed','aborted'))
or (drive.smart_long and drive.smart_long.state in ('passed','failed','aborted')) %}
{%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy %}
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
<td class="col-check">
{%- if selectable %}
@ -160,6 +164,12 @@
data-health="{{ drive.smart_health }}"
{% if short_busy or long_busy %}disabled{% endif %}
title="Start Burn-In">Burn-In</button>
<!-- Reset — clears SMART state so drive can be re-tested from scratch -->
{%- if can_reset %}
<button class="btn-action btn-reset"
data-drive-id="{{ drive.id }}"
title="Reset SMART state — clears test results so drive shows as fresh">Reset</button>
{%- endif %}
{%- endif %}
</div>
</td>

View file

@ -37,6 +37,7 @@
<a class="header-link" href="/audit">Audit</a>
<a class="header-link" href="/settings">Settings</a>
<a class="header-link" href="/docs" target="_blank" rel="noopener">API</a>
<span class="header-version">v{{ app_version if app_version is defined else '—' }}</span>
</div>
</header>

View file

@ -91,6 +91,57 @@
</div>
</div>
<!-- SSH -->
<div class="settings-card">
<div class="settings-card-header">
<span class="settings-card-title">SSH (TrueNAS Direct)</span>
{% if ssh_configured %}
<span class="chip chip-passed" style="font-size:10px">Configured</span>
{% else %}
<span class="chip chip-unknown" style="font-size:10px">Not configured — using REST API / mock</span>
{% endif %}
</div>
<p class="sf-hint" style="margin-bottom:8px">
When configured, burn-in stages run smartctl and badblocks directly on TrueNAS over SSH,
enabling SMART attribute monitoring and real bad-block detection. Leave Host empty to use
the TrueNAS REST API (mock / dev mode).
</p>
<div class="sf-fields">
<div class="sf-full sf-row-test" style="margin-bottom:4px">
<button type="button" id="test-ssh-btn" class="btn-secondary">Test SSH Connection</button>
<span id="ssh-test-result" class="settings-test-result" style="display:none"></span>
</div>
<label for="ssh_host">Host / IP</label>
<input class="sf-input" id="ssh_host" name="ssh_host" type="text"
value="{{ editable.ssh_host }}" placeholder="10.0.0.x (same as TrueNAS IP)">
<label for="ssh_port">Port</label>
<input class="sf-input sf-input-xs" id="ssh_port" name="ssh_port"
type="number" min="1" max="65535" value="{{ editable.ssh_port }}" style="width:70px">
<label for="ssh_user">Username</label>
<input class="sf-input" id="ssh_user" name="ssh_user" type="text"
value="{{ editable.ssh_user }}" placeholder="root">
<label for="ssh_password">Password</label>
<input class="sf-input" id="ssh_password" name="ssh_password" type="password"
placeholder="leave blank to keep existing" autocomplete="new-password">
<label for="ssh_key">Private Key</label>
<div>
<textarea class="sf-input sf-textarea" id="ssh_key" name="ssh_key"
rows="6" placeholder="Paste PEM private key here (-----BEGIN ... KEY-----). Leave blank to keep existing." autocomplete="off"></textarea>
<span class="sf-hint" style="margin-top:3px">
Either password or key auth. Key takes precedence if both are set.
The key is stored in <code>/data/settings_overrides.json</code> on the server.
</span>
</div>
</div>
</div>
</div><!-- /left col -->
<!-- RIGHT column: Notifications + Behavior -->
@ -159,9 +210,14 @@
<div class="sf-row">
<label class="sf-label" for="max_parallel_burnins">Max Parallel Burn-Ins</label>
<input class="sf-input sf-input-xs" id="max_parallel_burnins" name="max_parallel_burnins"
type="number" min="1" max="16" value="{{ editable.max_parallel_burnins }}">
type="number" min="1" max="60" value="{{ editable.max_parallel_burnins }}">
<span class="sf-hint">How many jobs can run at the same time</span>
</div>
<div id="parallel-warn" class="sf-inline-warn"
{% if editable.max_parallel_burnins <= 8 %}style="display:none"{% endif %}>
⚠ Running many simultaneous surface scans may saturate your storage controller
and produce unreliable results. Recommended: 2–4.
</div>
<div class="sf-row">
<label class="sf-label" for="stuck_job_hours">Stuck Job Threshold (hours)</label>
@ -348,6 +404,36 @@
}
});
// Parallel burn-in warning
var parallelInput = document.getElementById('max_parallel_burnins');
var parallelWarn = document.getElementById('parallel-warn');
if (parallelInput && parallelWarn) {
parallelInput.addEventListener('input', function () {
parallelWarn.style.display = parseInt(parallelInput.value, 10) > 8 ? '' : 'none';
});
}
// Test SSH
var sshBtn = document.getElementById('test-ssh-btn');
var sshResult = document.getElementById('ssh-test-result');
if (sshBtn) {
sshBtn.addEventListener('click', async function () {
sshBtn.disabled = true;
sshBtn.textContent = 'Testing…';
sshResult.style.display = 'none';
try {
var resp = await fetch('/api/v1/settings/test-ssh', { method: 'POST' });
var data = await resp.json();
showResult(sshResult, resp.ok, resp.ok ? 'Connection OK' : (data.detail || 'Failed'));
} catch (e) {
showResult(sshResult, false, 'Network error');
} finally {
sshBtn.disabled = false;
sshBtn.textContent = 'Test SSH Connection';
}
});
}
// Check for Updates
var updBtn = document.getElementById('check-updates-btn');
var updResult = document.getElementById('update-result');

View file

@ -119,5 +119,65 @@
{% endif %}
</div>
</div>
<div class="stats-grid" style="margin-top:24px">
<!-- Average duration by drive size -->
<div class="stats-section">
<h2 class="section-title">Avg. Test Duration by Drive Size</h2>
{% if by_size %}
<div class="table-wrap" style="max-height:none">
<table>
<thead>
<tr>
<th>Size</th>
<th style="text-align:right">Jobs</th>
<th style="text-align:right">Avg Duration</th>
</tr>
</thead>
<tbody>
{% for s in by_size %}
<tr>
<td style="font-weight:500;color:var(--text-strong)">{{ s.size_tb }} TB</td>
<td class="mono text-muted" style="text-align:right">{{ s.total }}</td>
<td class="mono" style="text-align:right;color:var(--text-strong)">{{ s.avg_hours }}h</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No completed jobs yet.</div>
{% endif %}
</div>
<!-- Failure breakdown by stage -->
<div class="stats-section">
<h2 class="section-title">Failures by Stage</h2>
{% if by_failure_stage %}
<div class="table-wrap" style="max-height:none">
<table>
<thead>
<tr>
<th>Stage</th>
<th style="text-align:right">Count</th>
</tr>
</thead>
<tbody>
{% for f in by_failure_stage %}
<tr>
<td style="font-weight:500;color:var(--red)">{{ f.failed_stage | replace('_',' ') | title }}</td>
<td class="mono" style="text-align:right;color:var(--red)">{{ f.count }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No failures recorded.</div>
{% endif %}
</div>
</div>
{% endblock %}

View file

@ -5,3 +5,4 @@ httpx
pydantic-settings
jinja2
sse-starlette
asyncssh