Stage 7: SSH architecture, SMART attribute monitoring, drive reset, and polish
SSH (app/ssh_client.py — new):
- asyncssh-based client: start_smart_test, poll_smart_progress, abort_smart_test,
get_smart_attributes, run_badblocks with streaming progress callbacks
- SMART attribute table: monitors attrs 5/10/188/197/198/199 for warn/fail thresholds
- Falls back to REST API / mock simulation when ssh_host is not configured
Burn-in stages updated (burnin.py):
- _stage_smart_test: SSH path polls smartctl -a, stores raw output + parsed attributes
- _stage_surface_validate: SSH path streams badblocks, counts bad blocks vs configurable threshold
- _stage_final_check: SSH path checks smartctl attributes; DB fallback for mock mode
- New DB helpers: _append_stage_log, _update_stage_bad_blocks, _store_smart_attrs,
_store_smart_raw_output
Database (database.py):
- Migrations: burnin_stages.log_text, burnin_stages.bad_blocks,
drives.smart_attrs (JSON), smart_tests.raw_output
Settings (config.py + settings_store.py):
- ssh_host, ssh_port, ssh_user, ssh_password, ssh_key — all runtime-editable
- SSH section in Settings UI with Test SSH Connection button
Webhook (notifier.py):
- Added bad_blocks and timestamp fields to payload per SPEC
Drive reset (routes.py + drives_table.html):
- POST /api/v1/drives/{id}/reset — clears SMART state, smart_attrs; audit logged
- Reset button visible on drives with completed test state (no active burn-in)
Log drawer (app.js):
- Burn-In tab: shows raw stage log_text (SSH output) with bad block highlighting
- SMART tab: shows SMART attribute table with warn/fail colouring + raw smartctl output
Polish:
- Version badge (v1.0.0-6d) in header via Jinja2 global
- Parallel burn-in warning when max_parallel_burnins > 8 in Settings
- Stats page: avg duration by drive size + failure breakdown by stage
- settings.html: SSH section with key textarea, parallel warn div
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4ab54d7ed8
commit
2dff58bd52
15 changed files with 1141 additions and 44 deletions
315
app/burnin.py
315
app/burnin.py
|
|
@ -303,6 +303,16 @@ async def _run_job(job_id: int) -> None:
|
||||||
)
|
)
|
||||||
job_row = await cur2.fetchone()
|
job_row = await cur2.fetchone()
|
||||||
if job_row:
|
if job_row:
|
||||||
|
# Get bad_blocks count from surface_validate stage if present
|
||||||
|
bad_blocks = 0
|
||||||
|
async with _db() as db3:
|
||||||
|
cur3 = await db3.execute(
|
||||||
|
"SELECT bad_blocks FROM burnin_stages WHERE burnin_job_id=? AND stage_name='surface_validate'",
|
||||||
|
(job_id,)
|
||||||
|
)
|
||||||
|
bb_row = await cur3.fetchone()
|
||||||
|
if bb_row and bb_row[0]:
|
||||||
|
bad_blocks = bb_row[0]
|
||||||
asyncio.create_task(notifier.notify_job_complete(
|
asyncio.create_task(notifier.notify_job_complete(
|
||||||
job_id=job_id,
|
job_id=job_id,
|
||||||
devname=devname,
|
devname=devname,
|
||||||
|
|
@ -312,6 +322,7 @@ async def _run_job(job_id: int) -> None:
|
||||||
profile=job_row["profile"],
|
profile=job_row["profile"],
|
||||||
operator=job_row["operator"],
|
operator=job_row["operator"],
|
||||||
error_text=error_text,
|
error_text=error_text,
|
||||||
|
bad_blocks=bad_blocks,
|
||||||
))
|
))
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
log.error("Failed to schedule notifications: %s", exc)
|
log.error("Failed to schedule notifications: %s", exc)
|
||||||
|
|
@ -352,15 +363,15 @@ async def _dispatch_stage(job_id: int, stage_name: str, devname: str, drive_id:
|
||||||
if stage_name == "precheck":
|
if stage_name == "precheck":
|
||||||
return await _stage_precheck(job_id, drive_id)
|
return await _stage_precheck(job_id, drive_id)
|
||||||
elif stage_name == "short_smart":
|
elif stage_name == "short_smart":
|
||||||
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart")
|
return await _stage_smart_test(job_id, devname, "SHORT", "short_smart", drive_id)
|
||||||
elif stage_name == "long_smart":
|
elif stage_name == "long_smart":
|
||||||
return await _stage_smart_test(job_id, devname, "LONG", "long_smart")
|
return await _stage_smart_test(job_id, devname, "LONG", "long_smart", drive_id)
|
||||||
elif stage_name == "surface_validate":
|
elif stage_name == "surface_validate":
|
||||||
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
|
return await _stage_surface_validate(job_id, devname, drive_id)
|
||||||
elif stage_name == "io_validate":
|
elif stage_name == "io_validate":
|
||||||
return await _stage_timed_simulate(job_id, "io_validate", settings.io_validate_seconds)
|
return await _stage_timed_simulate(job_id, "io_validate", settings.io_validate_seconds)
|
||||||
elif stage_name == "final_check":
|
elif stage_name == "final_check":
|
||||||
return await _stage_final_check(job_id, devname)
|
return await _stage_final_check(job_id, devname, drive_id)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -393,8 +404,17 @@ async def _stage_precheck(job_id: int, drive_id: int) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
|
async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_name: str,
|
||||||
"""Start a TrueNAS SMART test and poll until complete."""
|
drive_id: int | None = None) -> bool:
|
||||||
|
"""Start a SMART test. Uses SSH if configured, TrueNAS REST API otherwise."""
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured():
|
||||||
|
return await _stage_smart_test_ssh(job_id, devname, test_type, stage_name, drive_id)
|
||||||
|
return await _stage_smart_test_api(job_id, devname, test_type, stage_name)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_smart_test_api(job_id: int, devname: str, test_type: str, stage_name: str) -> bool:
|
||||||
|
"""TrueNAS REST API path for SMART test (mock / dev mode)."""
|
||||||
tn_job_id = await _client.start_smart_test([devname], test_type)
|
tn_job_id = await _client.start_smart_test([devname], test_type)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -428,8 +448,215 @@ async def _stage_smart_test(job_id: int, devname: str, test_type: str, stage_nam
|
||||||
await asyncio.sleep(POLL_INTERVAL)
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_smart_test_ssh(job_id: int, devname: str, test_type: str, stage_name: str,
|
||||||
|
drive_id: int | None) -> bool:
|
||||||
|
"""SSH path for SMART test — runs smartctl directly on TrueNAS."""
|
||||||
|
from app import ssh_client
|
||||||
|
|
||||||
|
# Start the test
|
||||||
|
try:
|
||||||
|
startup = await ssh_client.start_smart_test(devname, test_type)
|
||||||
|
await _append_stage_log(job_id, stage_name, startup + "\n")
|
||||||
|
except Exception as exc:
|
||||||
|
await _set_stage_error(job_id, stage_name, f"Failed to start SMART test via SSH: {exc}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Brief pause to let the test register in smartctl output
|
||||||
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
|
# Poll until complete
|
||||||
|
while True:
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
try:
|
||||||
|
await ssh_client.abort_smart_test(devname)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
try:
|
||||||
|
progress = await ssh_client.poll_smart_progress(devname)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("SSH SMART poll failed: %s", exc, extra={"job_id": job_id})
|
||||||
|
await _append_stage_log(job_id, stage_name, f"[poll error] {exc}\n")
|
||||||
|
continue
|
||||||
|
|
||||||
|
await _append_stage_log(job_id, stage_name, progress["output"] + "\n---\n")
|
||||||
|
|
||||||
|
if progress["state"] == "running":
|
||||||
|
pct = max(0, 100 - progress["percent_remaining"])
|
||||||
|
await _update_stage_percent(job_id, stage_name, pct)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
|
||||||
|
elif progress["state"] == "passed":
|
||||||
|
await _update_stage_percent(job_id, stage_name, 100)
|
||||||
|
# Run attribute check
|
||||||
|
if drive_id is not None:
|
||||||
|
try:
|
||||||
|
attrs = await ssh_client.get_smart_attributes(devname)
|
||||||
|
await _store_smart_attrs(drive_id, attrs)
|
||||||
|
await _store_smart_raw_output(drive_id, test_type, attrs["raw_output"])
|
||||||
|
if attrs["failures"]:
|
||||||
|
error = "SMART attribute failures: " + "; ".join(attrs["failures"])
|
||||||
|
await _set_stage_error(job_id, stage_name, error)
|
||||||
|
return False
|
||||||
|
if attrs["warnings"]:
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, stage_name,
|
||||||
|
"[WARNING] " + "; ".join(attrs["warnings"]) + "\n"
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Failed to retrieve SMART attributes: %s", exc)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
return True
|
||||||
|
|
||||||
|
elif progress["state"] == "failed":
|
||||||
|
await _set_stage_error(job_id, stage_name, f"SMART {test_type} test failed")
|
||||||
|
return False
|
||||||
|
# "unknown" → keep polling
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_surface_validate(job_id: int, devname: str, drive_id: int) -> bool:
|
||||||
|
"""
|
||||||
|
Surface validation stage.
|
||||||
|
SSH mode: runs badblocks -wsv -b 4096 -p 1 /dev/{devname}.
|
||||||
|
Mock mode: simulated timed progress (no real I/O).
|
||||||
|
"""
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured():
|
||||||
|
return await _stage_surface_validate_ssh(job_id, devname, drive_id)
|
||||||
|
return await _stage_timed_simulate(job_id, "surface_validate", settings.surface_validate_seconds)
|
||||||
|
|
||||||
|
|
||||||
|
async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) -> bool:
|
||||||
|
"""Run badblocks over SSH, streaming output to stage log."""
|
||||||
|
from app import ssh_client
|
||||||
|
|
||||||
|
await _append_stage_log(
|
||||||
|
job_id, "surface_validate",
|
||||||
|
f"[START] badblocks -wsv -b 4096 -p 1 /dev/{devname}\n"
|
||||||
|
f"[NOTE] This is a DESTRUCTIVE write test. All data on /dev/{devname} will be overwritten.\n\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _is_cancelled_sync() -> bool:
|
||||||
|
# Synchronous version — we check the DB state flag set by cancel_job()
|
||||||
|
import asyncio
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
try:
|
||||||
|
return loop.run_until_complete(_is_cancelled(job_id))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
last_logged_pct = [-1]
|
||||||
|
|
||||||
|
def on_progress(pct: int, bad_blocks: int, line: str) -> None:
|
||||||
|
nonlocal last_logged_pct
|
||||||
|
# Write to log (fire-and-forget via asyncio.create_task from sync context)
|
||||||
|
# The log append is done in the async flush below
|
||||||
|
pass
|
||||||
|
|
||||||
|
accumulated_lines: list[str] = []
|
||||||
|
|
||||||
|
async def on_progress_async(pct: int, bad_blocks: int, line: str) -> None:
|
||||||
|
accumulated_lines.append(line)
|
||||||
|
# Flush to DB and update progress every ~25 lines to avoid excessive DB writes
|
||||||
|
if len(accumulated_lines) % 25 == 0:
|
||||||
|
await _append_stage_log(job_id, "surface_validate", "".join(accumulated_lines[-25:]))
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks)
|
||||||
|
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
raise asyncio.CancelledError
|
||||||
|
|
||||||
|
# Run badblocks — we adapt the callback pattern to async by collecting then flushing
|
||||||
|
result = {"bad_blocks": 0, "output": "", "aborted": False}
|
||||||
|
try:
|
||||||
|
# The actual streaming; we handle progress via the accumulated_lines pattern
|
||||||
|
bad_blocks_total = 0
|
||||||
|
output_lines: list[str] = []
|
||||||
|
|
||||||
|
async with await ssh_client._connect() as conn:
|
||||||
|
cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
|
||||||
|
async with conn.create_process(cmd) as proc:
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
async def _drain(stream, is_stderr: bool):
|
||||||
|
nonlocal bad_blocks_total
|
||||||
|
async for raw in stream:
|
||||||
|
line = raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
|
||||||
|
output_lines.append(line)
|
||||||
|
|
||||||
|
if is_stderr:
|
||||||
|
m = _re.search(r"([\d.]+)%\s+done", line)
|
||||||
|
if m:
|
||||||
|
pct = min(99, int(float(m.group(1))))
|
||||||
|
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||||
|
await _recalculate_progress(job_id)
|
||||||
|
_push_update()
|
||||||
|
else:
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and stripped.isdigit():
|
||||||
|
bad_blocks_total += 1
|
||||||
|
|
||||||
|
# Append to DB log in chunks
|
||||||
|
if len(output_lines) % 20 == 0:
|
||||||
|
chunk = "".join(output_lines[-20:])
|
||||||
|
await _append_stage_log(job_id, "surface_validate", chunk)
|
||||||
|
|
||||||
|
# Abort on bad block threshold
|
||||||
|
if bad_blocks_total > settings.bad_block_threshold:
|
||||||
|
proc.kill()
|
||||||
|
output_lines.append(
|
||||||
|
f"\n[ABORTED] {bad_blocks_total} bad block(s) exceeded "
|
||||||
|
f"threshold ({settings.bad_block_threshold})\n"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
if await _is_cancelled(job_id):
|
||||||
|
proc.kill()
|
||||||
|
return
|
||||||
|
|
||||||
|
await asyncio.gather(
|
||||||
|
_drain(proc.stdout, False),
|
||||||
|
_drain(proc.stderr, True),
|
||||||
|
return_exceptions=True,
|
||||||
|
)
|
||||||
|
await proc.wait()
|
||||||
|
|
||||||
|
# Flush remaining output
|
||||||
|
remainder = "".join(output_lines)
|
||||||
|
await _append_stage_log(job_id, "surface_validate", remainder)
|
||||||
|
result["bad_blocks"] = bad_blocks_total
|
||||||
|
result["output"] = remainder
|
||||||
|
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
return False
|
||||||
|
except Exception as exc:
|
||||||
|
await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
|
||||||
|
await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
await _update_stage_bad_blocks(job_id, "surface_validate", result["bad_blocks"])
|
||||||
|
|
||||||
|
if result["aborted"] or result["bad_blocks"] > settings.bad_block_threshold:
|
||||||
|
await _set_stage_error(
|
||||||
|
job_id, "surface_validate",
|
||||||
|
f"Surface validate FAILED: {result['bad_blocks']} bad block(s) found "
|
||||||
|
f"(threshold: {settings.bad_block_threshold})"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds: int) -> bool:
|
||||||
"""Simulate a timed stage (surface validation / IO validation) with progress updates."""
|
"""Simulate a timed stage with progress updates (mock / dev mode)."""
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -449,9 +676,28 @@ async def _stage_timed_simulate(job_id: int, stage_name: str, duration_seconds:
|
||||||
await asyncio.sleep(POLL_INTERVAL)
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
async def _stage_final_check(job_id: int, devname: str) -> bool:
|
async def _stage_final_check(job_id: int, devname: str, drive_id: int | None = None) -> bool:
|
||||||
"""Verify drive passed all tests by checking current SMART health in DB."""
|
"""
|
||||||
|
Verify drive passed all tests.
|
||||||
|
SSH mode: run smartctl -a and check critical attributes.
|
||||||
|
Mock mode: check SMART health field in DB.
|
||||||
|
"""
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
from app import ssh_client
|
||||||
|
if ssh_client.is_configured() and drive_id is not None:
|
||||||
|
try:
|
||||||
|
attrs = await ssh_client.get_smart_attributes(devname)
|
||||||
|
await _store_smart_attrs(drive_id, attrs)
|
||||||
|
if attrs["health"] == "FAILED" or attrs["failures"]:
|
||||||
|
failures = attrs["failures"] or [f"SMART health: {attrs['health']}"]
|
||||||
|
await _set_stage_error(job_id, "final_check",
|
||||||
|
"Final check failed: " + "; ".join(failures))
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("SSH final_check failed, falling back to DB check: %s", exc)
|
||||||
|
|
||||||
|
# DB check (mock mode fallback)
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT smart_health FROM drives WHERE devname=?", (devname,)
|
"SELECT smart_health FROM drives WHERE devname=?", (devname,)
|
||||||
|
|
@ -549,6 +795,57 @@ async def _cancel_stage(job_id: int, stage_name: str) -> None:
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _append_stage_log(job_id: int, stage_name: str, text: str) -> None:
|
||||||
|
"""Append text to the log_text column of a burnin_stages row."""
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"""UPDATE burnin_stages
|
||||||
|
SET log_text = COALESCE(log_text, '') || ?
|
||||||
|
WHERE burnin_job_id=? AND stage_name=?""",
|
||||||
|
(text, job_id, stage_name),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _update_stage_bad_blocks(job_id: int, stage_name: str, count: int) -> None:
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"UPDATE burnin_stages SET bad_blocks=? WHERE burnin_job_id=? AND stage_name=?",
|
||||||
|
(count, job_id, stage_name),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_smart_attrs(drive_id: int, attrs: dict) -> None:
|
||||||
|
"""Persist latest SMART attribute dict to drives.smart_attrs (JSON)."""
|
||||||
|
import json
|
||||||
|
# Convert int keys to str for JSON serialisation
|
||||||
|
serialisable = {str(k): v for k, v in attrs.get("attributes", {}).items()}
|
||||||
|
blob = json.dumps({
|
||||||
|
"health": attrs.get("health", "UNKNOWN"),
|
||||||
|
"attrs": serialisable,
|
||||||
|
"warnings": attrs.get("warnings", []),
|
||||||
|
"failures": attrs.get("failures", []),
|
||||||
|
})
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("UPDATE drives SET smart_attrs=? WHERE id=?", (blob, drive_id))
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _store_smart_raw_output(drive_id: int, test_type: str, raw: str) -> None:
|
||||||
|
"""Store raw smartctl output in smart_tests.raw_output."""
|
||||||
|
async with _db() as db:
|
||||||
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute(
|
||||||
|
"UPDATE smart_tests SET raw_output=? WHERE drive_id=? AND test_type=?",
|
||||||
|
(raw, drive_id, test_type.lower()),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
async def _set_stage_error(job_id: int, stage_name: str, error_text: str) -> None:
|
async def _set_stage_error(job_id: int, stage_name: str, error_text: str) -> None:
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
|
|
||||||
|
|
@ -56,9 +56,17 @@ class Settings(BaseSettings):
|
||||||
temp_crit_c: int = 55 # red critical (precheck refuses to start above this)
|
temp_crit_c: int = 55 # red critical (precheck refuses to start above this)
|
||||||
|
|
||||||
# Bad-block tolerance — surface_validate fails if bad blocks exceed this
|
# Bad-block tolerance — surface_validate fails if bad blocks exceed this
|
||||||
# (applies to real badblocks in Stage 7; ignored by mock simulation)
|
|
||||||
bad_block_threshold: int = 0
|
bad_block_threshold: int = 0
|
||||||
|
|
||||||
|
# SSH credentials for direct TrueNAS command execution (Stage 7)
|
||||||
|
# When ssh_host is set, burn-in stages use SSH for smartctl/badblocks instead of REST API.
|
||||||
|
# Leave ssh_host empty to use the mock/REST API (development mode).
|
||||||
|
ssh_host: str = ""
|
||||||
|
ssh_port: int = 22
|
||||||
|
ssh_user: str = "root" # TrueNAS CORE default is root
|
||||||
|
ssh_password: str = "" # Password auth (leave blank if using key)
|
||||||
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-6d"
|
app_version: str = "1.0.0-6d"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,11 @@ CREATE INDEX IF NOT EXISTS idx_audit_events_job ON audit_events(burnin_job_id)
|
||||||
_MIGRATIONS = [
|
_MIGRATIONS = [
|
||||||
"ALTER TABLE drives ADD COLUMN notes TEXT",
|
"ALTER TABLE drives ADD COLUMN notes TEXT",
|
||||||
"ALTER TABLE drives ADD COLUMN location TEXT",
|
"ALTER TABLE drives ADD COLUMN location TEXT",
|
||||||
|
# Stage 7: SSH command output + SMART attribute storage
|
||||||
|
"ALTER TABLE burnin_stages ADD COLUMN log_text TEXT",
|
||||||
|
"ALTER TABLE burnin_stages ADD COLUMN bad_blocks INTEGER DEFAULT 0",
|
||||||
|
"ALTER TABLE drives ADD COLUMN smart_attrs TEXT",
|
||||||
|
"ALTER TABLE smart_tests ADD COLUMN raw_output TEXT",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,10 @@ async def notify_job_complete(
|
||||||
profile: str,
|
profile: str,
|
||||||
operator: str,
|
operator: str,
|
||||||
error_text: str | None,
|
error_text: str | None,
|
||||||
|
bad_blocks: int = 0,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Fire all configured notifications for a completed burn-in job."""
|
"""Fire all configured notifications for a completed burn-in job."""
|
||||||
|
from datetime import datetime, timezone
|
||||||
tasks = []
|
tasks = []
|
||||||
|
|
||||||
if settings.webhook_url:
|
if settings.webhook_url:
|
||||||
|
|
@ -38,6 +40,8 @@ async def notify_job_complete(
|
||||||
"profile": profile,
|
"profile": profile,
|
||||||
"operator": operator,
|
"operator": operator,
|
||||||
"error_text": error_text,
|
"error_text": error_text,
|
||||||
|
"bad_blocks": bad_blocks,
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
}))
|
}))
|
||||||
|
|
||||||
if settings.smtp_host:
|
if settings.smtp_host:
|
||||||
|
|
|
||||||
|
|
@ -126,7 +126,7 @@ def _format_elapsed(iso: str | None) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
# Register
|
# Register filters
|
||||||
templates.env.filters["format_bytes"] = _format_bytes
|
templates.env.filters["format_bytes"] = _format_bytes
|
||||||
templates.env.filters["format_eta"] = _format_eta
|
templates.env.filters["format_eta"] = _format_eta
|
||||||
templates.env.filters["temp_class"] = _temp_class
|
templates.env.filters["temp_class"] = _temp_class
|
||||||
|
|
@ -135,3 +135,7 @@ templates.env.filters["format_dt_full"] = _format_dt_full
|
||||||
templates.env.filters["format_duration"] = _format_duration
|
templates.env.filters["format_duration"] = _format_duration
|
||||||
templates.env.filters["format_elapsed"] = _format_elapsed
|
templates.env.filters["format_elapsed"] = _format_elapsed
|
||||||
templates.env.globals["drive_status"] = _drive_status
|
templates.env.globals["drive_status"] = _drive_status
|
||||||
|
|
||||||
|
|
||||||
|
from app.config import settings as _settings
|
||||||
|
templates.env.globals["app_version"] = _settings.app_version
|
||||||
|
|
|
||||||
135
app/routes.py
135
app/routes.py
|
|
@ -258,7 +258,7 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
raise HTTPException(status_code=404, detail="Drive not found")
|
raise HTTPException(status_code=404, detail="Drive not found")
|
||||||
drive = _row_to_drive(row)
|
drive = _row_to_drive(row)
|
||||||
|
|
||||||
# Latest burn-in job + its stages
|
# Latest burn-in job + its stages (include log_text and bad_blocks)
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
|
"SELECT * FROM burnin_jobs WHERE drive_id=? ORDER BY id DESC LIMIT 1",
|
||||||
(drive_id,),
|
(drive_id,),
|
||||||
|
|
@ -268,12 +268,33 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
if job_row:
|
if job_row:
|
||||||
job = dict(job_row)
|
job = dict(job_row)
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT * FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
"SELECT id, stage_name, state, percent, started_at, finished_at, "
|
||||||
|
"duration_seconds, error_text, log_text, bad_blocks "
|
||||||
|
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
||||||
(job_row["id"],),
|
(job_row["id"],),
|
||||||
)
|
)
|
||||||
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
||||||
burnin = job
|
burnin = job
|
||||||
|
|
||||||
|
# SMART raw output from smart_tests table
|
||||||
|
cur = await db.execute(
|
||||||
|
"SELECT test_type, state, percent, started_at, finished_at, error_text, raw_output "
|
||||||
|
"FROM smart_tests WHERE drive_id=?",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
smart_rows = {r["test_type"]: dict(r) for r in await cur.fetchall()}
|
||||||
|
|
||||||
|
# Cached SMART attributes (JSON blob on drives table)
|
||||||
|
import json as _json
|
||||||
|
smart_attrs = None
|
||||||
|
cur = await db.execute("SELECT smart_attrs FROM drives WHERE id=?", (drive_id,))
|
||||||
|
attrs_row = await cur.fetchone()
|
||||||
|
if attrs_row and attrs_row["smart_attrs"]:
|
||||||
|
try:
|
||||||
|
smart_attrs = _json.loads(attrs_row["smart_attrs"])
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Last 50 audit events for this drive (newest first)
|
# Last 50 audit events for this drive (newest first)
|
||||||
cur = await db.execute("""
|
cur = await db.execute("""
|
||||||
SELECT id, event_type, operator, message, created_at
|
SELECT id, event_type, operator, message, created_at
|
||||||
|
|
@ -284,6 +305,13 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
""", (drive_id,))
|
""", (drive_id,))
|
||||||
events = [dict(r) for r in await cur.fetchall()]
|
events = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
def _smart_card(test_type: str) -> dict:
|
||||||
|
smart_obj = drive.smart_short if test_type == "short" else drive.smart_long
|
||||||
|
base = smart_obj.model_dump() if smart_obj else {}
|
||||||
|
row = smart_rows.get(test_type, {})
|
||||||
|
base["raw_output"] = row.get("raw_output")
|
||||||
|
return base
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"drive": {
|
"drive": {
|
||||||
"id": drive.id,
|
"id": drive.id,
|
||||||
|
|
@ -294,8 +322,9 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
||||||
},
|
},
|
||||||
"burnin": burnin,
|
"burnin": burnin,
|
||||||
"smart": {
|
"smart": {
|
||||||
"short": drive.smart_short.model_dump() if drive.smart_short else None,
|
"short": _smart_card("short"),
|
||||||
"long": drive.smart_long.model_dump() if drive.smart_long else None,
|
"long": _smart_card("long"),
|
||||||
|
"attrs": smart_attrs,
|
||||||
},
|
},
|
||||||
"events": events,
|
"events": events,
|
||||||
}
|
}
|
||||||
|
|
@ -672,6 +701,53 @@ async def update_drive(
|
||||||
return {"updated": True}
|
return {"updated": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/api/v1/drives/{drive_id}/reset")
|
||||||
|
async def reset_drive(
|
||||||
|
drive_id: int,
|
||||||
|
body: dict,
|
||||||
|
db: aiosqlite.Connection = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Clear SMART test results for a drive so it shows as fresh.
|
||||||
|
Only allowed when no burn-in job is active (queued or running).
|
||||||
|
Preserves all job history — just resets the display state.
|
||||||
|
"""
|
||||||
|
cur = await db.execute("SELECT id FROM drives WHERE id=?", (drive_id,))
|
||||||
|
if not await cur.fetchone():
|
||||||
|
raise HTTPException(status_code=404, detail="Drive not found")
|
||||||
|
|
||||||
|
# Reject if any active burn-in
|
||||||
|
cur = await db.execute(
|
||||||
|
"SELECT COUNT(*) FROM burnin_jobs WHERE drive_id=? AND state IN ('queued','running')",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
if (await cur.fetchone())[0] > 0:
|
||||||
|
raise HTTPException(status_code=409, detail="Cannot reset while a burn-in is active")
|
||||||
|
|
||||||
|
operator = body.get("operator", "operator")
|
||||||
|
|
||||||
|
# Reset SMART test state to idle
|
||||||
|
await db.execute(
|
||||||
|
"""UPDATE smart_tests SET state='idle', percent=0, started_at=NULL,
|
||||||
|
eta_at=NULL, finished_at=NULL, error_text=NULL, raw_output=NULL
|
||||||
|
WHERE drive_id=?""",
|
||||||
|
(drive_id,),
|
||||||
|
)
|
||||||
|
# Clear cached SMART attributes
|
||||||
|
await db.execute("UPDATE drives SET smart_attrs=NULL WHERE id=?", (drive_id,))
|
||||||
|
|
||||||
|
# Audit event
|
||||||
|
await db.execute(
|
||||||
|
"""INSERT INTO audit_events (event_type, drive_id, operator, message)
|
||||||
|
VALUES (?,?,?,?)""",
|
||||||
|
("drive_reset", drive_id, operator, "Drive reset — SMART state cleared"),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
poller._notify_subscribers()
|
||||||
|
return {"reset": True}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Audit log page
|
# Audit log page
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -766,6 +842,36 @@ async def stats_page(
|
||||||
""")
|
""")
|
||||||
by_day = [dict(r) for r in await cur.fetchall()]
|
by_day = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
# Average test duration by drive size (rounded to nearest TB)
|
||||||
|
cur = await db.execute("""
|
||||||
|
SELECT
|
||||||
|
CAST(ROUND(CAST(d.size_bytes AS REAL) / 1e12) AS INTEGER) AS size_tb,
|
||||||
|
COUNT(*) AS total,
|
||||||
|
ROUND(AVG(
|
||||||
|
(julianday(bj.finished_at) - julianday(bj.started_at)) * 86400 / 3600.0
|
||||||
|
), 1) AS avg_hours
|
||||||
|
FROM burnin_jobs bj
|
||||||
|
JOIN drives d ON d.id = bj.drive_id
|
||||||
|
WHERE bj.state IN ('passed', 'failed')
|
||||||
|
AND bj.started_at IS NOT NULL
|
||||||
|
AND bj.finished_at IS NOT NULL
|
||||||
|
GROUP BY size_tb
|
||||||
|
ORDER BY size_tb
|
||||||
|
""")
|
||||||
|
by_size = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
|
# Failure breakdown by stage (which stage caused the failure)
|
||||||
|
cur = await db.execute("""
|
||||||
|
SELECT
|
||||||
|
COALESCE(bj.stage_name, 'unknown') AS failed_stage,
|
||||||
|
COUNT(*) AS count
|
||||||
|
FROM burnin_jobs bj
|
||||||
|
WHERE bj.state = 'failed'
|
||||||
|
GROUP BY failed_stage
|
||||||
|
ORDER BY count DESC
|
||||||
|
""")
|
||||||
|
by_failure_stage = [dict(r) for r in await cur.fetchall()]
|
||||||
|
|
||||||
# Drives tracked
|
# Drives tracked
|
||||||
cur = await db.execute("SELECT COUNT(*) FROM drives")
|
cur = await db.execute("SELECT COUNT(*) FROM drives")
|
||||||
drives_total = (await cur.fetchone())[0]
|
drives_total = (await cur.fetchone())[0]
|
||||||
|
|
@ -776,6 +882,8 @@ async def stats_page(
|
||||||
"overall": overall,
|
"overall": overall,
|
||||||
"by_model": by_model,
|
"by_model": by_model,
|
||||||
"by_day": by_day,
|
"by_day": by_day,
|
||||||
|
"by_size": by_size,
|
||||||
|
"by_failure_stage": by_failure_stage,
|
||||||
"drives_total": drives_total,
|
"drives_total": drives_total,
|
||||||
"poller": ps,
|
"poller": ps,
|
||||||
**_stale_context(ps),
|
**_stale_context(ps),
|
||||||
|
|
@ -813,6 +921,11 @@ async def settings_page(
|
||||||
"temp_warn_c": settings.temp_warn_c,
|
"temp_warn_c": settings.temp_warn_c,
|
||||||
"temp_crit_c": settings.temp_crit_c,
|
"temp_crit_c": settings.temp_crit_c,
|
||||||
"bad_block_threshold": settings.bad_block_threshold,
|
"bad_block_threshold": settings.bad_block_threshold,
|
||||||
|
# SSH credentials (take effect immediately — each SSH call reads live settings)
|
||||||
|
"ssh_host": settings.ssh_host,
|
||||||
|
"ssh_port": settings.ssh_port,
|
||||||
|
"ssh_user": settings.ssh_user,
|
||||||
|
# Note: ssh_password and ssh_key intentionally omitted from display (sensitive)
|
||||||
# System settings (restart required to fully apply)
|
# System settings (restart required to fully apply)
|
||||||
"truenas_base_url": settings.truenas_base_url,
|
"truenas_base_url": settings.truenas_base_url,
|
||||||
"truenas_verify_tls": settings.truenas_verify_tls,
|
"truenas_verify_tls": settings.truenas_verify_tls,
|
||||||
|
|
@ -823,11 +936,13 @@ async def settings_page(
|
||||||
# Note: truenas_api_key intentionally omitted from display (sensitive)
|
# Note: truenas_api_key intentionally omitted from display (sensitive)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
from app import ssh_client as _ssh
|
||||||
ps = poller.get_state()
|
ps = poller.get_state()
|
||||||
return templates.TemplateResponse("settings.html", {
|
return templates.TemplateResponse("settings.html", {
|
||||||
"request": request,
|
"request": request,
|
||||||
"editable": editable,
|
"editable": editable,
|
||||||
"smtp_enabled": bool(settings.smtp_host),
|
"smtp_enabled": bool(settings.smtp_host),
|
||||||
|
"ssh_configured": _ssh.is_configured(),
|
||||||
"app_version": settings.app_version,
|
"app_version": settings.app_version,
|
||||||
"poller": ps,
|
"poller": ps,
|
||||||
**_stale_context(ps),
|
**_stale_context(ps),
|
||||||
|
|
@ -838,7 +953,7 @@ async def settings_page(
|
||||||
async def save_settings(body: dict):
|
async def save_settings(body: dict):
|
||||||
"""Save editable runtime settings. Secrets are only updated if non-empty."""
|
"""Save editable runtime settings. Secrets are only updated if non-empty."""
|
||||||
# Don't overwrite secrets if client sent empty string
|
# Don't overwrite secrets if client sent empty string
|
||||||
for secret_field in ("smtp_password", "truenas_api_key"):
|
for secret_field in ("smtp_password", "truenas_api_key", "ssh_password", "ssh_key"):
|
||||||
if secret_field in body and body[secret_field] == "":
|
if secret_field in body and body[secret_field] == "":
|
||||||
del body[secret_field]
|
del body[secret_field]
|
||||||
|
|
||||||
|
|
@ -859,6 +974,16 @@ async def test_smtp():
|
||||||
return {"ok": True}
|
return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/api/v1/settings/test-ssh")
|
||||||
|
async def test_ssh():
|
||||||
|
"""Test the current SSH configuration."""
|
||||||
|
from app import ssh_client
|
||||||
|
result = await ssh_client.test_connection()
|
||||||
|
if not result["ok"]:
|
||||||
|
raise HTTPException(status_code=502, detail=result.get("error", "Connection failed"))
|
||||||
|
return {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
@router.get("/api/v1/updates/check")
|
@router.get("/api/v1/updates/check")
|
||||||
async def check_updates():
|
async def check_updates():
|
||||||
"""Check for a newer release on Forgejo."""
|
"""Check for a newer release on Forgejo."""
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,12 @@ _EDITABLE: dict[str, type] = {
|
||||||
"temp_warn_c": int,
|
"temp_warn_c": int,
|
||||||
"temp_crit_c": int,
|
"temp_crit_c": int,
|
||||||
"bad_block_threshold": int,
|
"bad_block_threshold": int,
|
||||||
|
# SSH credentials — take effect immediately (each connection reads live settings)
|
||||||
|
"ssh_host": str,
|
||||||
|
"ssh_port": int,
|
||||||
|
"ssh_user": str,
|
||||||
|
"ssh_password": str,
|
||||||
|
"ssh_key": str,
|
||||||
# System settings — saved to JSON; require container restart to fully apply
|
# System settings — saved to JSON; require container restart to fully apply
|
||||||
"truenas_base_url": str,
|
"truenas_base_url": str,
|
||||||
"truenas_api_key": str,
|
"truenas_api_key": str,
|
||||||
|
|
@ -90,6 +96,9 @@ def _apply(data: dict) -> None:
|
||||||
if key == "bad_block_threshold" and int(val) < 0:
|
if key == "bad_block_threshold" and int(val) < 0:
|
||||||
log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring")
|
log.warning("settings_store: bad_block_threshold must be >= 0 — ignoring")
|
||||||
continue
|
continue
|
||||||
|
if key == "ssh_port" and not (1 <= int(val) <= 65535):
|
||||||
|
log.warning("settings_store: ssh_port out of range — ignoring")
|
||||||
|
continue
|
||||||
setattr(settings, key, val)
|
setattr(settings, key, val)
|
||||||
except (ValueError, TypeError) as exc:
|
except (ValueError, TypeError) as exc:
|
||||||
log.warning("settings_store: invalid value for %s: %s", key, exc)
|
log.warning("settings_store: invalid value for %s: %s", key, exc)
|
||||||
|
|
|
||||||
303
app/ssh_client.py
Normal file
303
app/ssh_client.py
Normal file
|
|
@ -0,0 +1,303 @@
|
||||||
|
"""
|
||||||
|
SSH client for direct TrueNAS command execution (Stage 7).
|
||||||
|
|
||||||
|
When ssh_host is configured, burn-in stages use SSH to run smartctl and
|
||||||
|
badblocks directly on the TrueNAS host instead of going through the REST API.
|
||||||
|
Falls back to REST API / simulation when SSH is not configured (dev/mock mode).
|
||||||
|
|
||||||
|
TrueNAS CORE (FreeBSD) device paths: /dev/ada0, /dev/da0, etc.
|
||||||
|
TrueNAS SCALE (Linux) device paths: /dev/sda, /dev/sdb, etc.
|
||||||
|
The devname from the TrueNAS API is used as-is in /dev/{devname}.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Monitored SMART attributes
|
||||||
|
# True → any non-zero raw value is a hard failure (drive rejected)
|
||||||
|
# False → non-zero is a warning (flagged but test continues)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
SMART_ATTRS: dict[int, tuple[str, bool]] = {
|
||||||
|
5: ("Reallocated_Sector_Ct", True), # reallocation = FAIL
|
||||||
|
10: ("Spin_Retry_Count", False), # mechanical stress = WARN
|
||||||
|
188: ("Command_Timeout", False), # drive not responding = WARN
|
||||||
|
197: ("Current_Pending_Sector", True), # pending reallocation = FAIL
|
||||||
|
198: ("Offline_Uncorrectable", True), # unrecoverable read error = FAIL
|
||||||
|
199: ("UDMA_CRC_Error_Count", False), # cable/controller issue = WARN
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Configuration check
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def is_configured() -> bool:
|
||||||
|
"""Returns True when SSH credentials are present and usable."""
|
||||||
|
from app.config import settings
|
||||||
|
return bool(settings.ssh_host and (settings.ssh_password or settings.ssh_key))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Low-level connection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def _connect():
|
||||||
|
"""Open a single-use SSH connection. Caller must use `async with`."""
|
||||||
|
import asyncssh
|
||||||
|
from app.config import settings
|
||||||
|
|
||||||
|
kwargs: dict = {
|
||||||
|
"host": settings.ssh_host,
|
||||||
|
"port": settings.ssh_port,
|
||||||
|
"username": settings.ssh_user,
|
||||||
|
"known_hosts": None, # trust all hosts (same spirit as TRUENAS_VERIFY_TLS=false)
|
||||||
|
}
|
||||||
|
if settings.ssh_key:
|
||||||
|
kwargs["client_keys"] = [asyncssh.import_private_key(settings.ssh_key)]
|
||||||
|
if settings.ssh_password:
|
||||||
|
kwargs["password"] = settings.ssh_password
|
||||||
|
|
||||||
|
return asyncssh.connect(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Public API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async def test_connection() -> dict:
|
||||||
|
"""Test SSH connectivity. Returns {"ok": True} or {"ok": False, "error": str}."""
|
||||||
|
if not is_configured():
|
||||||
|
return {"ok": False, "error": "SSH not configured (ssh_host is empty)"}
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run("echo ok", check=False)
|
||||||
|
if "ok" in result.stdout:
|
||||||
|
return {"ok": True}
|
||||||
|
return {"ok": False, "error": result.stderr.strip() or "unexpected output"}
|
||||||
|
except Exception as exc:
|
||||||
|
return {"ok": False, "error": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
async def get_smart_attributes(devname: str) -> dict:
|
||||||
|
"""
|
||||||
|
Run `smartctl -a /dev/{devname}` and parse the output.
|
||||||
|
Returns:
|
||||||
|
health: str — "PASSED" | "FAILED" | "UNKNOWN"
|
||||||
|
raw_output: str — full smartctl output
|
||||||
|
attributes: dict[int, {"name": str, "raw": int}]
|
||||||
|
warnings: list[str] — attribute names with non-zero raw (non-critical)
|
||||||
|
failures: list[str] — attribute names with non-zero raw (critical)
|
||||||
|
"""
|
||||||
|
cmd = f"smartctl -a /dev/{devname}"
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
return _parse_smartctl(output)
|
||||||
|
except Exception as exc:
|
||||||
|
return {
|
||||||
|
"health": "UNKNOWN",
|
||||||
|
"raw_output": str(exc),
|
||||||
|
"attributes": {},
|
||||||
|
"warnings": [],
|
||||||
|
"failures": [f"SSH error: {exc}"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def start_smart_test(devname: str, test_type: str) -> str:
|
||||||
|
"""
|
||||||
|
Run `smartctl -t short|long /dev/{devname}`.
|
||||||
|
Returns raw output. Raises RuntimeError on unrecoverable failure.
|
||||||
|
test_type: "SHORT" or "LONG"
|
||||||
|
"""
|
||||||
|
arg = "short" if test_type.upper() == "SHORT" else "long"
|
||||||
|
cmd = f"smartctl -t {arg} /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
# smartctl exits 0 or 4 when the test is successfully started on most drives
|
||||||
|
started = ("Testing has begun" in output or
|
||||||
|
"test has begun" in output.lower() or
|
||||||
|
result.returncode in (0, 4))
|
||||||
|
if not started:
|
||||||
|
raise RuntimeError(f"smartctl returned exit {result.returncode}: {output[:400]}")
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def poll_smart_progress(devname: str) -> dict:
|
||||||
|
"""
|
||||||
|
Run `smartctl -a /dev/{devname}` and extract self-test status.
|
||||||
|
Returns:
|
||||||
|
state: "running" | "passed" | "failed" | "unknown"
|
||||||
|
percent_remaining: int (0 = complete when state != "running")
|
||||||
|
output: str
|
||||||
|
"""
|
||||||
|
cmd = f"smartctl -a /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
result = await conn.run(cmd, check=False)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
return _parse_smart_progress(output)
|
||||||
|
|
||||||
|
|
||||||
|
async def abort_smart_test(devname: str) -> None:
|
||||||
|
"""Send `smartctl -X /dev/{devname}` to abort an in-progress test."""
|
||||||
|
cmd = f"smartctl -X /dev/{devname}"
|
||||||
|
async with await _connect() as conn:
|
||||||
|
await conn.run(cmd, check=False)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_badblocks(
|
||||||
|
devname: str,
|
||||||
|
on_progress: Callable[[int, int, str], None],
|
||||||
|
cancelled_fn: Callable[[], bool] | None = None,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Run `badblocks -wsv -b 4096 -p 1 /dev/{devname}` and stream output.
|
||||||
|
|
||||||
|
on_progress(percent, bad_blocks, line) is called for each line of output.
|
||||||
|
cancelled_fn() is polled to support mid-test cancellation.
|
||||||
|
|
||||||
|
Returns: {"bad_blocks": int, "output": str, "aborted": bool}
|
||||||
|
"""
|
||||||
|
from app.config import settings
|
||||||
|
cmd = f"badblocks -wsv -b 4096 -p 1 /dev/{devname}"
|
||||||
|
lines: list[str] = []
|
||||||
|
bad_blocks = 0
|
||||||
|
aborted = False
|
||||||
|
last_pct = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with await _connect() as conn:
|
||||||
|
async with conn.create_process(cmd) as proc:
|
||||||
|
# badblocks writes progress to stderr, bad block numbers to stdout
|
||||||
|
async def _read_stream(stream, is_stderr: bool):
|
||||||
|
nonlocal bad_blocks, last_pct, aborted
|
||||||
|
async for raw_line in stream:
|
||||||
|
line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8", errors="replace")
|
||||||
|
lines.append(line)
|
||||||
|
|
||||||
|
if is_stderr:
|
||||||
|
m = re.search(r"([\d.]+)%\s+done", line)
|
||||||
|
if m:
|
||||||
|
last_pct = min(99, int(float(m.group(1))))
|
||||||
|
else:
|
||||||
|
# Each non-empty stdout line during badblocks is a bad block number
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped and stripped.isdigit():
|
||||||
|
bad_blocks += 1
|
||||||
|
|
||||||
|
on_progress(last_pct, bad_blocks, line)
|
||||||
|
|
||||||
|
# Abort if threshold exceeded
|
||||||
|
if bad_blocks > settings.bad_block_threshold:
|
||||||
|
aborted = True
|
||||||
|
proc.kill()
|
||||||
|
lines.append(
|
||||||
|
f"\n[ABORTED] Bad block count ({bad_blocks}) exceeded "
|
||||||
|
f"threshold ({settings.bad_block_threshold})\n"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Abort on cancellation
|
||||||
|
if cancelled_fn and cancelled_fn():
|
||||||
|
aborted = True
|
||||||
|
proc.kill()
|
||||||
|
return
|
||||||
|
|
||||||
|
stdout_task = asyncio.create_task(_read_stream(proc.stdout, False))
|
||||||
|
stderr_task = asyncio.create_task(_read_stream(proc.stderr, True))
|
||||||
|
await asyncio.gather(stdout_task, stderr_task, return_exceptions=True)
|
||||||
|
await proc.wait()
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
lines.append(f"\n[SSH error] {exc}\n")
|
||||||
|
|
||||||
|
if not aborted:
|
||||||
|
last_pct = 100
|
||||||
|
|
||||||
|
return {
|
||||||
|
"bad_blocks": bad_blocks,
|
||||||
|
"output": "".join(lines),
|
||||||
|
"aborted": aborted,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parsers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _parse_smartctl(output: str) -> dict:
|
||||||
|
health = "UNKNOWN"
|
||||||
|
attributes: dict[int, dict] = {}
|
||||||
|
warnings: list[str] = []
|
||||||
|
failures: list[str] = []
|
||||||
|
|
||||||
|
m = re.search(r"self-assessment test result:\s+(\w+)", output, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
health = m.group(1).upper()
|
||||||
|
|
||||||
|
# Attribute table: ID# NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||||
|
for line in output.splitlines():
|
||||||
|
am = re.match(
|
||||||
|
r"\s*(\d+)\s+(\S+)\s+\S+\s+\d+\s+\d+\s+\d+\s+\S+\s+\S+\s+\S+\s+(\d+)",
|
||||||
|
line,
|
||||||
|
)
|
||||||
|
if not am:
|
||||||
|
continue
|
||||||
|
attr_id = int(am.group(1))
|
||||||
|
attr_name = am.group(2)
|
||||||
|
raw_val = int(am.group(3))
|
||||||
|
attributes[attr_id] = {"name": attr_name, "raw": raw_val}
|
||||||
|
|
||||||
|
if attr_id in SMART_ATTRS:
|
||||||
|
_, is_critical = SMART_ATTRS[attr_id]
|
||||||
|
if raw_val > 0:
|
||||||
|
msg = f"{attr_name} = {raw_val}"
|
||||||
|
if is_critical:
|
||||||
|
failures.append(msg)
|
||||||
|
else:
|
||||||
|
warnings.append(msg)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"health": health,
|
||||||
|
"raw_output": output,
|
||||||
|
"attributes": attributes,
|
||||||
|
"warnings": warnings,
|
||||||
|
"failures": failures,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_smart_progress(output: str) -> dict:
|
||||||
|
state = "unknown"
|
||||||
|
percent_remaining = 0
|
||||||
|
|
||||||
|
lower = output.lower()
|
||||||
|
|
||||||
|
if "self-test routine in progress" in lower or "self-test routine in progress" in output:
|
||||||
|
state = "running"
|
||||||
|
m = re.search(r"(\d+)%\s+of\s+test\s+remaining", output, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
percent_remaining = int(m.group(1))
|
||||||
|
elif "completed without error" in lower:
|
||||||
|
state = "passed"
|
||||||
|
elif (
|
||||||
|
"completed: read failure" in lower
|
||||||
|
or "completed: write failure" in lower
|
||||||
|
or "aborted by host" in lower
|
||||||
|
or ("completed" in lower and "failure" in lower)
|
||||||
|
):
|
||||||
|
state = "failed"
|
||||||
|
elif "in progress" in lower:
|
||||||
|
state = "running"
|
||||||
|
|
||||||
|
return {
|
||||||
|
"state": state,
|
||||||
|
"percent_remaining": percent_remaining,
|
||||||
|
"output": output,
|
||||||
|
}
|
||||||
|
|
@ -2283,3 +2283,125 @@ tr.drawer-row-active {
|
||||||
.drawer-smart-grid { grid-template-columns: 1fr; }
|
.drawer-smart-grid { grid-template-columns: 1fr; }
|
||||||
.drawer-drive-meta { display: none; }
|
.drawer-drive-meta { display: none; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Stage raw log output (SSH mode)
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.stage-log {
|
||||||
|
font-family: "SF Mono", "Consolas", "Monaco", monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: var(--text-muted);
|
||||||
|
background: var(--bg);
|
||||||
|
border-left: 2px solid var(--border);
|
||||||
|
margin: 6px 0 2px 28px;
|
||||||
|
padding: 6px 10px;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-all;
|
||||||
|
max-height: 200px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.stage-log .log-bad-block {
|
||||||
|
color: var(--red);
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.stage-log .log-warn {
|
||||||
|
color: var(--yellow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
SMART attributes table in drawer
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.smart-attrs {
|
||||||
|
margin-top: 12px;
|
||||||
|
border-top: 1px solid var(--border);
|
||||||
|
padding-top: 10px;
|
||||||
|
}
|
||||||
|
.smart-attrs-title {
|
||||||
|
font-size: 11px;
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--text-muted);
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: .05em;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
.smart-attr-row {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
padding: 3px 0;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid color-mix(in srgb, var(--border) 50%, transparent);
|
||||||
|
}
|
||||||
|
.smart-attr-row:last-child { border-bottom: none; }
|
||||||
|
.smart-attr-name { color: var(--text-muted); }
|
||||||
|
.smart-attr-val { font-family: "SF Mono", monospace; font-size: 12px; }
|
||||||
|
.smart-attr-val.attr-ok { color: var(--green); }
|
||||||
|
.smart-attr-val.attr-warn { color: var(--yellow); font-weight: 600; }
|
||||||
|
.smart-attr-val.attr-fail { color: var(--red); font-weight: 600; }
|
||||||
|
.smart-attr-raw-output {
|
||||||
|
font-family: "SF Mono", "Consolas", monospace;
|
||||||
|
font-size: 10.5px;
|
||||||
|
line-height: 1.45;
|
||||||
|
color: var(--text-muted);
|
||||||
|
background: var(--bg);
|
||||||
|
border: 1px solid var(--border);
|
||||||
|
border-radius: 4px;
|
||||||
|
padding: 8px 10px;
|
||||||
|
margin-top: 10px;
|
||||||
|
white-space: pre;
|
||||||
|
overflow: auto;
|
||||||
|
max-height: 240px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Reset button
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.btn-reset {
|
||||||
|
background: transparent;
|
||||||
|
border: 1px solid color-mix(in srgb, var(--text-muted) 40%, transparent);
|
||||||
|
color: var(--text-muted);
|
||||||
|
border-radius: 5px;
|
||||||
|
padding: 3px 8px;
|
||||||
|
font-size: 12px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: border-color .15s, color .15s;
|
||||||
|
}
|
||||||
|
.btn-reset:hover {
|
||||||
|
border-color: var(--yellow);
|
||||||
|
color: var(--yellow);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Parallel burn-in inline warning
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.sf-inline-warn {
|
||||||
|
background: color-mix(in srgb, var(--yellow) 12%, transparent);
|
||||||
|
border: 1px solid color-mix(in srgb, var(--yellow) 40%, transparent);
|
||||||
|
border-radius: 5px;
|
||||||
|
color: var(--yellow);
|
||||||
|
font-size: 12px;
|
||||||
|
padding: 7px 10px;
|
||||||
|
margin: 4px 0 8px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
SSH textarea
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.sf-textarea {
|
||||||
|
resize: vertical;
|
||||||
|
min-height: 90px;
|
||||||
|
font-family: "SF Mono", "Consolas", monospace;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* -----------------------------------------------------------------------
|
||||||
|
Version badge in header
|
||||||
|
----------------------------------------------------------------------- */
|
||||||
|
.header-version {
|
||||||
|
font-size: 11px;
|
||||||
|
color: var(--text-muted);
|
||||||
|
opacity: .6;
|
||||||
|
padding: 0 2px;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -957,8 +957,18 @@
|
||||||
if (s.error_text) {
|
if (s.error_text) {
|
||||||
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
||||||
}
|
}
|
||||||
|
// Raw SSH log output (if available)
|
||||||
|
if (s.log_text) {
|
||||||
|
var logHtml = _esc(s.log_text)
|
||||||
|
.replace(/^(\d+)\s*$/gm, '<span class="log-bad-block">$1 ← BAD BLOCK</span>')
|
||||||
|
.replace(/\[WARNING\][^\n]*/g, '<span class="log-warn">$&</span>');
|
||||||
|
html += '<pre class="stage-log">' + logHtml + '</pre>';
|
||||||
|
}
|
||||||
|
// Bad block count badge
|
||||||
|
if (s.bad_blocks && s.bad_blocks > 0) {
|
||||||
|
html += '<div class="stage-error-line">' + s.bad_blocks + ' bad block(s) found</div>';
|
||||||
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
html += '<div class="drawer-empty">No stage data yet.</div>';
|
html += '<div class="drawer-empty">No stage data yet.</div>';
|
||||||
}
|
}
|
||||||
|
|
@ -973,6 +983,10 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Monitored SMART attributes for inline colouring
|
||||||
|
var _SMART_CRITICAL = {5: true, 197: true, 198: true};
|
||||||
|
var _SMART_WARN = {10: true, 188: true, 199: true};
|
||||||
|
|
||||||
function _drawerRenderSmart(smart) {
|
function _drawerRenderSmart(smart) {
|
||||||
var panel = document.getElementById('drawer-panel-smart');
|
var panel = document.getElementById('drawer-panel-smart');
|
||||||
if (!panel) return;
|
if (!panel) return;
|
||||||
|
|
@ -994,10 +1008,41 @@
|
||||||
if (t.started_at) html += '<div class="smart-detail">Started: ' + _drawerFmtDt(t.started_at) + '</div>';
|
if (t.started_at) html += '<div class="smart-detail">Started: ' + _drawerFmtDt(t.started_at) + '</div>';
|
||||||
if (t.finished_at) html += '<div class="smart-detail">Finished: ' + _drawerFmtDt(t.finished_at) + '</div>';
|
if (t.finished_at) html += '<div class="smart-detail">Finished: ' + _drawerFmtDt(t.finished_at) + '</div>';
|
||||||
if (t.error_text) html += '<div class="stage-error-line">' + _esc(t.error_text) + '</div>';
|
if (t.error_text) html += '<div class="stage-error-line">' + _esc(t.error_text) + '</div>';
|
||||||
|
// Raw smartctl output (SSH mode)
|
||||||
|
if (t.raw_output) {
|
||||||
|
html += '<pre class="smart-attr-raw-output">' + _esc(t.raw_output) + '</pre>';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
});
|
});
|
||||||
html += '</div>';
|
html += '</div>';
|
||||||
|
|
||||||
|
// SMART attribute table (from SSH attribute parse)
|
||||||
|
var attrs = smart && smart.attrs;
|
||||||
|
if (attrs) {
|
||||||
|
html += '<div class="smart-attrs">';
|
||||||
|
html += '<div class="smart-attrs-title">SMART Attributes</div>';
|
||||||
|
if (attrs.failures && attrs.failures.length) {
|
||||||
|
html += '<div class="stage-error-line" style="margin-bottom:6px">✕ Failures: ' + _esc(attrs.failures.join('; ')) + '</div>';
|
||||||
|
}
|
||||||
|
if (attrs.warnings && attrs.warnings.length) {
|
||||||
|
html += '<div class="stage-error-line" style="color:var(--yellow);margin-bottom:6px">⚠ Warnings: ' + _esc(attrs.warnings.join('; ')) + '</div>';
|
||||||
|
}
|
||||||
|
var attrMap = attrs.attrs || {};
|
||||||
|
var monitoredIds = [5, 10, 188, 197, 198, 199];
|
||||||
|
monitoredIds.forEach(function (id) {
|
||||||
|
var entry = attrMap[String(id)];
|
||||||
|
if (!entry) return;
|
||||||
|
var raw = entry.raw;
|
||||||
|
var cls = raw > 0 ? (_SMART_CRITICAL[id] ? 'attr-fail' : 'attr-warn') : 'attr-ok';
|
||||||
|
html += '<div class="smart-attr-row">';
|
||||||
|
html += '<span class="smart-attr-name">' + id + ' ' + _esc(entry.name) + '</span>';
|
||||||
|
html += '<span class="smart-attr-val ' + cls + '">' + raw + '</span>';
|
||||||
|
html += '</div>';
|
||||||
|
});
|
||||||
|
html += '</div>';
|
||||||
|
}
|
||||||
|
|
||||||
panel.innerHTML = html;
|
panel.innerHTML = html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1078,4 +1123,21 @@
|
||||||
if (e.target.closest('#drawer-close-btn')) closeDrawer();
|
if (e.target.closest('#drawer-close-btn')) closeDrawer();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Reset button — clears SMART state for a drive
|
||||||
|
document.addEventListener('click', function (e) {
|
||||||
|
var btn = e.target.closest('.btn-reset');
|
||||||
|
if (!btn) return;
|
||||||
|
var driveId = btn.dataset.driveId;
|
||||||
|
if (!driveId) return;
|
||||||
|
var operator = (window._operator || 'operator');
|
||||||
|
fetch('/api/v1/drives/' + driveId + '/reset', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify({ operator: operator }),
|
||||||
|
}).then(function (r) {
|
||||||
|
if (!r.ok) return r.json().then(function (d) { showToast(d.detail || 'Reset failed', 'error'); });
|
||||||
|
showToast('Drive reset — state cleared', 'success');
|
||||||
|
}).catch(function () { showToast('Network error', 'error'); });
|
||||||
|
});
|
||||||
|
|
||||||
}());
|
}());
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,10 @@
|
||||||
{%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %}
|
{%- set short_busy = drive.smart_short and drive.smart_short.state == 'running' %}
|
||||||
{%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %}
|
{%- set long_busy = drive.smart_long and drive.smart_long.state == 'running' %}
|
||||||
{%- set selectable = not bi_active and not short_busy and not long_busy %}
|
{%- set selectable = not bi_active and not short_busy and not long_busy %}
|
||||||
|
{%- set bi_done = drive.burnin and drive.burnin.state in ('passed', 'failed', 'cancelled', 'unknown') %}
|
||||||
|
{%- set smart_done = (drive.smart_short and drive.smart_short.state in ('passed','failed','aborted'))
|
||||||
|
or (drive.smart_long and drive.smart_long.state in ('passed','failed','aborted')) %}
|
||||||
|
{%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy %}
|
||||||
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
|
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
|
||||||
<td class="col-check">
|
<td class="col-check">
|
||||||
{%- if selectable %}
|
{%- if selectable %}
|
||||||
|
|
@ -160,6 +164,12 @@
|
||||||
data-health="{{ drive.smart_health }}"
|
data-health="{{ drive.smart_health }}"
|
||||||
{% if short_busy or long_busy %}disabled{% endif %}
|
{% if short_busy or long_busy %}disabled{% endif %}
|
||||||
title="Start Burn-In">Burn-In</button>
|
title="Start Burn-In">Burn-In</button>
|
||||||
|
<!-- Reset — clears SMART state so drive can be re-tested from scratch -->
|
||||||
|
{%- if can_reset %}
|
||||||
|
<button class="btn-action btn-reset"
|
||||||
|
data-drive-id="{{ drive.id }}"
|
||||||
|
title="Reset SMART state — clears test results so drive shows as fresh">Reset</button>
|
||||||
|
{%- endif %}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,7 @@
|
||||||
<a class="header-link" href="/audit">Audit</a>
|
<a class="header-link" href="/audit">Audit</a>
|
||||||
<a class="header-link" href="/settings">Settings</a>
|
<a class="header-link" href="/settings">Settings</a>
|
||||||
<a class="header-link" href="/docs" target="_blank" rel="noopener">API</a>
|
<a class="header-link" href="/docs" target="_blank" rel="noopener">API</a>
|
||||||
|
<span class="header-version">v{{ app_version if app_version is defined else '—' }}</span>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,57 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- SSH -->
|
||||||
|
<div class="settings-card">
|
||||||
|
<div class="settings-card-header">
|
||||||
|
<span class="settings-card-title">SSH (TrueNAS Direct)</span>
|
||||||
|
{% if ssh_configured %}
|
||||||
|
<span class="chip chip-passed" style="font-size:10px">Configured</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="chip chip-unknown" style="font-size:10px">Not configured — using REST API / mock</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
<p class="sf-hint" style="margin-bottom:8px">
|
||||||
|
When configured, burn-in stages run smartctl and badblocks directly on TrueNAS over SSH,
|
||||||
|
enabling SMART attribute monitoring and real bad-block detection. Leave Host empty to use
|
||||||
|
the TrueNAS REST API (mock / dev mode).
|
||||||
|
</p>
|
||||||
|
<div class="sf-fields">
|
||||||
|
|
||||||
|
<div class="sf-full sf-row-test" style="margin-bottom:4px">
|
||||||
|
<button type="button" id="test-ssh-btn" class="btn-secondary">Test SSH Connection</button>
|
||||||
|
<span id="ssh-test-result" class="settings-test-result" style="display:none"></span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<label for="ssh_host">Host / IP</label>
|
||||||
|
<input class="sf-input" id="ssh_host" name="ssh_host" type="text"
|
||||||
|
value="{{ editable.ssh_host }}" placeholder="10.0.0.x (same as TrueNAS IP)">
|
||||||
|
|
||||||
|
<label for="ssh_port">Port</label>
|
||||||
|
<input class="sf-input sf-input-xs" id="ssh_port" name="ssh_port"
|
||||||
|
type="number" min="1" max="65535" value="{{ editable.ssh_port }}" style="width:70px">
|
||||||
|
|
||||||
|
<label for="ssh_user">Username</label>
|
||||||
|
<input class="sf-input" id="ssh_user" name="ssh_user" type="text"
|
||||||
|
value="{{ editable.ssh_user }}" placeholder="root">
|
||||||
|
|
||||||
|
<label for="ssh_password">Password</label>
|
||||||
|
<input class="sf-input" id="ssh_password" name="ssh_password" type="password"
|
||||||
|
placeholder="leave blank to keep existing" autocomplete="new-password">
|
||||||
|
|
||||||
|
<label for="ssh_key">Private Key</label>
|
||||||
|
<div>
|
||||||
|
<textarea class="sf-input sf-textarea" id="ssh_key" name="ssh_key"
|
||||||
|
rows="6" placeholder="Paste PEM private key here (-----BEGIN ... KEY-----). Leave blank to keep existing." autocomplete="off"></textarea>
|
||||||
|
<span class="sf-hint" style="margin-top:3px">
|
||||||
|
Either password or key auth. Key takes precedence if both are set.
|
||||||
|
Key is stored in <code>/data/settings_overrides.json</code> — restrict filesystem access to that file.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
</div><!-- /left col -->
|
</div><!-- /left col -->
|
||||||
|
|
||||||
<!-- RIGHT column: Notifications + Behavior -->
|
<!-- RIGHT column: Notifications + Behavior -->
|
||||||
|
|
@ -159,9 +210,14 @@
|
||||||
<div class="sf-row">
|
<div class="sf-row">
|
||||||
<label class="sf-label" for="max_parallel_burnins">Max Parallel Burn-Ins</label>
|
<label class="sf-label" for="max_parallel_burnins">Max Parallel Burn-Ins</label>
|
||||||
<input class="sf-input sf-input-xs" id="max_parallel_burnins" name="max_parallel_burnins"
|
<input class="sf-input sf-input-xs" id="max_parallel_burnins" name="max_parallel_burnins"
|
||||||
type="number" min="1" max="16" value="{{ editable.max_parallel_burnins }}">
|
type="number" min="1" max="60" value="{{ editable.max_parallel_burnins }}">
|
||||||
<span class="sf-hint">How many jobs can run at the same time</span>
|
<span class="sf-hint">How many jobs can run at the same time</span>
|
||||||
</div>
|
</div>
|
||||||
|
<div id="parallel-warn" class="sf-inline-warn"
|
||||||
|
{% if editable.max_parallel_burnins <= 8 %}style="display:none"{% endif %}>
|
||||||
|
⚠ Running many simultaneous surface scans may saturate your storage controller
|
||||||
|
and produce unreliable results. Recommended: 2–4.
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="sf-row">
|
<div class="sf-row">
|
||||||
<label class="sf-label" for="stuck_job_hours">Stuck Job Threshold (hours)</label>
|
<label class="sf-label" for="stuck_job_hours">Stuck Job Threshold (hours)</label>
|
||||||
|
|
@ -348,6 +404,36 @@
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Parallel burn-in warning — reveal the inline notice whenever the
// configured value exceeds the recommended ceiling of 8 jobs.
var parallelField = document.getElementById('max_parallel_burnins');
var parallelNotice = document.getElementById('parallel-warn');
if (parallelField && parallelNotice) {
  parallelField.addEventListener('input', function () {
    var jobs = parseInt(parallelField.value, 10);
    // NaN (empty field) compares false against 8, so the warning stays hidden.
    parallelNotice.style.display = jobs > 8 ? '' : 'none';
  });
}
|
||||||
|
|
||||||
|
// Test SSH — POSTs to the test endpoint and shows the outcome inline.
var sshBtn = document.getElementById('test-ssh-btn');
var sshResult = document.getElementById('ssh-test-result');
if (sshBtn) {
  sshBtn.addEventListener('click', async function () {
    sshBtn.disabled = true;
    sshBtn.textContent = 'Testing…';
    sshResult.style.display = 'none';
    try {
      var resp = await fetch('/api/v1/settings/test-ssh', { method: 'POST' });
      // Parse the body defensively: a proxy or framework error page may not
      // be JSON, and resp.json() rejecting would otherwise fall into the
      // catch below and mislabel a server-side failure as a network error.
      var data = {};
      try {
        data = await resp.json();
      } catch (parseErr) {
        // Non-JSON body — fall through with an empty payload.
      }
      showResult(sshResult, resp.ok, resp.ok ? 'Connection OK' : (data.detail || 'Failed'));
    } catch (e) {
      showResult(sshResult, false, 'Network error');
    } finally {
      sshBtn.disabled = false;
      sshBtn.textContent = 'Test SSH Connection';
    }
  });
}
|
||||||
|
|
||||||
// Check for Updates
|
// Check for Updates
|
||||||
var updBtn = document.getElementById('check-updates-btn');
|
var updBtn = document.getElementById('check-updates-btn');
|
||||||
var updResult = document.getElementById('update-result');
|
var updResult = document.getElementById('update-result');
|
||||||
|
|
|
||||||
|
|
@ -119,5 +119,65 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="stats-grid" style="margin-top:24px">
|
||||||
|
|
||||||
|
<!-- Average duration by drive size -->
|
||||||
|
<div class="stats-section">
|
||||||
|
<h2 class="section-title">Avg. Test Duration by Drive Size</h2>
|
||||||
|
{% if by_size %}
|
||||||
|
<div class="table-wrap" style="max-height:none">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Size</th>
|
||||||
|
<th style="text-align:right">Jobs</th>
|
||||||
|
<th style="text-align:right">Avg Duration</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for s in by_size %}
|
||||||
|
<tr>
|
||||||
|
<td style="font-weight:500;color:var(--text-strong)">{{ s.size_tb }} TB</td>
|
||||||
|
<td class="mono text-muted" style="text-align:right">{{ s.total }}</td>
|
||||||
|
<td class="mono" style="text-align:right;color:var(--text-strong)">{{ s.avg_hours }}h</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No completed jobs yet.</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Failure breakdown by stage -->
|
||||||
|
<div class="stats-section">
|
||||||
|
<h2 class="section-title">Failures by Stage</h2>
|
||||||
|
{% if by_failure_stage %}
|
||||||
|
<div class="table-wrap" style="max-height:none">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Stage</th>
|
||||||
|
<th style="text-align:right">Count</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for f in by_failure_stage %}
|
||||||
|
<tr>
|
||||||
|
<td style="font-weight:500;color:var(--red)">{{ f.failed_stage | replace('_',' ') | title }}</td>
|
||||||
|
<td class="mono" style="text-align:right;color:var(--red)">{{ f.count }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="empty-state" style="border:1px solid var(--border);border-radius:8px;padding:32px">No failures recorded.</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,4 @@ httpx
|
||||||
pydantic-settings
|
pydantic-settings
|
||||||
jinja2
|
jinja2
|
||||||
sse-starlette
|
sse-starlette
|
||||||
|
asyncssh
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue