nas-burnin/app/mailer.py
Brandon Walter ec636f8f3a
Some checks failed
Security scan / pip-audit (push) Has been cancelled
Security scan / bandit (push) Has been cancelled
Security scan / gitleaks (push) Has been cancelled
Security scan / mypy (push) Has been cancelled
fix: PRAGMA busy_timeout on every SQLite connection (1.0.0-60)
Jobs 60-63 ran healthy for 16h then all 4 died simultaneously with
'database is locked'. The burnin drain used _db() which set
busy_timeout=10000, but:

1. 10s was sometimes too short under heavy contention (4 burn-in
   drains writing every 5s + poller every 12s + retention scan +
   auth + lifespan = many concurrent writers).
2. OTHER aiosqlite.connect() sites (poller, retention, auth, mailer,
   routes/__init__'s SSE, burnin/__init__.py's various helpers,
   database.get_db) didn't set busy_timeout at all. Without it,
   SQLite raises 'database is locked' INSTANTLY on any contention,
   which forced concurrency back onto the drain's connection.

Fix:
- _db() busy_timeout 10000 → 60000 (60s; aggressive but right for
  this workload — brief contention spikes are normal and waiting
  beats failing).
- PRAGMA busy_timeout=60000 added on every aiosqlite.connect() site
  next to the existing PRAGMA calls. Applied via a small Python
  pass that preserves the original variable name (db / _tdb / src
  / dst etc.) and indentation.

Same restart sequence applied: rebuild container, reset 4 drives,
relaunch via loopback bypass. Jobs 64-67 are now running.

This is auto-restart #2 in 24h. Safety brake at 3.
2026-05-14 06:39:33 -04:00

544 lines
22 KiB
Python

"""
Daily status email — sent at smtp_report_hour (local time) every day.
Disabled when SMTP_HOST is not set.
"""
import asyncio
import html
import logging
import smtplib
import ssl
from datetime import datetime, timedelta, timezone
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import aiosqlite
from app.config import settings
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# HTML email template
# ---------------------------------------------------------------------------
def _chip(state: str) -> str:
colours = {
"PASSED": ("#1a4731", "#3fb950", "#3fb950"),
"passed": ("#1a4731", "#3fb950", "#3fb950"),
"FAILED": ("#4b1113", "#f85149", "#f85149"),
"failed": ("#4b1113", "#f85149", "#f85149"),
"running": ("#0d2d6b", "#58a6ff", "#58a6ff"),
"queued": ("#4b3800", "#d29922", "#d29922"),
"cancelled": ("#222", "#8b949e", "#8b949e"),
"unknown": ("#222", "#8b949e", "#8b949e"),
"idle": ("#222", "#8b949e", "#8b949e"),
"UNKNOWN": ("#222", "#8b949e", "#8b949e"),
}
bg, fg, bd = colours.get(state, ("#222", "#8b949e", "#8b949e"))
label = state.upper()
return (
f'<span style="background:{bg};color:{fg};border:1px solid {bd};'
f'border-radius:4px;padding:2px 8px;font-size:11px;font-weight:600;'
f'letter-spacing:.04em;white-space:nowrap">{label}</span>'
)
def _temp_colour(c) -> str:
if c is None:
return "#8b949e"
if c < 40:
return "#3fb950"
if c < 50:
return "#d29922"
return "#f85149"
def _fmt_bytes(b) -> str:
if b is None:
return ""
tb = b / 1_000_000_000_000
if tb >= 1:
return f"{tb:.0f} TB"
return f"{b / 1_000_000_000:.0f} GB"
def _fmt_dt(iso: str | None) -> str:
if not iso:
return ""
try:
dt = datetime.fromisoformat(iso)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.astimezone().strftime("%Y-%m-%d %H:%M")
except Exception:
return iso or ""
def _drive_rows_html(drives: list[dict]) -> str:
if not drives:
return '<tr><td colspan="8" style="text-align:center;color:#8b949e;padding:24px">No drives found</td></tr>'
rows = []
for d in drives:
health = d.get("smart_health") or "UNKNOWN"
temp = d.get("temperature_c")
bi = d.get("burnin") or {}
bi_state = bi.get("state", "") if bi else ""
short = d.get("smart_short") or {}
long_ = d.get("smart_long") or {}
short_state = short.get("state", "idle")
long_state = long_.get("state", "idle")
row_bg = "#1c0a0a" if health == "FAILED" else "#0d1117"
rows.append(f"""
<tr style="background:{row_bg};border-bottom:1px solid #30363d">
<td style="padding:9px 12px;font-weight:600;color:#c9d1d9">{d.get('devname','')}</td>
<td style="padding:9px 12px;color:#8b949e;font-size:12px">{d.get('model','')}</td>
<td style="padding:9px 12px;font-family:monospace;font-size:12px;color:#8b949e">{d.get('serial','')}</td>
<td style="padding:9px 12px;text-align:right;color:#8b949e">{_fmt_bytes(d.get('size_bytes'))}</td>
<td style="padding:9px 12px;text-align:right;color:{_temp_colour(temp)};font-weight:500">{f'{temp}°C' if temp is not None else ''}</td>
<td style="padding:9px 12px">{_chip(health)}</td>
<td style="padding:9px 12px">{_chip(short_state)}</td>
<td style="padding:9px 12px">{_chip(long_state)}</td>
<td style="padding:9px 12px">{_chip(bi_state) if bi else ''}</td>
</tr>""")
return "\n".join(rows)
def _build_unlock_banner_html(events: list[dict]) -> str:
"""Banner listing every pool-drive unlock granted in the last 24h.
Every interpolated DB field is run through html.escape — operator and
reason are free-text from the unlock modal and otherwise inject into
the email body verbatim.
"""
if not events:
return ""
rows = []
for e in events:
evt = e.get("event_type") or ""
is_boot = evt == "boot_pool_drive_unlocked"
is_exported = evt == "exported_pool_drive_unlocked"
is_mounted = evt == "mounted_drive_unlocked"
kind = (
"BOOT POOL" if is_boot
else "EXPORTED ZFS" if is_exported
else "MOUNTED FILESYSTEM" if is_mounted
else "pool"
)
when = html.escape((e.get("created_at") or "")[:19])
operator = html.escape(e.get("operator") or "?")
devname = html.escape(e.get("devname") or "?")
# `message` already includes pool name, devname, and the operator's
# reason — surface it verbatim so the audit trail is faithful.
message = html.escape(e.get("message") or "")
rows.append(
f"<li style='margin:4px 0'><strong>{when}</strong> &middot; "
f"<strong>{operator}</strong> unlocked a {kind} drive "
f"({devname}): "
f"<span style='color:#c9d1d9'>{message}</span></li>"
)
return f"""
<div style="background:#4b1113;border:1px solid #f85149;border-radius:6px;
padding:14px 18px;margin-bottom:20px;color:#f85149">
<div style="font-weight:600;font-size:14px;margin-bottom:6px">
&#x26A0; {len(events)} pool-drive unlock(s) in the last 24h
</div>
<ul style="margin:0;padding-left:18px;font-size:12.5px;color:#f0a0a0">
{''.join(rows)}
</ul>
</div>"""
def _build_html(drives: list[dict], generated_at: str,
                unlock_events: list[dict] | None = None) -> str:
    """Assemble the complete HTML document for the daily status report.

    Args:
        drives: Drive dicts; ``smart_health``, ``devname`` and the ``state``
            of the nested ``burnin`` dict are read here, and the full list is
            handed to ``_drive_rows_html`` for the table body.
        generated_at: Text shown verbatim in the header and the footer.
        unlock_events: Unlock audit events for the banner; ``None`` is
            treated as an empty list.

    Returns:
        The HTML document as a string.
    """
    total = len(drives)
    failed_drives = [d for d in drives if d.get("smart_health") == "FAILED"]
    running_burnin = [d for d in drives if (d.get("burnin") or {}).get("state") == "running"]
    passed_burnin = [d for d in drives if (d.get("burnin") or {}).get("state") == "passed"]
    # Alert banners (unlock events first — the audit-grade signal)
    alert_html = _build_unlock_banner_html(unlock_events or [])
    if failed_drives:
        # Escape device names and tolerate a missing/None devname so that one
        # odd record cannot break the markup or abort the whole report.
        names = ", ".join(
            html.escape(str(d.get("devname") or "?")) for d in failed_drives
        )
        alert_html += f"""
<div style="background:#4b1113;border:1px solid #f85149;border-radius:6px;padding:14px 18px;margin-bottom:20px;color:#f85149;font-weight:500">
⚠ SMART health FAILED on {len(failed_drives)} drive(s): {names}
</div>"""
    drive_rows = _drive_rows_html(drives)
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>NAS Burn-In — Daily Report</title>
</head>
<body style="margin:0;padding:0;background:#0d1117;font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',system-ui,sans-serif;font-size:14px;color:#c9d1d9">
<table width="100%" cellpadding="0" cellspacing="0" style="background:#0d1117;min-height:100vh">
<tr><td align="center" style="padding:32px 16px">
<table width="700" cellpadding="0" cellspacing="0" style="max-width:700px;width:100%">
<!-- Header -->
<tr>
<td style="background:#161b22;border:1px solid #30363d;border-radius:10px 10px 0 0;padding:20px 24px;border-bottom:none">
<table width="100%" cellpadding="0" cellspacing="0">
<tr>
<td><span style="font-size:18px;font-weight:700;color:#f0f6fc">NAS Burn-In</span>
<span style="color:#8b949e;font-size:13px;margin-left:10px">Daily Status Report</span></td>
<td align="right" style="color:#8b949e;font-size:12px">{generated_at}</td>
</tr>
</table>
</td>
</tr>
<!-- Body -->
<tr>
<td style="background:#0d1117;border:1px solid #30363d;border-top:none;border-bottom:none;padding:24px">
{alert_html}
<!-- Summary chips -->
<table cellpadding="0" cellspacing="0" style="margin-bottom:24px">
<tr>
<td style="padding-right:10px">
<div style="background:#161b22;border:1px solid #30363d;border-radius:8px;padding:12px 18px;text-align:center;min-width:80px">
<div style="font-size:24px;font-weight:700;color:#f0f6fc">{total}</div>
<div style="font-size:11px;color:#8b949e;text-transform:uppercase;letter-spacing:.06em;margin-top:2px">Drives</div>
</div>
</td>
<td style="padding-right:10px">
<div style="background:#161b22;border:1px solid #30363d;border-radius:8px;padding:12px 18px;text-align:center;min-width:80px">
<div style="font-size:24px;font-weight:700;color:#f85149">{len(failed_drives)}</div>
<div style="font-size:11px;color:#8b949e;text-transform:uppercase;letter-spacing:.06em;margin-top:2px">Failed</div>
</div>
</td>
<td style="padding-right:10px">
<div style="background:#161b22;border:1px solid #30363d;border-radius:8px;padding:12px 18px;text-align:center;min-width:80px">
<div style="font-size:24px;font-weight:700;color:#58a6ff">{len(running_burnin)}</div>
<div style="font-size:11px;color:#8b949e;text-transform:uppercase;letter-spacing:.06em;margin-top:2px">Running</div>
</div>
</td>
<td>
<div style="background:#161b22;border:1px solid #30363d;border-radius:8px;padding:12px 18px;text-align:center;min-width:80px">
<div style="font-size:24px;font-weight:700;color:#3fb950">{len(passed_burnin)}</div>
<div style="font-size:11px;color:#8b949e;text-transform:uppercase;letter-spacing:.06em;margin-top:2px">Passed</div>
</div>
</td>
</tr>
</table>
<!-- Drive table -->
<table width="100%" cellpadding="0" cellspacing="0" style="border:1px solid #30363d;border-radius:8px;overflow:hidden">
<thead>
<tr style="background:#161b22">
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Drive</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Model</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Serial</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:right;border-bottom:1px solid #30363d">Size</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:right;border-bottom:1px solid #30363d">Temp</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Health</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Short</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Long</th>
<th style="padding:9px 12px;font-size:11px;font-weight:600;text-transform:uppercase;letter-spacing:.06em;color:#8b949e;text-align:left;border-bottom:1px solid #30363d">Burn-In</th>
</tr>
</thead>
<tbody>
{drive_rows}
</tbody>
</table>
</td>
</tr>
<!-- Footer -->
<tr>
<td style="background:#161b22;border:1px solid #30363d;border-top:none;border-radius:0 0 10px 10px;padding:14px 24px;text-align:center">
<span style="font-size:12px;color:#8b949e">Generated by NAS Burn-In Dashboard · {generated_at}</span>
</td>
</tr>
</table>
</td></tr>
</table>
</body>
</html>"""
# ---------------------------------------------------------------------------
# Send
# ---------------------------------------------------------------------------
# Conventional port for each supported SMTP security mode.
_MODE_PORTS: dict[str, int] = {"starttls": 587, "ssl": 465, "plain": 25}


def _smtp_port() -> int:
    """Return the conventional port for the configured ``smtp_ssl_mode``.

    An unset mode is treated as ``starttls``; a mode that is not a key of
    ``_MODE_PORTS`` yields 587.

    NOTE(review): ``settings.smtp_port`` is never read here, although earlier
    documentation said an explicitly configured port would be honoured —
    confirm which behaviour is intended.
    """
    configured_mode = settings.smtp_ssl_mode or "starttls"
    try:
        return _MODE_PORTS[configured_mode.lower()]
    except KeyError:
        return 587
def _send_email(subject: str, html: str) -> None:
    """Send one HTML email to every address listed in ``settings.smtp_to``.

    Blocking; async callers run it through ``asyncio.to_thread``.

    Args:
        subject: Subject header.
        html: HTML body. (The parameter name shadows the ``html`` module
            inside this function only.)

    The connection is always closed, including when login or sending fails.
    Authentication is attempted only when ``settings.smtp_user`` is set, the
    same rule ``test_smtp_connection`` applies. A missing or blank recipient
    list logs a warning and returns without connecting.

    Raises:
        smtplib.SMTPException, OSError: propagated from the SMTP session.
    """
    recipients = [r.strip() for r in (settings.smtp_to or "").split(",") if r.strip()]
    if not recipients:
        log.warning("SMTP_TO is empty — skipping send")
        return
    sender = settings.smtp_from or settings.smtp_user
    msg = MIMEMultipart("alternative")
    msg["Subject"] = subject
    msg["From"] = sender
    msg["To"] = ", ".join(recipients)
    msg.attach(MIMEText(html, "html", "utf-8"))
    ctx = ssl.create_default_context()
    mode = (settings.smtp_ssl_mode or "starttls").lower()
    timeout = int(settings.smtp_timeout or 60)
    port = _smtp_port()
    # SMTP / SMTP_SSL share a parent class but mypy can't unify them
    # without an explicit annotation on the binding.
    server: smtplib.SMTP
    if mode == "ssl":
        server = smtplib.SMTP_SSL(settings.smtp_host, port, context=ctx, timeout=timeout)
    else:
        server = smtplib.SMTP(settings.smtp_host, port, timeout=timeout)
    # The context manager issues QUIT and closes the socket on every exit
    # path, so a failed login/send no longer leaks the connection.
    with server:
        server.ehlo()
        if mode == "starttls":
            server.starttls(context=ctx)
            server.ehlo()
        if settings.smtp_user:
            server.login(settings.smtp_user, settings.smtp_password)
        server.sendmail(sender, recipients, msg.as_string())
    log.info("Email sent to %s", recipients)
# ---------------------------------------------------------------------------
# Data fetch
# ---------------------------------------------------------------------------
async def _fetch_report_data() -> list[dict]:
    """Open the database and return what ``_fetch_drives_for_template`` yields.

    A fresh connection to ``settings.db_path`` is opened with
    ``aiosqlite.Row`` as row factory and closed again before returning.
    """
    from app.routes import _fetch_drives_for_template  # local import avoids circular

    async with aiosqlite.connect(settings.db_path) as db:
        db.row_factory = aiosqlite.Row
        # Install the busy timeout first so the journal-mode statement is
        # also covered by the 60 s wait rather than the connection default.
        await db.execute("PRAGMA busy_timeout=60000")
        await db.execute("PRAGMA journal_mode=WAL")
        return await _fetch_drives_for_template(db)
async def _fetch_unlock_events_24h() -> list[dict]:
    """Return pool-drive unlock audit events from the last 24 hours.

    These are operator overrides of the pool-membership lock — every entry
    represents a deliberate decision to risk a pool, so the daily report
    surfaces them as an audit-grade banner.

    Each returned dict has the keys ``event_type``, ``operator``,
    ``message``, ``created_at``, ``devname``, ``pool_name`` and
    ``pool_role``; rows are ordered newest first.
    """
    async with aiosqlite.connect(settings.db_path) as db:
        db.row_factory = aiosqlite.Row
        # Install the busy timeout first so the journal-mode statement is
        # also covered by the 60 s wait rather than the connection default.
        await db.execute("PRAGMA busy_timeout=60000")
        await db.execute("PRAGMA journal_mode=WAL")
        # julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format
        # we write from Python; comparing the raw string against
        # datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS')
        # produces subtle off-by-up-to-a-day errors because of the
        # 'T' vs ' ' separator and the '+00:00' suffix.
        cur = await db.execute("""
SELECT ae.event_type, ae.operator, ae.message, ae.created_at,
d.devname, d.pool_name, d.pool_role
FROM audit_events ae
LEFT JOIN drives d ON d.id = ae.drive_id
WHERE ae.event_type IN (
'pool_drive_unlocked',
'boot_pool_drive_unlocked',
'exported_pool_drive_unlocked',
'mounted_drive_unlocked')
AND julianday(ae.created_at) >= julianday('now', '-1 day')
ORDER BY ae.created_at DESC
""")
        return [dict(r) for r in await cur.fetchall()]
# ---------------------------------------------------------------------------
# Per-job alerts, SMTP test, on-demand send, and scheduler
# ---------------------------------------------------------------------------
def _build_alert_html(
job_id: int,
devname: str,
serial: str | None,
model: str | None,
state: str,
error_text: str | None,
generated_at: str,
) -> str:
is_fail = state == "failed"
color = "#f85149" if is_fail else "#3fb950"
bg = "#4b1113" if is_fail else "#1a4731"
icon = "&#x2715;" if is_fail else "&#x2713;"
error_section = ""
if error_text:
error_section = f"""
<div style="background:#4b1113;border:1px solid #f85149;border-radius:6px;
padding:12px 16px;margin-top:16px;color:#f85149;font-size:13px">
<strong>Error:</strong> {error_text}
</div>"""
return f"""<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Burn-In {state.title()} Alert</title></head>
<body style="margin:0;padding:0;background:#0d1117;font-family:-apple-system,sans-serif;
font-size:14px;color:#c9d1d9">
<table width="100%" cellpadding="0" cellspacing="0">
<tr><td align="center" style="padding:32px 16px">
<table width="480" cellpadding="0" cellspacing="0" style="max-width:480px;width:100%">
<tr>
<td style="background:{bg};border:2px solid {color};border-radius:10px;padding:24px">
<div style="font-size:26px;font-weight:700;color:{color};margin-bottom:16px">
{icon} Burn-In {state.upper()}
</div>
<table cellpadding="0" cellspacing="0" style="width:100%">
<tr>
<td style="color:#8b949e;font-size:12px;padding:5px 0">Device</td>
<td style="font-weight:600;text-align:right;font-size:15px">{devname}</td>
</tr>
<tr>
<td style="color:#8b949e;font-size:12px;padding:5px 0">Model</td>
<td style="text-align:right">{model or ''}</td>
</tr>
<tr>
<td style="color:#8b949e;font-size:12px;padding:5px 0">Serial</td>
<td style="font-family:monospace;text-align:right">{serial or ''}</td>
</tr>
<tr>
<td style="color:#8b949e;font-size:12px;padding:5px 0">Job #</td>
<td style="font-family:monospace;text-align:right">{job_id}</td>
</tr>
</table>
{error_section}
<div style="margin-top:16px;font-size:11px;color:#8b949e">{generated_at}</div>
</td>
</tr>
</table>
</td></tr>
</table>
</body>
</html>"""
async def send_job_alert(
    job_id: int,
    devname: str,
    serial: str | None,
    model: str | None,
    state: str,
    error_text: str | None,
) -> None:
    """Send an immediate per-job alert email (pass or fail).

    The subject starts with a cross for ``state == "failed"`` and a tick for
    any other state — the same two symbols the alert body uses — followed by
    the upper-cased state, the device name and the serial (or ``no serial``).
    The blocking SMTP send runs in a worker thread.
    """
    # Previously both arms were the empty string, leaving a stray leading
    # space in the subject and no visual pass/fail marker.
    icon = "\u2715" if state == "failed" else "\u2713"
    subject = f"{icon} Burn-In {state.upper()}: {devname} ({serial or 'no serial'})"
    now_str = datetime.now().strftime("%Y-%m-%d %H:%M")
    body = _build_alert_html(job_id, devname, serial, model, state, error_text, now_str)
    await asyncio.to_thread(_send_email, subject, body)
async def test_smtp_connection() -> dict:
    """Try to establish an SMTP session using the current settings.

    Connects, negotiates STARTTLS when the mode asks for it, and logs in when
    ``settings.smtp_user`` is set. Does NOT send any email. The blocking
    probe runs in a worker thread.

    Returns:
        ``{"ok": True, "error": None}`` on success, otherwise
        ``{"ok": False, "error": <message>}``. Failures are reported through
        the return value rather than raised.
    """
    if not settings.smtp_host:
        return {"ok": False, "error": "SMTP_HOST is not configured"}

    def _test() -> dict:
        try:
            ctx = ssl.create_default_context()
            mode = (settings.smtp_ssl_mode or "starttls").lower()
            timeout = int(settings.smtp_timeout or 60)
            port = _smtp_port()
            server: smtplib.SMTP
            if mode == "ssl":
                server = smtplib.SMTP_SSL(settings.smtp_host, port,
                                          context=ctx, timeout=timeout)
            else:
                server = smtplib.SMTP(settings.smtp_host, port, timeout=timeout)
            # The context manager sends QUIT and closes the socket even when
            # STARTTLS or login fails, so a failed probe leaks nothing.
            with server:
                server.ehlo()
                if mode == "starttls":
                    server.starttls(context=ctx)
                    server.ehlo()
                if settings.smtp_user:
                    server.login(settings.smtp_user, settings.smtp_password)
            return {"ok": True, "error": None}
        except Exception as exc:
            # Boundary for a diagnostics endpoint: report, don't raise.
            return {"ok": False, "error": str(exc)}

    return await asyncio.to_thread(_test)
async def send_report_now() -> None:
    """Build the daily report from current data and send it immediately.

    The subject carries today's date and the drive count; when unlock events
    were recorded in the last 24 hours their count is appended after a dash.
    The blocking SMTP send runs in a worker thread.
    """
    drives = await _fetch_report_data()
    unlock_events = await _fetch_unlock_events_24h()
    now = datetime.now()
    body = _build_html(drives, now.strftime("%Y-%m-%d %H:%M"), unlock_events)
    suffix = ""
    if unlock_events:
        # Leading separator: without it the count ran straight into
        # "(N drives)" in the subject line.
        suffix = f" — {len(unlock_events)} pool unlock(s)"
    subject = (
        f"Burn-In Report — {now.strftime('%Y-%m-%d')} "
        f"({len(drives)} drives){suffix}"
    )
    await asyncio.to_thread(_send_email, subject, body)
async def run() -> None:
    """Background loop that sends the daily report at ``smtp_report_hour``.

    Returns immediately when ``settings.smtp_host`` is not set. Otherwise it
    loops forever: sleep until the next occurrence of the configured hour
    (local, naive time), send the report if ``smtp_daily_report_enabled`` is
    true, then pause 60 seconds before computing the next target. A failed
    send is logged and does not stop the loop.
    """
    if not settings.smtp_host:
        log.info("SMTP not configured — daily email disabled")
        return
    log.info(
        "Mailer started — daily report at %02d:00 local time",
        settings.smtp_report_hour,
    )
    while True:
        current = datetime.now()
        next_run = current.replace(
            hour=settings.smtp_report_hour,
            minute=0, second=0, microsecond=0,
        )
        # Today's slot has already passed (or is right now): use tomorrow's.
        if next_run <= current:
            next_run = next_run + timedelta(days=1)
        delay = (next_run - current).total_seconds()
        log.info("Next report in %.0f seconds (%s)", delay, next_run.strftime("%Y-%m-%d %H:%M"))
        await asyncio.sleep(delay)
        if not settings.smtp_daily_report_enabled:
            log.info("Daily report skipped — smtp_daily_report_enabled is False")
        else:
            try:
                await send_report_now()
            except Exception as exc:
                log.error("Failed to send daily report: %s", exc)
        # Sleep briefly past the hour to avoid drift from re-triggering immediately
        await asyncio.sleep(60)