Closes the last open Codex finding (#5) and removes one piece of dead code Codex flagged in passing. #5 — Live pool re-check before burn-in start: Before this change, _is_unlocked compared the operator's unlock grant against the cached drives.pool_* row. If a drive was imported into a pool, mounted, or had ZFS labels written between the operator's unlock click and the next ~12s poll, burn-in could still start against the stale identity and silently destroy the new pool. start_job now calls a fresh ssh_client.fresh_pool_check_for_drive() immediately after the cached gate. That helper re-runs the three detection probes (zpool list -vHP / lsblk zfs_member / findmnt) over a fresh SSH session and returns the live answer for one devname. If it differs from the cached state, we invalidate any existing unlock grant and raise PoolMemberError with the FRESH pool name so the UI reflects current reality. If the fresh check shows free but the cache said locked, the drive came back to free since the last poll — log it and allow. Cost: ~200ms per burn-in start. For batch starts of 12 drives, that's 2.4s extra latency — cheap compared with destroying a freshly-imported pool. Dead code removal: ssh_client.run_badblocks() — no callers since 1.0.0-13, when the SSH badblocks logic was inlined into burnin._stage_surface_validate_ssh (with the asyncssh-signal-doesn't-actually-kill workaround). Removing the dead function also lets us drop the now-unused `from typing import Callable` import. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
116 lines
5.3 KiB
Python
116 lines
5.3 KiB
Python
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    """Typed application configuration with built-in defaults.

    Each attribute is one setting. Per ``model_config``, setting names are
    matched case-insensitively and a UTF-8 encoded ``.env`` file is used as
    an additional source of values.
    """

    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", case_sensitive=False
    )

    # ---- Listener + database location ----
    app_host: str = "0.0.0.0"  # nosec B104 — container deliberately binds all interfaces; nginx-proxy-manager fronts it.
    app_port: int = 8080
    db_path: str = "/data/app.db"

    # ---- TrueNAS connection ----
    truenas_base_url: str = "http://localhost:8000"
    truenas_api_key: str = "mock-key"
    truenas_verify_tls: bool = False

    # ---- Polling + burn-in scheduling ----
    poll_interval_seconds: int = 12
    stale_threshold_seconds: int = 45
    max_parallel_burnins: int = 2
    surface_validate_seconds: int = 45  # how long the mock simulation runs
    io_validate_seconds: int = 25  # how long the mock simulation runs

    # ---- Logging ----
    log_level: str = "INFO"

    # ---- Security ----
    # Comma-separated list of IPs and/or CIDRs, e.g. "10.0.0.0/24,127.0.0.1".
    # The default (empty string) allows every client.
    allowed_ips: str = ""

    # ---- SMTP ----
    # Daily status email, sent at 8am local time by default.
    # Email is disabled while smtp_host is empty.
    smtp_host: str = ""
    smtp_port: int = 587
    smtp_user: str = ""
    smtp_password: str = ""
    smtp_from: str = ""
    smtp_to: str = ""  # recipients, comma-separated
    smtp_report_hour: int = 8  # local hour of the send (0-23)
    smtp_daily_report_enabled: bool = True  # False skips the daily report but keeps alerts
    smtp_alert_on_fail: bool = True  # email immediately when a job fails
    smtp_alert_on_pass: bool = False  # email immediately when a job passes
    smtp_ssl_mode: str = "starttls"  # one of "starttls" | "ssl" | "plain"
    smtp_timeout: int = 60  # seconds; covers both connect and read

    # ---- Webhook ----
    # A JSON payload is POSTed on every job state change (pass/fail).
    # Empty disables it. Compatible with Slack, Discord, ntfy, n8n, etc.
    webhook_url: str = ""

    # ---- Stuck-job detection ----
    # A job that has been running longer than this is marked 'unknown'.
    stuck_job_hours: int = 24

    # ---- Temperature thresholds (°C) ----
    # Drive the colouring of the drives table and the precheck gate.
    temp_warn_c: int = 46  # orange: warning
    temp_crit_c: int = 55  # red: critical; precheck refuses to start above this

    # ---- Bad-block tolerance ----
    # surface_validate fails once the bad-block count exceeds this value.
    bad_block_threshold: int = 0

    # ---- Surface-validate (badblocks) tunables ----
    # Defaults follow the geometry the Spearfoot disk-burnin.sh community
    # script recommends for large HDDs.
    #   block_size   : badblocks -b, in bytes; aligned to AF (4 KiB)
    #                  sectors. Going to 8192 roughly halves runtime on
    #                  multi-TB drives but costs ~2x RAM in the test buffer.
    #   block_buffer : badblocks -c, blocks held in memory per IO. 64 is
    #                  the badblocks default. Raising it means a larger
    #                  buffer, faster IO and more RAM
    #                  (block_size * block_buffer bytes per pass).
    #   passes       : badblocks -p. 1 = repeat until one consecutive
    #                  clean scan (current behavior). Use 2-3 for a
    #                  paranoid burn-in that re-confirms after errors.
    surface_validate_block_size: int = 4096
    surface_validate_block_buffer: int = 64
    surface_validate_passes: int = 1

    # ---- SSH to TrueNAS (Stage 7) ----
    # Credentials for running commands on TrueNAS directly. With ssh_host
    # set, burn-in stages run smartctl/badblocks over SSH rather than via
    # the REST API. An empty ssh_host selects the mock/REST API
    # (development mode).
    ssh_host: str = ""
    ssh_port: int = 22
    ssh_user: str = "root"  # root is the TrueNAS CORE default
    ssh_password: str = ""  # password auth; leave blank when using a key
    ssh_key: str = ""  # PEM private key content (full key, headers included)

    # ---- Application version ----
    # Reported by the /api/v1/updates/check endpoint.
    app_version: str = "1.0.0-29"

    # ---- Authentication (1.0.0-22) ----
    # session_secret is the HMAC key that signs session cookies. When it
    # is empty, one is generated and persisted to /data/session_secret on
    # first run, so sessions survive restarts but rotate if that file is
    # deleted. Provide it explicitly through the SESSION_SECRET env var to
    # share the secret across replicas.
    session_secret: str = ""
    session_max_age_seconds: int = 7 * 24 * 60 * 60  # 7 days
    # Turn on when the dashboard is reachable only over HTTPS (typical
    # behind nginx-proxy-manager with TLS). The session cookie is then
    # never sent over plain HTTP, which removes the on-the-wire exposure
    # surface. False keeps initial deploy + LAN testing working.
    session_cookie_secure: bool = False
    # Initial admin bootstrap: when both values are set AND the users
    # table is empty at startup, that account is created immediately.
    # Afterwards these values are ignored — change passwords through the
    # UI / database, not by editing compose.yml.
    initial_admin_username: str = ""
    initial_admin_password: str = ""

    # ---- Retention + backup (1.0.0-23) ----
    #   log_days    : burnin_stages.log_text is NULLed out after this many
    #                 days (the history rows themselves are preserved).
    #                 The default keeps ~5 weeks; long-soak burn-ins
    #                 typically finish in <2.
    #   backup_keep : how many nightly DB snapshots stay in /data/backups.
    retention_log_days: int = 35
    retention_backup_keep: int = 14
|
|
|
|
|
|
# Shared module-level instance: the configuration is built once, when this
# module is first imported, and other modules can reuse the same object.
settings = Settings()
|