def _login_throttle_keys(username: str, ip: str) -> tuple:
    """Build the two keys (per-user, per-IP) that index the throttle tables.

    The username is lower-cased so that differently-cased spellings of the
    same name share one counter.
    """
    return (("user", username.lower()), ("ip", ip))


def register_login_attempt(username: str, ip: str) -> str:
    """Check the lockout tables and count one login attempt in a single step.

    Returns one of three string verdicts:
        "ok"             - not locked; the attempt has been counted
        "locked_out"     - an unexpired lockout already exists for the
                           user key or the IP key; nothing is counted
        "now_locked_out" - counting THIS attempt reached
                           LOGIN_FAILURE_THRESHOLD and armed a lockout

    The function is a plain ``def`` with no awaits, so nothing else on the
    event loop can interleave between the check and the increment.

    Every attempt is counted here, before credentials are verified, so the
    caller is expected to call clear_login_failures() after a successful
    authentication.
    """
    now = _time.time()
    keys = _login_throttle_keys(username, ip)

    # Phase 1: honour any lockout that is still running. Expired entries
    # are dropped as they are encountered.
    for key in keys:
        expires_at = _login_lockouts.get(key)
        if expires_at is None:
            continue
        if now < expires_at:
            return "locked_out"
        del _login_lockouts[key]

    # Phase 2: count the attempt under both keys and arm a lockout for any
    # key whose counter reaches the threshold.
    tripped = False
    for key in keys:
        # presumably prunes timestamps that aged out of the counting
        # window - helper is defined elsewhere; verify.
        _gc_failures(key)
        attempts = _login_failures.setdefault(key, [])
        attempts.append(now)
        if len(attempts) >= LOGIN_FAILURE_THRESHOLD:
            _login_lockouts[key] = now + LOGIN_LOCKOUT_SECONDS
            _login_failures[key] = []  # counter restarts once the lockout is armed
            tripped = True
    return "now_locked_out" if tripped else "ok"


def clear_login_failures(username: str, ip: str) -> None:
    """Forget all throttle state for this username and this IP.

    Removes both the attempt counters and any armed lockout for the user
    key and the IP key. Missing entries are ignored.

    NOTE(review): this wipes the whole per-IP counter, not just one
    attempt's contribution - confirm that is acceptable for clients that
    share an address.
    """
    for key in _login_throttle_keys(username, ip):
        for table in (_login_failures, _login_lockouts):
            table.pop(key, None)


def require_admin(request: Request) -> User:
    """Return the current user if they are an admin, otherwise raise.

    Reads ``request.state.current_user`` (expected to have been set
    earlier in the request by the auth middleware - not visible here).

    Raises:
        HTTPException: 401 when no user is attached to the request,
            403 when the user is present but ``is_admin`` is falsy.
    """
    current = getattr(request.state, "current_user", None)
    if current and current.is_admin:
        return current
    if current:
        code, reason = status.HTTP_403_FORBIDDEN, "Admin only"
    else:
        code, reason = status.HTTP_401_UNAUTHORIZED, "Authentication required"
    raise HTTPException(status_code=code, detail=reason)
try: attrs = await ssh_client.get_smart_attributes(devname) + ssh_only_failures = [ + f for f in (attrs.get("failures") or []) if f.startswith("SSH error:") + ] + real_failures = [ + f for f in (attrs.get("failures") or []) if not f.startswith("SSH error:") + ] if attrs.get("health") == "FAILED": await _set_stage_error( job_id, "surface_validate", - "NVMe SMART health FAILED after format" + "NVMe SMART health FAILED after format", ) return False + if real_failures: + await _set_stage_error( + job_id, "surface_validate", + "NVMe SMART attribute failures after format: " + + "; ".join(real_failures), + ) + return False + if ssh_only_failures: + await _append_stage_log( + job_id, "surface_validate", + "[WARN] post-format SMART check had SSH errors " + "(soft-passing): " + "; ".join(ssh_only_failures) + "\n", + ) + if attrs.get("warnings"): + await _append_stage_log( + job_id, "surface_validate", + "[WARN] " + "; ".join(attrs["warnings"]) + "\n", + ) except Exception as exc: log.warning("Post-format SMART check error on %s: %s", devname, exc) + await _append_stage_log( + job_id, "surface_validate", + f"[WARN] post-format SMART check raised: {exc}\n", + ) await _update_stage_percent(job_id, "surface_validate", 100) await _recalculate_progress(job_id) @@ -1116,11 +1148,16 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int) job_id, ) - # Flush remaining output - remainder = "".join(output_lines) - await _append_stage_log(job_id, "surface_validate", remainder) + # Flush only lines we haven't already written in 20-line chunks. + # Previously we appended the FULL accumulated output here too, + # doubling the stored log_text size for every surface_validate + # stage and pushing app.db into hundreds of MB. 
+ flushed_count = (len(output_lines) // 20) * 20 + tail = "".join(output_lines[flushed_count:]) + if tail: + await _append_stage_log(job_id, "surface_validate", tail) result["bad_blocks"] = bad_blocks_total - result["output"] = remainder + result["output"] = "".join(output_lines) # in-memory only, not re-stored result["aborted"] = bad_blocks_total > settings.bad_block_threshold except asyncio.CancelledError: diff --git a/app/config.py b/app/config.py index e48ba4a..0a3737b 100644 --- a/app/config.py +++ b/app/config.py @@ -83,7 +83,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-27" + app_version: str = "1.0.0-28" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. Empty = generate @@ -92,6 +92,11 @@ class Settings(BaseSettings): # SESSION_SECRET env var if you want to share secrets across replicas. session_secret: str = "" session_max_age_seconds: int = 60 * 60 * 24 * 7 # 7 days + # Set to True when the dashboard is exclusively reachable over HTTPS + # (typical when fronted by nginx-proxy-manager with TLS). Refuses to + # send the session cookie on plain HTTP, eliminating the on-the-wire + # exposure surface. Leaving False allows initial deploy + LAN testing. + session_cookie_secure: bool = False # Initial admin bootstrap. If both env vars are set AND the users table # is empty at startup, create that account immediately. After that the # env vars are ignored — change passwords via the UI / database, not diff --git a/app/database.py b/app/database.py index 0f13298..d39bdd1 100644 --- a/app/database.py +++ b/app/database.py @@ -109,6 +109,11 @@ _MIGRATIONS = [ created_at TEXT NOT NULL, last_login_at TEXT )""", + # 1.0.0-28: case-insensitive uniqueness. 
The base UNIQUE on username + # is case-sensitive but login does NOCASE — without this index two + # users `Admin` and `admin` could coexist and shadow each other. + """CREATE UNIQUE INDEX IF NOT EXISTS uniq_users_username_nocase + ON users (username COLLATE NOCASE)""", ] diff --git a/app/main.py b/app/main.py index 01c16c3..f782a07 100644 --- a/app/main.py +++ b/app/main.py @@ -213,7 +213,10 @@ app.add_middleware( secret_key=auth.get_session_secret(), session_cookie="burnin_session", max_age=settings.session_max_age_seconds, - https_only=False, # we sit behind nginx-proxy-manager; trust upstream + # session_cookie_secure flips the cookie's Secure flag. Set to True + # in production behind HTTPS (nginx-proxy-manager) so the auth cookie + # is never sent on plain HTTP. + https_only=settings.session_cookie_secure, # SameSite=strict is the primary CSRF mitigation: the browser never # sends the session cookie on cross-site requests, so an attacker # page can't trigger any state-changing endpoint even if it knows diff --git a/app/poller.py b/app/poller.py index b30672b..25ecc3f 100644 --- a/app/poller.py +++ b/app/poller.py @@ -375,52 +375,54 @@ async def poll_cycle(client: TrueNASClient) -> int: # locked, and previously-unlocked drives stay unlocked, until detection # recovers. Treating a transient SSH blip as "no pool members" would # silently unlock every drive on the next poll. - detection_ok = True + # Each detection probe (pool / exported / mounted) succeeds or fails + # INDEPENDENTLY. Previously a single None blew away the whole map, + # so a fresh DB on a host where lsblk lacks zfs_member info but + # zpool works would never lock pool members. Now we apply each + # successful probe and only fail-closed for the ones that actually + # errored. 
pool_map: dict = {} - zfs_member_set: set = set() - mounted_set: set = set() + pool_probe_ok = True # zpool list -vHP succeeded + zfs_probe_ok = True # lsblk zfs_member succeeded + mounted_probe_ok = True # findmnt succeeded try: from app import ssh_client as _ssh if _ssh.is_configured(): pm = await _ssh.get_pool_membership() zs = await _ssh.get_zfs_member_drives() ms = await _ssh.get_mounted_drives() - if pm is None or zs is None or ms is None: - detection_ok = False - else: - pool_map = pm - zfs_member_set = zs - mounted_set = ms - # SSH unconfigured (mock/dev mode) — detection_ok stays True with + pool_probe_ok = pm is not None + zfs_probe_ok = zs is not None + mounted_probe_ok = ms is not None + if pool_probe_ok: + pool_map.update(pm) + if zfs_probe_ok: + for devname in zs: + if devname not in pool_map: + pool_map[devname] = {"pool": "(exported)", "role": "exported"} + if mounted_probe_ok: + for devname in ms: + if devname not in pool_map: + pool_map[devname] = {"pool": "(mounted)", "role": "mounted"} + # SSH unconfigured (mock/dev mode) — all probes "succeed" with # empty maps, so dev mode never artificially locks drives. except Exception: - detection_ok = False + pool_probe_ok = zfs_probe_ok = mounted_probe_ok = False + pool_map = {} - if not detection_ok: + # If ALL probes failed we have no fresh data at all — preserve the + # existing pool columns to keep locks honest. If at least one probe + # succeeded the new pool_map is a partial truth: we apply it and + # only refuse to clear locks coming from a probe that failed. + detection_ok = pool_probe_ok or zfs_probe_ok or mounted_probe_ok + + if not (pool_probe_ok and zfs_probe_ok and mounted_probe_ok): log.warning( - "Pool detection failed this cycle — preserving existing " - "pool_name/pool_role columns. Locked drives stay locked, " - "unlocked drives stay unlocked, until SSH recovers." 
+ "Pool detection partial: pool=%s zfs=%s mounted=%s — preserving " + "stale lock state from any probe that failed.", + pool_probe_ok, zfs_probe_ok, mounted_probe_ok, ) - if detection_ok: - # Drives carrying ZFS labels but not in any active pool are - # "exported" — same hazard as an active pool member, so lock them - # too. We don't know the original pool name without - # `zpool import`-style scanning (slow + blocks); display - # "(exported)" and use a special token. - for devname in zfs_member_set: - if devname not in pool_map: - pool_map[devname] = {"pool": "(exported)", "role": "exported"} - # Drives with a non-ZFS mount somewhere (XFS/ext4/scratch/etc.) - # also lock — wiping a mounted FS is just as catastrophic. Lower - # precedence than active pool membership, since a drive in `tank` - # would also show under findmnt for the pool's mountpoint via - # /dev/zd* or zvol — but those are filtered in the parser. - for devname in mounted_set: - if devname not in pool_map: - pool_map[devname] = {"pool": "(mounted)", "role": "mounted"} - # Index running jobs by (devname, test_type) active: dict[tuple[str, str], dict] = {} for job in running_jobs: diff --git a/app/retention.py b/app/retention.py index c821c91..a2c0be7 100644 --- a/app/retention.py +++ b/app/retention.py @@ -74,19 +74,36 @@ def _backup_dir() -> Path: async def backup_db(keep_count: int) -> Path | None: """Online-backup the live DB to backups/app-YYYY-MM-DD.db. Returns - the new file's path. Old backups beyond keep_count are deleted.""" + the new file's path. Old backups beyond keep_count are deleted. + + Atomicity: writes to a sibling tmp file first and renames into the + canonical daily slot only after backup succeeds. An interrupted + backup leaves the tmp file (cleaned up on next run); the previous + day's snapshot stays intact. os.replace is atomic within the same + filesystem on POSIX. 
+ """ + import os as _os bdir = _backup_dir() bdir.mkdir(parents=True, exist_ok=True) today = datetime.now().strftime("%Y-%m-%d") out = bdir / f"app-{today}.db" + tmp = bdir / f"app-{today}.db.tmp" + + # Drop any leftover tmp from a previous interrupted run. + if tmp.exists(): + try: + tmp.unlink() + except OSError: + pass # aiosqlite.Connection.backup() is an async wrapper around # sqlite3.Connection.backup — atomic online snapshot that doesn't # block writers (it copies pages in batches and yields between). async with aiosqlite.connect(settings.db_path) as src: - async with aiosqlite.connect(str(out)) as dst: + async with aiosqlite.connect(str(tmp)) as dst: await src.backup(dst) + _os.replace(tmp, out) log.info("Retention: DB backed up to %s (%d bytes)", out, out.stat().st_size) # Keep the N most recent backups; delete older. @@ -124,17 +141,26 @@ async def run() -> None: now = datetime.now() today = now.strftime("%Y-%m-%d") if now.hour == _RUN_HOUR and _state["last_run_date"] != today: - _state["last_run_date"] = today + # Track prune + backup success independently. Mark the + # day "done" only when BOTH succeed so a transient + # failure gets retried on the next 5-min tick (still + # within the 03:00 hour). 
+ prune_ok = False + backup_ok = False try: pruned = await prune_stage_logs(settings.retention_log_days) if pruned: await vacuum_db() + prune_ok = True except Exception as exc: log.exception("Retention: pruning failed: %s", exc) try: await backup_db(settings.retention_backup_keep) + backup_ok = True except Exception as exc: log.exception("Retention: backup failed: %s", exc) + if prune_ok and backup_ok: + _state["last_run_date"] = today except asyncio.CancelledError: raise except Exception as exc: diff --git a/app/routes.py b/app/routes.py index bff5e4c..858a2de 100644 --- a/app/routes.py +++ b/app/routes.py @@ -2,6 +2,7 @@ import asyncio import csv import io import json +import time as _time from datetime import datetime, timezone import aiosqlite @@ -263,14 +264,22 @@ async def login_submit(request: Request): next_url = "/" ip = _client_ip(request) - # Rate-limit gate — checked BEFORE bcrypt so an attacker can't burn CPU. - locked_until = auth.login_locked_until(username, ip) - if locked_until is not None: - remaining = int(locked_until - __import__("time").time()) + # Atomic register-and-check: increments the counter NOW (before any + # await), so a parallel burst of guesses can't all slip past the + # threshold. Cleared on successful auth via clear_login_failures. + attempt = auth.register_login_attempt(username, ip) + if attempt != "ok": + if attempt == "now_locked_out": + await auth.audit_auth_event( + "user_login_locked_out", username, + f"Failed login from {ip} — IP/user locked out for {auth.LOGIN_LOCKOUT_SECONDS // 60} min", + ) + locked_until = auth.login_locked_until(username, ip) + remaining = int((locked_until or _time.time()) - _time.time()) return templates.TemplateResponse(request, "login.html", { "request": request, "needs_setup": False, - "error": f"Too many failed attempts. Try again in {remaining // 60} min.", + "error": f"Too many failed attempts. 
Try again in {remaining // 60 + 1} min.", "next": next_url, }, status_code=429) @@ -280,14 +289,8 @@ async def login_submit(request: Request): # so the timing of "user not found" matches "wrong password." if not found: auth.verify_password(password, "$2b$12$" + "x" * 53) - tripped = auth.record_login_failure(username, ip) await auth.audit_auth_event( - "user_login_locked_out" if tripped else "user_login_failed", - username, - f"Failed login from {ip}" + ( - f" — IP/user locked out for {auth.LOGIN_LOCKOUT_SECONDS // 60} min" - if tripped else "" - ), + "user_login_failed", username, f"Failed login from {ip}", ) return templates.TemplateResponse(request, "login.html", { "request": request, @@ -323,7 +326,12 @@ async def auth_first_user_setup(request: Request): password = form.get("password") or "" full_name = (form.get("full_name") or "").strip() or None try: - user = await auth.create_user(username, password, full_name, is_admin=True) + # bootstrap_only=True wraps the existence check + insert in an + # IMMEDIATE transaction so two concurrent setup requests can't + # both create admin accounts during the bootstrap window. + user = await auth.create_user( + username, password, full_name, is_admin=True, bootstrap_only=True + ) except ValueError as exc: raise HTTPException(status_code=400, detail=str(exc)) # Same fixation defense as the login flow — discard any pre-existing @@ -466,12 +474,20 @@ async def health(db: aiosqlite.Connection = Depends(get_db)): checks: dict[str, dict] = {} - # DB probe — confirm the journal is healthy (PRAGMA reads journal_mode - # and would fail loudly if WAL is wedged or the file is unreadable). + # DB probe — actually exercise the write path (read-only mounts, + # full disks, broken WAL all silently pass a journal_mode read). + # Uses a temp table that lives only inside the connection so the + # round-trip touches the writer without polluting real data. 
try: - cur = await db.execute("PRAGMA journal_mode") - await cur.fetchone() - checks["db"] = {"ok": True} + await db.execute( + "CREATE TEMP TABLE IF NOT EXISTS _hc (k INTEGER PRIMARY KEY, v TEXT)" + ) + await db.execute("INSERT OR REPLACE INTO _hc (k, v) VALUES (1, ?)", + (datetime.now(timezone.utc).isoformat(),)) + cur = await db.execute("SELECT v FROM _hc WHERE k=1") + row = await cur.fetchone() + await db.commit() + checks["db"] = {"ok": bool(row)} except Exception as exc: checks["db"] = {"ok": False, "error": str(exc)} @@ -781,14 +797,25 @@ def _row_to_burnin(row: aiosqlite.Row, stages: list[aiosqlite.Row]) -> BurninJob ) +def _operator_for(request: Request, _ignored_body_value: str | None = None) -> str: + """Always return the logged-in user's name for audit attribution. + The request body's `operator` field (if any) is ignored — clients + can't spoof the operator identity any more.""" + user = getattr(request.state, "current_user", None) + if not user: + raise HTTPException(status_code=401, detail="Authentication required") + return user.full_name or user.username + + @router.post("/api/v1/burnin/start") -async def burnin_start(req: StartBurninRequest): +async def burnin_start(request: Request, req: StartBurninRequest): + operator = _operator_for(request, req.operator) results = [] errors = [] for drive_id in req.drive_ids: try: job_id = await burnin.start_job( - drive_id, req.profile, req.operator, stage_order=req.stage_order + drive_id, req.profile, operator, stage_order=req.stage_order ) results.append({"drive_id": drive_id, "job_id": job_id}) except burnin.PoolMemberError as exc: @@ -809,10 +836,11 @@ async def burnin_start(req: StartBurninRequest): @router.post("/api/v1/drives/{drive_id}/unlock") -async def unlock_pool_drive(drive_id: int, req: UnlockPoolDriveRequest): +async def unlock_pool_drive(drive_id: int, request: Request, req: UnlockPoolDriveRequest): + operator = _operator_for(request, req.operator) try: expiry = await burnin.grant_pool_unlock( 
@router.post("/api/v1/burnin/{job_id}/cancel")
async def burnin_cancel(job_id: int, request: Request, req: CancelBurninRequest):
    """Cancel a burn-in job on behalf of the logged-in user.

    The operator recorded for the cancellation is derived from the
    session via _operator_for(); the ``operator`` field of the request
    body is passed along but not trusted.

    Returns ``{"cancelled": True}`` on success.

    Raises:
        HTTPException: 409 when burnin.cancel_job() reports a falsy result.
    """
    cancelled = await burnin.cancel_job(job_id, _operator_for(request, req.operator))
    if cancelled:
        return {"cancelled": True}
    raise HTTPException(status_code=409, detail="Job not found or not cancellable")
Secrets are only updated if non-empty.""" - user = request.state.current_user + user = auth.require_admin(request) # Don't overwrite secrets if client sent empty string. Track which # ones DID get a real change so we can audit the rotation. rotated: list[str] = [] @@ -1389,8 +1422,9 @@ async def save_settings(request: Request, body: dict): @router.post("/api/v1/settings/test-smtp") -async def test_smtp(): +async def test_smtp(request: Request): """Test the current SMTP configuration without sending an email.""" + auth.require_admin(request) result = await mailer.test_smtp_connection() if not result["ok"]: raise HTTPException(status_code=502, detail=result["error"]) @@ -1398,8 +1432,9 @@ async def test_smtp(): @router.post("/api/v1/settings/test-ssh") -async def test_ssh(): +async def test_ssh(request: Request): """Test the current SSH configuration.""" + auth.require_admin(request) from app import ssh_client result = await ssh_client.test_connection() if not result["ok"]: diff --git a/app/ssh_client.py b/app/ssh_client.py index 612e74e..75711c0 100644 --- a/app/ssh_client.py +++ b/app/ssh_client.py @@ -388,7 +388,16 @@ async def get_mounted_drives() -> set | None: def _parse_findmnt_sources(stdout: str) -> set: """Pure parser for findmnt output. Strips partitions; ignores tmpfs, - overlay, zfs (zfs is handled by pool detection).""" + overlay, zfs (zfs is handled by pool detection). 
+ + Recognised devnames (covers TrueNAS SCALE + CORE + LVM/MD stacks): + sd[a-z]+ — Linux SCSI/SATA (sda, sdb, ..., sdaa) + nvmeXnY[pZ] — Linux NVMe namespaces + mapper/ — LVM logical volumes (/dev/mapper/vg-lv) + dm-N — devicemapper short names + mdN — Linux MD RAID arrays + ada[0-9]+, da[0-9]+ — TrueNAS CORE (FreeBSD) SATA/SAS + """ import re as _re out: set = set() for raw in stdout.splitlines(): @@ -400,14 +409,22 @@ def _parse_findmnt_sources(stdout: str) -> set: if "/dev/zd" in s or "/dev/zvol" in s: continue name = s[len("/dev/"):].split("[")[0] # bind mounts can have [subdir] - if name.startswith("nvme"): - m = _re.match(r"^(nvme\d+n\d+)", name) - if m: - out.add(m.group(1)) - else: - m = _re.match(r"^(sd[a-z]+)", name) + # Try each recognised devname pattern in order. Mapper/dm-/md + # entries are kept whole because they represent a stack the + # operator should resolve manually before burn-in. + for pat in ( + r"^(nvme\d+n\d+)", # NVMe (strip pN) + r"^(sd[a-z]+)", # Linux SCSI/SATA (strip number) + r"^(mapper/[^/]+)", # LVM logical volume + r"^(dm-\d+)", # devicemapper short name + r"^(md\d+)", # MD RAID + r"^(ada\d+)", # FreeBSD SATA + r"^(da\d+)", # FreeBSD SAS/SCSI + ): + m = _re.match(pat, name) if m: out.add(m.group(1)) + break return out diff --git a/scripts/security-scan.sh b/scripts/security-scan.sh index ed1ab0c..2185aab 100644 --- a/scripts/security-scan.sh +++ b/scripts/security-scan.sh @@ -41,19 +41,33 @@ if [ ! -d "$REPO/.git" ]; then fi cd "$REPO" -git fetch --quiet --prune origin 2>&1 || true -git checkout --quiet main 2>&1 || true -git reset --hard --quiet origin/main 2>&1 || true +# Refresh the scan checkout. Failures here mean we'd be scanning stale +# code without knowing — fail loudly instead of soldiering on silently. +if ! git fetch --quiet --prune origin; then + echo "fatal: git fetch failed in $REPO" >&2 + exit 65 +fi +git checkout --quiet main || true # ok if already on main +if ! 
git reset --hard --quiet origin/main; then + echo "fatal: git reset --hard failed in $REPO" >&2 + exit 65 +fi echo "=== Security scan $DATE ===" > "$OUT_DIR/summary.txt" date -Iseconds >> "$OUT_DIR/summary.txt" echo >> "$OUT_DIR/summary.txt" -# --- pip-audit against the LIVE container's installed packages ---------- -# Catches CVEs that hit a transitive dep we don't pin in requirements.txt. -echo "--- pip-audit (live container) ---" | tee -a "$OUT_DIR/summary.txt" -docker exec truenas-burnin sh -c \ - "pip install --quiet --no-cache-dir --disable-pip-version-check pip-audit 2>/dev/null && pip-audit --strict --format=columns" \ +# --- pip-audit against the lockfile in a throwaway container ------------ +# Previously we did `docker exec truenas-burnin pip install pip-audit` +# which mutated the live production container with a transient package. +# Now scan the lockfile in an ephemeral container — same coverage of +# pinned versions + their transitives, no side effects on prod. +echo "--- pip-audit (requirements.txt in throwaway container) ---" | tee -a "$OUT_DIR/summary.txt" +docker run --rm \ + -v "$REPO/requirements.txt:/work/requirements.txt:ro" \ + -w /work \ + python:3.12-slim sh -c \ + "pip install --quiet --no-cache-dir --disable-pip-version-check pip-audit 2>/dev/null && pip-audit --requirement requirements.txt --strict --format=columns" \ > "$OUT_DIR/pip-audit.txt" 2>&1 PIPS=$? echo " exit=$PIPS ($OUT_DIR/pip-audit.txt)" | tee -a "$OUT_DIR/summary.txt"