Addresses 12 of 13 findings from the Codex tech-debt + security review of versions 1.0.0-22 through 1.0.0-27. Item #5 (live pool re-check before start_job) deferred — would add an SSH round-trip per start. #1 Pool detection now treats zpool / lsblk / findmnt failures INDEPENDENTLY. Previously a single None blew away the whole map, so a host where lsblk lacks zfs_member info but zpool works would never lock pool members. Extended findmnt parser to recognise /dev/mapper/*, /dev/dm-*, /dev/md*, /dev/da*, /dev/ada* (LVM, devicemapper, MD RAID, FreeBSD CORE devnames). #2 Admin role enforced on every settings mutation. New auth.require_admin() helper applied to GET /settings, POST /api/v1/settings, /test-smtp, /test-ssh. Previously any authenticated user (the CLI explicitly supports non-admin accounts) could rewrite SMTP/SSH/API secrets. #3 First-user setup race closed. auth.create_user() now accepts bootstrap_only=True which wraps the existence check + insert in BEGIN IMMEDIATE so two concurrent /api/v1/auth/setup requests can't both create admin accounts during the bootstrap window. #4 Case-insensitive uniqueness enforced via new `uniq_users_username_nocase` index. Login does NOCASE lookup so without this `Admin` and `admin` could coexist as distinct rows. #6 New `session_cookie_secure` setting (default False for LAN/dev deploys, set True in production behind HTTPS) flips the session cookie's Secure flag. Defends against on-the-wire exposure when the dashboard is reachable over plain HTTP. #7 Audit trail bound to authenticated identity. Burn-in start / cancel / unlock / drive reset all now use `_operator_for(request)` which reads `request.state.current_user.full_name|username` instead of the body's operator field. Logged-in users can no longer spoof attribution. Drive reset's literal-"operator" fallback (window._operator was never set) is also fixed by this. #8 Login rate-limit race fixed. 
New `register_login_attempt()` is atomic check-AND-increment in synchronous code (no awaits inside), so a parallel burst can't slip past the threshold. `record_login_failure()` removed; `clear_login_failures()` now also drops any active lockout for a successful auth. Pre-existing bug where `tripped` was always False (so user_login_locked_out audit events never fired) also fixed. #9 NVMe surface_validate post-format check now mirrors the SSH path: fails on FAILED health AND on real SMART attribute failures, soft-passes SSH-only failures (logged), surfaces warnings to the stage log without failing. #10 retention.backup_db() now writes to `.tmp` then atomic-renames into the canonical daily slot — an interrupted backup leaves the tmp behind but doesn't corrupt the real snapshot. Scheduler marks last_run_date only on (prune AND backup) success so a transient failure gets retried within the 03:00 hour. #11 /health DB probe now exercises the WRITE path via a temp-table INSERT/SELECT/COMMIT round-trip. Previously only read PRAGMA journal_mode + a row count, which silently passes on read-only mounts and broken-WAL conditions. #12 security-scan.sh now fails loudly if `git fetch` or `git reset --hard origin/main` errors (was `|| true`, scanning stale code silently). pip-audit now runs in a throwaway python:3.12-slim container against requirements.txt instead of `docker exec`-ing into the live truenas-burnin container — cleaner separation, no transient package install on prod. #13 Badblocks SSH stage no longer doubles its log_text. Previously appended every 20-line chunk during streaming AND the full accumulated output at end. Now only flushes the un-flushed tail (typically <20 lines). `result["output"]` stays in-memory only. Verification: all 44 unit tests pass in container; /health 200; security scan returns 0 findings; deployed maple build is green. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
168 lines
6.2 KiB
Python
168 lines
6.2 KiB
Python
"""
|
|
Background retention + backup tasks.
|
|
|
|
* Stage-log pruning: each surface_validate burn-in stage can write tens of
|
|
MB of badblocks output to burnin_stages.log_text. Without retention the
|
|
DB grows unbounded — we observed 447 MB on the live host after a few
|
|
weeks of use. Nightly job nulls log_text on stages older than
|
|
`retention_days`, then VACUUMs to reclaim pages.
|
|
|
|
* Automated DB backup: nightly `sqlite3 .backup` to `backups/app-YYYY-
|
|
MM-DD.db` inside the data dir. Retains the most recent
|
|
`backup_keep_count` files. Uses the online-backup API so the live DB
|
|
isn't locked.
|
|
|
|
Both tasks share a single background loop that wakes every 5 minutes and
fires the daily work once per day — cheap and fits the existing
|
|
mailer-style background-loop pattern. Failures are logged but never
|
|
crash the supervisor.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
|
|
import aiosqlite
|
|
|
|
from app.config import settings
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Stage-log pruning
|
|
# ---------------------------------------------------------------------------
|
|
|
|
async def prune_stage_logs(retention_days: int) -> int:
    """Clear stored log output on finished burn-in stages past retention.

    Sets ``log_text`` to NULL on every ``burnin_stages`` row that still
    carries log text and whose ``finished_at`` is earlier than
    ``retention_days`` days before the current UTC time. Rows without a
    ``finished_at`` value are left untouched.

    Returns the number of rows updated.
    """
    threshold = datetime.now(timezone.utc) - timedelta(days=retention_days)
    statement = """UPDATE burnin_stages
               SET log_text = NULL
               WHERE log_text IS NOT NULL
                 AND finished_at IS NOT NULL
                 AND finished_at < ?"""

    async with aiosqlite.connect(settings.db_path) as db:
        cursor = await db.execute(statement, (threshold.isoformat(),))
        # rowcount may be falsy; normalise so callers always get an int.
        affected = cursor.rowcount or 0
        await db.commit()

    if affected > 0:
        log.info("Retention: pruned log_text on %d stage row(s) older than %d days",
                 affected, retention_days)
    return affected
|
|
|
|
|
|
async def vacuum_db() -> None:
    """Run ``VACUUM`` on the live DB to reclaim pages freed by pruning.

    VACUUM rewrites the database file and must not run inside a
    transaction, so the connection is opened with ``isolation_level=None``.
    """
    connection = aiosqlite.connect(settings.db_path, isolation_level=None)
    async with connection as db:
        await db.execute("VACUUM")
    log.info("Retention: VACUUM completed")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Backup
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _backup_dir() -> Path:
    """Return the ``backups`` directory located next to the live DB file."""
    db_file = Path(settings.db_path)
    return db_file.parent / "backups"
|
|
|
|
|
|
def _discard_stale_tmp(bdir: Path) -> None:
    """Best-effort removal of every leftover ``app-*.db.tmp`` in *bdir*."""
    for stale in bdir.glob("app-*.db.tmp"):
        try:
            stale.unlink()
        except FileNotFoundError:
            pass
        except OSError as exc:
            log.warning("Retention: could not remove stale tmp %s: %s", stale, exc)


def _prune_old_backups(bdir: Path, newest: Path, keep_count: int) -> None:
    """Delete the oldest ``app-*.db`` snapshots so at most *keep_count* remain.

    *newest* (the snapshot just written) is never a deletion candidate.
    """
    # Clamp: 0 or a negative value must not wipe the snapshot we just made
    # (a negative slice start would also pick an arbitrary tail).
    keep = max(keep_count, 1)
    older = sorted(
        (p for p in bdir.glob("app-*.db") if p != newest),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    # `newest` occupies one of the `keep` slots.
    for old in older[keep - 1:]:
        try:
            old.unlink()
            log.info("Retention: removed old backup %s", old.name)
        except OSError as exc:
            log.warning("Retention: could not remove %s: %s", old, exc)


async def backup_db(keep_count: int) -> Path | None:
    """Online-backup the live DB to ``backups/app-YYYY-MM-DD.db``.

    Args:
        keep_count: Number of most-recent snapshots to retain. Values
            below 1 are treated as 1 so the snapshot written by this
            call is never deleted.

    Returns:
        Path of the snapshot written for today's (local) date.

    Raises:
        Whatever the SQLite backup or the final rename raises; the
        partial tmp file is removed before the exception propagates.

    Atomicity: the backup is written to a sibling ``.tmp`` file and
    renamed into the canonical daily slot only after it completes, so an
    interrupted backup never corrupts an existing snapshot. The rename
    is atomic within one filesystem on POSIX.
    """
    bdir = _backup_dir()
    bdir.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    out = bdir / f"app-{today}.db"
    tmp = bdir / f"app-{today}.db.tmp"

    # Remove leftovers from ANY previous interrupted run, not just today's:
    # a tmp from a day on which every retry failed carries that day's date
    # and would otherwise accumulate forever.
    _discard_stale_tmp(bdir)

    try:
        # aiosqlite.Connection.backup() wraps sqlite3.Connection.backup —
        # an online snapshot that copies pages in batches.
        async with aiosqlite.connect(settings.db_path) as src:
            async with aiosqlite.connect(str(tmp)) as dst:
                await src.backup(dst)
        tmp.replace(out)
    except BaseException:
        # Includes cancellation: never leave a partial snapshot behind.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

    log.info("Retention: DB backed up to %s (%d bytes)", out, out.stat().st_size)

    # Keep the N most recent backups; delete older.
    _prune_old_backups(bdir, out, keep_count)

    return out
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scheduler — single 5-minute tick fires daily-grain work
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_RUN_HOUR = 3 # 03:00 local time — quiet for most homelabs
|
|
_state = {"last_run_date": None}
|
|
|
|
|
|
async def run() -> None:
    """Background loop driving the daily retention work.

    Wakes every 5 minutes. When the local hour equals ``_RUN_HOUR`` and
    the day has not been marked done, it runs the prune (+ VACUUM) and
    backup tasks. The day is marked done in ``_state["last_run_date"]``
    only once BOTH tasks have succeeded, so a transient failure is
    retried on the following ticks within the same hour.

    Each task's success is remembered per day: a retry tick re-runs only
    the task that has not yet succeeded, so a persistent prune failure
    does not trigger a fresh full DB backup every 5 minutes.

    Task failures are logged and never propagate; only cancellation
    ends the loop.
    """
    log.info(
        "Retention loop started (run at %02d:00 local; prune>%d days; keep %d backups)",
        _RUN_HOUR,
        settings.retention_log_days,
        settings.retention_backup_keep,
    )

    attempt_date = None      # day the per-task flags below refer to
    prune_ok = False
    backup_ok = False
    # Set when rows were pruned but VACUUM has not completed yet. Kept
    # across ticks because a re-run prune finds 0 rows and would
    # otherwise never retry a failed VACUUM.
    vacuum_pending = False

    while True:
        try:
            now = datetime.now()
            today = now.strftime("%Y-%m-%d")
            if now.hour == _RUN_HOUR and _state["last_run_date"] != today:
                if attempt_date != today:
                    # First attempt of a new day: reset per-task status.
                    attempt_date = today
                    prune_ok = False
                    backup_ok = False

                if not prune_ok:
                    try:
                        pruned = await prune_stage_logs(settings.retention_log_days)
                        if pruned:
                            vacuum_pending = True
                        if vacuum_pending:
                            await vacuum_db()
                            vacuum_pending = False
                        prune_ok = True
                    except Exception as exc:
                        log.exception("Retention: pruning failed: %s", exc)

                if not backup_ok:
                    try:
                        await backup_db(settings.retention_backup_keep)
                        backup_ok = True
                    except Exception as exc:
                        log.exception("Retention: backup failed: %s", exc)

                if prune_ok and backup_ok:
                    _state["last_run_date"] = today
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.exception("Retention loop iteration failed: %s", exc)
        await asyncio.sleep(300)  # 5 min
|