nas-burnin/app/retention.py
Brandon Walter 7cd66d460f
Some checks are pending
Security scan / pip-audit (push) Waiting to run
Security scan / bandit (push) Waiting to run
Security scan / gitleaks (push) Waiting to run
Security scan / mypy (push) Waiting to run
fix: annotate to mypy-clean + promote to gating (1.0.0-40)
Five files needed annotation tweaks to clear the 14 outstanding
mypy errors, all cosmetic (zero runtime bugs):

- settings_store._coerce: return Any (concrete type depends on key,
  no narrowing path mypy can follow from the dict lookup)
- retention._state: explicit dict[str, str | None] init
- mailer: explicit `server: smtplib.SMTP` binding so SMTP_SSL and
  SMTP both narrow to the parent class for shared call sites
- burnin/stages.py: TypedDict for the badblocks result dict so
  `result["bad_blocks"]` narrows to int at the comparison site

scripts/security-scan.sh: mypy now counted in TOTAL_EXIT and
findings.log line. Comment updated to reflect gating status.
2026-05-03 21:21:55 -07:00

168 lines
6.2 KiB
Python

"""
Background retention + backup tasks.
* Stage-log pruning: each surface_validate burn-in stage can write tens of
MB of badblocks output to burnin_stages.log_text. Without retention the
DB grows unbounded — we observed 447 MB on the live host after a few
weeks of use. Nightly job nulls log_text on stages older than
`retention_days`, then VACUUMs to reclaim pages.
* Automated DB backup: nightly `sqlite3 .backup` to `backups/app-YYYY-
MM-DD.db` inside the data dir. Retains the most recent
`backup_keep_count` files. Uses the online-backup API so the live DB
isn't locked.
Both tasks share a single background loop that wakes every 5 minutes and
runs the daily work once during the 03:00 local-time hour — cheap and
fits the existing mailer-style background-loop pattern. Failures are
logged but never crash the supervisor.
"""
from __future__ import annotations
import asyncio
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
import aiosqlite
from app.config import settings
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Stage-log pruning
# ---------------------------------------------------------------------------
async def prune_stage_logs(retention_days: int) -> int:
    """Clear stored log output on burn-in stages that finished long ago.

    Sets ``log_text`` to NULL on every ``burnin_stages`` row that still
    has log text and whose ``finished_at`` is earlier than
    ``retention_days`` days before the current UTC time.

    Returns the row count reported by the UPDATE statement.
    """
    threshold = datetime.now(timezone.utc) - timedelta(days=retention_days)
    cutoff = threshold.isoformat()
    statement = """UPDATE burnin_stages
                   SET log_text = NULL
                   WHERE log_text IS NOT NULL
                     AND finished_at IS NOT NULL
                     AND finished_at < ?"""

    async with aiosqlite.connect(settings.db_path) as conn:
        cursor = await conn.execute(statement, (cutoff,))
        updated = cursor.rowcount or 0
        await conn.commit()

    # Nothing pruned: stay quiet so the log is not spammed every night.
    if updated <= 0:
        return updated

    log.info("Retention: pruned log_text on %d stage row(s) older than %d days",
             updated, retention_days)
    return updated
async def vacuum_db() -> None:
    """Run VACUUM on the application database.

    The connection is opened with ``isolation_level=None`` so the
    statement is not wrapped in an implicit transaction; SQLite refuses
    to VACUUM from inside one.
    """
    connection = aiosqlite.connect(settings.db_path, isolation_level=None)
    async with connection as conn:
        await conn.execute("VACUUM")
    log.info("Retention: VACUUM completed")
# ---------------------------------------------------------------------------
# Backup
# ---------------------------------------------------------------------------
def _backup_dir() -> Path:
    """Return the ``backups`` directory that sits next to the database file."""
    data_dir = Path(settings.db_path).parent
    return data_dir.joinpath("backups")
async def backup_db(keep_count: int) -> Path | None:
    """Snapshot the live DB to backups/app-YYYY-MM-DD.db and rotate old ones.

    Atomicity: the snapshot is written to a sibling ``.tmp`` file and
    moved into the canonical daily slot with ``os.replace`` only after
    the backup succeeds, so an interrupted or failed backup never
    clobbers an existing snapshot.

    Args:
        keep_count: Number of most recent snapshots (by mtime) to keep.
            Values below 1 are treated as 1 so the snapshot written by
            this call is never deleted by its own rotation pass.

    Returns:
        Path of the snapshot written by this call.

    Raises:
        Any exception raised while connecting, copying or renaming is
        propagated after a best-effort removal of the partial tmp file.
    """
    import os as _os

    bdir = _backup_dir()
    bdir.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    out = bdir / f"app-{today}.db"
    tmp = bdir / f"app-{today}.db.tmp"

    # Drop leftover tmp files from ANY previous interrupted run. The tmp
    # name embeds the date, so checking only today's name would leave an
    # earlier day's partial file on disk forever.
    for stale in bdir.glob("app-*.db.tmp"):
        try:
            stale.unlink()
        except OSError as exc:
            log.warning("Retention: could not remove stale tmp %s: %s", stale, exc)

    # Copy via the connection-level backup() call into the tmp file, then
    # rename into place. On any failure (including cancellation) remove
    # the partial tmp so it does not linger until the next run.
    try:
        async with aiosqlite.connect(settings.db_path) as src:
            async with aiosqlite.connect(str(tmp)) as dst:
                await src.backup(dst)
        _os.replace(tmp, out)
    except BaseException:
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        except OSError as exc:
            log.warning("Retention: could not remove partial tmp %s: %s", tmp, exc)
        raise

    log.info("Retention: DB backed up to %s (%d bytes)", out, out.stat().st_size)

    # Keep the N most recent backups; delete older. A non-positive
    # keep_count would otherwise slice from the front (0) or from the
    # tail (negative) and could delete the file we are about to return.
    keep = max(keep_count, 1)
    snapshots = sorted(bdir.glob("app-*.db"), key=lambda p: p.stat().st_mtime,
                       reverse=True)
    for old in snapshots[keep:]:
        try:
            old.unlink()
            log.info("Retention: removed old backup %s", old.name)
        except OSError as exc:
            log.warning("Retention: could not remove %s: %s", old, exc)
    return out
# ---------------------------------------------------------------------------
# Scheduler — single 5-minute tick fires daily-grain work
# ---------------------------------------------------------------------------
# Local-time hour (compared against datetime.now().hour) during which the
# daily retention tasks are allowed to fire.
_RUN_HOUR = 3 # 03:00 local time — quiet for most homelabs
# In-memory scheduler state; nothing in this module persists it.
# "last_run_date" holds the local "YYYY-MM-DD" date of the last day on which
# the daily tasks completed successfully, or None if that has not happened yet.
_state: dict[str, str | None] = {"last_run_date": None}
async def _run_prune_step(today: str) -> bool:
    """Prune stage logs and VACUUM at most once per day; True when done.

    A VACUUM that is owed but failed is remembered in
    ``_state["vacuum_pending"]``. Without that, a retry would prune zero
    rows (the first attempt already cleared them), skip the VACUUM and
    report success, so the freed pages would never be reclaimed.
    """
    if _state.get("prune_done_date") == today:
        return True
    try:
        pruned = await prune_stage_logs(settings.retention_log_days)
        if pruned:
            _state["vacuum_pending"] = today
        if _state.get("vacuum_pending") is not None:
            await vacuum_db()
            _state["vacuum_pending"] = None
    except Exception as exc:
        log.exception("Retention: pruning failed: %s", exc)
        return False
    _state["prune_done_date"] = today
    return True


async def _run_backup_step(today: str) -> bool:
    """Back up the DB at most once per day; True when done.

    Remembering success per step stops a persistent prune failure from
    re-copying the whole database on every tick of the run hour.
    """
    if _state.get("backup_done_date") == today:
        return True
    try:
        await backup_db(settings.retention_backup_keep)
    except Exception as exc:
        log.exception("Retention: backup failed: %s", exc)
        return False
    _state["backup_done_date"] = today
    return True


async def run() -> None:
    """Background loop for the daily retention tasks. Never returns.

    Wakes every 5 minutes. When the local hour equals ``_RUN_HOUR`` and
    the day has not been marked done, it runs the prune(+VACUUM) step and
    the backup step. Each step is attempted independently; a step that
    fails is retried on later ticks within the same hour, while a step
    that already succeeded today is not repeated. The day is marked done
    in ``_state["last_run_date"]`` only once both steps have succeeded.

    Raises:
        asyncio.CancelledError: re-raised so the task can be cancelled.
            All other exceptions are logged and the loop continues.
    """
    log.info(
        "Retention loop started (run at %02d:00 local; prune>%d days; keep %d backups)",
        _RUN_HOUR,
        settings.retention_log_days,
        settings.retention_backup_keep,
    )
    while True:
        try:
            now = datetime.now()
            today = now.strftime("%Y-%m-%d")
            if now.hour == _RUN_HOUR and _state["last_run_date"] != today:
                # Run both steps unconditionally (no short-circuit) so a
                # prune failure never prevents the backup attempt.
                prune_ok = await _run_prune_step(today)
                backup_ok = await _run_backup_step(today)
                if prune_ok and backup_ok:
                    _state["last_run_date"] = today
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.exception("Retention loop iteration failed: %s", exc)
        await asyncio.sleep(300)  # 5 min