"""
Background retention + backup tasks.

* Stage-log pruning: each surface_validate burn-in stage can write tens of
  MB of badblocks output to burnin_stages.log_text. Without retention the
  DB grows unbounded — we observed 447 MB on the live host after a few
  weeks of use. Nightly job nulls log_text on stages older than
  `retention_days`, then VACUUMs to reclaim pages.

* Automated DB backup: nightly online backup (the API behind
  `sqlite3 .backup`) to `backups/app-YYYY-MM-DD.db` inside the data dir.
  Retains the most recent `backup_keep_count` files. Uses the
  online-backup API so the live DB isn't locked.

Both tasks share a single background loop that wakes every five minutes and
fires the daily work once during the configured hour — cheap and fits the
existing mailer-style background-loop pattern. Failures are logged but never
crash the supervisor.
"""
from __future__ import annotations

import asyncio
import logging
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path

import aiosqlite

from app.config import settings

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Stage-log pruning
# ---------------------------------------------------------------------------

async def prune_stage_logs(retention_days: int) -> int:
    """NULL out log_text on burnin_stages older than retention_days.

    Only rows that still have a log and have a non-NULL ``finished_at``
    earlier than the cutoff are touched.

    Args:
        retention_days: Age threshold in days; the cutoff is "now (UTC)
            minus this many days".

    Returns:
        The number of rows updated (0 when nothing matched).
    """
    # The cutoff is bound as an ISO-8601 string and compared by SQLite.
    # NOTE(review): this assumes finished_at is stored as an ISO-8601 UTC
    # string in the same format — confirm against the writer of that column.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=retention_days)).isoformat()
    async with aiosqlite.connect(settings.db_path) as db:
        cur = await db.execute(
            """UPDATE burnin_stages
                  SET log_text = NULL
                WHERE log_text IS NOT NULL
                  AND finished_at IS NOT NULL
                  AND finished_at < ?""",
            (cutoff,),
        )
        # rowcount may be None/-1-ish on some drivers; normalise falsy to 0.
        n = cur.rowcount or 0
        await db.commit()
    if n > 0:
        log.info(
            "Retention: pruned log_text on %d stage row(s) older than %d days",
            n, retention_days,
        )
    return n


async def vacuum_db() -> None:
    """Reclaim pages freed by the prune.

    SQLite VACUUM rewrites the file so it must run outside any transaction;
    the connection is therefore opened with ``isolation_level=None``
    (autocommit).
    """
    async with aiosqlite.connect(settings.db_path, isolation_level=None) as db:
        await db.execute("VACUUM")
    log.info("Retention: VACUUM completed")


# ---------------------------------------------------------------------------
# Backup
# ---------------------------------------------------------------------------

def _backup_dir() -> Path:
    """Return the ``backups`` directory that sits next to the live DB file."""
    return Path(settings.db_path).parent / "backups"


async def backup_db(keep_count: int) -> Path | None:
    """Online-backup the live DB to backups/app-YYYY-MM-DD.db.

    Atomicity: writes to a sibling tmp file first and renames into the
    canonical daily slot only after backup succeeds, so the previous
    snapshot stays intact if anything goes wrong. os.replace is atomic
    within the same filesystem on POSIX.

    Tmp handling: the partial tmp file is removed when the backup fails or
    is cancelled, and tmp files left behind by a hard-killed earlier run
    (from any day, not just today) are swept at the start of each run.

    Args:
        keep_count: How many of the most recent snapshots to retain. Values
            below 1 are treated as 1 so the snapshot that was just written
            is never deleted by its own retention pass.

    Returns:
        The path of the new snapshot. (The ``None`` in the annotation is
        kept for interface compatibility; this implementation always
        returns a path or raises.)

    Raises:
        Whatever the SQLite backup or the final rename raises; removal
        failures for old snapshots / stale tmp files are only logged.
    """
    bdir = _backup_dir()
    bdir.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    out = bdir / f"app-{today}.db"
    tmp = bdir / f"app-{today}.db.tmp"

    # Sweep leftovers from interrupted runs. The tmp name embeds the date,
    # so matching only today's name would leak files from earlier days.
    for stale in bdir.glob("app-*.db.tmp"):
        try:
            stale.unlink()
        except OSError as exc:
            log.warning("Retention: could not remove stale tmp %s: %s", stale, exc)

    # aiosqlite.Connection.backup() is an async wrapper around
    # sqlite3.Connection.backup (SQLite's online-backup API).
    try:
        async with aiosqlite.connect(settings.db_path) as src:
            async with aiosqlite.connect(str(tmp)) as dst:
                await src.backup(dst)
        os.replace(tmp, out)
    finally:
        # After a successful replace the tmp no longer exists and this is a
        # no-op; on failure/cancellation it removes the partial file. Both
        # connections are already closed here.
        try:
            tmp.unlink(missing_ok=True)
        except OSError as exc:
            log.warning("Retention: could not remove tmp %s: %s", tmp, exc)

    log.info("Retention: DB backed up to %s (%d bytes)", out, out.stat().st_size)

    # Keep the N most recent backups; delete older.
    keep = max(keep_count, 1)
    snapshots = sorted(
        bdir.glob("app-*.db"),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    for old in snapshots[keep:]:
        if old == out:
            # Never delete the snapshot we just produced, whatever its mtime.
            continue
        try:
            old.unlink()
            log.info("Retention: removed old backup %s", old.name)
        except OSError as exc:
            log.warning("Retention: could not remove %s: %s", old, exc)
    return out


# ---------------------------------------------------------------------------
# Scheduler — single 5-minute tick fires daily-grain work
# ---------------------------------------------------------------------------

_RUN_HOUR = 3  # 03:00 local time — quiet for most homelabs
_TICK_SECONDS = 300  # loop wake-up interval (5 min)

# Per-process bookkeeping (not persisted across restarts):
#   last_run_date  — day on which BOTH daily tasks last completed
#   prune_date     — day on which prune (+ any needed VACUUM) last completed
#   backup_date    — day on which the backup last completed
#   vacuum_pending — set once a prune freed rows and cleared only after a
#                    successful VACUUM, so a failed VACUUM is retried
_state: dict[str, str | None] = {
    "last_run_date": None,
    "prune_date": None,
    "backup_date": None,
    "vacuum_pending": None,
}


async def _run_daily_tasks(today: str) -> bool:
    """Run whichever of today's tasks have not yet succeeded.

    Each task records its own completion date in ``_state`` so that a retry
    only repeats the task that actually failed (e.g. a failing prune does
    not trigger a fresh full backup on every tick).

    Args:
        today: Local date as ``YYYY-MM-DD``.

    Returns:
        True when both prune and backup have succeeded for ``today``.
    """
    if _state.get("prune_date") != today:
        try:
            if await prune_stage_logs(settings.retention_log_days):
                _state["vacuum_pending"] = today
            # Also true on a retry after a failed VACUUM, where the prune
            # itself now reports 0 rows.
            if _state.get("vacuum_pending"):
                await vacuum_db()
                _state["vacuum_pending"] = None
            _state["prune_date"] = today
        except Exception as exc:
            log.exception("Retention: pruning failed: %s", exc)

    if _state.get("backup_date") != today:
        try:
            await backup_db(settings.retention_backup_keep)
            _state["backup_date"] = today
        except Exception as exc:
            log.exception("Retention: backup failed: %s", exc)

    return _state.get("prune_date") == today and _state.get("backup_date") == today


async def run() -> None:
    """Background loop.

    Wakes every 5 min and runs the daily tasks once when the local hour
    matches _RUN_HOUR and we haven't completed them today. The day is marked
    done only when BOTH tasks have succeeded, so a transient failure gets
    retried on the next tick (still within the 03:00 hour). Runs until
    cancelled; cancellation is propagated, every other error is logged.
    """
    log.info(
        "Retention loop started (run at %02d:00 local; prune>%d days; keep %d backups)",
        _RUN_HOUR, settings.retention_log_days, settings.retention_backup_keep,
    )
    while True:
        try:
            now = datetime.now()
            today = now.strftime("%Y-%m-%d")
            if now.hour == _RUN_HOUR and _state["last_run_date"] != today:
                if await _run_daily_tasks(today):
                    _state["last_run_date"] = today
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.exception("Retention loop iteration failed: %s", exc)
        await asyncio.sleep(_TICK_SECONDS)