nas-burnin/app/retention.py

"""
Background retention + backup tasks.

* Stage-log pruning: each surface_validate burn-in stage can write tens of
  MB of badblocks output to burnin_stages.log_text. Without retention the
  DB grows unbounded — we observed 447 MB on the live host after a few
  weeks of use. Nightly job nulls log_text on stages older than
  `retention_days`, then VACUUMs to reclaim pages.

* Automated DB backup: nightly online backup (SQLite backup API, via
  aiosqlite) to `backups/app-YYYY-MM-DD.db` inside the data dir. Retains
  the most recent `settings.retention_backup_keep` files. Uses the
  online-backup API so the live DB stays usable while the snapshot is
  taken.

Both tasks share a single background loop that wakes every 5 minutes and
fires the daily work once during the 03:00 local hour — cheap and fits
the existing mailer-style background-loop pattern. Failures are logged
but never crash the supervisor.
"""

from __future__ import annotations

import asyncio
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path

import aiosqlite

from app.config import settings

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Stage-log pruning
# ---------------------------------------------------------------------------

async def prune_stage_logs(retention_days: int) -> int:
    """Clear ``log_text`` on finished stages that fell out of the retention window.

    Only ``burnin_stages`` rows that still carry a log and whose
    ``finished_at`` is set and sorts before (now in UTC - retention_days)
    are touched. Returns how many rows were updated.
    """
    window = timedelta(days=retention_days)
    threshold = datetime.now(timezone.utc) - window
    async with aiosqlite.connect(settings.db_path) as conn:
        cursor = await conn.execute(
            """UPDATE burnin_stages
                  SET log_text = NULL
                WHERE log_text IS NOT NULL
                  AND finished_at IS NOT NULL
                  AND finished_at < ?""",
            (threshold.isoformat(),),
        )
        # Normalise a falsy rowcount to 0 so callers always get an int.
        updated = cursor.rowcount or 0
        await conn.commit()
    if updated <= 0:
        return updated
    log.info("Retention: pruned log_text on %d stage row(s) older than %d days",
             updated, retention_days)
    return updated


async def vacuum_db() -> None:
    """Run VACUUM on the live DB to hand back pages freed by pruning.

    VACUUM rewrites the database file and cannot run inside a
    transaction, hence the dedicated connection opened with
    ``isolation_level=None``.
    """
    connection = aiosqlite.connect(settings.db_path, isolation_level=None)
    async with connection as conn:
        await conn.execute("VACUUM")
    log.info("Retention: VACUUM completed")


# ---------------------------------------------------------------------------
# Backup
# ---------------------------------------------------------------------------

def _backup_dir() -> Path:
    """Return the ``backups`` directory located beside the live DB file."""
    db_file = Path(settings.db_path)
    return db_file.parent.joinpath("backups")


async def backup_db(keep_count: int) -> Path | None:
    """Online-backup the live DB to backups/app-YYYY-MM-DD.db.

    Returns the new file's path. Old backups beyond ``keep_count`` are
    deleted, newest-first by modification time. ``keep_count`` values
    below 1 are treated as 1 so the snapshot that was just written is
    never removed by its own retention pass.

    Atomicity: writes to a sibling tmp file first and renames into the
    canonical daily slot only after the backup succeeds. An interrupted
    backup leaves the tmp file behind; every run starts by removing all
    leftover ``app-*.db.tmp*`` files (from any day), so stale tmp files
    do not accumulate. The previous snapshot stays intact until the
    rename. os.replace is atomic within the same filesystem on POSIX.

    Raises whatever the underlying connect/backup/rename raises; the
    caller is expected to log and retry.
    """
    import os as _os
    bdir = _backup_dir()
    bdir.mkdir(parents=True, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    out = bdir / f"app-{today}.db"
    tmp = bdir / f"app-{today}.db.tmp"

    # Drop leftovers from ANY previous interrupted run, not just today's:
    # the tmp name embeds the date, so a run that failed on an earlier day
    # would otherwise leave its tmp file behind forever. The trailing "*"
    # also catches SQLite sidecar files (e.g. "-journal") that an aborted
    # write to the tmp database may have left next to it.
    for stale in bdir.glob("app-*.db.tmp*"):
        try:
            stale.unlink()
        except OSError as exc:
            # Best-effort: the backup below can still overwrite the file.
            log.warning("Retention: could not remove stale tmp %s: %s", stale, exc)

    # aiosqlite.Connection.backup() is an async wrapper around
    # sqlite3.Connection.backup (SQLite online-backup API).
    # NOTE(review): called without a `pages` argument, so the copy may be
    # done in a single step rather than in batches — confirm against the
    # installed aiosqlite version if writer latency during backup matters.
    async with aiosqlite.connect(settings.db_path) as src:
        async with aiosqlite.connect(str(tmp)) as dst:
            await src.backup(dst)

    _os.replace(tmp, out)
    log.info("Retention: DB backed up to %s (%d bytes)", out, out.stat().st_size)

    # Keep the N most recent backups; delete older. Clamp to at least one
    # so keep_count <= 0 cannot delete the snapshot we just produced (and
    # a negative value cannot slice from the wrong end of the list).
    keep = max(keep_count, 1)
    snapshots = sorted(bdir.glob("app-*.db"), key=lambda p: p.stat().st_mtime,
                       reverse=True)
    for old in snapshots[keep:]:
        if old == out:
            # Guard against mtime ties / clock skew ranking today's file low.
            continue
        try:
            old.unlink()
            log.info("Retention: removed old backup %s", old.name)
        except OSError as exc:
            log.warning("Retention: could not remove %s: %s", old, exc)

    return out


# ---------------------------------------------------------------------------
# Scheduler — single 5-minute tick fires daily-grain work
# ---------------------------------------------------------------------------

# Local hour (24h clock) during which the daily jobs are allowed to fire.
# 03:00 local time — quiet for most homelabs.
_RUN_HOUR = 3

# Mutable scheduler state shared across loop iterations. "last_run_date"
# holds the YYYY-MM-DD string of the last fully successful daily run, or
# None if none has completed since the module was loaded.
_state = dict(last_run_date=None)


async def _prune_and_vacuum() -> None:
    """Prune expired stage logs, then VACUUM if pruned pages are unreclaimed.

    The "vacuum_pending" flag in ``_state`` outlives a failed VACUUM.
    Without it, a retry would prune zero rows (the UPDATE already
    committed), skip the VACUUM, and report success without ever
    reclaiming the freed pages.
    """
    if await prune_stage_logs(settings.retention_log_days):
        _state["vacuum_pending"] = True
    if _state.get("vacuum_pending"):
        await vacuum_db()
        _state["vacuum_pending"] = False


async def _run_daily_tasks(today: str) -> bool:
    """Run the daily tasks that have not yet succeeded for ``today``.

    Prune and backup are tracked independently in ``_state`` so that a
    retry only repeats the task that failed. Failures are logged, never
    raised. Returns True once both tasks have succeeded for ``today``.
    """
    prune_ok = _state.get("prune_done_date") == today
    if not prune_ok:
        try:
            await _prune_and_vacuum()
            _state["prune_done_date"] = today
            prune_ok = True
        except Exception as exc:
            log.exception("Retention: pruning failed: %s", exc)

    backup_ok = _state.get("backup_done_date") == today
    if not backup_ok:
        try:
            await backup_db(settings.retention_backup_keep)
            _state["backup_done_date"] = today
            backup_ok = True
        except Exception as exc:
            log.exception("Retention: backup failed: %s", exc)

    return prune_ok and backup_ok


async def run() -> None:
    """Background loop. Wakes every 5 min, runs the daily tasks once
    when the local hour matches _RUN_HOUR and we haven't run today.

    The day is marked done only when BOTH prune and backup have
    succeeded, so a transient failure is retried on the next 5-min tick
    (still within the _RUN_HOUR hour). A task that already succeeded
    today is not repeated on those retries. Cancellation propagates to
    the caller; any other error is logged and the loop continues.
    """
    log.info(
        "Retention loop started (run at %02d:00 local; prune>%d days; keep %d backups)",
        _RUN_HOUR,
        settings.retention_log_days,
        settings.retention_backup_keep,
    )
    while True:
        try:
            now = datetime.now()
            today = now.strftime("%Y-%m-%d")
            if now.hour == _RUN_HOUR and _state["last_run_date"] != today:
                if await _run_daily_tasks(today):
                    _state["last_run_date"] = today
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.exception("Retention loop iteration failed: %s", exc)
        await asyncio.sleep(300)  # 5 min