From ec636f8f3a3e8805b505e62f6e21e9fbab5ea22b Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Thu, 14 May 2026 06:39:33 -0400 Subject: [PATCH] fix: PRAGMA busy_timeout on every SQLite connection (1.0.0-60) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jobs 60-63 ran healthy for 16h, then all 4 died simultaneously with 'database is locked'. The burnin drain used _db(), which set busy_timeout=10000, but: 1. 10s was sometimes too short under heavy contention (4 burn-in drains writing every 5s + poller every 12s + retention scan + auth + lifespan = many concurrent writers). 2. OTHER aiosqlite.connect() sites (poller, retention, auth, mailer, routes/__init__'s SSE, burnin/__init__.py's various helpers, database.get_db) didn't set busy_timeout at all. Without it, a connection falls back to the driver's default lock wait (Python's sqlite3.connect() defaults to timeout=5.0, and these sites pass no timeout), so they raise 'database is locked' after a much shorter wait than the drain under contention, which forced concurrency back onto the drain's connection. Fix: - _db() busy_timeout 10000 → 60000 (60s; aggressive but right for this workload — brief contention spikes are normal and waiting beats failing). - PRAGMA busy_timeout=60000 added on every aiosqlite.connect() site next to the existing PRAGMA calls. Applied via a small Python pass that preserves the original variable name (db / _tdb / src / dst etc.) and indentation. Same restart sequence applied: rebuild container, reset 4 drives, relaunch via loopback bypass. Jobs 64-67 are now running. This is auto-restart #2 in 24h. Safety brake at 3. 
--- app/burnin/__init__.py | 6 ++++++ app/burnin/_common.py | 8 ++++++-- app/config.py | 2 +- app/database.py | 2 ++ app/mailer.py | 2 ++ app/poller.py | 2 ++ app/routes/__init__.py | 1 + 7 files changed, 20 insertions(+), 3 deletions(-) diff --git a/app/burnin/__init__.py b/app/burnin/__init__.py index 47657b4..5a340e5 100644 --- a/app/burnin/__init__.py +++ b/app/burnin/__init__.py @@ -93,6 +93,7 @@ async def init(client: TrueNASClient) -> None: async with _db() as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute("PRAGMA foreign_keys=ON") # Mark interrupted running jobs as unknown @@ -161,6 +162,7 @@ async def start_job(drive_id: int, profile: str, operator: str, async with _db() as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute("PRAGMA foreign_keys=ON") # Reject duplicate active burn-in for same drive @@ -261,6 +263,7 @@ async def cancel_job(job_id: int, operator: str) -> bool: async with _db() as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") cur = await db.execute( "SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,) @@ -345,6 +348,7 @@ async def _run_job(job_id: int) -> None: # Transition queued → running async with _db() as db: await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") row = await (await db.execute( "SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,) )).fetchone() @@ -425,6 +429,7 @@ async def _run_job(job_id: int) -> None: ) async with _db() as db: await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute( "UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? 
WHERE id=?", (final_state, 100 if success else None, _now(), @@ -563,6 +568,7 @@ async def check_stuck_jobs() -> None: async with _db() as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") cur = await db.execute(""" SELECT bj.id, bj.drive_id, d.devname, bj.started_at diff --git a/app/burnin/_common.py b/app/burnin/_common.py index 2f3b50c..79d73cc 100644 --- a/app/burnin/_common.py +++ b/app/burnin/_common.py @@ -77,9 +77,13 @@ def _now() -> str: @asynccontextmanager async def _db(): """Open a WAL-mode connection with busy_timeout so writers wait for the lock - instead of immediately raising 'database is locked' under contention.""" + instead of immediately raising 'database is locked' under contention. + + 60s timeout is intentionally generous: with 4 concurrent burn-in drains + + the poller + retention + auth all writing, brief contention spikes + are normal and waiting is the right behavior. 10s was too tight.""" async with aiosqlite.connect(settings.db_path) as db: - await db.execute("PRAGMA busy_timeout=10000") + await db.execute("PRAGMA busy_timeout=60000") yield db diff --git a/app/config.py b/app/config.py index 51d8766..dfbc0b3 100644 --- a/app/config.py +++ b/app/config.py @@ -86,7 +86,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-59" + app_version: str = "1.0.0-60" # ---- Authentication (1.0.0-22) ---- # session_secret: HMAC key for signing session cookies. 
Empty = generate diff --git a/app/database.py b/app/database.py index e2a2067..803b1e8 100644 --- a/app/database.py +++ b/app/database.py @@ -176,6 +176,7 @@ async def init_db() -> None: Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True) async with aiosqlite.connect(settings.db_path) as db: await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute("PRAGMA foreign_keys=ON") await db.executescript(SCHEMA) await _run_migrations(db) @@ -187,6 +188,7 @@ async def get_db(): db.row_factory = aiosqlite.Row try: await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute("PRAGMA foreign_keys=ON") yield db finally: diff --git a/app/mailer.py b/app/mailer.py index 0d26e3c..3df806b 100644 --- a/app/mailer.py +++ b/app/mailer.py @@ -334,6 +334,7 @@ async def _fetch_report_data() -> list[dict]: async with aiosqlite.connect(settings.db_path) as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") return await _fetch_drives_for_template(db) @@ -347,6 +348,7 @@ async def _fetch_unlock_events_24h() -> list[dict]: async with aiosqlite.connect(settings.db_path) as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") # julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format # we write from Python; comparing the raw string against # datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS') diff --git a/app/poller.py b/app/poller.py index 25ecc3f..5c83826 100644 --- a/app/poller.py +++ b/app/poller.py @@ -437,6 +437,7 @@ async def poll_cycle(client: TrueNASClient) -> int: async with aiosqlite.connect(settings.db_path) as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") await db.execute("PRAGMA foreign_keys=ON") for disk in disks: @@ -492,6 
+493,7 @@ async def run(client: TrueNASClient) -> None: async with aiosqlite.connect(settings.db_path) as _tdb: _tdb.row_factory = aiosqlite.Row await _tdb.execute("PRAGMA journal_mode=WAL") + await _tdb.execute("PRAGMA busy_timeout=60000") _cur = await _tdb.execute(""" SELECT MAX(d.temperature_c) FROM drives d diff --git a/app/routes/__init__.py b/app/routes/__init__.py index 75a639c..3d2665a 100644 --- a/app/routes/__init__.py +++ b/app/routes/__init__.py @@ -128,6 +128,7 @@ async def sse_drives(request: Request): async with aiosqlite.connect(settings.db_path) as db: db.row_factory = aiosqlite.Row await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA busy_timeout=60000") drives = await _fetch_drives_for_template(db) html = templates.env.get_template(