fix: PRAGMA busy_timeout on every SQLite connection (1.0.0-60)
Some checks failed
Security scan / pip-audit (push) Has been cancelled
Security scan / bandit (push) Has been cancelled
Security scan / gitleaks (push) Has been cancelled
Security scan / mypy (push) Has been cancelled

Jobs 60-63 ran healthily for 16h, then all 4 died simultaneously with
'database is locked'. The burn-in drain used _db(), which set
busy_timeout=10000, but:

1. 10s was sometimes too short under heavy contention (4 burn-in
   drains writing every 5s + poller every 12s + retention scan +
   auth + lifespan = many concurrent writers).
2. OTHER aiosqlite.connect() sites (poller, retention, auth, mailer,
   routes/__init__'s SSE, burnin/__init__.py's various helpers,
   database.get_db) didn't set busy_timeout at all. Without the
   PRAGMA they fall back to Python sqlite3's default
   connect(timeout=5.0) (a 5s busy timeout, half the drain's 10s)
   and raise 'database is locked' first under contention, which
   forced concurrency back onto the drain's connection.

Fix:
- _db() busy_timeout 10000 → 60000 (60s; aggressive but right for
  this workload — brief contention spikes are normal and waiting
  beats failing).
- PRAGMA busy_timeout=60000 added on every aiosqlite.connect() site
  next to the existing PRAGMA calls. Applied via a small Python
  pass that preserves the original variable name (db / _tdb / src
  / dst etc.) and indentation.

Same restart sequence applied: rebuild container, reset 4 drives,
relaunch via loopback bypass. Jobs 64-67 are now running.

This is auto-restart #2 in 24h. Safety brake at 3.
This commit is contained in:
Brandon Walter 2026-05-14 06:39:33 -04:00
parent 7e42464016
commit ec636f8f3a
7 changed files with 20 additions and 3 deletions

View file

@ -93,6 +93,7 @@ async def init(client: TrueNASClient) -> None:
async with _db() as db: async with _db() as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute("PRAGMA foreign_keys=ON") await db.execute("PRAGMA foreign_keys=ON")
# Mark interrupted running jobs as unknown # Mark interrupted running jobs as unknown
@ -161,6 +162,7 @@ async def start_job(drive_id: int, profile: str, operator: str,
async with _db() as db: async with _db() as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute("PRAGMA foreign_keys=ON") await db.execute("PRAGMA foreign_keys=ON")
# Reject duplicate active burn-in for same drive # Reject duplicate active burn-in for same drive
@ -261,6 +263,7 @@ async def cancel_job(job_id: int, operator: str) -> bool:
async with _db() as db: async with _db() as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
cur = await db.execute( cur = await db.execute(
"SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,) "SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,)
@ -345,6 +348,7 @@ async def _run_job(job_id: int) -> None:
# Transition queued → running # Transition queued → running
async with _db() as db: async with _db() as db:
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
row = await (await db.execute( row = await (await db.execute(
"SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,) "SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,)
)).fetchone() )).fetchone()
@ -425,6 +429,7 @@ async def _run_job(job_id: int) -> None:
) )
async with _db() as db: async with _db() as db:
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute( await db.execute(
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?", "UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
(final_state, 100 if success else None, _now(), (final_state, 100 if success else None, _now(),
@ -563,6 +568,7 @@ async def check_stuck_jobs() -> None:
async with _db() as db: async with _db() as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
cur = await db.execute(""" cur = await db.execute("""
SELECT bj.id, bj.drive_id, d.devname, bj.started_at SELECT bj.id, bj.drive_id, d.devname, bj.started_at

View file

@ -77,9 +77,13 @@ def _now() -> str:
@asynccontextmanager @asynccontextmanager
async def _db(): async def _db():
"""Open a WAL-mode connection with busy_timeout so writers wait for the lock """Open a WAL-mode connection with busy_timeout so writers wait for the lock
instead of immediately raising 'database is locked' under contention.""" instead of immediately raising 'database is locked' under contention.
60s timeout is intentionally generous: with 4 concurrent burn-in drains
+ the poller + retention + auth all writing, brief contention spikes
are normal and waiting is the right behavior. 10s was too tight."""
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
await db.execute("PRAGMA busy_timeout=10000") await db.execute("PRAGMA busy_timeout=60000")
yield db yield db

View file

@ -86,7 +86,7 @@ class Settings(BaseSettings):
ssh_key: str = "" # PEM private key content (paste full key including headers) ssh_key: str = "" # PEM private key content (paste full key including headers)
# Application version — used by the /api/v1/updates/check endpoint # Application version — used by the /api/v1/updates/check endpoint
app_version: str = "1.0.0-59" app_version: str = "1.0.0-60"
# ---- Authentication (1.0.0-22) ---- # ---- Authentication (1.0.0-22) ----
# session_secret: HMAC key for signing session cookies. Empty = generate # session_secret: HMAC key for signing session cookies. Empty = generate

View file

@ -176,6 +176,7 @@ async def init_db() -> None:
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True) Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute("PRAGMA foreign_keys=ON") await db.execute("PRAGMA foreign_keys=ON")
await db.executescript(SCHEMA) await db.executescript(SCHEMA)
await _run_migrations(db) await _run_migrations(db)
@ -187,6 +188,7 @@ async def get_db():
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
try: try:
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute("PRAGMA foreign_keys=ON") await db.execute("PRAGMA foreign_keys=ON")
yield db yield db
finally: finally:

View file

@ -334,6 +334,7 @@ async def _fetch_report_data() -> list[dict]:
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
return await _fetch_drives_for_template(db) return await _fetch_drives_for_template(db)
@ -347,6 +348,7 @@ async def _fetch_unlock_events_24h() -> list[dict]:
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
# julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format # julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format
# we write from Python; comparing the raw string against # we write from Python; comparing the raw string against
# datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS') # datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS')

View file

@ -437,6 +437,7 @@ async def poll_cycle(client: TrueNASClient) -> int:
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
await db.execute("PRAGMA foreign_keys=ON") await db.execute("PRAGMA foreign_keys=ON")
for disk in disks: for disk in disks:
@ -492,6 +493,7 @@ async def run(client: TrueNASClient) -> None:
async with aiosqlite.connect(settings.db_path) as _tdb: async with aiosqlite.connect(settings.db_path) as _tdb:
_tdb.row_factory = aiosqlite.Row _tdb.row_factory = aiosqlite.Row
await _tdb.execute("PRAGMA journal_mode=WAL") await _tdb.execute("PRAGMA journal_mode=WAL")
await _tdb.execute("PRAGMA busy_timeout=60000")
_cur = await _tdb.execute(""" _cur = await _tdb.execute("""
SELECT MAX(d.temperature_c) SELECT MAX(d.temperature_c)
FROM drives d FROM drives d

View file

@ -128,6 +128,7 @@ async def sse_drives(request: Request):
async with aiosqlite.connect(settings.db_path) as db: async with aiosqlite.connect(settings.db_path) as db:
db.row_factory = aiosqlite.Row db.row_factory = aiosqlite.Row
await db.execute("PRAGMA journal_mode=WAL") await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA busy_timeout=60000")
drives = await _fetch_drives_for_template(db) drives = await _fetch_drives_for_template(db)
html = templates.env.get_template( html = templates.env.get_template(