fix: PRAGMA busy_timeout on every SQLite connection (1.0.0-60)
Jobs 60-63 ran healthy for 16h, then all 4 died simultaneously with 'database is locked'.

The burn-in drain used _db(), which set busy_timeout=10000, but:

1. 10s was sometimes too short under heavy contention (4 burn-in drains writing every 5s + poller every 12s + retention scan + auth + lifespan = many concurrent writers).
2. OTHER aiosqlite.connect() sites (poller, retention, auth, mailer, routes/__init__'s SSE, burnin/__init__.py's various helpers, database.get_db) didn't set busy_timeout at all. Without it, SQLite raises 'database is locked' INSTANTLY on any contention, which forced concurrency back onto the drain's connection.

Fix:

- _db() busy_timeout 10000 → 60000 (60s; aggressive but right for this workload — brief contention spikes are normal and waiting beats failing).
- PRAGMA busy_timeout=60000 added on every aiosqlite.connect() site, next to the existing PRAGMA calls. Applied via a small Python pass that preserves the original variable name (db / _tdb / src / dst etc.) and indentation.

Same restart sequence applied: rebuild container, reset 4 drives, relaunch via loopback bypass. Jobs 64-67 are now running.

This is auto-restart #2 in 24h. Safety brake at 3.
This commit is contained in:
parent
7e42464016
commit
ec636f8f3a
7 changed files with 20 additions and 3 deletions
|
|
@ -93,6 +93,7 @@ async def init(client: TrueNASClient) -> None:
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute("PRAGMA foreign_keys=ON")
|
await db.execute("PRAGMA foreign_keys=ON")
|
||||||
|
|
||||||
# Mark interrupted running jobs as unknown
|
# Mark interrupted running jobs as unknown
|
||||||
|
|
@ -161,6 +162,7 @@ async def start_job(drive_id: int, profile: str, operator: str,
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute("PRAGMA foreign_keys=ON")
|
await db.execute("PRAGMA foreign_keys=ON")
|
||||||
|
|
||||||
# Reject duplicate active burn-in for same drive
|
# Reject duplicate active burn-in for same drive
|
||||||
|
|
@ -261,6 +263,7 @@ async def cancel_job(job_id: int, operator: str) -> bool:
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
|
|
||||||
cur = await db.execute(
|
cur = await db.execute(
|
||||||
"SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,)
|
"SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,)
|
||||||
|
|
@ -345,6 +348,7 @@ async def _run_job(job_id: int) -> None:
|
||||||
# Transition queued → running
|
# Transition queued → running
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
row = await (await db.execute(
|
row = await (await db.execute(
|
||||||
"SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,)
|
"SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,)
|
||||||
)).fetchone()
|
)).fetchone()
|
||||||
|
|
@ -425,6 +429,7 @@ async def _run_job(job_id: int) -> None:
|
||||||
)
|
)
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
|
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
|
||||||
(final_state, 100 if success else None, _now(),
|
(final_state, 100 if success else None, _now(),
|
||||||
|
|
@ -563,6 +568,7 @@ async def check_stuck_jobs() -> None:
|
||||||
async with _db() as db:
|
async with _db() as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
|
|
||||||
cur = await db.execute("""
|
cur = await db.execute("""
|
||||||
SELECT bj.id, bj.drive_id, d.devname, bj.started_at
|
SELECT bj.id, bj.drive_id, d.devname, bj.started_at
|
||||||
|
|
|
||||||
|
|
@ -77,9 +77,13 @@ def _now() -> str:
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def _db():
|
async def _db():
|
||||||
"""Open a WAL-mode connection with busy_timeout so writers wait for the lock
|
"""Open a WAL-mode connection with busy_timeout so writers wait for the lock
|
||||||
instead of immediately raising 'database is locked' under contention."""
|
instead of immediately raising 'database is locked' under contention.
|
||||||
|
|
||||||
|
60s timeout is intentionally generous: with 4 concurrent burn-in drains
|
||||||
|
+ the poller + retention + auth all writing, brief contention spikes
|
||||||
|
are normal and waiting is the right behavior. 10s was too tight."""
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
await db.execute("PRAGMA busy_timeout=10000")
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
yield db
|
yield db
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,7 @@ class Settings(BaseSettings):
|
||||||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||||
|
|
||||||
# Application version — used by the /api/v1/updates/check endpoint
|
# Application version — used by the /api/v1/updates/check endpoint
|
||||||
app_version: str = "1.0.0-59"
|
app_version: str = "1.0.0-60"
|
||||||
|
|
||||||
# ---- Authentication (1.0.0-22) ----
|
# ---- Authentication (1.0.0-22) ----
|
||||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||||
|
|
|
||||||
|
|
@ -176,6 +176,7 @@ async def init_db() -> None:
|
||||||
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute("PRAGMA foreign_keys=ON")
|
await db.execute("PRAGMA foreign_keys=ON")
|
||||||
await db.executescript(SCHEMA)
|
await db.executescript(SCHEMA)
|
||||||
await _run_migrations(db)
|
await _run_migrations(db)
|
||||||
|
|
@ -187,6 +188,7 @@ async def get_db():
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
try:
|
try:
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute("PRAGMA foreign_keys=ON")
|
await db.execute("PRAGMA foreign_keys=ON")
|
||||||
yield db
|
yield db
|
||||||
finally:
|
finally:
|
||||||
|
|
|
||||||
|
|
@ -334,6 +334,7 @@ async def _fetch_report_data() -> list[dict]:
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
return await _fetch_drives_for_template(db)
|
return await _fetch_drives_for_template(db)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -347,6 +348,7 @@ async def _fetch_unlock_events_24h() -> list[dict]:
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
# julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format
|
# julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format
|
||||||
# we write from Python; comparing the raw string against
|
# we write from Python; comparing the raw string against
|
||||||
# datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS')
|
# datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS')
|
||||||
|
|
|
||||||
|
|
@ -437,6 +437,7 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
await db.execute("PRAGMA foreign_keys=ON")
|
await db.execute("PRAGMA foreign_keys=ON")
|
||||||
|
|
||||||
for disk in disks:
|
for disk in disks:
|
||||||
|
|
@ -492,6 +493,7 @@ async def run(client: TrueNASClient) -> None:
|
||||||
async with aiosqlite.connect(settings.db_path) as _tdb:
|
async with aiosqlite.connect(settings.db_path) as _tdb:
|
||||||
_tdb.row_factory = aiosqlite.Row
|
_tdb.row_factory = aiosqlite.Row
|
||||||
await _tdb.execute("PRAGMA journal_mode=WAL")
|
await _tdb.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await _tdb.execute("PRAGMA busy_timeout=60000")
|
||||||
_cur = await _tdb.execute("""
|
_cur = await _tdb.execute("""
|
||||||
SELECT MAX(d.temperature_c)
|
SELECT MAX(d.temperature_c)
|
||||||
FROM drives d
|
FROM drives d
|
||||||
|
|
|
||||||
|
|
@ -128,6 +128,7 @@ async def sse_drives(request: Request):
|
||||||
async with aiosqlite.connect(settings.db_path) as db:
|
async with aiosqlite.connect(settings.db_path) as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
await db.execute("PRAGMA journal_mode=WAL")
|
await db.execute("PRAGMA journal_mode=WAL")
|
||||||
|
await db.execute("PRAGMA busy_timeout=60000")
|
||||||
drives = await _fetch_drives_for_template(db)
|
drives = await _fetch_drives_for_template(db)
|
||||||
|
|
||||||
html = templates.env.get_template(
|
html = templates.env.get_template(
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue