Compare commits
26 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ec636f8f3a | ||
|
|
7e42464016 | ||
|
|
129f233e0a | ||
|
|
7c3873dd5e | ||
|
|
f71ae341f5 | ||
|
|
71eac9cba0 | ||
|
|
149f2901b7 | ||
|
|
c906ab15f7 | ||
|
|
c5a41d0260 | ||
|
|
2107981cf1 | ||
|
|
659f540270 | ||
|
|
1bc1b378ab | ||
|
|
7f959e6f4c | ||
|
|
28d046f42e | ||
|
|
f5c6b85402 | ||
|
|
383258df97 | ||
|
|
6b2367b892 | ||
|
|
1393ba0bc8 | ||
|
|
30062affc2 | ||
|
|
4922b19a9f | ||
|
|
b406e3f315 | ||
|
|
775251b993 | ||
|
|
8ae84862de | ||
|
|
d38807f957 | ||
|
|
7cd66d460f | ||
|
|
cd92a4d3c8 |
33 changed files with 1601 additions and 164 deletions
22
CLAUDE.md
22
CLAUDE.md
|
|
@ -11,8 +11,8 @@ A self-hosted web dashboard for running and tracking hard-drive burn-in tests
|
|||
against a TrueNAS SCALE 25.10 instance. Deployed on **maple.local** (10.0.0.138).
|
||||
|
||||
- **App URL**: http://10.0.0.138:8084 (or http://burnin.hellocomputer.xyz)
|
||||
- **Stack path on maple.local**: `~/docker/stacks/truenas-burnin/`
|
||||
- **Source (local mac)**: `~/Desktop/claudesandbox/truenas-burnin/`
|
||||
- **Stack path on maple.local**: `~/docker/stacks/nas-burnin/`
|
||||
- **Source (local mac)**: `~/Desktop/claudesandbox/nas-burnin/`
|
||||
- **Compose synced to maple.local** via `scp` or manual copy
|
||||
|
||||
### Stages completed
|
||||
|
|
@ -36,7 +36,7 @@ against a TrueNAS SCALE 25.10 instance. Deployed on **maple.local** (10.0.0.138)
|
|||
## File Map
|
||||
|
||||
```
|
||||
truenas-burnin/
|
||||
nas-burnin/
|
||||
├── docker-compose.yml # two services: mock-truenas + app
|
||||
├── Dockerfile # app container
|
||||
├── requirements.txt
|
||||
|
|
@ -222,18 +222,18 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
|
|||
### First deploy (already done)
|
||||
```bash
|
||||
# On maple.local
|
||||
cd ~/docker/stacks/truenas-burnin
|
||||
cd ~/docker/stacks/nas-burnin
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
### Redeploy after code changes
|
||||
```bash
|
||||
# Copy changed files from mac to maple.local first, e.g.:
|
||||
scp -P 2225 -r app/ brandon@10.0.0.138:~/docker/stacks/truenas-burnin/
|
||||
scp -P 2225 -r app/ brandon@10.0.0.138:~/docker/stacks/nas-burnin/
|
||||
|
||||
# Then on maple.local:
|
||||
ssh brandon@10.0.0.138 -p 2225
|
||||
cd ~/docker/stacks/truenas-burnin
|
||||
cd ~/docker/stacks/nas-burnin
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
|
|
@ -242,7 +242,7 @@ docker compose up -d --build
|
|||
# On maple.local — stop containers first
|
||||
docker compose stop app
|
||||
# Delete DB using alpine (container owns the file, sudo not available)
|
||||
docker run --rm -v ~/docker/stacks/truenas-burnin/data:/data alpine rm -f /data/app.db
|
||||
docker run --rm -v ~/docker/stacks/nas-burnin/data:/data alpine rm -f /data/app.db
|
||||
docker compose start app
|
||||
```
|
||||
|
||||
|
|
@ -350,7 +350,7 @@ async def burnin_get(job_id: int, ...): ...
|
|||
### `requirements.txt` is unpinned
|
||||
Every `docker compose up -d --build` pulls latest of fastapi, starlette, jinja2, asyncssh, etc. The Starlette 1.0 regression on 2026-04-27 is a direct consequence. **Either pin to known-good versions, or audit installed versions immediately after each rebuild** with:
|
||||
```bash
|
||||
docker exec truenas-burnin python3 -c "import fastapi, starlette, jinja2; print(fastapi.__version__, starlette.__version__, jinja2.__version__)"
|
||||
docker exec nas-burnin python3 -c "import fastapi, starlette, jinja2; print(fastapi.__version__, starlette.__version__, jinja2.__version__)"
|
||||
```
|
||||
|
||||
### Local source ↔ maple host can drift
|
||||
|
|
@ -358,8 +358,8 @@ The deploy convention is `scp -r app/` from mac to maple, but if you ever edit o
|
|||
|
||||
**Always `diff -u` before bulk scp:**
|
||||
```bash
|
||||
ssh -p 2225 brandon@10.0.0.138 'cat ~/docker/stacks/truenas-burnin/app/routes.py' > /tmp/deployed_routes.py
|
||||
diff -u /tmp/deployed_routes.py ~/Desktop/claudesandbox/truenas-burnin/app/routes.py
|
||||
ssh -p 2225 brandon@10.0.0.138 'cat ~/docker/stacks/nas-burnin/app/routes.py' > /tmp/deployed_routes.py
|
||||
diff -u /tmp/deployed_routes.py ~/Desktop/claudesandbox/nas-burnin/app/routes.py
|
||||
```
|
||||
When sides have conflicting edits, prefer **patching the host file in place + rebuild** over a destructive scp.
|
||||
|
||||
|
|
@ -427,7 +427,7 @@ SMART attrs stored as JSON blob in `drives.smart_attrs`. Updated by `final_check
|
|||
|
||||
Settings page has a "Check for Updates" button that fetches:
|
||||
```
|
||||
GET https://git.hellocomputer.xyz/api/v1/repos/brandon/truenas-burnin/releases/latest
|
||||
GET https://git.hellocomputer.xyz/api/v1/repos/brandon/nas-burnin/releases/latest
|
||||
```
|
||||
Compares tag name against `settings.app_version`; shows "up to date" or "v{tag} available".
|
||||
|
||||
|
|
|
|||
113
README.md
113
README.md
|
|
@ -37,7 +37,7 @@ open http://localhost:8084 # or your host's IP
|
|||
If you set `INITIAL_ADMIN_*` env vars *and* the users table is empty, that
|
||||
account is created on startup automatically. After that the env vars are
|
||||
ignored — change passwords from the UI ("Change password" header link) or
|
||||
the CLI (`docker exec -it truenas-burnin python -m app.auth_cli reset
|
||||
the CLI (`docker exec -it nas-burnin python -m app.auth_cli reset
|
||||
<username>`).
|
||||
|
||||
---
|
||||
|
|
@ -83,11 +83,12 @@ runtime roughly in half at ~2× RAM cost — matches the upstream
|
|||
|
||||
### Watch out
|
||||
|
||||
- **Stuck-job timeout** — `stuck_job_hours` (default 24) marks any job
|
||||
past that threshold as `unknown` and kills the remote process. If
|
||||
you're burning in 14 TB drives with default block size, raise this to
|
||||
**48** in Settings before starting, or you'll get false positives near
|
||||
the end of surface_validate.
|
||||
- **Stuck-job timeout** — `stuck_job_hours` (default 168 = 7 days)
|
||||
marks any job past that threshold as `unknown` and kills the remote
|
||||
process. The default covers `-w` surface_validate on 14 TB+ HDDs with
|
||||
margin. If you're burning in small, fast SSDs and want faster detection of
|
||||
genuinely stuck jobs, drop it. (Earlier versions defaulted to 24h
|
||||
which produced false positives on multi-TB drives.)
|
||||
- **Thermal gate** — if drives currently under burn-in hit the
|
||||
temperature warning threshold, new jobs wait up to 3 minutes before
|
||||
acquiring a slot. Increase `temp_warn_c` if your chassis runs hot but
|
||||
|
|
@ -105,6 +106,91 @@ Click the red ✕ next to a running job. The orchestrator:
|
|||
Cancellations are durable — restart the container and queued jobs resume,
|
||||
cancelled jobs stay cancelled.
|
||||
|
||||
### Job states explained
|
||||
|
||||
| State | When it's set |
|
||||
|-------------|-------------------------------------------------------------------------------|
|
||||
| `queued` | Submitted, waiting for a `max_parallel_burnins` slot |
|
||||
| `running` | Actively executing some stage |
|
||||
| `passed` | All stages finished green |
|
||||
| `failed` | A stage failed deterministically (bad blocks > threshold, SMART failure, etc.) |
|
||||
| `cancelled` | Operator clicked ✕ |
|
||||
| `unknown` | Job was alive but its outcome is indeterminate — see below |
|
||||
|
||||
`unknown` fires in two situations:
|
||||
|
||||
1. The stuck-job detector (`stuck_job_hours`, default 7 days) trips because
|
||||
the job has been running too long without finishing.
|
||||
2. The asyncio task got cancelled mid-stage by something *other* than an
|
||||
operator click — usually a container restart (`docker compose up -d`,
|
||||
`--build`, or the host rebooting). Burn-in source code goes through
|
||||
the Dockerfile `COPY`, so any source-code deploy recreates the
|
||||
container, drops the SSH connection to TrueNAS, and would orphan the
|
||||
running burn-in. Avoid `--build` while burn-ins are active.
|
||||
|
||||
When `unknown` fires the drawer's per-stage Reason block shows
|
||||
*"Task cancelled mid-run — likely container restart or shutdown"* so the
|
||||
classification is explicit, not silent.
|
||||
|
||||
---
|
||||
|
||||
## Drive drawer
|
||||
|
||||
Click any drive row to slide a detail drawer down from the top. Three tabs:
|
||||
|
||||
- **Burn-In** — per-stage breakdown of the latest job
|
||||
- **SMART** — short/long test states + cached SMART attributes
|
||||
- **Events** — last 50 audit events for the drive
|
||||
|
||||
### Surface-validate visualization
|
||||
|
||||
For drives in a `surface_validate` stage (running or finished), the Burn-In
|
||||
tab renders:
|
||||
|
||||
1. **Vital-signs strip** — `Start` (with date) · `Elapsed` · `ETA` (duration
|
||||
remaining) · `Finish` (wall-clock estimate, browser-local timezone) ·
|
||||
`Temp` (cool/warm/hot colour). Computed from data in the drawer payload;
|
||||
ETA + Finish suppressed below 0.5% so you don't see a "Finish: Jun 22"
|
||||
stutter at the very start.
|
||||
2. **Four pattern meters** — `0xaa` / `0x55` / `0xff` / `0x00`. Each meter
|
||||
is split into a left half (write phase, blue) and a right half (verify
|
||||
phase, green). Current pattern's label glows blue; completed patterns'
|
||||
labels go green. This translates badblocks's per-phase percent into
|
||||
monotonic 0-99% overall progress, so the bar never appears to "rewind"
|
||||
when a new phase starts.
|
||||
3. **Phase caption** — explicit text: *"Pattern 2 of 4 · Verify 0x55 · 47%
|
||||
within phase"*. Makes the visual grammar unambiguous.
|
||||
4. **Completed-pattern history** — once pattern 1 finishes, a chip appears
|
||||
showing `0xaa: 14h 22m`. Lets you predict the rest of the run from the
|
||||
first pattern's elapsed time.
|
||||
|
||||
### Failure reason block
|
||||
|
||||
Stages that ended `failed` / `cancelled` / `unknown` show a coloured Reason
|
||||
pill at the top of the stage section. Sources, in order of preference:
|
||||
|
||||
1. The stage's own `error_text`
|
||||
2. The parent job's `error_text` (backfilled by the drawer when the stage's
|
||||
own is empty — catches orphan rows from hard crashes)
|
||||
3. A heuristic: if the log is tiny and no real progress was recorded,
|
||||
*"Stopped without recording an error — likely cause: SSH connection drop
|
||||
or container restart while this stage was running"*
|
||||
|
||||
Otherwise: *"No error message recorded."* — there's never a blank where you
|
||||
expect to see why something broke.
|
||||
|
||||
### Column sorting
|
||||
|
||||
Click any column header (Drive, Serial, Size, Temp, Health, Short SMART,
|
||||
Long SMART, Burn-In) to sort. Cycle: ascending → descending → cleared. Sort
|
||||
state persists in `localStorage` so it survives page reload AND every
|
||||
SSE-driven tbody refresh (~12 s poll cycle). Empty values always sink to
|
||||
the bottom regardless of direction.
|
||||
|
||||
Sortable values are emitted as `data-sort-*` attributes on each `<tr>`,
|
||||
with numeric priority maps for SMART states (e.g. `running` always sorts
|
||||
ahead of `idle`).
|
||||
|
||||
---
|
||||
|
||||
## Drive locks
|
||||
|
|
@ -144,7 +230,8 @@ All settings live under `/settings` (header link). Key knobs:
|
|||
- **`surface_validate_block_size` / `_block_buffer` / `_passes`** —
|
||||
badblocks `-b` / `-c` / `-p`. Defaults preserve original behaviour;
|
||||
tune for speed vs paranoia.
|
||||
- **`stuck_job_hours`** (default 24) — raise for big drives.
|
||||
- **`stuck_job_hours`** (default 168 = 7 days) — covers 14 TB+ HDDs;
|
||||
drop for faster detection on small fast drives.
|
||||
- **`temp_warn_c` / `temp_crit_c`** — thermal gating thresholds.
|
||||
- **`bad_block_threshold`** (default 0) — number of bad blocks
|
||||
surface_validate tolerates before failing the stage.
|
||||
|
|
@ -172,17 +259,17 @@ Configure SMTP in Settings → Email. Includes a "Test SMTP" button.
|
|||
### Logs
|
||||
|
||||
```bash
|
||||
docker logs -f truenas-burnin
|
||||
docker logs -f nas-burnin
|
||||
# JSON-structured. Filter with jq:
|
||||
docker logs truenas-burnin 2>&1 | jq -rR 'fromjson? | "\(.ts) \(.level) \(.msg)"'
|
||||
docker logs nas-burnin 2>&1 | jq -rR 'fromjson? | "\(.ts) \(.level) \(.msg)"'
|
||||
```
|
||||
|
||||
### User management
|
||||
|
||||
```bash
|
||||
docker exec -it truenas-burnin python -m app.auth_cli list
|
||||
docker exec -it truenas-burnin python -m app.auth_cli add <username>
|
||||
docker exec -it truenas-burnin python -m app.auth_cli reset <username>
|
||||
docker exec -it nas-burnin python -m app.auth_cli list
|
||||
docker exec -it nas-burnin python -m app.auth_cli add <username>
|
||||
docker exec -it nas-burnin python -m app.auth_cli reset <username>
|
||||
```
|
||||
|
||||
Passwords are read from a TTY prompt; never accept them on the command
|
||||
|
|
@ -259,7 +346,7 @@ pinned version after the fact.
|
|||
- `CLAUDE.md` — full architecture, file map, deploy workflow, and the
|
||||
rationale behind every non-obvious design decision.
|
||||
- `SPEC.md` — canonical feature reference per version.
|
||||
- `tests/` — `python -m unittest discover tests/` (44 tests, stdlib-only).
|
||||
- `tests/` — `python -m unittest discover tests/` (65 tests, stdlib-only). Or run inside the deployed container with `scripts/run-tests.sh`.
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
4
SPEC.md
4
SPEC.md
|
|
@ -251,8 +251,8 @@ The API makes this app a strong candidate for MCP server integration, allowing a
|
|||
Docker Compose. Minimum viable setup:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/yourusername/truenas-burnin
|
||||
cd truenas-burnin
|
||||
git clone https://github.com/yourusername/nas-burnin
|
||||
cd nas-burnin
|
||||
cp .env.example .env
|
||||
# Edit .env for system-level settings (TrueNAS URL, poll interval, etc.)
|
||||
docker compose up -d
|
||||
|
|
|
|||
|
|
@ -72,6 +72,14 @@ class User:
|
|||
is_admin: bool
|
||||
|
||||
|
||||
def LoopbackUser(username: str = "monitor", full_name: str = "Autonomous Monitor") -> User:
    """Return the synthetic admin identity used by the loopback bypass.

    This is the user the loopback bypass in _AuthGateMiddleware relies on.
    It is only reachable when request.client.host is 127.0.0.1 / ::1 —
    a process inside the container's network namespace (docker exec).

    The account has id=0 (no real DB row) and is_admin=True so that
    admin-gated routes work.
    """
    synthetic_fields = {
        "id": 0,  # sentinel: no backing row in the users table
        "username": username,
        "full_name": full_name,
        "is_admin": True,
    }
    return User(**synthetic_fields)
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
"""Password reset / user management CLI.
|
||||
|
||||
Run inside the container:
|
||||
docker exec -it truenas-burnin python -m app.auth_cli reset <username>
|
||||
docker exec -it truenas-burnin python -m app.auth_cli list
|
||||
docker exec -it truenas-burnin python -m app.auth_cli add <username>
|
||||
docker exec -it nas-burnin python -m app.auth_cli reset <username>
|
||||
docker exec -it nas-burnin python -m app.auth_cli list
|
||||
docker exec -it nas-burnin python -m app.auth_cli add <username>
|
||||
|
||||
Reads the password from a TTY prompt — never accept it on the command
|
||||
line so it doesn't leak into shell history.
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ async def init(client: TrueNASClient) -> None:
|
|||
async with _db() as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
# Mark interrupted running jobs as unknown
|
||||
|
|
@ -161,6 +162,7 @@ async def start_job(drive_id: int, profile: str, operator: str,
|
|||
async with _db() as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
# Reject duplicate active burn-in for same drive
|
||||
|
|
@ -261,6 +263,7 @@ async def cancel_job(job_id: int, operator: str) -> bool:
|
|||
async with _db() as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
|
||||
cur = await db.execute(
|
||||
"SELECT state, drive_id FROM burnin_jobs WHERE id=?", (job_id,)
|
||||
|
|
@ -345,6 +348,7 @@ async def _run_job(job_id: int) -> None:
|
|||
# Transition queued → running
|
||||
async with _db() as db:
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
row = await (await db.execute(
|
||||
"SELECT drive_id, profile FROM burnin_jobs WHERE id=?", (job_id,)
|
||||
)).fetchone()
|
||||
|
|
@ -411,12 +415,34 @@ async def _run_job(job_id: int) -> None:
|
|||
final_state = "unknown"
|
||||
else:
|
||||
final_state = "passed" if success else "failed"
|
||||
# If the asyncio task was cancelled mid-stage (container shutdown,
|
||||
# uvicorn reload, etc.), CancelledError propagates past
|
||||
# _execute_stages, so any running stage row is still marked
|
||||
# 'running' in the DB. Reconcile here: mark every still-running
|
||||
# stage on this job as 'unknown' with the parent's finished_at,
|
||||
# and stamp a default error_text so the drawer's Reason block has
|
||||
# something concrete to show. Use a write that's idempotent under
|
||||
# repeat (only touches rows still 'running').
|
||||
cancel_err = (
|
||||
"Task cancelled mid-run — likely container restart or shutdown"
|
||||
if was_cancelled else None
|
||||
)
|
||||
async with _db() as db:
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute(
|
||||
"UPDATE burnin_jobs SET state=?, percent=?, finished_at=?, error_text=? WHERE id=?",
|
||||
(final_state, 100 if success else None, _now(), error_text, job_id),
|
||||
(final_state, 100 if success else None, _now(),
|
||||
error_text or cancel_err, job_id),
|
||||
)
|
||||
if was_cancelled:
|
||||
await db.execute(
|
||||
"""UPDATE burnin_stages
|
||||
SET state='unknown', finished_at=?,
|
||||
error_text=COALESCE(error_text, ?)
|
||||
WHERE burnin_job_id=? AND state='running'""",
|
||||
(_now(), cancel_err, job_id),
|
||||
)
|
||||
await db.execute(
|
||||
"""INSERT INTO audit_events (event_type, drive_id, burnin_job_id, operator, message)
|
||||
VALUES (?,?,?,(SELECT operator FROM burnin_jobs WHERE id=?),?)""",
|
||||
|
|
@ -542,6 +568,7 @@ async def check_stuck_jobs() -> None:
|
|||
async with _db() as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
|
||||
cur = await db.execute("""
|
||||
SELECT bj.id, bj.drive_id, d.devname, bj.started_at
|
||||
|
|
|
|||
|
|
@ -77,9 +77,13 @@ def _now() -> str:
|
|||
@asynccontextmanager
|
||||
async def _db():
|
||||
"""Open a WAL-mode connection with busy_timeout so writers wait for the lock
|
||||
instead of immediately raising 'database is locked' under contention."""
|
||||
instead of immediately raising 'database is locked' under contention.
|
||||
|
||||
60s timeout is intentionally generous: with 4 concurrent burn-in drains
|
||||
+ the poller + retention + auth all writing, brief contention spikes
|
||||
are normal and waiting is the right behavior. 10s was too tight."""
|
||||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
await db.execute("PRAGMA busy_timeout=10000")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
yield db
|
||||
|
||||
|
||||
|
|
@ -190,6 +194,72 @@ async def _update_stage_bad_blocks(job_id: int, stage_name: str, count: int) ->
|
|||
await db.commit()
|
||||
|
||||
|
||||
async def _update_stage_bb_phase(
    job_id: int, stage_name: str, phase: int, phase_pct: float,
) -> None:
    """Store the current badblocks phase and in-phase percent on a stage row.

    The drive-drawer UI reads these columns to render 4 meters with
    separate write/verify halves.
    """
    sql = (
        "UPDATE burnin_stages SET bb_phase=?, bb_phase_pct=? "
        "WHERE burnin_job_id=? AND stage_name=?"
    )
    params = (phase, phase_pct, job_id, stage_name)
    async with _db() as conn:
        await conn.execute("PRAGMA journal_mode=WAL")
        await conn.execute(sql, params)
        await conn.commit()
|
||||
|
||||
|
||||
async def _update_stage_bb_mbps(
    job_id: int, stage_name: str, mbps: float,
) -> None:
    """Store the live throughput figure shown in the surface_validate meter strip.

    The caller derives the value from delta_overall_pct between successive
    badblocks progress lines, scaled by drive size_bytes / 800
    (8 phases × 100).
    """
    sql = (
        "UPDATE burnin_stages SET bb_mbps=? "
        "WHERE burnin_job_id=? AND stage_name=?"
    )
    params = (mbps, job_id, stage_name)
    async with _db() as conn:
        await conn.execute("PRAGMA journal_mode=WAL")
        await conn.execute(sql, params)
        await conn.commit()
|
||||
|
||||
|
||||
async def _record_bb_phase_start(
    job_id: int, stage_name: str, phase: int, ts: str,
) -> None:
    """Record the moment a phase first becomes current.

    Idempotent: re-entry of the same phase keeps the original timestamp
    so a transient parser reset doesn't blow away history. When the phase
    is already recorded nothing is written at all.

    Stored as a JSON object keyed by phase number (string) in
    burnin_stages.bb_phase_history. The drawer reads it to compute
    per-pattern elapsed times.

    Args:
        job_id: burnin_jobs id the stage belongs to.
        stage_name: stage whose history is updated.
        phase: phase number that just became current.
        ts: timestamp string to store for that phase.
    """
    async with _db() as db:
        await db.execute("PRAGMA journal_mode=WAL")
        cur = await db.execute(
            "SELECT bb_phase_history FROM burnin_stages "
            "WHERE burnin_job_id=? AND stage_name=?",
            (job_id, stage_name),
        )
        row = await cur.fetchone()

        history: dict = {}
        if row and row[0]:
            try:
                loaded = json.loads(row[0])
            except (ValueError, TypeError):
                # Unparseable column content: start a fresh history.
                loaded = None
            # Valid JSON that is not an object (list, number, string)
            # would break the key lookup/assignment below, so treat it
            # the same as corrupt data.
            if isinstance(loaded, dict):
                history = loaded

        key = str(phase)
        if key in history:
            # First-seen timestamp wins; skip the redundant write.
            return

        history[key] = ts
        await db.execute(
            "UPDATE burnin_stages SET bb_phase_history=? "
            "WHERE burnin_job_id=? AND stage_name=?",
            (json.dumps(history), job_id, stage_name),
        )
        await db.commit()
|
||||
|
||||
|
||||
async def _store_smart_attrs(drive_id: int, attrs: dict) -> None:
|
||||
"""Persist latest SMART attribute dict to drives.smart_attrs (JSON)."""
|
||||
# Convert int keys to str for JSON serialisation
|
||||
|
|
|
|||
|
|
@ -14,21 +14,138 @@ from __future__ import annotations
|
|||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import TypedDict
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
# Shape of the result dict built in _stage_surface_validate_ssh
# (final tallies of one badblocks run). All three keys are required.
_BadblocksResult = TypedDict(
    "_BadblocksResult",
    {
        "bad_blocks": int,
        "output": str,
        "aborted": bool,
    },
)
|
||||
|
||||
|
||||
# `badblocks -w` cycles through 4 patterns (0xaa, 0x55, 0xff, 0x00),
# each with a write phase followed by a read-back/verify phase = 8 phases.
# Per-phase percent comes back via `XX% done`; without translation, the
# dashboard appears to "rewind" every ~2 hours when a new phase starts.
#
# Maps each pattern (lowercase hex text) to the 1-based index of its
# write phase; _BadblocksProgress.update treats the matching verify phase
# as that index + 1.
_BB_PATTERN_PHASE = {"0xaa": 1, "0x55": 3, "0xff": 5, "0x00": 7}
# Divisor used by _BadblocksProgress.overall_pct.
_BB_TOTAL_PHASES = 8
# Throttle DB writes from the badblocks parser. Each progress line used
# to trigger 4-6 transactions; with 4 concurrent burn-ins emitting sub-
# second progress lines, the asyncssh drain couldn't keep up — the
# stdout pipe on TrueNAS filled, badblocks blocked on pipe_write,
# disk I/O effectively stopped. 5 seconds is fine for the UI (drawer
# polls every ~12s anyway) and cuts DB load 60-80x.
BB_DB_MIN_SECONDS = 5.0

import re as _re_pre  # noqa: E402

# Header announcing a new write pattern; group 1 is the hex pattern text.
_BB_PATTERN_RE = _re_pre.compile(r"Testing with pattern\s+(0x[0-9a-fA-F]+)")
# Marker that a verify (read-back) phase has started.
_BB_VERIFY_RE = _re_pre.compile(r"Reading and comparing")
# In-phase progress; group 1 is the percent as digits and dots.
_BB_PERCENT_RE = _re_pre.compile(r"([\d.]+)%\s+done")
|
||||
|
||||
|
||||
class _BadblocksProgress:
    """Track which phase of `badblocks -w -p N` we're in so the
    displayed percent maps to overall progress, not per-phase progress.

    Pure state machine — no I/O. Feed it lines from the badblocks output
    via :meth:`update`; read :attr:`overall_pct` after each call.

    Behavior:
    - Defaults to phase 1 (write 0xaa) before any header is seen.
    - "Testing with pattern 0xXX" sets the phase to the write-phase index
    for that pattern (1, 3, 5, or 7).
    - "Reading and comparing" advances to the matching verify phase
    (last_write_phase + 1).
    - "XX% done" updates the in-phase percent.
    - overall_pct = ((phase - 1) * 100 + phase_pct) / 8, clipped to 99
    so we don't claim "100%" until the stage's success path explicitly
    writes 100.
    """

    __slots__ = ("phase", "phase_pct", "_last_write_phase")

    def __init__(self) -> None:
        # Current 1-based phase number.
        self.phase: int = 1
        # Percent complete within the current phase, as last reported.
        self.phase_pct: float = 0.0
        # Write phase of the most recent pattern header; the verify
        # marker is resolved relative to this.
        self._last_write_phase: int = 1

    def update(self, line: str) -> None:
        """Advance the state machine with one line of badblocks output.

        Lines matching none of the three patterns leave the state unchanged.
        """
        m = _BB_PATTERN_RE.search(line)
        if m:
            p = m.group(1).lower()
            if p in _BB_PATTERN_PHASE:
                self.phase = _BB_PATTERN_PHASE[p]
                self._last_write_phase = self.phase
                self.phase_pct = 0.0
            # NOTE(review): nesting of this return was reconstructed from
            # a flattened listing — confirm it sits at the `if m:` level
            # (i.e. an unrecognised pattern header is ignored entirely).
            return
        if _BB_VERIFY_RE.search(line):
            # Verify phase directly follows the last write phase seen.
            self.phase = self._last_write_phase + 1
            self.phase_pct = 0.0
            return
        m = _BB_PERCENT_RE.search(line)
        if m:
            try:
                self.phase_pct = float(m.group(1))
            except ValueError:
                # The regex admits strings like "1.2.3" or "."; keep the
                # previous percent rather than failing on them.
                pass

    @property
    def overall_pct(self) -> int:
        """Overall progress as a truncated integer percent, capped at 99."""
        total = (self.phase - 1) * 100.0 + self.phase_pct
        return min(99, int(total / _BB_TOTAL_PHASES))
|
||||
|
||||
|
||||
def _build_badblocks_cmd(devname: str) -> str:
    """Construct the wrapped badblocks command for a given device.

    badblocks's progress output uses '\\b' backspace characters to
    overwrite the previous "XX% done" line — there's no '\\n' between
    updates until a phase transition. asyncssh's line-buffered reader
    needs a real '\\n' to yield a line, so we pipe the output through
    `tr '\\b' '\\n'` at the shell level. After this, every progress
    update is a normal newline-terminated line.

    Inner shell does `echo PID:$$; exec badblocks ...` so $$ is the
    badblocks PID after exec (needed for out-of-band kill -9; asyncssh's
    signal channel is ignored by sshd). 2>&1 merges stderr into stdout
    so tr sees the progress lines (badblocks emits them on stderr).

    Geometry (-b -c -p) is operator-tunable via Settings → Burn-in;
    defaults match the Spearfoot disk-burnin.sh recommendation.

    Every interpolated value is passed through shlex.quote, and the inner
    script is quoted as a whole for the outer shell, so a device name or
    setting containing shell metacharacters cannot break out of the
    command. For plain device names and integer settings the result is
    the same string the hand-quoted form produced.

    Args:
        devname: device node name relative to /dev (e.g. "sda").

    Returns:
        A shell command line suitable for running on the remote host.
    """
    import shlex  # local import keeps this block self-contained

    device = shlex.quote(f"/dev/{devname}")
    block_size = shlex.quote(str(settings.surface_validate_block_size))
    block_buffer = shlex.quote(str(settings.surface_validate_block_buffer))
    passes = shlex.quote(str(settings.surface_validate_passes))

    inner = (
        f"echo PID:$$; exec badblocks "
        f"-wsv "
        f"-b {block_size} "
        f"-c {block_buffer} "
        f"-p {passes} "
        f"{device} 2>&1"
    )
    # The outer pipeline lets tr translate \\b → \\n. stdbuf -oL forces
    # tr's stdout to line-buffered mode; without it tr's stdout is
    # block-buffered (4 KB chunks) when its destination is a pipe,
    # which delays each progress line by ~6 minutes at our throughput.
    return f"sh -c {shlex.quote(inner)} | stdbuf -oL tr '\\b' '\\n'"
|
||||
|
||||
from . import kill
|
||||
from ._common import (
|
||||
POLL_INTERVAL,
|
||||
_append_stage_log,
|
||||
_db,
|
||||
_is_cancelled,
|
||||
_now,
|
||||
_push_update,
|
||||
_recalculate_progress,
|
||||
_record_bb_phase_start,
|
||||
_set_stage_error,
|
||||
_store_smart_attrs,
|
||||
_store_smart_raw_output,
|
||||
_update_stage_bad_blocks,
|
||||
_update_stage_bb_mbps,
|
||||
_update_stage_bb_phase,
|
||||
_update_stage_percent,
|
||||
)
|
||||
|
||||
|
|
@ -373,6 +490,17 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
"""Run badblocks over SSH, streaming output to stage log."""
|
||||
from app import ssh_client
|
||||
|
||||
# Pull drive size for the throughput calculation. Each badblocks
|
||||
# phase covers the full disk once, so 1% overall progress = size/800
|
||||
# bytes (8 phases × 100). NULL-safe: if size lookup fails we just
|
||||
# skip the MB/s update.
|
||||
drive_size_bytes: int | None = None
|
||||
async with _db() as db:
|
||||
cur = await db.execute("SELECT size_bytes FROM drives WHERE id=?", (drive_id,))
|
||||
row = await cur.fetchone()
|
||||
if row and row[0]:
|
||||
drive_size_bytes = int(row[0])
|
||||
|
||||
await _append_stage_log(
|
||||
job_id, "surface_validate",
|
||||
f"[START] badblocks -wsv -b {settings.surface_validate_block_size} "
|
||||
|
|
@ -385,7 +513,7 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
# Streaming + progress is handled by the inline _drain coroutines
|
||||
# below; the in-loop _append_stage_log + _update_stage_percent calls
|
||||
# take care of throttled DB writes. Result dict is just final tallies.
|
||||
result = {"bad_blocks": 0, "output": "", "aborted": False}
|
||||
result: _BadblocksResult = {"bad_blocks": 0, "output": "", "aborted": False}
|
||||
try:
|
||||
bad_blocks_total = 0
|
||||
output_lines: list[str] = []
|
||||
|
|
@ -397,32 +525,49 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
# we need the PID to issue an out-of-band `kill -9` over a fresh
|
||||
# session when we want to abort.
|
||||
#
|
||||
# Block geometry is operator-tunable (Settings → Burn-in):
|
||||
# -b N block size in bytes (settings.surface_validate_block_size)
|
||||
# -c N blocks held per IO (settings.surface_validate_block_buffer)
|
||||
# -p N pass count (settings.surface_validate_passes)
|
||||
# Defaults preserve original behavior (-b 4096 -c 64 -p 1).
|
||||
bb_args = (
|
||||
f"-wsv "
|
||||
f"-b {settings.surface_validate_block_size} "
|
||||
f"-c {settings.surface_validate_block_buffer} "
|
||||
f"-p {settings.surface_validate_passes}"
|
||||
)
|
||||
cmd = (
|
||||
f"sh -c 'echo PID:$$; exec badblocks {bb_args} /dev/{devname}'"
|
||||
)
|
||||
cmd = _build_badblocks_cmd(devname)
|
||||
async with conn.create_process(cmd) as proc:
|
||||
import re as _re
|
||||
|
||||
pid_seen = False
|
||||
progress = _BadblocksProgress()
|
||||
|
||||
# Throughput tracker — store (overall_pct, monotonic_ts)
|
||||
# of the previous progress sample so we can compute MB/s
|
||||
# from the delta on each new sample.
|
||||
last_pct_sample: float = progress.overall_pct
|
||||
last_db_write_ts: float = time.monotonic()
|
||||
# Lines accumulated since last log flush. Flushed in the
|
||||
# throttled DB-write window (see BB_DB_MIN_SECONDS).
|
||||
pending_log_chunks: list[str] = []
|
||||
|
||||
# Seed bb_phase=1, bb_phase_pct=0 immediately so the
|
||||
# drawer's per-pattern meters have something to render
|
||||
# before badblocks emits its first "X% done" line. On a
|
||||
# 14 TB drive that first line can be several minutes in,
|
||||
# and a blank meter strip looks broken to the operator.
|
||||
await _update_stage_bb_phase(
|
||||
job_id, "surface_validate",
|
||||
progress.phase, progress.phase_pct,
|
||||
)
|
||||
# Stamp phase 1 (write 0xaa) start so the drawer's
|
||||
# duration history starts populating immediately.
|
||||
await _record_bb_phase_start(
|
||||
job_id, "surface_validate", progress.phase, _now(),
|
||||
)
|
||||
_push_update()
|
||||
|
||||
async def _drain(stream, is_stderr: bool):
|
||||
nonlocal bad_blocks_total, pid_seen
|
||||
nonlocal bad_blocks_total, pid_seen, last_db_write_ts, last_pct_sample
|
||||
# Line-based drain. The wrapped badblocks command
|
||||
# pipes through `tr '\b' '\n'` at the shell level
|
||||
# so every progress update is a real newline-
|
||||
# terminated line by the time it reaches us.
|
||||
async for raw in stream:
|
||||
line = raw if isinstance(raw, str) else raw.decode("utf-8", errors="replace")
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# First stdout line is "PID:<n>" from the wrapping shell.
|
||||
# Capture it and don't append it to the user-visible log.
|
||||
# First stdout line is "PID:<n>" from the
|
||||
# wrapping shell. Capture and skip.
|
||||
if not is_stderr and not pid_seen and line.startswith("PID:"):
|
||||
pid_seen = True
|
||||
try:
|
||||
|
|
@ -435,27 +580,86 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
pass
|
||||
continue
|
||||
|
||||
output_lines.append(line)
|
||||
# Note: with the `tr` pipe, badblocks's stderr is
|
||||
# merged into stdout (`2>&1`). is_stderr is now
|
||||
# always False — we treat every non-PID line as
|
||||
# potentially containing progress or bad-block
|
||||
# output. The phase parser is idempotent on
|
||||
# unrelated lines.
|
||||
prev_phase = progress.phase
|
||||
progress.update(line)
|
||||
phase_changed = progress.phase != prev_phase
|
||||
is_progress_line = bool(_BB_PERCENT_RE.search(line))
|
||||
# Bare-number lines from badblocks are bad-block
|
||||
# block numbers (one per line on stdout).
|
||||
stripped = line.strip()
|
||||
if stripped and stripped.isdigit() and not is_progress_line:
|
||||
bad_blocks_total += 1
|
||||
|
||||
if is_stderr:
|
||||
m = _re.search(r"([\d.]+)%\s+done", line)
|
||||
if m:
|
||||
pct = min(99, int(float(m.group(1))))
|
||||
await _update_stage_percent(job_id, "surface_validate", pct)
|
||||
await _update_stage_bad_blocks(job_id, "surface_validate", bad_blocks_total)
|
||||
await _recalculate_progress(job_id)
|
||||
_push_update()
|
||||
else:
|
||||
stripped = line.strip()
|
||||
if stripped and stripped.isdigit():
|
||||
bad_blocks_total += 1
|
||||
# Keep "XX% done" lines OUT of output_lines. Big
|
||||
# volume + quadratic log_text concat.
|
||||
if not is_progress_line:
|
||||
output_lines.append(line)
|
||||
|
||||
# Append to DB log in chunks
|
||||
if len(output_lines) % 20 == 0:
|
||||
chunk = "".join(output_lines[-20:])
|
||||
await _append_stage_log(job_id, "surface_validate", chunk)
|
||||
# Single throttle gate covering EVERY DB touch.
|
||||
# Cumulative DB load otherwise overwhelms the
|
||||
# asyncio loop → asyncssh drain falls behind →
|
||||
# SSH window stops advancing → pipe fills →
|
||||
# badblocks blocks on pipe_write → disk I/O stops.
|
||||
now_ts = time.monotonic()
|
||||
time_since_last_db = now_ts - last_db_write_ts
|
||||
should_write = phase_changed or time_since_last_db >= BB_DB_MIN_SECONDS
|
||||
|
||||
# Abort on bad block threshold
|
||||
if should_write:
|
||||
if await _is_cancelled(job_id):
|
||||
await kill.kill_remote_process(job_id)
|
||||
return
|
||||
|
||||
if phase_changed:
|
||||
await _record_bb_phase_start(
|
||||
job_id, "surface_validate",
|
||||
progress.phase, _now(),
|
||||
)
|
||||
await _update_stage_percent(
|
||||
job_id, "surface_validate", progress.overall_pct,
|
||||
)
|
||||
await _update_stage_bb_phase(
|
||||
job_id, "surface_validate",
|
||||
progress.phase, progress.phase_pct,
|
||||
)
|
||||
await _update_stage_bad_blocks(
|
||||
job_id, "surface_validate", bad_blocks_total,
|
||||
)
|
||||
|
||||
if (
|
||||
drive_size_bytes
|
||||
and not phase_changed
|
||||
and progress.overall_pct > last_pct_sample
|
||||
and time_since_last_db >= 1.0
|
||||
):
|
||||
d_pct = progress.overall_pct - last_pct_sample
|
||||
bytes_done = (d_pct / 800.0) * drive_size_bytes
|
||||
mbps = bytes_done / time_since_last_db / 1_000_000
|
||||
await _update_stage_bb_mbps(
|
||||
job_id, "surface_validate", mbps,
|
||||
)
|
||||
|
||||
if pending_log_chunks:
|
||||
chunk = "".join(pending_log_chunks)
|
||||
pending_log_chunks.clear()
|
||||
await _append_stage_log(
|
||||
job_id, "surface_validate", chunk,
|
||||
)
|
||||
|
||||
last_pct_sample = progress.overall_pct
|
||||
last_db_write_ts = now_ts
|
||||
await _recalculate_progress(job_id)
|
||||
_push_update()
|
||||
|
||||
if not is_progress_line:
|
||||
pending_log_chunks.append(line)
|
||||
|
||||
# Abort on bad block threshold — immediate.
|
||||
if bad_blocks_total > settings.bad_block_threshold:
|
||||
await kill.kill_remote_process(job_id)
|
||||
output_lines.append(
|
||||
|
|
@ -464,15 +668,9 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
)
|
||||
return
|
||||
|
||||
if await _is_cancelled(job_id):
|
||||
await kill.kill_remote_process(job_id)
|
||||
return
|
||||
|
||||
await asyncio.gather(
|
||||
_drain(proc.stdout, False),
|
||||
_drain(proc.stderr, True),
|
||||
return_exceptions=True,
|
||||
)
|
||||
# Single stream now — the `2>&1` in _build_badblocks_cmd
|
||||
# merges stderr into stdout before the `tr` pipe.
|
||||
await _drain(proc.stdout, False)
|
||||
# Bound proc.wait so a remote process that ignored our kill
|
||||
# signal (or that we never managed to kill) can't pin this
|
||||
# task in the semaphore forever. Closing the connection on
|
||||
|
|
@ -497,7 +695,21 @@ async def _stage_surface_validate_ssh(job_id: int, devname: str, drive_id: int)
|
|||
result["aborted"] = bad_blocks_total > settings.bad_block_threshold
|
||||
|
||||
except asyncio.CancelledError:
|
||||
return False
|
||||
# Best-effort kill of the remote badblocks process before
|
||||
# propagating the cancel. asyncio.shield() so the kill attempt
|
||||
# itself isn't interrupted by ongoing loop shutdown. Then
|
||||
# re-raise so _run_job marks the job 'unknown' (honest about
|
||||
# the indeterminate outcome) instead of 'failed' (which
|
||||
# implies the burn-in itself failed, which we don't know).
|
||||
try:
|
||||
await asyncio.shield(kill.kill_remote_process(job_id))
|
||||
except Exception:
|
||||
pass
|
||||
await _append_stage_log(
|
||||
job_id, "surface_validate",
|
||||
"\n[ABORTED] task cancelled (likely container restart or shutdown)\n",
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
await _append_stage_log(job_id, "surface_validate", f"\n[SSH error] {exc}\n")
|
||||
await _set_stage_error(job_id, "surface_validate", f"SSH badblocks error: {exc}")
|
||||
|
|
|
|||
|
|
@ -49,7 +49,10 @@ class Settings(BaseSettings):
|
|||
webhook_url: str = ""
|
||||
|
||||
# Stuck-job detection: jobs running longer than this are marked 'unknown'
|
||||
stuck_job_hours: int = 24
|
||||
# and the remote badblocks/smartctl is killed. 168h (7 days) covers a
|
||||
# full -w surface_validate on a 14 TB+ HDD with margin. Older default
|
||||
# was 24h which false-positived on multi-TB drives almost every time.
|
||||
stuck_job_hours: int = 168
|
||||
|
||||
# Temperature thresholds (°C) — drives table colouring + precheck gate
|
||||
temp_warn_c: int = 46 # orange warning
|
||||
|
|
@ -83,7 +86,7 @@ class Settings(BaseSettings):
|
|||
ssh_key: str = "" # PEM private key content (paste full key including headers)
|
||||
|
||||
# Application version — used by the /api/v1/updates/check endpoint
|
||||
app_version: str = "1.0.0-39"
|
||||
app_version: str = "1.0.0-60"
|
||||
|
||||
# ---- Authentication (1.0.0-22) ----
|
||||
# session_secret: HMAC key for signing session cookies. Empty = generate
|
||||
|
|
|
|||
|
|
@ -93,6 +93,24 @@ _MIGRATIONS = [
|
|||
"ALTER TABLE drives ADD COLUMN pool_name TEXT",
|
||||
"ALTER TABLE drives ADD COLUMN pool_role TEXT",
|
||||
"ALTER TABLE drives ADD COLUMN pool_seen_at TEXT",
|
||||
# 1.0.0-44: per-pattern badblocks progress for the drive drawer's
|
||||
# 4-meter UI. bb_phase is 1-8 (1=write 0xaa, 2=verify 0xaa, 3=write
|
||||
# 0x55, 4=verify 0x55, 5=write 0xff, 6=verify 0xff, 7=write 0x00,
|
||||
# 8=verify 0x00). bb_phase_pct is 0-100 within the current phase.
|
||||
"ALTER TABLE burnin_stages ADD COLUMN bb_phase INTEGER",
|
||||
"ALTER TABLE burnin_stages ADD COLUMN bb_phase_pct REAL",
|
||||
# 1.0.0-46: live write/read throughput for the per-pattern meters.
|
||||
# Computed from successive `XX% done` lines in badblocks output:
|
||||
# delta_bytes = (overall_pct_delta / 800) * drive_size_bytes.
|
||||
# Updated on each throttled DB write, not every progress line; NULL until the second progress
|
||||
# line arrives (need two samples to compute a rate).
|
||||
"ALTER TABLE burnin_stages ADD COLUMN bb_mbps REAL",
|
||||
# 1.0.0-47: per-pattern duration history. JSON map of
|
||||
# {"1": "2026-05-09T05:39:44+00:00", "2": ..., ...} where each key
|
||||
# is the phase number (1-8) and the value is when the parser first
|
||||
# observed that phase. Drawer derives "0xaa: 14h 22m" by diffing
|
||||
# the timestamps of consecutive phase keys.
|
||||
"ALTER TABLE burnin_stages ADD COLUMN bb_phase_history TEXT",
|
||||
# 1.0.0-19: enforce one active burn-in per drive at the storage layer.
|
||||
# Closes the read-then-insert race in burnin.start_job — without this,
|
||||
# two concurrent /api/v1/burnin/start requests for the same drive could
|
||||
|
|
@ -158,6 +176,7 @@ async def init_db() -> None:
|
|||
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute("PRAGMA foreign_keys=ON")
|
||||
await db.executescript(SCHEMA)
|
||||
await _run_migrations(db)
|
||||
|
|
@ -169,6 +188,7 @@ async def get_db():
|
|||
db.row_factory = aiosqlite.Row
|
||||
try:
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute("PRAGMA foreign_keys=ON")
|
||||
yield db
|
||||
finally:
|
||||
|
|
|
|||
|
|
@ -303,6 +303,9 @@ def _send_email(subject: str, html: str) -> None:
|
|||
timeout = int(settings.smtp_timeout or 60)
|
||||
port = _smtp_port()
|
||||
|
||||
# SMTP / SMTP_SSL share a parent class but mypy can't unify them
|
||||
# without an explicit Union annotation on the binding.
|
||||
server: smtplib.SMTP
|
||||
if mode == "ssl":
|
||||
server = smtplib.SMTP_SSL(settings.smtp_host, port, context=ctx, timeout=timeout)
|
||||
server.ehlo()
|
||||
|
|
@ -331,6 +334,7 @@ async def _fetch_report_data() -> list[dict]:
|
|||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
return await _fetch_drives_for_template(db)
|
||||
|
||||
|
||||
|
|
@ -344,6 +348,7 @@ async def _fetch_unlock_events_24h() -> list[dict]:
|
|||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
# julianday() handles the 'YYYY-MM-DDTHH:MM:SS.fff+00:00' format
|
||||
# we write from Python; comparing the raw string against
|
||||
# datetime('now','-1 day') (which formats as 'YYYY-MM-DD HH:MM:SS')
|
||||
|
|
@ -465,6 +470,7 @@ async def test_smtp_connection() -> dict:
|
|||
timeout = int(settings.smtp_timeout or 60)
|
||||
port = _smtp_port()
|
||||
|
||||
server: smtplib.SMTP
|
||||
if mode == "ssl":
|
||||
server = smtplib.SMTP_SSL(settings.smtp_host, port,
|
||||
context=ctx, timeout=timeout)
|
||||
|
|
|
|||
15
app/main.py
15
app/main.py
|
|
@ -189,6 +189,21 @@ class _AuthGateMiddleware(BaseHTTPMiddleware):
|
|||
await auth.get_user_by_id(int(user_id)) if user_id else None
|
||||
)
|
||||
|
||||
# Loopback bypass (1.0.0-56): requests from 127.0.0.1 / ::1
|
||||
# inside the container skip the auth gate. The only way to hit
|
||||
# that source IP is a process in the container's network
|
||||
# namespace — `docker exec` from the host. External traffic
|
||||
# comes through the docker bridge with a non-loopback source,
|
||||
# so it still goes through full auth. We read request.client.host
|
||||
# directly (raw TCP socket), NOT X-Forwarded-For, so external
|
||||
# attackers can't spoof loopback via headers. This unlocks the
|
||||
# autonomous monitor's ability to POST /api/v1/burnin/start
|
||||
# without provisioning a session cookie.
|
||||
if request.client and request.client.host in ("127.0.0.1", "::1"):
|
||||
if request.state.current_user is None:
|
||||
request.state.current_user = auth.LoopbackUser()
|
||||
return await call_next(request)
|
||||
|
||||
if path in _PUBLIC_PATHS or path.startswith(_PUBLIC_PREFIXES):
|
||||
return await call_next(request)
|
||||
if request.state.current_user is not None:
|
||||
|
|
|
|||
|
|
@ -437,6 +437,7 @@ async def poll_cycle(client: TrueNASClient) -> int:
|
|||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
await db.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
for disk in disks:
|
||||
|
|
@ -492,6 +493,7 @@ async def run(client: TrueNASClient) -> None:
|
|||
async with aiosqlite.connect(settings.db_path) as _tdb:
|
||||
_tdb.row_factory = aiosqlite.Row
|
||||
await _tdb.execute("PRAGMA journal_mode=WAL")
|
||||
await _tdb.execute("PRAGMA busy_timeout=60000")
|
||||
_cur = await _tdb.execute("""
|
||||
SELECT MAX(d.temperature_c)
|
||||
FROM drives d
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ async def backup_db(keep_count: int) -> Path | None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
_RUN_HOUR = 3 # 03:00 local time — quiet for most homelabs
|
||||
_state = {"last_run_date": None}
|
||||
_state: dict[str, str | None] = {"last_run_date": None}
|
||||
|
||||
|
||||
async def run() -> None:
|
||||
|
|
|
|||
|
|
@ -128,6 +128,7 @@ async def sse_drives(request: Request):
|
|||
async with aiosqlite.connect(settings.db_path) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
await db.execute("PRAGMA journal_mode=WAL")
|
||||
await db.execute("PRAGMA busy_timeout=60000")
|
||||
drives = await _fetch_drives_for_template(db)
|
||||
|
||||
html = templates.env.get_template(
|
||||
|
|
|
|||
|
|
@ -147,11 +147,12 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
|||
|
||||
# For burn-ins that include SMART stages, fetch those stages so we can
|
||||
# mirror their progress/result in the Short/Long SMART columns.
|
||||
# We include burn-ins in ANY state — including failed/passed/cancelled —
|
||||
# so the SMART columns don't go blank when the burn-in finishes. Without
|
||||
# this, "FAILED (LONG SMART)" appears in the Burn-In column while the
|
||||
# Long SMART column shows "—", which contradicts itself.
|
||||
bi_smart_stages: dict[int, dict[str, dict]] = {} # job_id -> {stage_name: row}
|
||||
bi_ids_with_smart = [
|
||||
bi["id"] for bi in burnin_by_drive.values()
|
||||
if bi["state"] in ("running", "queued")
|
||||
]
|
||||
bi_ids_with_smart = [bi["id"] for bi in burnin_by_drive.values()]
|
||||
if bi_ids_with_smart:
|
||||
placeholders = ",".join("?" * len(bi_ids_with_smart))
|
||||
# placeholders is purely structural ("?,?,?"); IDs themselves are
|
||||
|
|
@ -163,7 +164,7 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
|||
"FROM burnin_stages bs "
|
||||
"WHERE bs.burnin_job_id IN (" + placeholders + ") "
|
||||
" AND bs.stage_name IN ('short_smart', 'long_smart') "
|
||||
" AND bs.state IN ('running', 'passed', 'failed')"
|
||||
" AND bs.state IN ('running', 'passed', 'failed', 'aborted')"
|
||||
)
|
||||
cur = await db.execute(sql, bi_ids_with_smart)
|
||||
for r in await cur.fetchall():
|
||||
|
|
@ -185,14 +186,26 @@ async def _fetch_drives_for_template(db: aiosqlite.Connection) -> list[dict]:
|
|||
if existing.get("state") not in (None, "idle"):
|
||||
continue
|
||||
pct = stage["percent"] or 0
|
||||
stage_state = stage["state"]
|
||||
# If the parent burn-in ended in failure but this SMART
|
||||
# stage is still recorded as "running", that's an
|
||||
# orphaned stage row from a hard crash (e.g. the old
|
||||
# `database is locked` failure mode). Surface as failed
|
||||
# (or cancelled, mirroring the parent) so the column matches the Burn-In column.
|
||||
if stage_state == "running" and bi.get("state") in (
|
||||
"failed", "cancelled", "unknown"
|
||||
):
|
||||
stage_state = bi["state"] if bi["state"] != "unknown" else "failed"
|
||||
d[target] = {
|
||||
"state": stage["state"],
|
||||
"percent": pct if stage["state"] == "running" else (100 if stage["state"] == "passed" else 0),
|
||||
"eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage["state"] == "running" else None,
|
||||
"state": stage_state,
|
||||
"percent": pct if stage_state == "running" else (100 if stage_state == "passed" else 0),
|
||||
"eta_seconds": _compute_eta_seconds(stage["started_at"], pct) if stage_state == "running" else None,
|
||||
"eta_timestamp": None,
|
||||
"started_at": stage["started_at"],
|
||||
"finished_at": stage["finished_at"],
|
||||
"error_text": stage["error_text"],
|
||||
"error_text": stage["error_text"] or (
|
||||
bi.get("error_text") if stage_state == "failed" else None
|
||||
),
|
||||
}
|
||||
|
||||
drives.append(d)
|
||||
|
|
|
|||
|
|
@ -57,11 +57,26 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
|||
job = dict(job_row)
|
||||
cur = await db.execute(
|
||||
"SELECT id, stage_name, state, percent, started_at, finished_at, "
|
||||
"duration_seconds, error_text, log_text, bad_blocks "
|
||||
"duration_seconds, error_text, log_text, bad_blocks, "
|
||||
"bb_phase, bb_phase_pct, bb_mbps, bb_phase_history "
|
||||
"FROM burnin_stages WHERE burnin_job_id=? ORDER BY id",
|
||||
(job_row["id"],),
|
||||
)
|
||||
job["stages"] = [dict(r) for r in await cur.fetchall()]
|
||||
stages = [dict(r) for r in await cur.fetchall()]
|
||||
# Backfill empty stage.error_text from the parent job's error_text
|
||||
# for any stage that ended failed/cancelled/unknown without recording
|
||||
# an error of its own. This catches the orphan pattern from hard
|
||||
# crashes (DB-locked, SSH disconnect, container restart) where
|
||||
# the failure didn't get to write a per-stage explanation.
|
||||
job_err = job.get("error_text")
|
||||
for s in stages:
|
||||
if (
|
||||
s.get("state") in ("failed", "cancelled", "unknown")
|
||||
and not s.get("error_text")
|
||||
and job_err
|
||||
):
|
||||
s["error_text"] = job_err
|
||||
job["stages"] = stages
|
||||
burnin_job = job
|
||||
|
||||
# SMART raw output from smart_tests table
|
||||
|
|
@ -101,11 +116,12 @@ async def drive_drawer(drive_id: int, db: aiosqlite.Connection = Depends(get_db)
|
|||
|
||||
return {
|
||||
"drive": {
|
||||
"id": drive.id,
|
||||
"devname": drive.devname,
|
||||
"serial": drive.serial,
|
||||
"model": drive.model,
|
||||
"size_bytes": drive.size_bytes,
|
||||
"id": drive.id,
|
||||
"devname": drive.devname,
|
||||
"serial": drive.serial,
|
||||
"model": drive.model,
|
||||
"size_bytes": drive.size_bytes,
|
||||
"temperature_c": drive.temperature_c,
|
||||
},
|
||||
"burnin": burnin_job,
|
||||
"smart": {
|
||||
|
|
|
|||
|
|
@ -112,7 +112,7 @@ async def check_updates():
|
|||
try:
|
||||
async with httpx.AsyncClient(timeout=8.0) as client:
|
||||
r = await client.get(
|
||||
"https://git.hellocomputer.xyz/api/v1/repos/brandon/truenas-burnin/releases/latest",
|
||||
"https://git.hellocomputer.xyz/api/v1/repos/brandon/nas-burnin/releases/latest",
|
||||
headers={"Accept": "application/json"},
|
||||
)
|
||||
if r.status_code == 200:
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ a container restart to fully take effect (clients/middleware are initialized at
|
|||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
|
@ -65,7 +66,14 @@ def _overrides_path() -> Path:
|
|||
return Path(settings.db_path).parent / "settings_overrides.json"
|
||||
|
||||
|
||||
def _coerce(key: str, raw) -> object:
|
||||
def _coerce(key: str, raw: Any) -> Any:
|
||||
"""Coerce a raw value to the type registered in _EDITABLE.
|
||||
|
||||
Return type is Any because the concrete return type depends on
|
||||
the key — int/str/bool — and there's no narrowing path mypy can
|
||||
follow from the dict lookup. Callers know which type to expect
|
||||
based on the field they're reading.
|
||||
"""
|
||||
coerce = _EDITABLE[key]
|
||||
if coerce is bool:
|
||||
if isinstance(raw, bool):
|
||||
|
|
|
|||
|
|
@ -244,7 +244,7 @@ thead {
|
|||
}
|
||||
|
||||
th {
|
||||
padding: 9px 14px;
|
||||
padding: 6px 8px;
|
||||
font-size: 11px;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
|
|
@ -256,9 +256,10 @@ th {
|
|||
}
|
||||
|
||||
td {
|
||||
padding: 10px 14px;
|
||||
padding: 7px 8px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
vertical-align: middle;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
tr:last-child td {
|
||||
|
|
@ -276,17 +277,15 @@ tr:hover td {
|
|||
/* -----------------------------------------------------------------------
|
||||
Column widths
|
||||
----------------------------------------------------------------------- */
|
||||
.col-drive { min-width: 180px; }
|
||||
.col-serial { min-width: 110px; }
|
||||
.col-size { min-width: 70px; text-align: right; }
|
||||
.col-temp { min-width: 75px; text-align: right; }
|
||||
.col-health { min-width: 85px; }
|
||||
.col-smart { min-width: 95px; }
|
||||
/* Tighter horizontal padding on the SMART columns — they hold short
|
||||
pills ("Passed"/"—") or a progress bar, so the default 14px gutter
|
||||
wastes space on 13" laptops. */
|
||||
th.col-smart, td.col-smart { padding-left: 6px; padding-right: 6px; }
|
||||
.col-actions { min-width: 170px; }
|
||||
.col-drive { min-width: 160px; }
|
||||
.col-serial { min-width: 95px; }
|
||||
.col-size { min-width: 60px; text-align: right; }
|
||||
.col-temp { min-width: 60px; text-align: right; }
|
||||
.col-health { min-width: 70px; }
|
||||
.col-smart { min-width: 80px; }
|
||||
/* Tighter SMART columns — they hold short pills or a progress bar. */
|
||||
th.col-smart, td.col-smart { padding-left: 5px; padding-right: 5px; }
|
||||
.col-actions { min-width: 150px; }
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Drive cell
|
||||
|
|
@ -295,14 +294,23 @@ th.col-smart, td.col-smart { padding-left: 6px; padding-right: 6px; }
|
|||
display: block;
|
||||
font-weight: 500;
|
||||
color: var(--text-strong);
|
||||
font-size: 14px;
|
||||
font-size: 13px;
|
||||
line-height: 1.25;
|
||||
}
|
||||
|
||||
.drive-model {
|
||||
display: block;
|
||||
font-size: 11px;
|
||||
display: inline;
|
||||
font-size: 10px;
|
||||
color: var(--text-muted);
|
||||
margin-top: 1px;
|
||||
margin-top: 0;
|
||||
line-height: 1.25;
|
||||
}
|
||||
/* Separator between model and location when both are present on the
|
||||
same line. ::before on the adjacent .drive-location puts a thin dot between them. */
|
||||
.drive-model + .drive-location::before {
|
||||
content: " · ";
|
||||
color: var(--border);
|
||||
margin: 0 2px;
|
||||
}
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
|
|
@ -425,7 +433,7 @@ th.col-smart, td.col-smart { padding-left: 6px; padding-right: 6px; }
|
|||
/* -----------------------------------------------------------------------
|
||||
Burn-in column
|
||||
----------------------------------------------------------------------- */
|
||||
.col-burnin { min-width: 160px; }
|
||||
.col-burnin { min-width: 130px; }
|
||||
|
||||
.burnin-cell { min-width: 140px; }
|
||||
|
||||
|
|
@ -1180,9 +1188,9 @@ a.stat-card:hover {
|
|||
Checkbox column
|
||||
----------------------------------------------------------------------- */
|
||||
.col-check {
|
||||
width: 36px;
|
||||
min-width: 36px;
|
||||
padding: 10px 8px 10px 14px;
|
||||
width: 32px;
|
||||
min-width: 32px;
|
||||
padding: 7px 4px 7px 8px;
|
||||
}
|
||||
|
||||
.drive-checkbox, #select-all-cb {
|
||||
|
|
@ -1196,18 +1204,15 @@ a.stat-card:hover {
|
|||
Drive location inline edit
|
||||
----------------------------------------------------------------------- */
|
||||
.drive-location {
|
||||
display: block;
|
||||
display: inline;
|
||||
font-size: 10px;
|
||||
color: var(--text-muted);
|
||||
margin-top: 2px;
|
||||
margin-top: 0;
|
||||
cursor: pointer;
|
||||
border-radius: 3px;
|
||||
padding: 1px 3px;
|
||||
padding: 0 3px;
|
||||
line-height: 1.1;
|
||||
transition: background 0.1s;
|
||||
max-width: 160px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.drive-location:hover { background: var(--border); color: var(--text); }
|
||||
|
||||
|
|
@ -2694,3 +2699,276 @@ tr.drawer-row-active {
|
|||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Per-pattern badblocks meters in the drive drawer (1.0.0-44).
|
||||
Four meters, one per pattern (0xaa / 0x55 / 0xff / 0x00). Each meter
|
||||
has two halves: write (left) and verify (right), so a glance shows
|
||||
both which pattern is running and which sub-phase within it.
|
||||
----------------------------------------------------------------------- */
|
||||
.bb-meters {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
gap: 8px;
|
||||
padding: 10px 12px;
|
||||
background: var(--bg-soft, #161b22);
|
||||
border-radius: 6px;
|
||||
margin: 6px 0 8px 0;
|
||||
}
|
||||
.bb-meter {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
}
|
||||
.bb-meter-label {
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
font-size: 10px;
|
||||
color: var(--text-muted);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .04em;
|
||||
}
|
||||
.bb-meter-current .bb-meter-label {
|
||||
color: var(--blue, #58a6ff);
|
||||
font-weight: 600;
|
||||
}
|
||||
.bb-meter-done .bb-meter-label {
|
||||
color: var(--green, #3fb950);
|
||||
}
|
||||
.bb-meter-bar {
|
||||
display: flex;
|
||||
height: 10px;
|
||||
background: var(--bg, #0d1117);
|
||||
border: 1px solid var(--border, #30363d);
|
||||
border-radius: 3px;
|
||||
overflow: hidden;
|
||||
position: relative;
|
||||
}
|
||||
.bb-meter-half {
|
||||
height: 100%;
|
||||
transition: width .3s ease;
|
||||
}
|
||||
.bb-write {
|
||||
background: var(--blue, #58a6ff);
|
||||
flex: 0 0 auto;
|
||||
max-width: 50%;
|
||||
}
|
||||
.bb-verify {
|
||||
background: var(--green, #3fb950);
|
||||
flex: 0 0 auto;
|
||||
max-width: 50%;
|
||||
}
|
||||
.bb-meter-half-spacer {
|
||||
flex: 0 0 auto;
|
||||
width: 1px;
|
||||
background: var(--border, #30363d);
|
||||
height: 100%;
|
||||
}
|
||||
.bb-meter-done .bb-write,
|
||||
.bb-meter-done .bb-verify {
|
||||
opacity: .55;
|
||||
}
|
||||
.bb-meter-sub {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
font-size: 9px;
|
||||
color: var(--text-muted);
|
||||
}
|
||||
.bb-sub-write { color: color-mix(in srgb, var(--blue) 80%, var(--text-muted)); }
|
||||
.bb-sub-verify { color: color-mix(in srgb, var(--green) 80%, var(--text-muted)); }
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Surface-scan vital-signs row in the drawer (1.0.0-46).
|
||||
Sits directly above the per-pattern meters. Temperature with
|
||||
green/yellow/red colour, live MB/s, elapsed, ETA — all derived
|
||||
from data already in the drawer payload.
|
||||
----------------------------------------------------------------------- */
|
||||
.bb-vitals {
|
||||
display: flex;
|
||||
gap: 14px;
|
||||
flex-wrap: wrap;
|
||||
padding: 8px 12px 4px 12px;
|
||||
background: var(--bg-soft, #161b22);
|
||||
border-radius: 6px 6px 0 0;
|
||||
margin: 6px 0 0 0;
|
||||
border-bottom: 1px solid var(--border, #30363d);
|
||||
}
|
||||
/* When vitals lead, suppress the meter strip's top radius + margin so
|
||||
they read as one stacked unit. */
|
||||
.bb-vitals + .bb-meters {
|
||||
border-radius: 0 0 6px 6px;
|
||||
margin-top: 0;
|
||||
}
|
||||
.bb-vital {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1px;
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
}
|
||||
.bb-vital-label {
|
||||
font-size: 9px;
|
||||
color: var(--text-muted);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .04em;
|
||||
}
|
||||
.bb-vital-value {
|
||||
font-size: 13px;
|
||||
color: var(--text-strong, #f0f6fc);
|
||||
font-weight: 500;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Phase caption + per-pattern history (1.0.0-47).
|
||||
----------------------------------------------------------------------- */
|
||||
.bb-caption {
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
font-size: 11px;
|
||||
color: var(--text-muted);
|
||||
padding: 6px 12px 0 12px;
|
||||
letter-spacing: .02em;
|
||||
}
|
||||
.bb-history {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 6px 12px 8px 12px;
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
font-size: 10px;
|
||||
color: var(--text-muted);
|
||||
}
|
||||
.bb-hist-title {
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .04em;
|
||||
font-size: 9px;
|
||||
margin-right: 4px;
|
||||
}
|
||||
.bb-hist-row {
|
||||
display: inline-flex;
|
||||
align-items: baseline;
|
||||
gap: 4px;
|
||||
background: var(--bg, #0d1117);
|
||||
border: 1px solid var(--border, #30363d);
|
||||
border-radius: 3px;
|
||||
padding: 1px 6px;
|
||||
}
|
||||
.bb-hist-label {
|
||||
color: var(--green, #3fb950);
|
||||
font-weight: 600;
|
||||
}
|
||||
.bb-hist-dur {
|
||||
color: var(--text-strong, #f0f6fc);
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
/* Bad-block counter colour states inside the vitals row */
|
||||
.bb-vital-good { color: var(--green, #3fb950); }
|
||||
.bb-vital-bad { color: var(--red, #f85149); }
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Column sort (1.0.0-48). Click a sortable TH to cycle asc → desc →
|
||||
cleared. Indicator arrow appears next to the column label.
|
||||
----------------------------------------------------------------------- */
|
||||
th.sortable {
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
position: relative;
|
||||
}
|
||||
th.sortable:hover { color: var(--text); }
|
||||
th.sortable::after {
|
||||
content: "";
|
||||
display: inline-block;
|
||||
width: 0;
|
||||
height: 0;
|
||||
margin-left: 4px;
|
||||
border-left: 4px solid transparent;
|
||||
border-right: 4px solid transparent;
|
||||
vertical-align: middle;
|
||||
opacity: 0;
|
||||
}
|
||||
th.sortable:hover::after { opacity: 0.4; border-bottom: 5px solid currentColor; }
|
||||
th.sort-asc::after {
|
||||
opacity: 1;
|
||||
border-bottom: 5px solid var(--blue, #58a6ff);
|
||||
}
|
||||
th.sort-desc::after {
|
||||
opacity: 1;
|
||||
border-top: 5px solid var(--blue, #58a6ff);
|
||||
}
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Stage "Reason" block — explains why a stage ended in a terminal
|
||||
state. Replaces the old single-line stage-error-line for
|
||||
failed/cancelled/unknown stages so the operator gets a clear,
|
||||
prominent explanation at the top.
|
||||
----------------------------------------------------------------------- */
|
||||
.stage-reason {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
align-items: baseline;
|
||||
padding: 8px 12px;
|
||||
margin: 6px 0;
|
||||
border-radius: 5px;
|
||||
font-size: 12px;
|
||||
border: 1px solid;
|
||||
}
|
||||
.stage-reason-failed {
|
||||
background: var(--red-bg, color-mix(in srgb, var(--red) 12%, transparent));
|
||||
border-color: var(--red-bd, color-mix(in srgb, var(--red) 40%, transparent));
|
||||
}
|
||||
.stage-reason-cancelled,
|
||||
.stage-reason-unknown {
|
||||
background: var(--yellow-bg, color-mix(in srgb, var(--yellow) 12%, transparent));
|
||||
border-color: var(--yellow-bd, color-mix(in srgb, var(--yellow) 40%, transparent));
|
||||
}
|
||||
.stage-reason-label {
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .06em;
|
||||
font-weight: 600;
|
||||
color: var(--text-muted);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.stage-reason-text {
|
||||
flex: 1;
|
||||
color: var(--text-strong, #f0f6fc);
|
||||
line-height: 1.4;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
.stage-reason-failed .stage-reason-text { color: var(--red, #f85149); }
|
||||
.stage-reason-cancelled .stage-reason-text,
|
||||
.stage-reason-unknown .stage-reason-text { color: var(--yellow, #d29922); }
|
||||
|
||||
/* -----------------------------------------------------------------------
|
||||
Drawer job-level estimated completion (right-aligned in the header,
|
||||
so it doesn't compete with the state chip + operator info).
|
||||
----------------------------------------------------------------------- */
|
||||
.drawer-job-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.drawer-job-finish {
|
||||
display: inline-flex;
|
||||
align-items: baseline;
|
||||
gap: 8px;
|
||||
padding: 4px 10px;
|
||||
background: var(--bg-soft, #161b22);
|
||||
border: 1px solid var(--border, #30363d);
|
||||
border-radius: 5px;
|
||||
font-family: "SF Mono", "Consolas", monospace;
|
||||
}
|
||||
.drawer-job-finish-label {
|
||||
font-size: 9px;
|
||||
color: var(--text-muted);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: .04em;
|
||||
}
|
||||
.drawer-job-finish-value {
|
||||
font-size: 12px;
|
||||
color: var(--text-strong, #f0f6fc);
|
||||
font-weight: 500;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,12 +79,86 @@
|
|||
initElapsedTimers();
|
||||
initUnlockCountdowns();
|
||||
initLocationEdits();
|
||||
applySort(); // SSE swap replaces #drives-tbody — re-apply persisted sort
|
||||
paintSortIndicators();
|
||||
if (_drawerDriveId) {
|
||||
_drawerHighlightRow(_drawerDriveId);
|
||||
drawerFetch(_drawerDriveId);
|
||||
}
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
// Column sorting (client-side, persisted in localStorage so it
|
||||
// survives reload AND survives every SSE-driven tbody refresh).
|
||||
// ---------------------------------------------------------------
|
||||
var SORT_KEY = 'nasburnin.sort';

// Read the persisted sort preference from localStorage.
// Returns the stored object when it has a truthy `col` and a `dir` of
// 'asc' or 'desc'; returns null when nothing is stored, the stored JSON
// is corrupt, the shape is wrong, or storage cannot be read.
function getSort() {
  var stored;
  try {
    var json = localStorage.getItem(SORT_KEY);
    stored = json ? JSON.parse(json) : null;
  } catch (err) {
    // Unreadable storage or corrupt JSON: behave as if no sort was saved.
    return null;
  }
  if (!stored || !stored.col) return null;
  if (stored.dir !== 'asc' && stored.dir !== 'desc') return null;
  return stored;
}
|
||||
// Persist the sort preference, or clear it when `col` is falsy.
//   col: column key (matches a th's data-sort-key); falsy removes the entry
//   dir: 'asc' or 'desc' (ignored when clearing)
// Persistence is best-effort: localStorage can throw when storage is
// blocked or the quota is exhausted, and an exception here would abort the
// click handler that calls this. Failures are swallowed on purpose, the
// same way getSort treats unreadable storage as "no preference".
function setSort(col, dir) {
  try {
    if (!col) localStorage.removeItem(SORT_KEY);
    else localStorage.setItem(SORT_KEY, JSON.stringify({col: col, dir: dir}));
  } catch (e) {
    // Storage unavailable or full: keep the page working without persistence.
  }
}
|
||||
// Re-order the drive rows inside #drives-tbody to match the persisted
// sort returned by getSort(). Rows are compared on their
// data-sort-<col> attribute: empty values always sink to the bottom,
// values that round-trip as numbers compare numerically, everything else
// compares as strings.
//
// The order the rows arrived in is remembered on each row (data-orig-index)
// the first time it is seen, so that clearing the sort can put the table
// back instead of leaving it in the previous sorted order with no
// indicator. Freshly rendered rows carry no stamp, so every re-render
// records the new server order.
function applySort() {
  var tbody = document.getElementById('drives-tbody');
  if (!tbody) return;
  var rows = Array.from(tbody.querySelectorAll('tr[id^="drive-"]'));
  if (!rows.length) return;

  var ORIG_ATTR = 'data-orig-index';
  // Were these rows already seen (and therefore possibly re-ordered)?
  var wasStamped = rows.some(function (r) { return r.hasAttribute(ORIG_ATTR); });
  rows.forEach(function (r, i) {
    if (!r.hasAttribute(ORIG_ATTR)) r.setAttribute(ORIG_ATTR, String(i));
  });

  var s = getSort();
  if (!s) {
    // No sort requested. Untouched rows are already in server order, so
    // avoid needless DOM churn; otherwise restore the remembered order.
    if (!wasStamped) return;
    rows.sort(function (a, b) {
      return parseInt(a.getAttribute(ORIG_ATTR), 10) -
             parseInt(b.getAttribute(ORIG_ATTR), 10);
    });
  } else {
    var attr = 'data-sort-' + s.col;
    var dirMul = s.dir === 'asc' ? 1 : -1;
    rows.sort(function (a, b) {
      var av = a.getAttribute(attr);
      var bv = b.getAttribute(attr);
      // Empty values always sink to the bottom regardless of direction.
      var aEmpty = av === null || av === '';
      var bEmpty = bv === null || bv === '';
      if (aEmpty && !bEmpty) return 1;
      if (!aEmpty && bEmpty) return -1;
      if (aEmpty && bEmpty) return 0;
      // Numeric comparison only if both values round-trip exactly through
      // parseFloat (so "12abc" or "1.0" stay strings), else string compare.
      var an = parseFloat(av), bn = parseFloat(bv);
      if (!isNaN(an) && !isNaN(bn) && String(an) === av && String(bn) === bv) {
        return (an - bn) * dirMul;
      }
      return av.localeCompare(bv) * dirMul;
    });
  }
  // appendChild moves an attached node, so this re-orders in place.
  rows.forEach(function (r) { tbody.appendChild(r); });
}
|
||||
// Sync the sort-asc / sort-desc classes on every sortable header cell with
// the persisted sort: all cells are cleared, then the cell whose
// data-sort-key matches the active column (if any) gets the direction class.
function paintSortIndicators() {
  var active = getSort();
  var headers = document.querySelectorAll('th.sortable');
  for (var i = 0; i < headers.length; i++) {
    var cell = headers[i];
    cell.classList.remove('sort-asc', 'sort-desc');
    if (!active || cell.dataset.sortKey !== active.col) continue;
    cell.classList.add(active.dir === 'asc' ? 'sort-asc' : 'sort-desc');
  }
}
|
||||
// Delegated click handler for sortable column headers. Clicking a header
// cycles that column through: ascending, then descending, then cleared.
// Clicking a different column always starts at ascending.
document.addEventListener('click', function (evt) {
  var header = evt.target.closest('th.sortable');
  if (!header) return;
  var key = header.dataset.sortKey;
  var current = getSort();
  var sameColumn = !!current && current.col === key;
  if (sameColumn && current.dir !== 'asc') {
    // Third click on the same column: drop the sort entirely.
    setSort(null);
  } else {
    setSort(key, sameColumn ? 'desc' : 'asc');
  }
  applySort();
  paintSortIndicators();
});

// Initial paint on page load (HTML is already rendered server-side).
applySort();
paintSortIndicators();
|
||||
|
||||
updateCounts();
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
|
@ -1271,8 +1345,14 @@
|
|||
}
|
||||
}
|
||||
|
||||
// Stash the last drive object so the burn-in panel renderer can
|
||||
// pull temperature_c into the vital-signs row without having to
|
||||
// pass it through the Burn-In renderer's signature.
|
||||
var _DRAWER_LAST_DRIVE = null;
|
||||
|
||||
function _drawerRender(data) {
|
||||
var drive = data.drive || {};
|
||||
_DRAWER_LAST_DRIVE = drive;
|
||||
var devnameEl = document.getElementById('drawer-devname');
|
||||
var metaEl = document.getElementById('drawer-drive-meta');
|
||||
if (devnameEl) devnameEl.textContent = drive.devname || '\u2014';
|
||||
|
|
@ -1286,6 +1366,170 @@
|
|||
_drawerRenderEvents(data.events);
|
||||
}
|
||||
|
||||
// Vital-signs row above the meters: start time, elapsed time, ETA,
// projected finish and drive temperature. Computed from data already in
// the drawer payload.
//   stage: stage object; reads bb_phase, bb_phase_pct and started_at
//   drive: drive object (may be null); reads temperature_c
// Returns an HTML string. The time vitals are omitted when started_at is
// missing or cannot be parsed, so a bad timestamp never renders as
// "Invalid Date" / "NaNm".
function _drawerRenderBadblocksVitals(stage, drive) {
  var phase = parseInt(stage.bb_phase, 10) || 1;
  var phasePct = parseFloat(stage.bb_phase_pct || 0);
  // Eight equally weighted phases (4 patterns x write/verify).
  var overallPct = ((phase - 1) * 100 + phasePct) / 8; // 0..100
  var html = '<div class="bb-vitals">';
  var dateOpts = {
    weekday: 'short', month: 'short', day: 'numeric',
    hour: 'numeric', minute: '2-digit',
  };

  // Date.parse returns NaN for an unparseable string; treat that the same
  // as "no start time recorded".
  var startMs = stage.started_at ? Date.parse(stage.started_at) : NaN;
  if (!isNaN(startMs)) {
    // Start (wall-clock, with date)
    var startStr = new Date(startMs).toLocaleString(undefined, dateOpts);
    html += '<div class="bb-vital">';
    html += '<span class="bb-vital-label">Start</span>';
    html += '<span class="bb-vital-value">' + startStr + '</span>';
    html += '</div>';

    // Elapsed
    var elapsedSec = Math.max(0, (Date.now() - startMs) / 1000);
    html += '<div class="bb-vital">';
    html += '<span class="bb-vital-label">Elapsed</span>';
    html += '<span class="bb-vital-value">' + _bbFmtDuration(elapsedSec) + '</span>';
    html += '</div>';

    // ETA + Finish — only once we have measurable progress, so the
    // first samples don't paint a "47 days" estimate.
    if (overallPct >= 0.5) {
      // Linear extrapolation: total = elapsed / fraction complete.
      var totalSec = elapsedSec * (100 / overallPct);
      var remainingSec = Math.max(0, totalSec - elapsedSec);
      html += '<div class="bb-vital">';
      html += '<span class="bb-vital-label">ETA</span>';
      html += '<span class="bb-vital-value">' + _bbFmtDuration(remainingSec) + '</span>';
      html += '</div>';

      var finishStr = new Date(Date.now() + remainingSec * 1000)
        .toLocaleString(undefined, dateOpts);
      html += '<div class="bb-vital">';
      html += '<span class="bb-vital-label">Finish</span>';
      html += '<span class="bb-vital-value">' + finishStr + '</span>';
      html += '</div>';
    }
  }

  // Temp with hot/warm/cool colour. The typeof check also guarantees the
  // value is safe to concatenate into the markup.
  if (drive && typeof drive.temperature_c === 'number') {
    var tc = drive.temperature_c;
    var tClass = 'temp-cool';
    if (tc >= 48) tClass = 'temp-hot';
    else if (tc >= 42) tClass = 'temp-warm';
    html += '<div class="bb-vital">';
    html += '<span class="bb-vital-label">Temp</span>';
    html += '<span class="bb-vital-value temp ' + tClass + '">' + tc + '°C</span>';
    html += '</div>';
  }

  html += '</div>';
  return html;
}
|
||||
|
||||
// Format a duration in seconds as a compact two-unit string:
// "Nd Nh" when a day or more, "Nh Nm" when an hour or more, else "Nm".
// Seconds are truncated, never rounded up.
//   sec: duration in seconds
// Returns an em dash for a non-finite input (for example the result of
// arithmetic on an unparseable timestamp) instead of "NaNm"; negative
// durations are clamped to zero instead of rendering as "-1m".
function _bbFmtDuration(sec) {
  if (!isFinite(sec)) return '\u2014';
  sec = Math.max(0, Math.floor(sec));
  var d = Math.floor(sec / 86400);
  var h = Math.floor((sec % 86400) / 3600);
  var m = Math.floor((sec % 3600) / 60);
  if (d > 0) return d + 'd ' + h + 'h';
  if (h > 0) return h + 'h ' + m + 'm';
  return m + 'm';
}
|
||||
|
||||
// Phase caption — explicit text below the meters, e.g.
// "Pattern 2 of 4 · Verify 0x55 · 47.0% within phase".
//   phase:    1-8 (odd = write, even = verify; two phases per pattern)
//   phasePct: progress within the current phase, number or numeric string
// Returns an HTML string, or '' when the phase is missing, non-numeric or
// outside 1..8 — those would otherwise render "Pattern NaN of 4" or
// "Verify undefined".
function _drawerRenderBadblocksCaption(phase, phasePct) {
  if (!phase) return '';
  var p = parseInt(phase, 10);
  // Written as a positive range check so NaN is rejected too.
  if (!(p >= 1 && p <= 8)) return '';
  var pct = parseFloat(phasePct || 0);
  var labels = ['0xaa', '0x55', '0xff', '0x00'];
  var pattern = Math.ceil(p / 2);
  var subPhase = (p % 2 === 1) ? 'Write' : 'Verify';
  var label = labels[pattern - 1];
  var html = '<div class="bb-caption">';
  html += 'Pattern ' + pattern + ' of 4 · ';
  html += subPhase + ' ' + label + ' · ';
  html += pct.toFixed(1) + '% within phase';
  html += '</div>';
  return html;
}
|
||||
|
||||
// Per-pattern duration history. Reads stage.bb_phase_history (a JSON
// string mapping phase number to start timestamp) and emits one
// "0xaa 14h 22m"-style row per completed pattern. Pattern N counts as
// complete when its end timestamp is known: the next pattern's
// write-phase start, or stage.finished_at for the final pattern.
// Returns an HTML string, or '' when there is no history, the JSON is
// invalid, or no pattern has a usable duration.
function _drawerRenderBadblocksHistory(stage) {
  if (!stage.bb_phase_history) return '';
  var hist;
  try { hist = JSON.parse(stage.bb_phase_history); }
  catch (e) { return ''; }
  if (!hist || typeof hist !== 'object') return '';
  var labels = ['0xaa', '0x55', '0xff', '0x00'];
  var rows = [];
  for (var n = 1; n <= 4; n++) {
    // Pattern n spans phases 2n-1 (write) and 2n (verify).
    var writeStart = hist[String(2 * n - 1)];
    if (!writeStart) continue;
    var endTs = (n < 4) ? hist[String(2 * n + 1)] : stage.finished_at;
    if (!endTs) continue;
    var elapsedSec = (Date.parse(endTs) - Date.parse(writeStart)) / 1000;
    // Positive test on purpose: an unparseable timestamp makes this NaN,
    // and NaN <= 0 is false, so a "<= 0" guard would let it through.
    if (!(elapsedSec > 0)) continue;
    rows.push('<span class="bb-hist-row">' +
      '<span class="bb-hist-label">' + labels[n - 1] + '</span>' +
      '<span class="bb-hist-dur">' + _bbFmtDuration(elapsedSec) + '</span>' +
      '</span>');
  }
  if (!rows.length) return '';
  return '<div class="bb-history"><span class="bb-hist-title">Completed patterns</span>' +
    rows.join('') + '</div>';
}
|
||||
|
||||
// Four pattern meters for the badblocks -w surface_validate stage, one per
// test pattern. Each meter has a write half and a verify half, so the
// current pattern and the current activity within it are both visible.
//   phase:    1-8 (1 = write 0xaa, 2 = verify 0xaa, 3 = write 0x55, ...)
//   phasePct: progress within the current phase, number or numeric string
// Returns an HTML string, or '' when phase is falsy.
function _drawerRenderBadblocksMeters(phase, phasePct) {
  if (!phase) return '';
  var current = parseInt(phase, 10);
  var within = parseFloat(phasePct || 0);
  var parts = ['<div class="bb-meters">'];
  ['0xaa', '0x55', '0xff', '0x00'].forEach(function (label, idx) {
    var wPhase = 2 * idx + 1;
    var vPhase = wPhase + 1;
    var finished = current > vPhase;
    var active = current === wPhase || current === vPhase;

    // Halves default to empty (pattern not reached yet).
    var wFill = 0;
    var vFill = 0;
    if (finished) {
      wFill = 100;
      vFill = 100;
    } else if (current === vPhase) {
      wFill = 100;
      vFill = within;
    } else if (current === wPhase) {
      wFill = within;
    }

    var cls = 'bb-meter';
    if (active) cls += ' bb-meter-current';
    if (finished) cls += ' bb-meter-done';

    parts.push(
      '<div class="' + cls + '">',
      '<div class="bb-meter-label">' + label + '</div>',
      '<div class="bb-meter-bar">',
      '<div class="bb-meter-half bb-write" style="width:' + wFill.toFixed(1) + '%"></div>',
      '<div class="bb-meter-half-spacer"></div>',
      '<div class="bb-meter-half bb-verify" style="width:' + vFill.toFixed(1) + '%"></div>',
      '</div>',
      '<div class="bb-meter-sub">',
      '<span class="bb-sub-write">W ' + Math.round(wFill) + '%</span>',
      '<span class="bb-sub-verify">V ' + Math.round(vFill) + '%</span>',
      '</div>',
      '</div>'
    );
  });
  parts.push('</div>');
  return parts.join('');
}
|
||||
|
||||
function _drawerRenderBurnin(burnin) {
|
||||
var panel = document.getElementById('drawer-panel-burnin');
|
||||
if (!panel) return;
|
||||
|
|
@ -1300,7 +1544,30 @@
|
|||
html += '<span class="drawer-job-meta">';
|
||||
if (burnin.operator) html += 'by ' + _esc(burnin.operator);
|
||||
if (burnin.started_at) html += ' \u00b7 ' + _drawerFmtDt(burnin.started_at);
|
||||
html += '</span></div>';
|
||||
html += '</span>';
|
||||
// Job-level estimated completion. Uses the weighted overall job %
|
||||
// (recalculated server-side from stage progress) so it reflects
|
||||
// every stage, not just the current one. Suppressed under 0.5%
|
||||
// so the early sample doesn't paint a "Finish: Sep 22" stutter.
|
||||
if (burnin.state === 'running' && burnin.started_at) {
|
||||
var jobPct = parseFloat(burnin.percent || 0);
|
||||
if (jobPct >= 0.5) {
|
||||
var jobStartMs = Date.parse(burnin.started_at);
|
||||
var jobElapsedSec = Math.max(0, (Date.now() - jobStartMs) / 1000);
|
||||
var jobTotalSec = jobElapsedSec * (100 / jobPct);
|
||||
var jobRemainSec = Math.max(0, jobTotalSec - jobElapsedSec);
|
||||
var jobFinish = new Date(Date.now() + jobRemainSec * 1000);
|
||||
var jobFinishStr = jobFinish.toLocaleString(undefined, {
|
||||
weekday: 'short', month: 'short', day: 'numeric',
|
||||
hour: 'numeric', minute: '2-digit',
|
||||
});
|
||||
html += '<span class="drawer-job-finish" title="Estimated completion of the entire burn-in (all stages)">';
|
||||
html += '<span class="drawer-job-finish-label">Est. completion</span>';
|
||||
html += '<span class="drawer-job-finish-value">' + jobFinishStr + '</span>';
|
||||
html += '</span>';
|
||||
}
|
||||
}
|
||||
html += '</div>';
|
||||
|
||||
html += '<div class="drawer-stages">';
|
||||
var stages = burnin.stages || [];
|
||||
|
|
@ -1320,9 +1587,37 @@
|
|||
html += '<span class="stage-duration">' + _drawerFmtDuration(s.started_at, s.finished_at) + '</span>';
|
||||
}
|
||||
html += '</div>';
|
||||
if (s.error_text) {
|
||||
// Prominent "Why it failed" block at the top of failed/cancelled/
|
||||
// unknown stages. Falls back to a heuristic when no error was
|
||||
// recorded — e.g. a tiny log + no badblocks progress + terminal
|
||||
// state means the stage was killed externally (SSH disconnect or
|
||||
// container restart) before it could record an error.
|
||||
if (s.state === 'failed' || s.state === 'cancelled' || s.state === 'unknown') {
|
||||
var reason = s.error_text;
|
||||
if (!reason) {
|
||||
var logLen = (s.log_text || '').length;
|
||||
var noBbProgress = !s.bb_phase || (s.bb_phase === 1 && (parseFloat(s.bb_phase_pct || 0) < 0.1));
|
||||
if (logLen < 500 && noBbProgress) {
|
||||
reason = 'Stopped without recording an error — likely cause: SSH connection drop or container restart while this stage was running.';
|
||||
} else {
|
||||
reason = 'No error message recorded.';
|
||||
}
|
||||
}
|
||||
html += '<div class="stage-reason stage-reason-' + _esc(s.state) + '">';
|
||||
html += '<span class="stage-reason-label">Reason</span>';
|
||||
html += '<span class="stage-reason-text">' + _esc(reason) + '</span>';
|
||||
html += '</div>';
|
||||
} else if (s.error_text) {
|
||||
html += '<div class="stage-error-line">' + _esc(s.error_text) + '</div>';
|
||||
}
|
||||
// Per-pattern meters for badblocks surface_validate, plus the
|
||||
// vital-signs row above (temp / speed / elapsed / ETA).
|
||||
if (s.stage_name === 'surface_validate' && s.bb_phase) {
|
||||
html += _drawerRenderBadblocksVitals(s, _DRAWER_LAST_DRIVE);
|
||||
html += _drawerRenderBadblocksMeters(s.bb_phase, s.bb_phase_pct);
|
||||
html += _drawerRenderBadblocksCaption(s.bb_phase, s.bb_phase_pct);
|
||||
html += _drawerRenderBadblocksHistory(s);
|
||||
}
|
||||
// Raw SSH log output (if available)
|
||||
if (s.log_text) {
|
||||
var logHtml = _esc(s.log_text)
|
||||
|
|
|
|||
|
|
@ -46,7 +46,13 @@
|
|||
{%- elif bi.state == 'passed' -%}
|
||||
<span class="chip chip-passed">Passed</span>
|
||||
{%- elif bi.state == 'failed' -%}
|
||||
<span class="chip chip-failed">Failed{% if bi.stage_name %} ({{ bi.stage_name | replace('_',' ') }}){% endif %}</span>
|
||||
{# Suppress the stage suffix for SMART + surface_validate stages.
|
||||
SMART has its own columns, and surface_validate is the dominant
|
||||
case so a redundant suffix just adds visual noise. The drawer
|
||||
shows the per-stage Reason for any digging. Keep the suffix for
|
||||
precheck / final_check since those are rare enough that the hint
|
||||
is helpful. #}
|
||||
<span class="chip chip-failed">Failed{% if bi.stage_name and bi.stage_name not in ('short_smart', 'long_smart', 'surface_validate') %} ({{ bi.stage_name | replace('_',' ') }}){% endif %}</span>
|
||||
{%- elif bi.state == 'cancelled' -%}
|
||||
<span class="chip chip-aborted">Cancelled</span>
|
||||
{%- elif bi.state == 'unknown' -%}
|
||||
|
|
@ -63,14 +69,14 @@
|
|||
<th class="col-check">
|
||||
<input type="checkbox" id="select-all-cb" class="drive-cb" title="Select all idle drives">
|
||||
</th>
|
||||
<th class="col-drive">Drive</th>
|
||||
<th class="col-serial">Serial</th>
|
||||
<th class="col-size">Size</th>
|
||||
<th class="col-temp">Temp</th>
|
||||
<th class="col-health">Health</th>
|
||||
<th class="col-smart">Short SMART</th>
|
||||
<th class="col-smart">Long SMART</th>
|
||||
<th class="col-burnin">Burn-In</th>
|
||||
<th class="col-drive sortable" data-sort-key="drive">Drive</th>
|
||||
<th class="col-serial sortable" data-sort-key="serial">Serial</th>
|
||||
<th class="col-size sortable" data-sort-key="size">Size</th>
|
||||
<th class="col-temp sortable" data-sort-key="temp">Temp</th>
|
||||
<th class="col-health sortable" data-sort-key="health">Health</th>
|
||||
<th class="col-smart sortable" data-sort-key="short">Short SMART</th>
|
||||
<th class="col-smart sortable" data-sort-key="long">Long SMART</th>
|
||||
<th class="col-burnin sortable" data-sort-key="burnin">Burn-In</th>
|
||||
<th class="col-actions">Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
|
|
@ -89,7 +95,19 @@
|
|||
{%- set smart_done = (drive.smart_short and drive.smart_short.state in ('passed','failed','aborted'))
|
||||
or (drive.smart_long and drive.smart_long.state in ('passed','failed','aborted')) %}
|
||||
{%- set can_reset = (bi_done or smart_done) and not bi_active and not short_busy and not long_busy and not pool_locked %}
|
||||
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}">
|
||||
{%- set short_state = drive.smart_short.state if drive.smart_short else 'idle' %}
|
||||
{%- set long_state = drive.smart_long.state if drive.smart_long else 'idle' %}
|
||||
{%- set burnin_state = drive.burnin.state if drive.burnin else '' %}
|
||||
<tr data-status="{{ drive.status }}" id="drive-{{ drive.id }}"
|
||||
data-sort-drive="{{ drive.devname }}"
|
||||
data-sort-serial="{{ (drive.serial or '') | lower }}"
|
||||
data-sort-size="{{ drive.size_bytes or 0 }}"
|
||||
data-sort-temp="{{ drive.temperature_c if drive.temperature_c is not none else '' }}"
|
||||
data-sort-health="{{ {'PASSED': 1, 'WARNING': 2, 'FAILED': 3, 'UNKNOWN': 4}.get(drive.smart_health, 9) }}"
|
||||
data-sort-short="{{ {'running': 1, 'failed': 2, 'aborted': 3, 'passed': 4, 'idle': 5}.get(short_state, 9) }}"
|
||||
data-sort-long="{{ {'running': 1, 'failed': 2, 'aborted': 3, 'passed': 4, 'idle': 5}.get(long_state, 9) }}"
|
||||
data-sort-burnin="{{ {'running': 1, 'queued': 2, 'failed': 3, 'unknown': 4, 'cancelled': 5, 'passed': 6}.get(burnin_state, 9) }}"
|
||||
>
|
||||
<td class="col-check">
|
||||
{%- if selectable %}
|
||||
<input type="checkbox" class="drive-checkbox" data-drive-id="{{ drive.id }}">
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@
|
|||
<div class="login-footer">
|
||||
Authentication is local to this dashboard. Forgot your password?
|
||||
Reset it via the container DB:<br>
|
||||
<code class="login-code">docker exec truenas-burnin python -m app.auth_cli reset <user></code>
|
||||
<code class="login-code">docker exec nas-burnin python -m app.auth_cli reset <user></code>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
|
|
|
|||
|
|
@ -343,7 +343,7 @@
|
|||
<div id="restart-banner" style="display:none;margin-top:12px;padding:12px 16px;background:rgba(255,170,0,0.12);border:1px solid var(--yellow);border-radius:8px;color:var(--text-strong)">
|
||||
<strong>⚠ Container restart required</strong> — system settings are saved but won't take effect until you restart the app container:
|
||||
<pre style="margin:8px 0 0;padding:8px 10px;background:var(--bg-card);border-radius:5px;font-size:12px;color:var(--text-strong);user-select:all">docker compose restart app</pre>
|
||||
<span style="font-size:11px;color:var(--text-muted)">Run this on <strong>maple.local</strong> from <code>~/docker/stacks/truenas-burnin/</code></span>
|
||||
<span style="font-size:11px;color:var(--text-muted)">Run this on <strong>maple.local</strong> from <code>~/docker/stacks/nas-burnin/</code></span>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@ async def _with_retry(
|
|||
)
|
||||
await asyncio.sleep(backoff)
|
||||
backoff *= 2
|
||||
# Unreachable: the loop either returns on success or re-raises on the
|
||||
# final attempt. The explicit raise makes that obvious to type-checkers
|
||||
# and to anyone reading top-down without tracing the control flow.
|
||||
raise RuntimeError("unreachable: _with_retry exhausted without returning")
|
||||
|
||||
|
||||
class TrueNASClient:
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ services:
|
|||
|
||||
app:
|
||||
build: .
|
||||
container_name: truenas-burnin
|
||||
container_name: nas-burnin
|
||||
ports:
|
||||
- "8084:8084"
|
||||
env_file: .env
|
||||
|
|
|
|||
44
scripts/run-tests.sh
Executable file
44
scripts/run-tests.sh
Executable file
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/env bash
# Run the test suite against the deployed container on maple.
#
# Tests aren't shipped in the prod image (Dockerfile only COPYs app/),
# so this tars them, copies them in, and runs unittest discover. Cleans
# up after itself so the running container doesn't accrue test files.
#
# Usage:
#   scripts/run-tests.sh                  # run full suite
#   scripts/run-tests.sh test_lifecycle   # run a specific module
#
# Environment overrides:
#   REMOTE_HOST   ssh host to run on           (default: maple)
#   CONTAINER     container to run tests in    (default: nas-burnin)
#
# Requires: ssh access to maple (configured in ~/.ssh/config).

set -euo pipefail

REMOTE_HOST="${REMOTE_HOST:-maple}"
CONTAINER="${CONTAINER:-nas-burnin}"
REMOTE_TMP="/tmp/tnb-tests-$$.tgz"
CONTAINER_TMP="/tmp/tnb-tests.tgz"
PATTERN="${1:-}"

# PATTERN is spliced into a command string that is evaluated by the remote
# login shell and again by `sh -c` inside the container. Restrict it to a
# dotted Python name so a stray quote or space cannot break that quoting.
if [[ -n "$PATTERN" && ! "$PATTERN" =~ ^[A-Za-z0-9_.]+$ ]]; then
    echo "error: test pattern must be a dotted module name (got: $PATTERN)" >&2
    exit 2
fi

# Resolve repo root so this works whether invoked from the root or scripts/
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

echo "→ Packing tests/ from $REPO_ROOT"
cd "$REPO_ROOT"
tar cz tests | ssh "$REMOTE_HOST" "cat > $REMOTE_TMP"

echo "→ Copying into container $CONTAINER"
# Remove the host-side tarball whether or not docker cp succeeds, then
# propagate docker cp's exit status so `set -e` still stops on failure.
ssh "$REMOTE_HOST" "docker cp $REMOTE_TMP $CONTAINER:$CONTAINER_TMP; rc=\$?; rm -f $REMOTE_TMP; exit \$rc"

if [ -n "$PATTERN" ]; then
    echo "→ Running tests matching: $PATTERN"
    RUN_CMD="cd /opt/app && tar xzf $CONTAINER_TMP && python -m unittest tests.$PATTERN -v"
else
    echo "→ Running full suite"
    RUN_CMD="cd /opt/app && tar xzf $CONTAINER_TMP && python -m unittest discover -s tests"
fi

# Always try to clean tests/ out of the container after the run, even on failure.
CLEANUP="rm -rf /opt/app/tests $CONTAINER_TMP"

# The test run's exit status is captured before cleanup and re-raised after
# it, so this script exits with the test result rather than rm's.
ssh "$REMOTE_HOST" "docker exec $CONTAINER sh -c '$RUN_CMD; rc=\$?; $CLEANUP; exit \$rc'"
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
[Unit]
|
||||
Description=Security scan of truenas-burnin (pip-audit + bandit + gitleaks)
|
||||
Description=Security scan of nas-burnin (pip-audit + bandit + gitleaks)
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
|
||||
|
|
@ -7,7 +7,7 @@ Wants=network-online.target
|
|||
Type=oneshot
|
||||
# Wire SECURITY_SCAN_WEBHOOK here if you want findings POSTed somewhere.
|
||||
# Environment=SECURITY_SCAN_WEBHOOK=https://chat.example/hooks/abc
|
||||
ExecStart=%h/docker/stacks/truenas-burnin/scripts/security-scan.sh
|
||||
ExecStart=%h/docker/stacks/nas-burnin/scripts/security-scan.sh
|
||||
# Tools cache + container pulls — give them headroom.
|
||||
TimeoutStartSec=600
|
||||
StandardOutput=journal
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env bash
|
||||
# Daily security scan of the deployed truenas-burnin source on maple.
|
||||
# Daily security scan of the deployed nas-burnin source on maple.
|
||||
# Mirrors the .forgejo/workflows/security-scan.yml CI pipeline so a finding
|
||||
# the runner-less forge would have flagged still surfaces here.
|
||||
#
|
||||
|
|
@ -18,8 +18,8 @@
|
|||
|
||||
set -uo pipefail
|
||||
|
||||
REPO_URL="${REPO_URL:-https://git.hellocomputer.xyz/brandon/truenas-burnin.git}"
|
||||
REPO="${REPO:-$HOME/scan-checkouts/truenas-burnin}"
|
||||
REPO_URL="${REPO_URL:-https://git.hellocomputer.xyz/brandon/nas-burnin.git}"
|
||||
REPO="${REPO:-$HOME/scan-checkouts/nas-burnin}"
|
||||
OUT_BASE="${OUT_BASE:-$HOME/security-scans}"
|
||||
DATE="$(date +%Y-%m-%d)"
|
||||
OUT_DIR="$OUT_BASE/scan-$DATE"
|
||||
|
|
@ -29,7 +29,7 @@ GITLEAKS_VERSION="${GITLEAKS_VERSION:-8.21.2}"
|
|||
mkdir -p "$OUT_DIR" "$(dirname "$REPO")"
|
||||
|
||||
# Maintain a dedicated checkout for scanning. The deploy at
|
||||
# ~/docker/stacks/truenas-burnin/ is just the bind-mounted source — no
|
||||
# ~/docker/stacks/nas-burnin/ is just the bind-mounted source — no
|
||||
# .git, no history — so gitleaks can't scan there. We keep a separate
|
||||
# clone, fast-forward it to origin/main each run.
|
||||
if [ ! -d "$REPO/.git" ]; then
|
||||
|
|
@ -58,7 +58,7 @@ date -Iseconds >> "$OUT_DIR/summary.txt"
|
|||
echo >> "$OUT_DIR/summary.txt"
|
||||
|
||||
# --- pip-audit against the lockfile in a throwaway container ------------
|
||||
# Previously we did `docker exec truenas-burnin pip install pip-audit`
|
||||
# Previously we did `docker exec nas-burnin pip install pip-audit`
|
||||
# which mutated the live production container with a transient package.
|
||||
# Now scan the lockfile in an ephemeral container — same coverage of
|
||||
# pinned versions + their transitives, no side effects on prod.
|
||||
|
|
@ -77,7 +77,7 @@ echo " exit=$PIPS ($OUT_DIR/pip-audit.txt)" | tee -a "$OUT_DIR/summary.txt"
|
|||
# forge HEAD and maple. B608 (SQL injection via dynamic strings) is
|
||||
# skipped globally: every dynamic SQL build in this codebase uses
|
||||
# bound parameters for data and structural placeholders only.
|
||||
DEPLOY_DIR="${DEPLOY_DIR:-$HOME/docker/stacks/truenas-burnin}"
|
||||
DEPLOY_DIR="${DEPLOY_DIR:-$HOME/docker/stacks/nas-burnin}"
|
||||
echo "--- bandit (deploy: $DEPLOY_DIR) ---" | tee -a "$OUT_DIR/summary.txt"
|
||||
docker run --rm \
|
||||
-v "$DEPLOY_DIR/app:/src:ro" \
|
||||
|
|
@ -87,19 +87,26 @@ docker run --rm \
|
|||
BANDITS=$?
|
||||
echo " exit=$BANDITS ($OUT_DIR/bandit.txt)" | tee -a "$OUT_DIR/summary.txt"
|
||||
|
||||
# --- mypy against the deploy dir (informational only) -------------------
|
||||
# --- mypy against the deploy dir (gating as of 1.0.0-40) ----------------
|
||||
# Type checker — surfaces None-handling bugs and missing-attribute errors
|
||||
# the runtime would have caught at the worst possible moment. Doesn't
|
||||
# count toward the failure exit-code sum until the codebase is annotated
|
||||
# enough to make findings actionable.
|
||||
echo "--- mypy (informational) ---" | tee -a "$OUT_DIR/summary.txt"
|
||||
# the runtime would have caught at the worst possible moment.
|
||||
#
|
||||
# Mount at /opt/app/app so internal `from . import X` resolves through
|
||||
# the `app` package (not `src`). Without this the relative imports inside
|
||||
# subpackages like burnin/ produce spurious "Module 'src' has no
|
||||
# attribute 'X'" errors that look like real bugs but are scan-env noise.
|
||||
#
|
||||
# Now counted toward TOTAL_EXIT — the codebase is fully clean under
|
||||
# `--ignore-missing-imports --no-strict-optional`. New errors fail the scan.
|
||||
echo "--- mypy ---" | tee -a "$OUT_DIR/summary.txt"
|
||||
docker run --rm \
|
||||
-v "$DEPLOY_DIR/app:/src:ro" \
|
||||
-v "$DEPLOY_DIR/app:/opt/app/app:ro" \
|
||||
-w /opt/app \
|
||||
python:3.12-slim sh -c \
|
||||
"pip install --quiet --no-cache-dir --disable-pip-version-check mypy 2>&1 | tail -3 && mypy --ignore-missing-imports --no-strict-optional /src" \
|
||||
"pip install --quiet --no-cache-dir --disable-pip-version-check mypy 2>&1 | tail -3 && mypy --ignore-missing-imports --no-strict-optional app" \
|
||||
> "$OUT_DIR/mypy.txt" 2>&1
|
||||
MYPY=$?
|
||||
echo " exit=$MYPY ($OUT_DIR/mypy.txt) — informational only" | tee -a "$OUT_DIR/summary.txt"
|
||||
echo " exit=$MYPY ($OUT_DIR/mypy.txt)" | tee -a "$OUT_DIR/summary.txt"
|
||||
|
||||
# --- gitleaks against the full git history ------------------------------
|
||||
echo "--- gitleaks ---" | tee -a "$OUT_DIR/summary.txt"
|
||||
|
|
@ -112,18 +119,19 @@ LEAKS=$?
|
|||
echo " exit=$LEAKS ($OUT_DIR/gitleaks.txt)" | tee -a "$OUT_DIR/summary.txt"
|
||||
|
||||
# --- summary + notification --------------------------------------------
|
||||
TOTAL_EXIT=$(( PIPS + BANDITS + LEAKS ))
|
||||
TOTAL_EXIT=$(( PIPS + BANDITS + MYPY + LEAKS ))
|
||||
{
|
||||
echo
|
||||
echo "Total findings exit-code sum: $TOTAL_EXIT"
|
||||
echo " pip-audit: $PIPS"
|
||||
echo " bandit: $BANDITS"
|
||||
echo " mypy: $MYPY"
|
||||
echo " gitleaks: $LEAKS"
|
||||
} >> "$OUT_DIR/summary.txt"
|
||||
|
||||
if [ "$TOTAL_EXIT" -ne 0 ]; then
|
||||
printf '%s — findings (pip-audit=%d bandit=%d gitleaks=%d) — see %s\n' \
|
||||
"$DATE" "$PIPS" "$BANDITS" "$LEAKS" "$OUT_DIR" >> "$SUMMARY"
|
||||
printf '%s — findings (pip-audit=%d bandit=%d mypy=%d gitleaks=%d) — see %s\n' \
|
||||
"$DATE" "$PIPS" "$BANDITS" "$MYPY" "$LEAKS" "$OUT_DIR" >> "$SUMMARY"
|
||||
# Hook for downstream notification — wire to your existing Mattermost
|
||||
# / Fastmail / webhook chain. Stays a no-op until SECURITY_SCAN_WEBHOOK
|
||||
# is set in the systemd unit's Environment=.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
[Unit]
|
||||
Description=Daily security scan of truenas-burnin
|
||||
Description=Daily security scan of nas-burnin
|
||||
Requires=security-scan.service
|
||||
|
||||
[Timer]
|
||||
|
|
|
|||
77
tests/test_badblocks_cmd.py
Normal file
77
tests/test_badblocks_cmd.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
"""Verifies the Spearfoot tunables (block_size, block_buffer, passes)
|
||||
actually thread through to the badblocks command line.
|
||||
|
||||
These three settings are exposed in Settings → Burn-in. Without a test,
|
||||
nothing catches if a future refactor drops one of the flags or reads
|
||||
from the wrong attribute. The defaults match the Spearfoot disk-burnin.sh
|
||||
community script; non-defaults can roughly halve runtime on multi-TB
|
||||
drives at the cost of more RAM.
|
||||
|
||||
Run inside the container image so app deps are present.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from app.burnin.stages import _build_badblocks_cmd
|
||||
from app.config import settings
|
||||
|
||||
|
||||
class TestBadblocksCmd(unittest.TestCase):
    """Check that the surface-validate tunables reach the badblocks command.

    Each test may overwrite the three tunables on the shared ``settings``
    object; they are saved before and restored after every test.
    """

    # Settings attributes the tests are allowed to overwrite.
    _TUNABLES = (
        "surface_validate_block_size",
        "surface_validate_block_buffer",
        "surface_validate_passes",
    )

    def setUp(self):
        # Remember the current values so a test can change them freely
        # without affecting sibling tests or the running process.
        self._snap = tuple(getattr(settings, name) for name in self._TUNABLES)

    def tearDown(self):
        for name, saved in zip(self._TUNABLES, self._snap):
            setattr(settings, name, saved)

    def test_defaults_match_spearfoot(self):
        """With untouched settings the command carries -b 4096 -c 64 -p 1.

        Those are the values the disk-burnin.sh community script recommends
        for HDDs.
        """
        # NOTE(review): this reads whatever `settings` currently holds, so it
        # presumably assumes the environment has not overridden the
        # defaults — confirm.
        cmd = _build_badblocks_cmd("sda")
        for expected in ("-b 4096", "-c 64", "-p 1", "/dev/sda"):
            self.assertIn(expected, cmd)
        # Destructive write+verify mode must always be present — anything
        # else (read-only, non-destructive) defeats the purpose of burn-in.
        self.assertIn("-wsv", cmd)

    def test_tunables_propagate_to_cmd(self):
        """Operator-set values must end up in the shell command.

        Examples: a 3-pass burn-in on a suspect drive, or 8 KiB blocks for
        a faster scan on a very large HDD.
        """
        overrides = {
            "surface_validate_block_size": 8192,
            "surface_validate_block_buffer": 128,
            "surface_validate_passes": 3,
        }
        for name, value in overrides.items():
            setattr(settings, name, value)
        cmd = _build_badblocks_cmd("sdb")
        for expected in ("-b 8192", "-c 128", "-p 3"):
            self.assertIn(expected, cmd)
        # The default values must not leak through alongside the overrides.
        for stale in ("-b 4096", "-c 64"):
            self.assertNotIn(stale, cmd)
        self.assertIn("/dev/sdb", cmd)

    def test_pid_capture_wrapper_intact(self):
        """The ``sh -c 'echo PID:$$; exec ...'`` wrapper must stay in place.

        It is what makes an out-of-band ``kill -9`` possible from a fresh
        SSH session; without it a cancel would not actually stop the test.
        """
        cmd = _build_badblocks_cmd("sda")
        wrapper_prefix = "sh -c 'echo PID:$$; exec badblocks"
        self.assertTrue(cmd.startswith(wrapper_prefix))
        self.assertTrue(cmd.endswith("'"))
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
125
tests/test_badblocks_progress.py
Normal file
125
tests/test_badblocks_progress.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""Verifies _BadblocksProgress translates per-phase badblocks output
|
||||
into a monotonic 0-99% overall progress.
|
||||
|
||||
`badblocks -w` cycles through 4 patterns × {write, verify} = 8 phases.
|
||||
Each phase prints "XX% done" relative to its own 0-100 range. Without
|
||||
this translation the dashboard appeared to "rewind" every ~2 hours
|
||||
when a new phase started — and two drives racing each other could
|
||||
look 4× apart in displayed progress despite identical hardware.
|
||||
|
||||
Run inside the container image so app deps are present.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from app.burnin.stages import _BadblocksProgress
|
||||
|
||||
|
||||
class TestBadblocksProgress(unittest.TestCase):
    """Exercises ``_BadblocksProgress``: phase tracking from badblocks
    output lines and the derived overall percentage across the 8 phases
    (4 patterns × write/verify).

    Table-driven tests wrap each row in ``subTest`` so that one failing
    row does not hide the results of the rows after it.
    """

    def test_default_phase_one(self):
        """Before any header, treat as start of pattern-1 write."""
        p = _BadblocksProgress()
        self.assertEqual(p.phase, 1)
        self.assertEqual(p.overall_pct, 0)

    def test_pattern_headers_set_phase(self):
        """0xaa→1, 0x55→3, 0xff→5, 0x00→7 (write phases)."""
        # One tracker is fed all four headers in order, mirroring how a
        # single badblocks run emits them.
        p = _BadblocksProgress()
        for header, want in [
            ("Testing with pattern 0xaa: ", 1),
            ("Testing with pattern 0x55: ", 3),
            ("Testing with pattern 0xff: ", 5),
            ("Testing with pattern 0x00: ", 7),
        ]:
            with self.subTest(header=header):
                p.update(header)
                self.assertEqual(p.phase, want, f"after {header!r}")

    def test_verify_advances_to_next_phase(self):
        """`Reading and comparing` after `Testing with pattern 0x55`
        (phase 3) advances to phase 4."""
        p = _BadblocksProgress()
        p.update("Testing with pattern 0x55: 100.00% done")
        self.assertEqual(p.phase, 3)
        p.update("Reading and comparing: 0.00% done")
        self.assertEqual(p.phase, 4)

    def test_overall_pct_at_phase_boundaries(self):
        """Verify the math at each phase boundary: phase N at 100% =
        N * 12.5% overall (clipped to 99 at the end)."""
        cases = [
            (1, 0.0, 0),      # start of run
            (1, 100.0, 12),   # 100/800 = 12.5
            (2, 100.0, 25),   # 200/800
            (4, 100.0, 50),   # 400/800
            (7, 100.0, 87),   # 700/800
            (8, 100.0, 99),   # 800/800 → clipped to 99
        ]
        for phase, phase_pct, want in cases:
            with self.subTest(phase=phase, phase_pct=phase_pct):
                # Fresh tracker per row; state is set directly rather
                # than through update() to isolate the arithmetic.
                p = _BadblocksProgress()
                p.phase = phase
                p.phase_pct = phase_pct
                self.assertEqual(
                    p.overall_pct, want,
                    f"phase={phase} phase_pct={phase_pct}",
                )

    def test_realistic_sequence(self):
        """End-to-end: feed a synthetic badblocks output stream and
        check the overall percent stays monotonically non-decreasing."""
        lines = [
            "Testing with pattern 0xaa: ",
            "10.00% done, 1:00:00 elapsed. (0/0/0 errors)",
            "50.00% done, 5:00:00 elapsed. (0/0/0 errors)",
            "99.99% done, 10:00:00 elapsed. (0/0/0 errors)",
            "Reading and comparing: ",
            "0.00% done, 10:00:01 elapsed. (0/0/0 errors)",
            "50.00% done, 12:30:00 elapsed. (0/0/0 errors)",
            "Testing with pattern 0x55: ",
            "0.00% done, 15:00:00 elapsed. (0/0/0 errors)",
            "50.00% done, 17:30:00 elapsed. (0/0/0 errors)",
        ]
        p = _BadblocksProgress()
        seen = []
        for line in lines:
            p.update(line)
            seen.append(p.overall_pct)
        # A non-decreasing series is equal to its own sorted copy.
        self.assertEqual(
            seen, sorted(seen),
            f"progress went backwards: {seen}",
        )
        # Sanity: by the time we're halfway through pattern-2 write
        # (phase 3, 50%), we should report ((3-1)*100 + 50) / 8 = 31%.
        self.assertEqual(seen[-1], 31)

    def test_drives_at_different_phases_show_different_overall(self):
        """The original bug: two drives at the same per-phase 60%
        but different phases used to look identical (both '60%').
        Now they correctly diverge."""
        slow = _BadblocksProgress()
        slow.update("Testing with pattern 0xaa: ")
        slow.update("60.00% done")

        fast = _BadblocksProgress()
        fast.update("Testing with pattern 0xaa: ")
        fast.update("99.99% done")
        fast.update("Reading and comparing: ")
        fast.update("60.00% done")

        # slow: 60/800 = 7%; fast: (1*100 + 60)/800 = 20%
        self.assertEqual(slow.overall_pct, 7)
        self.assertEqual(fast.overall_pct, 20)

    def test_unknown_pattern_does_not_crash(self):
        """An unrecognized pattern (e.g. badblocks future versions or
        custom patterns) just leaves phase unchanged."""
        p = _BadblocksProgress()
        p.update("Testing with pattern 0xab: ")
        # phase stays at the default 1
        self.assertEqual(p.phase, 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
100
tests/test_bb_phase_persistence.py
Normal file
100
tests/test_bb_phase_persistence.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
"""Verifies _update_stage_bb_phase actually writes to burnin_stages
|
||||
and the migration adds the columns idempotently.
|
||||
|
||||
The drive-drawer's 4-meter UI depends on these columns being populated
|
||||
on every parser tick. If a future refactor drops the call or breaks
|
||||
the migration, this test catches it before users see the meters
|
||||
go blank.
|
||||
|
||||
Run inside the container image so app deps are present.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import aiosqlite
|
||||
|
||||
|
||||
async def _setup_db_with_stage() -> str:
    """Create a throwaway SQLite database seeded with one running stage.

    Points ``settings.db_path`` at a fresh temp file, runs ``init_db()``,
    then inserts one drive, one running ``surface`` job and one running
    ``surface_validate`` stage.

    Returns:
        Path of the temp database file. The caller owns it and must
        delete it when done.

    Raises:
        Whatever ``init_db()`` or the seed statements raise. In that case
        the temp file is removed and ``settings.db_path`` is put back to
        its previous value before the exception propagates, so a failed
        setup leaves nothing behind.
    """
    from app.config import settings

    fd, path = tempfile.mkstemp(suffix=".db")
    os.close(fd)
    previous_db_path = settings.db_path
    settings.db_path = path

    try:
        # Imported only after db_path has been repointed, keeping the
        # original ordering — presumably so app.database picks up the
        # temp path; verify against app/database.py.
        from app.database import init_db
        await init_db()

        async with aiosqlite.connect(path) as db:
            await db.execute(
                "INSERT INTO drives "
                "(truenas_disk_id, devname, serial, model, size_bytes, "
                " temperature_c, smart_health, last_seen_at, last_polled_at) "
                "VALUES ('id-1', 'sda', 'SER1', 'TestModel', 14000000000000, "
                " 30, 'PASSED', '2026-05-09T00:00:00+00:00', "
                " '2026-05-09T00:00:00+00:00')"
            )
            # drive_id=1 / burnin_job_id=1 below assume the rows just
            # inserted into the fresh database received id 1.
            await db.execute(
                "INSERT INTO burnin_jobs "
                "(drive_id, profile, state, operator, created_at) "
                "VALUES (1, 'surface', 'running', 'op', "
                " '2026-05-09T00:00:00+00:00')"
            )
            await db.execute(
                "INSERT INTO burnin_stages "
                "(burnin_job_id, stage_name, state) "
                "VALUES (1, 'surface_validate', 'running')"
            )
            await db.commit()
    except BaseException:
        # Setup failed: the caller never receives `path`, so nobody else
        # can clean up. Undo both side effects, then re-raise unchanged.
        settings.db_path = previous_db_path
        try:
            os.unlink(path)
        except OSError:
            pass  # best effort; the original error is what matters
        raise
    return path
|
||||
|
||||
|
||||
class TestBBPhasePersistence(unittest.IsolatedAsyncioTestCase):
    """Checks the ``bb_phase`` / ``bb_phase_pct`` columns on
    ``burnin_stages``: that they exist after ``init_db()`` and that
    ``_update_stage_bb_phase`` writes and overwrites them.

    Each test gets its own temp database from ``_setup_db_with_stage()``.
    """

    async def asyncSetUp(self):
        from app.config import settings

        # _setup_db_with_stage() repoints the process-wide settings.db_path
        # at a temp file. Register the restore first so it runs even if
        # the setup below fails, and other tests in the same process never
        # see a path to a deleted file.
        self.addCleanup(setattr, settings, "db_path", settings.db_path)
        self.path = await _setup_db_with_stage()

    async def asyncTearDown(self):
        try:
            os.unlink(self.path)
        except OSError:
            pass  # best effort: the temp file may already be gone

    async def _fetch_phase_row(self):
        """Return ``(bb_phase, bb_phase_pct)`` for the seeded stage row,
        or ``None`` if the row is missing."""
        async with aiosqlite.connect(self.path) as db:
            cur = await db.execute(
                "SELECT bb_phase, bb_phase_pct FROM burnin_stages "
                "WHERE burnin_job_id=1 AND stage_name='surface_validate'"
            )
            return await cur.fetchone()

    async def test_columns_exist_after_init(self):
        """init_db() must leave both phase columns on burnin_stages."""
        async with aiosqlite.connect(self.path) as db:
            cur = await db.execute("PRAGMA table_info(burnin_stages)")
            # Column 1 of each PRAGMA table_info row is the column name.
            cols = {r[1] for r in await cur.fetchall()}
        self.assertIn("bb_phase", cols)
        self.assertIn("bb_phase_pct", cols)

    async def test_update_writes_phase_and_pct(self):
        """A single update is visible through an independent connection."""
        from app.burnin._common import _update_stage_bb_phase
        await _update_stage_bb_phase(1, "surface_validate", 3, 47.5)
        row = await self._fetch_phase_row()
        # Fail with a clear message rather than a TypeError on row[0].
        self.assertIsNotNone(row, "seeded surface_validate stage row is missing")
        self.assertEqual(row[0], 3)
        self.assertAlmostEqual(row[1], 47.5)

    async def test_update_overwrites(self):
        """Each tick should replace the previous value, not accumulate."""
        from app.burnin._common import _update_stage_bb_phase
        await _update_stage_bb_phase(1, "surface_validate", 1, 10.0)
        await _update_stage_bb_phase(1, "surface_validate", 2, 80.0)
        row = await self._fetch_phase_row()
        self.assertIsNotNone(row, "seeded surface_validate stage row is missing")
        self.assertEqual(row[0], 2)
        self.assertAlmostEqual(row[1], 80.0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Loading…
Add table
Reference in a new issue