From fc33c0d11eef3f24e48c72f6d3fddf40d814e298 Mon Sep 17 00:00:00 2001 From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com> Date: Tue, 24 Feb 2026 08:13:21 -0500 Subject: [PATCH] docs: update CLAUDE.md for Stage 7; bump version to 1.0.0-7 Documents all Stage 7 features: SSH burn-in architecture, SMART attr monitoring, drive reset, version badge, stats polish, new env vars, new API routes, and real-TrueNAS cutover steps. Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 140 ++++++++++++++++++++++++++++++++++++++++++++------ app/config.py | 2 +- 2 files changed, 124 insertions(+), 18 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 6ba9618..6702068 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,7 +1,7 @@ # TrueNAS Burn-In Dashboard — Project Context > Drop this file in any new Claude session to resume work with full context. -> Last updated: 2026-02-22 (Stage 6d) +> Last updated: 2026-02-24 (Stage 7) --- @@ -28,7 +28,7 @@ against a TrueNAS CORE instance. Deployed on **maple.local** (10.0.0.138). 
| 6b | UX overhaul (stats bar, alerts, batch, notifications, location, print, analytics) | ✅ | | 6c | Settings overhaul (editable form, runtime store, SMTP fix, stage selection) | ✅ | | 6d | Cancel SMART tests, Cancel All burn-ins, drag-to-reorder stages in modals | ✅ | -| 7 | Cut to real TrueNAS | 🔲 future | +| 7 | SSH burn-in execution, SMART attr monitoring, drive reset, version badge, stats polish | ✅ | --- @@ -52,6 +52,7 @@ truenas-burnin/ ├── database.py # schema, migrations, init_db(), get_db() ├── models.py # Pydantic v2 models; StartBurninRequest has run_surface/run_short/run_long + profile property ├── settings_store.py # runtime settings store — persists to /data/settings_overrides.json + ├── ssh_client.py # asyncssh client: smartctl parsing, badblocks streaming, test_connection ├── truenas.py # httpx async client with retry (lambda factory pattern) ├── poller.py # poll loop, SSE pub/sub, stale detection, stuck-job check ├── burnin.py # orchestrator, semaphore, stages, check_stuck_jobs() @@ -72,8 +73,8 @@ truenas-burnin/ ├── history.html ├── job_detail.html # + Print/Export button ├── audit.html # audit event log - ├── stats.html # analytics: pass rate by model, daily activity - ├── settings.html # editable 2-col form: SMTP (left) + Notifications/Behavior/Webhook (right) + ├── stats.html # analytics: pass rate by model, daily activity, duration by size, failures by stage + ├── settings.html # editable 2-col form: SMTP + SSH (left) + Notifications/Behavior/Webhook/System (right) ├── job_print.html # print view with client-side QR code (qrcodejs CDN) └── components/ ├── drives_table.html # checkboxes, elapsed time, location inline edit @@ -129,10 +130,19 @@ burnin_jobs (id, drive_id FK, profile, state CHECK(queued/running/passed/ -- burnin_stages: one row per stage per job burnin_stages (id, burnin_job_id FK, stage_name, state, percent, - 
started_at, finished_at, error_text) + started_at, finished_at, error_text, + log_text TEXT, -- raw smartctl/badblocks SSH output + bad_blocks INTEGER) -- bad sector count from surface_validate -- audit_events: append-only log audit_events (id, event_type, drive_id, job_id, operator, note, created_at) + +-- drives columns added by migrations: +-- location TEXT, notes TEXT (Stage 6b) +-- smart_attrs TEXT -- JSON blob of last SMART attribute snapshot (Stage 7) + +-- smart_tests columns added by migrations: +-- raw_output TEXT -- raw smartctl -a output (Stage 7) ``` --- @@ -194,6 +204,15 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list. | `SMTP_ALERT_ON_FAIL` | `true` | Immediate email when a job fails | | `SMTP_ALERT_ON_PASS` | `false` | Immediate email when a job passes | | `WEBHOOK_URL` | `` | POST JSON on burnin_passed/burnin_failed. Works with ntfy, Slack, Discord, n8n | +| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) | +| `TEMP_CRIT_C` | `55` | Temperature critical threshold — precheck fails at or above this | +| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks allowed before surface_validate fails (0 = any bad = fail) | +| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge | +| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) | +| `SSH_PORT` | `22` | TrueNAS SSH port | +| `SSH_USER` | `root` | TrueNAS SSH username | +| `SSH_PASSWORD` | `` | TrueNAS SSH password (use key instead for production) | +| `SSH_KEY` | `` | TrueNAS SSH private key PEM string — loaded in-memory, never written to disk | --- @@ -305,27 +324,114 @@ async def burnin_get(job_id: int, ...): ... 
| First row clipped after Stage 6b | Stats bar added 70px but max-height not updated | `max-height: calc(100vh - 205px)` | | SMTP "Connection unexpectedly closed" | `_send_email` used `settings.smtp_port` (587 default) even in SSL mode | Derive port from mode via `_MODE_PORTS` dict; SSL→465, STARTTLS→587, Plain→25 | | SSL mode missing EHLO | `smtplib.SMTP_SSL` was created without calling `ehlo()` | Added `server.ehlo()` after both SSL and STARTTLS connections | +| `profile` NameError in `_execute_stages` | `_execute_stages` called `_recalculate_progress(job_id, profile)` but `profile` not in scope | Changed to `_recalculate_progress(job_id)` — profile param was unused | +| `app_version` Jinja2 global rendered as function | Set `templates.env.globals["app_version"] = _get_app_version` (callable) | Set to the static string value directly: `= _settings.app_version` | --- -## Stage 7 — Cutting to Real TrueNAS (TODO) +## Feature Reference (Stage 7) + +### SSH Burn-In Architecture + +`ssh_client.py` provides an optional SSH execution layer. When `SSH_HOST` is set (and key or password is present), all burn-in stages run real commands over SSH against TrueNAS. When `SSH_HOST` is empty, stages fall back to mock/REST simulation. 
+ +**Dual-mode dispatch** — each stage checks `ssh_client.is_configured()`: +```python +if ssh_client.is_configured(): + # run smartctl / badblocks over SSH +else: + # simulate with REST API or timed sleep (mock mode) +``` + +**SSH client capabilities** (`ssh_client.py`): +- `test_connection()` → `{"ok": bool, "error": str}` — used by Test SSH button +- `get_smart_attributes(devname)` → parse `smartctl -a`, return `{health, raw_output, attributes, warnings, failures}` +- `start_smart_test(devname, test_type)` → `smartctl -t short|long /dev/{devname}` +- `poll_smart_progress(devname)` → `smartctl -a` during test; returns `{state, percent_remaining, output}` +- `abort_smart_test(devname)` → `smartctl -X /dev/{devname}` +- `run_badblocks(devname, on_progress, cancelled_fn)` → streams `badblocks -wsv -b 4096 -p 1`; counts bad sectors from stdout (digit-only lines) + +**Key auth pattern** — key is stored as PEM string in settings, never written to disk: +```python +asyncssh.connect(host, ..., client_keys=[asyncssh.import_private_key(pem_str)], known_hosts=None) +``` + +**badblocks streaming** — uses `asyncssh.create_process()` with parallel stdout/stderr draining via `asyncio.gather`. Progress updates written to DB every 20 lines to avoid excessive writes. + +### SMART Attribute Monitoring + +Monitored attributes and their thresholds: + +| ID | Name | Any non-zero → | +|----|------|----------------| +| 5 | Reallocated_Sector_Ct | FAIL | +| 10 | Spin_Retry_Count | WARN | +| 188 | Command_Timeout | WARN | +| 197 | Current_Pending_Sector | FAIL | +| 198 | Offline_Uncorrectable | FAIL | +| 199 | UDMA_CRC_Error_Count | WARN | + +SMART attrs stored as JSON blob in `drives.smart_attrs`. Updated by `final_check` stage (SSH mode) or `short_smart`/`long_smart` REST mode. Displayed in drive drawer with colour-coded table + raw `smartctl -a` output. 
+ +### Drive Reset Action + +- `POST /api/v1/drives/{drive_id}/reset` — clears `smart_tests` rows to idle, clears `drives.smart_attrs`, writes audit event, notifies SSE subscribers +- Button appears in action column when `can_reset` = drive has no active burn-in AND has any non-idle smart state or smart attrs +- Burn-in history (burnin_jobs, burnin_stages) is preserved — reset only affects SMART test state + +### New Routes (Stage 7) + +| Method | Path | Description | +|--------|------|-------------| +| `POST` | `/api/v1/drives/{id}/reset` | Reset SMART state and attrs for a drive | +| `POST` | `/api/v1/settings/test-ssh` | Test SSH connection with current SSH settings | +| `GET` | `/api/v1/updates/check` | Check for latest release from Forgejo git.hellocomputer.xyz | + +### Check for Updates + +Settings page has a "Check for Updates" button that fetches: +``` +GET https://git.hellocomputer.xyz/api/v1/repos/brandon/truenas-burnin/releases/latest +``` +Compares tag name against `settings.app_version`; shows "up to date" or "v{tag} available". + +### Version Badge + +`app_version` set as Jinja2 global in `renderer.py`: +```python +templates.env.globals["app_version"] = _settings.app_version +``` +Displayed in header as `v{app_version}` (right side, muted). + +### Configurable Thresholds + +`renderer.py` `_temp_class` now reads from settings instead of hardcoded values: +```python +if temp >= settings.temp_crit_c: return "temp-crit" +if temp >= settings.temp_warn_c: return "temp-warn" +``` +`precheck` stage fails if `temperature_c >= settings.temp_crit_c`. + +Surface validate fails if `bad_blocks > settings.bad_block_threshold` (default 0 = any bad sector = fail). + +### Cutting to Real TrueNAS (Next Steps) When ready to test against a real TrueNAS CORE box: -1. 
In `.env` on maple.local, set: - ```env - TRUENAS_BASE_URL=https://10.0.0.203 # or whatever your TrueNAS IP is - TRUENAS_API_KEY=your-real-key-here - TRUENAS_VERIFY_TLS=false # unless you have a valid cert - ``` -2. Comment out `mock-truenas` service in `docker-compose.yml` (or leave it running — harmless) -3. Verify TrueNAS CORE v2.0 API contract matches what `truenas.py` expects: +1. In Settings (or `.env`), set: + - **TrueNAS URL** → `https://10.0.0.X` (real IP) + - **API Key** → real API key + - **SSH Host** → same IP as TrueNAS + - **SSH User** → `root` (or sudoer with smartctl/badblocks access) + - **SSH Key** → paste PEM key into textarea +2. Click **Test SSH Connection** to verify before starting a burn-in +3. TrueNAS CORE uses `ada0`, `da0` device names (not `sda`). Mock drive names will differ. +4. Delete `app.db` before first real poll to clear mock drive rows +5. Comment out `mock-truenas` service in `docker-compose.yml` (optional — harmless to leave) +6. Verify TrueNAS CORE v2.0 REST API: - `GET /api/v2.0/disk` returns list with `name`, `serial`, `model`, `size`, `temperature` - `GET /api/v2.0/core/get_jobs` with filter `[["method","=","smart.test"]]` - `POST /api/v2.0/smart/test` accepts `{disks: [devname], type: "SHORT"|"LONG"}` -4. Check that disk names match expected format (TrueNAS CORE uses `ada0`, `da0`, etc. — not `sda`) - You may need to update mock drive names back or adjust poller logic -5. Delete `app.db` to clear mock drive rows before first real poll --- diff --git a/app/config.py b/app/config.py index 5c9b10a..7c1f3cf 100644 --- a/app/config.py +++ b/app/config.py @@ -68,7 +68,7 @@ class Settings(BaseSettings): ssh_key: str = "" # PEM private key content (paste full key including headers) # Application version — used by the /api/v1/updates/check endpoint - app_version: str = "1.0.0-6d" + app_version: str = "1.0.0-7" settings = Settings()