From fc33c0d11eef3f24e48c72f6d3fddf40d814e298 Mon Sep 17 00:00:00 2001
From: Brandon Walter <51866976+echoparkbaby@users.noreply.github.com>
Date: Tue, 24 Feb 2026 08:13:21 -0500
Subject: [PATCH] docs: update CLAUDE.md for Stage 7; bump version to 1.0.0-7

Documents all Stage 7 features: SSH burn-in architecture, SMART attr
monitoring, drive reset, version badge, stats polish, new env vars,
new API routes, and real-TrueNAS cutover steps.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md     | 140 ++++++++++++++++++++++++++++++++++++++++++++------
 app/config.py |   2 +-
 2 files changed, 124 insertions(+), 18 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 6ba9618..6702068 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,7 +1,7 @@
 # TrueNAS Burn-In Dashboard — Project Context
 
 > Drop this file in any new Claude session to resume work with full context.
-> Last updated: 2026-02-22 (Stage 6d)
+> Last updated: 2026-02-24 (Stage 7)
 
 ---
 
@@ -28,7 +28,7 @@ against a TrueNAS CORE instance. Deployed on **maple.local** (10.0.0.138).
 | 6b | UX overhaul (stats bar, alerts, batch, notifications, location, print, analytics) | ✅ |
 | 6c | Settings overhaul (editable form, runtime store, SMTP fix, stage selection) | ✅ |
 | 6d | Cancel SMART tests, Cancel All burn-ins, drag-to-reorder stages in modals | ✅ |
-| 7 | Cut to real TrueNAS | 🔲 future |
+| 7 | SSH burn-in execution, SMART attr monitoring, drive reset, version badge, stats polish | ✅ |
 
 ---
 
@@ -52,6 +52,7 @@ truenas-burnin/
     ├── database.py             # schema, migrations, init_db(), get_db()
     ├── models.py               # Pydantic v2 models; StartBurninRequest has run_surface/run_short/run_long + profile property
     ├── settings_store.py       # runtime settings store — persists to /data/settings_overrides.json
+    ├── ssh_client.py           # asyncssh client: smartctl parsing, badblocks streaming, test_connection
     ├── truenas.py              # httpx async client with retry (lambda factory pattern)
     ├── poller.py               # poll loop, SSE pub/sub, stale detection, stuck-job check
     ├── burnin.py               # orchestrator, semaphore, stages, check_stuck_jobs()
@@ -72,8 +73,8 @@ truenas-burnin/
         ├── history.html
         ├── job_detail.html     # + Print/Export button
         ├── audit.html          # audit event log
-        ├── stats.html          # analytics: pass rate by model, daily activity
-        ├── settings.html       # editable 2-col form: SMTP (left) + Notifications/Behavior/Webhook (right)
+        ├── stats.html          # analytics: pass rate by model, daily activity, duration by size, failures by stage
+        ├── settings.html       # editable 2-col form: SMTP + SSH (left) + Notifications/Behavior/Webhook/System (right)
         ├── job_print.html      # print view with client-side QR code (qrcodejs CDN)
         └── components/
             ├── drives_table.html   # checkboxes, elapsed time, location inline edit
@@ -129,10 +130,19 @@ burnin_jobs (id, drive_id FK, profile, state CHECK(queued/running/passed/
 
 -- burnin_stages: one row per stage per job
 burnin_stages (id, burnin_job_id FK, stage_name, state, percent,
-               started_at, finished_at, error_text)
+               started_at, finished_at, error_text,
+               log_text TEXT,        -- raw smartctl/badblocks SSH output
+               bad_blocks INTEGER)   -- bad sector count from surface_validate
 
 -- audit_events: append-only log
 audit_events (id, event_type, drive_id, job_id, operator, note, created_at)
+
+-- drives columns added by migrations:
+--   location TEXT, notes TEXT (Stage 6b)
+--   smart_attrs TEXT            -- JSON blob of last SMART attribute snapshot (Stage 7)
+
+-- smart_tests columns added by migrations:
+--   raw_output TEXT             -- raw smartctl -a output (Stage 7)
 ```
 
 ---
@@ -194,6 +204,15 @@ All read from `.env` via `pydantic-settings`. See `.env.example` for full list.
 | `SMTP_ALERT_ON_FAIL` | `true` | Immediate email when a job fails |
 | `SMTP_ALERT_ON_PASS` | `false` | Immediate email when a job passes |
 | `WEBHOOK_URL` | `` | POST JSON on burnin_passed/burnin_failed. Works with ntfy, Slack, Discord, n8n |
+| `TEMP_WARN_C` | `46` | Temperature warning threshold (°C) |
+| `TEMP_CRIT_C` | `55` | Temperature critical threshold (°C) — precheck fails at or above this |
+| `BAD_BLOCK_THRESHOLD` | `0` | Max bad blocks tolerated; surface_validate fails when the count exceeds this (0 = any bad block fails) |
+| `APP_VERSION` | `1.0.0-7` | Displayed in header version badge |
+| `SSH_HOST` | `` | TrueNAS SSH hostname/IP — empty disables SSH mode (uses mock/REST) |
+| `SSH_PORT` | `22` | TrueNAS SSH port |
+| `SSH_USER` | `root` | TrueNAS SSH username |
+| `SSH_PASSWORD` | `` | TrueNAS SSH password (use key instead for production) |
+| `SSH_KEY` | `` | TrueNAS SSH private key PEM string — loaded in-memory, never written to disk |
 
 ---
 
@@ -305,27 +324,114 @@ async def burnin_get(job_id: int, ...): ...
 | First row clipped after Stage 6b | Stats bar added 70px but max-height not updated | `max-height: calc(100vh - 205px)` |
 | SMTP "Connection unexpectedly closed" | `_send_email` used `settings.smtp_port` (587 default) even in SSL mode | Derive port from mode via `_MODE_PORTS` dict; SSL→465, STARTTLS→587, Plain→25 |
 | SSL mode missing EHLO | `smtplib.SMTP_SSL` was created without calling `ehlo()` | Added `server.ehlo()` after both SSL and STARTTLS connections |
+| `profile` NameError in `_execute_stages` | `_execute_stages` called `_recalculate_progress(job_id, profile)` but `profile` not in scope | Changed to `_recalculate_progress(job_id)` — profile param was unused |
+| `app_version` Jinja2 global rendered as function | `templates.env.globals["app_version"]` was set to `_get_app_version` (the callable itself) | Set to the static string value directly: `= _settings.app_version` |
 
 ---
 
-## Stage 7 — Cutting to Real TrueNAS (TODO)
+## Feature Reference (Stage 7)
+
+### SSH Burn-In Architecture
+
+`ssh_client.py` provides an optional SSH execution layer. When `SSH_HOST` is set (and key or password is present), all burn-in stages run real commands over SSH against TrueNAS. When `SSH_HOST` is empty, stages fall back to mock/REST simulation.
+
+**Dual-mode dispatch** — each stage checks `ssh_client.is_configured()`:
+```python
+if ssh_client.is_configured():
+    # run smartctl / badblocks over SSH
+else:
+    # simulate with REST API or timed sleep (mock mode)
+```
+
+**SSH client capabilities** (`ssh_client.py`):
+- `test_connection()` → `{"ok": bool, "error": str}` — used by Test SSH button
+- `get_smart_attributes(devname)` → parse `smartctl -a`, return `{health, raw_output, attributes, warnings, failures}`
+- `start_smart_test(devname, test_type)` → `smartctl -t short|long /dev/{devname}`
+- `poll_smart_progress(devname)` → `smartctl -a` during test; returns `{state, percent_remaining, output}`
+- `abort_smart_test(devname)` → `smartctl -X /dev/{devname}`
+- `run_badblocks(devname, on_progress, cancelled_fn)` → streams `badblocks -wsv -b 4096 -p 1`; counts bad sectors from stdout (digit-only lines)
+
+**Key auth pattern** — key is stored as PEM string in settings, never written to disk (note: `known_hosts=None` disables SSH host key verification):
+```python
+asyncssh.connect(host, ..., client_keys=[asyncssh.import_private_key(pem_str)], known_hosts=None)
+```
+
+**badblocks streaming** — uses asyncssh's `create_process()` on the open connection, draining stdout and stderr in parallel via `asyncio.gather`. Progress updates are written to the DB every 20 lines to avoid excessive writes.
+
+### SMART Attribute Monitoring
+
+Monitored attributes and their thresholds:
+
+| ID | Name | Any non-zero → |
+|----|------|----------------|
+| 5 | Reallocated_Sector_Ct | FAIL |
+| 10 | Spin_Retry_Count | WARN |
+| 188 | Command_Timeout | WARN |
+| 197 | Current_Pending_Sector | FAIL |
+| 198 | Offline_Uncorrectable | FAIL |
+| 199 | UDMA_CRC_Error_Count | WARN |
+
+SMART attributes are stored as a JSON blob in `drives.smart_attrs`. The blob is updated by the `final_check` stage (SSH mode) or by `short_smart`/`long_smart` (REST mode), and is displayed in the drive drawer as a colour-coded table plus the raw `smartctl -a` output.
+
+### Drive Reset Action
+
+- `POST /api/v1/drives/{drive_id}/reset` — resets `smart_tests` rows to idle, clears `drives.smart_attrs`, writes an audit event, and notifies SSE subscribers
+- Button appears in the action column when `can_reset` is true, i.e. the drive has no active burn-in AND has either a non-idle SMART state or stored SMART attrs
+- Burn-in history (burnin_jobs, burnin_stages) is preserved — reset only affects SMART test state
+
+### New Routes (Stage 7)
+
+| Method | Path | Description |
+|--------|------|-------------|
+| `POST` | `/api/v1/drives/{id}/reset` | Reset SMART state and attrs for a drive |
+| `POST` | `/api/v1/settings/test-ssh` | Test SSH connection with current SSH settings |
+| `GET`  | `/api/v1/updates/check` | Check for the latest release on the Forgejo instance at git.hellocomputer.xyz |
+
+### Check for Updates
+
+Settings page has a "Check for Updates" button that fetches:
+```
+GET https://git.hellocomputer.xyz/api/v1/repos/brandon/truenas-burnin/releases/latest
+```
+Compares tag name against `settings.app_version`; shows "up to date" or "v{tag} available".
+
+### Version Badge
+
+`app_version` set as Jinja2 global in `renderer.py`:
+```python
+templates.env.globals["app_version"] = _settings.app_version
+```
+Displayed in header as `<span class="header-version">v{app_version}</span>` (right side, muted).
+
+### Configurable Thresholds
+
+`renderer.py` `_temp_class` now reads from settings instead of hardcoded values:
+```python
+if temp >= settings.temp_crit_c:  return "temp-crit"
+if temp >= settings.temp_warn_c:  return "temp-warn"
+```
+`precheck` stage fails if `temperature_c >= settings.temp_crit_c`.
+
+Surface validate fails if `bad_blocks > settings.bad_block_threshold` (default 0 = any bad sector = fail).
+
+### Cutting to Real TrueNAS (Next Steps)
 
 When ready to test against a real TrueNAS CORE box:
 
-1. In `.env` on maple.local, set:
-   ```env
-   TRUENAS_BASE_URL=https://10.0.0.203   # or whatever your TrueNAS IP is
-   TRUENAS_API_KEY=your-real-key-here
-   TRUENAS_VERIFY_TLS=false              # unless you have a valid cert
-   ```
-2. Comment out `mock-truenas` service in `docker-compose.yml` (or leave it running — harmless)
-3. Verify TrueNAS CORE v2.0 API contract matches what `truenas.py` expects:
+1. In Settings (or `.env`), set:
+   - **TrueNAS URL** → `https://10.0.0.X` (real IP)
+   - **API Key** → real API key
+   - **SSH Host** → same IP as TrueNAS
+   - **SSH User** → `root` (or sudoer with smartctl/badblocks access)
+   - **SSH Key** → paste PEM key into textarea
+2. Click **Test SSH Connection** to verify before starting a burn-in
+3. TrueNAS CORE uses `ada0`, `da0` device names (not `sda`). Mock drive names will differ.
+4. Delete `app.db` before first real poll to clear mock drive rows
+5. Comment out `mock-truenas` service in `docker-compose.yml` (optional — harmless to leave)
+6. Verify TrueNAS CORE v2.0 REST API:
    - `GET /api/v2.0/disk` returns list with `name`, `serial`, `model`, `size`, `temperature`
    - `GET /api/v2.0/core/get_jobs` with filter `[["method","=","smart.test"]]`
    - `POST /api/v2.0/smart/test` accepts `{disks: [devname], type: "SHORT"|"LONG"}`
-4. Check that disk names match expected format (TrueNAS CORE uses `ada0`, `da0`, etc. — not `sda`)
-   - You may need to update mock drive names back or adjust poller logic
-5. Delete `app.db` to clear mock drive rows before first real poll
 
 ---
 
diff --git a/app/config.py b/app/config.py
index 5c9b10a..7c1f3cf 100644
--- a/app/config.py
+++ b/app/config.py
@@ -68,7 +68,7 @@ class Settings(BaseSettings):
     ssh_key: str = ""             # PEM private key content (paste full key including headers)
 
     # Application version — used by the /api/v1/updates/check endpoint
-    app_version: str = "1.0.0-6d"
+    app_version: str = "1.0.0-7"
 
 
 settings = Settings()