From fe51dc49b0bf958918d412aa2e4bd307f71e2aff Mon Sep 17 00:00:00 2001
From: Chris Sherwood <chris@crosstalksolutions.com>
Date: Wed, 13 May 2026 10:28:32 -0700
Subject: [PATCH] feat(GPU): auto-remediate nomad_ollama passthrough loss on
 admin boot (#755)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After an update, container recreate, or Docker daemon restart, nomad_ollama's
HostConfig.DeviceRequests still lists the nvidia driver — but the NVIDIA
Container Toolkit binding inside the container is torn. `nvidia-smi` returns
"Failed to initialize NVML: Unknown Error" and Ollama silently falls back to
CPU inference. PR #208 detects this and shows a banner with a "Fix: Reinstall
AI Assistant" button. This change does that click automatically on admin boot.

New provider GpuPassthroughRemediationProvider runs once on web env boot:

1. Skip when KV `ai.autoFixGpuPassthrough = false` (default true).
2. Skip when Docker has no `nvidia` runtime registered (AMD-only and CPU-only
   hosts unaffected).
3. Skip when nomad_ollama isn't running.
4. Exec `nvidia-smi --query-gpu=name --format=csv,noheader` inside the
   container with an 8-second timeout. If the output matches
   (case-insensitively) "Failed to initialize NVML", "Unknown Error", or
   "TIMEOUT" (the sentinel substituted when the probe times out without
   producing any output), or contains no alphabetic characters, treat the
   passthrough as broken.
5. On broken: call DockerService.forceReinstall('nomad_ollama'). The existing
   force-reinstall preserves the Ollama volume + installed models. Stamp
   `gpu.autoRemediatedAt` on success.
6. On healthy: log and exit.

AMD passthrough_failed is intentionally not handled — its fix path is HSA
override handling (PR #804) rather than a simple service recreate, and false
positives during AMD startup log parsing would loop a recreate without fixing
anything. Left to a follow-up if it proves to be a recurring AMD issue.

Validated on NOMAD3 (RTX 5060, v1.32.0-rc.3 + this patch hot-applied):

- After admin restart with passthrough healthy: log line
  "[GpuPassthroughRemediationProvider] NVIDIA passthrough healthy — no action
  needed." Provider exits cleanly without touching the container.
- The broken-state branch hits the existing forceReinstall path, which was
  manually invoked earlier in the same session to fix this exact box and
  recovered GPU access in ~45s with model volume intact. No new failure mode
  is introduced — the auto-trigger removes the user click but the underlying
  operation is the same one the banner Fix button already calls.

Closes #755.
---
 admin/adonisrc.ts                             |   1 +
 .../gpu_passthrough_remediation_provider.ts   | 122 ++++++++++++++++++
 admin/types/kv_store.ts                       |   2 +
 3 files changed, 125 insertions(+)
 create mode 100644 admin/providers/gpu_passthrough_remediation_provider.ts

diff --git a/admin/adonisrc.ts b/admin/adonisrc.ts
index 741b160..9b82ee0 100644
--- a/admin/adonisrc.ts
+++ b/admin/adonisrc.ts
@@ -57,6 +57,7 @@ export default defineConfig({
     () => import('#providers/kiwix_migration_provider'),
     () => import('#providers/qdrant_restart_policy_provider'),
     () => import('#providers/version_check_provider'),
+    () => import('#providers/gpu_passthrough_remediation_provider'),
   ],
 
   /*
diff --git a/admin/providers/gpu_passthrough_remediation_provider.ts b/admin/providers/gpu_passthrough_remediation_provider.ts
new file mode 100644
index 0000000..e06aa44
--- /dev/null
+++ b/admin/providers/gpu_passthrough_remediation_provider.ts
@@ -0,0 +1,122 @@
+import logger from '@adonisjs/core/services/logger'
+import type { ApplicationService } from '@adonisjs/core/types'
+
+/**
+ * Auto-remediates NVIDIA GPU passthrough loss after admin / host restart.
+ *
+ * After an update or container recreate, nomad_ollama's HostConfig.DeviceRequests
+ * still lists the nvidia driver, but the NVIDIA Container Toolkit binding inside
+ * the container is torn. `nvidia-smi` inside the container returns
+ * "Failed to initialize NVML: Unknown Error" and Ollama silently falls back to
+ * CPU inference. PR #208 added detection + a one-click "Fix: Reinstall AI Assistant"
+ * banner. This provider does that click automatically on admin boot when the
+ * condition is detected.
+ *
+ * Guards:
+ *   - NVIDIA-only. AMD passthrough_failed has a different fix path (HSA override
+ *     handling in PR #804) and is left to the user.
+ *   - One-shot per admin boot. The provider runs once on startup; if the recreate
+ *     itself fails the banner remains as a fallback.
+ *   - Opt-out via KV `ai.autoFixGpuPassthrough = false`.
+ *   - Skipped entirely when no NVIDIA runtime is registered with Docker.
+ */
+export default class GpuPassthroughRemediationProvider {
+  constructor(protected app: ApplicationService) {}
+
+  /** Defers the one-shot NVIDIA passthrough probe via setImmediate; web environment only. */
+  async boot() {
+    if (this.app.getEnvironment() !== 'web') return
+
+    setImmediate(async () => {
+      const tag = '[GpuPassthroughRemediationProvider]'
+      try {
+        const KVStore = (await import('#models/kv_store')).default
+        const { DockerService } = await import('#services/docker_service')
+        const { SERVICE_NAMES } = await import('../constants/service_names.js')
+        const Docker = (await import('dockerode')).default
+
+        // Opt-out: only a value that stringifies to 'false' disables the check.
+        const enabledRaw = await KVStore.getValue('ai.autoFixGpuPassthrough')
+        if (String(enabledRaw) === 'false') {
+          logger.info(`${tag} Auto-fix disabled via KV — skipping.`)
+          return
+        }
+
+        const docker = new Docker({ socketPath: '/var/run/docker.sock' })
+        const dockerInfo = await docker.info()
+        const hasNvidiaRuntime = 'nvidia' in (dockerInfo.Runtimes || {})
+
+        if (!hasNvidiaRuntime) {
+          logger.info(`${tag} No NVIDIA runtime registered — skipping.`)
+          return
+        }
+
+        const containers = await docker.listContainers({ all: false })
+        const ollama = containers.find((c) => c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`))
+
+        if (!ollama) {
+          logger.info(`${tag} nomad_ollama not running — skipping.`)
+          return
+        }
+
+        // Probe: exec nvidia-smi inside the Ollama container. NVML init failure
+        // is the signature of a broken passthrough that DeviceRequests can't see.
+        const container = docker.getContainer(ollama.Id)
+        const exec = await container.exec({
+          Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
+          AttachStdout: true,
+          AttachStderr: true,
+        })
+        const stream = await exec.start({ Tty: true })
+        const output = await new Promise<string>((resolve, reject) => {
+          let buf = ''
+          const timer = setTimeout(() => {
+            // Hung nvidia-smi: release the exec connection instead of leaking it.
+            stream.destroy()
+            resolve(buf || 'TIMEOUT')
+          }, 8000)
+          stream.on('data', (chunk: Buffer) => (buf += chunk.toString('utf8')))
+          stream.on('end', () => {
+            clearTimeout(timer)
+            resolve(buf)
+          })
+          // With no 'error' listener Node throws outside this try/catch and crashes the
+          // process. Reject so the outer catch logs it and nothing gets reinstalled.
+          stream.on('error', (err: Error) => {
+            clearTimeout(timer)
+            reject(err)
+          })
+        })
+
+        // Broken = a known failure phrase, or output containing no letters at all.
+        const passthroughBroken =
+          /Failed to initialize NVML|Unknown Error|TIMEOUT/i.test(output) ||
+          !/[A-Za-z]/.test(output)
+
+        if (!passthroughBroken) {
+          logger.info(`${tag} NVIDIA passthrough healthy — no action needed.`)
+          return
+        }
+
+        logger.warn(
+          `${tag} NVIDIA passthrough broken (nvidia-smi inside nomad_ollama failed). ` +
+            'Auto-reinstalling nomad_ollama; volumes and installed models are preserved.'
+        )
+
+        const result = await new DockerService().forceReinstall(SERVICE_NAMES.OLLAMA)
+
+        if (result.success) {
+          await KVStore.setValue('gpu.autoRemediatedAt', new Date().toISOString())
+          logger.info(`${tag} nomad_ollama force-reinstall completed successfully.`)
+        } else {
+          logger.error(
+            `${tag} Force-reinstall failed: ${result.message}. ` +
+              'User can still click the "Fix: Reinstall AI Assistant" banner manually.'
+          )
+        }
+      } catch (err: any) {
+        logger.error(`${tag} Auto-remediation check failed: ${err?.message ?? err}`)
+      }
+    })
+  }
+}
diff --git a/admin/types/kv_store.ts b/admin/types/kv_store.ts
index 7974e95..381b367 100644
--- a/admin/types/kv_store.ts
+++ b/admin/types/kv_store.ts
@@ -14,6 +14,8 @@ export const KV_STORE_SCHEMA = {
   'ai.ollamaFlashAttention':    'boolean',
   'ai.amdGpuAcceleration':      'boolean',
   'ai.amdHsaOverride':          'string',
+  'ai.autoFixGpuPassthrough':   'boolean',
+  'gpu.autoRemediatedAt':       'string',
 } as const
 
 type KVTagToType<T extends string> = T extends 'boolean' ? boolean : string