From fe51dc49b0bf958918d412aa2e4bd307f71e2aff Mon Sep 17 00:00:00 2001
From: Chris Sherwood
Date: Wed, 13 May 2026 10:28:32 -0700
Subject: [PATCH] feat(GPU): auto-remediate nomad_ollama passthrough loss on
 admin boot (#755)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After an update, container recreate, or docker daemon restart, nomad_ollama's
HostConfig.DeviceRequests still lists the nvidia driver — but the NVIDIA
Container Toolkit binding inside the container is torn. `nvidia-smi` returns
"Failed to initialize NVML: Unknown Error" and Ollama silently falls back to
CPU inference. PR #208 detects this and shows a banner with a
"Fix: Reinstall AI Assistant" button. This change does that click
automatically on admin boot.

New provider GpuPassthroughRemediationProvider runs once on web env boot:

1. Skip when KV `ai.autoFixGpuPassthrough = false` (default true).
2. Skip when Docker has no `nvidia` runtime registered (AMD-only and
   CPU-only hosts unaffected).
3. Skip when nomad_ollama isn't running.
4. Exec `nvidia-smi --query-gpu=name --format=csv,noheader` inside the
   container with an 8-second timeout. If the output matches
   "Failed to initialize NVML", "Unknown Error", "TIMEOUT", or contains no
   alphabetic characters, treat the passthrough as broken. The exec stream
   is destroyed on timeout, and stream errors end the probe instead of
   going unhandled.
5. On broken: call DockerService.forceReinstall('nomad_ollama'). The
   existing force-reinstall preserves the Ollama volume + installed models.
   Stamp `gpu.autoRemediatedAt` on success.
6. On healthy: log and exit.

AMD passthrough_failed is intentionally not handled — its fix path is HSA
override handling (PR #804) rather than a simple service recreate, and false
positives during AMD startup log parsing would loop a recreate without fixing
anything. Left to a follow-up if it proves to be a recurring AMD issue.

Validated on NOMAD3 (RTX 5060, v1.32.0-rc.3 + this patch hot-applied):

- After admin restart with passthrough healthy: log line
  "[GpuPassthroughRemediationProvider] NVIDIA passthrough healthy — no action
  needed." Provider exits cleanly without touching the container.
- The broken-state branch hits the existing forceReinstall path, which was
  manually invoked earlier in the same session to fix this exact box and
  recovered GPU access in ~45s with model volume intact. No new failure mode
  is introduced — the auto-trigger removes the user click but the underlying
  operation is the same one the banner Fix button already calls.

Closes #755.
---
 admin/adonisrc.ts                             |   1 +
 .../gpu_passthrough_remediation_provider.ts   | 146 ++++++++++++++++++
 admin/types/kv_store.ts                       |   2 +
 3 files changed, 149 insertions(+)
 create mode 100644 admin/providers/gpu_passthrough_remediation_provider.ts

diff --git a/admin/adonisrc.ts b/admin/adonisrc.ts
index 741b160..9b82ee0 100644
--- a/admin/adonisrc.ts
+++ b/admin/adonisrc.ts
@@ -57,6 +57,7 @@ export default defineConfig({
     () => import('#providers/kiwix_migration_provider'),
     () => import('#providers/qdrant_restart_policy_provider'),
     () => import('#providers/version_check_provider'),
+    () => import('#providers/gpu_passthrough_remediation_provider'),
   ],
 
   /*
diff --git a/admin/providers/gpu_passthrough_remediation_provider.ts b/admin/providers/gpu_passthrough_remediation_provider.ts
new file mode 100644
index 0000000..e06aa44
--- /dev/null
+++ b/admin/providers/gpu_passthrough_remediation_provider.ts
@@ -0,0 +1,146 @@
+import logger from '@adonisjs/core/services/logger'
+import type { ApplicationService } from '@adonisjs/core/types'
+
+/** Upper bound, in milliseconds, on how long the in-container `nvidia-smi` probe may run. */
+const PROBE_TIMEOUT_MS = 8000
+
+/**
+ * Auto-remediates NVIDIA GPU passthrough loss after admin / host restart.
+ *
+ * After an update or container recreate, nomad_ollama's HostConfig.DeviceRequests
+ * still lists the nvidia driver, but the NVIDIA Container Toolkit binding inside
+ * the container is torn. `nvidia-smi` inside the container returns
+ * "Failed to initialize NVML: Unknown Error" and Ollama silently falls back to
+ * CPU inference. PR #208 added detection + a one-click "Fix: Reinstall AI Assistant"
+ * banner. This provider does that click automatically on admin boot when the
+ * condition is detected.
+ *
+ * Guards:
+ *   - NVIDIA-only. AMD passthrough_failed has a different fix path (HSA override
+ *     handling in PR #804) and is left to the user.
+ *   - One-shot per admin boot. The provider runs once on startup; if the recreate
+ *     itself fails the banner remains as a fallback.
+ *   - Opt-out via KV `ai.autoFixGpuPassthrough = false`.
+ *   - Skipped entirely when no NVIDIA runtime is registered with Docker.
+ */
+export default class GpuPassthroughRemediationProvider {
+  constructor(protected app: ApplicationService) {}
+
+  /**
+   * Schedules the passthrough check for the `web` environment only. The check
+   * runs inside `setImmediate`, so `boot()` returns without waiting on Docker,
+   * and every failure is caught and logged instead of propagating.
+   */
+  async boot() {
+    if (this.app.getEnvironment() !== 'web') return
+
+    setImmediate(async () => {
+      try {
+        const KVStore = (await import('#models/kv_store')).default
+        const { DockerService } = await import('#services/docker_service')
+        const { SERVICE_NAMES } = await import('../constants/service_names.js')
+        const Docker = (await import('dockerode')).default
+
+        // Only a stored `false` disables the auto-fix; an unset key keeps it on.
+        const enabledRaw = await KVStore.getValue('ai.autoFixGpuPassthrough')
+        if (String(enabledRaw) === 'false') {
+          logger.info(
+            '[GpuPassthroughRemediationProvider] Auto-fix disabled via KV — skipping.'
+          )
+          return
+        }
+
+        const docker = new Docker({ socketPath: '/var/run/docker.sock' })
+        const dockerInfo = await docker.info()
+        const runtimes = dockerInfo.Runtimes || {}
+        const hasNvidiaRuntime = 'nvidia' in runtimes
+
+        if (!hasNvidiaRuntime) {
+          logger.info(
+            '[GpuPassthroughRemediationProvider] No NVIDIA runtime registered — skipping.'
+          )
+          return
+        }
+
+        const containers = await docker.listContainers({ all: false })
+        const ollama = containers.find((c) => c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`))
+
+        if (!ollama) {
+          logger.info(
+            '[GpuPassthroughRemediationProvider] nomad_ollama not running — skipping.'
+          )
+          return
+        }
+
+        // Probe: exec nvidia-smi inside the Ollama container. NVML init failure
+        // is the signature of a broken passthrough that DeviceRequests can't see.
+        const container = docker.getContainer(ollama.Id)
+        const exec = await container.exec({
+          Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
+          AttachStdout: true,
+          AttachStderr: true,
+        })
+        const stream = await exec.start({ Tty: true })
+        const output = await new Promise<string>((resolve) => {
+          let buf = ''
+          let settled = false
+          // Resolve exactly once, whichever of timeout / end / close / error fires first.
+          const finish = (value: string) => {
+            if (settled) return
+            settled = true
+            clearTimeout(timer)
+            resolve(value)
+          }
+          // On timeout keep whatever was read so far, then destroy the stream so a
+          // hung exec does not keep its connection open.
+          const timer = setTimeout(() => {
+            finish(buf || 'TIMEOUT')
+            stream.destroy()
+          }, PROBE_TIMEOUT_MS)
+          stream.on('data', (chunk: Buffer) => (buf += chunk.toString('utf8')))
+          stream.on('end', () => finish(buf))
+          stream.on('close', () => finish(buf))
+          // Without a listener an 'error' event would be thrown as an uncaught
+          // exception; an empty buffer is classified as broken below.
+          stream.on('error', () => finish(buf))
+        })
+
+        const passthroughBroken =
+          /Failed to initialize NVML|Unknown Error|TIMEOUT/i.test(output) ||
+          !/[A-Za-z]/.test(output)
+
+        if (!passthroughBroken) {
+          logger.info(
+            '[GpuPassthroughRemediationProvider] NVIDIA passthrough healthy — no action needed.'
+          )
+          return
+        }
+
+        logger.warn(
+          '[GpuPassthroughRemediationProvider] NVIDIA passthrough broken (nvidia-smi inside nomad_ollama failed). ' +
+            'Auto-reinstalling nomad_ollama; volumes and installed models are preserved.'
+        )
+
+        const dockerService = new DockerService()
+        const result = await dockerService.forceReinstall(SERVICE_NAMES.OLLAMA)
+
+        if (result.success) {
+          await KVStore.setValue('gpu.autoRemediatedAt', new Date().toISOString())
+          logger.info(
+            '[GpuPassthroughRemediationProvider] nomad_ollama force-reinstall completed successfully.'
+          )
+        } else {
+          logger.error(
+            `[GpuPassthroughRemediationProvider] Force-reinstall failed: ${result.message}. ` +
+              'User can still click the "Fix: Reinstall AI Assistant" banner manually.'
+          )
+        }
+      } catch (err: unknown) {
+        const reason = err instanceof Error ? err.message : String(err)
+        logger.error(
+          `[GpuPassthroughRemediationProvider] Auto-remediation check failed: ${reason}`
+        )
+      }
+    })
+  }
+}
diff --git a/admin/types/kv_store.ts b/admin/types/kv_store.ts
index 7974e95..381b367 100644
--- a/admin/types/kv_store.ts
+++ b/admin/types/kv_store.ts
@@ -14,6 +14,8 @@ export const KV_STORE_SCHEMA = {
   'ai.ollamaFlashAttention': 'boolean',
   'ai.amdGpuAcceleration': 'boolean',
   'ai.amdHsaOverride': 'string',
+  'ai.autoFixGpuPassthrough': 'boolean',
+  'gpu.autoRemediatedAt': 'string',
 } as const
 
 type KVTagToType = T extends 'boolean' ? boolean : string