mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-27 14:48:26 +02:00
feat(GPU): auto-remediate nomad_ollama passthrough loss on admin boot (#755)
After an update, container recreate, or docker daemon restart, nomad_ollama's HostConfig.DeviceRequests still lists the nvidia driver — but the NVIDIA Container Toolkit binding inside the container is torn. `nvidia-smi` returns "Failed to initialize NVML: Unknown Error" and Ollama silently falls back to CPU inference. PR #208 detects this and shows a banner with a "Fix: Reinstall AI Assistant" button. This change does that click automatically on admin boot. New provider GpuPassthroughRemediationProvider runs once on web env boot: 1. Skip when KV `ai.autoFixGpuPassthrough = false` (default true). 2. Skip when Docker has no `nvidia` runtime registered (AMD-only and CPU-only hosts unaffected). 3. Skip when nomad_ollama isn't running. 4. Exec `nvidia-smi --query-gpu=name --format=csv,noheader` inside the container with an 8-second timeout. If the output matches "Failed to initialize NVML", "Unknown Error", "TIMEOUT", or contains no alphabetic characters, treat the passthrough as broken. 5. On broken: call DockerService.forceReinstall('nomad_ollama'). The existing force-reinstall preserves the Ollama volume + installed models. Stamp `gpu.autoRemediatedAt` on success. 6. On healthy: log and exit. AMD passthrough_failed is intentionally not handled — its fix path is HSA override handling (PR #804) rather than a simple service recreate, and false positives during AMD startup log parsing would loop a recreate without fixing anything. Left to a follow-up if it proves to be a recurring AMD issue. Validated on NOMAD3 (RTX 5060, v1.32.0-rc.3 + this patch hot-applied): - After admin restart with passthrough healthy: log line "[GpuPassthroughRemediationProvider] NVIDIA passthrough healthy — no action needed." Provider exits cleanly without touching the container. - The broken-state branch hits the existing forceReinstall path, which was manually invoked earlier in the same session to fix this exact box and recovered GPU access in ~45s with model volume intact. 
No new failure mode is introduced — the auto-trigger removes the user click but the underlying operation is the same one the banner Fix button already calls. Closes #755.
This commit is contained in:
parent
ba661a9da1
commit
fe51dc49b0
|
|
@ -57,6 +57,7 @@ export default defineConfig({
|
|||
() => import('#providers/kiwix_migration_provider'),
|
||||
() => import('#providers/qdrant_restart_policy_provider'),
|
||||
() => import('#providers/version_check_provider'),
|
||||
() => import('#providers/gpu_passthrough_remediation_provider'),
|
||||
],
|
||||
|
||||
/*
|
||||
|
|
|
|||
122
admin/providers/gpu_passthrough_remediation_provider.ts
Normal file
122
admin/providers/gpu_passthrough_remediation_provider.ts
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
import logger from '@adonisjs/core/services/logger'
|
||||
import type { ApplicationService } from '@adonisjs/core/types'
|
||||
|
||||
/**
 * Auto-remediates NVIDIA GPU passthrough loss after admin / host restart.
 *
 * After an update or container recreate, nomad_ollama's HostConfig.DeviceRequests
 * still lists the nvidia driver, but the NVIDIA Container Toolkit binding inside
 * the container is torn. `nvidia-smi` inside the container returns
 * "Failed to initialize NVML: Unknown Error" and Ollama silently falls back to
 * CPU inference. PR #208 added detection + a one-click "Fix: Reinstall AI Assistant"
 * banner. This provider does that click automatically on admin boot when the
 * condition is detected.
 *
 * Guards:
 * - NVIDIA-only. AMD passthrough_failed has a different fix path (HSA override
 *   handling in PR #804) and is left to the user.
 * - One-shot per admin boot. The provider runs once on startup; if the recreate
 *   itself fails the banner remains as a fallback.
 * - Opt-out via KV `ai.autoFixGpuPassthrough = false`.
 * - Skipped entirely when no NVIDIA runtime is registered with Docker.
 *
 * Failure handling: every error raised by the check (including errors emitted by
 * the exec output stream) is logged and swallowed, so a failed probe never
 * interrupts admin startup.
 */
export default class GpuPassthroughRemediationProvider {
  /** Maximum time to wait for `nvidia-smi` inside the container, in milliseconds. */
  private static readonly PROBE_TIMEOUT_MS = 8000

  /** Probe output signatures that mark the passthrough as broken. */
  private static readonly BROKEN_OUTPUT_PATTERN = /Failed to initialize NVML|Unknown Error|TIMEOUT/i

  constructor(protected app: ApplicationService) {}

  /**
   * Schedules the one-shot passthrough check. Only runs in the `web`
   * environment; returns immediately and performs the check on a later
   * event-loop turn.
   */
  async boot() {
    if (this.app.getEnvironment() !== 'web') return

    setImmediate(() => {
      // remediate() catches and logs all of its own errors, so it never rejects.
      void this.remediate()
    })
  }

  /**
   * Decides from the probe output whether the passthrough is broken: either a
   * known failure signature is present, or the output contains no alphabetic
   * characters at all (a healthy probe prints a GPU name).
   */
  private static isPassthroughBroken(output: string): boolean {
    return (
      GpuPassthroughRemediationProvider.BROKEN_OUTPUT_PATTERN.test(output) ||
      !/[A-Za-z]/.test(output)
    )
  }

  /**
   * Runs the guards, probes `nvidia-smi` inside the running Ollama container and
   * force-reinstalls the service when the probe reports a broken passthrough.
   * Stamps `gpu.autoRemediatedAt` in the KV store after a successful reinstall.
   */
  private async remediate(): Promise<void> {
    try {
      const KVStore = (await import('#models/kv_store')).default
      const { DockerService } = await import('#services/docker_service')
      const { SERVICE_NAMES } = await import('../constants/service_names.js')
      const Docker = (await import('dockerode')).default

      // Only an explicit `false` disables the auto-fix; an unset key leaves it on.
      const enabledRaw = await KVStore.getValue('ai.autoFixGpuPassthrough')
      if (String(enabledRaw) === 'false') {
        logger.info('[GpuPassthroughRemediationProvider] Auto-fix disabled via KV — skipping.')
        return
      }

      const docker = new Docker({ socketPath: '/var/run/docker.sock' })
      const dockerInfo = await docker.info()
      const runtimes = dockerInfo.Runtimes || {}
      const hasNvidiaRuntime = 'nvidia' in runtimes

      if (!hasNvidiaRuntime) {
        logger.info('[GpuPassthroughRemediationProvider] No NVIDIA runtime registered — skipping.')
        return
      }

      // `all: false` restricts the listing to running containers.
      const containers = await docker.listContainers({ all: false })
      const ollama = containers.find((c) => c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`))

      if (!ollama) {
        logger.info('[GpuPassthroughRemediationProvider] nomad_ollama not running — skipping.')
        return
      }

      // Probe: exec nvidia-smi inside the Ollama container. NVML init failure
      // is the signature of a broken passthrough that DeviceRequests can't see.
      const container = docker.getContainer(ollama.Id)
      const exec = await container.exec({
        Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
        AttachStdout: true,
        AttachStderr: true,
        // Keep create and start in agreement on the TTY setting so the stream
        // is read as raw text rather than frame-multiplexed stdout/stderr.
        Tty: true,
      })
      const stream = await exec.start({ Tty: true })

      const output = await new Promise<string>((resolve, reject) => {
        let buf = ''
        const timer = setTimeout(() => {
          // Release the exec connection instead of leaving a hung probe attached.
          stream.destroy()
          resolve(buf || 'TIMEOUT')
        }, GpuPassthroughRemediationProvider.PROBE_TIMEOUT_MS)

        stream.on('data', (chunk: Buffer) => {
          buf += chunk.toString('utf8')
        })
        stream.on('end', () => {
          clearTimeout(timer)
          resolve(buf)
        })
        // Without a listener an 'error' event becomes an uncaught exception that
        // bypasses the surrounding try/catch; route it into the rejection path.
        stream.on('error', (streamErr: Error) => {
          clearTimeout(timer)
          reject(streamErr)
        })
      })

      if (!GpuPassthroughRemediationProvider.isPassthroughBroken(output)) {
        logger.info(
          '[GpuPassthroughRemediationProvider] NVIDIA passthrough healthy — no action needed.'
        )
        return
      }

      logger.warn(
        '[GpuPassthroughRemediationProvider] NVIDIA passthrough broken (nvidia-smi inside nomad_ollama failed). ' +
          'Auto-reinstalling nomad_ollama; volumes and installed models are preserved.'
      )

      const dockerService = new DockerService()
      const result = await dockerService.forceReinstall(SERVICE_NAMES.OLLAMA)

      if (result.success) {
        await KVStore.setValue('gpu.autoRemediatedAt', new Date().toISOString())
        logger.info(
          '[GpuPassthroughRemediationProvider] nomad_ollama force-reinstall completed successfully.'
        )
      } else {
        logger.error(
          `[GpuPassthroughRemediationProvider] Force-reinstall failed: ${result.message}. ` +
            'User can still click the "Fix: Reinstall AI Assistant" banner manually.'
        )
      }
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err)
      logger.error(
        `[GpuPassthroughRemediationProvider] Auto-remediation check failed: ${reason}`
      )
    }
  }
}
|
||||
|
|
@ -14,6 +14,8 @@ export const KV_STORE_SCHEMA = {
|
|||
'ai.ollamaFlashAttention': 'boolean',
|
||||
'ai.amdGpuAcceleration': 'boolean',
|
||||
'ai.amdHsaOverride': 'string',
|
||||
'ai.autoFixGpuPassthrough': 'boolean',
|
||||
'gpu.autoRemediatedAt': 'string',
|
||||
} as const
|
||||
|
||||
/** Maps a KV schema tag to its value type: the 'boolean' tag yields `boolean`, every other tag yields `string`. */
type KVTagToType<Tag extends string> = Tag extends 'boolean' ? boolean : string
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user