mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
refactor: reusable utility for running nvidia-smi
This commit is contained in:
parent
c16cfc3a93
commit
921eef30d6
|
|
@ -278,45 +278,12 @@ export class BenchmarkService {
|
|||
if ('nvidia' in runtimes) {
|
||||
logger.info('[BenchmarkService] NVIDIA container runtime detected, querying GPU model via nvidia-smi')
|
||||
|
||||
// Try to get GPU model name from the running Ollama container
|
||||
try {
|
||||
const containers = await this.dockerService.docker.listContainers({ all: false })
|
||||
const ollamaContainer = containers.find((c) =>
|
||||
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
|
||||
)
|
||||
|
||||
if (ollamaContainer) {
|
||||
const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
|
||||
const exec = await container.exec({
|
||||
Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
|
||||
AttachStdout: true,
|
||||
AttachStderr: true,
|
||||
Tty: true,
|
||||
})
|
||||
|
||||
const stream = await exec.start({ Tty: true })
|
||||
const output = await new Promise<string>((resolve) => {
|
||||
let data = ''
|
||||
const timeout = setTimeout(() => resolve(data), 5000)
|
||||
stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
|
||||
stream.on('end', () => { clearTimeout(timeout); resolve(data) })
|
||||
})
|
||||
|
||||
const gpuName = output.replace(/[\x00-\x08]/g, '').trim()
|
||||
if (gpuName && !gpuName.toLowerCase().includes('error') && !gpuName.toLowerCase().includes('not found')) {
|
||||
gpuModel = gpuName
|
||||
logger.info(`[BenchmarkService] GPU detected via nvidia-smi: ${gpuModel}`)
|
||||
} else {
|
||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
||||
logger.info('[BenchmarkService] NVIDIA runtime present but nvidia-smi query failed, using generic name')
|
||||
}
|
||||
} else {
|
||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
||||
logger.info('[BenchmarkService] NVIDIA runtime present but Ollama container not running')
|
||||
}
|
||||
} catch (execError) {
|
||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
||||
logger.warn(`[BenchmarkService] nvidia-smi exec failed: ${execError.message}`)
|
||||
const systemService = new (await import('./system_service.js')).SystemService(this.dockerService)
|
||||
const nvidiaInfo = await systemService.getNvidiaSmiInfo()
|
||||
if (Array.isArray(nvidiaInfo) && nvidiaInfo.length > 0) {
|
||||
gpuModel = nvidiaInfo[0].model
|
||||
} else {
|
||||
logger.warn(`[BenchmarkService] NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||
}
|
||||
}
|
||||
} catch (dockerError) {
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ export class SystemService {
|
|||
private static appVersion: string | null = null
|
||||
private static diskInfoFile = '/storage/nomad-disk-info.json'
|
||||
|
||||
constructor(private dockerService: DockerService) {}
|
||||
constructor(private dockerService: DockerService) { }
|
||||
|
||||
async checkServiceInstalled(serviceName: string): Promise<boolean> {
|
||||
const services = await this.getServices({ installedOnly: true });
|
||||
|
|
@ -66,6 +66,66 @@ export class SystemService {
|
|||
return false
|
||||
}
|
||||
|
||||
/**
 * Queries GPU details by executing `nvidia-smi` inside the running Ollama container.
 *
 * @returns On success, an array with one `{ vendor, model, vram }` entry per non-empty
 * line printed by nvidia-smi (one line per GPU). `vram` is the `memory.total` value as
 * printed with `nounits` (MiB according to the nvidia-smi documentation), or `0` when
 * the field is missing or not numeric. On failure, one of:
 *  - `'OLLAMA_NOT_FOUND'` – no running container is named after `SERVICE_NAMES.OLLAMA`
 *  - `'BAD_RESPONSE'`     – nvidia-smi produced no output, or output that looks like an error
 *  - `{ error }`          – a Docker/exec/stream call failed with an `Error` carrying a message
 *  - `'UNKNOWN_ERROR'`    – something that is not an `Error` (or has no message) was thrown
 *
 * This method never throws; every failure is reported through the return value.
 */
async getNvidiaSmiInfo(): Promise<Array<{ vendor: string; model: string; vram: number; }> | { error: string } | 'OLLAMA_NOT_FOUND' | 'BAD_RESPONSE' | 'UNKNOWN_ERROR'> {
  try {
    const containers = await this.dockerService.docker.listContainers({ all: false })
    const ollamaContainer = containers.find((c) =>
      c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
    )
    if (!ollamaContainer) {
      logger.info('Ollama container not found for nvidia-smi info retrieval. This is expected if Ollama is not installed.')
      return 'OLLAMA_NOT_FOUND'
    }

    // Execute nvidia-smi inside the Ollama container to get GPU info
    const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
    const exec = await container.exec({
      Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
      AttachStdout: true,
      AttachStderr: true,
      Tty: true,
    })

    // Read the output stream with a timeout to prevent hanging if nvidia-smi fails
    const stream = await exec.start({ Tty: true })
    const output = await new Promise<string>((resolve, reject) => {
      let data = ''
      const timeout = setTimeout(() => {
        // nvidia-smi did not finish in time: stop reading and release the exec
        // stream instead of leaving it open, then use whatever was received.
        stream.destroy()
        resolve(data)
      }, 5000)
      stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
      stream.on('end', () => { clearTimeout(timeout); resolve(data) })
      // Without an 'error' listener a stream failure would be thrown as an unhandled
      // EventEmitter error; reject instead so the surrounding catch reports it.
      stream.on('error', (streamError: Error) => { clearTimeout(timeout); reject(streamError) })
    })

    // Remove any non-printable characters and trim the output
    const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
    if (cleaned && !cleaned.toLowerCase().includes('error') && !cleaned.toLowerCase().includes('not found')) {
      // Split by newlines to handle multiple GPUs installed
      const lines = cleaned.split('\n').filter(line => line.trim())

      // Map each line out to a useful structure for us
      const gpus = lines.map(line => {
        const parts = line.split(',').map((s) => s.trim())
        // memory.total can be non-numeric (e.g. "[N/A]"); never surface NaN as vram
        const vram = Number.parseInt(parts[1] ?? '', 10)
        return {
          vendor: 'NVIDIA',
          model: parts[0] || 'NVIDIA GPU',
          vram: Number.isFinite(vram) ? vram : 0,
        }
      })

      return gpus.length > 0 ? gpus : 'BAD_RESPONSE'
    }

    // Either there was no output or it looks like an error message: treat it as a bad response from nvidia-smi
    return 'BAD_RESPONSE'
  } catch (error) {
    logger.error('Error getting nvidia-smi info:', error)
    if (error instanceof Error && error.message) {
      return { error: error.message }
    }
    return 'UNKNOWN_ERROR'
  }
}
|
||||
|
||||
async getServices({ installedOnly = true }: { installedOnly?: boolean }): Promise<ServiceSlim[]> {
|
||||
await this._syncContainersWithDatabase() // Sync up before fetching to ensure we have the latest status
|
||||
|
||||
|
|
@ -195,48 +255,17 @@ export class SystemService {
|
|||
if (!graphics.controllers || graphics.controllers.length === 0) {
|
||||
const runtimes = dockerInfo.Runtimes || {}
|
||||
if ('nvidia' in runtimes) {
|
||||
let gpuName = 'NVIDIA GPU'
|
||||
try {
|
||||
const containers = await this.dockerService.docker.listContainers({ all: false })
|
||||
const ollamaContainer = containers.find((c) =>
|
||||
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
|
||||
)
|
||||
if (ollamaContainer) {
|
||||
const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
|
||||
const exec = await container.exec({
|
||||
Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
|
||||
AttachStdout: true,
|
||||
AttachStderr: true,
|
||||
Tty: true,
|
||||
})
|
||||
const stream = await exec.start({ Tty: true })
|
||||
const output = await new Promise<string>((resolve) => {
|
||||
let data = ''
|
||||
const timeout = setTimeout(() => resolve(data), 5000)
|
||||
stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
|
||||
stream.on('end', () => { clearTimeout(timeout); resolve(data) })
|
||||
})
|
||||
const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
|
||||
if (cleaned && !cleaned.toLowerCase().includes('error')) {
|
||||
const parts = cleaned.split(',').map((s) => s.trim())
|
||||
gpuName = parts[0] || gpuName
|
||||
const vramMB = parts[1] ? parseInt(parts[1], 10) : 0
|
||||
graphics.controllers = [{
|
||||
vendor: 'NVIDIA',
|
||||
model: gpuName,
|
||||
vram: vramMB || null,
|
||||
} as any]
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// nvidia-smi failed, use generic entry
|
||||
}
|
||||
if (graphics.controllers.length === 0) {
|
||||
graphics.controllers = [{
|
||||
vendor: 'NVIDIA',
|
||||
model: gpuName,
|
||||
vram: null,
|
||||
} as any]
|
||||
const nvidiaInfo = await this.getNvidiaSmiInfo()
|
||||
if (Array.isArray(nvidiaInfo)) {
|
||||
graphics.controllers = nvidiaInfo.map((gpu) => ({
|
||||
model: gpu.model,
|
||||
vendor: gpu.vendor,
|
||||
bus: "",
|
||||
vram: gpu.vram,
|
||||
vramDynamic: false, // assume false here, we don't actually use this field for our purposes.
|
||||
}))
|
||||
} else {
|
||||
logger.warn(`NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -336,7 +365,7 @@ export class SystemService {
|
|||
message: 'Successfully subscribed to release notes',
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return {
|
||||
success: false,
|
||||
message: `Failed to subscribe: ${response.statusText}`,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user