mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-04-03 07:19:27 +02:00
refactor: reusable utility for running nvidia-smi
This commit is contained in:
parent
c16cfc3a93
commit
921eef30d6
|
|
@ -278,45 +278,12 @@ export class BenchmarkService {
|
||||||
if ('nvidia' in runtimes) {
|
if ('nvidia' in runtimes) {
|
||||||
logger.info('[BenchmarkService] NVIDIA container runtime detected, querying GPU model via nvidia-smi')
|
logger.info('[BenchmarkService] NVIDIA container runtime detected, querying GPU model via nvidia-smi')
|
||||||
|
|
||||||
// Try to get GPU model name from the running Ollama container
|
const systemService = new (await import('./system_service.js')).SystemService(this.dockerService)
|
||||||
try {
|
const nvidiaInfo = await systemService.getNvidiaSmiInfo()
|
||||||
const containers = await this.dockerService.docker.listContainers({ all: false })
|
if (Array.isArray(nvidiaInfo) && nvidiaInfo.length > 0) {
|
||||||
const ollamaContainer = containers.find((c) =>
|
gpuModel = nvidiaInfo[0].model
|
||||||
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
|
} else {
|
||||||
)
|
logger.warn(`[BenchmarkService] NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||||
|
|
||||||
if (ollamaContainer) {
|
|
||||||
const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
|
|
||||||
const exec = await container.exec({
|
|
||||||
Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
|
|
||||||
AttachStdout: true,
|
|
||||||
AttachStderr: true,
|
|
||||||
Tty: true,
|
|
||||||
})
|
|
||||||
|
|
||||||
const stream = await exec.start({ Tty: true })
|
|
||||||
const output = await new Promise<string>((resolve) => {
|
|
||||||
let data = ''
|
|
||||||
const timeout = setTimeout(() => resolve(data), 5000)
|
|
||||||
stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
|
|
||||||
stream.on('end', () => { clearTimeout(timeout); resolve(data) })
|
|
||||||
})
|
|
||||||
|
|
||||||
const gpuName = output.replace(/[\x00-\x08]/g, '').trim()
|
|
||||||
if (gpuName && !gpuName.toLowerCase().includes('error') && !gpuName.toLowerCase().includes('not found')) {
|
|
||||||
gpuModel = gpuName
|
|
||||||
logger.info(`[BenchmarkService] GPU detected via nvidia-smi: ${gpuModel}`)
|
|
||||||
} else {
|
|
||||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
|
||||||
logger.info('[BenchmarkService] NVIDIA runtime present but nvidia-smi query failed, using generic name')
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
|
||||||
logger.info('[BenchmarkService] NVIDIA runtime present but Ollama container not running')
|
|
||||||
}
|
|
||||||
} catch (execError) {
|
|
||||||
gpuModel = 'NVIDIA GPU (model unknown)'
|
|
||||||
logger.warn(`[BenchmarkService] nvidia-smi exec failed: ${execError.message}`)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (dockerError) {
|
} catch (dockerError) {
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ export class SystemService {
|
||||||
private static appVersion: string | null = null
|
private static appVersion: string | null = null
|
||||||
private static diskInfoFile = '/storage/nomad-disk-info.json'
|
private static diskInfoFile = '/storage/nomad-disk-info.json'
|
||||||
|
|
||||||
constructor(private dockerService: DockerService) {}
|
constructor(private dockerService: DockerService) { }
|
||||||
|
|
||||||
async checkServiceInstalled(serviceName: string): Promise<boolean> {
|
async checkServiceInstalled(serviceName: string): Promise<boolean> {
|
||||||
const services = await this.getServices({ installedOnly: true });
|
const services = await this.getServices({ installedOnly: true });
|
||||||
|
|
@ -66,6 +66,66 @@ export class SystemService {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Runs `nvidia-smi` inside the running Ollama container and parses its CSV output.
 *
 * @returns One entry per GPU line reported by nvidia-smi (`vram` is the `memory.total`
 *   value as printed with `nounits`, or 0 when that field is missing or not numeric), or:
 *   - `'OLLAMA_NOT_FOUND'` when no running container is named after `SERVICE_NAMES.OLLAMA`
 *   - `'BAD_RESPONSE'` when the output is empty or looks like an error message
 *   - `{ error }` when an `Error` with a message was thrown while talking to Docker
 *   - `'UNKNOWN_ERROR'` for any other thrown value
 */
async getNvidiaSmiInfo(): Promise<Array<{ vendor: string; model: string; vram: number; }> | { error: string } | 'OLLAMA_NOT_FOUND' | 'BAD_RESPONSE' | 'UNKNOWN_ERROR'> {
  try {
    const containers = await this.dockerService.docker.listContainers({ all: false })
    const ollamaContainer = containers.find((c) =>
      c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
    )
    if (!ollamaContainer) {
      logger.info('Ollama container not found for nvidia-smi info retrieval. This is expected if Ollama is not installed.')
      return 'OLLAMA_NOT_FOUND'
    }

    // Execute nvidia-smi inside the Ollama container to get GPU info
    const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
    const exec = await container.exec({
      Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
      AttachStdout: true,
      AttachStderr: true,
      Tty: true,
    })

    // Read the output stream with a timeout to prevent hanging if nvidia-smi fails
    const stream = await exec.start({ Tty: true })
    const output = await new Promise<string>((resolve, reject) => {
      let data = ''
      let settled = false
      let timeout: ReturnType<typeof setTimeout> | undefined

      // Resolve exactly once with whatever has been collected so far.
      const finish = () => {
        if (settled) return
        settled = true
        if (timeout !== undefined) clearTimeout(timeout)
        resolve(data)
      }

      timeout = setTimeout(() => {
        finish()
        // Release the underlying connection instead of leaving a stalled stream open.
        stream.destroy()
      }, 5000)

      stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
      stream.on('end', finish)
      // A stream may close without emitting 'end'; don't wait for the timeout in that case.
      stream.on('close', finish)
      // Without an 'error' listener a stream error is thrown as an uncaught exception;
      // reject instead so the surrounding catch turns it into an error result.
      stream.on('error', (err: Error) => {
        if (settled) return
        settled = true
        if (timeout !== undefined) clearTimeout(timeout)
        reject(err)
      })
    })

    // Remove any non-printable characters and trim the output
    const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
    if (cleaned && !cleaned.toLowerCase().includes('error') && !cleaned.toLowerCase().includes('not found')) {
      // Split by newlines to handle multiple GPUs installed
      // (any trailing '\r' left on a line is removed by the per-field trim below)
      const lines = cleaned.split('\n').filter(line => line.trim())

      // Map each line out to a useful structure for us
      const gpus = lines.map(line => {
        const parts = line.split(',').map((s) => s.trim())
        // The memory field may be non-numeric (e.g. "[N/A]"); parseInt would yield NaN,
        // so fall back to 0 to keep `vram` a real number.
        const parsedVram = parseInt(parts[1] || '', 10)
        return {
          vendor: 'NVIDIA',
          model: parts[0] || 'NVIDIA GPU',
          vram: Number.isFinite(parsedVram) ? parsedVram : 0,
        }
      })

      return gpus.length > 0 ? gpus : 'BAD_RESPONSE'
    }

    // If we got output but looks like an error, consider it a bad response from nvidia-smi
    return 'BAD_RESPONSE'
  }
  catch (error) {
    logger.error('Error getting nvidia-smi info:', error)
    if (error instanceof Error && error.message) {
      return { error: error.message }
    }
    return 'UNKNOWN_ERROR'
  }
}
|
||||||
|
|
||||||
async getServices({ installedOnly = true }: { installedOnly?: boolean }): Promise<ServiceSlim[]> {
|
async getServices({ installedOnly = true }: { installedOnly?: boolean }): Promise<ServiceSlim[]> {
|
||||||
await this._syncContainersWithDatabase() // Sync up before fetching to ensure we have the latest status
|
await this._syncContainersWithDatabase() // Sync up before fetching to ensure we have the latest status
|
||||||
|
|
||||||
|
|
@ -195,48 +255,17 @@ export class SystemService {
|
||||||
if (!graphics.controllers || graphics.controllers.length === 0) {
|
if (!graphics.controllers || graphics.controllers.length === 0) {
|
||||||
const runtimes = dockerInfo.Runtimes || {}
|
const runtimes = dockerInfo.Runtimes || {}
|
||||||
if ('nvidia' in runtimes) {
|
if ('nvidia' in runtimes) {
|
||||||
let gpuName = 'NVIDIA GPU'
|
const nvidiaInfo = await this.getNvidiaSmiInfo()
|
||||||
try {
|
if (Array.isArray(nvidiaInfo)) {
|
||||||
const containers = await this.dockerService.docker.listContainers({ all: false })
|
graphics.controllers = nvidiaInfo.map((gpu) => ({
|
||||||
const ollamaContainer = containers.find((c) =>
|
model: gpu.model,
|
||||||
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
|
vendor: gpu.vendor,
|
||||||
)
|
bus: "",
|
||||||
if (ollamaContainer) {
|
vram: gpu.vram,
|
||||||
const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
|
vramDynamic: false, // assume false here, we don't actually use this field for our purposes.
|
||||||
const exec = await container.exec({
|
}))
|
||||||
Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
|
} else {
|
||||||
AttachStdout: true,
|
logger.warn(`NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||||
AttachStderr: true,
|
|
||||||
Tty: true,
|
|
||||||
})
|
|
||||||
const stream = await exec.start({ Tty: true })
|
|
||||||
const output = await new Promise<string>((resolve) => {
|
|
||||||
let data = ''
|
|
||||||
const timeout = setTimeout(() => resolve(data), 5000)
|
|
||||||
stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
|
|
||||||
stream.on('end', () => { clearTimeout(timeout); resolve(data) })
|
|
||||||
})
|
|
||||||
const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
|
|
||||||
if (cleaned && !cleaned.toLowerCase().includes('error')) {
|
|
||||||
const parts = cleaned.split(',').map((s) => s.trim())
|
|
||||||
gpuName = parts[0] || gpuName
|
|
||||||
const vramMB = parts[1] ? parseInt(parts[1], 10) : 0
|
|
||||||
graphics.controllers = [{
|
|
||||||
vendor: 'NVIDIA',
|
|
||||||
model: gpuName,
|
|
||||||
vram: vramMB || null,
|
|
||||||
} as any]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// nvidia-smi failed, use generic entry
|
|
||||||
}
|
|
||||||
if (graphics.controllers.length === 0) {
|
|
||||||
graphics.controllers = [{
|
|
||||||
vendor: 'NVIDIA',
|
|
||||||
model: gpuName,
|
|
||||||
vram: null,
|
|
||||||
} as any]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -336,7 +365,7 @@ export class SystemService {
|
||||||
message: 'Successfully subscribed to release notes',
|
message: 'Successfully subscribed to release notes',
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
message: `Failed to subscribe: ${response.statusText}`,
|
message: `Failed to subscribe: ${response.statusText}`,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user