refactor: reusable utility for running nvidia-smi

2026-04-03 07:19:27 +02:00 · 2026-02-08 15:17:01 -08:00 · 2026-02-08 15:17:01 -08:00 · 921eef30d6
commit 921eef30d6
parent c16cfc3a93
2 changed files with 79 additions and 83 deletions
--- a/admin/app/services/benchmark_service.ts
+++ b/admin/app/services/benchmark_service.ts
@ -278,45 +278,12 @@ export class BenchmarkService {
          if ('nvidia' in runtimes) {
            logger.info('[BenchmarkService] NVIDIA container runtime detected, querying GPU model via nvidia-smi')
-            // Try to get GPU model name from the running Ollama container
+            const systemService = new (await import('./system_service.js')).SystemService(this.dockerService)
-            try {
+            const nvidiaInfo = await systemService.getNvidiaSmiInfo()
-              const containers = await this.dockerService.docker.listContainers({ all: false })
+            if (Array.isArray(nvidiaInfo) && nvidiaInfo.length > 0) {
-              const ollamaContainer = containers.find((c) =>
+              gpuModel = nvidiaInfo[0].model
-                c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
+            } else {
-              )
+              logger.warn(`[BenchmarkService] NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
              if (ollamaContainer) {
                const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
                const exec = await container.exec({
                  Cmd: ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
                  AttachStdout: true,
                  AttachStderr: true,
                  Tty: true,
                })
                const stream = await exec.start({ Tty: true })
                const output = await new Promise<string>((resolve) => {
                  let data = ''
                  const timeout = setTimeout(() => resolve(data), 5000)
                  stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
                  stream.on('end', () => { clearTimeout(timeout); resolve(data) })
                })
                const gpuName = output.replace(/[\x00-\x08]/g, '').trim()
                if (gpuName && !gpuName.toLowerCase().includes('error') && !gpuName.toLowerCase().includes('not found')) {
                  gpuModel = gpuName
                  logger.info(`[BenchmarkService] GPU detected via nvidia-smi: ${gpuModel}`)
                } else {
                  gpuModel = 'NVIDIA GPU (model unknown)'
                  logger.info('[BenchmarkService] NVIDIA runtime present but nvidia-smi query failed, using generic name')
                }
              } else {
                gpuModel = 'NVIDIA GPU (model unknown)'
                logger.info('[BenchmarkService] NVIDIA runtime present but Ollama container not running')
              }
            } catch (execError) {
              gpuModel = 'NVIDIA GPU (model unknown)'
              logger.warn(`[BenchmarkService] nvidia-smi exec failed: ${execError.message}`)
            }
          }
        } catch (dockerError) {
--- a/admin/app/services/system_service.ts
+++ b/admin/app/services/system_service.ts
@ -20,7 +20,7 @@ export class SystemService {
  private static appVersion: string | null = null
  private static diskInfoFile = '/storage/nomad-disk-info.json'
-  constructor(private dockerService: DockerService) {}
+  constructor(private dockerService: DockerService) { }
  async checkServiceInstalled(serviceName: string): Promise<boolean> {
    const services = await this.getServices({ installedOnly: true });
@ -66,6 +66,66 @@ export class SystemService {
    return false
  }
  /**
   * Queries GPU details by executing `nvidia-smi` inside the running Ollama container.
   *
   * Only running containers are considered (`listContainers({ all: false })`), and the
   * Ollama container is located by name via `SERVICE_NAMES.OLLAMA`.
   *
   * This method never throws; every failure is mapped to one of the non-array return values.
   *
   * @returns One of:
   *  - an array with one entry per GPU line reported by nvidia-smi (`vendor` is always
   *    `'NVIDIA'`, `model` falls back to `'NVIDIA GPU'`, `vram` falls back to `0` when the
   *    memory column is missing or not numeric),
   *  - `'OLLAMA_NOT_FOUND'` when no running Ollama container exists,
   *  - `'BAD_RESPONSE'` when the output is empty or contains "error" / "not found",
   *  - `{ error }` when an `Error` with a message was thrown along the way,
   *  - `'UNKNOWN_ERROR'` for any other thrown value.
   */
  async getNvidiaSmiInfo(): Promise<Array<{ vendor: string; model: string; vram: number; }> | { error: string } | 'OLLAMA_NOT_FOUND' | 'BAD_RESPONSE' | 'UNKNOWN_ERROR'> {
    try {
      const containers = await this.dockerService.docker.listContainers({ all: false })
      const ollamaContainer = containers.find((c) =>
        c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
      )
      if (!ollamaContainer) {
        logger.info('Ollama container not found for nvidia-smi info retrieval. This is expected if Ollama is not installed.')
        return 'OLLAMA_NOT_FOUND'
      }

      // Execute nvidia-smi inside the Ollama container to get GPU info
      const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
      const exec = await container.exec({
        Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
        AttachStdout: true,
        AttachStderr: true,
        Tty: true,
      })

      // Read the output stream with a timeout to prevent hanging if nvidia-smi fails
      const stream = await exec.start({ Tty: true })
      const output = await new Promise<string>((resolve, reject) => {
        let data = ''
        const timeout = setTimeout(() => {
          // Give up waiting: release the stream instead of leaving it open, and
          // return whatever was collected so far.
          stream.destroy()
          resolve(data)
        }, 5000)
        stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
        stream.on('end', () => { clearTimeout(timeout); resolve(data) })
        // Without an 'error' listener a stream failure would surface as an unhandled
        // 'error' event outside this try/catch; reject so the catch below reports it.
        stream.on('error', (streamError: Error) => { clearTimeout(timeout); reject(streamError) })
      })

      // Remove any non-printable characters and trim the output
      const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
      if (cleaned && !cleaned.toLowerCase().includes('error') && !cleaned.toLowerCase().includes('not found')) {
        // Split by newlines to handle multiple GPUs installed
        const lines = cleaned.split('\n').filter(line => line.trim())

        // Map each line out to a useful structure for us
        const gpus = lines.map(line => {
          const parts = line.split(',').map((s) => s.trim())
          // The memory column may be absent or non-numeric (e.g. "[N/A]");
          // fall back to 0 rather than leaking NaN to callers.
          const parsedVram = parts[1] ? parseInt(parts[1], 10) : 0
          return {
            vendor: 'NVIDIA',
            model: parts[0] || 'NVIDIA GPU',
            vram: Number.isNaN(parsedVram) ? 0 : parsedVram,
          }
        })

        return gpus.length > 0 ? gpus : 'BAD_RESPONSE'
      }

      // Empty output, or output that looks like an error message, is a bad response from nvidia-smi
      return 'BAD_RESPONSE'
    }
    catch (error) {
      logger.error('Error getting nvidia-smi info:', error)
      if (error instanceof Error && error.message) {
        return { error: error.message }
      }
      return 'UNKNOWN_ERROR'
    }
  }
  async getServices({ installedOnly = true }: { installedOnly?: boolean }): Promise<ServiceSlim[]> {
    await this._syncContainersWithDatabase() // Sync up before fetching to ensure we have the latest status
@ -195,48 +255,17 @@ export class SystemService {
        if (!graphics.controllers || graphics.controllers.length === 0) {
          const runtimes = dockerInfo.Runtimes || {}
          if ('nvidia' in runtimes) {
-            let gpuName = 'NVIDIA GPU'
+            const nvidiaInfo = await this.getNvidiaSmiInfo()
-            try {
+            if (Array.isArray(nvidiaInfo)) {
-              const containers = await this.dockerService.docker.listContainers({ all: false })
+              graphics.controllers = nvidiaInfo.map((gpu) => ({
-              const ollamaContainer = containers.find((c) =>
+                model: gpu.model,
-                c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
+                vendor: gpu.vendor,
-              )
+                bus: "",
-              if (ollamaContainer) {
+                vram: gpu.vram,
-                const container = this.dockerService.docker.getContainer(ollamaContainer.Id)
+                vramDynamic: false, // assume false here, we don't actually use this field for our purposes.
-                const exec = await container.exec({
+              }))
-                  Cmd: ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader,nounits'],
+            } else {
-                  AttachStdout: true,
+              logger.warn(`NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
                  AttachStderr: true,
                  Tty: true,
                })
                const stream = await exec.start({ Tty: true })
                const output = await new Promise<string>((resolve) => {
                  let data = ''
                  const timeout = setTimeout(() => resolve(data), 5000)
                  stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
                  stream.on('end', () => { clearTimeout(timeout); resolve(data) })
                })
                const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
                if (cleaned && !cleaned.toLowerCase().includes('error')) {
                  const parts = cleaned.split(',').map((s) => s.trim())
                  gpuName = parts[0] || gpuName
                  const vramMB = parts[1] ? parseInt(parts[1], 10) : 0
                  graphics.controllers = [{
                    vendor: 'NVIDIA',
                    model: gpuName,
                    vram: vramMB || null,
                  } as any]
                }
              }
            } catch {
              // nvidia-smi failed, use generic entry
            }
            if (graphics.controllers.length === 0) {
              graphics.controllers = [{
                vendor: 'NVIDIA',
                model: gpuName,
                vram: null,
              } as any]
            }
          }
        }
@ -336,7 +365,7 @@ export class SystemService {
          message: 'Successfully subscribed to release notes',
        }
      }
-      
+
      return {
        success: false,
        message: `Failed to subscribe: ${response.statusText}`,