mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
feat(AI): add Ollama support for NVIDIA and AMD GPUs
This commit is contained in:
parent
d1f40663d3
commit
a697d930fe
|
|
@ -7,6 +7,9 @@ import { doResumableDownloadWithRetry } from '../utils/downloads.js'
|
|||
import { join } from 'path'
|
||||
import { ZIM_STORAGE_PATH } from '../utils/fs.js'
|
||||
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
||||
import { exec } from 'child_process'
|
||||
import { promisify } from 'util'
|
||||
import { readdir } from 'fs/promises'
|
||||
|
||||
@inject()
|
||||
export class DockerService {
|
||||
|
|
@ -444,16 +447,79 @@ export class DockerService {
|
|||
)
|
||||
}
|
||||
|
||||
// GPU-aware configuration for Ollama
|
||||
let finalImage = service.container_image
|
||||
let gpuHostConfig = containerConfig?.HostConfig || {}
|
||||
|
||||
if (service.service_name === SERVICE_NAMES.OLLAMA) {
|
||||
const gpuType = await this._detectGPUType()
|
||||
|
||||
if (gpuType === 'nvidia') {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`NVIDIA GPU detected. Configuring container with GPU support...`
|
||||
)
|
||||
|
||||
// Add GPU support for NVIDIA
|
||||
gpuHostConfig = {
|
||||
...gpuHostConfig,
|
||||
DeviceRequests: [
|
||||
{
|
||||
Driver: 'nvidia',
|
||||
Count: -1, // -1 means all GPUs
|
||||
Capabilities: [['gpu']],
|
||||
},
|
||||
],
|
||||
}
|
||||
} else if (gpuType === 'amd') {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`AMD GPU detected. Using ROCm image and configuring container with GPU support...`
|
||||
)
|
||||
|
||||
// Use ROCm image for AMD
|
||||
finalImage = 'ollama/ollama:rocm'
|
||||
|
||||
// Dynamically discover and add AMD GPU devices
|
||||
const amdDevices = await this._discoverAMDDevices()
|
||||
if (!amdDevices || amdDevices.length === 0) {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config-error',
|
||||
`Failed to discover AMD GPU devices. Proceeding with CPU-only configuration...`
|
||||
)
|
||||
gpuHostConfig = { ...gpuHostConfig } // No GPU devices added
|
||||
logger.warn(`[DockerService] No AMD GPU devices discovered for Ollama`)
|
||||
} else {
|
||||
gpuHostConfig = {
|
||||
...gpuHostConfig,
|
||||
Devices: amdDevices,
|
||||
}
|
||||
logger.info(
|
||||
`[DockerService] Configured ${amdDevices.length} AMD GPU devices for Ollama`
|
||||
)
|
||||
}
|
||||
} else {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`No GPU detected. Using CPU-only configuration...`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'creating',
|
||||
`Creating Docker container for service ${service.service_name}...`
|
||||
)
|
||||
const container = await this.docker.createContainer({
|
||||
Image: service.container_image,
|
||||
Image: finalImage,
|
||||
name: service.service_name,
|
||||
...(containerConfig?.User && { User: containerConfig.User }),
|
||||
...(containerConfig?.HostConfig && { HostConfig: containerConfig.HostConfig }),
|
||||
HostConfig: gpuHostConfig,
|
||||
...(containerConfig?.WorkingDir && { WorkingDir: containerConfig.WorkingDir }),
|
||||
...(containerConfig?.ExposedPorts && { ExposedPorts: containerConfig.ExposedPorts }),
|
||||
...(containerConfig?.Env && { Env: containerConfig.Env }),
|
||||
|
|
@ -603,6 +669,104 @@ export class DockerService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect GPU type (NVIDIA or AMD) on the system.
|
||||
* Returns 'nvidia', 'amd', or 'none'.
|
||||
*/
|
||||
private async _detectGPUType(): Promise<'nvidia' | 'amd' | 'none'> {
|
||||
try {
|
||||
const execAsync = promisify(exec)
|
||||
|
||||
// Check for NVIDIA GPU
|
||||
try {
|
||||
const { stdout: nvidiaCheck } = await execAsync(
|
||||
'lspci 2>/dev/null | grep -i nvidia || true'
|
||||
)
|
||||
if (nvidiaCheck.trim()) {
|
||||
logger.info('[DockerService] NVIDIA GPU detected')
|
||||
return 'nvidia'
|
||||
}
|
||||
} catch (error) {
|
||||
// Continue to AMD check
|
||||
}
|
||||
|
||||
// Check for AMD GPU
|
||||
try {
|
||||
const { stdout: amdCheck } = await execAsync(
|
||||
'lspci 2>/dev/null | grep -iE "amd|radeon" || true'
|
||||
)
|
||||
if (amdCheck.trim()) {
|
||||
logger.info('[DockerService] AMD GPU detected')
|
||||
return 'amd'
|
||||
}
|
||||
} catch (error) {
|
||||
// No GPU detected
|
||||
}
|
||||
|
||||
logger.info('[DockerService] No GPU detected')
|
||||
return 'none'
|
||||
} catch (error) {
|
||||
logger.warn(`[DockerService] Error detecting GPU type: ${error.message}`)
|
||||
return 'none'
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover AMD GPU DRI devices dynamically.
|
||||
* Returns an array of device configurations for Docker.
|
||||
*/
|
||||
private async _discoverAMDDevices(): Promise<
|
||||
Array<{ PathOnHost: string; PathInContainer: string; CgroupPermissions: string }>
|
||||
> {
|
||||
try {
|
||||
const devices: Array<{
|
||||
PathOnHost: string
|
||||
PathInContainer: string
|
||||
CgroupPermissions: string
|
||||
}> = []
|
||||
|
||||
// Always add /dev/kfd (Kernel Fusion Driver)
|
||||
devices.push({
|
||||
PathOnHost: '/dev/kfd',
|
||||
PathInContainer: '/dev/kfd',
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
|
||||
// Discover DRI devices in /dev/dri/
|
||||
try {
|
||||
const driDevices = await readdir('/dev/dri')
|
||||
for (const device of driDevices) {
|
||||
const devicePath = `/dev/dri/${device}`
|
||||
devices.push({
|
||||
PathOnHost: devicePath,
|
||||
PathInContainer: devicePath,
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
}
|
||||
logger.info(
|
||||
`[DockerService] Discovered ${driDevices.length} DRI devices: ${driDevices.join(', ')}`
|
||||
)
|
||||
} catch (error) {
|
||||
logger.warn(`[DockerService] Could not read /dev/dri directory: ${error.message}`)
|
||||
// Fallback to common device names if directory read fails
|
||||
const fallbackDevices = ['card0', 'renderD128']
|
||||
for (const device of fallbackDevices) {
|
||||
devices.push({
|
||||
PathOnHost: `/dev/dri/${device}`,
|
||||
PathInContainer: `/dev/dri/${device}`,
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
}
|
||||
logger.info(`[DockerService] Using fallback DRI devices: ${fallbackDevices.join(', ')}`)
|
||||
}
|
||||
|
||||
return devices
|
||||
} catch (error) {
|
||||
logger.error(`[DockerService] Error discovering AMD devices: ${error.message}`)
|
||||
return []
|
||||
}
|
||||
}
|
||||
|
||||
private _broadcast(service: string, status: string, message: string) {
|
||||
transmit.broadcast('service-installation', {
|
||||
service_name: service,
|
||||
|
|
|
|||
|
|
@ -203,6 +203,82 @@ ensure_docker_installed() {
|
|||
fi
|
||||
}
|
||||
|
||||
# Install and configure the NVIDIA Container Toolkit so Docker can pass
# NVIDIA GPUs through to containers (used by the Ollama service).
# Best-effort by design: every failure prints a warning and returns 0 so
# the main installer flow is never aborted by an optional GPU stack.
# NOTE(review): the install steps are apt-get based — assumes a
# Debian/Ubuntu host; confirm other distros are intentionally unsupported.
setup_nvidia_container_toolkit() {
  echo -e "${YELLOW}#${RESET} Checking for NVIDIA GPU...\\n"

  # Safely detect NVIDIA GPU via lspci (guarded: lspci may be absent).
  local has_nvidia_gpu=false
  if command -v lspci &> /dev/null; then
    if lspci 2>/dev/null | grep -i nvidia &> /dev/null; then
      has_nvidia_gpu=true
      echo -e "${GREEN}#${RESET} NVIDIA GPU detected.\\n"
    fi
  fi

  # Also check for nvidia-smi — catches hosts where lspci missed the GPU
  # but the NVIDIA driver is installed and working.
  if ! $has_nvidia_gpu && command -v nvidia-smi &> /dev/null; then
    if nvidia-smi &> /dev/null; then
      has_nvidia_gpu=true
      echo -e "${GREEN}#${RESET} NVIDIA GPU detected via nvidia-smi.\\n"
    fi
  fi

  # No NVIDIA hardware: nothing to install, succeed silently.
  if ! $has_nvidia_gpu; then
    echo -e "${YELLOW}#${RESET} No NVIDIA GPU detected. Skipping NVIDIA container toolkit installation.\\n"
    return 0
  fi

  # Check if nvidia-container-toolkit is already installed (nvidia-ctk is
  # the CLI shipped with the toolkit).
  if command -v nvidia-ctk &> /dev/null; then
    echo -e "${GREEN}#${RESET} NVIDIA container toolkit is already installed.\\n"
    return 0
  fi

  echo -e "${YELLOW}#${RESET} Installing NVIDIA container toolkit...\\n"

  # Install dependencies per https://docs.ollama.com/docker - wrapped in error handling
  # Step 1: add NVIDIA's GPG key for the libnvidia-container repo.
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey 2>/dev/null | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit GPG key. Continuing anyway...\\n"
    return 0
  fi

  # Step 2: add the apt repository, rewriting the deb lines so apt pins the
  # keyring added above (signed-by=).
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list 2>/dev/null \
    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null 2>&1; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit repository. Continuing anyway...\\n"
    return 0
  fi

  # Step 3: refresh the package index so the new repo is visible.
  if ! sudo apt-get update 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to update package list. Continuing anyway...\\n"
    return 0
  fi

  # Step 4: install the toolkit itself.
  if ! sudo apt-get install -y nvidia-container-toolkit 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to install NVIDIA container toolkit. Continuing anyway...\\n"
    return 0
  fi

  echo -e "${GREEN}#${RESET} NVIDIA container toolkit installed successfully.\\n"

  # Configure Docker to use NVIDIA runtime (writes /etc/docker/daemon.json).
  echo -e "${YELLOW}#${RESET} Configuring Docker to use NVIDIA runtime...\\n"

  if ! sudo nvidia-ctk runtime configure --runtime=docker 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to configure NVIDIA runtime for Docker. Continuing anyway...\\n"
    return 0
  fi

  # Restart Docker service so the new runtime configuration takes effect.
  echo -e "${YELLOW}#${RESET} Restarting Docker service...\\n"
  if ! sudo systemctl restart docker 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to restart Docker service. You may need to restart it manually.\\n"
    return 0
  fi

  echo -e "${GREEN}#${RESET} NVIDIA container toolkit configuration completed successfully.\\n"
}
|
||||
|
||||
get_install_confirmation(){
|
||||
read -p "This script will install/update Project N.O.M.A.D. and its dependencies on your machine. Are you sure you want to continue? (y/n): " choice
|
||||
case "$choice" in
|
||||
|
|
@ -439,6 +515,7 @@ check_is_debug_mode
|
|||
get_install_confirmation
|
||||
accept_terms
|
||||
ensure_docker_installed
|
||||
setup_nvidia_container_toolkit
|
||||
get_local_ip
|
||||
create_nomad_directory
|
||||
download_wait_for_it_script
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user