From a697d930fe2ac149067790c756b5bb08b36dded7 Mon Sep 17 00:00:00 2001
From: Jake Turner
Date: Mon, 2 Feb 2026 00:24:10 +0000
Subject: [PATCH] feat(AI): add Ollama support for NVIDIA and AMD GPUs

---
 admin/app/services/docker_service.ts | 168 ++++++++++++++++++++++++++-
 install/install_nomad.sh             |  77 ++++++++++++
 2 files changed, 243 insertions(+), 2 deletions(-)

diff --git a/admin/app/services/docker_service.ts b/admin/app/services/docker_service.ts
index 5b352ab..99ad6d8 100644
--- a/admin/app/services/docker_service.ts
+++ b/admin/app/services/docker_service.ts
@@ -7,6 +7,9 @@ import { doResumableDownloadWithRetry } from '../utils/downloads.js'
 import { join } from 'path'
 import { ZIM_STORAGE_PATH } from '../utils/fs.js'
 import { SERVICE_NAMES } from '../../constants/service_names.js'
+import { exec } from 'child_process'
+import { promisify } from 'util'
+import { readdir } from 'fs/promises'
 
 @inject()
 export class DockerService {
@@ -444,16 +447,79 @@ export class DockerService {
       )
     }
 
+    // GPU-aware configuration for Ollama
+    let finalImage = service.container_image
+    let gpuHostConfig = containerConfig?.HostConfig || {}
+
+    if (service.service_name === SERVICE_NAMES.OLLAMA) {
+      const gpuType = await this._detectGPUType()
+
+      if (gpuType === 'nvidia') {
+        this._broadcast(
+          service.service_name,
+          'gpu-config',
+          `NVIDIA GPU detected. Configuring container with GPU support...`
+        )
+
+        // Add GPU support for NVIDIA
+        gpuHostConfig = {
+          ...gpuHostConfig,
+          DeviceRequests: [
+            {
+              Driver: 'nvidia',
+              Count: -1, // -1 means all GPUs
+              Capabilities: [['gpu']],
+            },
+          ],
+        }
+      } else if (gpuType === 'amd') {
+        this._broadcast(
+          service.service_name,
+          'gpu-config',
+          `AMD GPU detected. Using ROCm image and configuring container with GPU support...`
+        )
+
+        // Use ROCm image for AMD
+        finalImage = 'ollama/ollama:rocm'
+
+        // Dynamically discover and add AMD GPU devices
+        const amdDevices = await this._discoverAMDDevices()
+        if (!amdDevices || amdDevices.length === 0) {
+          this._broadcast(
+            service.service_name,
+            'gpu-config-error',
+            `Failed to discover AMD GPU devices. Proceeding with CPU-only configuration...`
+          )
+          gpuHostConfig = { ...gpuHostConfig } // No GPU devices added
+          logger.warn(`[DockerService] No AMD GPU devices discovered for Ollama`)
+        } else {
+          gpuHostConfig = {
+            ...gpuHostConfig,
+            Devices: amdDevices,
+          }
+          logger.info(
+            `[DockerService] Configured ${amdDevices.length} AMD GPU devices for Ollama`
+          )
+        }
+      } else {
+        this._broadcast(
+          service.service_name,
+          'gpu-config',
+          `No GPU detected. Using CPU-only configuration...`
+        )
+      }
+    }
+
     this._broadcast(
       service.service_name,
       'creating',
       `Creating Docker container for service ${service.service_name}...`
     )
     const container = await this.docker.createContainer({
-      Image: service.container_image,
+      Image: finalImage,
       name: service.service_name,
       ...(containerConfig?.User && { User: containerConfig.User }),
-      ...(containerConfig?.HostConfig && { HostConfig: containerConfig.HostConfig }),
+      HostConfig: gpuHostConfig,
       ...(containerConfig?.WorkingDir && { WorkingDir: containerConfig.WorkingDir }),
       ...(containerConfig?.ExposedPorts && { ExposedPorts: containerConfig.ExposedPorts }),
       ...(containerConfig?.Env && { Env: containerConfig.Env }),
@@ -603,6 +669,104 @@ export class DockerService {
     }
   }
 
+  /**
+   * Detect GPU type (NVIDIA or AMD) on the system.
+   * Returns 'nvidia', 'amd', or 'none'.
+   */
+  private async _detectGPUType(): Promise<'nvidia' | 'amd' | 'none'> {
+    try {
+      const execAsync = promisify(exec)
+
+      // Check for NVIDIA GPU (display-class devices only, so NVIDIA NICs etc. don't match)
+      try {
+        const { stdout: nvidiaCheck } = await execAsync(
+          'lspci 2>/dev/null | grep -iE "vga|3d|display" | grep -i nvidia || true'
+        )
+        if (nvidiaCheck.trim()) {
+          logger.info('[DockerService] NVIDIA GPU detected')
+          return 'nvidia'
+        }
+      } catch (error) {
+        // Continue to AMD check
+      }
+
+      // Check for AMD GPU (display-class devices only, so AMD chipset devices don't match)
+      try {
+        const { stdout: amdCheck } = await execAsync(
+          'lspci 2>/dev/null | grep -iE "vga|3d|display" | grep -iE "amd|radeon" || true'
+        )
+        if (amdCheck.trim()) {
+          logger.info('[DockerService] AMD GPU detected')
+          return 'amd'
+        }
+      } catch (error) {
+        // No GPU detected
+      }
+
+      logger.info('[DockerService] No GPU detected')
+      return 'none'
+    } catch (error) {
+      logger.warn(`[DockerService] Error detecting GPU type: ${error.message}`)
+      return 'none'
+    }
+  }
+
+  /**
+   * Discover AMD GPU DRI devices dynamically.
+   * Returns an array of device configurations for Docker.
+   */
+  private async _discoverAMDDevices(): Promise<
+    Array<{ PathOnHost: string; PathInContainer: string; CgroupPermissions: string }>
+  > {
+    try {
+      const devices: Array<{
+        PathOnHost: string
+        PathInContainer: string
+        CgroupPermissions: string
+      }> = []
+
+      // Always add /dev/kfd (Kernel Fusion Driver)
+      devices.push({
+        PathOnHost: '/dev/kfd',
+        PathInContainer: '/dev/kfd',
+        CgroupPermissions: 'rwm',
+      })
+
+      // Discover DRI devices in /dev/dri/ (skip non-device entries such as by-path/)
+      try {
+        const driDevices = (await readdir('/dev/dri')).filter((d) => d.startsWith('card') || d.startsWith('renderD'))
+        for (const device of driDevices) {
+          const devicePath = `/dev/dri/${device}`
+          devices.push({
+            PathOnHost: devicePath,
+            PathInContainer: devicePath,
+            CgroupPermissions: 'rwm',
+          })
+        }
+        logger.info(
+          `[DockerService] Discovered ${driDevices.length} DRI devices: ${driDevices.join(', ')}`
+        )
+      } catch (error) {
+        logger.warn(`[DockerService] Could not read /dev/dri directory: ${error.message}`)
+        // Fallback to common device names if directory read fails
+        const fallbackDevices = ['card0', 'renderD128']
+        for (const device of fallbackDevices) {
+          devices.push({
+            PathOnHost: `/dev/dri/${device}`,
+            PathInContainer: `/dev/dri/${device}`,
+            CgroupPermissions: 'rwm',
+          })
+        }
+        logger.info(`[DockerService] Using fallback DRI devices: ${fallbackDevices.join(', ')}`)
+      }
+
+      return devices
+    } catch (error) {
+      logger.error(`[DockerService] Error discovering AMD devices: ${error.message}`)
+      return []
+    }
+  }
+
   private _broadcast(service: string, status: string, message: string) {
     transmit.broadcast('service-installation', {
       service_name: service,
diff --git a/install/install_nomad.sh b/install/install_nomad.sh
index fc176b6..d960823 100644
--- a/install/install_nomad.sh
+++ b/install/install_nomad.sh
@@ -203,6 +203,82 @@ ensure_docker_installed() {
     fi
 }
 
+setup_nvidia_container_toolkit() {
+    echo -e "${YELLOW}#${RESET} Checking for NVIDIA GPU...\\n"
+
+    # Safely detect NVIDIA GPU (display-class devices only, so NVIDIA NICs don't match)
+    local has_nvidia_gpu=false
+    if command -v lspci &> /dev/null; then
+        if lspci 2>/dev/null | grep -iE "vga|3d|display" | grep -i nvidia &> /dev/null; then
+            has_nvidia_gpu=true
+            echo -e "${GREEN}#${RESET} NVIDIA GPU detected.\\n"
+        fi
+    fi
+
+    # Also check for nvidia-smi
+    if ! $has_nvidia_gpu && command -v nvidia-smi &> /dev/null; then
+        if nvidia-smi &> /dev/null; then
+            has_nvidia_gpu=true
+            echo -e "${GREEN}#${RESET} NVIDIA GPU detected via nvidia-smi.\\n"
+        fi
+    fi
+
+    if ! $has_nvidia_gpu; then
+        echo -e "${YELLOW}#${RESET} No NVIDIA GPU detected. Skipping NVIDIA container toolkit installation.\\n"
+        return 0
+    fi
+
+    # Check if nvidia-container-toolkit is already installed
+    if command -v nvidia-ctk &> /dev/null; then
+        echo -e "${GREEN}#${RESET} NVIDIA container toolkit is already installed.\\n"
+        return 0
+    fi
+
+    echo -e "${YELLOW}#${RESET} Installing NVIDIA container toolkit...\\n"
+
+    # Install dependencies per https://docs.ollama.com/docker - wrapped in error handling
+    if ! curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey 2>/dev/null | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit GPG key. Continuing anyway...\\n"
+        return 0
+    fi
+
+    if ! curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list 2>/dev/null \
+        | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+        | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null 2>&1; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit repository. Continuing anyway...\\n"
+        return 0
+    fi
+
+    if ! sudo apt-get update 2>/dev/null; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to update package list. Continuing anyway...\\n"
+        return 0
+    fi
+
+    if ! sudo apt-get install -y nvidia-container-toolkit 2>/dev/null; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to install NVIDIA container toolkit. Continuing anyway...\\n"
+        return 0
+    fi
+
+    echo -e "${GREEN}#${RESET} NVIDIA container toolkit installed successfully.\\n"
+
+    # Configure Docker to use NVIDIA runtime
+    echo -e "${YELLOW}#${RESET} Configuring Docker to use NVIDIA runtime...\\n"
+
+    if ! sudo nvidia-ctk runtime configure --runtime=docker 2>/dev/null; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to configure NVIDIA runtime for Docker. Continuing anyway...\\n"
+        return 0
+    fi
+
+    # Restart Docker service
+    echo -e "${YELLOW}#${RESET} Restarting Docker service...\\n"
+    if ! sudo systemctl restart docker 2>/dev/null; then
+        echo -e "${YELLOW}#${RESET} Warning: Failed to restart Docker service. You may need to restart it manually.\\n"
+        return 0
+    fi
+
+    echo -e "${GREEN}#${RESET} NVIDIA container toolkit configuration completed successfully.\\n"
+}
+
 get_install_confirmation(){
     read -p "This script will install/update Project N.O.M.A.D. and its dependencies on your machine. Are you sure you want to continue? (y/n): " choice
     case "$choice" in
@@ -439,6 +515,7 @@ check_is_debug_mode
 get_install_confirmation
 accept_terms
 ensure_docker_installed
+setup_nvidia_container_toolkit
 get_local_ip
 create_nomad_directory
 download_wait_for_it_script