mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
feat(AI): add Ollama support for NVIDIA and AMD GPUs
This commit is contained in:
parent
d1f40663d3
commit
a697d930fe
|
|
@ -7,6 +7,9 @@ import { doResumableDownloadWithRetry } from '../utils/downloads.js'
|
|||
import { join } from 'path'
|
||||
import { ZIM_STORAGE_PATH } from '../utils/fs.js'
|
||||
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
||||
import { exec } from 'child_process'
|
||||
import { promisify } from 'util'
|
||||
import { readdir } from 'fs/promises'
|
||||
|
||||
@inject()
|
||||
export class DockerService {
|
||||
|
|
@ -444,16 +447,79 @@ export class DockerService {
|
|||
)
|
||||
}
|
||||
|
||||
// GPU-aware configuration for Ollama
|
||||
let finalImage = service.container_image
|
||||
let gpuHostConfig = containerConfig?.HostConfig || {}
|
||||
|
||||
if (service.service_name === SERVICE_NAMES.OLLAMA) {
|
||||
const gpuType = await this._detectGPUType()
|
||||
|
||||
if (gpuType === 'nvidia') {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`NVIDIA GPU detected. Configuring container with GPU support...`
|
||||
)
|
||||
|
||||
// Add GPU support for NVIDIA
|
||||
gpuHostConfig = {
|
||||
...gpuHostConfig,
|
||||
DeviceRequests: [
|
||||
{
|
||||
Driver: 'nvidia',
|
||||
Count: -1, // -1 means all GPUs
|
||||
Capabilities: [['gpu']],
|
||||
},
|
||||
],
|
||||
}
|
||||
} else if (gpuType === 'amd') {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`AMD GPU detected. Using ROCm image and configuring container with GPU support...`
|
||||
)
|
||||
|
||||
// Use ROCm image for AMD
|
||||
finalImage = 'ollama/ollama:rocm'
|
||||
|
||||
// Dynamically discover and add AMD GPU devices
|
||||
const amdDevices = await this._discoverAMDDevices()
|
||||
if (!amdDevices || amdDevices.length === 0) {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config-error',
|
||||
`Failed to discover AMD GPU devices. Proceeding with CPU-only configuration...`
|
||||
)
|
||||
gpuHostConfig = { ...gpuHostConfig } // No GPU devices added
|
||||
logger.warn(`[DockerService] No AMD GPU devices discovered for Ollama`)
|
||||
} else {
|
||||
gpuHostConfig = {
|
||||
...gpuHostConfig,
|
||||
Devices: amdDevices,
|
||||
}
|
||||
logger.info(
|
||||
`[DockerService] Configured ${amdDevices.length} AMD GPU devices for Ollama`
|
||||
)
|
||||
}
|
||||
} else {
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'gpu-config',
|
||||
`No GPU detected. Using CPU-only configuration...`
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
this._broadcast(
|
||||
service.service_name,
|
||||
'creating',
|
||||
`Creating Docker container for service ${service.service_name}...`
|
||||
)
|
||||
const container = await this.docker.createContainer({
|
||||
Image: service.container_image,
|
||||
Image: finalImage,
|
||||
name: service.service_name,
|
||||
...(containerConfig?.User && { User: containerConfig.User }),
|
||||
...(containerConfig?.HostConfig && { HostConfig: containerConfig.HostConfig }),
|
||||
HostConfig: gpuHostConfig,
|
||||
...(containerConfig?.WorkingDir && { WorkingDir: containerConfig.WorkingDir }),
|
||||
...(containerConfig?.ExposedPorts && { ExposedPorts: containerConfig.ExposedPorts }),
|
||||
...(containerConfig?.Env && { Env: containerConfig.Env }),
|
||||
|
|
@ -603,6 +669,104 @@ export class DockerService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect GPU type (NVIDIA or AMD) on the system.
|
||||
* Returns 'nvidia', 'amd', or 'none'.
|
||||
*/
|
||||
private async _detectGPUType(): Promise<'nvidia' | 'amd' | 'none'> {
|
||||
try {
|
||||
const execAsync = promisify(exec)
|
||||
|
||||
// Check for NVIDIA GPU
|
||||
try {
|
||||
const { stdout: nvidiaCheck } = await execAsync(
|
||||
'lspci 2>/dev/null | grep -i nvidia || true'
|
||||
)
|
||||
if (nvidiaCheck.trim()) {
|
||||
logger.info('[DockerService] NVIDIA GPU detected')
|
||||
return 'nvidia'
|
||||
}
|
||||
} catch (error) {
|
||||
// Continue to AMD check
|
||||
}
|
||||
|
||||
// Check for AMD GPU
|
||||
try {
|
||||
const { stdout: amdCheck } = await execAsync(
|
||||
'lspci 2>/dev/null | grep -iE "amd|radeon" || true'
|
||||
)
|
||||
if (amdCheck.trim()) {
|
||||
logger.info('[DockerService] AMD GPU detected')
|
||||
return 'amd'
|
||||
}
|
||||
} catch (error) {
|
||||
// No GPU detected
|
||||
}
|
||||
|
||||
logger.info('[DockerService] No GPU detected')
|
||||
return 'none'
|
||||
} catch (error) {
|
||||
logger.warn(`[DockerService] Error detecting GPU type: ${error.message}`)
|
||||
return 'none'
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover AMD GPU DRI devices dynamically.
|
||||
* Returns an array of device configurations for Docker.
|
||||
*/
|
||||
private async _discoverAMDDevices(): Promise<
|
||||
Array<{ PathOnHost: string; PathInContainer: string; CgroupPermissions: string }>
|
||||
> {
|
||||
try {
|
||||
const devices: Array<{
|
||||
PathOnHost: string
|
||||
PathInContainer: string
|
||||
CgroupPermissions: string
|
||||
}> = []
|
||||
|
||||
// Always add /dev/kfd (Kernel Fusion Driver)
|
||||
devices.push({
|
||||
PathOnHost: '/dev/kfd',
|
||||
PathInContainer: '/dev/kfd',
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
|
||||
// Discover DRI devices in /dev/dri/
|
||||
try {
|
||||
const driDevices = await readdir('/dev/dri')
|
||||
for (const device of driDevices) {
|
||||
const devicePath = `/dev/dri/${device}`
|
||||
devices.push({
|
||||
PathOnHost: devicePath,
|
||||
PathInContainer: devicePath,
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
}
|
||||
logger.info(
|
||||
`[DockerService] Discovered ${driDevices.length} DRI devices: ${driDevices.join(', ')}`
|
||||
)
|
||||
} catch (error) {
|
||||
logger.warn(`[DockerService] Could not read /dev/dri directory: ${error.message}`)
|
||||
// Fallback to common device names if directory read fails
|
||||
const fallbackDevices = ['card0', 'renderD128']
|
||||
for (const device of fallbackDevices) {
|
||||
devices.push({
|
||||
PathOnHost: `/dev/dri/${device}`,
|
||||
PathInContainer: `/dev/dri/${device}`,
|
||||
CgroupPermissions: 'rwm',
|
||||
})
|
||||
}
|
||||
logger.info(`[DockerService] Using fallback DRI devices: ${fallbackDevices.join(', ')}`)
|
||||
}
|
||||
|
||||
return devices
|
||||
} catch (error) {
|
||||
logger.error(`[DockerService] Error discovering AMD devices: ${error.message}`)
|
||||
return []
|
||||
}
|
||||
}
|
||||
|
||||
private _broadcast(service: string, status: string, message: string) {
|
||||
transmit.broadcast('service-installation', {
|
||||
service_name: service,
|
||||
|
|
|
|||
|
|
@ -203,6 +203,82 @@ ensure_docker_installed() {
|
|||
fi
|
||||
}
|
||||
|
||||
# Install and configure the NVIDIA Container Toolkit so Docker can pass
# NVIDIA GPUs through to containers (used by the Ollama service).
# Best-effort by design: every failure prints a warning and returns 0 so
# the main installer flow is never aborted by an optional GPU stack.
# NOTE(review): the install steps are apt-get based — assumes a
# Debian/Ubuntu host; confirm other distros are intentionally unsupported.
setup_nvidia_container_toolkit() {
  echo -e "${YELLOW}#${RESET} Checking for NVIDIA GPU...\\n"

  # Safely detect NVIDIA GPU via lspci (guarded: lspci may be absent).
  local has_nvidia_gpu=false
  if command -v lspci &> /dev/null; then
    if lspci 2>/dev/null | grep -i nvidia &> /dev/null; then
      has_nvidia_gpu=true
      echo -e "${GREEN}#${RESET} NVIDIA GPU detected.\\n"
    fi
  fi

  # Also check for nvidia-smi — catches hosts where lspci missed the GPU
  # but the NVIDIA driver is installed and working.
  if ! $has_nvidia_gpu && command -v nvidia-smi &> /dev/null; then
    if nvidia-smi &> /dev/null; then
      has_nvidia_gpu=true
      echo -e "${GREEN}#${RESET} NVIDIA GPU detected via nvidia-smi.\\n"
    fi
  fi

  # No NVIDIA hardware: nothing to install, succeed silently.
  if ! $has_nvidia_gpu; then
    echo -e "${YELLOW}#${RESET} No NVIDIA GPU detected. Skipping NVIDIA container toolkit installation.\\n"
    return 0
  fi

  # Check if nvidia-container-toolkit is already installed (nvidia-ctk is
  # the CLI shipped with the toolkit).
  if command -v nvidia-ctk &> /dev/null; then
    echo -e "${GREEN}#${RESET} NVIDIA container toolkit is already installed.\\n"
    return 0
  fi

  echo -e "${YELLOW}#${RESET} Installing NVIDIA container toolkit...\\n"

  # Install dependencies per https://docs.ollama.com/docker - wrapped in error handling
  # Step 1: add NVIDIA's GPG key for the libnvidia-container repo.
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey 2>/dev/null | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit GPG key. Continuing anyway...\\n"
    return 0
  fi

  # Step 2: add the apt repository, rewriting the deb lines so apt pins the
  # keyring added above (signed-by=).
  if ! curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list 2>/dev/null \
    | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
    | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list > /dev/null 2>&1; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to add NVIDIA container toolkit repository. Continuing anyway...\\n"
    return 0
  fi

  # Step 3: refresh the package index so the new repo is visible.
  if ! sudo apt-get update 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to update package list. Continuing anyway...\\n"
    return 0
  fi

  # Step 4: install the toolkit itself.
  if ! sudo apt-get install -y nvidia-container-toolkit 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to install NVIDIA container toolkit. Continuing anyway...\\n"
    return 0
  fi

  echo -e "${GREEN}#${RESET} NVIDIA container toolkit installed successfully.\\n"

  # Configure Docker to use NVIDIA runtime (writes /etc/docker/daemon.json).
  echo -e "${YELLOW}#${RESET} Configuring Docker to use NVIDIA runtime...\\n"

  if ! sudo nvidia-ctk runtime configure --runtime=docker 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to configure NVIDIA runtime for Docker. Continuing anyway...\\n"
    return 0
  fi

  # Restart Docker service so the new runtime configuration takes effect.
  echo -e "${YELLOW}#${RESET} Restarting Docker service...\\n"
  if ! sudo systemctl restart docker 2>/dev/null; then
    echo -e "${YELLOW}#${RESET} Warning: Failed to restart Docker service. You may need to restart it manually.\\n"
    return 0
  fi

  echo -e "${GREEN}#${RESET} NVIDIA container toolkit configuration completed successfully.\\n"
}
|
||||
|
||||
get_install_confirmation(){
|
||||
read -p "This script will install/update Project N.O.M.A.D. and its dependencies on your machine. Are you sure you want to continue? (y/n): " choice
|
||||
case "$choice" in
|
||||
|
|
@ -439,6 +515,7 @@ check_is_debug_mode
|
|||
get_install_confirmation
|
||||
accept_terms
|
||||
ensure_docker_installed
|
||||
setup_nvidia_container_toolkit
|
||||
get_local_ip
|
||||
create_nomad_directory
|
||||
download_wait_for_it_script
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user