mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
GPU detection results were only applied at container creation time and never persisted. If live detection failed transiently (Docker daemon hiccup, runtime temporarily unavailable), Ollama would silently fall back to CPU-only mode with no way to recover short of force-reinstall. Now _detectGPUType() persists successful detections to the KV store (gpu.type = 'nvidia' | 'amd') and uses the saved value as a fallback when live detection returns nothing. This ensures GPU config survives across container recreations regardless of transient detection failures. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1085 lines
40 KiB
TypeScript
1085 lines
40 KiB
TypeScript
import Service from '#models/service'
|
|
import Docker from 'dockerode'
|
|
import logger from '@adonisjs/core/services/logger'
|
|
import { inject } from '@adonisjs/core'
|
|
import transmit from '@adonisjs/transmit/services/main'
|
|
import { doResumableDownloadWithRetry } from '../utils/downloads.js'
|
|
import { join } from 'path'
|
|
import { ZIM_STORAGE_PATH } from '../utils/fs.js'
|
|
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
|
import { exec } from 'child_process'
|
|
import { promisify } from 'util'
|
|
// import { readdir } from 'fs/promises'
|
|
import KVStore from '#models/kv_store'
|
|
import { BROADCAST_CHANNELS } from '../../constants/broadcast.js'
|
|
|
|
@inject()
|
|
export class DockerService {
|
|
public docker: Docker
|
|
private activeInstallations: Set<string> = new Set()
|
|
public static NOMAD_NETWORK = 'project-nomad_default'
|
|
|
|
/**
 * Creates the dockerode client, selecting the daemon endpoint by platform:
 * the Docker Desktop named pipe on Windows (development), and the Unix
 * socket on every other platform (Linux in production).
 */
constructor() {
  const socketPath =
    process.platform === 'win32'
      ? '//./pipe/docker_engine' // Windows Docker Desktop named pipe
      : '/var/run/docker.sock' // Unix socket
  this.docker = new Docker({ socketPath })
}
|
|
|
|
/**
 * Starts, stops, or restarts the Docker container that backs an installed service.
 *
 * The service must exist in the database and be flagged as installed, and a container
 * whose names include `/<serviceName>` must exist. Failures are logged and reported
 * through the returned object; this method does not throw.
 *
 * @param serviceName - Name of the service, which is also the expected container name.
 * @param action - Lifecycle operation to perform on the container.
 * @returns `success` plus a human-readable `message` describing the outcome.
 */
async affectContainer(
  serviceName: string,
  action: 'start' | 'stop' | 'restart'
): Promise<{ success: boolean; message: string }> {
  try {
    const service = await Service.query().where('service_name', serviceName).first()
    if (!service || !service.installed) {
      return {
        success: false,
        message: `Service ${serviceName} not found or not installed`,
      }
    }

    const containers = await this.docker.listContainers({ all: true })
    const container = containers.find((c) => c.Names.includes(`/${serviceName}`))
    if (!container) {
      return {
        success: false,
        message: `Container for service ${serviceName} not found`,
      }
    }

    const dockerContainer = this.docker.getContainer(container.Id)

    switch (action) {
      case 'stop':
        await dockerContainer.stop()
        return {
          success: true,
          message: `Service ${serviceName} stopped successfully`,
        }

      case 'restart':
        await dockerContainer.restart()
        return {
          success: true,
          message: `Service ${serviceName} restarted successfully`,
        }

      case 'start':
        // Starting an already running container is reported as success without calling Docker.
        if (container.State === 'running') {
          return {
            success: true,
            message: `Service ${serviceName} is already running`,
          }
        }
        await dockerContainer.start()
        return {
          success: true,
          message: `Service ${serviceName} started successfully`,
        }

      default:
        // Only reachable when a caller bypasses the type system.
        return {
          success: false,
          message: `Invalid action: ${action}. Use 'start', 'stop', or 'restart'.`,
        }
    }
  } catch (error) {
    // Report the action that actually failed instead of always saying "start".
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(`Error performing '${action}' on service ${serviceName}: ${reason}`)
    return {
      success: false,
      message: `Failed to ${action} service ${serviceName}: ${reason}`,
    }
  }
}
|
|
|
|
/**
 * Lists every Docker container (running or not) whose first name, with the leading
 * slash removed, starts with `nomad_`, together with its current state.
 *
 * @returns One `{ service_name, status }` entry per matching container name, or an
 *          empty array when the Docker query fails (the error is logged).
 */
async getServicesStatus(): Promise<{ service_name: string; status: string }[]> {
  try {
    const nomadContainers = new Map<string, Docker.ContainerInfo>()

    for (const info of await this.docker.listContainers({ all: true })) {
      const primaryName = info.Names[0]?.replace('/', '')
      if (primaryName?.startsWith('nomad_')) {
        nomadContainers.set(primaryName, info)
      }
    }

    return Array.from(nomadContainers, ([serviceName, info]) => ({
      service_name: serviceName,
      status: info.State,
    }))
  } catch (error) {
    logger.error(`Error fetching services status: ${error.message}`)
    return []
  }
}
|
|
|
|
/**
 * Builds the URL at which an installed service can be reached.
 *
 * The host is the service name when `NODE_ENV` is `production`, otherwise `localhost`.
 * The port comes from `ui_location` when that value parses (via `parseInt`) to a non-zero
 * number; failing that, from the first `HostPort` found in the `HostConfig.PortBindings`
 * of the service's parsed container config.
 *
 * @param serviceName - The name of the service to get the URL for.
 * @returns The URL, or null when the name is blank, the service is not installed,
 *          or no port can be determined.
 */
async getServiceURL(serviceName: string): Promise<string | null> {
  if (!serviceName || serviceName.trim() === '') {
    return null
  }

  const service = await Service.query()
    .where('service_name', serviceName)
    .andWhere('installed', true)
    .first()
  if (!service) {
    return null
  }

  const host = process.env.NODE_ENV === 'production' ? serviceName : 'localhost'

  // Preferred source: an explicit UI port stored on the service record.
  if (service.ui_location && parseInt(service.ui_location, 10)) {
    return `http://${host}:${service.ui_location}`
  }

  // Otherwise fall back to the first host port bound in the container config.
  const config = this._parseContainerConfig(service.container_config)
  const bindingsByPort = config?.HostConfig?.PortBindings
  if (bindingsByPort) {
    const allBindings = Object.values(bindingsByPort).flat() as { HostPort: string }[]
    const candidatePorts = allBindings.map((entry) => entry.HostPort)
    if (candidatePorts.length > 0) {
      return `http://${host}:${candidatePorts[0]}`
    }
  }

  // Nothing usable was found.
  return null
}
|
|
|
|
/**
 * Validates that a service can be installed, marks it as installing, and starts
 * container creation in the background.
 *
 * The request is rejected (with `success: false`) when the service is unknown, already
 * installed, or already being installed according to either the database status or the
 * in-memory `activeInstallations` set.
 *
 * @param serviceName - Name of the service to install.
 * @returns A result object; `success: true` means the installation was started, not that
 *          it finished. Progress is reported through `_createContainer`'s broadcasts.
 * @throws Rethrows any error raised while persisting the `installing` status or parsing
 *         the container config, after releasing the installation lock.
 */
async createContainerPreflight(
  serviceName: string
): Promise<{ success: boolean; message: string }> {
  const service = await Service.query().where('service_name', serviceName).first()
  if (!service) {
    return {
      success: false,
      message: `Service ${serviceName} not found`,
    }
  }

  if (service.installed) {
    return {
      success: false,
      message: `Service ${serviceName} is already installed`,
    }
  }

  // Check if installation is already in progress (database-level)
  if (service.installation_status === 'installing') {
    return {
      success: false,
      message: `Service ${serviceName} installation is already in progress`,
    }
  }

  // Double-check with in-memory tracking (race condition protection)
  if (this.activeInstallations.has(serviceName)) {
    return {
      success: false,
      message: `Service ${serviceName} installation is already in progress`,
    }
  }

  // Mark installation as in progress
  this.activeInstallations.add(serviceName)

  let containerConfig
  try {
    service.installation_status = 'installing'
    await service.save()

    // Check if a service wasn't marked as installed but has an existing container
    // This can happen if the service was created but not properly installed
    // or if the container was removed manually without updating the service status.
    // if (await this._checkIfServiceContainerExists(serviceName)) {
    //   const removeResult = await this._removeServiceContainer(serviceName);
    //   if (!removeResult.success) {
    //     return {
    //       success: false,
    //       message: `Failed to remove existing container for service ${serviceName}: ${removeResult.message}`,
    //     };
    //   }
    // }

    containerConfig = this._parseContainerConfig(service.container_config)
  } catch (error) {
    // Without this, a failure here would leave the service locked in memory (and possibly
    // stuck in the 'installing' status), blocking every later install attempt.
    this.activeInstallations.delete(serviceName)
    await this._cleanupFailedInstallation(serviceName)
    throw error
  }

  // Execute installation asynchronously and handle cleanup
  this._createContainer(service, containerConfig).catch(async (error) => {
    logger.error(`Installation failed for ${serviceName}: ${error.message}`)
    await this._cleanupFailedInstallation(serviceName)
  })

  return {
    success: true,
    message: `Service ${serviceName} installation initiated successfully. You can receive updates via server-sent events.`,
  }
}
|
|
|
|
/**
 * Tears a service down and reinstalls it from scratch.
 *
 * Sequence: stop the container if it is running, force-remove it, delete every Docker
 * volume whose name contains the service name or whose `service` label equals it, flag
 * the service as not installed, then start container creation in the background.
 * Problems during container or volume cleanup are logged and broadcast as warnings
 * without aborting the reinstall.
 *
 * @param serviceName - Name of the service to reinstall.
 * @returns A result object; `success: true` means recreation was started, not finished.
 *          Unexpected errors trigger failed-installation cleanup and `success: false`.
 */
async forceReinstall(serviceName: string): Promise<{ success: boolean; message: string }> {
  try {
    const service = await Service.query().where('service_name', serviceName).first()
    if (!service) {
      return {
        success: false,
        message: `Service ${serviceName} not found`,
      }
    }

    // Refuse to run alongside another operation tracked in memory.
    if (this.activeInstallations.has(serviceName)) {
      return {
        success: false,
        message: `Service ${serviceName} installation is already in progress`,
      }
    }

    // Claim the service so concurrent operations are rejected.
    this.activeInstallations.add(serviceName)
    service.installation_status = 'installing'
    await service.save()

    this._broadcast(
      serviceName,
      'reinstall-starting',
      `Starting force reinstall for ${serviceName}...`
    )

    // Steps 1 and 2: stop (when running) and remove the existing container, if any.
    try {
      const allContainers = await this.docker.listContainers({ all: true })
      const existing = allContainers.find((info) => info.Names.includes(`/${serviceName}`))

      if (!existing) {
        this._broadcast(
          serviceName,
          'no-container',
          `No existing container found, proceeding with installation...`
        )
      } else {
        const handle = this.docker.getContainer(existing.Id)

        if (existing.State === 'running') {
          this._broadcast(serviceName, 'stopping', `Stopping container...`)
          await handle.stop({ t: 10 }).catch((stopError) => {
            // An "already stopped" error is expected and not worth a log entry.
            const alreadyStopped = stopError.message.includes('already stopped')
            if (!alreadyStopped) {
              logger.warn(`Error stopping container: ${stopError.message}`)
            }
          })
        }

        this._broadcast(serviceName, 'removing', `Removing container...`)
        await handle.remove({ force: true }).catch((removeError) => {
          logger.warn(`Error removing container: ${removeError.message}`)
        })
      }
    } catch (cleanupError) {
      logger.warn(`Error during container cleanup: ${cleanupError.message}`)
      this._broadcast(
        serviceName,
        'cleanup-warning',
        `Warning during cleanup: ${cleanupError.message}`
      )
    }

    // Step 3: delete volumes that belong to the service.
    try {
      this._broadcast(serviceName, 'clearing-volumes', `Checking for volumes to clear...`)
      const volumeListing = await this.docker.listVolumes()
      const matchingVolumes = (volumeListing.Volumes ?? []).filter(
        (entry) => entry.Name.includes(serviceName) || entry.Labels?.service === serviceName
      )

      if (matchingVolumes.length === 0) {
        this._broadcast(serviceName, 'no-volumes', `No volumes found to clear`)
      } else {
        for (const entry of matchingVolumes) {
          try {
            await this.docker.getVolume(entry.Name).remove({ force: true })
            this._broadcast(serviceName, 'volume-removed', `Removed volume: ${entry.Name}`)
          } catch (volumeError) {
            // A volume that cannot be removed does not block the others.
            logger.warn(`Failed to remove volume ${entry.Name}: ${volumeError.message}`)
          }
        }
      }
    } catch (volumesError) {
      logger.warn(`Error during volume cleanup: ${volumesError.message}`)
      this._broadcast(
        serviceName,
        'volume-cleanup-warning',
        `Warning during volume cleanup: ${volumesError.message}`
      )
    }

    // Step 4: record that the service is no longer installed (still mid-install).
    service.installed = false
    service.installation_status = 'installing'
    await service.save()

    // Step 5: recreate the container in the background.
    this._broadcast(serviceName, 'recreating', `Recreating container...`)
    const parsedConfig = this._parseContainerConfig(service.container_config)

    this._createContainer(service, parsedConfig).catch(async (installError) => {
      logger.error(`Reinstallation failed for ${serviceName}: ${installError.message}`)
      await this._cleanupFailedInstallation(serviceName)
    })

    return {
      success: true,
      message: `Service ${serviceName} force reinstall initiated successfully. You can receive updates via server-sent events.`,
    }
  } catch (error) {
    logger.error(`Force reinstall failed for ${serviceName}: ${error.message}`)
    await this._cleanupFailedInstallation(serviceName)
    return {
      success: false,
      message: `Failed to force reinstall service ${serviceName}: ${error.message}`,
    }
  }
}
|
|
|
|
/**
 * Handles the long-running process of creating and starting the Docker container for a service.
 *
 * NOTE: This method should not be called directly. Use `createContainerPreflight` (or
 * `forceReinstall`), which check prerequisites and mark the installation as in progress first.
 *
 * Steps, each announced to clients through `_broadcast`:
 *  1. Install the `depends_on` service first (recursively) when it is not installed yet.
 *  2. Pull the image unless it already exists locally. A failed pull now aborts the install.
 *  3. Run the Kiwix pre-install actions for the Kiwix service.
 *  4. For Ollama, detect the GPU and add NVIDIA `DeviceRequests` when available.
 *  5. Create and start the container, then mark the service installed and idle.
 *  6. For Ollama, disable chat suggestions and start Nomad docs discovery in the background.
 *
 * @param service - The service record to install.
 * @param containerConfig - Parsed container configuration for the service (may be null/undefined).
 * @throws Error when any step fails, after broadcasting an `error` event and running
 *         `_cleanupFailedInstallation` for the service.
 */
async _createContainer(
  service: Service & { dependencies?: Service[] },
  containerConfig: any
): Promise<void> {
  try {
    this._broadcast(service.service_name, 'initializing', '')

    // A service declares at most one dependency, through `depends_on`.
    const dependencies: Service[] = []
    if (service.depends_on) {
      const dependency = await Service.query().where('service_name', service.depends_on).first()
      if (dependency) {
        dependencies.push(dependency)
      }
    }

    // First, check if the service has any dependencies that need to be installed first
    if (dependencies.length > 0) {
      this._broadcast(
        service.service_name,
        'checking-dependencies',
        `Checking dependencies for service ${service.service_name}...`
      )
      for (const dependency of dependencies) {
        if (!dependency.installed) {
          this._broadcast(
            service.service_name,
            'dependency-not-installed',
            `Dependency service ${dependency.service_name} is not installed. Installing it first...`
          )
          await this._createContainer(
            dependency,
            this._parseContainerConfig(dependency.container_config)
          )
        } else {
          this._broadcast(
            service.service_name,
            'dependency-installed',
            `Dependency service ${dependency.service_name} is already installed.`
          )
        }
      }
    }

    const imageExists = await this._checkImageExists(service.container_image)
    if (imageExists) {
      this._broadcast(
        service.service_name,
        'image-exists',
        `Docker image ${service.container_image} already exists locally. Skipping pull...`
      )
    } else {
      // Start pulling the Docker image and wait for it to complete
      const pullStream = await this.docker.pull(service.container_image)
      this._broadcast(
        service.service_name,
        'pulling',
        `Pulling Docker image ${service.container_image}...`
      )
      // followProgress reports failures through the first callback argument. Reject on it
      // so a broken pull stops the install here instead of failing later at create time.
      await new Promise<void>((resolve, reject) => {
        this.docker.modem.followProgress(pullStream, (pullError: Error | null) => {
          if (pullError) {
            reject(pullError)
          } else {
            resolve()
          }
        })
      })
    }

    if (service.service_name === SERVICE_NAMES.KIWIX) {
      await this._runPreinstallActions__KiwixServe()
      this._broadcast(
        service.service_name,
        'preinstall-complete',
        `Pre-install actions for Kiwix Serve completed successfully.`
      )
    }

    // GPU-aware configuration for Ollama
    let finalImage = service.container_image
    let gpuHostConfig = containerConfig?.HostConfig || {}

    if (service.service_name === SERVICE_NAMES.OLLAMA) {
      const gpuResult = await this._detectGPUType()

      if (gpuResult.type === 'nvidia') {
        this._broadcast(
          service.service_name,
          'gpu-config',
          `NVIDIA container runtime detected. Configuring container with GPU support...`
        )

        // Add GPU support for NVIDIA
        gpuHostConfig = {
          ...gpuHostConfig,
          DeviceRequests: [
            {
              Driver: 'nvidia',
              Count: -1, // -1 means all GPUs
              Capabilities: [['gpu']],
            },
          ],
        }
      } else if (gpuResult.type === 'amd') {
        this._broadcast(
          service.service_name,
          'gpu-config',
          `AMD GPU detected. ROCm GPU acceleration is not yet supported in this version — proceeding with CPU-only configuration. GPU support for AMD will be available in a future update.`
        )
        logger.warn('[DockerService] AMD GPU detected but ROCm support is not yet enabled. Using CPU-only configuration.')
        // TODO: Re-enable AMD GPU support once ROCm image and device discovery are validated.
        // When re-enabling:
        // 1. Switch image to 'ollama/ollama:rocm'
        // 2. Restore _discoverAMDDevices() to map /dev/kfd and /dev/dri/* into the container
      } else if (gpuResult.toolkitMissing) {
        this._broadcast(
          service.service_name,
          'gpu-config',
          `NVIDIA GPU detected but NVIDIA Container Toolkit is not installed. Using CPU-only configuration. Install the toolkit and reinstall AI Assistant for GPU acceleration: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html`
        )
      } else {
        this._broadcast(
          service.service_name,
          'gpu-config',
          `No GPU detected. Using CPU-only configuration...`
        )
      }
    }

    this._broadcast(
      service.service_name,
      'creating',
      `Creating Docker container for service ${service.service_name}...`
    )
    const container = await this.docker.createContainer({
      Image: finalImage,
      name: service.service_name,
      ...(containerConfig?.User && { User: containerConfig.User }),
      HostConfig: gpuHostConfig,
      ...(containerConfig?.WorkingDir && { WorkingDir: containerConfig.WorkingDir }),
      ...(containerConfig?.ExposedPorts && { ExposedPorts: containerConfig.ExposedPorts }),
      ...(containerConfig?.Env && { Env: containerConfig.Env }),
      ...(service.container_command ? { Cmd: service.container_command.split(' ') } : {}),
      // Ensure container is attached to the Nomad docker network in production
      ...(process.env.NODE_ENV === 'production' && {
        NetworkingConfig: {
          EndpointsConfig: {
            [DockerService.NOMAD_NETWORK]: {},
          },
        },
      }),
    })

    this._broadcast(
      service.service_name,
      'starting',
      `Starting Docker container for service ${service.service_name}...`
    )
    await container.start()

    this._broadcast(
      service.service_name,
      'finalizing',
      `Finalizing installation of service ${service.service_name}...`
    )
    service.installed = true
    service.installation_status = 'idle'
    await service.save()

    // Remove from active installs tracking
    this.activeInstallations.delete(service.service_name)

    // If Ollama was just installed, trigger Nomad docs discovery and embedding
    if (service.service_name === SERVICE_NAMES.OLLAMA) {
      logger.info('[DockerService] Ollama installation complete. Default behavior is to not enable chat suggestions.')
      await KVStore.setValue('chat.suggestionsEnabled', false)

      logger.info('[DockerService] Ollama installation complete. Triggering Nomad docs discovery...')

      // Need to use dynamic imports here to avoid circular dependency
      const ollamaService = new (await import('./ollama_service.js')).OllamaService()
      const ragService = new (await import('./rag_service.js')).RagService(this, ollamaService)

      // Fire-and-forget: discovery failures are logged but do not fail the installation.
      ragService.discoverNomadDocs().catch((discoveryError) => {
        const detail =
          discoveryError instanceof Error ? discoveryError.message : String(discoveryError)
        logger.error(`[DockerService] Failed to discover Nomad docs: ${detail}`)
      })
    }

    this._broadcast(
      service.service_name,
      'completed',
      `Service ${service.service_name} installation completed successfully.`
    )
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    this._broadcast(
      service.service_name,
      'error',
      `Error installing service ${service.service_name}: ${reason}`
    )
    // Mark install as failed and cleanup
    await this._cleanupFailedInstallation(service.service_name)
    throw new Error(`Failed to install service ${service.service_name}: ${reason}`)
  }
}
|
|
|
|
/**
 * Reports whether any container (running or not) carries the name `/<serviceName>`.
 *
 * @param serviceName - Service whose container is being looked up.
 * @returns true when a matching container exists; false when none does or when the
 *          Docker query fails (the error is logged).
 */
async _checkIfServiceContainerExists(serviceName: string): Promise<boolean> {
  try {
    const allContainers = await this.docker.listContainers({ all: true })
    for (const info of allContainers) {
      if (info.Names.includes(`/${serviceName}`)) {
        return true
      }
    }
    return false
  } catch (error) {
    logger.error(`Error checking if service container exists: ${error.message}`)
    return false
  }
}
|
|
|
|
/**
 * Force-removes the container named `/<serviceName>`, whatever state it is in.
 *
 * @param serviceName - Service whose container should be removed.
 * @returns `success: false` when no such container exists or removal fails (failures are
 *          logged); `success: true` once the container has been removed.
 */
async _removeServiceContainer(
  serviceName: string
): Promise<{ success: boolean; message: string }> {
  try {
    const allContainers = await this.docker.listContainers({ all: true })
    const match = allContainers.find((info) => info.Names.includes(`/${serviceName}`))

    if (match) {
      await this.docker.getContainer(match.Id).remove({ force: true })
      return { success: true, message: `Service ${serviceName} container removed successfully` }
    }

    return { success: false, message: `Container for service ${serviceName} not found` }
  } catch (error) {
    logger.error(`Error removing service container: ${error.message}`)
    return {
      success: false,
      message: `Failed to remove service ${serviceName} container: ${error.message}`,
    }
  }
}
|
|
|
|
/**
 * Pre-install step for Kiwix Serve.
 *
 * At least one .zim file must be available before the kiwix container can start, so this
 * downloads the lightweight mini Wikipedia Top 100 ZIM file into the ZIM storage directory
 * (resolved against the current working directory) and broadcasts progress along the way.
 *
 * @throws Error when the download fails, after broadcasting a `preinstall-error` event.
 */
private async _runPreinstallActions__KiwixServe(): Promise<void> {
  const filename = 'wikipedia_en_100_mini_2025-06.zim'
  const filepath = join(process.cwd(), ZIM_STORAGE_PATH, filename)
  const WIKIPEDIA_ZIM_URL =
    'https://github.com/Crosstalk-Solutions/project-nomad/raw/refs/heads/main/install/wikipedia_en_100_mini_2025-06.zim'
  // Content types accepted for the downloaded file.
  const allowedMimeTypes = [
    'application/x-zim',
    'application/x-openzim',
    'application/octet-stream',
  ]

  logger.info(`[DockerService] Kiwix Serve pre-install: Downloading ZIM file to ${filepath}`)

  const announce = (status: string, message: string) =>
    this._broadcast(SERVICE_NAMES.KIWIX, status, message)

  announce('preinstall', `Running pre-install actions for Kiwix Serve...`)
  announce(
    'preinstall',
    `Downloading Wikipedia ZIM file from ${WIKIPEDIA_ZIM_URL}. This may take some time...`
  )

  try {
    await doResumableDownloadWithRetry({
      url: WIKIPEDIA_ZIM_URL,
      filepath,
      timeout: 60000,
      allowedMimeTypes,
    })

    announce('preinstall', `Downloaded Wikipedia ZIM file to ${filepath}`)
  } catch (error) {
    announce('preinstall-error', `Failed to download Wikipedia ZIM file: ${error.message}`)
    throw new Error(`Pre-install action failed: ${error.message}`)
  }
}
|
|
|
|
/**
 * Resets state after a failed installation: sets the service's `installation_status` to
 * `error`, releases the in-memory installation lock, and removes any container that was
 * created for the service. Never throws; cleanup failures are logged.
 *
 * @param serviceName - Service whose failed installation should be cleaned up.
 */
private async _cleanupFailedInstallation(serviceName: string): Promise<void> {
  try {
    try {
      const service = await Service.query().where('service_name', serviceName).first()
      if (service) {
        service.installation_status = 'error'
        await service.save()
      }
    } finally {
      // Release the lock even when the database update fails; otherwise the service
      // would be reported as "already in progress" until the process restarts.
      this.activeInstallations.delete(serviceName)
    }

    // Ensure any partially created container is removed
    await this._removeServiceContainer(serviceName)

    logger.info(`[DockerService] Cleaned up failed installation for ${serviceName}`)
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    logger.error(
      `[DockerService] Failed to cleanup installation for ${serviceName}: ${reason}`
    )
  }
}
|
|
|
|
/**
 * Detect GPU type and NVIDIA Container Toolkit availability.
 *
 * Detection order:
 *  1. Docker runtimes via docker.info() (works from inside containers): an `nvidia` runtime
 *     yields `{ type: 'nvidia' }`.
 *  2. lspci for NVIDIA hardware (host-based installs). Hardware without the runtime yields
 *     `{ type: 'none', toolkitMissing: true }` — unless the runtime query itself failed and
 *     a previous run persisted `nvidia`, in which case the persisted value is trusted.
 *  3. lspci for AMD display controllers, yielding `{ type: 'amd' }`.
 *  4. The value previously persisted under the KV key `gpu.type`, so a transient detection
 *     failure does not silently downgrade to CPU-only.
 *
 * Successful live detections are persisted through `_persistGPUType`. This method never
 * throws; unexpected errors are logged and reported as `{ type: 'none' }`.
 */
private async _detectGPUType(): Promise<{ type: 'nvidia' | 'amd' | 'none'; toolkitMissing?: boolean }> {
  try {
    // Reads the GPU type persisted by an earlier successful detection, if any.
    const readSavedType = async (): Promise<'nvidia' | 'amd' | null> => {
      try {
        const saved = await KVStore.getValue('gpu.type')
        return saved === 'nvidia' || saved === 'amd' ? (saved as 'nvidia' | 'amd') : null
      } catch {
        // KV store not available
        return null
      }
    }

    // Primary: Check Docker daemon for nvidia runtime (works from inside containers)
    let runtimeQueryFailed = false
    try {
      const dockerInfo = await this.docker.info()
      const runtimes = dockerInfo.Runtimes || {}
      if ('nvidia' in runtimes) {
        logger.info('[DockerService] NVIDIA container runtime detected via Docker API')
        await this._persistGPUType('nvidia')
        return { type: 'nvidia' }
      }
    } catch (error) {
      runtimeQueryFailed = true
      logger.warn(`[DockerService] Could not query Docker info for GPU runtimes: ${error.message}`)
    }

    // Fallback: lspci for host-based installs (not available inside Docker)
    const execAsync = promisify(exec)

    // Check for NVIDIA GPU via lspci
    let nvidiaHardwareFound = false
    try {
      const { stdout: nvidiaCheck } = await execAsync(
        'lspci 2>/dev/null | grep -i nvidia || true'
      )
      nvidiaHardwareFound = nvidiaCheck.trim() !== ''
    } catch (error) {
      // lspci not available (likely inside Docker container), continue
    }

    if (nvidiaHardwareFound) {
      // When the runtime query failed we do not actually know the toolkit is missing.
      // Trust a previously persisted 'nvidia' rather than downgrading to CPU-only.
      if (runtimeQueryFailed && (await readSavedType()) === 'nvidia') {
        logger.info(
          `[DockerService] Docker runtime query failed, but NVIDIA hardware is present and KV store has 'nvidia' from previous detection. Using saved value.`
        )
        return { type: 'nvidia' }
      }

      // GPU hardware found but no nvidia runtime — toolkit not installed
      logger.warn('[DockerService] NVIDIA GPU detected via lspci but NVIDIA Container Toolkit is not installed')
      return { type: 'none', toolkitMissing: true }
    }

    // Check for AMD GPU via lspci — restrict to display controller classes to avoid
    // false positives from AMD CPU host bridges, PCI bridges, and chipset devices.
    try {
      const { stdout: amdCheck } = await execAsync(
        'lspci 2>/dev/null | grep -iE "VGA|3D controller|Display" | grep -iE "amd|radeon" || true'
      )
      if (amdCheck.trim()) {
        logger.info('[DockerService] AMD GPU detected via lspci')
        await this._persistGPUType('amd')
        return { type: 'amd' }
      }
    } catch (error) {
      // lspci not available, continue
    }

    // Last resort: check if we previously detected a GPU and it's likely still present.
    // This handles cases where live detection fails transiently (e.g., Docker daemon
    // hiccup, runtime temporarily unavailable) but the hardware hasn't changed.
    const savedType = await readSavedType()
    if (savedType) {
      logger.info(`[DockerService] No GPU detected live, but KV store has '${savedType}' from previous detection. Using saved value.`)
      return { type: savedType }
    }

    logger.info('[DockerService] No GPU detected')
    return { type: 'none' }
  } catch (error) {
    logger.warn(`[DockerService] Error detecting GPU type: ${error.message}`)
    return { type: 'none' }
  }
}
|
|
|
|
/**
 * Stores the detected GPU type under the KV key `gpu.type`.
 * Best-effort: a storage failure is logged as a warning and otherwise ignored.
 *
 * @param type - GPU vendor that was detected.
 */
private async _persistGPUType(type: 'nvidia' | 'amd'): Promise<void> {
  const kvKey = 'gpu.type'
  try {
    await KVStore.setValue(kvKey, type)
    logger.info(`[DockerService] Persisted GPU type '${type}' to KV store`)
  } catch (persistError) {
    logger.warn(`[DockerService] Failed to persist GPU type: ${persistError.message}`)
  }
}
|
|
|
|
/**
|
|
* Discover AMD GPU DRI devices dynamically.
|
|
* Returns an array of device configurations for Docker.
|
|
*/
|
|
// private async _discoverAMDDevices(): Promise<
|
|
// Array<{ PathOnHost: string; PathInContainer: string; CgroupPermissions: string }>
|
|
// > {
|
|
// try {
|
|
// const devices: Array<{
|
|
// PathOnHost: string
|
|
// PathInContainer: string
|
|
// CgroupPermissions: string
|
|
// }> = []
|
|
|
|
// // Always add /dev/kfd (Kernel Fusion Driver)
|
|
// devices.push({
|
|
// PathOnHost: '/dev/kfd',
|
|
// PathInContainer: '/dev/kfd',
|
|
// CgroupPermissions: 'rwm',
|
|
// })
|
|
|
|
// // Discover DRI devices in /dev/dri/
|
|
// try {
|
|
// const driDevices = await readdir('/dev/dri')
|
|
// for (const device of driDevices) {
|
|
// const devicePath = `/dev/dri/${device}`
|
|
// devices.push({
|
|
// PathOnHost: devicePath,
|
|
// PathInContainer: devicePath,
|
|
// CgroupPermissions: 'rwm',
|
|
// })
|
|
// }
|
|
// logger.info(
|
|
// `[DockerService] Discovered ${driDevices.length} DRI devices: ${driDevices.join(', ')}`
|
|
// )
|
|
// } catch (error) {
|
|
// logger.warn(`[DockerService] Could not read /dev/dri directory: ${error.message}`)
|
|
// // Fallback to common device names if directory read fails
|
|
// const fallbackDevices = ['card0', 'renderD128']
|
|
// for (const device of fallbackDevices) {
|
|
// devices.push({
|
|
// PathOnHost: `/dev/dri/${device}`,
|
|
// PathInContainer: `/dev/dri/${device}`,
|
|
// CgroupPermissions: 'rwm',
|
|
// })
|
|
// }
|
|
// logger.info(`[DockerService] Using fallback DRI devices: ${fallbackDevices.join(', ')}`)
|
|
// }
|
|
|
|
// return devices
|
|
// } catch (error) {
|
|
// logger.error(`[DockerService] Error discovering AMD devices: ${error.message}`)
|
|
// return []
|
|
// }
|
|
// }
|
|
|
|
/**
|
|
* Update a service container to a new image version while preserving volumes and data.
|
|
* Includes automatic rollback if the new container fails health checks.
|
|
*/
|
|
async updateContainer(
|
|
serviceName: string,
|
|
targetVersion: string
|
|
): Promise<{ success: boolean; message: string }> {
|
|
try {
|
|
const service = await Service.query().where('service_name', serviceName).first()
|
|
if (!service) {
|
|
return { success: false, message: `Service ${serviceName} not found` }
|
|
}
|
|
if (!service.installed) {
|
|
return { success: false, message: `Service ${serviceName} is not installed` }
|
|
}
|
|
if (this.activeInstallations.has(serviceName)) {
|
|
return { success: false, message: `Service ${serviceName} already has an operation in progress` }
|
|
}
|
|
|
|
this.activeInstallations.add(serviceName)
|
|
|
|
// Compute new image string
|
|
const currentImage = service.container_image
|
|
const imageBase = currentImage.includes(':')
|
|
? currentImage.substring(0, currentImage.lastIndexOf(':'))
|
|
: currentImage
|
|
const newImage = `${imageBase}:${targetVersion}`
|
|
|
|
// Step 1: Pull new image
|
|
this._broadcast(serviceName, 'update-pulling', `Pulling image ${newImage}...`)
|
|
const pullStream = await this.docker.pull(newImage)
|
|
await new Promise((res) => this.docker.modem.followProgress(pullStream, res))
|
|
|
|
// Step 2: Find and stop existing container
|
|
this._broadcast(serviceName, 'update-stopping', `Stopping current container...`)
|
|
const containers = await this.docker.listContainers({ all: true })
|
|
const existingContainer = containers.find((c) => c.Names.includes(`/${serviceName}`))
|
|
|
|
if (!existingContainer) {
|
|
this.activeInstallations.delete(serviceName)
|
|
return { success: false, message: `Container for ${serviceName} not found` }
|
|
}
|
|
|
|
const oldContainer = this.docker.getContainer(existingContainer.Id)
|
|
|
|
// Inspect to capture full config before stopping
|
|
const inspectData = await oldContainer.inspect()
|
|
|
|
if (existingContainer.State === 'running') {
|
|
await oldContainer.stop({ t: 15 })
|
|
}
|
|
|
|
// Step 3: Rename old container as safety net
|
|
const oldName = `${serviceName}_old`
|
|
await oldContainer.rename({ name: oldName })
|
|
|
|
// Step 4: Create new container with inspected config + new image
|
|
this._broadcast(serviceName, 'update-creating', `Creating updated container...`)
|
|
|
|
const hostConfig = inspectData.HostConfig || {}
|
|
|
|
// Re-run GPU detection for Ollama so updates always reflect the current GPU environment.
|
|
// This handles cases where the NVIDIA Container Toolkit was installed after the initial
|
|
// Ollama setup, and ensures DeviceRequests are always built fresh rather than relying on
|
|
// round-tripping the Docker inspect format back into the create API.
|
|
let updatedDeviceRequests: any[] | undefined = undefined
|
|
if (serviceName === SERVICE_NAMES.OLLAMA) {
|
|
const gpuResult = await this._detectGPUType()
|
|
|
|
if (gpuResult.type === 'nvidia') {
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-gpu-config',
|
|
`NVIDIA container runtime detected. Configuring updated container with GPU support...`
|
|
)
|
|
updatedDeviceRequests = [
|
|
{
|
|
Driver: 'nvidia',
|
|
Count: -1,
|
|
Capabilities: [['gpu']],
|
|
},
|
|
]
|
|
} else if (gpuResult.type === 'amd') {
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-gpu-config',
|
|
`AMD GPU detected. ROCm GPU acceleration is not yet supported — using CPU-only configuration.`
|
|
)
|
|
} else if (gpuResult.toolkitMissing) {
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-gpu-config',
|
|
`NVIDIA GPU detected but NVIDIA Container Toolkit is not installed. Using CPU-only configuration. Install the toolkit and reinstall AI Assistant for GPU acceleration: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html`
|
|
)
|
|
} else {
|
|
this._broadcast(serviceName, 'update-gpu-config', `No GPU detected. Using CPU-only configuration.`)
|
|
}
|
|
}
|
|
|
|
const newContainerConfig: any = {
|
|
Image: newImage,
|
|
name: serviceName,
|
|
Env: inspectData.Config?.Env || undefined,
|
|
Cmd: inspectData.Config?.Cmd || undefined,
|
|
ExposedPorts: inspectData.Config?.ExposedPorts || undefined,
|
|
WorkingDir: inspectData.Config?.WorkingDir || undefined,
|
|
User: inspectData.Config?.User || undefined,
|
|
HostConfig: {
|
|
Binds: hostConfig.Binds || undefined,
|
|
PortBindings: hostConfig.PortBindings || undefined,
|
|
RestartPolicy: hostConfig.RestartPolicy || undefined,
|
|
DeviceRequests: serviceName === SERVICE_NAMES.OLLAMA ? updatedDeviceRequests : (hostConfig.DeviceRequests || undefined),
|
|
Devices: hostConfig.Devices || undefined,
|
|
},
|
|
NetworkingConfig: inspectData.NetworkSettings?.Networks
|
|
? {
|
|
EndpointsConfig: Object.fromEntries(
|
|
Object.keys(inspectData.NetworkSettings.Networks).map((net) => [net, {}])
|
|
),
|
|
}
|
|
: undefined,
|
|
}
|
|
|
|
// Remove undefined values from HostConfig
|
|
Object.keys(newContainerConfig.HostConfig).forEach((key) => {
|
|
if (newContainerConfig.HostConfig[key] === undefined) {
|
|
delete newContainerConfig.HostConfig[key]
|
|
}
|
|
})
|
|
|
|
let newContainer: any
|
|
try {
|
|
newContainer = await this.docker.createContainer(newContainerConfig)
|
|
} catch (createError) {
|
|
// Rollback: rename old container back
|
|
this._broadcast(serviceName, 'update-rollback', `Failed to create new container: ${createError.message}. Rolling back...`)
|
|
const rollbackContainer = this.docker.getContainer((await this.docker.listContainers({ all: true })).find((c) => c.Names.includes(`/${oldName}`))!.Id)
|
|
await rollbackContainer.rename({ name: serviceName })
|
|
await rollbackContainer.start()
|
|
this.activeInstallations.delete(serviceName)
|
|
return { success: false, message: `Failed to create updated container: ${createError.message}` }
|
|
}
|
|
|
|
// Step 5: Start new container
|
|
this._broadcast(serviceName, 'update-starting', `Starting updated container...`)
|
|
await newContainer.start()
|
|
|
|
// Step 6: Health check — verify container stays running for 5 seconds
|
|
await new Promise((resolve) => setTimeout(resolve, 5000))
|
|
const newContainerInfo = await newContainer.inspect()
|
|
|
|
if (newContainerInfo.State?.Running) {
|
|
// Healthy — clean up old container
|
|
try {
|
|
const oldContainerRef = this.docker.getContainer(
|
|
(await this.docker.listContainers({ all: true })).find((c) =>
|
|
c.Names.includes(`/${oldName}`)
|
|
)?.Id || ''
|
|
)
|
|
await oldContainerRef.remove({ force: true })
|
|
} catch {
|
|
// Old container may already be gone
|
|
}
|
|
|
|
// Update DB
|
|
service.container_image = newImage
|
|
service.available_update_version = null
|
|
await service.save()
|
|
|
|
this.activeInstallations.delete(serviceName)
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-complete',
|
|
`Successfully updated ${serviceName} to ${targetVersion}`
|
|
)
|
|
return { success: true, message: `Service ${serviceName} updated to ${targetVersion}` }
|
|
} else {
|
|
// Unhealthy — rollback
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-rollback',
|
|
`New container failed health check. Rolling back to previous version...`
|
|
)
|
|
|
|
try {
|
|
await newContainer.stop({ t: 5 }).catch(() => {})
|
|
await newContainer.remove({ force: true })
|
|
} catch {
|
|
// Best effort cleanup
|
|
}
|
|
|
|
// Restore old container
|
|
const oldContainers = await this.docker.listContainers({ all: true })
|
|
const oldRef = oldContainers.find((c) => c.Names.includes(`/${oldName}`))
|
|
if (oldRef) {
|
|
const rollbackContainer = this.docker.getContainer(oldRef.Id)
|
|
await rollbackContainer.rename({ name: serviceName })
|
|
await rollbackContainer.start()
|
|
}
|
|
|
|
this.activeInstallations.delete(serviceName)
|
|
return {
|
|
success: false,
|
|
message: `Update failed: new container did not stay running. Rolled back to previous version.`,
|
|
}
|
|
}
|
|
} catch (error) {
|
|
this.activeInstallations.delete(serviceName)
|
|
this._broadcast(
|
|
serviceName,
|
|
'update-rollback',
|
|
`Update failed: ${error.message}`
|
|
)
|
|
logger.error(`[DockerService] Update failed for ${serviceName}: ${error.message}`)
|
|
return { success: false, message: `Update failed: ${error.message}` }
|
|
}
|
|
}
|
|
|
|
/**
 * Emit a status event for a service and mirror it to the application log.
 *
 * The event is sent through `transmit` on `BROADCAST_CHANNELS.SERVICE_INSTALLATION`
 * and carries the service name, an ISO timestamp taken at call time, the status
 * and the message. The same information is then written to the logger at info level.
 *
 * @param service - Name of the service the event refers to
 * @param status - Short status identifier (for example 'update-pulling')
 * @param message - Human-readable description of what is happening
 */
private _broadcast(service: string, status: string, message: string) {
  const payload = {
    service_name: service,
    timestamp: new Date().toISOString(),
    status,
    message,
  }
  transmit.broadcast(BROADCAST_CHANNELS.SERVICE_INSTALLATION, payload)

  const logLine = `[DockerService] [${service}] ${status}: ${message}`
  logger.info(logLine)
}
|
|
|
|
/**
 * Turn a stored container configuration into a parsed value.
 *
 * Falsy input yields an empty object. Object input (the DB may return the
 * configuration already decoded instead of as a string) is serialized first,
 * so every input goes through the same `JSON.parse` call.
 *
 * @param containerConfig - Raw configuration, either a JSON string or an object
 * @returns The parsed configuration, or `{}` when nothing was provided
 * @throws Error prefixed with "Invalid container configuration" when
 *   serialization or parsing fails (the failure is also logged)
 */
private _parseContainerConfig(containerConfig: any): any {
  // Nothing stored: callers get an empty configuration rather than an error.
  if (!containerConfig) return {}

  try {
    const serialized =
      typeof containerConfig === 'object' ? JSON.stringify(containerConfig) : containerConfig
    return JSON.parse(serialized)
  } catch (error) {
    logger.error(`Failed to parse container configuration: ${error.message}`)
    throw new Error(`Invalid container configuration: ${error.message}`)
  }
}
|
|
|
|
/**
 * Report whether an image with the given name and tag is present locally.
 *
 * The local image list is fetched from Docker and each entry's `RepoTags`
 * is searched for an exact match of `imageName`.
 *
 * @param imageName - The name and tag of the image (e.g., "nginx:latest")
 * @returns - True if a local image carries that exact repo tag; false if none
 *   does or if listing the images fails
 */
private async _checkImageExists(imageName: string): Promise<boolean> {
  try {
    const localImages = await this.docker.listImages()

    for (const image of localImages) {
      // RepoTags can be absent on an entry; such entries never match.
      if (image.RepoTags && image.RepoTags.includes(imageName)) {
        return true
      }
    }
    return false
  } catch (error) {
    logger.warn(`Error checking if image exists: ${error.message}`)
    // A failed lookup is treated the same as "image not present".
    return false
  }
}
|
|
}
|