project-nomad/admin/app/services/ollama_service.ts
Henry Estela f98664921a
feat(ai-chat): Add support for OpenAI API
Exisiting Ollama API support still functions as before. OpenAI vs
Ollama API mostly have the same features, however model file size is not
support with OpenAI's API so when a user chooses one of those then the
models will just show up as the model name without the size.

`npm install openai` triggered some updates in admin/package-lock.json
such as adding many instances of "dev: true".

This further enhances the user's ability to run the LLM on a different
host.
2026-03-25 17:18:05 -07:00

573 lines
19 KiB
TypeScript

import { inject } from '@adonisjs/core'
import OpenAI from 'openai'
import type { ChatCompletionChunk, ChatCompletionMessageParam } from 'openai/resources/chat/completions.js'
import type { Stream } from 'openai/streaming.js'
import { NomadOllamaModel } from '../../types/ollama.js'
import { FALLBACK_RECOMMENDED_OLLAMA_MODELS } from '../../constants/ollama.js'
import fs from 'node:fs/promises'
import path from 'node:path'
import logger from '@adonisjs/core/services/logger'
import axios from 'axios'
import { DownloadModelJob } from '#jobs/download_model_job'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import transmit from '@adonisjs/transmit/services/main'
import Fuse, { IFuseOptions } from 'fuse.js'
import { BROADCAST_CHANNELS } from '../../constants/broadcast.js'
import env from '#start/env'
import { NOMAD_API_DEFAULT_BASE_URL } from '../../constants/misc.js'
import KVStore from '#models/kv_store'
const NOMAD_MODELS_API_PATH = '/api/v1/ollama/models'
const MODELS_CACHE_FILE = path.join(process.cwd(), 'storage', 'ollama-models-cache.json')
const CACHE_MAX_AGE_MS = 24 * 60 * 60 * 1000 // 24 hours
export type NomadInstalledModel = {
name: string
size: number
digest?: string
details?: Record<string, any>
}
export type NomadChatResponse = {
message: { content: string; thinking?: string }
done: boolean
model: string
}
export type NomadChatStreamChunk = {
message: { content: string; thinking?: string }
done: boolean
}
type ChatInput = {
model: string
messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
think?: boolean | 'medium'
stream?: boolean
}
@inject()
export class OllamaService {
private openai: OpenAI | null = null
private baseUrl: string | null = null
private initPromise: Promise<void> | null = null
constructor() {}
private async _initialize() {
if (!this.initPromise) {
this.initPromise = (async () => {
// Check KVStore for a custom base URL (remote Ollama, LM Studio, llama.cpp, etc.)
const customUrl = (await KVStore.getValue('ai.remoteOllamaUrl')) as string | null
if (customUrl && customUrl.trim()) {
this.baseUrl = customUrl.trim().replace(/\/$/, '')
} else {
// Fall back to the local Ollama container managed by Docker
const dockerService = new (await import('./docker_service.js')).DockerService()
const ollamaUrl = await dockerService.getServiceURL(SERVICE_NAMES.OLLAMA)
if (!ollamaUrl) {
throw new Error('Ollama service is not installed or running.')
}
this.baseUrl = ollamaUrl.trim().replace(/\/$/, '')
}
this.openai = new OpenAI({
apiKey: 'nomad', // Required by SDK; not validated by Ollama/LM Studio/llama.cpp
baseURL: `${this.baseUrl}/v1`,
})
})()
}
return this.initPromise
}
private async _ensureDependencies() {
if (!this.openai) {
await this._initialize()
}
}
/**
* Downloads a model from Ollama with progress tracking. Only works with Ollama backends.
* Use dispatchModelDownload() for background job processing where possible.
*/
async downloadModel(
model: string,
progressCallback?: (percent: number) => void
): Promise<{ success: boolean; message: string; retryable?: boolean }> {
await this._ensureDependencies()
if (!this.baseUrl) {
return { success: false, message: 'AI service is not initialized.' }
}
try {
// See if model is already installed
const installedModels = await this.getModels()
if (installedModels && installedModels.some((m) => m.name === model)) {
logger.info(`[OllamaService] Model "${model}" is already installed.`)
return { success: true, message: 'Model is already installed.' }
}
// Stream pull via Ollama native API
const pullResponse = await axios.post(
`${this.baseUrl}/api/pull`,
{ model, stream: true },
{ responseType: 'stream', timeout: 0 }
)
await new Promise<void>((resolve, reject) => {
let buffer = ''
pullResponse.data.on('data', (chunk: Buffer) => {
buffer += chunk.toString()
const lines = buffer.split('\n')
buffer = lines.pop() || ''
for (const line of lines) {
if (!line.trim()) continue
try {
const parsed = JSON.parse(line)
if (parsed.completed && parsed.total) {
const percent = parseFloat(((parsed.completed / parsed.total) * 100).toFixed(2))
this.broadcastDownloadProgress(model, percent)
if (progressCallback) progressCallback(percent)
}
} catch {
// ignore parse errors on partial lines
}
}
})
pullResponse.data.on('end', resolve)
pullResponse.data.on('error', reject)
})
logger.info(`[OllamaService] Model "${model}" downloaded successfully.`)
return { success: true, message: 'Model downloaded successfully.' }
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error)
logger.error(
`[OllamaService] Failed to download model "${model}": ${errorMessage}`
)
// Check for version mismatch (Ollama 412 response)
const isVersionMismatch = errorMessage.includes('newer version of Ollama')
const userMessage = isVersionMismatch
? 'This model requires a newer version of Ollama. Please update AI Assistant from the Apps page.'
: `Failed to download model: ${errorMessage}`
// Broadcast failure to connected clients so UI can show the error
this.broadcastDownloadError(model, userMessage)
return { success: false, message: userMessage, retryable: !isVersionMismatch }
}
}
async dispatchModelDownload(modelName: string): Promise<{ success: boolean; message: string }> {
try {
logger.info(`[OllamaService] Dispatching model download for ${modelName} via job queue`)
await DownloadModelJob.dispatch({
modelName,
})
return {
success: true,
message:
'Model download has been queued successfully. It will start shortly after Ollama and Open WebUI are ready (if not already).',
}
} catch (error) {
logger.error(
`[OllamaService] Failed to dispatch model download for ${modelName}: ${error instanceof Error ? error.message : error}`
)
return {
success: false,
message: 'Failed to queue model download. Please try again.',
}
}
}
public async chat(chatRequest: ChatInput): Promise<NomadChatResponse> {
await this._ensureDependencies()
if (!this.openai) {
throw new Error('AI client is not initialized.')
}
const params: any = {
model: chatRequest.model,
messages: chatRequest.messages as ChatCompletionMessageParam[],
stream: false,
}
if (chatRequest.think) {
params.think = chatRequest.think
}
const response = await this.openai.chat.completions.create(params)
const choice = response.choices[0]
return {
message: {
content: choice.message.content ?? '',
thinking: (choice.message as any).thinking ?? undefined,
},
done: true,
model: response.model,
}
}
public async chatStream(chatRequest: ChatInput): Promise<AsyncIterable<NomadChatStreamChunk>> {
await this._ensureDependencies()
if (!this.openai) {
throw new Error('AI client is not initialized.')
}
const params: any = {
model: chatRequest.model,
messages: chatRequest.messages as ChatCompletionMessageParam[],
stream: true,
}
if (chatRequest.think) {
params.think = chatRequest.think
}
const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>
async function* normalize(): AsyncGenerator<NomadChatStreamChunk> {
for await (const chunk of stream) {
const delta = chunk.choices[0]?.delta
yield {
message: {
content: delta?.content ?? '',
thinking: (delta as any)?.thinking ?? '',
},
done: chunk.choices[0]?.finish_reason !== null && chunk.choices[0]?.finish_reason !== undefined,
}
}
}
return normalize()
}
public async checkModelHasThinking(modelName: string): Promise<boolean> {
await this._ensureDependencies()
if (!this.baseUrl) return false
try {
const response = await axios.post(
`${this.baseUrl}/api/show`,
{ model: modelName },
{ timeout: 5000 }
)
return Array.isArray(response.data?.capabilities) && response.data.capabilities.includes('thinking')
} catch {
// Non-Ollama backends don't expose /api/show — assume no thinking support
return false
}
}
public async deleteModel(modelName: string): Promise<{ success: boolean; message: string }> {
await this._ensureDependencies()
if (!this.baseUrl) {
return { success: false, message: 'AI service is not initialized.' }
}
try {
await axios.delete(`${this.baseUrl}/api/delete`, {
data: { model: modelName },
timeout: 10000,
})
return { success: true, message: `Model "${modelName}" deleted.` }
} catch (error) {
logger.error(
`[OllamaService] Failed to delete model "${modelName}": ${error instanceof Error ? error.message : error}`
)
return { success: false, message: 'Failed to delete model. This may not be an Ollama backend.' }
}
}
/**
* Generate embeddings for the given input strings.
* Tries the Ollama native /api/embed endpoint first, falls back to /v1/embeddings.
*/
public async embed(model: string, input: string[]): Promise<{ embeddings: number[][] }> {
await this._ensureDependencies()
if (!this.baseUrl || !this.openai) {
throw new Error('AI service is not initialized.')
}
try {
// Prefer Ollama native endpoint (supports batch input natively)
const response = await axios.post(
`${this.baseUrl}/api/embed`,
{ model, input },
{ timeout: 60000 }
)
return { embeddings: response.data.embeddings }
} catch {
// Fall back to OpenAI-compatible /v1/embeddings (processes one at a time then batches)
logger.info('[OllamaService] /api/embed unavailable, falling back to /v1/embeddings')
const results = await this.openai.embeddings.create({ model, input })
return { embeddings: results.data.map((e) => e.embedding) }
}
}
public async getModels(includeEmbeddings = false): Promise<NomadInstalledModel[]> {
await this._ensureDependencies()
if (!this.baseUrl) {
throw new Error('AI service is not initialized.')
}
try {
// Prefer the Ollama native endpoint which includes size and metadata
const response = await axios.get(`${this.baseUrl}/api/tags`, { timeout: 5000 })
// LM Studio returns HTTP 200 for unknown endpoints with an incompatible body — validate explicitly
if (!Array.isArray(response.data?.models)) {
throw new Error('Not an Ollama-compatible /api/tags response')
}
const models: NomadInstalledModel[] = response.data.models
if (includeEmbeddings) return models
return models.filter((m) => !m.name.includes('embed'))
} catch {
// Fall back to the OpenAI-compatible /v1/models endpoint (LM Studio, llama.cpp, etc.)
logger.info('[OllamaService] /api/tags unavailable, falling back to /v1/models')
try {
const modelList = await this.openai!.models.list()
const models: NomadInstalledModel[] = modelList.data.map((m) => ({ name: m.id, size: 0 }))
if (includeEmbeddings) return models
return models.filter((m) => !m.name.includes('embed'))
} catch (err) {
logger.error(
`[OllamaService] Failed to list models: ${err instanceof Error ? err.message : err}`
)
return []
}
}
}
async getAvailableModels(
{
sort,
recommendedOnly,
query,
limit,
force,
}: {
sort?: 'pulls' | 'name'
recommendedOnly?: boolean
query: string | null
limit?: number
force?: boolean
} = {
sort: 'pulls',
recommendedOnly: false,
query: null,
limit: 15,
}
): Promise<{ models: NomadOllamaModel[]; hasMore: boolean } | null> {
try {
const models = await this.retrieveAndRefreshModels(sort, force)
if (!models) {
logger.warn(
'[OllamaService] Returning fallback recommended models due to failure in fetching available models'
)
return {
models: FALLBACK_RECOMMENDED_OLLAMA_MODELS,
hasMore: false,
}
}
if (!recommendedOnly) {
const filteredModels = query ? this.fuseSearchModels(models, query) : models
return {
models: filteredModels.slice(0, limit || 15),
hasMore: filteredModels.length > (limit || 15),
}
}
const sortedByPulls = sort === 'pulls' ? models : this.sortModels(models, 'pulls')
const firstThree = sortedByPulls.slice(0, 3)
const recommendedModels = firstThree.map((model) => {
return {
...model,
tags: model.tags && model.tags.length > 0 ? [model.tags[0]] : [],
}
})
if (query) {
const filteredRecommendedModels = this.fuseSearchModels(recommendedModels, query)
return {
models: filteredRecommendedModels,
hasMore: filteredRecommendedModels.length > (limit || 15),
}
}
return {
models: recommendedModels,
hasMore: recommendedModels.length > (limit || 15),
}
} catch (error) {
logger.error(
`[OllamaService] Failed to get available models: ${error instanceof Error ? error.message : error}`
)
return null
}
}
private async retrieveAndRefreshModels(
sort?: 'pulls' | 'name',
force?: boolean
): Promise<NomadOllamaModel[] | null> {
try {
if (!force) {
const cachedModels = await this.readModelsFromCache()
if (cachedModels) {
logger.info('[OllamaService] Using cached available models data')
return this.sortModels(cachedModels, sort)
}
} else {
logger.info('[OllamaService] Force refresh requested, bypassing cache')
}
logger.info('[OllamaService] Fetching fresh available models from API')
const baseUrl = env.get('NOMAD_API_URL') || NOMAD_API_DEFAULT_BASE_URL
const fullUrl = new URL(NOMAD_MODELS_API_PATH, baseUrl).toString()
const response = await axios.get(fullUrl)
if (!response.data || !Array.isArray(response.data.models)) {
logger.warn(
`[OllamaService] Invalid response format when fetching available models: ${JSON.stringify(response.data)}`
)
return null
}
const rawModels = response.data.models as NomadOllamaModel[]
const noCloud = rawModels
.map((model) => ({
...model,
tags: model.tags.filter((tag) => !tag.cloud),
}))
.filter((model) => model.tags.length > 0)
await this.writeModelsToCache(noCloud)
return this.sortModels(noCloud, sort)
} catch (error) {
logger.error(
`[OllamaService] Failed to retrieve models from Nomad API: ${error instanceof Error ? error.message : error}`
)
return null
}
}
private async readModelsFromCache(): Promise<NomadOllamaModel[] | null> {
try {
const stats = await fs.stat(MODELS_CACHE_FILE)
const cacheAge = Date.now() - stats.mtimeMs
if (cacheAge > CACHE_MAX_AGE_MS) {
logger.info('[OllamaService] Cache is stale, will fetch fresh data')
return null
}
const cacheData = await fs.readFile(MODELS_CACHE_FILE, 'utf-8')
const models = JSON.parse(cacheData) as NomadOllamaModel[]
if (!Array.isArray(models)) {
logger.warn('[OllamaService] Invalid cache format, will fetch fresh data')
return null
}
return models
} catch (error) {
if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
logger.warn(
`[OllamaService] Error reading cache: ${error instanceof Error ? error.message : error}`
)
}
return null
}
}
private async writeModelsToCache(models: NomadOllamaModel[]): Promise<void> {
try {
await fs.mkdir(path.dirname(MODELS_CACHE_FILE), { recursive: true })
await fs.writeFile(MODELS_CACHE_FILE, JSON.stringify(models, null, 2), 'utf-8')
logger.info('[OllamaService] Successfully cached available models')
} catch (error) {
logger.warn(
`[OllamaService] Failed to write models cache: ${error instanceof Error ? error.message : error}`
)
}
}
private sortModels(models: NomadOllamaModel[], sort?: 'pulls' | 'name'): NomadOllamaModel[] {
if (sort === 'pulls') {
models.sort((a, b) => {
const parsePulls = (pulls: string) => {
const multiplier = pulls.endsWith('K')
? 1_000
: pulls.endsWith('M')
? 1_000_000
: pulls.endsWith('B')
? 1_000_000_000
: 1
return parseFloat(pulls) * multiplier
}
return parsePulls(b.estimated_pulls) - parsePulls(a.estimated_pulls)
})
} else if (sort === 'name') {
models.sort((a, b) => a.name.localeCompare(b.name))
}
models.forEach((model) => {
if (model.tags && Array.isArray(model.tags)) {
model.tags.sort((a, b) => {
const parseSize = (size: string) => {
const multiplier = size.endsWith('KB')
? 1 / 1_000
: size.endsWith('MB')
? 1 / 1_000_000
: size.endsWith('GB')
? 1
: size.endsWith('TB')
? 1_000
: 0
return parseFloat(size) * multiplier
}
return parseSize(a.size) - parseSize(b.size)
})
}
})
return models
}
private broadcastDownloadError(model: string, error: string) {
transmit.broadcast(BROADCAST_CHANNELS.OLLAMA_MODEL_DOWNLOAD, {
model,
percent: -1,
error,
timestamp: new Date().toISOString(),
})
}
private broadcastDownloadProgress(model: string, percent: number) {
transmit.broadcast(BROADCAST_CHANNELS.OLLAMA_MODEL_DOWNLOAD, {
model,
percent,
timestamp: new Date().toISOString(),
})
logger.info(`[OllamaService] Download progress for model "${model}": ${percent}%`)
}
private fuseSearchModels(models: NomadOllamaModel[], query: string): NomadOllamaModel[] {
const options: IFuseOptions<NomadOllamaModel> = {
ignoreDiacritics: true,
keys: ['name', 'description', 'tags.name'],
threshold: 0.3,
}
const fuse = new Fuse(models, options)
return fuse.search(query).map((result) => result.item)
}
}