project-nomad/admin/app/services/ollama_service.ts
Henry Estela 8b54310746
Improve context window size estimation
Fixes an issue seen with some models in LM Studio resulting in:
"The number of tokens to keep from the initial prompt is greater than the context length (n_keep: 4705>= n_ctx: 4096)"

Fixed char/token estimate, the old value was too optimistic,
causing the cap to allow more text than the budget allowed in actual tokens.
After RAG injection, estimates the system prompt token count.
If it exceeds ~3000 tokens, requests the next standard context size (8192, 16384, 32768, or 65536),
large enough to fit the prompt plus a 2048-token buffer for the conversation and response.

For Ollama, num_ctx is honoured per-request and will load the model with that context
window. For LM Studio, the parameter is silently ignored — but the tighter char
estimate will also reduce how much RAG text gets stuffed in, so it's less likely to
overflow.
2026-03-25 17:18:06 -07:00

602 lines
20 KiB
TypeScript

import { inject } from '@adonisjs/core'
import OpenAI from 'openai'
import type { ChatCompletionChunk, ChatCompletionMessageParam } from 'openai/resources/chat/completions.js'
import type { Stream } from 'openai/streaming.js'
import { NomadOllamaModel } from '../../types/ollama.js'
import { FALLBACK_RECOMMENDED_OLLAMA_MODELS } from '../../constants/ollama.js'
import fs from 'node:fs/promises'
import path from 'node:path'
import logger from '@adonisjs/core/services/logger'
import axios from 'axios'
import { DownloadModelJob } from '#jobs/download_model_job'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import transmit from '@adonisjs/transmit/services/main'
import Fuse, { IFuseOptions } from 'fuse.js'
import { BROADCAST_CHANNELS } from '../../constants/broadcast.js'
import env from '#start/env'
import { NOMAD_API_DEFAULT_BASE_URL } from '../../constants/misc.js'
import KVStore from '#models/kv_store'
// Path on the Nomad API that serves the curated catalogue of available Ollama models.
const NOMAD_MODELS_API_PATH = '/api/v1/ollama/models'
// On-disk cache for the available-models catalogue, refreshed when older than CACHE_MAX_AGE_MS.
const MODELS_CACHE_FILE = path.join(process.cwd(), 'storage', 'ollama-models-cache.json')
const CACHE_MAX_AGE_MS = 24 * 60 * 60 * 1000 // 24 hours

/** A model installed on the connected AI backend (Ollama, LM Studio, llama.cpp, ...). */
export type NomadInstalledModel = {
  name: string
  // Size as reported by Ollama's /api/tags; set to 0 when listed via the /v1/models fallback.
  size: number
  digest?: string
  details?: Record<string, any>
}

/** Normalized non-streaming chat result returned by OllamaService.chat(). */
export type NomadChatResponse = {
  message: { content: string; thinking?: string }
  done: boolean
  model: string
}

/** Normalized streaming chunk yielded by OllamaService.chatStream(). */
export type NomadChatStreamChunk = {
  message: { content: string; thinking?: string }
  done: boolean
}

/** Input accepted by chat()/chatStream(); numCtx maps to the Ollama-specific num_ctx option. */
type ChatInput = {
  model: string
  messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
  think?: boolean | 'medium'
  stream?: boolean
  numCtx?: number
}
@inject()
export class OllamaService {
  // OpenAI-compatible client pointed at `${baseUrl}/v1`; null until _initialize() completes.
  private openai: OpenAI | null = null
  // Root URL of the AI backend with any trailing slash stripped; null until initialized.
  private baseUrl: string | null = null
  // Memoized init promise so concurrent callers share one initialization attempt.
  // NOTE(review): a rejected promise stays cached, so a failed init is never retried — confirm intended.
  private initPromise: Promise<void> | null = null
  // Set by getModels(): true when /api/tags answered (native Ollama), false after falling
  // back to /v1/models (LM Studio, llama.cpp, ...), null while still unknown.
  private isOllamaNative: boolean | null = null

  constructor() {}

  /**
   * Resolves the backend base URL (custom KVStore value, or the Docker-managed Ollama
   * container as fallback) and constructs the OpenAI-compatible client.
   * Idempotent and safe under concurrency: all callers await the same promise.
   */
  private async _initialize() {
    if (!this.initPromise) {
      this.initPromise = (async () => {
        // Check KVStore for a custom base URL (remote Ollama, LM Studio, llama.cpp, etc.)
        const customUrl = (await KVStore.getValue('ai.remoteOllamaUrl')) as string | null
        if (customUrl && customUrl.trim()) {
          this.baseUrl = customUrl.trim().replace(/\/$/, '')
        } else {
          // Fall back to the local Ollama container managed by Docker
          const dockerService = new (await import('./docker_service.js')).DockerService()
          const ollamaUrl = await dockerService.getServiceURL(SERVICE_NAMES.OLLAMA)
          if (!ollamaUrl) {
            throw new Error('Ollama service is not installed or running.')
          }
          this.baseUrl = ollamaUrl.trim().replace(/\/$/, '')
        }
        this.openai = new OpenAI({
          apiKey: 'nomad', // Required by SDK; not validated by Ollama/LM Studio/llama.cpp
          baseURL: `${this.baseUrl}/v1`,
        })
      })()
    }
    return this.initPromise
  }

  /** Lazily initializes the client/base URL on first use. */
  private async _ensureDependencies() {
    if (!this.openai) {
      await this._initialize()
    }
  }

  /**
   * Downloads a model from Ollama with progress tracking. Only works with Ollama backends.
   * Use dispatchModelDownload() for background job processing where possible.
   *
   * @param model            Model name/tag to pull (e.g. "llama3:8b").
   * @param progressCallback Optional per-chunk progress hook (0–100, two decimals).
   * @returns success flag, user-facing message, and whether a retry may help.
   */
  async downloadModel(
    model: string,
    progressCallback?: (percent: number) => void
  ): Promise<{ success: boolean; message: string; retryable?: boolean }> {
    await this._ensureDependencies()
    if (!this.baseUrl) {
      return { success: false, message: 'AI service is not initialized.' }
    }
    try {
      // See if model is already installed
      const installedModels = await this.getModels()
      if (installedModels && installedModels.some((m) => m.name === model)) {
        logger.info(`[OllamaService] Model "${model}" is already installed.`)
        return { success: true, message: 'Model is already installed.' }
      }
      // Model pulling is an Ollama-only operation. Non-Ollama backends (LM Studio, llama.cpp, etc.)
      // return HTTP 200 for unknown endpoints, so the pull would appear to succeed but do nothing.
      // (isOllamaNative was just refreshed by the getModels() call above.)
      if (this.isOllamaNative === false) {
        logger.warn(
          `[OllamaService] Non-Ollama backend detected — skipping model pull for "${model}". Load the model manually in your AI host.`
        )
        return {
          success: false,
          message: `Model "${model}" is not available in your AI host. Please load it manually (model pulling is only supported for Ollama backends).`,
        }
      }
      // Stream pull via Ollama native API
      const pullResponse = await axios.post(
        `${this.baseUrl}/api/pull`,
        { model, stream: true },
        { responseType: 'stream', timeout: 0 } // timeout 0: pulls can take many minutes
      )
      // The pull endpoint streams newline-delimited JSON; buffer partial lines across chunks.
      await new Promise<void>((resolve, reject) => {
        let buffer = ''
        pullResponse.data.on('data', (chunk: Buffer) => {
          buffer += chunk.toString()
          const lines = buffer.split('\n')
          // Last element may be an incomplete line — keep it for the next chunk.
          buffer = lines.pop() || ''
          for (const line of lines) {
            if (!line.trim()) continue
            try {
              const parsed = JSON.parse(line)
              if (parsed.completed && parsed.total) {
                const percent = parseFloat(((parsed.completed / parsed.total) * 100).toFixed(2))
                this.broadcastDownloadProgress(model, percent)
                if (progressCallback) progressCallback(percent)
              }
            } catch {
              // ignore parse errors on partial lines
            }
          }
        })
        pullResponse.data.on('end', resolve)
        pullResponse.data.on('error', reject)
      })
      logger.info(`[OllamaService] Model "${model}" downloaded successfully.`)
      return { success: true, message: 'Model downloaded successfully.' }
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error)
      logger.error(
        `[OllamaService] Failed to download model "${model}": ${errorMessage}`
      )
      // Check for version mismatch (Ollama 412 response)
      const isVersionMismatch = errorMessage.includes('newer version of Ollama')
      const userMessage = isVersionMismatch
        ? 'This model requires a newer version of Ollama. Please update AI Assistant from the Apps page.'
        : `Failed to download model: ${errorMessage}`
      // Broadcast failure to connected clients so UI can show the error
      this.broadcastDownloadError(model, userMessage)
      // A version mismatch will not resolve on retry; everything else might.
      return { success: false, message: userMessage, retryable: !isVersionMismatch }
    }
  }

  /**
   * Queues a model download on the background job queue instead of pulling inline.
   *
   * @param modelName Model to download.
   * @returns success flag and a user-facing status message.
   */
  async dispatchModelDownload(modelName: string): Promise<{ success: boolean; message: string }> {
    try {
      logger.info(`[OllamaService] Dispatching model download for ${modelName} via job queue`)
      await DownloadModelJob.dispatch({
        modelName,
      })
      return {
        success: true,
        message:
          'Model download has been queued successfully. It will start shortly after Ollama and Open WebUI are ready (if not already).',
      }
    } catch (error) {
      logger.error(
        `[OllamaService] Failed to dispatch model download for ${modelName}: ${error instanceof Error ? error.message : error}`
      )
      return {
        success: false,
        message: 'Failed to queue model download. Please try again.',
      }
    }
  }

  /**
   * Sends a non-streaming chat completion request and normalizes the response.
   *
   * @param chatRequest Model, messages, and optional think/numCtx extensions.
   * @returns Normalized response; `thinking` is populated when the backend returns it.
   * @throws Error when the client failed to initialize.
   */
  public async chat(chatRequest: ChatInput): Promise<NomadChatResponse> {
    await this._ensureDependencies()
    if (!this.openai) {
      throw new Error('AI client is not initialized.')
    }
    // `think`/`num_ctx` are Ollama extensions tunneled through the OpenAI-compatible API;
    // per the change notes above, LM Studio silently ignores num_ctx. Params typed `any`
    // because the OpenAI SDK types do not know these fields.
    const params: any = {
      model: chatRequest.model,
      messages: chatRequest.messages as ChatCompletionMessageParam[],
      stream: false,
    }
    if (chatRequest.think) {
      params.think = chatRequest.think
    }
    if (chatRequest.numCtx) {
      params.num_ctx = chatRequest.numCtx
    }
    const response = await this.openai.chat.completions.create(params)
    const choice = response.choices[0]
    return {
      message: {
        content: choice.message.content ?? '',
        thinking: (choice.message as any).thinking ?? undefined,
      },
      done: true,
      model: response.model,
    }
  }

  /**
   * Sends a streaming chat completion request and yields normalized chunks.
   * `done` becomes true on the chunk carrying a finish_reason.
   *
   * @param chatRequest Model, messages, and optional think/numCtx extensions.
   * @throws Error when the client failed to initialize.
   */
  public async chatStream(chatRequest: ChatInput): Promise<AsyncIterable<NomadChatStreamChunk>> {
    await this._ensureDependencies()
    if (!this.openai) {
      throw new Error('AI client is not initialized.')
    }
    // Same Ollama-extension passthrough as chat(); see comment there.
    const params: any = {
      model: chatRequest.model,
      messages: chatRequest.messages as ChatCompletionMessageParam[],
      stream: true,
    }
    if (chatRequest.think) {
      params.think = chatRequest.think
    }
    if (chatRequest.numCtx) {
      params.num_ctx = chatRequest.numCtx
    }
    const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>
    // Adapt OpenAI SDK chunks to the backend-agnostic NomadChatStreamChunk shape.
    async function* normalize(): AsyncGenerator<NomadChatStreamChunk> {
      for await (const chunk of stream) {
        const delta = chunk.choices[0]?.delta
        yield {
          message: {
            content: delta?.content ?? '',
            thinking: (delta as any)?.thinking ?? '',
          },
          done: chunk.choices[0]?.finish_reason !== null && chunk.choices[0]?.finish_reason !== undefined,
        }
      }
    }
    return normalize()
  }

  /**
   * Checks whether a model advertises the "thinking" capability via Ollama's /api/show.
   * Returns false for non-Ollama backends (endpoint unavailable) or on any error.
   */
  public async checkModelHasThinking(modelName: string): Promise<boolean> {
    await this._ensureDependencies()
    if (!this.baseUrl) return false
    try {
      const response = await axios.post(
        `${this.baseUrl}/api/show`,
        { model: modelName },
        { timeout: 5000 }
      )
      return Array.isArray(response.data?.capabilities) && response.data.capabilities.includes('thinking')
    } catch {
      // Non-Ollama backends don't expose /api/show — assume no thinking support
      return false
    }
  }

  /**
   * Deletes a model via Ollama's native /api/delete endpoint (Ollama backends only).
   *
   * @param modelName Model to remove.
   * @returns success flag and a user-facing message.
   */
  public async deleteModel(modelName: string): Promise<{ success: boolean; message: string }> {
    await this._ensureDependencies()
    if (!this.baseUrl) {
      return { success: false, message: 'AI service is not initialized.' }
    }
    try {
      await axios.delete(`${this.baseUrl}/api/delete`, {
        data: { model: modelName },
        timeout: 10000,
      })
      return { success: true, message: `Model "${modelName}" deleted.` }
    } catch (error) {
      logger.error(
        `[OllamaService] Failed to delete model "${modelName}": ${error instanceof Error ? error.message : error}`
      )
      return { success: false, message: 'Failed to delete model. This may not be an Ollama backend.' }
    }
  }

  /**
   * Generate embeddings for the given input strings.
   * Tries the Ollama native /api/embed endpoint first, falls back to /v1/embeddings.
   *
   * @param model Embedding model name.
   * @param input Batch of strings to embed.
   * @returns One embedding vector per input string.
   * @throws Error when the service is not initialized, or when both endpoints fail.
   */
  public async embed(model: string, input: string[]): Promise<{ embeddings: number[][] }> {
    await this._ensureDependencies()
    if (!this.baseUrl || !this.openai) {
      throw new Error('AI service is not initialized.')
    }
    try {
      // Prefer Ollama native endpoint (supports batch input natively)
      const response = await axios.post(
        `${this.baseUrl}/api/embed`,
        { model, input },
        { timeout: 60000 }
      )
      // Some backends (e.g. LM Studio) return HTTP 200 for unknown endpoints with an incompatible
      // body — validate explicitly before accepting the result.
      if (!Array.isArray(response.data?.embeddings)) {
        throw new Error('Invalid /api/embed response — missing embeddings array')
      }
      return { embeddings: response.data.embeddings }
    } catch {
      // Fall back to OpenAI-compatible /v1/embeddings
      // Explicitly request float format — some backends (e.g. LM Studio) don't reliably
      // implement the base64 encoding the OpenAI SDK requests by default.
      logger.info('[OllamaService] /api/embed unavailable, falling back to /v1/embeddings')
      const results = await this.openai.embeddings.create({ model, input, encoding_format: 'float' })
      return { embeddings: results.data.map((e) => e.embedding as number[]) }
    }
  }

  /**
   * Lists models installed on the backend. Also detects the backend type as a side
   * effect (sets isOllamaNative, consumed by downloadModel()).
   *
   * @param includeEmbeddings When false (default), models whose name contains "embed" are filtered out.
   * @returns Installed models; size is 0 for models listed via the /v1/models fallback.
   * @throws Error when the service is not initialized.
   */
  public async getModels(includeEmbeddings = false): Promise<NomadInstalledModel[]> {
    await this._ensureDependencies()
    if (!this.baseUrl) {
      throw new Error('AI service is not initialized.')
    }
    try {
      // Prefer the Ollama native endpoint which includes size and metadata
      const response = await axios.get(`${this.baseUrl}/api/tags`, { timeout: 5000 })
      // LM Studio returns HTTP 200 for unknown endpoints with an incompatible body — validate explicitly
      if (!Array.isArray(response.data?.models)) {
        throw new Error('Not an Ollama-compatible /api/tags response')
      }
      this.isOllamaNative = true
      const models: NomadInstalledModel[] = response.data.models
      if (includeEmbeddings) return models
      return models.filter((m) => !m.name.includes('embed'))
    } catch {
      // Fall back to the OpenAI-compatible /v1/models endpoint (LM Studio, llama.cpp, etc.)
      this.isOllamaNative = false
      logger.info('[OllamaService] /api/tags unavailable, falling back to /v1/models')
      try {
        const modelList = await this.openai!.models.list()
        const models: NomadInstalledModel[] = modelList.data.map((m) => ({ name: m.id, size: 0 }))
        if (includeEmbeddings) return models
        return models.filter((m) => !m.name.includes('embed'))
      } catch (err) {
        logger.error(
          `[OllamaService] Failed to list models: ${err instanceof Error ? err.message : err}`
        )
        return []
      }
    }
  }

  /**
   * Returns the catalogue of models available for download, optionally filtered by a
   * fuzzy-search query and/or restricted to the top-3 recommended models.
   * Falls back to FALLBACK_RECOMMENDED_OLLAMA_MODELS when the catalogue cannot be fetched.
   *
   * NOTE(review): the defaults object applies only when the argument is omitted entirely;
   * a partial object like `{ query: 'x' }` leaves sort/limit undefined (limit is then
   * recovered by the `limit || 15` fallbacks below, sort falls through to undefined) — confirm intended.
   */
  async getAvailableModels(
    {
      sort,
      recommendedOnly,
      query,
      limit,
      force,
    }: {
      sort?: 'pulls' | 'name'
      recommendedOnly?: boolean
      query: string | null
      limit?: number
      force?: boolean
    } = {
      sort: 'pulls',
      recommendedOnly: false,
      query: null,
      limit: 15,
    }
  ): Promise<{ models: NomadOllamaModel[]; hasMore: boolean } | null> {
    try {
      const models = await this.retrieveAndRefreshModels(sort, force)
      if (!models) {
        logger.warn(
          '[OllamaService] Returning fallback recommended models due to failure in fetching available models'
        )
        return {
          models: FALLBACK_RECOMMENDED_OLLAMA_MODELS,
          hasMore: false,
        }
      }
      if (!recommendedOnly) {
        const filteredModels = query ? this.fuseSearchModels(models, query) : models
        return {
          models: filteredModels.slice(0, limit || 15),
          hasMore: filteredModels.length > (limit || 15),
        }
      }
      // Recommended = top three by pull count, each trimmed to its first (smallest) tag.
      const sortedByPulls = sort === 'pulls' ? models : this.sortModels(models, 'pulls')
      const firstThree = sortedByPulls.slice(0, 3)
      const recommendedModels = firstThree.map((model) => {
        return {
          ...model,
          tags: model.tags && model.tags.length > 0 ? [model.tags[0]] : [],
        }
      })
      if (query) {
        const filteredRecommendedModels = this.fuseSearchModels(recommendedModels, query)
        return {
          models: filteredRecommendedModels,
          hasMore: filteredRecommendedModels.length > (limit || 15),
        }
      }
      return {
        models: recommendedModels,
        hasMore: recommendedModels.length > (limit || 15),
      }
    } catch (error) {
      logger.error(
        `[OllamaService] Failed to get available models: ${error instanceof Error ? error.message : error}`
      )
      return null
    }
  }

  /**
   * Returns the catalogue from the on-disk cache when fresh, otherwise fetches it from
   * the Nomad API, strips cloud-only tags, re-caches it, and returns it sorted.
   *
   * @param sort  Optional sort order applied to the result.
   * @param force When true, skips the cache and always fetches.
   * @returns Sorted models, or null on fetch failure / invalid response.
   */
  private async retrieveAndRefreshModels(
    sort?: 'pulls' | 'name',
    force?: boolean
  ): Promise<NomadOllamaModel[] | null> {
    try {
      if (!force) {
        const cachedModels = await this.readModelsFromCache()
        if (cachedModels) {
          logger.info('[OllamaService] Using cached available models data')
          return this.sortModels(cachedModels, sort)
        }
      } else {
        logger.info('[OllamaService] Force refresh requested, bypassing cache')
      }
      logger.info('[OllamaService] Fetching fresh available models from API')
      const baseUrl = env.get('NOMAD_API_URL') || NOMAD_API_DEFAULT_BASE_URL
      const fullUrl = new URL(NOMAD_MODELS_API_PATH, baseUrl).toString()
      const response = await axios.get(fullUrl)
      if (!response.data || !Array.isArray(response.data.models)) {
        logger.warn(
          `[OllamaService] Invalid response format when fetching available models: ${JSON.stringify(response.data)}`
        )
        return null
      }
      const rawModels = response.data.models as NomadOllamaModel[]
      // Drop cloud-only tags, then drop models left with no tags at all.
      const noCloud = rawModels
        .map((model) => ({
          ...model,
          tags: model.tags.filter((tag) => !tag.cloud),
        }))
        .filter((model) => model.tags.length > 0)
      await this.writeModelsToCache(noCloud)
      return this.sortModels(noCloud, sort)
    } catch (error) {
      logger.error(
        `[OllamaService] Failed to retrieve models from Nomad API: ${error instanceof Error ? error.message : error}`
      )
      return null
    }
  }

  /**
   * Reads the models cache file, returning null when it is missing, stale
   * (older than CACHE_MAX_AGE_MS), or malformed. Only unexpected errors are logged.
   */
  private async readModelsFromCache(): Promise<NomadOllamaModel[] | null> {
    try {
      const stats = await fs.stat(MODELS_CACHE_FILE)
      const cacheAge = Date.now() - stats.mtimeMs
      if (cacheAge > CACHE_MAX_AGE_MS) {
        logger.info('[OllamaService] Cache is stale, will fetch fresh data')
        return null
      }
      const cacheData = await fs.readFile(MODELS_CACHE_FILE, 'utf-8')
      const models = JSON.parse(cacheData) as NomadOllamaModel[]
      if (!Array.isArray(models)) {
        logger.warn('[OllamaService] Invalid cache format, will fetch fresh data')
        return null
      }
      return models
    } catch (error) {
      // ENOENT (no cache yet) is the expected first-run case — stay quiet for it.
      if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
        logger.warn(
          `[OllamaService] Error reading cache: ${error instanceof Error ? error.message : error}`
        )
      }
      return null
    }
  }

  /** Persists the models catalogue to the cache file; failures are logged, never thrown. */
  private async writeModelsToCache(models: NomadOllamaModel[]): Promise<void> {
    try {
      await fs.mkdir(path.dirname(MODELS_CACHE_FILE), { recursive: true })
      await fs.writeFile(MODELS_CACHE_FILE, JSON.stringify(models, null, 2), 'utf-8')
      logger.info('[OllamaService] Successfully cached available models')
    } catch (error) {
      logger.warn(
        `[OllamaService] Failed to write models cache: ${error instanceof Error ? error.message : error}`
      )
    }
  }

  /**
   * Sorts models by estimated pulls (descending) or name (ascending), and each model's
   * tags by parsed size (ascending). NOTE(review): sorts the input array and its tag
   * arrays in place, then returns the same array — callers share the mutation.
   */
  private sortModels(models: NomadOllamaModel[], sort?: 'pulls' | 'name'): NomadOllamaModel[] {
    if (sort === 'pulls') {
      models.sort((a, b) => {
        // estimated_pulls is a human-readable string like "1.2M" — expand the suffix.
        const parsePulls = (pulls: string) => {
          const multiplier = pulls.endsWith('K')
            ? 1_000
            : pulls.endsWith('M')
              ? 1_000_000
              : pulls.endsWith('B')
                ? 1_000_000_000
                : 1
          return parseFloat(pulls) * multiplier
        }
        return parsePulls(b.estimated_pulls) - parsePulls(a.estimated_pulls)
      })
    } else if (sort === 'name') {
      models.sort((a, b) => a.name.localeCompare(b.name))
    }
    models.forEach((model) => {
      if (model.tags && Array.isArray(model.tags)) {
        model.tags.sort((a, b) => {
          // Normalize tag sizes to GB; unknown suffixes sort first (multiplier 0).
          const parseSize = (size: string) => {
            const multiplier = size.endsWith('KB')
              ? 1 / 1_000
              : size.endsWith('MB')
                ? 1 / 1_000_000
                : size.endsWith('GB')
                  ? 1
                  : size.endsWith('TB')
                    ? 1_000
                    : 0
            return parseFloat(size) * multiplier
          }
          return parseSize(a.size) - parseSize(b.size)
        })
      }
    })
    return models
  }

  /** Broadcasts a download failure to connected clients; percent -1 signals the error state. */
  private broadcastDownloadError(model: string, error: string) {
    transmit.broadcast(BROADCAST_CHANNELS.OLLAMA_MODEL_DOWNLOAD, {
      model,
      percent: -1,
      error,
      timestamp: new Date().toISOString(),
    })
  }

  /** Broadcasts download progress (0–100) to connected clients and logs it. */
  private broadcastDownloadProgress(model: string, percent: number) {
    transmit.broadcast(BROADCAST_CHANNELS.OLLAMA_MODEL_DOWNLOAD, {
      model,
      percent,
      timestamp: new Date().toISOString(),
    })
    logger.info(`[OllamaService] Download progress for model "${model}": ${percent}%`)
  }

  /**
   * Fuzzy-searches models by name, description, and tag name using Fuse.js.
   * threshold 0.3 keeps matches reasonably close; ignoreDiacritics normalizes accents.
   */
  private fuseSearchModels(models: NomadOllamaModel[], query: string): NomadOllamaModel[] {
    const options: IFuseOptions<NomadOllamaModel> = {
      ignoreDiacritics: true,
      keys: ['name', 'description', 'tags.name'],
      threshold: 0.3,
    }
    const fuse = new Fuse(models, options)
    return fuse.search(query).map((result) => result.item)
  }
}