project-nomad/admin/app/controllers/ollama_controller.ts
chriscrosstalk 36b7613f85
fix(AI): stop local nomad_ollama container when remote Ollama is configured (#744)
When users set a remote Ollama URL via AI Settings, the local nomad_ollama
container continued running and competed with the remote host for port 11434
and GPU access. Now configureRemote stops the local container on set and
restores it on clear (if still present). Container and its models volume are
preserved so the local install can be re-enabled later.

Closes #662

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 16:02:49 -07:00

430 lines
17 KiB
TypeScript

import { ChatService } from '#services/chat_service'
import { DockerService } from '#services/docker_service'
import { OllamaService } from '#services/ollama_service'
import { RagService } from '#services/rag_service'
import Service from '#models/service'
import KVStore from '#models/kv_store'
import { modelNameSchema } from '#validators/download'
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
import { inject } from '@adonisjs/core'
import type { HttpContext } from '@adonisjs/core/http'
import { RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import logger from '@adonisjs/core/services/logger'
type Message = { role: 'system' | 'user' | 'assistant'; content: string }
@inject()
export default class OllamaController {
constructor(
private chatService: ChatService,
private dockerService: DockerService,
private ollamaService: OllamaService,
private ragService: RagService
) { }
async availableModels({ request }: HttpContext) {
const reqData = await request.validateUsing(getAvailableModelsSchema)
return await this.ollamaService.getAvailableModels({
sort: reqData.sort,
recommendedOnly: reqData.recommendedOnly,
query: reqData.query || null,
limit: reqData.limit || 15,
force: reqData.force,
})
}
async chat({ request, response }: HttpContext) {
const reqData = await request.validateUsing(chatSchema)
// Flush SSE headers immediately so the client connection is open while
// pre-processing (query rewriting, RAG lookup) runs in the background.
if (reqData.stream) {
response.response.setHeader('Content-Type', 'text/event-stream')
response.response.setHeader('Cache-Control', 'no-cache')
response.response.setHeader('Connection', 'keep-alive')
response.response.flushHeaders()
}
try {
// If there are no system messages in the chat inject system prompts
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
if (!hasSystemMessage) {
const systemPrompt = {
role: 'system' as const,
content: SYSTEM_PROMPTS.default,
}
logger.debug('[OllamaController] Injecting system prompt')
reqData.messages.unshift(systemPrompt)
}
// Query rewriting for better RAG retrieval with manageable context
// Will return user's latest message if no rewriting is needed
const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages, reqData.model)
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
if (rewrittenQuery) {
const relevantDocs = await this.ragService.searchSimilarDocuments(
rewrittenQuery,
5, // Top 5 most relevant chunks
0.3 // Minimum similarity score of 0.3
)
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
// If relevant context is found, inject as a system message with adaptive limits
if (relevantDocs.length > 0) {
// Determine context budget based on model size
const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
let trimmedDocs = relevantDocs.slice(0, maxResults)
// Apply token cap if set (estimate ~3.5 chars per token)
// Always include the first (most relevant) result — the cap only gates subsequent results
if (maxTokens > 0) {
const charCap = maxTokens * 3.5
let totalChars = 0
trimmedDocs = trimmedDocs.filter((doc, idx) => {
totalChars += doc.text.length
return idx === 0 || totalChars <= charCap
})
}
logger.debug(
`[RAG] Injecting ${trimmedDocs.length}/${relevantDocs.length} results (model: ${reqData.model}, maxResults: ${maxResults}, maxTokens: ${maxTokens || 'unlimited'})`
)
const contextText = trimmedDocs
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
.join('\n\n')
const systemMessage = {
role: 'system' as const,
content: SYSTEM_PROMPTS.rag_context(contextText),
}
// Insert system message at the beginning (after any existing system messages)
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
reqData.messages.splice(insertIndex, 0, systemMessage)
}
}
// If system messages are large (e.g. due to RAG context), request a context window big
// enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
const systemChars = reqData.messages
.filter((m) => m.role === 'system')
.reduce((sum, m) => sum + m.content.length, 0)
const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
let numCtx: number | undefined
if (estimatedSystemTokens > 3000) {
const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
}
// Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
// Separate sessionId from the Ollama request payload — Ollama rejects unknown fields
const { sessionId, ...ollamaRequest } = reqData
// Save user message to DB before streaming if sessionId provided
let userContent: string | null = null
if (sessionId) {
const lastUserMsg = [...reqData.messages].reverse().find((m) => m.role === 'user')
if (lastUserMsg) {
userContent = lastUserMsg.content
await this.chatService.addMessage(sessionId, 'user', userContent)
}
}
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// Headers already flushed above
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
let fullContent = ''
for await (const chunk of stream) {
if (chunk.message?.content) {
fullContent += chunk.message.content
}
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
}
response.response.end()
// Save assistant message and optionally generate title
if (sessionId && fullContent) {
await this.chatService.addMessage(sessionId, 'assistant', fullContent)
const messageCount = await this.chatService.getMessageCount(sessionId)
if (messageCount <= 2 && userContent) {
this.chatService.generateTitle(sessionId, userContent, fullContent, reqData.model).catch((err) => {
logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
})
}
}
return
}
// Non-streaming (legacy) path
const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })
if (sessionId && result?.message?.content) {
await this.chatService.addMessage(sessionId, 'assistant', result.message.content)
const messageCount = await this.chatService.getMessageCount(sessionId)
if (messageCount <= 2 && userContent) {
this.chatService.generateTitle(sessionId, userContent, result.message.content, reqData.model).catch((err) => {
logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
})
}
}
return result
} catch (error) {
if (reqData.stream) {
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
response.response.end()
return
}
throw error
}
}
async remoteStatus() {
const remoteUrl = await KVStore.getValue('ai.remoteOllamaUrl')
if (!remoteUrl) {
return { configured: false, connected: false }
}
try {
const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
signal: AbortSignal.timeout(3000),
})
return { configured: true, connected: testResponse.ok }
} catch {
return { configured: true, connected: false }
}
}
async configureRemote({ request, response }: HttpContext) {
const remoteUrl: string | null = request.input('remoteUrl', null)
const ollamaService = await Service.query().where('service_name', SERVICE_NAMES.OLLAMA).first()
if (!ollamaService) {
return response.status(404).send({ success: false, message: 'Ollama service record not found.' })
}
// Clear path: null or empty URL removes remote config. If a local nomad_ollama container
// still exists (user had previously installed AI Assistant locally), restart it and keep
// the service marked installed. Otherwise fall back to uninstalled.
if (!remoteUrl || remoteUrl.trim() === '') {
await KVStore.clearValue('ai.remoteOllamaUrl')
const hasLocalContainer = await this._startLocalOllamaContainerIfExists()
ollamaService.installed = hasLocalContainer
ollamaService.installation_status = 'idle'
await ollamaService.save()
return {
success: true,
message: hasLocalContainer
? 'Remote Ollama cleared. Local Ollama container restored.'
: 'Remote Ollama configuration cleared.',
}
}
// Validate URL format
if (!remoteUrl.startsWith('http')) {
return response.status(400).send({
success: false,
message: 'Invalid URL. Must start with http:// or https://',
})
}
// Test connectivity via OpenAI-compatible /v1/models endpoint (works with Ollama, LM Studio, llama.cpp, etc.)
try {
const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
signal: AbortSignal.timeout(5000),
})
if (!testResponse.ok) {
return response.status(400).send({
success: false,
message: `Could not connect to ${remoteUrl} (HTTP ${testResponse.status}). Make sure the server is running and accessible. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
})
}
} catch (error) {
return response.status(400).send({
success: false,
message: `Could not connect to ${remoteUrl}. Make sure the server is running and reachable. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
})
}
// Save remote URL and mark service as installed
await KVStore.setValue('ai.remoteOllamaUrl', remoteUrl.trim())
ollamaService.installed = true
ollamaService.installation_status = 'idle'
await ollamaService.save()
// Stop the local nomad_ollama container (if running) so it doesn't compete with the
// remote host for GPU / port 11434. Preserves the container and its models volume.
await this._stopLocalOllamaContainer()
// Install Qdrant if not already installed (fire-and-forget)
const qdrantService = await Service.query().where('service_name', SERVICE_NAMES.QDRANT).first()
if (qdrantService && !qdrantService.installed) {
this.dockerService.createContainerPreflight(SERVICE_NAMES.QDRANT).catch((error) => {
logger.error('[OllamaController] Failed to start Qdrant preflight:', error)
})
}
// Mirror post-install side effects: disable suggestions, trigger docs discovery
await KVStore.setValue('chat.suggestionsEnabled', false)
this.ragService.discoverNomadDocs().catch((error) => {
logger.error('[OllamaController] Failed to discover Nomad docs:', error)
})
return { success: true, message: 'Remote Ollama configured.' }
}
private async _stopLocalOllamaContainer(): Promise<void> {
try {
const containers = await this.dockerService.docker.listContainers({ all: true })
const ollamaContainer = containers.find((c) =>
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
)
if (!ollamaContainer || ollamaContainer.State !== 'running') {
return
}
await this.dockerService.docker.getContainer(ollamaContainer.Id).stop()
this.dockerService.invalidateServicesStatusCache()
logger.info('[OllamaController] Stopped local nomad_ollama (remote Ollama configured)')
} catch (error: any) {
logger.error(
{ err: error },
'[OllamaController] Failed to stop local nomad_ollama; remote Ollama is still active'
)
}
}
private async _startLocalOllamaContainerIfExists(): Promise<boolean> {
try {
const containers = await this.dockerService.docker.listContainers({ all: true })
const ollamaContainer = containers.find((c) =>
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
)
if (!ollamaContainer) {
return false
}
if (ollamaContainer.State !== 'running') {
await this.dockerService.docker.getContainer(ollamaContainer.Id).start()
this.dockerService.invalidateServicesStatusCache()
logger.info('[OllamaController] Started local nomad_ollama (remote Ollama cleared)')
}
return true
} catch (error: any) {
logger.error(
{ err: error },
'[OllamaController] Failed to start local nomad_ollama on remote clear'
)
return false
}
}
async deleteModel({ request }: HttpContext) {
const reqData = await request.validateUsing(modelNameSchema)
await this.ollamaService.deleteModel(reqData.model)
return {
success: true,
message: `Model deleted: ${reqData.model}`,
}
}
async dispatchModelDownload({ request }: HttpContext) {
const reqData = await request.validateUsing(modelNameSchema)
await this.ollamaService.dispatchModelDownload(reqData.model)
return {
success: true,
message: `Download job dispatched for model: ${reqData.model}`,
}
}
async installedModels({ }: HttpContext) {
return await this.ollamaService.getModels()
}
/**
* Determines RAG context limits based on model size extracted from the model name.
* Parses size indicators like "1b", "3b", "8b", "70b" from model names/tags.
*/
private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
// Extract parameter count from model name (e.g., "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b")
const sizeMatch = modelName.match(/(\d+\.?\d*)[bB]/)
const paramBillions = sizeMatch ? parseFloat(sizeMatch[1]) : 8 // default to 8B if unknown
for (const tier of RAG_CONTEXT_LIMITS) {
if (paramBillions <= tier.maxParams) {
return { maxResults: tier.maxResults, maxTokens: tier.maxTokens }
}
}
// Fallback: no limits
return { maxResults: 5, maxTokens: 0 }
}
private async rewriteQueryWithContext(
messages: Message[],
model: string
): Promise<string | null> {
const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')
try {
// Skip the entire RAG pipeline if there are no documents to search
const hasDocuments = await this.ragService.hasDocuments()
if (!hasDocuments) {
return null
}
// Get recent conversation history (last 6 messages for 3 turns)
const recentMessages = messages.slice(-6)
// Skip rewriting for short conversations. Rewriting adds latency with
// little RAG benefit until there is enough context to matter.
const userMessages = recentMessages.filter(msg => msg.role === 'user')
if (userMessages.length <= 2) {
return lastUserMessage?.content || null
}
const conversationContext = recentMessages
.map(msg => {
const role = msg.role === 'user' ? 'User' : 'Assistant'
// Truncate assistant messages to first 200 chars to keep context manageable
const content = msg.role === 'assistant'
? msg.content.slice(0, 200) + (msg.content.length > 200 ? '...' : '')
: msg.content
return `${role}: "${content}"`
})
.join('\n')
const response = await this.ollamaService.chat({
model,
messages: [
{
role: 'system',
content: SYSTEM_PROMPTS.query_rewrite,
},
{
role: 'user',
content: `Conversation:\n${conversationContext}\n\nRewritten Query:`,
},
],
})
const rewrittenQuery = response.message.content.trim()
logger.info(`[RAG] Query rewritten: "${rewrittenQuery}"`)
return rewrittenQuery
} catch (error) {
logger.error(
`[RAG] Query rewriting failed: ${error instanceof Error ? error.message : error}`
)
// Fallback to last user message if rewriting fails
return lastUserMessage?.content || null
}
}
}