project-nomad/admin/app/controllers/ollama_controller.ts
Chris Sherwood 43645e4bbc fix(AI): rewrite RAG query on first follow-up (off-by-one in skip-rewrite threshold)
The short-conversation skip in `rewriteQueryWithContext` used `userMessages.length <= 2`,
which short-circuits both the very first turn AND the first follow-up. The follow-up is
the moment the rewriter matters most — it's where pronouns and shorthand ("the bars",
"how long does it last?") need to be resolved against earlier turns before the embedding
search runs. With the rewriter skipped, RAG queries against the raw last message, scores
nothing above the 0.3 threshold, and no context gets injected for that turn.

The visible symptom is the assistant treating the first follow-up in any chat as a
brand-new question — e.g. "great - they threw up 2 of the bars it looks like" answered
as if it were a recipe-bars question, with no carry-forward of the prior chocolate-
poisoning context.

Threshold lowered to `< 2`: skip only when there's exactly one user message (nothing to
rewrite from). From the first follow-up onward the rewriter runs, as originally intended
before commit 96e5027.

Validated against `mistral-nemo:12b` on NOMAD3 by hot-patching the compiled controller
and replaying the dog-chocolate scenario. Post-patch response correctly threads "3
Hershey's bars" from turn 1 into turn 2's answer; pre-patch (per reporter's screenshot)
pivoted to peanut butter bar recipes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 10:16:00 -07:00

435 lines
18 KiB
TypeScript

import { ChatService } from '#services/chat_service'
import { DockerService } from '#services/docker_service'
import { OllamaService } from '#services/ollama_service'
import { RagService } from '#services/rag_service'
import Service from '#models/service'
import KVStore from '#models/kv_store'
import { modelNameSchema } from '#validators/download'
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
import { inject } from '@adonisjs/core'
import type { HttpContext } from '@adonisjs/core/http'
import { RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import logger from '@adonisjs/core/services/logger'
type Message = { role: 'system' | 'user' | 'assistant'; content: string }
@inject()
export default class OllamaController {
constructor(
private chatService: ChatService,
private dockerService: DockerService,
private ollamaService: OllamaService,
private ragService: RagService
) { }
async availableModels({ request }: HttpContext) {
const reqData = await request.validateUsing(getAvailableModelsSchema)
return await this.ollamaService.getAvailableModels({
sort: reqData.sort,
recommendedOnly: reqData.recommendedOnly,
query: reqData.query || null,
limit: reqData.limit || 15,
force: reqData.force,
})
}
async chat({ request, response }: HttpContext) {
const reqData = await request.validateUsing(chatSchema)
// Flush SSE headers immediately so the client connection is open while
// pre-processing (query rewriting, RAG lookup) runs in the background.
if (reqData.stream) {
response.response.setHeader('Content-Type', 'text/event-stream')
response.response.setHeader('Cache-Control', 'no-cache')
response.response.setHeader('Connection', 'keep-alive')
response.response.flushHeaders()
}
try {
// If there are no system messages in the chat inject system prompts
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
if (!hasSystemMessage) {
const systemPrompt = {
role: 'system' as const,
content: SYSTEM_PROMPTS.default,
}
logger.debug('[OllamaController] Injecting system prompt')
reqData.messages.unshift(systemPrompt)
}
// Query rewriting for better RAG retrieval with manageable context
// Will return user's latest message if no rewriting is needed
const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages, reqData.model)
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
if (rewrittenQuery) {
const relevantDocs = await this.ragService.searchSimilarDocuments(
rewrittenQuery,
5, // Top 5 most relevant chunks
0.3 // Minimum similarity score of 0.3
)
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
// If relevant context is found, inject as a system message with adaptive limits
if (relevantDocs.length > 0) {
// Determine context budget based on model size
const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
let trimmedDocs = relevantDocs.slice(0, maxResults)
// Apply token cap if set (estimate ~3.5 chars per token)
// Always include the first (most relevant) result — the cap only gates subsequent results
if (maxTokens > 0) {
const charCap = maxTokens * 3.5
let totalChars = 0
trimmedDocs = trimmedDocs.filter((doc, idx) => {
totalChars += doc.text.length
return idx === 0 || totalChars <= charCap
})
}
logger.debug(
`[RAG] Injecting ${trimmedDocs.length}/${relevantDocs.length} results (model: ${reqData.model}, maxResults: ${maxResults}, maxTokens: ${maxTokens || 'unlimited'})`
)
const contextText = trimmedDocs
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
.join('\n\n')
const systemMessage = {
role: 'system' as const,
content: SYSTEM_PROMPTS.rag_context(contextText),
}
// Insert system message at the beginning (after any existing system messages)
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
reqData.messages.splice(insertIndex, 0, systemMessage)
}
}
// If system messages are large (e.g. due to RAG context), request a context window big
// enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
const systemChars = reqData.messages
.filter((m) => m.role === 'system')
.reduce((sum, m) => sum + m.content.length, 0)
const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
let numCtx: number | undefined
if (estimatedSystemTokens > 3000) {
const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
}
// Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
// Separate sessionId from the Ollama request payload — Ollama rejects unknown fields
const { sessionId, ...ollamaRequest } = reqData
// Save user message to DB before streaming if sessionId provided
let userContent: string | null = null
if (sessionId) {
const lastUserMsg = [...reqData.messages].reverse().find((m) => m.role === 'user')
if (lastUserMsg) {
userContent = lastUserMsg.content
await this.chatService.addMessage(sessionId, 'user', userContent)
}
}
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// Headers already flushed above
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
let fullContent = ''
for await (const chunk of stream) {
if (chunk.message?.content) {
fullContent += chunk.message.content
}
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
}
response.response.end()
// Save assistant message and optionally generate title
if (sessionId && fullContent) {
await this.chatService.addMessage(sessionId, 'assistant', fullContent)
const messageCount = await this.chatService.getMessageCount(sessionId)
if (messageCount <= 2 && userContent) {
this.chatService.generateTitle(sessionId, userContent, fullContent, reqData.model).catch((err) => {
logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
})
}
}
return
}
// Non-streaming (legacy) path
const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })
if (sessionId && result?.message?.content) {
await this.chatService.addMessage(sessionId, 'assistant', result.message.content)
const messageCount = await this.chatService.getMessageCount(sessionId)
if (messageCount <= 2 && userContent) {
this.chatService.generateTitle(sessionId, userContent, result.message.content, reqData.model).catch((err) => {
logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
})
}
}
return result
} catch (error) {
if (reqData.stream) {
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
response.response.end()
return
}
throw error
}
}
async remoteStatus() {
const remoteUrl = await KVStore.getValue('ai.remoteOllamaUrl')
if (!remoteUrl) {
return { configured: false, connected: false }
}
try {
const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
signal: AbortSignal.timeout(3000),
})
return { configured: true, connected: testResponse.ok }
} catch {
return { configured: true, connected: false }
}
}
async configureRemote({ request, response }: HttpContext) {
const remoteUrl: string | null = request.input('remoteUrl', null)
const ollamaService = await Service.query().where('service_name', SERVICE_NAMES.OLLAMA).first()
if (!ollamaService) {
return response.status(404).send({ success: false, message: 'Ollama service record not found.' })
}
// Clear path: null or empty URL removes remote config. If a local nomad_ollama container
// still exists (user had previously installed AI Assistant locally), restart it and keep
// the service marked installed. Otherwise fall back to uninstalled.
if (!remoteUrl || remoteUrl.trim() === '') {
await KVStore.clearValue('ai.remoteOllamaUrl')
const hasLocalContainer = await this._startLocalOllamaContainerIfExists()
ollamaService.installed = hasLocalContainer
ollamaService.installation_status = 'idle'
await ollamaService.save()
return {
success: true,
message: hasLocalContainer
? 'Remote Ollama cleared. Local Ollama container restored.'
: 'Remote Ollama configuration cleared.',
}
}
// Validate URL format
if (!remoteUrl.startsWith('http')) {
return response.status(400).send({
success: false,
message: 'Invalid URL. Must start with http:// or https://',
})
}
// Test connectivity via OpenAI-compatible /v1/models endpoint (works with Ollama, LM Studio, llama.cpp, etc.)
try {
const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
signal: AbortSignal.timeout(5000),
})
if (!testResponse.ok) {
return response.status(400).send({
success: false,
message: `Could not connect to ${remoteUrl} (HTTP ${testResponse.status}). Make sure the server is running and accessible. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
})
}
} catch (error) {
return response.status(400).send({
success: false,
message: `Could not connect to ${remoteUrl}. Make sure the server is running and reachable. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
})
}
// Save remote URL and mark service as installed
await KVStore.setValue('ai.remoteOllamaUrl', remoteUrl.trim())
ollamaService.installed = true
ollamaService.installation_status = 'idle'
await ollamaService.save()
// Stop the local nomad_ollama container (if running) so it doesn't compete with the
// remote host for GPU / port 11434. Preserves the container and its models volume.
await this._stopLocalOllamaContainer()
// Install Qdrant if not already installed (fire-and-forget)
const qdrantService = await Service.query().where('service_name', SERVICE_NAMES.QDRANT).first()
if (qdrantService && !qdrantService.installed) {
this.dockerService.createContainerPreflight(SERVICE_NAMES.QDRANT).catch((error) => {
logger.error('[OllamaController] Failed to start Qdrant preflight:', error)
})
}
// Mirror post-install side effects: disable suggestions, trigger docs discovery
await KVStore.setValue('chat.suggestionsEnabled', false)
this.ragService.discoverNomadDocs().catch((error) => {
logger.error('[OllamaController] Failed to discover Nomad docs:', error)
})
return { success: true, message: 'Remote Ollama configured.' }
}
private async _stopLocalOllamaContainer(): Promise<void> {
try {
const containers = await this.dockerService.docker.listContainers({ all: true })
const ollamaContainer = containers.find((c) =>
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
)
if (!ollamaContainer || ollamaContainer.State !== 'running') {
return
}
await this.dockerService.docker.getContainer(ollamaContainer.Id).stop()
this.dockerService.invalidateServicesStatusCache()
logger.info('[OllamaController] Stopped local nomad_ollama (remote Ollama configured)')
} catch (error: any) {
logger.error(
{ err: error },
'[OllamaController] Failed to stop local nomad_ollama; remote Ollama is still active'
)
}
}
private async _startLocalOllamaContainerIfExists(): Promise<boolean> {
try {
const containers = await this.dockerService.docker.listContainers({ all: true })
const ollamaContainer = containers.find((c) =>
c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
)
if (!ollamaContainer) {
return false
}
if (ollamaContainer.State !== 'running') {
await this.dockerService.docker.getContainer(ollamaContainer.Id).start()
this.dockerService.invalidateServicesStatusCache()
logger.info('[OllamaController] Started local nomad_ollama (remote Ollama cleared)')
}
return true
} catch (error: any) {
logger.error(
{ err: error },
'[OllamaController] Failed to start local nomad_ollama on remote clear'
)
return false
}
}
async deleteModel({ request }: HttpContext) {
const reqData = await request.validateUsing(modelNameSchema)
await this.ollamaService.deleteModel(reqData.model)
return {
success: true,
message: `Model deleted: ${reqData.model}`,
}
}
async dispatchModelDownload({ request }: HttpContext) {
const reqData = await request.validateUsing(modelNameSchema)
await this.ollamaService.dispatchModelDownload(reqData.model)
return {
success: true,
message: `Download job dispatched for model: ${reqData.model}`,
}
}
async installedModels({ }: HttpContext) {
return await this.ollamaService.getModels()
}
/**
* Determines RAG context limits based on model size extracted from the model name.
* Parses size indicators like "1b", "3b", "8b", "70b" from model names/tags.
*/
private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
// Extract parameter count from model name (e.g., "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b")
const sizeMatch = modelName.match(/(\d+\.?\d*)[bB]/)
const paramBillions = sizeMatch ? parseFloat(sizeMatch[1]) : 8 // default to 8B if unknown
for (const tier of RAG_CONTEXT_LIMITS) {
if (paramBillions <= tier.maxParams) {
return { maxResults: tier.maxResults, maxTokens: tier.maxTokens }
}
}
// Fallback: no limits
return { maxResults: 5, maxTokens: 0 }
}
private async rewriteQueryWithContext(
messages: Message[],
model: string
): Promise<string | null> {
const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')
try {
// Skip the entire RAG pipeline if there are no documents to search
const hasDocuments = await this.ragService.hasDocuments()
if (!hasDocuments) {
return null
}
// Get recent conversation history (last 6 messages for 3 turns)
const recentMessages = messages.slice(-6)
// Skip rewriting on the very first turn — with only one user message
// there is no prior context to fold in, so the rewrite would just echo
// the message back at the cost of an extra LLM round-trip. From the
// first follow-up onward we need the rewrite so the RAG query carries
// entities and topics from earlier turns ("the bars" → "Hershey's bars
// chocolate poisoning dog"); without it, embeddings match nothing and
// the assistant loses the thread.
const userMessages = recentMessages.filter(msg => msg.role === 'user')
if (userMessages.length < 2) {
return lastUserMessage?.content || null
}
const conversationContext = recentMessages
.map(msg => {
const role = msg.role === 'user' ? 'User' : 'Assistant'
// Truncate assistant messages to first 200 chars to keep context manageable
const content = msg.role === 'assistant'
? msg.content.slice(0, 200) + (msg.content.length > 200 ? '...' : '')
: msg.content
return `${role}: "${content}"`
})
.join('\n')
const response = await this.ollamaService.chat({
model,
messages: [
{
role: 'system',
content: SYSTEM_PROMPTS.query_rewrite,
},
{
role: 'user',
content: `Conversation:\n${conversationContext}\n\nRewritten Query:`,
},
],
})
const rewrittenQuery = response.message.content.trim()
logger.info(`[RAG] Query rewritten: "${rewrittenQuery}"`)
return rewrittenQuery
} catch (error) {
logger.error(
`[RAG] Query rewriting failed: ${error instanceof Error ? error.message : error}`
)
// Fallback to last user message if rewriting fails
return lastUserMessage?.content || null
}
}
}