mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-30 08:06:51 +02:00
The short-conversation skip in `rewriteQueryWithContext` used `userMessages.length <= 2`,
which short-circuits both the very first turn AND the first follow-up. The follow-up is
the moment the rewriter matters most — it's where pronouns and shorthand ("the bars",
"how long does it last?") need to be resolved against earlier turns before the embedding
search runs. With the rewriter skipped, the RAG search runs against the raw last message,
nothing scores above the 0.3 threshold, and no context gets injected for that turn.
The visible symptom is the assistant treating the first follow-up in any chat as a
brand-new question — e.g. "great - they threw up 2 of the bars it looks like" answered
as if it were a recipe-bars question, with no carry-forward of the prior chocolate-
poisoning context.
Threshold lowered to `< 2`: skip only when there's exactly one user message (nothing to
rewrite from). From the first follow-up onward the rewriter runs, as originally intended
before commit 96e5027.
Validated against `mistral-nemo:12b` on NOMAD3 by hot-patching the compiled controller
and replaying the dog-chocolate scenario. Post-patch response correctly threads "3
Hershey's bars" from turn 1 into turn 2's answer; pre-patch (per reporter's screenshot)
pivoted to peanut butter bar recipes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
435 lines
18 KiB
TypeScript
435 lines
18 KiB
TypeScript
import { ChatService } from '#services/chat_service'
|
|
import { DockerService } from '#services/docker_service'
|
|
import { OllamaService } from '#services/ollama_service'
|
|
import { RagService } from '#services/rag_service'
|
|
import Service from '#models/service'
|
|
import KVStore from '#models/kv_store'
|
|
import { modelNameSchema } from '#validators/download'
|
|
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
|
|
import { inject } from '@adonisjs/core'
|
|
import type { HttpContext } from '@adonisjs/core/http'
|
|
import { RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
|
|
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
|
import logger from '@adonisjs/core/services/logger'
|
|
type Message = { role: 'system' | 'user' | 'assistant'; content: string }
|
|
|
|
@inject()
|
|
export default class OllamaController {
|
|
/**
 * Receives the controller's collaborators (the class is decorated with `@inject()`)
 * and stores each one as a private field through a parameter property.
 * None of them is reassigned afterwards, hence `readonly`.
 */
constructor(
  private readonly chatService: ChatService,
  private readonly dockerService: DockerService,
  private readonly ollamaService: OllamaService,
  private readonly ragService: RagService
) {}
|
|
|
|
/**
 * Validates the request against `getAvailableModelsSchema` and returns whatever
 * `ollamaService.getAvailableModels` yields for those filters.
 *
 * A falsy `query` is forwarded as `null`; a falsy `limit` falls back to 15.
 */
async availableModels({ request }: HttpContext) {
  const { sort, recommendedOnly, query, limit, force } =
    await request.validateUsing(getAvailableModelsSchema)

  const models = await this.ollamaService.getAvailableModels({
    sort,
    recommendedOnly,
    query: query || null,
    limit: limit || 15,
    force,
  })
  return models
}
|
|
|
|
/**
 * Handles a chat request, optionally enriching it with RAG context.
 *
 * Steps, in order:
 *  1. For streaming requests, flush the SSE headers right away.
 *  2. Inject the default system prompt when the client sent no system message.
 *  3. Rewrite the latest user query (see `rewriteQueryWithContext`) and search for
 *     similar documents; matching chunks are inserted as an extra system message.
 *  4. Pick a `num_ctx` large enough for the system messages, resolve the `think`
 *     option, and persist the user message when a `sessionId` is present.
 *  5. Forward the request to the model, persist the assistant reply, and start
 *     title generation (fire-and-forget) while the session has at most two messages.
 *
 * @returns The chat result on the non-streaming path. On the streaming path nothing is
 *          returned; chunks are written to the response as SSE `data:` events.
 * @throws Re-throws any error on the non-streaming path. On the streaming path the error
 *         is logged and, if the stream is still open, reported as an `{ error: true }` event.
 */
async chat({ request, response }: HttpContext) {
  const reqData = await request.validateUsing(chatSchema)

  // Flush SSE headers immediately so the client connection is open while
  // pre-processing (query rewriting, RAG lookup) runs in the background.
  if (reqData.stream) {
    response.response.setHeader('Content-Type', 'text/event-stream')
    response.response.setHeader('Cache-Control', 'no-cache')
    response.response.setHeader('Connection', 'keep-alive')
    response.response.flushHeaders()
  }

  try {
    // If there are no system messages in the chat inject system prompts
    const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
    if (!hasSystemMessage) {
      const systemPrompt = {
        role: 'system' as const,
        content: SYSTEM_PROMPTS.default,
      }
      logger.debug('[OllamaController] Injecting system prompt')
      reqData.messages.unshift(systemPrompt)
    }

    // Query rewriting for better RAG retrieval with manageable context
    // Will return user's latest message if no rewriting is needed
    const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages, reqData.model)

    logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
    if (rewrittenQuery) {
      const relevantDocs = await this.ragService.searchSimilarDocuments(
        rewrittenQuery,
        5, // Top 5 most relevant chunks
        0.3 // Minimum similarity score of 0.3
      )

      logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)

      // If relevant context is found, inject as a system message with adaptive limits
      if (relevantDocs.length > 0) {
        // Determine context budget based on model size
        const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
        let trimmedDocs = relevantDocs.slice(0, maxResults)

        // Apply token cap if set (estimate ~3.5 chars per token)
        // Always include the first (most relevant) result — the cap only gates subsequent results
        if (maxTokens > 0) {
          const charCap = maxTokens * 3.5
          let totalChars = 0
          trimmedDocs = trimmedDocs.filter((doc, idx) => {
            // The running total only grows, so once the cap is exceeded every later doc is dropped too.
            totalChars += doc.text.length
            return idx === 0 || totalChars <= charCap
          })
        }

        logger.debug(
          `[RAG] Injecting ${trimmedDocs.length}/${relevantDocs.length} results (model: ${reqData.model}, maxResults: ${maxResults}, maxTokens: ${maxTokens || 'unlimited'})`
        )

        const contextText = trimmedDocs
          .map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
          .join('\n\n')

        const systemMessage = {
          role: 'system' as const,
          content: SYSTEM_PROMPTS.rag_context(contextText),
        }

        // Insert system message at the beginning (after any existing system messages)
        const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
        const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
        reqData.messages.splice(insertIndex, 0, systemMessage)
      }
    }

    // If system messages are large (e.g. due to RAG context), request a context window big
    // enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
    const systemChars = reqData.messages
      .filter((m) => m.role === 'system')
      .reduce((sum, m) => sum + m.content.length, 0)
    const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
    let numCtx: number | undefined
    if (estimatedSystemTokens > 3000) {
      const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
      // Smallest supported window that fits; capped at 65536 when nothing is large enough.
      numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
      logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
    }

    // Check if the model supports "thinking" capability for enhanced response generation
    // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
    const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
    const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false

    // Separate sessionId from the Ollama request payload — Ollama rejects unknown fields
    const { sessionId, ...ollamaRequest } = reqData

    // Save user message to DB before streaming if sessionId provided
    let userContent: string | null = null
    if (sessionId) {
      const lastUserMsg = [...reqData.messages].reverse().find((m) => m.role === 'user')
      if (lastUserMsg) {
        userContent = lastUserMsg.content
        await this.chatService.addMessage(sessionId, 'user', userContent)
      }
    }

    if (reqData.stream) {
      logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
      // Headers already flushed above
      const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
      let fullContent = ''
      for await (const chunk of stream) {
        if (chunk.message?.content) {
          fullContent += chunk.message.content
        }
        response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
      }
      response.response.end()

      // Save assistant message and optionally generate title
      if (sessionId && fullContent) {
        await this.chatService.addMessage(sessionId, 'assistant', fullContent)
        const messageCount = await this.chatService.getMessageCount(sessionId)
        if (messageCount <= 2 && userContent) {
          this.chatService.generateTitle(sessionId, userContent, fullContent, reqData.model).catch((err) => {
            logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
          })
        }
      }
      return
    }

    // Non-streaming (legacy) path
    const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })

    if (sessionId && result?.message?.content) {
      await this.chatService.addMessage(sessionId, 'assistant', result.message.content)
      const messageCount = await this.chatService.getMessageCount(sessionId)
      if (messageCount <= 2 && userContent) {
        this.chatService.generateTitle(sessionId, userContent, result.message.content, reqData.model).catch((err) => {
          logger.error(`[OllamaController] Title generation failed: ${err instanceof Error ? err.message : err}`)
        })
      }
    }

    return result
  } catch (error) {
    if (reqData.stream) {
      // Headers are already sent, so the failure cannot surface as an HTTP status;
      // log it here or it would disappear without a trace.
      logger.error(
        `[OllamaController] Streaming chat failed: ${error instanceof Error ? error.message : error}`
      )
      // The failure may come from the persistence step that runs after the stream was
      // closed; writing to an ended response would raise a write-after-end error.
      if (!response.response.writableEnded) {
        response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
        response.response.end()
      }
      return
    }
    throw error
  }
}
|
|
|
|
/**
 * Reports whether a remote endpoint is stored under `ai.remoteOllamaUrl` and whether
 * its `/v1/models` route answers with an OK status within 3 seconds.
 *
 * @returns `configured` is false when no URL is stored; `connected` is false when the
 *          probe fails, times out, or returns a non-OK status.
 */
async remoteStatus() {
  const storedUrl = await KVStore.getValue('ai.remoteOllamaUrl')
  if (!storedUrl) return { configured: false, connected: false }

  let connected = false
  try {
    // Drop a single trailing slash so the probe path is not doubled up.
    const baseUrl = storedUrl.replace(/\/$/, '')
    const probe = await fetch(`${baseUrl}/v1/models`, {
      signal: AbortSignal.timeout(3000),
    })
    connected = probe.ok
  } catch {
    // Any failure while probing counts as "not connected".
  }
  return { configured: true, connected }
}
|
|
|
|
/**
 * Sets or clears the remote Ollama-compatible endpoint stored under `ai.remoteOllamaUrl`.
 *
 * - Missing, empty, or whitespace-only `remoteUrl`: clears the setting, tries to start an
 *   existing local container, and marks the service installed only if one exists.
 * - Otherwise the URL is trimmed, checked for an `http://` / `https://` scheme, and probed
 *   via `GET <url>/v1/models` (5 s timeout). On success the URL is saved, the service is
 *   marked installed, the local container is stopped, a Qdrant preflight is started when
 *   Qdrant is not installed, chat suggestions are disabled, and docs discovery is triggered.
 *
 * @returns `{ success, message }`. Responds 404 when the Ollama service record is missing
 *          and 400 for an invalid or unreachable URL.
 */
async configureRemote({ request, response }: HttpContext) {
  const remoteUrl: string | null = request.input('remoteUrl', null)

  const ollamaService = await Service.query().where('service_name', SERVICE_NAMES.OLLAMA).first()
  if (!ollamaService) {
    return response.status(404).send({ success: false, message: 'Ollama service record not found.' })
  }

  // Normalize once so validation, the connectivity probe, and the stored value all see
  // the same string (previously only the stored value was trimmed).
  const normalizedUrl = remoteUrl ? remoteUrl.trim() : ''

  // Clear path: null or empty URL removes remote config. If a local nomad_ollama container
  // still exists (user had previously installed AI Assistant locally), restart it and keep
  // the service marked installed. Otherwise fall back to uninstalled.
  if (normalizedUrl === '') {
    await KVStore.clearValue('ai.remoteOllamaUrl')
    const hasLocalContainer = await this._startLocalOllamaContainerIfExists()
    ollamaService.installed = hasLocalContainer
    ollamaService.installation_status = 'idle'
    await ollamaService.save()
    return {
      success: true,
      message: hasLocalContainer
        ? 'Remote Ollama cleared. Local Ollama container restored.'
        : 'Remote Ollama configuration cleared.',
    }
  }

  // Validate URL format: require a real http(s) scheme, not just an "http" prefix.
  if (!/^https?:\/\//i.test(normalizedUrl)) {
    return response.status(400).send({
      success: false,
      message: 'Invalid URL. Must start with http:// or https://',
    })
  }

  // Test connectivity via OpenAI-compatible /v1/models endpoint (works with Ollama, LM Studio, llama.cpp, etc.)
  try {
    const testResponse = await fetch(`${normalizedUrl.replace(/\/$/, '')}/v1/models`, {
      signal: AbortSignal.timeout(5000),
    })
    if (!testResponse.ok) {
      return response.status(400).send({
        success: false,
        message: `Could not connect to ${normalizedUrl} (HTTP ${testResponse.status}). Make sure the server is running and accessible. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
      })
    }
  } catch {
    // Network failure or timeout: report it the same way as a non-OK status.
    return response.status(400).send({
      success: false,
      message: `Could not connect to ${normalizedUrl}. Make sure the server is running and reachable. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
    })
  }

  // Save remote URL and mark service as installed
  await KVStore.setValue('ai.remoteOllamaUrl', normalizedUrl)
  ollamaService.installed = true
  ollamaService.installation_status = 'idle'
  await ollamaService.save()

  // Stop the local nomad_ollama container (if running) so it doesn't compete with the
  // remote host for GPU / port 11434. Preserves the container and its models volume.
  await this._stopLocalOllamaContainer()

  // Install Qdrant if not already installed (fire-and-forget)
  const qdrantService = await Service.query().where('service_name', SERVICE_NAMES.QDRANT).first()
  if (qdrantService && !qdrantService.installed) {
    this.dockerService.createContainerPreflight(SERVICE_NAMES.QDRANT).catch((error) => {
      // Pass the error as a merging object, matching the container helpers in this class,
      // so its details are serialized instead of being dropped as an unused format argument.
      logger.error({ err: error }, '[OllamaController] Failed to start Qdrant preflight')
    })
  }

  // Mirror post-install side effects: disable suggestions, trigger docs discovery
  await KVStore.setValue('chat.suggestionsEnabled', false)
  this.ragService.discoverNomadDocs().catch((error) => {
    logger.error({ err: error }, '[OllamaController] Failed to discover Nomad docs')
  })

  return { success: true, message: 'Remote Ollama configured.' }
}
|
|
|
|
/**
 * Best-effort stop of the local Ollama container.
 *
 * Does nothing when no container named `/${SERVICE_NAMES.OLLAMA}` exists or when it is
 * not in the `running` state. Failures are logged and never propagated.
 */
private async _stopLocalOllamaContainer(): Promise<void> {
  try {
    const docker = this.dockerService.docker
    const expectedName = `/${SERVICE_NAMES.OLLAMA}`
    // `all: true` lists stopped containers as well as running ones.
    const allContainers = await docker.listContainers({ all: true })
    const localOllama = allContainers.find(({ Names }) => Names.includes(expectedName))

    if (localOllama?.State === 'running') {
      await docker.getContainer(localOllama.Id).stop()
      this.dockerService.invalidateServicesStatusCache()
      logger.info('[OllamaController] Stopped local nomad_ollama (remote Ollama configured)')
    }
  } catch (error: unknown) {
    logger.error(
      { err: error },
      '[OllamaController] Failed to stop local nomad_ollama; remote Ollama is still active'
    )
  }
}
|
|
|
|
/**
 * Makes sure the local Ollama container is running, if such a container exists.
 *
 * @returns `true` when a container named `/${SERVICE_NAMES.OLLAMA}` exists (it is started
 *          first when not already running); `false` when none exists or any step fails.
 *          Failures are logged, not thrown.
 */
private async _startLocalOllamaContainerIfExists(): Promise<boolean> {
  try {
    const docker = this.dockerService.docker
    const expectedName = `/${SERVICE_NAMES.OLLAMA}`
    // `all: true` lists stopped containers as well as running ones.
    const allContainers = await docker.listContainers({ all: true })
    const localOllama = allContainers.find(({ Names }) => Names.includes(expectedName))
    if (localOllama === undefined) return false

    const alreadyRunning = localOllama.State === 'running'
    if (!alreadyRunning) {
      await docker.getContainer(localOllama.Id).start()
      this.dockerService.invalidateServicesStatusCache()
      logger.info('[OllamaController] Started local nomad_ollama (remote Ollama cleared)')
    }
    return true
  } catch (error: unknown) {
    logger.error(
      { err: error },
      '[OllamaController] Failed to start local nomad_ollama on remote clear'
    )
    return false
  }
}
|
|
|
|
/**
 * Validates the request against `modelNameSchema` and asks `ollamaService` to delete
 * the named model.
 *
 * @returns A success payload echoing the model name.
 */
async deleteModel({ request }: HttpContext) {
  const { model } = await request.validateUsing(modelNameSchema)
  await this.ollamaService.deleteModel(model)

  return { success: true, message: `Model deleted: ${model}` }
}
|
|
|
|
/**
 * Validates the request against `modelNameSchema` and hands the model name to
 * `ollamaService.dispatchModelDownload`.
 *
 * @returns A success payload echoing the model name.
 */
async dispatchModelDownload({ request }: HttpContext) {
  const { model } = await request.validateUsing(modelNameSchema)
  await this.ollamaService.dispatchModelDownload(model)

  return { success: true, message: `Download job dispatched for model: ${model}` }
}
|
|
|
|
/**
 * Returns the result of `ollamaService.getModels()`; nothing from the HTTP context is used.
 */
async installedModels({ }: HttpContext) {
  const models = await this.ollamaService.getModels()
  return models
}
|
|
|
|
/**
 * Picks the RAG context budget for a model from the size hint in its name.
 *
 * The first number directly followed by "b" or "B" (e.g. "3b", "1.5b", "70B") is read as
 * the parameter count in billions; names without such a marker are treated as 8B. The
 * first entry of `RAG_CONTEXT_LIMITS` whose `maxParams` is at least that count wins.
 *
 * @param modelName - Model name or tag, e.g. "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b".
 * @returns The matched tier's `maxResults` and `maxTokens`, or
 *          `{ maxResults: 5, maxTokens: 0 }` when no tier matches.
 */
private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
  const sizeHint = /(\d+\.?\d*)[bB]/.exec(modelName)

  // Assume 8B when the name carries no size marker.
  let paramBillions = 8
  if (sizeHint) {
    paramBillions = parseFloat(sizeHint[1])
  }

  for (const { maxParams, maxResults, maxTokens } of RAG_CONTEXT_LIMITS) {
    if (paramBillions <= maxParams) {
      return { maxResults, maxTokens }
    }
  }

  // No tier matched: keep up to 5 results and leave maxTokens at 0.
  return { maxResults: 5, maxTokens: 0 }
}
|
|
|
|
/**
 * Produces the query string used for RAG retrieval.
 *
 * @param messages - Full chat history, possibly including system messages.
 * @param model - Model used for the rewrite round-trip.
 * @returns
 *  - `null` when the knowledge base has no documents, or when there is no user message
 *    to fall back to;
 *  - the latest user message when the recent window holds fewer than two user messages,
 *    when the model returns an empty rewrite, or when the rewrite fails;
 *  - otherwise the model's rewritten, context-aware query.
 */
private async rewriteQueryWithContext(
  messages: Message[],
  model: string
): Promise<string | null> {
  const lastUserMessage = [...messages].reverse().find((msg) => msg.role === 'user')
  const fallbackQuery = lastUserMessage?.content || null

  try {
    // Skip the entire RAG pipeline if there are no documents to search
    const hasDocuments = await this.ragService.hasDocuments()
    if (!hasDocuments) {
      return null
    }

    // Get recent conversation history (last 6 dialogue messages for 3 turns).
    // System messages are excluded first: on short conversations they would otherwise
    // land inside the window and be handed to the rewriter as untruncated "Assistant"
    // lines, drowning out the actual dialogue.
    const recentMessages = messages.filter((msg) => msg.role !== 'system').slice(-6)

    // Skip rewriting on the very first turn — with only one user message
    // there is no prior context to fold in, so the rewrite would just echo
    // the message back at the cost of an extra LLM round-trip. From the
    // first follow-up onward we need the rewrite so the RAG query carries
    // entities and topics from earlier turns ("the bars" → "Hershey's bars
    // chocolate poisoning dog"); without it, embeddings match nothing and
    // the assistant loses the thread.
    const userMessages = recentMessages.filter((msg) => msg.role === 'user')
    if (userMessages.length < 2) {
      return fallbackQuery
    }

    const conversationContext = recentMessages
      .map((msg) => {
        const role = msg.role === 'user' ? 'User' : 'Assistant'
        // Truncate assistant messages to first 200 chars to keep context manageable
        const content = msg.role === 'assistant'
          ? msg.content.slice(0, 200) + (msg.content.length > 200 ? '...' : '')
          : msg.content
        return `${role}: "${content}"`
      })
      .join('\n')

    const response = await this.ollamaService.chat({
      model,
      messages: [
        {
          role: 'system',
          content: SYSTEM_PROMPTS.query_rewrite,
        },
        {
          role: 'user',
          content: `Conversation:\n${conversationContext}\n\nRewritten Query:`,
        },
      ],
    })

    const rewrittenQuery = response.message.content.trim()
    if (!rewrittenQuery) {
      // An empty rewrite would make the caller skip retrieval for this turn;
      // searching with the raw message is the better degradation.
      logger.warn('[RAG] Query rewrite returned empty output; using last user message')
      return fallbackQuery
    }

    logger.info(`[RAG] Query rewritten: "${rewrittenQuery}"`)
    return rewrittenQuery
  } catch (error) {
    logger.error(
      `[RAG] Query rewriting failed: ${error instanceof Error ? error.message : error}`
    )
    // Fallback to last user message if rewriting fails
    return fallbackQuery
  }
}
|
|
}
|