From 00bd864831b72ef0557b4fa1454d86f27ed9b643 Mon Sep 17 00:00:00 2001 From: Jake Turner Date: Wed, 25 Feb 2026 17:42:22 +0000 Subject: [PATCH] fix(AI): improved perf via rewrite and streaming logic --- admin/app/controllers/ollama_controller.ts | 150 ++++++++++-------- admin/constants/ollama.ts | 2 + .../inertia/components/chat/ChatInterface.tsx | 52 ++++++ .../components/chat/ChatMessageBubble.tsx | 4 +- admin/inertia/components/chat/index.tsx | 32 +++- admin/types/chat.ts | 1 + 6 files changed, 164 insertions(+), 77 deletions(-) diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts index 17c08db..ee1640c 100644 --- a/admin/app/controllers/ollama_controller.ts +++ b/admin/app/controllers/ollama_controller.ts @@ -4,7 +4,7 @@ import { modelNameSchema } from '#validators/download' import { chatSchema, getAvailableModelsSchema } from '#validators/ollama' import { inject } from '@adonisjs/core' import type { HttpContext } from '@adonisjs/core/http' -import { SYSTEM_PROMPTS } from '../../constants/ollama.js' +import { DEFAULT_QUERY_REWRITE_MODEL, SYSTEM_PROMPTS } from '../../constants/ollama.js' import logger from '@adonisjs/core/services/logger' import type { Message } from 'ollama' @@ -28,80 +28,85 @@ export default class OllamaController { async chat({ request, response }: HttpContext) { const reqData = await request.validateUsing(chatSchema) - // If there are no system messages in the chat inject system prompts - const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system') - if (!hasSystemMessage) { - const systemPrompt = { - role: 'system' as const, - content: SYSTEM_PROMPTS.default, - } - logger.debug('[OllamaController] Injecting system prompt') - reqData.messages.unshift(systemPrompt) - } - - // Query rewriting for better RAG retrieval with manageable context - // Will return user's latest message if no rewriting is needed - const rewrittenQuery = await this.rewriteQueryWithContext( - reqData.messages, - reqData.model - ) - - logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`) - if (rewrittenQuery) { - const relevantDocs = await this.ragService.searchSimilarDocuments( - rewrittenQuery, - 5, // Top 5 most relevant chunks - 0.3 // Minimum similarity score of 0.3 - ) - - logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`) - - // If relevant context is found, inject as a system message - if (relevantDocs.length > 0) { - const contextText = relevantDocs - .map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`) - .join('\n\n') - - const systemMessage = { - role: 'system' as const, - content: SYSTEM_PROMPTS.rag_context(contextText), - } - - // Insert system message at the beginning (after any existing system messages) - const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system') - const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex - reqData.messages.splice(insertIndex, 0, systemMessage) - } - } - - // Check if the model supports "thinking" capability for enhanced response generation - // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat - const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model) - const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false - + // Flush SSE headers immediately so the client connection is open while + // pre-processing (query rewriting, RAG lookup) runs in the background. if (reqData.stream) { - logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`) - // SSE streaming path response.response.setHeader('Content-Type', 'text/event-stream') response.response.setHeader('Cache-Control', 'no-cache') response.response.setHeader('Connection', 'keep-alive') response.response.flushHeaders() + } - try { + try { + // If there are no system messages in the chat inject system prompts + const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system') + if (!hasSystemMessage) { + const systemPrompt = { + role: 'system' as const, + content: SYSTEM_PROMPTS.default, + } + logger.debug('[OllamaController] Injecting system prompt') + reqData.messages.unshift(systemPrompt) + } + + // Query rewriting for better RAG retrieval with manageable context + // Will return user's latest message if no rewriting is needed + const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages) + + logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`) + if (rewrittenQuery) { + const relevantDocs = await this.ragService.searchSimilarDocuments( + rewrittenQuery, + 5, // Top 5 most relevant chunks + 0.3 // Minimum similarity score of 0.3 + ) + + logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`) + + // If relevant context is found, inject as a system message + if (relevantDocs.length > 0) { + const contextText = relevantDocs + .map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`) + .join('\n\n') + + const systemMessage = { + role: 'system' as const, + content: SYSTEM_PROMPTS.rag_context(contextText), + } + + // Insert system message at the beginning (after any existing system messages) + const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system') + const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex + reqData.messages.splice(insertIndex, 0, systemMessage) + } + } + + // Check if the model supports "thinking" capability for enhanced response generation + // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat + const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model) + const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false + + if (reqData.stream) { + logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`) + // Headers already flushed above const stream = await this.ollamaService.chatStream({ ...reqData, think }) for await (const chunk of stream) { response.response.write(`data: ${JSON.stringify(chunk)}\n\n`) } - } catch (error) { - response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`) - } finally { response.response.end() + return } - return - } - // Non-streaming (legacy) path - return await this.ollamaService.chat({ ...reqData, think }) + // Non-streaming (legacy) path + return await this.ollamaService.chat({ ...reqData, think }) + } catch (error) { + if (reqData.stream) { + response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`) + response.response.end() + return + } + throw error + } } async deleteModel({ request }: HttpContext) { @@ -127,17 +132,17 @@ export default class OllamaController { } private async rewriteQueryWithContext( - messages: Message[], - model: string + messages: Message[] ): Promise { try { // Get recent conversation history (last 6 messages for 3 turns) const recentMessages = messages.slice(-6) - // If there's only one user message, no rewriting needed + // Skip rewriting for short conversations. Rewriting adds latency with + // little RAG benefit until there is enough context to matter. const userMessages = recentMessages.filter(msg => msg.role === 'user') - if (userMessages.length <= 1) { - return userMessages[0]?.content || null + if (userMessages.length <= 2) { + return userMessages[userMessages.length - 1]?.content || null } const conversationContext = recentMessages @@ -151,8 +156,17 @@ export default class OllamaController { }) .join('\n') + const availableModels = await this.ollamaService.getAvailableModels({ query: null, limit: 500 }) + const rewriteModelAvailable = availableModels?.models.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL) + if (!rewriteModelAvailable) { + logger.warn(`[RAG] Query rewrite model "${DEFAULT_QUERY_REWRITE_MODEL}" not available. Skipping query rewriting.`) + const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user') + return lastUserMessage?.content || null + } + + // FUTURE ENHANCEMENT: allow the user to specify which model to use for rewriting const response = await this.ollamaService.chat({ - model, + model: DEFAULT_QUERY_REWRITE_MODEL, messages: [ { role: 'system', diff --git a/admin/constants/ollama.ts b/admin/constants/ollama.ts index e162346..daf628b 100644 --- a/admin/constants/ollama.ts +++ b/admin/constants/ollama.ts @@ -62,6 +62,8 @@ export const FALLBACK_RECOMMENDED_OLLAMA_MODELS: NomadOllamaModel[] = [ }, ] +export const DEFAULT_QUERY_REWRITE_MODEL = 'qwen2.5:3b' // default to qwen2.5 for query rewriting with good balance of text task performance and resource usage + export const SYSTEM_PROMPTS = { default: ` Format all responses using markdown for better readability. Vanilla markdown or GitHub-flavored markdown is preferred. diff --git a/admin/inertia/components/chat/ChatInterface.tsx b/admin/inertia/components/chat/ChatInterface.tsx index 27d44c1..af2961f 100644 --- a/admin/inertia/components/chat/ChatInterface.tsx +++ b/admin/inertia/components/chat/ChatInterface.tsx @@ -5,6 +5,10 @@ import { ChatMessage } from '../../../types/chat' import ChatMessageBubble from './ChatMessageBubble' import ChatAssistantAvatar from './ChatAssistantAvatar' import BouncingDots from '../BouncingDots' +import StyledModal from '../StyledModal' +import api from '~/lib/api' +import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama' +import { useNotifications } from '~/context/NotificationContext' interface ChatInterfaceProps { messages: ChatMessage[] @@ -13,6 +17,7 @@ interface ChatInterfaceProps { chatSuggestions?: string[] chatSuggestionsEnabled?: boolean chatSuggestionsLoading?: boolean + rewriteModelAvailable?: boolean } export default function ChatInterface({ @@ -22,11 +27,28 @@ export default function ChatInterface({ chatSuggestions = [], chatSuggestionsEnabled = false, chatSuggestionsLoading = false, + rewriteModelAvailable = false }: ChatInterfaceProps) { + const { addNotification } = useNotifications() const [input, setInput] = useState('') + const [downloadDialogOpen, setDownloadDialogOpen] = useState(false) + const [isDownloading, setIsDownloading] = useState(false) const messagesEndRef = useRef(null) const textareaRef = useRef(null) + const handleDownloadModel = async () => { + setIsDownloading(true) + try { + await api.downloadModel(DEFAULT_QUERY_REWRITE_MODEL) + addNotification({ type: 'success', message: 'Model download queued' }) + } catch (error) { + addNotification({ type: 'error', message: 'Failed to queue model download' }) + } finally { + setIsDownloading(false) + setDownloadDialogOpen(false) + } + } + const scrollToBottom = () => { messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }) } @@ -162,6 +184,36 @@ export default function ChatInterface({ )} + {!rewriteModelAvailable && ( +
+ The {DEFAULT_QUERY_REWRITE_MODEL} model is not installed. Consider{' '} + {' '} + for improved retrieval-augmented generation (RAG) performance. +
+ )} + setDownloadDialogOpen(false)} + onClose={() => setDownloadDialogOpen(false)} + > +

+ This will dispatch a background download job for{' '} + {DEFAULT_QUERY_REWRITE_MODEL} and may take some time to complete. The model + will be used to rewrite queries for improved RAG retrieval performance. +

+
) diff --git a/admin/inertia/components/chat/ChatMessageBubble.tsx b/admin/inertia/components/chat/ChatMessageBubble.tsx index aca040e..34f1547 100644 --- a/admin/inertia/components/chat/ChatMessageBubble.tsx +++ b/admin/inertia/components/chat/ChatMessageBubble.tsx @@ -29,7 +29,9 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) { {!message.isThinking && message.thinking && (
- Reasoning + {message.thinkingDuration !== undefined + ? `Thought for ${message.thinkingDuration}s` + : 'Reasoning'}
{message.thinking} diff --git a/admin/inertia/components/chat/index.tsx b/admin/inertia/components/chat/index.tsx index 2e37baf..20a7ae1 100644 --- a/admin/inertia/components/chat/index.tsx +++ b/admin/inertia/components/chat/index.tsx @@ -1,4 +1,4 @@ -import { useState, useCallback, useEffect, useRef } from 'react' +import { useState, useCallback, useEffect, useRef, useMemo } from 'react' import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' import ChatSidebar from './ChatSidebar' import ChatInterface from './ChatInterface' @@ -9,6 +9,7 @@ import { useModals } from '~/context/ModalContext' import { ChatMessage } from '../../../types/chat' import classNames from '~/lib/classNames' import { IconX } from '@tabler/icons-react' +import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama' interface ChatProps { enabled: boolean @@ -68,6 +69,10 @@ export default function Chat({ refetchOnMount: false, }) + const rewriteModelAvailable = useMemo(() => { + return installedModels.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL) + }, [installedModels]) + const deleteAllSessionsMutation = useMutation({ mutationFn: () => api.deleteAllChatSessions(), onSuccess: () => { @@ -159,7 +164,7 @@ export default function Chat({ async (sessionId: string) => { // Cancel any ongoing suggestions fetch queryClient.cancelQueries({ queryKey: ['chatSuggestions'] }) - + setActiveSessionId(sessionId) // Load messages for this session const sessionData = await api.getChatSession(sessionId) @@ -230,11 +235,16 @@ export default function Chat({ let fullContent = '' let thinkingContent = '' let isThinkingPhase = true + let thinkingStartTime: number | null = null + let thinkingDuration: number | null = null try { await api.streamChatMessage( { model: selectedModel || 'llama3.2', messages: chatMessages, stream: true }, (chunkContent, chunkThinking, done) => { + if (chunkThinking.length > 0 && thinkingStartTime === null) { + thinkingStartTime = Date.now() + } if (isFirstChunk) { isFirstChunk = false setIsStreamingResponse(false) @@ -248,22 +258,27 @@ export default function Chat({ timestamp: new Date(), isStreaming: true, isThinking: chunkThinking.length > 0 && chunkContent.length === 0, + thinkingDuration: undefined, }, ]) } else { if (isThinkingPhase && chunkContent.length > 0) { isThinkingPhase = false + if (thinkingStartTime !== null) { + thinkingDuration = Math.max(1, Math.round((Date.now() - thinkingStartTime) / 1000)) + } } setMessages((prev) => prev.map((m) => m.id === assistantMsgId ? { - ...m, - content: m.content + chunkContent, - thinking: (m.thinking ?? '') + chunkThinking, - isStreaming: !done, - isThinking: isThinkingPhase, - } + ...m, + content: m.content + chunkContent, + thinking: (m.thinking ?? '') + chunkThinking, + isStreaming: !done, + isThinking: isThinkingPhase, + thinkingDuration: thinkingDuration ?? undefined, + } : m ) ) @@ -391,6 +406,7 @@ export default function Chat({ chatSuggestions={chatSuggestions} chatSuggestionsEnabled={suggestionsEnabled} chatSuggestionsLoading={chatSuggestionsLoading} + rewriteModelAvailable={rewriteModelAvailable} />
diff --git a/admin/types/chat.ts b/admin/types/chat.ts index a3047ef..d57ee48 100644 --- a/admin/types/chat.ts +++ b/admin/types/chat.ts @@ -6,6 +6,7 @@ export interface ChatMessage { isStreaming?: boolean thinking?: string isThinking?: boolean + thinkingDuration?: number } export interface ChatSession {