mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
fix(AI): improved perf via rewrite and streaming logic
This commit is contained in:
parent
41eb30d84d
commit
00bd864831
|
|
@ -4,7 +4,7 @@ import { modelNameSchema } from '#validators/download'
|
|||
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
|
||||
import { inject } from '@adonisjs/core'
|
||||
import type { HttpContext } from '@adonisjs/core/http'
|
||||
import { SYSTEM_PROMPTS } from '../../constants/ollama.js'
|
||||
import { DEFAULT_QUERY_REWRITE_MODEL, SYSTEM_PROMPTS } from '../../constants/ollama.js'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
import type { Message } from 'ollama'
|
||||
|
||||
|
|
@ -28,80 +28,85 @@ export default class OllamaController {
|
|||
async chat({ request, response }: HttpContext) {
|
||||
const reqData = await request.validateUsing(chatSchema)
|
||||
|
||||
// If there are no system messages in the chat inject system prompts
|
||||
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
|
||||
if (!hasSystemMessage) {
|
||||
const systemPrompt = {
|
||||
role: 'system' as const,
|
||||
content: SYSTEM_PROMPTS.default,
|
||||
}
|
||||
logger.debug('[OllamaController] Injecting system prompt')
|
||||
reqData.messages.unshift(systemPrompt)
|
||||
}
|
||||
|
||||
// Query rewriting for better RAG retrieval with manageable context
|
||||
// Will return user's latest message if no rewriting is needed
|
||||
const rewrittenQuery = await this.rewriteQueryWithContext(
|
||||
reqData.messages,
|
||||
reqData.model
|
||||
)
|
||||
|
||||
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
|
||||
if (rewrittenQuery) {
|
||||
const relevantDocs = await this.ragService.searchSimilarDocuments(
|
||||
rewrittenQuery,
|
||||
5, // Top 5 most relevant chunks
|
||||
0.3 // Minimum similarity score of 0.3
|
||||
)
|
||||
|
||||
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
|
||||
|
||||
// If relevant context is found, inject as a system message
|
||||
if (relevantDocs.length > 0) {
|
||||
const contextText = relevantDocs
|
||||
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
|
||||
.join('\n\n')
|
||||
|
||||
const systemMessage = {
|
||||
role: 'system' as const,
|
||||
content: SYSTEM_PROMPTS.rag_context(contextText),
|
||||
}
|
||||
|
||||
// Insert system message at the beginning (after any existing system messages)
|
||||
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
|
||||
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
|
||||
reqData.messages.splice(insertIndex, 0, systemMessage)
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the model supports "thinking" capability for enhanced response generation
|
||||
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
|
||||
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
|
||||
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
|
||||
|
||||
// Flush SSE headers immediately so the client connection is open while
|
||||
// pre-processing (query rewriting, RAG lookup) runs in the background.
|
||||
if (reqData.stream) {
|
||||
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
|
||||
// SSE streaming path
|
||||
response.response.setHeader('Content-Type', 'text/event-stream')
|
||||
response.response.setHeader('Cache-Control', 'no-cache')
|
||||
response.response.setHeader('Connection', 'keep-alive')
|
||||
response.response.flushHeaders()
|
||||
}
|
||||
|
||||
try {
|
||||
try {
|
||||
// If there are no system messages in the chat inject system prompts
|
||||
const hasSystemMessage = reqData.messages.some((msg) => msg.role === 'system')
|
||||
if (!hasSystemMessage) {
|
||||
const systemPrompt = {
|
||||
role: 'system' as const,
|
||||
content: SYSTEM_PROMPTS.default,
|
||||
}
|
||||
logger.debug('[OllamaController] Injecting system prompt')
|
||||
reqData.messages.unshift(systemPrompt)
|
||||
}
|
||||
|
||||
// Query rewriting for better RAG retrieval with manageable context
|
||||
// Will return user's latest message if no rewriting is needed
|
||||
const rewrittenQuery = await this.rewriteQueryWithContext(reqData.messages)
|
||||
|
||||
logger.debug(`[OllamaController] Rewritten query for RAG: "${rewrittenQuery}"`)
|
||||
if (rewrittenQuery) {
|
||||
const relevantDocs = await this.ragService.searchSimilarDocuments(
|
||||
rewrittenQuery,
|
||||
5, // Top 5 most relevant chunks
|
||||
0.3 // Minimum similarity score of 0.3
|
||||
)
|
||||
|
||||
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
|
||||
|
||||
// If relevant context is found, inject as a system message
|
||||
if (relevantDocs.length > 0) {
|
||||
const contextText = relevantDocs
|
||||
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
|
||||
.join('\n\n')
|
||||
|
||||
const systemMessage = {
|
||||
role: 'system' as const,
|
||||
content: SYSTEM_PROMPTS.rag_context(contextText),
|
||||
}
|
||||
|
||||
// Insert system message at the beginning (after any existing system messages)
|
||||
const firstNonSystemIndex = reqData.messages.findIndex((msg) => msg.role !== 'system')
|
||||
const insertIndex = firstNonSystemIndex === -1 ? 0 : firstNonSystemIndex
|
||||
reqData.messages.splice(insertIndex, 0, systemMessage)
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the model supports "thinking" capability for enhanced response generation
|
||||
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
|
||||
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
|
||||
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
|
||||
|
||||
if (reqData.stream) {
|
||||
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
|
||||
// Headers already flushed above
|
||||
const stream = await this.ollamaService.chatStream({ ...reqData, think })
|
||||
for await (const chunk of stream) {
|
||||
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
|
||||
}
|
||||
} catch (error) {
|
||||
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
|
||||
} finally {
|
||||
response.response.end()
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Non-streaming (legacy) path
|
||||
return await this.ollamaService.chat({ ...reqData, think })
|
||||
// Non-streaming (legacy) path
|
||||
return await this.ollamaService.chat({ ...reqData, think })
|
||||
} catch (error) {
|
||||
if (reqData.stream) {
|
||||
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
|
||||
response.response.end()
|
||||
return
|
||||
}
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
async deleteModel({ request }: HttpContext) {
|
||||
|
|
@ -127,17 +132,17 @@ export default class OllamaController {
|
|||
}
|
||||
|
||||
private async rewriteQueryWithContext(
|
||||
messages: Message[],
|
||||
model: string
|
||||
messages: Message[]
|
||||
): Promise<string | null> {
|
||||
try {
|
||||
// Get recent conversation history (last 6 messages for 3 turns)
|
||||
const recentMessages = messages.slice(-6)
|
||||
|
||||
// If there's only one user message, no rewriting needed
|
||||
// Skip rewriting for short conversations. Rewriting adds latency with
|
||||
// little RAG benefit until there is enough context to matter.
|
||||
const userMessages = recentMessages.filter(msg => msg.role === 'user')
|
||||
if (userMessages.length <= 1) {
|
||||
return userMessages[0]?.content || null
|
||||
if (userMessages.length <= 2) {
|
||||
return userMessages[userMessages.length - 1]?.content || null
|
||||
}
|
||||
|
||||
const conversationContext = recentMessages
|
||||
|
|
@ -151,8 +156,17 @@ export default class OllamaController {
|
|||
})
|
||||
.join('\n')
|
||||
|
||||
const availableModels = await this.ollamaService.getAvailableModels({ query: null, limit: 500 })
|
||||
const rewriteModelAvailable = availableModels?.models.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
|
||||
if (!rewriteModelAvailable) {
|
||||
logger.warn(`[RAG] Query rewrite model "${DEFAULT_QUERY_REWRITE_MODEL}" not available. Skipping query rewriting.`)
|
||||
const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')
|
||||
return lastUserMessage?.content || null
|
||||
}
|
||||
|
||||
// FUTURE ENHANCEMENT: allow the user to specify which model to use for rewriting
|
||||
const response = await this.ollamaService.chat({
|
||||
model,
|
||||
model: DEFAULT_QUERY_REWRITE_MODEL,
|
||||
messages: [
|
||||
{
|
||||
role: 'system',
|
||||
|
|
|
|||
|
|
@ -62,6 +62,8 @@ export const FALLBACK_RECOMMENDED_OLLAMA_MODELS: NomadOllamaModel[] = [
|
|||
},
|
||||
]
|
||||
|
||||
export const DEFAULT_QUERY_REWRITE_MODEL = 'qwen2.5:3b' // default to qwen2.5 for query rewriting with good balance of text task performance and resource usage
|
||||
|
||||
export const SYSTEM_PROMPTS = {
|
||||
default: `
|
||||
Format all responses using markdown for better readability. Vanilla markdown or GitHub-flavored markdown is preferred.
|
||||
|
|
|
|||
|
|
@ -5,6 +5,10 @@ import { ChatMessage } from '../../../types/chat'
|
|||
import ChatMessageBubble from './ChatMessageBubble'
|
||||
import ChatAssistantAvatar from './ChatAssistantAvatar'
|
||||
import BouncingDots from '../BouncingDots'
|
||||
import StyledModal from '../StyledModal'
|
||||
import api from '~/lib/api'
|
||||
import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama'
|
||||
import { useNotifications } from '~/context/NotificationContext'
|
||||
|
||||
interface ChatInterfaceProps {
|
||||
messages: ChatMessage[]
|
||||
|
|
@ -13,6 +17,7 @@ interface ChatInterfaceProps {
|
|||
chatSuggestions?: string[]
|
||||
chatSuggestionsEnabled?: boolean
|
||||
chatSuggestionsLoading?: boolean
|
||||
rewriteModelAvailable?: boolean
|
||||
}
|
||||
|
||||
export default function ChatInterface({
|
||||
|
|
@ -22,11 +27,28 @@ export default function ChatInterface({
|
|||
chatSuggestions = [],
|
||||
chatSuggestionsEnabled = false,
|
||||
chatSuggestionsLoading = false,
|
||||
rewriteModelAvailable = false
|
||||
}: ChatInterfaceProps) {
|
||||
const { addNotification } = useNotifications()
|
||||
const [input, setInput] = useState('')
|
||||
const [downloadDialogOpen, setDownloadDialogOpen] = useState(false)
|
||||
const [isDownloading, setIsDownloading] = useState(false)
|
||||
const messagesEndRef = useRef<HTMLDivElement>(null)
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null)
|
||||
|
||||
const handleDownloadModel = async () => {
|
||||
setIsDownloading(true)
|
||||
try {
|
||||
await api.downloadModel(DEFAULT_QUERY_REWRITE_MODEL)
|
||||
addNotification({ type: 'success', message: 'Model download queued' })
|
||||
} catch (error) {
|
||||
addNotification({ type: 'error', message: 'Failed to queue model download' })
|
||||
} finally {
|
||||
setIsDownloading(false)
|
||||
setDownloadDialogOpen(false)
|
||||
}
|
||||
}
|
||||
|
||||
const scrollToBottom = () => {
|
||||
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' })
|
||||
}
|
||||
|
|
@ -162,6 +184,36 @@ export default function ChatInterface({
|
|||
)}
|
||||
</button>
|
||||
</form>
|
||||
{!rewriteModelAvailable && (
|
||||
<div className="text-sm text-gray-500 mt-2">
|
||||
The {DEFAULT_QUERY_REWRITE_MODEL} model is not installed. Consider{' '}
|
||||
<button
|
||||
onClick={() => setDownloadDialogOpen(true)}
|
||||
className="text-desert-green underline hover:text-desert-green/80 cursor-pointer"
|
||||
>
|
||||
downloading it
|
||||
</button>{' '}
|
||||
for improved retrieval-augmented generation (RAG) performance.
|
||||
</div>
|
||||
)}
|
||||
<StyledModal
|
||||
open={downloadDialogOpen}
|
||||
title={`Download ${DEFAULT_QUERY_REWRITE_MODEL}?`}
|
||||
confirmText="Download"
|
||||
cancelText="Cancel"
|
||||
confirmIcon='IconDownload'
|
||||
confirmVariant='primary'
|
||||
confirmLoading={isDownloading}
|
||||
onConfirm={handleDownloadModel}
|
||||
onCancel={() => setDownloadDialogOpen(false)}
|
||||
onClose={() => setDownloadDialogOpen(false)}
|
||||
>
|
||||
<p className="text-gray-700">
|
||||
This will dispatch a background download job for{' '}
|
||||
<span className="font-mono font-medium">{DEFAULT_QUERY_REWRITE_MODEL}</span> and may take some time to complete. The model
|
||||
will be used to rewrite queries for improved RAG retrieval performance.
|
||||
</p>
|
||||
</StyledModal>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -29,7 +29,9 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) {
|
|||
{!message.isThinking && message.thinking && (
|
||||
<details className="mb-3 rounded border border-gray-200 bg-gray-50 text-xs">
|
||||
<summary className="cursor-pointer px-3 py-2 font-medium text-gray-500 hover:text-gray-700 select-none">
|
||||
Reasoning
|
||||
{message.thinkingDuration !== undefined
|
||||
? `Thought for ${message.thinkingDuration}s`
|
||||
: 'Reasoning'}
|
||||
</summary>
|
||||
<div className="px-3 pb-3 prose prose-xs max-w-none text-gray-600 max-h-48 overflow-y-auto border-t border-gray-200 pt-2">
|
||||
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { useState, useCallback, useEffect, useRef } from 'react'
|
||||
import { useState, useCallback, useEffect, useRef, useMemo } from 'react'
|
||||
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
|
||||
import ChatSidebar from './ChatSidebar'
|
||||
import ChatInterface from './ChatInterface'
|
||||
|
|
@ -9,6 +9,7 @@ import { useModals } from '~/context/ModalContext'
|
|||
import { ChatMessage } from '../../../types/chat'
|
||||
import classNames from '~/lib/classNames'
|
||||
import { IconX } from '@tabler/icons-react'
|
||||
import { DEFAULT_QUERY_REWRITE_MODEL } from '../../../constants/ollama'
|
||||
|
||||
interface ChatProps {
|
||||
enabled: boolean
|
||||
|
|
@ -68,6 +69,10 @@ export default function Chat({
|
|||
refetchOnMount: false,
|
||||
})
|
||||
|
||||
const rewriteModelAvailable = useMemo(() => {
|
||||
return installedModels.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
|
||||
}, [installedModels])
|
||||
|
||||
const deleteAllSessionsMutation = useMutation({
|
||||
mutationFn: () => api.deleteAllChatSessions(),
|
||||
onSuccess: () => {
|
||||
|
|
@ -230,11 +235,16 @@ export default function Chat({
|
|||
let fullContent = ''
|
||||
let thinkingContent = ''
|
||||
let isThinkingPhase = true
|
||||
let thinkingStartTime: number | null = null
|
||||
let thinkingDuration: number | null = null
|
||||
|
||||
try {
|
||||
await api.streamChatMessage(
|
||||
{ model: selectedModel || 'llama3.2', messages: chatMessages, stream: true },
|
||||
(chunkContent, chunkThinking, done) => {
|
||||
if (chunkThinking.length > 0 && thinkingStartTime === null) {
|
||||
thinkingStartTime = Date.now()
|
||||
}
|
||||
if (isFirstChunk) {
|
||||
isFirstChunk = false
|
||||
setIsStreamingResponse(false)
|
||||
|
|
@ -248,22 +258,27 @@ export default function Chat({
|
|||
timestamp: new Date(),
|
||||
isStreaming: true,
|
||||
isThinking: chunkThinking.length > 0 && chunkContent.length === 0,
|
||||
thinkingDuration: undefined,
|
||||
},
|
||||
])
|
||||
} else {
|
||||
if (isThinkingPhase && chunkContent.length > 0) {
|
||||
isThinkingPhase = false
|
||||
if (thinkingStartTime !== null) {
|
||||
thinkingDuration = Math.max(1, Math.round((Date.now() - thinkingStartTime) / 1000))
|
||||
}
|
||||
}
|
||||
setMessages((prev) =>
|
||||
prev.map((m) =>
|
||||
m.id === assistantMsgId
|
||||
? {
|
||||
...m,
|
||||
content: m.content + chunkContent,
|
||||
thinking: (m.thinking ?? '') + chunkThinking,
|
||||
isStreaming: !done,
|
||||
isThinking: isThinkingPhase,
|
||||
}
|
||||
...m,
|
||||
content: m.content + chunkContent,
|
||||
thinking: (m.thinking ?? '') + chunkThinking,
|
||||
isStreaming: !done,
|
||||
isThinking: isThinkingPhase,
|
||||
thinkingDuration: thinkingDuration ?? undefined,
|
||||
}
|
||||
: m
|
||||
)
|
||||
)
|
||||
|
|
@ -391,6 +406,7 @@ export default function Chat({
|
|||
chatSuggestions={chatSuggestions}
|
||||
chatSuggestionsEnabled={suggestionsEnabled}
|
||||
chatSuggestionsLoading={chatSuggestionsLoading}
|
||||
rewriteModelAvailable={rewriteModelAvailable}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ export interface ChatMessage {
|
|||
isStreaming?: boolean
|
||||
thinking?: string
|
||||
isThinking?: boolean
|
||||
thinkingDuration?: number
|
||||
}
|
||||
|
||||
export interface ChatSession {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user