feat(AI): thinking and response streaming

This commit is contained in:
Jake Turner 2026-02-19 05:18:20 +00:00 committed by Jake Turner
parent 16ce1e2945
commit 98b65c421c
8 changed files with 255 additions and 13 deletions

View File

@ -24,7 +24,7 @@ export default class OllamaController {
}) })
} }
async chat({ request }: HttpContext) { async chat({ request, response }: HttpContext) {
const reqData = await request.validateUsing(chatSchema) const reqData = await request.validateUsing(chatSchema)
// If there are no system messages in the chat inject system prompts // If there are no system messages in the chat inject system prompts
@ -73,7 +73,34 @@ export default class OllamaController {
} }
} }
return await this.ollamaService.chat(reqData) // Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// SSE streaming path
response.response.setHeader('Content-Type', 'text/event-stream')
response.response.setHeader('Cache-Control', 'no-cache')
response.response.setHeader('Connection', 'keep-alive')
response.response.flushHeaders()
try {
const stream = await this.ollamaService.chatStream({ ...reqData, think })
for await (const chunk of stream) {
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
}
} catch (error) {
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
} finally {
response.response.end()
}
return
}
// Non-streaming (legacy) path
return await this.ollamaService.chat({ ...reqData, think })
} }
async deleteModel({ request }: HttpContext) { async deleteModel({ request }: HttpContext) {

View File

@ -134,6 +134,30 @@ export class OllamaService {
}) })
} }
/**
 * Opens a streaming chat session with Ollama and returns the async
 * iterable of response chunks. Mirrors chat() but forces stream mode on.
 *
 * @param chatRequest - Chat payload (model, messages, think, ...); any
 *                      stream flag it carries is overridden to true.
 * @throws Error when the Ollama client could not be initialized.
 */
public async chatStream(chatRequest: ChatRequest) {
  await this._ensureDependencies()
  if (this.ollama == null) {
    throw new Error('Ollama client is not initialized.')
  }
  // Force the streaming overload regardless of the incoming request flag.
  const streamingRequest = { ...chatRequest, stream: true as const }
  return this.ollama.chat(streamingRequest)
}
/**
 * Checks whether the given model advertises the "thinking" capability.
 *
 * @param modelName - Name of the Ollama model to inspect via the show API.
 * @returns true when the model's capability list includes 'thinking',
 *          false otherwise (including when no capability list is reported).
 * @throws Error when the Ollama client could not be initialized.
 */
public async checkModelHasThinking(modelName: string): Promise<boolean> {
  await this._ensureDependencies()
  if (!this.ollama) {
    throw new Error('Ollama client is not initialized.')
  }
  const modelInfo = await this.ollama.show({
    model: modelName,
  })
  // Some models / older Ollama servers omit the capabilities field from
  // the show response; treat a missing list as "no thinking support"
  // rather than throwing on .includes of undefined.
  return modelInfo.capabilities?.includes('thinking') ?? false
}
public async deleteModel(modelName: string) { public async deleteModel(modelName: string) {
await this._ensureDependencies() await this._ensureDependencies()
if (!this.ollama) { if (!this.ollama) {

View File

@ -1,5 +1,16 @@
# Release Notes # Release Notes
## Unreleased
### Features
- **AI Assistant**: Added support for showing the reasoning stream for models with thinking capabilities
- **AI Assistant**: Added support for response streaming for improved UX
### Bug Fixes
### Improvements
## Version 1.25.2 - February 18, 2026 ## Version 1.25.2 - February 18, 2026
### Features ### Features

View File

@ -15,6 +15,27 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) {
message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800' message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800'
)} )}
> >
{message.isThinking && message.thinking && (
<div className="mb-3 rounded border border-amber-200 bg-amber-50 px-3 py-2 text-xs">
<div className="mb-1 flex items-center gap-1.5 font-medium text-amber-700">
<span>Reasoning</span>
<span className="h-1.5 w-1.5 rounded-full bg-amber-500 animate-pulse inline-block" />
</div>
<div className="prose prose-xs max-w-none text-amber-900/80 max-h-32 overflow-y-auto">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
</div>
</div>
)}
{!message.isThinking && message.thinking && (
<details className="mb-3 rounded border border-gray-200 bg-gray-50 text-xs">
<summary className="cursor-pointer px-3 py-2 font-medium text-gray-500 hover:text-gray-700 select-none">
Reasoning
</summary>
<div className="px-3 pb-3 prose prose-xs max-w-none text-gray-600 max-h-48 overflow-y-auto border-t border-gray-200 pt-2">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
</div>
</details>
)}
<div <div
className={classNames( className={classNames(
'break-words', 'break-words',

View File

@ -1,4 +1,4 @@
import { useState, useCallback, useEffect } from 'react' import { useState, useCallback, useEffect, useRef } from 'react'
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
import ChatSidebar from './ChatSidebar' import ChatSidebar from './ChatSidebar'
import ChatInterface from './ChatInterface' import ChatInterface from './ChatInterface'
@ -15,6 +15,7 @@ interface ChatProps {
isInModal?: boolean isInModal?: boolean
onClose?: () => void onClose?: () => void
suggestionsEnabled?: boolean suggestionsEnabled?: boolean
streamingEnabled?: boolean
} }
export default function Chat({ export default function Chat({
@ -22,12 +23,15 @@ export default function Chat({
isInModal, isInModal,
onClose, onClose,
suggestionsEnabled = false, suggestionsEnabled = false,
streamingEnabled = true,
}: ChatProps) { }: ChatProps) {
const queryClient = useQueryClient() const queryClient = useQueryClient()
const { openModal, closeAllModals } = useModals() const { openModal, closeAllModals } = useModals()
const [activeSessionId, setActiveSessionId] = useState<string | null>(null) const [activeSessionId, setActiveSessionId] = useState<string | null>(null)
const [messages, setMessages] = useState<ChatMessage[]>([]) const [messages, setMessages] = useState<ChatMessage[]>([])
const [selectedModel, setSelectedModel] = useState<string>('') const [selectedModel, setSelectedModel] = useState<string>('')
const [isStreamingResponse, setIsStreamingResponse] = useState(false)
const streamAbortRef = useRef<AbortController | null>(null)
// Fetch all sessions // Fetch all sessions
const { data: sessions = [] } = useQuery({ const { data: sessions = [] } = useQuery({
@ -209,16 +213,117 @@ export default function Chat({
// Save user message to backend // Save user message to backend
await api.addChatMessage(sessionId, 'user', content) await api.addChatMessage(sessionId, 'user', content)
// Send chat request using mutation const chatMessages = [
chatMutation.mutate({ ...messages.map((m) => ({ role: m.role, content: m.content })),
model: selectedModel || 'llama3.2', { role: 'user' as const, content },
messages: [ ]
...messages.map((m) => ({ role: m.role, content: m.content })),
{ role: 'user', content }, if (streamingEnabled !== false) {
], // Streaming path
}) const abortController = new AbortController()
streamAbortRef.current = abortController
setIsStreamingResponse(true)
const assistantMsgId = `msg-${Date.now()}-assistant`
let isFirstChunk = true
let fullContent = ''
let thinkingContent = ''
let isThinkingPhase = true
try {
await api.streamChatMessage(
{ model: selectedModel || 'llama3.2', messages: chatMessages, stream: true },
(chunkContent, chunkThinking, done) => {
if (isFirstChunk) {
isFirstChunk = false
setIsStreamingResponse(false)
setMessages((prev) => [
...prev,
{
id: assistantMsgId,
role: 'assistant',
content: chunkContent,
thinking: chunkThinking,
timestamp: new Date(),
isStreaming: true,
isThinking: chunkThinking.length > 0 && chunkContent.length === 0,
},
])
} else {
if (isThinkingPhase && chunkContent.length > 0) {
isThinkingPhase = false
}
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId
? {
...m,
content: m.content + chunkContent,
thinking: (m.thinking ?? '') + chunkThinking,
isStreaming: !done,
isThinking: isThinkingPhase,
}
: m
)
)
}
fullContent += chunkContent
thinkingContent += chunkThinking
},
abortController.signal
)
} catch (error: any) {
if (error?.name !== 'AbortError') {
setMessages((prev) => {
const hasAssistantMsg = prev.some((m) => m.id === assistantMsgId)
if (hasAssistantMsg) {
return prev.map((m) =>
m.id === assistantMsgId ? { ...m, isStreaming: false } : m
)
}
return [
...prev,
{
id: assistantMsgId,
role: 'assistant',
content: 'Sorry, there was an error processing your request. Please try again.',
timestamp: new Date(),
},
]
})
}
} finally {
setIsStreamingResponse(false)
streamAbortRef.current = null
}
if (fullContent && sessionId) {
// Ensure the streaming cursor is removed
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId ? { ...m, isStreaming: false } : m
)
)
await api.addChatMessage(sessionId, 'assistant', fullContent)
const currentSession = sessions.find((s) => s.id === sessionId)
if (currentSession && currentSession.title === 'New Chat') {
const newTitle = content.slice(0, 50) + (content.length > 50 ? '...' : '')
await api.updateChatSession(sessionId, { title: newTitle })
queryClient.invalidateQueries({ queryKey: ['chatSessions'] })
}
}
} else {
// Non-streaming (legacy) path
chatMutation.mutate({
model: selectedModel || 'llama3.2',
messages: chatMessages,
})
}
}, },
[activeSessionId, messages, selectedModel, chatMutation, queryClient] [activeSessionId, messages, selectedModel, chatMutation, queryClient, streamingEnabled, sessions]
) )
return ( return (
@ -282,7 +387,7 @@ export default function Chat({
<ChatInterface <ChatInterface
messages={messages} messages={messages}
onSendMessage={handleSendMessage} onSendMessage={handleSendMessage}
isLoading={chatMutation.isPending} isLoading={isStreamingResponse || chatMutation.isPending}
chatSuggestions={chatSuggestions} chatSuggestions={chatSuggestions}
chatSuggestionsEnabled={suggestionsEnabled} chatSuggestionsEnabled={suggestionsEnabled}
chatSuggestionsLoading={chatSuggestionsLoading} chatSuggestionsLoading={chatSuggestionsLoading}

View File

@ -212,6 +212,57 @@ class API {
})() })()
} }
/**
 * POSTs a chat request and consumes the server-sent-event response
 * stream, invoking onChunk once per parsed event.
 *
 * Uses fetch rather than the axios client because axios does not expose
 * the response body as a ReadableStream in the browser.
 *
 * @param chatRequest - Chat payload; stream is forced to true.
 * @param onChunk - Called with (content, thinking, done) for each event.
 * @param signal - Optional AbortSignal to cancel the in-flight stream.
 * @throws Error on a non-OK HTTP status, a missing response body, or a
 *         server-side error event in the stream.
 */
async streamChatMessage(
  chatRequest: OllamaChatRequest,
  onChunk: (content: string, thinking: string, done: boolean) => void,
  signal?: AbortSignal
): Promise<void> {
  const response = await fetch('/api/ollama/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ ...chatRequest, stream: true }),
    signal,
  })
  if (!response.ok || !response.body) {
    throw new Error(`HTTP error: ${response.status}`)
  }
  const reader = response.body.getReader()
  const decoder = new TextDecoder()
  let buffer = ''
  // Parses a single SSE line; ignores non-data and malformed lines.
  const handleLine = (line: string): void => {
    if (!line.startsWith('data: ')) return
    let data: { error?: unknown; message?: { content?: string; thinking?: string }; done?: boolean }
    try {
      data = JSON.parse(line.slice(6))
    } catch {
      return // skip malformed chunks
    }
    if (data.error) throw new Error('The model encountered an error. Please try again.')
    onChunk(data.message?.content ?? '', data.message?.thinking ?? '', data.done ?? false)
  }
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      buffer += decoder.decode(value, { stream: true })
      const lines = buffer.split('\n')
      // Keep the last (possibly partial) line in the buffer for the next read.
      buffer = lines.pop() || ''
      for (const line of lines) handleLine(line)
    }
    // Flush any bytes the decoder buffered (multi-byte sequences split
    // across reads) and any final line sent without a trailing newline —
    // previously this tail was silently dropped.
    buffer += decoder.decode()
    if (buffer) handleLine(buffer)
  } finally {
    reader.releaseLock()
  }
}
async getBenchmarkResults() { async getBenchmarkResults() {
return catchInternal(async () => { return catchInternal(async () => {
const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results') const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results')

View File

@ -4,6 +4,8 @@ export interface ChatMessage {
content: string content: string
timestamp: Date timestamp: Date
isStreaming?: boolean isStreaming?: boolean
thinking?: string
isThinking?: boolean
} }
export interface ChatSession { export interface ChatSession {

View File

@ -14,6 +14,7 @@ export type NomadOllamaModelTag = {
context: string context: string
input: string input: string
cloud: boolean cloud: boolean
thinking: boolean
} }
export type NomadOllamaModelAPIResponse = { export type NomadOllamaModelAPIResponse = {