From 98b65c421cf4ca9dc739a8d46877945682092330 Mon Sep 17 00:00:00 2001 From: Jake Turner Date: Thu, 19 Feb 2026 05:18:20 +0000 Subject: [PATCH] feat(AI): thinking and response streaming --- admin/app/controllers/ollama_controller.ts | 31 ++++- admin/app/services/ollama_service.ts | 24 ++++ admin/docs/release-notes.md | 11 ++ .../components/chat/ChatMessageBubble.tsx | 21 +++ admin/inertia/components/chat/index.tsx | 127 ++++++++++++++++-- admin/inertia/lib/api.ts | 51 +++++++ admin/types/chat.ts | 2 + admin/types/ollama.ts | 1 + 8 files changed, 255 insertions(+), 13 deletions(-) diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts index e77ed56..d163d3b 100644 --- a/admin/app/controllers/ollama_controller.ts +++ b/admin/app/controllers/ollama_controller.ts @@ -24,7 +24,7 @@ export default class OllamaController { }) } - async chat({ request }: HttpContext) { + async chat({ request, response }: HttpContext) { const reqData = await request.validateUsing(chatSchema) // If there are no system messages in the chat inject system prompts @@ -73,7 +73,34 @@ export default class OllamaController { } } - return await this.ollamaService.chat(reqData) + // Check if the model supports "thinking" capability for enhanced response generation + // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat + const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model) + const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 
'medium' : true) : false + + if (reqData.stream) { + logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`) + // SSE streaming path + response.response.setHeader('Content-Type', 'text/event-stream') + response.response.setHeader('Cache-Control', 'no-cache') + response.response.setHeader('Connection', 'keep-alive') + response.response.flushHeaders() + + try { + const stream = await this.ollamaService.chatStream({ ...reqData, think }) + for await (const chunk of stream) { + response.response.write(`data: ${JSON.stringify(chunk)}\n\n`) + } + } catch (error) { + response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`) + } finally { + response.response.end() + } + return + } + + // Non-streaming (legacy) path + return await this.ollamaService.chat({ ...reqData, think }) } async deleteModel({ request }: HttpContext) { diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts index 2793152..d9022aa 100644 --- a/admin/app/services/ollama_service.ts +++ b/admin/app/services/ollama_service.ts @@ -134,6 +134,30 @@ export class OllamaService { }) } + public async chatStream(chatRequest: ChatRequest) { + await this._ensureDependencies() + if (!this.ollama) { + throw new Error('Ollama client is not initialized.') + } + return await this.ollama.chat({ + ...chatRequest, + stream: true, + }) + } + + public async checkModelHasThinking(modelName: string): Promise { + await this._ensureDependencies() + if (!this.ollama) { + throw new Error('Ollama client is not initialized.') + } + + const modelInfo = await this.ollama.show({ + model: modelName, + }) + + return modelInfo.capabilities.includes('thinking') + } + public async deleteModel(modelName: string) { await this._ensureDependencies() if (!this.ollama) { diff --git a/admin/docs/release-notes.md b/admin/docs/release-notes.md index 8c54c27..0d98837 100644 --- a/admin/docs/release-notes.md +++ b/admin/docs/release-notes.md @@ 
-1,5 +1,16 @@ # Release Notes +## Unreleased + +### Features +- **AI Assistant**: Added support for showing the reasoning stream for models with thinking capabilities +- **AI Assistant**: Added support for response streaming for improved UX + +### Bug Fixes + +### Improvements + + ## Version 1.25.2 - February 18, 2026 ### Features diff --git a/admin/inertia/components/chat/ChatMessageBubble.tsx b/admin/inertia/components/chat/ChatMessageBubble.tsx index a931c0c..aca040e 100644 --- a/admin/inertia/components/chat/ChatMessageBubble.tsx +++ b/admin/inertia/components/chat/ChatMessageBubble.tsx @@ -15,6 +15,27 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) { message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800' )} > + {message.isThinking && message.thinking && (
+
+ Reasoning + +
+
+ {message.thinking} +
+
+ )} + {!message.isThinking && message.thinking && ( +
+ + Reasoning + +
+ {message.thinking} +
+
+ )}
void suggestionsEnabled?: boolean + streamingEnabled?: boolean } export default function Chat({ @@ -22,12 +23,15 @@ export default function Chat({ isInModal, onClose, suggestionsEnabled = false, + streamingEnabled = true, }: ChatProps) { const queryClient = useQueryClient() const { openModal, closeAllModals } = useModals() const [activeSessionId, setActiveSessionId] = useState(null) const [messages, setMessages] = useState([]) const [selectedModel, setSelectedModel] = useState('') + const [isStreamingResponse, setIsStreamingResponse] = useState(false) + const streamAbortRef = useRef(null) // Fetch all sessions const { data: sessions = [] } = useQuery({ @@ -209,16 +213,117 @@ export default function Chat({ // Save user message to backend await api.addChatMessage(sessionId, 'user', content) - // Send chat request using mutation - chatMutation.mutate({ - model: selectedModel || 'llama3.2', - messages: [ - ...messages.map((m) => ({ role: m.role, content: m.content })), - { role: 'user', content }, - ], - }) + const chatMessages = [ + ...messages.map((m) => ({ role: m.role, content: m.content })), + { role: 'user' as const, content }, + ] + + if (streamingEnabled !== false) { + // Streaming path + const abortController = new AbortController() + streamAbortRef.current = abortController + + setIsStreamingResponse(true) + + const assistantMsgId = `msg-${Date.now()}-assistant` + let isFirstChunk = true + let fullContent = '' + let thinkingContent = '' + let isThinkingPhase = true + + try { + await api.streamChatMessage( + { model: selectedModel || 'llama3.2', messages: chatMessages, stream: true }, + (chunkContent, chunkThinking, done) => { + if (isFirstChunk) { + isFirstChunk = false + setIsStreamingResponse(false) + setMessages((prev) => [ + ...prev, + { + id: assistantMsgId, + role: 'assistant', + content: chunkContent, + thinking: chunkThinking, + timestamp: new Date(), + isStreaming: true, + isThinking: chunkThinking.length > 0 && chunkContent.length === 0, + }, + ]) + 
} else { + if (isThinkingPhase && chunkContent.length > 0) { + isThinkingPhase = false + } + setMessages((prev) => + prev.map((m) => + m.id === assistantMsgId + ? { + ...m, + content: m.content + chunkContent, + thinking: (m.thinking ?? '') + chunkThinking, + isStreaming: !done, + isThinking: isThinkingPhase, + } + : m + ) + ) + } + fullContent += chunkContent + thinkingContent += chunkThinking + }, + abortController.signal + ) + } catch (error: any) { + if (error?.name !== 'AbortError') { + setMessages((prev) => { + const hasAssistantMsg = prev.some((m) => m.id === assistantMsgId) + if (hasAssistantMsg) { + return prev.map((m) => + m.id === assistantMsgId ? { ...m, isStreaming: false } : m + ) + } + return [ + ...prev, + { + id: assistantMsgId, + role: 'assistant', + content: 'Sorry, there was an error processing your request. Please try again.', + timestamp: new Date(), + }, + ] + }) + } + } finally { + setIsStreamingResponse(false) + streamAbortRef.current = null + } + + if (fullContent && sessionId) { + // Ensure the streaming cursor is removed + setMessages((prev) => + prev.map((m) => + m.id === assistantMsgId ? { ...m, isStreaming: false } : m + ) + ) + + await api.addChatMessage(sessionId, 'assistant', fullContent) + + const currentSession = sessions.find((s) => s.id === sessionId) + if (currentSession && currentSession.title === 'New Chat') { + const newTitle = content.slice(0, 50) + (content.length > 50 ? '...' 
: '') + await api.updateChatSession(sessionId, { title: newTitle }) + queryClient.invalidateQueries({ queryKey: ['chatSessions'] }) + } + } + } else { + // Non-streaming (legacy) path + chatMutation.mutate({ + model: selectedModel || 'llama3.2', + messages: chatMessages, + }) + } }, - [activeSessionId, messages, selectedModel, chatMutation, queryClient] + [activeSessionId, messages, selectedModel, chatMutation, queryClient, streamingEnabled, sessions] ) return ( @@ -282,7 +387,7 @@ export default function Chat({ void, + signal?: AbortSignal + ): Promise { + // Axios doesn't support ReadableStream in browser, so need to use fetch + const response = await fetch('/api/ollama/chat', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ ...chatRequest, stream: true }), + signal, + }) + + if (!response.ok || !response.body) { + throw new Error(`HTTP error: ${response.status}`) + } + + const reader = response.body.getReader() + const decoder = new TextDecoder() + let buffer = '' + + try { + while (true) { + const { done, value } = await reader.read() + if (done) break + + buffer += decoder.decode(value, { stream: true }) + const lines = buffer.split('\n') + buffer = lines.pop() || '' + + for (const line of lines) { + if (!line.startsWith('data: ')) continue + let data: any + try { + data = JSON.parse(line.slice(6)) + } catch { continue /* skip malformed chunks */ } + + if (data.error) throw new Error('The model encountered an error. Please try again.') + + onChunk( + data.message?.content ?? '', + data.message?.thinking ?? '', + data.done ?? 
false + ) + } + } + } finally { + reader.releaseLock() + } + } + async getBenchmarkResults() { return catchInternal(async () => { const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results') diff --git a/admin/types/chat.ts b/admin/types/chat.ts index 805e999..a3047ef 100644 --- a/admin/types/chat.ts +++ b/admin/types/chat.ts @@ -4,6 +4,8 @@ export interface ChatMessage { content: string timestamp: Date isStreaming?: boolean + thinking?: string + isThinking?: boolean } export interface ChatSession { diff --git a/admin/types/ollama.ts b/admin/types/ollama.ts index f7168fe..5d3e7c3 100644 --- a/admin/types/ollama.ts +++ b/admin/types/ollama.ts @@ -14,6 +14,7 @@ export type NomadOllamaModelTag = { context: string input: string cloud: boolean + thinking: boolean } export type NomadOllamaModelAPIResponse = {