feat(AI): thinking and response streaming

This commit is contained in:
Jake Turner 2026-02-19 05:18:20 +00:00 committed by Jake Turner
parent 16ce1e2945
commit 98b65c421c
8 changed files with 255 additions and 13 deletions

View File

@ -24,7 +24,7 @@ export default class OllamaController {
}) })
} }
async chat({ request }: HttpContext) { async chat({ request, response }: HttpContext) {
const reqData = await request.validateUsing(chatSchema) const reqData = await request.validateUsing(chatSchema)
// If there are no system messages in the chat inject system prompts // If there are no system messages in the chat inject system prompts
@ -73,7 +73,34 @@ export default class OllamaController {
} }
} }
return await this.ollamaService.chat(reqData) // Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// SSE streaming path
response.response.setHeader('Content-Type', 'text/event-stream')
response.response.setHeader('Cache-Control', 'no-cache')
response.response.setHeader('Connection', 'keep-alive')
response.response.flushHeaders()
try {
const stream = await this.ollamaService.chatStream({ ...reqData, think })
for await (const chunk of stream) {
response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
}
} catch (error) {
response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
} finally {
response.response.end()
}
return
}
// Non-streaming (legacy) path
return await this.ollamaService.chat({ ...reqData, think })
} }
async deleteModel({ request }: HttpContext) { async deleteModel({ request }: HttpContext) {

View File

@ -134,6 +134,30 @@ export class OllamaService {
}) })
} }
/**
 * Opens a streaming chat session with Ollama and returns the async
 * iterable of response chunks. Mirrors chat() but forces stream mode on.
 *
 * @param chatRequest - Chat payload (model, messages, think, ...); any
 *                      stream flag it carries is overridden to true.
 * @throws Error when the Ollama client could not be initialized.
 */
public async chatStream(chatRequest: ChatRequest) {
  await this._ensureDependencies()
  if (this.ollama == null) {
    throw new Error('Ollama client is not initialized.')
  }
  // Force the streaming overload regardless of the incoming request flag.
  const streamingRequest = { ...chatRequest, stream: true as const }
  return this.ollama.chat(streamingRequest)
}
/**
 * Checks whether the given model advertises the "thinking" capability.
 *
 * @param modelName - Name of the Ollama model to inspect via the show API.
 * @returns true when the model's capability list includes 'thinking',
 *          false otherwise (including when no capability list is reported).
 * @throws Error when the Ollama client could not be initialized.
 */
public async checkModelHasThinking(modelName: string): Promise<boolean> {
  await this._ensureDependencies()
  if (!this.ollama) {
    throw new Error('Ollama client is not initialized.')
  }
  const modelInfo = await this.ollama.show({
    model: modelName,
  })
  // Some models / older Ollama servers omit the capabilities field from
  // the show response; treat a missing list as "no thinking support"
  // rather than throwing on .includes of undefined.
  return modelInfo.capabilities?.includes('thinking') ?? false
}
public async deleteModel(modelName: string) { public async deleteModel(modelName: string) {
await this._ensureDependencies() await this._ensureDependencies()
if (!this.ollama) { if (!this.ollama) {

View File

@ -1,5 +1,16 @@
# Release Notes # Release Notes
## Unreleased
### Features
- **AI Assistant**: Added support for showing the reasoning stream for models with thinking capabilities
- **AI Assistant**: Added support for response streaming for improved UX
### Bug Fixes
### Improvements
## Version 1.25.2 - February 18, 2026 ## Version 1.25.2 - February 18, 2026
### Features ### Features

View File

@ -15,6 +15,27 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) {
message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800' message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800'
)} )}
> >
{message.isThinking && message.thinking && (
<div className="mb-3 rounded border border-amber-200 bg-amber-50 px-3 py-2 text-xs">
<div className="mb-1 flex items-center gap-1.5 font-medium text-amber-700">
<span>Reasoning</span>
<span className="h-1.5 w-1.5 rounded-full bg-amber-500 animate-pulse inline-block" />
</div>
<div className="prose prose-xs max-w-none text-amber-900/80 max-h-32 overflow-y-auto">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
</div>
</div>
)}
{!message.isThinking && message.thinking && (
<details className="mb-3 rounded border border-gray-200 bg-gray-50 text-xs">
<summary className="cursor-pointer px-3 py-2 font-medium text-gray-500 hover:text-gray-700 select-none">
Reasoning
</summary>
<div className="px-3 pb-3 prose prose-xs max-w-none text-gray-600 max-h-48 overflow-y-auto border-t border-gray-200 pt-2">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
</div>
</details>
)}
<div <div
className={classNames( className={classNames(
'break-words', 'break-words',

View File

@ -1,4 +1,4 @@
import { useState, useCallback, useEffect } from 'react' import { useState, useCallback, useEffect, useRef } from 'react'
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
import ChatSidebar from './ChatSidebar' import ChatSidebar from './ChatSidebar'
import ChatInterface from './ChatInterface' import ChatInterface from './ChatInterface'
@ -15,6 +15,7 @@ interface ChatProps {
isInModal?: boolean isInModal?: boolean
onClose?: () => void onClose?: () => void
suggestionsEnabled?: boolean suggestionsEnabled?: boolean
streamingEnabled?: boolean
} }
export default function Chat({ export default function Chat({
@ -22,12 +23,15 @@ export default function Chat({
isInModal, isInModal,
onClose, onClose,
suggestionsEnabled = false, suggestionsEnabled = false,
streamingEnabled = true,
}: ChatProps) { }: ChatProps) {
const queryClient = useQueryClient() const queryClient = useQueryClient()
const { openModal, closeAllModals } = useModals() const { openModal, closeAllModals } = useModals()
const [activeSessionId, setActiveSessionId] = useState<string | null>(null) const [activeSessionId, setActiveSessionId] = useState<string | null>(null)
const [messages, setMessages] = useState<ChatMessage[]>([]) const [messages, setMessages] = useState<ChatMessage[]>([])
const [selectedModel, setSelectedModel] = useState<string>('') const [selectedModel, setSelectedModel] = useState<string>('')
const [isStreamingResponse, setIsStreamingResponse] = useState(false)
const streamAbortRef = useRef<AbortController | null>(null)
// Fetch all sessions // Fetch all sessions
const { data: sessions = [] } = useQuery({ const { data: sessions = [] } = useQuery({
@ -209,16 +213,117 @@ export default function Chat({
// Save user message to backend // Save user message to backend
await api.addChatMessage(sessionId, 'user', content) await api.addChatMessage(sessionId, 'user', content)
// Send chat request using mutation const chatMessages = [
chatMutation.mutate({ ...messages.map((m) => ({ role: m.role, content: m.content })),
model: selectedModel || 'llama3.2', { role: 'user' as const, content },
messages: [ ]
...messages.map((m) => ({ role: m.role, content: m.content })),
{ role: 'user', content }, if (streamingEnabled !== false) {
], // Streaming path
}) const abortController = new AbortController()
streamAbortRef.current = abortController
setIsStreamingResponse(true)
const assistantMsgId = `msg-${Date.now()}-assistant`
let isFirstChunk = true
let fullContent = ''
let thinkingContent = ''
let isThinkingPhase = true
try {
await api.streamChatMessage(
{ model: selectedModel || 'llama3.2', messages: chatMessages, stream: true },
(chunkContent, chunkThinking, done) => {
if (isFirstChunk) {
isFirstChunk = false
setIsStreamingResponse(false)
setMessages((prev) => [
...prev,
{
id: assistantMsgId,
role: 'assistant',
content: chunkContent,
thinking: chunkThinking,
timestamp: new Date(),
isStreaming: true,
isThinking: chunkThinking.length > 0 && chunkContent.length === 0,
},
])
} else {
if (isThinkingPhase && chunkContent.length > 0) {
isThinkingPhase = false
}
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId
? {
...m,
content: m.content + chunkContent,
thinking: (m.thinking ?? '') + chunkThinking,
isStreaming: !done,
isThinking: isThinkingPhase,
}
: m
)
)
}
fullContent += chunkContent
thinkingContent += chunkThinking
},
abortController.signal
)
} catch (error: any) {
if (error?.name !== 'AbortError') {
setMessages((prev) => {
const hasAssistantMsg = prev.some((m) => m.id === assistantMsgId)
if (hasAssistantMsg) {
return prev.map((m) =>
m.id === assistantMsgId ? { ...m, isStreaming: false } : m
)
}
return [
...prev,
{
id: assistantMsgId,
role: 'assistant',
content: 'Sorry, there was an error processing your request. Please try again.',
timestamp: new Date(),
},
]
})
}
} finally {
setIsStreamingResponse(false)
streamAbortRef.current = null
}
if (fullContent && sessionId) {
// Ensure the streaming cursor is removed
setMessages((prev) =>
prev.map((m) =>
m.id === assistantMsgId ? { ...m, isStreaming: false } : m
)
)
await api.addChatMessage(sessionId, 'assistant', fullContent)
const currentSession = sessions.find((s) => s.id === sessionId)
if (currentSession && currentSession.title === 'New Chat') {
const newTitle = content.slice(0, 50) + (content.length > 50 ? '...' : '')
await api.updateChatSession(sessionId, { title: newTitle })
queryClient.invalidateQueries({ queryKey: ['chatSessions'] })
}
}
} else {
// Non-streaming (legacy) path
chatMutation.mutate({
model: selectedModel || 'llama3.2',
messages: chatMessages,
})
}
}, },
[activeSessionId, messages, selectedModel, chatMutation, queryClient] [activeSessionId, messages, selectedModel, chatMutation, queryClient, streamingEnabled, sessions]
) )
return ( return (
@ -282,7 +387,7 @@ export default function Chat({
<ChatInterface <ChatInterface
messages={messages} messages={messages}
onSendMessage={handleSendMessage} onSendMessage={handleSendMessage}
isLoading={chatMutation.isPending} isLoading={isStreamingResponse || chatMutation.isPending}
chatSuggestions={chatSuggestions} chatSuggestions={chatSuggestions}
chatSuggestionsEnabled={suggestionsEnabled} chatSuggestionsEnabled={suggestionsEnabled}
chatSuggestionsLoading={chatSuggestionsLoading} chatSuggestionsLoading={chatSuggestionsLoading}

View File

@ -212,6 +212,57 @@ class API {
})() })()
} }
/**
 * POSTs a chat request and consumes the server-sent-event response
 * stream, invoking onChunk once per parsed event.
 *
 * Uses fetch rather than the axios client because axios does not expose
 * the response body as a ReadableStream in the browser.
 *
 * @param chatRequest - Chat payload; stream is forced to true.
 * @param onChunk - Called with (content, thinking, done) for each event.
 * @param signal - Optional AbortSignal to cancel the in-flight stream.
 * @throws Error on a non-OK HTTP status, a missing response body, or a
 *         server-side error event in the stream.
 */
async streamChatMessage(
  chatRequest: OllamaChatRequest,
  onChunk: (content: string, thinking: string, done: boolean) => void,
  signal?: AbortSignal
): Promise<void> {
  const response = await fetch('/api/ollama/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ ...chatRequest, stream: true }),
    signal,
  })
  if (!response.ok || !response.body) {
    throw new Error(`HTTP error: ${response.status}`)
  }
  const reader = response.body.getReader()
  const decoder = new TextDecoder()
  let buffer = ''
  // Parses a single SSE line; ignores non-data and malformed lines.
  const handleLine = (line: string): void => {
    if (!line.startsWith('data: ')) return
    let data: { error?: unknown; message?: { content?: string; thinking?: string }; done?: boolean }
    try {
      data = JSON.parse(line.slice(6))
    } catch {
      return // skip malformed chunks
    }
    if (data.error) throw new Error('The model encountered an error. Please try again.')
    onChunk(data.message?.content ?? '', data.message?.thinking ?? '', data.done ?? false)
  }
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      buffer += decoder.decode(value, { stream: true })
      const lines = buffer.split('\n')
      // Keep the last (possibly partial) line in the buffer for the next read.
      buffer = lines.pop() || ''
      for (const line of lines) handleLine(line)
    }
    // Flush any bytes the decoder buffered (multi-byte sequences split
    // across reads) and any final line sent without a trailing newline —
    // previously this tail was silently dropped.
    buffer += decoder.decode()
    if (buffer) handleLine(buffer)
  } finally {
    reader.releaseLock()
  }
}
async getBenchmarkResults() { async getBenchmarkResults() {
return catchInternal(async () => { return catchInternal(async () => {
const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results') const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results')

View File

@ -4,6 +4,8 @@ export interface ChatMessage {
content: string content: string
timestamp: Date timestamp: Date
isStreaming?: boolean isStreaming?: boolean
thinking?: string
isThinking?: boolean
} }
export interface ChatSession { export interface ChatSession {

View File

@ -14,6 +14,7 @@ export type NomadOllamaModelTag = {
context: string context: string
input: string input: string
cloud: boolean cloud: boolean
thinking: boolean
} }
export type NomadOllamaModelAPIResponse = { export type NomadOllamaModelAPIResponse = {