From 98b65c421cf4ca9dc739a8d46877945682092330 Mon Sep 17 00:00:00 2001
From: Jake Turner <jturner@cosmistack.com>
Date: Thu, 19 Feb 2026 05:18:20 +0000
Subject: [PATCH] feat(AI): thinking and response streaming

---
 admin/app/controllers/ollama_controller.ts    |  31 ++++-
 admin/app/services/ollama_service.ts          |  24 ++++
 admin/docs/release-notes.md                   |  11 ++
 .../components/chat/ChatMessageBubble.tsx     |  21 +++
 admin/inertia/components/chat/index.tsx       | 127 ++++++++++++++++--
 admin/inertia/lib/api.ts                      |  51 +++++++
 admin/types/chat.ts                           |   2 +
 admin/types/ollama.ts                         |   1 +
 8 files changed, 255 insertions(+), 13 deletions(-)
diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts
index e77ed56..d163d3b 100644
--- a/admin/app/controllers/ollama_controller.ts
+++ b/admin/app/controllers/ollama_controller.ts
@@ -24,7 +24,7 @@ export default class OllamaController {
     })
   }
 
-  async chat({ request }: HttpContext) {
+  async chat({ request, response }: HttpContext) {
     const reqData = await request.validateUsing(chatSchema)
 
     // If there are no system messages in the chat inject system prompts
@@ -73,7 +73,34 @@ export default class OllamaController {
       }
     }
 
-    return await this.ollamaService.chat(reqData)
+    // Check whether the model supports the "thinking" capability for enhanced response generation.
+    // gpt-oss models require a string level for "think" ('medium' here) rather than a boolean; see https://docs.ollama.com/api/chat
+    const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
+    const think: boolean | 'medium' = thinkingCapability ? (reqData.model.startsWith('gpt-oss') ? 'medium' : true) : false
+    
+    if (reqData.stream) {
+      logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
+      // SSE streaming path
+      response.response.setHeader('Content-Type', 'text/event-stream')
+      response.response.setHeader('Cache-Control', 'no-cache')
+      response.response.setHeader('Connection', 'keep-alive')
+      response.response.flushHeaders()
+
+      try {
+        const stream = await this.ollamaService.chatStream({ ...reqData, think })
+        for await (const chunk of stream) {
+          response.response.write(`data: ${JSON.stringify(chunk)}\n\n`)
+        }
+      } catch (error) {
+        response.response.write(`data: ${JSON.stringify({ error: true })}\n\n`)
+      } finally {
+        response.response.end()
+      }
+      return
+    }
+
+    // Non-streaming (legacy) path
+    return await this.ollamaService.chat({ ...reqData, think })
   }
 
   async deleteModel({ request }: HttpContext) {
diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts
index 2793152..d9022aa 100644
--- a/admin/app/services/ollama_service.ts
+++ b/admin/app/services/ollama_service.ts
@@ -134,6 +134,30 @@ export class OllamaService {
     })
   }
 
+  /** Sends `chatRequest` to the Ollama client with streaming forced on and returns the client's result. */
+  public async chatStream(chatRequest: ChatRequest) {
+    await this._ensureDependencies()
+    const client = this.ollama
+    if (!client) {
+      throw new Error('Ollama client is not initialized.')
+    }
+    // `stream: true` is spread last so it overrides any `stream` value carried by the request.
+    return await client.chat({ ...chatRequest, stream: true })
+  }
+
+  /** Reports whether the Ollama `show` response lists the 'thinking' capability for `modelName`. */
+  public async checkModelHasThinking(modelName: string): Promise<boolean> {
+    await this._ensureDependencies()
+    if (!this.ollama) {
+      throw new Error('Ollama client is not initialized.')
+    }
+
+    const modelInfo = await this.ollama.show({ model: modelName })
+
+    // A response without `capabilities` (presumably older Ollama servers) means "no thinking", not a TypeError.
+    return modelInfo.capabilities?.includes('thinking') ?? false
+  }
+
   public async deleteModel(modelName: string) {
     await this._ensureDependencies()
     if (!this.ollama) {
diff --git a/admin/docs/release-notes.md b/admin/docs/release-notes.md
index 8c54c27..0d98837 100644
--- a/admin/docs/release-notes.md
+++ b/admin/docs/release-notes.md
@@ -1,5 +1,16 @@
 # Release Notes
 
+## Unreleased
+
+### Features
+- **AI Assistant**: Added support for displaying the reasoning stream of models with thinking capabilities
+- **AI Assistant**: Added support for response streaming for improved UX
+
+### Bug Fixes
+
+### Improvements
+
+
 ## Version 1.25.2 - February 18, 2026
 
 ### Features
diff --git a/admin/inertia/components/chat/ChatMessageBubble.tsx b/admin/inertia/components/chat/ChatMessageBubble.tsx
index a931c0c..aca040e 100644
--- a/admin/inertia/components/chat/ChatMessageBubble.tsx
+++ b/admin/inertia/components/chat/ChatMessageBubble.tsx
@@ -15,6 +15,27 @@ export default function ChatMessageBubble({ message }: ChatMessageBubbleProps) {
         message.role === 'user' ? 'bg-desert-green text-white' : 'bg-gray-100 text-gray-800'
       )}
     >
+      {message.isThinking && message.thinking && (
+        <div className="mb-3 rounded border border-amber-200 bg-amber-50 px-3 py-2 text-xs">
+          <div className="mb-1 flex items-center gap-1.5 font-medium text-amber-700">
+            <span>Reasoning</span>
+            <span className="h-1.5 w-1.5 rounded-full bg-amber-500 animate-pulse inline-block" />
+          </div>
+          <div className="prose prose-xs max-w-none text-amber-900/80 max-h-32 overflow-y-auto">
+            <ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
+          </div>
+        </div>
+      )}
+      {!message.isThinking && message.thinking && (
+        <details className="mb-3 rounded border border-gray-200 bg-gray-50 text-xs">
+          <summary className="cursor-pointer px-3 py-2 font-medium text-gray-500 hover:text-gray-700 select-none">
+            Reasoning
+          </summary>
+          <div className="px-3 pb-3 prose prose-xs max-w-none text-gray-600 max-h-48 overflow-y-auto border-t border-gray-200 pt-2">
+            <ReactMarkdown remarkPlugins={[remarkGfm]}>{message.thinking}</ReactMarkdown>
+          </div>
+        </details>
+      )}
       <div
         className={classNames(
           'break-words',
diff --git a/admin/inertia/components/chat/index.tsx b/admin/inertia/components/chat/index.tsx
index 1d6cb5f..2e37baf 100644
--- a/admin/inertia/components/chat/index.tsx
+++ b/admin/inertia/components/chat/index.tsx
@@ -1,4 +1,4 @@
-import { useState, useCallback, useEffect } from 'react'
+import { useState, useCallback, useEffect, useRef } from 'react'
 import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
 import ChatSidebar from './ChatSidebar'
 import ChatInterface from './ChatInterface'
@@ -15,6 +15,7 @@ interface ChatProps {
   isInModal?: boolean
   onClose?: () => void
   suggestionsEnabled?: boolean
+  streamingEnabled?: boolean
 }
 
 export default function Chat({
@@ -22,12 +23,15 @@ export default function Chat({
   isInModal,
   onClose,
   suggestionsEnabled = false,
+  streamingEnabled = true,
 }: ChatProps) {
   const queryClient = useQueryClient()
   const { openModal, closeAllModals } = useModals()
   const [activeSessionId, setActiveSessionId] = useState<string | null>(null)
   const [messages, setMessages] = useState<ChatMessage[]>([])
   const [selectedModel, setSelectedModel] = useState<string>('')
+  const [isStreamingResponse, setIsStreamingResponse] = useState(false)
+  const streamAbortRef = useRef<AbortController | null>(null)
 
   // Fetch all sessions
   const { data: sessions = [] } = useQuery({
@@ -209,16 +213,117 @@ export default function Chat({
       // Save user message to backend
       await api.addChatMessage(sessionId, 'user', content)
 
-      // Send chat request using mutation
-      chatMutation.mutate({
-        model: selectedModel || 'llama3.2',
-        messages: [
-          ...messages.map((m) => ({ role: m.role, content: m.content })),
-          { role: 'user', content },
-        ],
-      })
+      const chatMessages = [
+        ...messages.map((m) => ({ role: m.role, content: m.content })),
+        { role: 'user' as const, content },
+      ]
+
+      if (streamingEnabled !== false) {
+        // Streaming path
+        const abortController = new AbortController()
+        streamAbortRef.current = abortController
+
+        setIsStreamingResponse(true)
+
+        const assistantMsgId = `msg-${Date.now()}-assistant`
+        let isFirstChunk = true
+        let fullContent = ''
+        let thinkingContent = ''
+        let isThinkingPhase = true
+
+        try {
+          await api.streamChatMessage(
+            { model: selectedModel || 'llama3.2', messages: chatMessages, stream: true },
+            (chunkContent, chunkThinking, done) => {
+              if (isFirstChunk) {
+                isFirstChunk = false
+                setIsStreamingResponse(false)
+                setMessages((prev) => [
+                  ...prev,
+                  {
+                    id: assistantMsgId,
+                    role: 'assistant',
+                    content: chunkContent,
+                    thinking: chunkThinking,
+                    timestamp: new Date(),
+                    isStreaming: true,
+                    isThinking: chunkThinking.length > 0 && chunkContent.length === 0,
+                  },
+                ])
+              } else {
+                if (isThinkingPhase && chunkContent.length > 0) {
+                  isThinkingPhase = false
+                }
+                setMessages((prev) =>
+                  prev.map((m) =>
+                    m.id === assistantMsgId
+                      ? {
+                          ...m,
+                          content: m.content + chunkContent,
+                          thinking: (m.thinking ?? '') + chunkThinking,
+                          isStreaming: !done,
+                          isThinking: isThinkingPhase,
+                        }
+                      : m
+                  )
+                )
+              }
+              fullContent += chunkContent
+              thinkingContent += chunkThinking
+            },
+            abortController.signal
+          )
+        } catch (error: any) {
+          if (error?.name !== 'AbortError') {
+            setMessages((prev) => {
+              const hasAssistantMsg = prev.some((m) => m.id === assistantMsgId)
+              if (hasAssistantMsg) {
+                return prev.map((m) =>
+                  m.id === assistantMsgId ? { ...m, isStreaming: false } : m
+                )
+              }
+              return [
+                ...prev,
+                {
+                  id: assistantMsgId,
+                  role: 'assistant',
+                  content: 'Sorry, there was an error processing your request. Please try again.',
+                  timestamp: new Date(),
+                },
+              ]
+            })
+          }
+        } finally {
+          setIsStreamingResponse(false)
+          streamAbortRef.current = null
+        }
+
+        if (fullContent && sessionId) {
+          // Ensure the streaming cursor is removed
+          setMessages((prev) =>
+            prev.map((m) =>
+              m.id === assistantMsgId ? { ...m, isStreaming: false } : m
+            )
+          )
+
+          await api.addChatMessage(sessionId, 'assistant', fullContent)
+
+          const currentSession = sessions.find((s) => s.id === sessionId)
+          if (currentSession && currentSession.title === 'New Chat') {
+            const newTitle = content.slice(0, 50) + (content.length > 50 ? '...' : '')
+            await api.updateChatSession(sessionId, { title: newTitle })
+            queryClient.invalidateQueries({ queryKey: ['chatSessions'] })
+          }
+        }
+      } else {
+        // Non-streaming (legacy) path
+        chatMutation.mutate({
+          model: selectedModel || 'llama3.2',
+          messages: chatMessages,
+        })
+      }
     },
-    [activeSessionId, messages, selectedModel, chatMutation, queryClient]
+    [activeSessionId, messages, selectedModel, chatMutation, queryClient, streamingEnabled, sessions]
   )
 
   return (
@@ -282,7 +387,7 @@ export default function Chat({
         <ChatInterface
           messages={messages}
           onSendMessage={handleSendMessage}
-          isLoading={chatMutation.isPending}
+          isLoading={isStreamingResponse || chatMutation.isPending}
           chatSuggestions={chatSuggestions}
           chatSuggestionsEnabled={suggestionsEnabled}
           chatSuggestionsLoading={chatSuggestionsLoading}
diff --git a/admin/inertia/lib/api.ts b/admin/inertia/lib/api.ts
index a786ee8..2ebf780 100644
--- a/admin/inertia/lib/api.ts
+++ b/admin/inertia/lib/api.ts
@@ -212,6 +212,57 @@ class API {
     })()
   }
 
+  /**
+   * POSTs `chatRequest` to `/api/ollama/chat` with `stream: true` and relays the SSE body to `onChunk`.
+   * Every `data: <json>` line yields one call with the chunk's content, thinking text and done flag.
+   * Throws on a non-OK response, a missing body, or a chunk whose `error` field is truthy.
+   */
+  async streamChatMessage(
+    chatRequest: OllamaChatRequest,
+    onChunk: (content: string, thinking: string, done: boolean) => void,
+    signal?: AbortSignal
+  ): Promise<void> {
+    // Axios cannot expose a ReadableStream in the browser, so fetch is used here instead.
+    const response = await fetch('/api/ollama/chat', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ ...chatRequest, stream: true }),
+      signal,
+    })
+    if (!response.ok || !response.body) {
+      throw new Error(`HTTP error: ${response.status}`)
+    }
+    const handleLine = (line: string): void => {
+      if (!line.startsWith('data: ')) return
+      let data: { error?: unknown; message?: { content?: string; thinking?: string }; done?: boolean } | null
+      try {
+        data = JSON.parse(line.slice(6))
+      } catch { return /* skip malformed chunks */ }
+      // Valid JSON that is not an object (e.g. `null`) has nothing to relay; skip it instead of crashing.
+      if (data === null || typeof data !== 'object') return
+      if (data.error) throw new Error('The model encountered an error. Please try again.')
+      onChunk(data.message?.content ?? '', data.message?.thinking ?? '', data.done ?? false)
+    }
+
+    const reader = response.body.getReader()
+    const decoder = new TextDecoder()
+    let buffer = ''
+    try {
+      while (true) {
+        const { done, value } = await reader.read()
+        if (done) break
+        buffer += decoder.decode(value, { stream: true })
+        const lines = buffer.split('\n')
+        buffer = lines.pop() ?? '' // keep the trailing partial line for the next read
+        for (const line of lines) handleLine(line)
+      }
+      // Flush the decoder and process a final event that arrived without a trailing newline.
+      handleLine(buffer + decoder.decode())
+    } finally {
+      reader.releaseLock()
+    }
+  }
+
   async getBenchmarkResults() {
     return catchInternal(async () => {
       const response = await this.client.get<{ results: BenchmarkResult[], total: number }>('/benchmark/results')
diff --git a/admin/types/chat.ts b/admin/types/chat.ts
index 805e999..a3047ef 100644
--- a/admin/types/chat.ts
+++ b/admin/types/chat.ts
@@ -4,6 +4,8 @@ export interface ChatMessage {
   content: string
   timestamp: Date
   isStreaming?: boolean
+  thinking?: string
+  isThinking?: boolean
 }
 
 export interface ChatSession {
diff --git a/admin/types/ollama.ts b/admin/types/ollama.ts
index f7168fe..5d3e7c3 100644
--- a/admin/types/ollama.ts
+++ b/admin/types/ollama.ts
@@ -14,6 +14,7 @@ export type NomadOllamaModelTag = {
   context: string
   input: string
   cloud: boolean
+  thinking: boolean
 }
 
 export type NomadOllamaModelAPIResponse = {