diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts
index a43f445..3324eab 100644
--- a/admin/app/controllers/ollama_controller.ts
+++ b/admin/app/controllers/ollama_controller.ts
@@ -77,10 +77,10 @@ export default class OllamaController {
           const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
           let trimmedDocs = relevantDocs.slice(0, maxResults)
 
-          // Apply token cap if set (estimate ~4 chars per token)
+          // Apply token cap if set (estimate ~3.5 chars per token)
           // Always include the first (most relevant) result — the cap only gates subsequent results
           if (maxTokens > 0) {
-            const charCap = maxTokens * 4
+            const charCap = maxTokens * 3.5
             let totalChars = 0
             trimmedDocs = trimmedDocs.filter((doc, idx) => {
               totalChars += doc.text.length
@@ -108,6 +108,19 @@ export default class OllamaController {
         }
       }
 
+      // If system messages are large (e.g. due to RAG context), request a context window big
+      // enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
+      const systemChars = reqData.messages
+        .filter((m) => m.role === 'system')
+        .reduce((sum, m) => sum + m.content.length, 0)
+      const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
+      let numCtx: number | undefined
+      if (estimatedSystemTokens > 3000) {
+        const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
+        numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
+        logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
+      }
+
       // Check if the model supports "thinking" capability for enhanced response generation
       // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
       const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
@@ -129,7 +142,7 @@ export default class OllamaController {
       if (reqData.stream) {
         logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
         // Headers already flushed above
-        const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think })
+        const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
         let fullContent = ''
         for await (const chunk of stream) {
           if (chunk.message?.content) {
@@ -153,7 +166,7 @@ export default class OllamaController {
       }
 
       // Non-streaming (legacy) path
-      const result = await this.ollamaService.chat({ ...ollamaRequest, think })
+      const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })
 
       if (sessionId && result?.message?.content) {
         await this.chatService.addMessage(sessionId, 'assistant', result.message.content)
diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts
index faff9cd..58afc55 100644
--- a/admin/app/services/ollama_service.ts
+++ b/admin/app/services/ollama_service.ts
@@ -44,6 +44,7 @@ type ChatInput = {
   messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
   think?: boolean | 'medium'
   stream?: boolean
+  numCtx?: number
 }
 
 @inject()
@@ -210,6 +211,9 @@ export class OllamaService {
     if (chatRequest.think) {
       params.think = chatRequest.think
     }
+    if (chatRequest.numCtx) {
+      params.num_ctx = chatRequest.numCtx
+    }
 
     const response = await this.openai.chat.completions.create(params)
     const choice = response.choices[0]
@@ -238,6 +242,9 @@ export class OllamaService {
     if (chatRequest.think) {
       params.think = chatRequest.think
     }
+    if (chatRequest.numCtx) {
+      params.num_ctx = chatRequest.numCtx
+    }
 
     const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>