fixes issue seen with some models in LM Studio resulting in: "The number of tokens to keep from the initial prompt is greater than the context length (n_keep: 4705>= n_ctx: 4096)" Fixed the char/token estimate; the old value (~4 chars per token) was too optimistic, causing the cap to admit more text than the token budget actually allowed. After RAG injection, the controller estimates the system prompt token count. If it exceeds ~3000 tokens, it requests the smallest standard context size (8192, 16384, 32768, or 65536) that is large enough to fit the prompt plus a 2048-token buffer for the conversation and response.
--- admin/app/controllers/ollama_controller.ts | 21 +++++++++++++++++---- admin/app/services/ollama_service.ts | 7 +++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts index a43f445..3324eab 100644 --- a/admin/app/controllers/ollama_controller.ts +++ b/admin/app/controllers/ollama_controller.ts @@ -77,10 +77,10 @@ export default class OllamaController { const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model) let trimmedDocs = relevantDocs.slice(0, maxResults) - // Apply token cap if set (estimate ~4 chars per token) + // Apply token cap if set (estimate ~3.5 chars per token) // Always include the first (most relevant) result — the cap only gates subsequent results if (maxTokens > 0) { - const charCap = maxTokens * 4 + const charCap = maxTokens * 3.5 let totalChars = 0 trimmedDocs = trimmedDocs.filter((doc, idx) => { totalChars += doc.text.length @@ -108,6 +108,19 @@ export default class OllamaController { } } + // If system messages are large (e.g. due to RAG context), request a context window big + // enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully. + const systemChars = reqData.messages + .filter((m) => m.role === 'system') + .reduce((sum, m) => sum + m.content.length, 0) + const estimatedSystemTokens = Math.ceil(systemChars / 3.5) + let numCtx: number | undefined + if (estimatedSystemTokens > 3000) { + const needed = estimatedSystemTokens + 2048 // leave room for conversation + response + numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 
65536 + logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`) + } + // Check if the model supports "thinking" capability for enhanced response generation // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model) @@ -129,7 +142,7 @@ export default class OllamaController { if (reqData.stream) { logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`) // Headers already flushed above - const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think }) + const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx }) let fullContent = '' for await (const chunk of stream) { if (chunk.message?.content) { @@ -153,7 +166,7 @@ export default class OllamaController { } // Non-streaming (legacy) path - const result = await this.ollamaService.chat({ ...ollamaRequest, think }) + const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx }) if (sessionId && result?.message?.content) { await this.chatService.addMessage(sessionId, 'assistant', result.message.content) diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts index faff9cd..58afc55 100644 --- a/admin/app/services/ollama_service.ts +++ b/admin/app/services/ollama_service.ts @@ -44,6 +44,7 @@ type ChatInput = { messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> think?: boolean | 'medium' stream?: boolean + numCtx?: number } @inject() @@ -210,6 +211,9 @@ export class OllamaService { if (chatRequest.think) { params.think = chatRequest.think } + if (chatRequest.numCtx) { + params.num_ctx = chatRequest.numCtx + } const response = await this.openai.chat.completions.create(params) const choice = response.choices[0] @@ -238,6 +242,9 @@ export class OllamaService { if 
(chatRequest.think) { params.think = chatRequest.think } + if (chatRequest.numCtx) { + params.num_ctx = chatRequest.numCtx + } const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream