Improve context window size estimation

fixes issue seen with some models in lm studio resulting in:
"The number of tokens to keep from the initial prompt is greater than the context length (n_keep: 4705 >= n_ctx: 4096)"

Fixed the char/token estimate: the old value was too optimistic,
causing the cap to admit more text than the token budget actually allowed.
After RAG injection, the controller estimates the system prompt's token count.
If it exceeds ~3000 tokens, it requests the next standard context size (8192, 16384, 32768, or 65536)
large enough to fit the prompt plus a 2048-token buffer for the conversation and response.

For Ollama, num_ctx is honoured per-request and will load the model with that context
window. For LM Studio, the parameter is silently ignored — but the tighter char/token
estimate also reduces how much RAG text gets injected, so the context is less likely to
overflow.
This commit is contained in:
Henry Estela 2026-03-16 22:10:00 -07:00
parent c8ce28a84f
commit 8b54310746
No known key found for this signature in database
GPG Key ID: 90439853E9E235BA
2 changed files with 24 additions and 4 deletions

View File

@ -77,10 +77,10 @@ export default class OllamaController {
const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
let trimmedDocs = relevantDocs.slice(0, maxResults)
// Apply token cap if set (estimate ~4 chars per token)
// Apply token cap if set (estimate ~3.5 chars per token)
// Always include the first (most relevant) result — the cap only gates subsequent results
if (maxTokens > 0) {
const charCap = maxTokens * 4
const charCap = maxTokens * 3.5
let totalChars = 0
trimmedDocs = trimmedDocs.filter((doc, idx) => {
totalChars += doc.text.length
@ -108,6 +108,19 @@ export default class OllamaController {
}
}
// If system messages are large (e.g. due to RAG context), request a context window big
// enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
const systemChars = reqData.messages
.filter((m) => m.role === 'system')
.reduce((sum, m) => sum + m.content.length, 0)
const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
let numCtx: number | undefined
if (estimatedSystemTokens > 3000) {
const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
}
// Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
@ -129,7 +142,7 @@ export default class OllamaController {
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// Headers already flushed above
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think })
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
let fullContent = ''
for await (const chunk of stream) {
if (chunk.message?.content) {
@ -153,7 +166,7 @@ export default class OllamaController {
}
// Non-streaming (legacy) path
const result = await this.ollamaService.chat({ ...ollamaRequest, think })
const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })
if (sessionId && result?.message?.content) {
await this.chatService.addMessage(sessionId, 'assistant', result.message.content)

View File

@ -44,6 +44,7 @@ type ChatInput = {
messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
think?: boolean | 'medium'
stream?: boolean
numCtx?: number
}
@inject()
@ -210,6 +211,9 @@ export class OllamaService {
if (chatRequest.think) {
params.think = chatRequest.think
}
if (chatRequest.numCtx) {
params.num_ctx = chatRequest.numCtx
}
const response = await this.openai.chat.completions.create(params)
const choice = response.choices[0]
@ -238,6 +242,9 @@ export class OllamaService {
if (chatRequest.think) {
params.think = chatRequest.think
}
if (chatRequest.numCtx) {
params.num_ctx = chatRequest.numCtx
}
const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>