fixes issue seen with some models in LM Studio resulting in: "The number of tokens to keep from the initial prompt is greater than the context length (n_keep: 4705>= n_ctx: 4096)" Fixed the char/token estimate; the old value (~4 chars per token) was too optimistic, causing the cap to admit more text than the token budget actually allowed. After RAG injection, the controller estimates the system prompt token count. If it exceeds ~3000 tokens, it requests the smallest standard context size (8192, 16384, 32768, or 65536) that is large enough to fit the prompt plus a 2048-token buffer for the conversation and response.
--- admin/app/controllers/ollama_controller.ts | 21 +++++++++++++++++---- admin/app/services/ollama_service.ts | 7 +++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/admin/app/controllers/ollama_controller.ts b/admin/app/controllers/ollama_controller.ts index a43f445..3324eab 100644 --- a/admin/app/controllers/ollama_controller.ts +++ b/admin/app/controllers/ollama_controller.ts @@ -77,10 +77,10 @@ export default class OllamaController { const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model) let trimmedDocs = relevantDocs.slice(0, maxResults) - // Apply token cap if set (estimate ~4 chars per token) + // Apply token cap if set (estimate ~3.5 chars per token) // Always include the first (most relevant) result — the cap only gates subsequent results if (maxTokens > 0) { - const charCap = maxTokens * 4 + const charCap = maxTokens * 3.5 let totalChars = 0 trimmedDocs = trimmedDocs.filter((doc, idx) => { totalChars += doc.text.length @@ -108,6 +108,19 @@ export default class OllamaController { } } + // If system messages are large (e.g. due to RAG context), request a context window big + // enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully. + const systemChars = reqData.messages + .filter((m) => m.role === 'system') + .reduce((sum, m) => sum + m.content.length, 0) + const estimatedSystemTokens = Math.ceil(systemChars / 3.5) + let numCtx: number | undefined + if (estimatedSystemTokens > 3000) { + const needed = estimatedSystemTokens + 2048 // leave room for conversation + response + numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 
65536 + logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`) + } + // Check if the model supports "thinking" capability for enhanced response generation // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model) @@ -129,7 +142,7 @@ export default class OllamaController { if (reqData.stream) { logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`) // Headers already flushed above - const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think }) + const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx }) let fullContent = '' for await (const chunk of stream) { if (chunk.message?.content) { @@ -153,7 +166,7 @@ export default class OllamaController { } // Non-streaming (legacy) path - const result = await this.ollamaService.chat({ ...ollamaRequest, think }) + const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx }) if (sessionId && result?.message?.content) { await this.chatService.addMessage(sessionId, 'assistant', result.message.content) diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts index faff9cd..58afc55 100644 --- a/admin/app/services/ollama_service.ts +++ b/admin/app/services/ollama_service.ts @@ -44,6 +44,7 @@ type ChatInput = { messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }> think?: boolean | 'medium' stream?: boolean + numCtx?: number } @inject() @@ -210,6 +211,9 @@ export class OllamaService { if (chatRequest.think) { params.think = chatRequest.think } + if (chatRequest.numCtx) { + params.num_ctx = chatRequest.numCtx + } const response = await this.openai.chat.completions.create(params) const choice = response.choices[0] @@ -238,6 +242,9 @@ export class OllamaService { if 
(chatRequest.think) { params.think = chatRequest.think } + if (chatRequest.numCtx) { + params.num_ctx = chatRequest.numCtx + } const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream