Improve context window size estimation

fixes issue seen with some models in lm studio resulting in:
"The number of tokens to keep from the initial prompt is greater than the context length (n_keep: 4705 >= n_ctx: 4096)"

Fixed the char/token estimate: the old value was too optimistic,
causing the cap to admit more text than the token budget actually allowed.
After RAG injection, the controller estimates the system prompt's token count.
If it exceeds ~3000 tokens, it requests the next standard context size (8192, 16384, 32768, or 65536)
large enough to fit the prompt plus a 2048-token buffer for the conversation and response.

For Ollama, num_ctx is honoured per-request and will load the model with that context
window. For LM Studio, the parameter is silently ignored — but the tighter char/token
estimate also reduces how much RAG text gets injected, so the context is less likely to
overflow.
This commit is contained in:
Henry Estela 2026-03-16 22:10:00 -07:00
parent c8ce28a84f
commit 8b54310746
No known key found for this signature in database
GPG Key ID: 90439853E9E235BA
2 changed files with 24 additions and 4 deletions

View File

@ -77,10 +77,10 @@ export default class OllamaController {
const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
let trimmedDocs = relevantDocs.slice(0, maxResults)
// Apply token cap if set (estimate ~4 chars per token)
// Apply token cap if set (estimate ~3.5 chars per token)
// Always include the first (most relevant) result — the cap only gates subsequent results
if (maxTokens > 0) {
const charCap = maxTokens * 4
const charCap = maxTokens * 3.5
let totalChars = 0
trimmedDocs = trimmedDocs.filter((doc, idx) => {
totalChars += doc.text.length
@ -108,6 +108,19 @@ export default class OllamaController {
}
}
// If system messages are large (e.g. due to RAG context), request a context window big
// enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
const systemChars = reqData.messages
.filter((m) => m.role === 'system')
.reduce((sum, m) => sum + m.content.length, 0)
const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
let numCtx: number | undefined
if (estimatedSystemTokens > 3000) {
const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
}
// Check if the model supports "thinking" capability for enhanced response generation
// If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
@ -129,7 +142,7 @@ export default class OllamaController {
if (reqData.stream) {
logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
// Headers already flushed above
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think })
const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
let fullContent = ''
for await (const chunk of stream) {
if (chunk.message?.content) {
@ -153,7 +166,7 @@ export default class OllamaController {
}
// Non-streaming (legacy) path
const result = await this.ollamaService.chat({ ...ollamaRequest, think })
const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })
if (sessionId && result?.message?.content) {
await this.chatService.addMessage(sessionId, 'assistant', result.message.content)

View File

@ -44,6 +44,7 @@ type ChatInput = {
messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
think?: boolean | 'medium'
stream?: boolean
numCtx?: number
}
@inject()
@ -210,6 +211,9 @@ export class OllamaService {
if (chatRequest.think) {
params.think = chatRequest.think
}
if (chatRequest.numCtx) {
params.num_ctx = chatRequest.numCtx
}
const response = await this.openai.chat.completions.create(params)
const choice = response.choices[0]
@ -238,6 +242,9 @@ export class OllamaService {
if (chatRequest.think) {
params.think = chatRequest.think
}
if (chatRequest.numCtx) {
params.num_ctx = chatRequest.numCtx
}
const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>