feat(AI Assistant): performance improvements and smarter RAG context usage

This commit is contained in:
Jake Turner 2026-03-11 05:52:46 +00:00 committed by Jake Turner
parent 460756f581
commit 96e5027055
5 changed files with 242 additions and 88 deletions

View File

@ -5,7 +5,7 @@ import { modelNameSchema } from '#validators/download'
import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
import { inject } from '@adonisjs/core'
import type { HttpContext } from '@adonisjs/core/http'
import { DEFAULT_QUERY_REWRITE_MODEL, SYSTEM_PROMPTS } from '../../constants/ollama.js'
import { DEFAULT_QUERY_REWRITE_MODEL, RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
import logger from '@adonisjs/core/services/logger'
import type { Message } from 'ollama'
@ -66,9 +66,28 @@ export default class OllamaController {
logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
// If relevant context is found, inject as a system message
// If relevant context is found, inject as a system message with adaptive limits
if (relevantDocs.length > 0) {
const contextText = relevantDocs
// Determine context budget based on model size
const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
let trimmedDocs = relevantDocs.slice(0, maxResults)
// Apply token cap if set (estimate ~4 chars per token)
// Always include the first (most relevant) result — the cap only gates subsequent results
if (maxTokens > 0) {
const charCap = maxTokens * 4
let totalChars = 0
trimmedDocs = trimmedDocs.filter((doc, idx) => {
totalChars += doc.text.length
return idx === 0 || totalChars <= charCap
})
}
logger.debug(
`[RAG] Injecting ${trimmedDocs.length}/${relevantDocs.length} results (model: ${reqData.model}, maxResults: ${maxResults}, maxTokens: ${maxTokens || 'unlimited'})`
)
const contextText = trimmedDocs
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
.join('\n\n')
@ -174,6 +193,25 @@ export default class OllamaController {
return await this.ollamaService.getModels()
}
/**
 * Determines RAG context limits based on the parameter count parsed from the model name.
 * Recognizes size tags such as "1b", "3b", "8b", "70b" embedded in model names/tags.
 */
private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
  // Pull the billions-of-parameters figure out of the name (e.g., "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b")
  const match = modelName.match(/(\d+\.?\d*)[bB]/)
  // Assume an 8B-class model when no size tag can be found
  const sizeInBillions = match ? parseFloat(match[1]) : 8
  // Tiers are ordered smallest-first, so the first tier big enough for the model wins
  const tier = RAG_CONTEXT_LIMITS.find((t) => sizeInBillions <= t.maxParams)
  if (tier) {
    return { maxResults: tier.maxResults, maxTokens: tier.maxTokens }
  }
  // Defensive fallback if no tier matched: maxTokens 0 means no token cap
  return { maxResults: 5, maxTokens: 0 }
}
private async rewriteQueryWithContext(
messages: Message[]
): Promise<string | null> {
@ -199,8 +237,8 @@ export default class OllamaController {
})
.join('\n')
const availableModels = await this.ollamaService.getAvailableModels({ query: null, limit: 500 })
const rewriteModelAvailable = availableModels?.models.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
const installedModels = await this.ollamaService.getModels(true)
const rewriteModelAvailable = installedModels?.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
if (!rewriteModelAvailable) {
logger.warn(`[RAG] Query rewrite model "${DEFAULT_QUERY_REWRITE_MODEL}" not available. Skipping query rewriting.`)
const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')

View File

@ -16,11 +16,13 @@ import { join, resolve, sep } from 'node:path'
import KVStore from '#models/kv_store'
import { ZIMExtractionService } from './zim_extraction_service.js'
import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
import { ProcessAndEmbedFileResponse, ProcessZIMFileResponse, RAGResult, RerankedRAGResult } from '../../types/rag.js'
@inject()
export class RagService {
private qdrant: QdrantClient | null = null
private qdrantInitPromise: Promise<void> | null = null
private embeddingModelVerified = false
public static UPLOADS_STORAGE_PATH = 'storage/kb_uploads'
public static CONTENT_COLLECTION_NAME = 'nomad_knowledge_base'
public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
@ -33,6 +35,7 @@ export class RagService {
// Nomic Embed Text v1.5 uses task-specific prefixes for optimal performance
public static SEARCH_DOCUMENT_PREFIX = 'search_document: '
public static SEARCH_QUERY_PREFIX = 'search_query: '
public static EMBEDDING_BATCH_SIZE = 8 // Conservative batch size for low-end hardware
constructor(
private dockerService: DockerService,
@ -75,6 +78,16 @@ export class RagService {
},
})
}
// Create payload indexes for faster filtering (idempotent — Qdrant ignores duplicates)
await this.qdrant!.createPayloadIndex(collectionName, {
field_name: 'source',
field_schema: 'keyword',
})
await this.qdrant!.createPayloadIndex(collectionName, {
field_name: 'content_type',
field_schema: 'keyword',
})
} catch (error) {
logger.error('Error ensuring Qdrant collection:', error)
throw error
@ -148,14 +161,57 @@ export class RagService {
/**
* Preprocesses a query to improve retrieval by expanding it with context.
* This helps match documents even when using different terminology.
* TODO: We could probably move this to a separate QueryPreprocessor class if it grows more complex, but for now it's manageable here.
*/
/**
 * Maps common preparedness/survival abbreviations and acronyms to expanded forms.
 * Used by preprocessQuery() to append expansions to the search query so embeddings
 * can match documents that spell the terms out.
 * Keys must be lowercase: lookups are performed on lowercased, punctuation-stripped words.
 */
private static QUERY_EXPANSION_DICTIONARY: Record<string, string> = {
  'bob': 'bug out bag',
  'bov': 'bug out vehicle',
  'bol': 'bug out location',
  'edc': 'every day carry',
  'mre': 'meal ready to eat',
  'shtf': 'shit hits the fan',
  'teotwawki': 'the end of the world as we know it',
  'opsec': 'operational security',
  'ifak': 'individual first aid kit',
  'ghb': 'get home bag',
  'ghi': 'get home in',
  'wrol': 'without rule of law',
  'emp': 'electromagnetic pulse',
  'ham': 'ham amateur radio',
  'nbr': 'nuclear biological radiological',
  'cbrn': 'chemical biological radiological nuclear',
  'sar': 'search and rescue',
  'comms': 'communications radio',
  'fifo': 'first in first out',
  'mylar': 'mylar bag food storage',
  'paracord': 'paracord 550 cord',
  'ferro': 'ferro rod fire starter',
  'bivvy': 'bivvy bivy emergency shelter',
  'bdu': 'battle dress uniform',
  'gmrs': 'general mobile radio service',
  'frs': 'family radio service',
  'nbc': 'nuclear biological chemical',
}
private preprocessQuery(query: string): string {
// Future: this is a placeholder for more advanced query expansion techniques.
// For now, we simply trim whitespace. Improvements could include:
// - Synonym expansion using a thesaurus
// - Adding related terms based on domain knowledge
// - Using a language model to rephrase or elaborate the query
const expanded = query.trim()
let expanded = query.trim()
// Expand known domain abbreviations/acronyms
const words = expanded.toLowerCase().split(/\s+/)
const expansions: string[] = []
for (const word of words) {
const cleaned = word.replace(/[^\w]/g, '')
if (RagService.QUERY_EXPANSION_DICTIONARY[cleaned]) {
expansions.push(RagService.QUERY_EXPANSION_DICTIONARY[cleaned])
}
}
if (expansions.length > 0) {
expanded = `${expanded} ${expansions.join(' ')}`
logger.debug(`[RAG] Query expanded with domain terms: "${expanded}"`)
}
logger.debug(`[RAG] Original query: "${query}"`)
logger.debug(`[RAG] Preprocessed query: "${expanded}"`)
return expanded
@ -187,22 +243,26 @@ export class RagService {
RagService.EMBEDDING_DIMENSION
)
const allModels = await this.ollamaService.getModels(true)
const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
if (!this.embeddingModelVerified) {
const allModels = await this.ollamaService.getModels(true)
const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
if (!embeddingModel) {
try {
const downloadResult = await this.ollamaService.downloadModel(RagService.EMBEDDING_MODEL)
if (!downloadResult.success) {
throw new Error(downloadResult.message || 'Unknown error during model download')
if (!embeddingModel) {
try {
const downloadResult = await this.ollamaService.downloadModel(RagService.EMBEDDING_MODEL)
if (!downloadResult.success) {
throw new Error(downloadResult.message || 'Unknown error during model download')
}
} catch (modelError) {
logger.error(
`[RAG] Embedding model ${RagService.EMBEDDING_MODEL} not found locally and failed to download:`,
modelError
)
this.embeddingModelVerified = false
return null
}
} catch (modelError) {
logger.error(
`[RAG] Embedding model ${RagService.EMBEDDING_MODEL} not found locally and failed to download:`,
modelError
)
return null
}
this.embeddingModelVerified = true
}
// TokenChunker uses character-based tokenization (1 char = 1 token)
@ -227,7 +287,8 @@ export class RagService {
const ollamaClient = await this.ollamaService.getClient()
const embeddings: number[][] = []
// Prepare all chunk texts with prefix and truncation
const prefixedChunks: string[] = []
for (let i = 0; i < chunks.length; i++) {
let chunkText = chunks[i]
@ -237,7 +298,6 @@ export class RagService {
const estimatedTokens = this.estimateTokenCount(withPrefix)
if (estimatedTokens > RagService.MAX_SAFE_TOKENS) {
// This should be rare - log for debugging if it's occurring frequently
const prefixTokens = this.estimateTokenCount(prefixText)
const maxTokensForText = RagService.MAX_SAFE_TOKENS - prefixTokens
logger.warn(
@ -246,17 +306,30 @@ export class RagService {
chunkText = this.truncateToTokenLimit(chunkText, maxTokensForText)
}
logger.debug(`[RAG] Generating embedding for chunk ${i + 1}/${chunks.length}`)
prefixedChunks.push(RagService.SEARCH_DOCUMENT_PREFIX + chunkText)
}
const response = await ollamaClient.embeddings({
// Batch embed chunks for performance
const embeddings: number[][] = []
const batchSize = RagService.EMBEDDING_BATCH_SIZE
const totalBatches = Math.ceil(prefixedChunks.length / batchSize)
for (let batchIdx = 0; batchIdx < totalBatches; batchIdx++) {
const batchStart = batchIdx * batchSize
const batch = prefixedChunks.slice(batchStart, batchStart + batchSize)
logger.debug(`[RAG] Embedding batch ${batchIdx + 1}/${totalBatches} (${batch.length} chunks)`)
const response = await ollamaClient.embed({
model: RagService.EMBEDDING_MODEL,
prompt: RagService.SEARCH_DOCUMENT_PREFIX + chunkText,
input: batch,
})
embeddings.push(response.embedding)
embeddings.push(...response.embeddings)
if (onProgress) {
await onProgress(((i + 1) / chunks.length) * 100)
const progress = ((batchStart + batch.length) / prefixedChunks.length) * 100
await onProgress(progress)
}
}
@ -395,14 +468,7 @@ export class RagService {
deleteAfterEmbedding: boolean,
batchOffset?: number,
onProgress?: (percent: number) => Promise<void>
): Promise<{
success: boolean
message: string
chunks?: number
hasMoreBatches?: boolean
articlesProcessed?: number
totalArticles?: number
}> {
): Promise<ProcessZIMFileResponse> {
const zimExtractionService = new ZIMExtractionService()
// Process in batches to avoid lock timeout
@ -540,14 +606,7 @@ export class RagService {
deleteAfterEmbedding: boolean = false,
batchOffset?: number,
onProgress?: (percent: number) => Promise<void>
): Promise<{
success: boolean
message: string
chunks?: number
hasMoreBatches?: boolean
articlesProcessed?: number
totalArticles?: number
}> {
): Promise<ProcessAndEmbedFileResponse> {
try {
const fileType = determineFileType(filepath)
logger.debug(`[RAG] Processing file: ${filepath} (detected type: ${fileType})`)
@ -631,14 +690,18 @@ export class RagService {
return []
}
const allModels = await this.ollamaService.getModels(true)
const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
if (!this.embeddingModelVerified) {
const allModels = await this.ollamaService.getModels(true)
const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
if (!embeddingModel) {
logger.warn(
`[RAG] ${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
)
return []
if (!embeddingModel) {
logger.warn(
`[RAG] ${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
)
this.embeddingModelVerified = false
return []
}
this.embeddingModelVerified = true
}
// Preprocess query for better matching
@ -666,9 +729,9 @@ export class RagService {
return []
}
const response = await ollamaClient.embeddings({
const response = await ollamaClient.embed({
model: RagService.EMBEDDING_MODEL,
prompt: prefixedQuery,
input: [prefixedQuery],
})
// Perform semantic search with a higher limit to enable reranking
@ -678,7 +741,7 @@ export class RagService {
)
const searchResults = await this.qdrant!.search(RagService.CONTENT_COLLECTION_NAME, {
vector: response.embedding,
vector: response.embeddings[0],
limit: searchLimit,
score_threshold: scoreThreshold,
with_payload: true,
@ -687,7 +750,7 @@ export class RagService {
logger.debug(`[RAG] Found ${searchResults.length} results above threshold ${scoreThreshold}`)
// Map results with metadata for reranking
const resultsWithMetadata = searchResults.map((result) => ({
const resultsWithMetadata: RAGResult[] = searchResults.map((result) => ({
text: (result.payload?.text as string) || '',
score: result.score,
keywords: (result.payload?.keywords as string) || '',
@ -700,6 +763,7 @@ export class RagService {
hierarchy: result.payload?.hierarchy as string | undefined,
document_id: result.payload?.document_id as string | undefined,
content_type: result.payload?.content_type as string | undefined,
source: result.payload?.source as string | undefined,
}))
const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
@ -711,8 +775,11 @@ export class RagService {
)
})
// Apply source diversity penalty to avoid all results from the same document
const diverseResults = this.applySourceDiversity(rerankedResults)
// Return top N results with enhanced metadata
return rerankedResults.slice(0, limit).map((result) => ({
return diverseResults.slice(0, limit).map((result) => ({
text: result.text,
score: result.finalScore,
metadata: {
@ -748,34 +815,10 @@ export class RagService {
* outweigh the overhead.
*/
private rerankResults(
results: Array<{
text: string
score: number
keywords: string
chunk_index: number
created_at: number
article_title?: string
section_title?: string
full_title?: string
hierarchy?: string
document_id?: string
content_type?: string
}>,
results: Array<RAGResult>,
queryKeywords: string[],
originalQuery: string
): Array<{
text: string
score: number
finalScore: number
chunk_index: number
created_at: number
article_title?: string
section_title?: string
full_title?: string
hierarchy?: string
document_id?: string
content_type?: string
}> {
): Array<RerankedRAGResult> {
return results
.map((result) => {
let finalScore = result.score
@ -851,6 +894,37 @@ export class RagService {
.sort((a, b) => b.finalScore - a.finalScore)
}
/**
* Applies a diversity penalty so results from the same source are down-weighted.
* Uses greedy selection: for each result, apply 0.85^n penalty where n is the
* number of results already selected from the same source.
*/
private applySourceDiversity(
results: Array<RerankedRAGResult>
) {
const sourceCounts = new Map<string, number>()
const DIVERSITY_PENALTY = 0.85
return results
.map((result) => {
const sourceKey = result.document_id || result.source || 'unknown'
const count = sourceCounts.get(sourceKey) || 0
const penalty = Math.pow(DIVERSITY_PENALTY, count)
const diverseScore = result.finalScore * penalty
sourceCounts.set(sourceKey, count + 1)
if (count > 0) {
logger.debug(
`[RAG] Source diversity penalty for "${sourceKey}": ${result.finalScore.toFixed(4)}${diverseScore.toFixed(4)} (seen ${count}x)`
)
}
return { ...result, finalScore: diverseScore }
})
.sort((a, b) => b.finalScore - a.finalScore)
}
/**
* Retrieve all unique source files that have been stored in the knowledge base.
* @returns Array of unique full source paths
@ -866,12 +940,12 @@ export class RagService {
let offset: string | number | null | Record<string, unknown> = null
const batchSize = 100
// Scroll through all points in the collection
// Scroll through all points in the collection (only fetch source field)
do {
const scrollResult = await this.qdrant!.scroll(RagService.CONTENT_COLLECTION_NAME, {
limit: batchSize,
offset: offset,
with_payload: true,
with_payload: ['source'],
with_vector: false,
})

View File

@ -64,6 +64,16 @@ export const FALLBACK_RECOMMENDED_OLLAMA_MODELS: NomadOllamaModel[] = [
export const DEFAULT_QUERY_REWRITE_MODEL = 'qwen2.5:3b' // default to qwen2.5 for query rewriting with good balance of text task performance and resource usage
/**
 * Adaptive RAG context limits based on model size.
 * Smaller models get overwhelmed with too much context, so we cap it.
 *
 * Tiers must stay ordered by ascending maxParams: consumers return the first tier
 * whose maxParams is >= the model's parameter count (in billions).
 * A maxTokens of 0 means "no token cap" for that tier.
 */
export const RAG_CONTEXT_LIMITS: { maxParams: number; maxResults: number; maxTokens: number }[] = [
  { maxParams: 3, maxResults: 2, maxTokens: 1000 }, // 1-3B models
  { maxParams: 8, maxResults: 4, maxTokens: 2500 }, // 4-8B models
  { maxParams: Infinity, maxResults: 5, maxTokens: 0 }, // 13B+ (no cap)
]
export const SYSTEM_PROMPTS = {
default: `
Format all responses using markdown for better readability. Vanilla markdown or GitHub-flavored markdown is preferred.
@ -113,7 +123,7 @@ Ensure that your suggestions are comma-seperated with no conjunctions like "and"
Do not use line breaks, new lines, or extra spacing to separate the suggestions.
Format: suggestion1, suggestion2, suggestion3
`,
title_generation: `You are a title generator. Given the start of a conversation, generate a concise, descriptive title under 60 characters. Return ONLY the title text with no quotes, punctuation wrapping, or extra formatting.`,
title_generation: `You are a title generator. Given the start of a conversation, generate a concise, descriptive title under 50 characters. Return ONLY the title text with no quotes, punctuation wrapping, or extra formatting.`,
query_rewrite: `
You are a query rewriting assistant. Your task is to reformulate the user's latest question to include relevant context from the conversation history.

View File

@ -4,13 +4,16 @@
### Features
- **AI Assistant**: Added improved user guidance for troubleshooting GPU pass-through issues
- **AI Assistant**: The last used model is now automatically selected when a new chat is started
- **Settings**: Nomad now automatically performs nightly checks for available app updates, and users can select and apply updates from the Apps page in Settings
### Bug Fixes
- **Settings**: Fixed an issue where the AI Assistant settings page would be shown in navigation even if the AI Assistant was not installed, thus causing 404 errors when clicked
- **Security**: Path traversal and SSRF mitigations
- **AI Assistant**: Fixed an issue that was causing intermittent failures saving chat session titles
### Improvements
- **AI Assistant**: Extensive performance improvements and improved RAG intelligence/context usage
## Version 1.28.0 - March 5, 2026

View File

@ -5,3 +5,32 @@ export type EmbedJobWithProgress = {
progress: number
status: string
}
/**
 * Result of processing and embedding a knowledge-base file.
 * Batched runs report partial progress: hasMoreBatches signals the caller to
 * invoke again with an updated batch offset, and articlesProcessed/totalArticles
 * track overall progress. chunks is the number of chunks embedded so far.
 */
export type ProcessAndEmbedFileResponse = {
  success: boolean
  message: string
  chunks?: number
  hasMoreBatches?: boolean
  articlesProcessed?: number
  totalArticles?: number
}

// ZIM archive processing returns the same shape as generic file processing.
export type ProcessZIMFileResponse = ProcessAndEmbedFileResponse
/**
 * A single semantic-search hit, assembled from a Qdrant point's score and
 * payload fields during similarity search.
 */
export type RAGResult = {
  text: string
  score: number
  keywords: string
  chunk_index: number
  created_at: number
  // The following fields come from optional payload metadata
  // (presumably set for ZIM/article content — confirm against the ingestion path)
  article_title?: string
  section_title?: string
  full_title?: string
  hierarchy?: string
  document_id?: string
  content_type?: string
  source?: string
}

/**
 * A RAGResult after reranking: keywords (only needed as a reranking input) are
 * dropped, and finalScore carries the adjusted relevance score.
 */
export type RerankedRAGResult = Omit<RAGResult, 'keywords'> & {
  finalScore: number
}