feat(AI Assistant): performance improvements and smarter RAG context usage

2026-03-28 03:29:25 +01:00 · 2026-03-11 05:52:46 +00:00 · 2026-03-11 05:52:46 +00:00 · 96e5027055
commit 96e5027055
parent 460756f581
5 changed files with 242 additions and 88 deletions
--- a/admin/app/controllers/ollama_controller.ts
+++ b/admin/app/controllers/ollama_controller.ts
@ -5,7 +5,7 @@ import { modelNameSchema } from '#validators/download'
 import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
 import { inject } from '@adonisjs/core'
 import type { HttpContext } from '@adonisjs/core/http'
-import { DEFAULT_QUERY_REWRITE_MODEL, SYSTEM_PROMPTS } from '../../constants/ollama.js'
+import { DEFAULT_QUERY_REWRITE_MODEL, RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
 import logger from '@adonisjs/core/services/logger'
 import type { Message } from 'ollama'
@ -66,9 +66,28 @@ export default class OllamaController {
        logger.debug(`[RAG] Retrieved ${relevantDocs.length} relevant documents for query: "${rewrittenQuery}"`)
-        // If relevant context is found, inject as a system message
+        // If relevant context is found, inject as a system message with adaptive limits
        if (relevantDocs.length > 0) {
-          const contextText = relevantDocs
+          // Determine context budget based on model size
          const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
          let trimmedDocs = relevantDocs.slice(0, maxResults)
          // Apply token cap if set (estimate ~4 chars per token)
          // Always include the first (most relevant) result — the cap only gates subsequent results
          if (maxTokens > 0) {
            const charCap = maxTokens * 4
            let totalChars = 0
            trimmedDocs = trimmedDocs.filter((doc, idx) => {
              totalChars += doc.text.length
              return idx === 0 || totalChars <= charCap
            })
          }
          logger.debug(
            `[RAG] Injecting ${trimmedDocs.length}/${relevantDocs.length} results (model: ${reqData.model}, maxResults: ${maxResults}, maxTokens: ${maxTokens || 'unlimited'})`
          )
          const contextText = trimmedDocs
            .map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
            .join('\n\n')
@ -174,6 +193,25 @@ export default class OllamaController {
    return await this.ollamaService.getModels()
  }
  /**
   * Determines RAG context limits based on model size extracted from the model name.
   *
   * The size is read from the first "<number>b" indicator in the name or tag
   * (e.g. "llama3.2:3b", "qwen2.5:1.5b", "gemma:7b"). If there is none, a
   * "<number>m" indicator (e.g. "smollm2:135m") is interpreted as millions of
   * parameters so that sub-billion models land in the smallest tier instead of
   * the 8B default. Indicators immediately followed by another letter ("4bit",
   * "2beta", "3mini") are not treated as sizes.
   *
   * @param modelName Model name, optionally including a tag.
   * @returns The `maxResults`/`maxTokens` of the first tier in RAG_CONTEXT_LIMITS
   *          whose `maxParams` is at least the detected size. A `maxTokens` of 0
   *          is treated by the caller as "no token cap".
   */
  private getContextLimitsForModel(modelName: string): { maxResults: number; maxTokens: number } {
    // Billions take priority: tags such as "7b-instruct-1m" carry both a size and an unrelated "m" suffix.
    const billionsMatch = modelName.match(/(\d+\.?\d*)[bB](?![a-zA-Z])/)
    const millionsMatch = billionsMatch ? null : modelName.match(/(\d+\.?\d*)[mM](?![a-zA-Z])/)

    let paramBillions = 8 // default to 8B if unknown
    if (billionsMatch) {
      paramBillions = parseFloat(billionsMatch[1])
    } else if (millionsMatch) {
      paramBillions = parseFloat(millionsMatch[1]) / 1000
    }

    for (const tier of RAG_CONTEXT_LIMITS) {
      if (paramBillions <= tier.maxParams) {
        return { maxResults: tier.maxResults, maxTokens: tier.maxTokens }
      }
    }

    // Only reached if no tier in RAG_CONTEXT_LIMITS covers the detected size:
    // allow up to 5 results with no token cap.
    return { maxResults: 5, maxTokens: 0 }
  }
  private async rewriteQueryWithContext(
    messages: Message[]
  ): Promise<string | null> {
@ -199,8 +237,8 @@ export default class OllamaController {
        })
        .join('\n')
-      const availableModels = await this.ollamaService.getAvailableModels({ query: null, limit: 500 })
+      const installedModels = await this.ollamaService.getModels(true)
-      const rewriteModelAvailable = availableModels?.models.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
+      const rewriteModelAvailable = installedModels?.some(model => model.name === DEFAULT_QUERY_REWRITE_MODEL)
      if (!rewriteModelAvailable) {
        logger.warn(`[RAG] Query rewrite model "${DEFAULT_QUERY_REWRITE_MODEL}" not available. Skipping query rewriting.`)
        const lastUserMessage = [...messages].reverse().find(msg => msg.role === 'user')
--- a/admin/app/services/rag_service.ts
+++ b/admin/app/services/rag_service.ts
@ -16,11 +16,13 @@ import { join, resolve, sep } from 'node:path'
 import KVStore from '#models/kv_store'
 import { ZIMExtractionService } from './zim_extraction_service.js'
 import { ZIM_BATCH_SIZE } from '../../constants/zim_extraction.js'
 import { ProcessAndEmbedFileResponse, ProcessZIMFileResponse, RAGResult, RerankedRAGResult } from '../../types/rag.js'
@inject()
 export class RagService {
  private qdrant: QdrantClient | null = null
  private qdrantInitPromise: Promise<void> | null = null
  private embeddingModelVerified = false
  public static UPLOADS_STORAGE_PATH = 'storage/kb_uploads'
  public static CONTENT_COLLECTION_NAME = 'nomad_knowledge_base'
  public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
@ -33,6 +35,7 @@ export class RagService {
  // Nomic Embed Text v1.5 uses task-specific prefixes for optimal performance
  public static SEARCH_DOCUMENT_PREFIX = 'search_document: '
  public static SEARCH_QUERY_PREFIX = 'search_query: '
  public static EMBEDDING_BATCH_SIZE = 8 // Conservative batch size for low-end hardware
  constructor(
    private dockerService: DockerService,
@ -75,6 +78,16 @@ export class RagService {
          },
        })
      }
      // Create payload indexes for faster filtering (idempotent — Qdrant ignores duplicates)
      await this.qdrant!.createPayloadIndex(collectionName, {
        field_name: 'source',
        field_schema: 'keyword',
      })
      await this.qdrant!.createPayloadIndex(collectionName, {
        field_name: 'content_type',
        field_schema: 'keyword',
      })
    } catch (error) {
      logger.error('Error ensuring Qdrant collection:', error)
      throw error
@ -148,14 +161,57 @@ export class RagService {
  /**
   * Domain abbreviations/acronyms (lower-case keys) mapped to the phrase that is
   * appended to a query when the abbreviation appears in it as a word.
   *
   * Built on a null-prototype object: lookups are keyed by words taken straight
   * from the user's query, and on an ordinary object literal a word such as
   * "constructor" or "__proto__" would resolve to an inherited Object.prototype
   * member (a truthy non-string) instead of `undefined`.
   */
  private static QUERY_EXPANSION_DICTIONARY: Record<string, string> = Object.assign<
    Record<string, string>,
    Record<string, string>
  >(Object.create(null), {
    'bob': 'bug out bag',
    'bov': 'bug out vehicle',
    'bol': 'bug out location',
    'edc': 'every day carry',
    'mre': 'meal ready to eat',
    'shtf': 'shit hits the fan',
    'teotwawki': 'the end of the world as we know it',
    'opsec': 'operational security',
    'ifak': 'individual first aid kit',
    'ghb': 'get home bag',
    // NOTE(review): "get home in" looks truncated — confirm the intended expansion.
    'ghi': 'get home in',
    'wrol': 'without rule of law',
    'emp': 'electromagnetic pulse',
    'ham': 'ham amateur radio',
    'nbr': 'nuclear biological radiological',
    'cbrn': 'chemical biological radiological nuclear',
    'sar': 'search and rescue',
    'comms': 'communications radio',
    'fifo': 'first in first out',
    'mylar': 'mylar bag food storage',
    'paracord': 'paracord 550 cord',
    'ferro': 'ferro rod fire starter',
    'bivvy': 'bivvy bivy emergency shelter',
    'bdu': 'battle dress uniform',
    'gmrs': 'general mobile radio service',
    'frs': 'family radio service',
    'nbc': 'nuclear biological chemical',
  })
  /**
   * Preprocesses a query to improve retrieval by expanding it with context.
   * This helps match documents even when using different terminology.
   * TODO: We could probably move this to a separate QueryPreprocessor class if it grows more complex, but for now it's manageable here.
   */
  private preprocessQuery(query: string): string {
-    // Future: this is a placeholder for more advanced query expansion techniques.
+    let expanded = query.trim()
-    // For now, we simply trim whitespace. Improvements could include:
+
-    // - Synonym expansion using a thesaurus
+    // Expand known domain abbreviations/acronyms
-    // - Adding related terms based on domain knowledge
+    const words = expanded.toLowerCase().split(/\s+/)
-    // - Using a language model to rephrase or elaborate the query
+    const expansions: string[] = []
-    const expanded = query.trim()
+
    for (const word of words) {
      const cleaned = word.replace(/[^\w]/g, '')
      if (RagService.QUERY_EXPANSION_DICTIONARY[cleaned]) {
        expansions.push(RagService.QUERY_EXPANSION_DICTIONARY[cleaned])
      }
    }
    if (expansions.length > 0) {
      expanded = `${expanded} ${expansions.join(' ')}`
      logger.debug(`[RAG] Query expanded with domain terms: "${expanded}"`)
    }
    logger.debug(`[RAG] Original query: "${query}"`)
    logger.debug(`[RAG] Preprocessed query: "${expanded}"`)
    return expanded
@ -187,22 +243,26 @@ export class RagService {
        RagService.EMBEDDING_DIMENSION
      )
-      const allModels = await this.ollamaService.getModels(true)
+      if (!this.embeddingModelVerified) {
-      const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const allModels = await this.ollamaService.getModels(true)
        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
-      if (!embeddingModel) {
+        if (!embeddingModel) {
-        try {
+          try {
-          const downloadResult = await this.ollamaService.downloadModel(RagService.EMBEDDING_MODEL)
+            const downloadResult = await this.ollamaService.downloadModel(RagService.EMBEDDING_MODEL)
-          if (!downloadResult.success) {
+            if (!downloadResult.success) {
-            throw new Error(downloadResult.message || 'Unknown error during model download')
+              throw new Error(downloadResult.message || 'Unknown error during model download')
            }
          } catch (modelError) {
            logger.error(
              `[RAG] Embedding model ${RagService.EMBEDDING_MODEL} not found locally and failed to download:`,
              modelError
            )
            this.embeddingModelVerified = false
            return null
          }
        } catch (modelError) {
          logger.error(
            `[RAG] Embedding model ${RagService.EMBEDDING_MODEL} not found locally and failed to download:`,
            modelError
          )
          return null
        }
        this.embeddingModelVerified = true
      }
      // TokenChunker uses character-based tokenization (1 char = 1 token)
@ -227,7 +287,8 @@ export class RagService {
      const ollamaClient = await this.ollamaService.getClient()
-      const embeddings: number[][] = []
+      // Prepare all chunk texts with prefix and truncation
      const prefixedChunks: string[] = []
      for (let i = 0; i < chunks.length; i++) {
        let chunkText = chunks[i]
@ -237,7 +298,6 @@ export class RagService {
        const estimatedTokens = this.estimateTokenCount(withPrefix)
        if (estimatedTokens > RagService.MAX_SAFE_TOKENS) {
          // This should be rare - log for debugging if it's occurring frequently
          const prefixTokens = this.estimateTokenCount(prefixText)
          const maxTokensForText = RagService.MAX_SAFE_TOKENS - prefixTokens
          logger.warn(
@ -246,17 +306,30 @@ export class RagService {
          chunkText = this.truncateToTokenLimit(chunkText, maxTokensForText)
        }
-        logger.debug(`[RAG] Generating embedding for chunk ${i + 1}/${chunks.length}`)
+        prefixedChunks.push(RagService.SEARCH_DOCUMENT_PREFIX + chunkText)
      }
-        const response = await ollamaClient.embeddings({
+      // Batch embed chunks for performance
      const embeddings: number[][] = []
      const batchSize = RagService.EMBEDDING_BATCH_SIZE
      const totalBatches = Math.ceil(prefixedChunks.length / batchSize)
      for (let batchIdx = 0; batchIdx < totalBatches; batchIdx++) {
        const batchStart = batchIdx * batchSize
        const batch = prefixedChunks.slice(batchStart, batchStart + batchSize)
        logger.debug(`[RAG] Embedding batch ${batchIdx + 1}/${totalBatches} (${batch.length} chunks)`)
        const response = await ollamaClient.embed({
          model: RagService.EMBEDDING_MODEL,
-          prompt: RagService.SEARCH_DOCUMENT_PREFIX + chunkText,
+          input: batch,
        })
-        embeddings.push(response.embedding)
+        embeddings.push(...response.embeddings)
        if (onProgress) {
-          await onProgress(((i + 1) / chunks.length) * 100)
+          const progress = ((batchStart + batch.length) / prefixedChunks.length) * 100
          await onProgress(progress)
        }
      }
@ -395,14 +468,7 @@ export class RagService {
    deleteAfterEmbedding: boolean,
    batchOffset?: number,
    onProgress?: (percent: number) => Promise<void>
-  ): Promise<{
+  ): Promise<ProcessZIMFileResponse> {
    success: boolean
    message: string
    chunks?: number
    hasMoreBatches?: boolean
    articlesProcessed?: number
    totalArticles?: number
  }> {
    const zimExtractionService = new ZIMExtractionService()
    // Process in batches to avoid lock timeout
@ -540,14 +606,7 @@ export class RagService {
    deleteAfterEmbedding: boolean = false,
    batchOffset?: number,
    onProgress?: (percent: number) => Promise<void>
-  ): Promise<{
+  ): Promise<ProcessAndEmbedFileResponse> {
    success: boolean
    message: string
    chunks?: number
    hasMoreBatches?: boolean
    articlesProcessed?: number
    totalArticles?: number
  }> {
    try {
      const fileType = determineFileType(filepath)
      logger.debug(`[RAG] Processing file: ${filepath} (detected type: ${fileType})`)
@ -631,14 +690,18 @@ export class RagService {
        return []
      }
-      const allModels = await this.ollamaService.getModels(true)
+      if (!this.embeddingModelVerified) {
-      const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const allModels = await this.ollamaService.getModels(true)
        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
-      if (!embeddingModel) {
+        if (!embeddingModel) {
-        logger.warn(
+          logger.warn(
-          `[RAG] ${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
+            `[RAG] ${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
-        )
+          )
-        return []
+          this.embeddingModelVerified = false
          return []
        }
        this.embeddingModelVerified = true
      }
      // Preprocess query for better matching
@ -666,9 +729,9 @@ export class RagService {
        return []
      }
-      const response = await ollamaClient.embeddings({
+      const response = await ollamaClient.embed({
        model: RagService.EMBEDDING_MODEL,
-        prompt: prefixedQuery,
+        input: [prefixedQuery],
      })
      // Perform semantic search with a higher limit to enable reranking
@ -678,7 +741,7 @@ export class RagService {
      )
      const searchResults = await this.qdrant!.search(RagService.CONTENT_COLLECTION_NAME, {
-        vector: response.embedding,
+        vector: response.embeddings[0],
        limit: searchLimit,
        score_threshold: scoreThreshold,
        with_payload: true,
@ -687,7 +750,7 @@ export class RagService {
      logger.debug(`[RAG] Found ${searchResults.length} results above threshold ${scoreThreshold}`)
      // Map results with metadata for reranking
-      const resultsWithMetadata = searchResults.map((result) => ({
+      const resultsWithMetadata: RAGResult[] = searchResults.map((result) => ({
        text: (result.payload?.text as string) || '',
        score: result.score,
        keywords: (result.payload?.keywords as string) || '',
@ -700,6 +763,7 @@ export class RagService {
        hierarchy: result.payload?.hierarchy as string | undefined,
        document_id: result.payload?.document_id as string | undefined,
        content_type: result.payload?.content_type as string | undefined,
        source: result.payload?.source as string | undefined,
      }))
      const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
@ -711,8 +775,11 @@ export class RagService {
        )
      })
      // Apply source diversity penalty to avoid all results from the same document
      const diverseResults = this.applySourceDiversity(rerankedResults)
      // Return top N results with enhanced metadata
-      return rerankedResults.slice(0, limit).map((result) => ({
+      return diverseResults.slice(0, limit).map((result) => ({
        text: result.text,
        score: result.finalScore,
        metadata: {
@ -748,34 +815,10 @@ export class RagService {
   * outweigh the overhead.
   */
  private rerankResults(
-    results: Array<{
+    results: Array<RAGResult>,
      text: string
      score: number
      keywords: string
      chunk_index: number
      created_at: number
      article_title?: string
      section_title?: string
      full_title?: string
      hierarchy?: string
      document_id?: string
      content_type?: string
    }>,
    queryKeywords: string[],
    originalQuery: string
-  ): Array<{
+  ): Array<RerankedRAGResult> {
    text: string
    score: number
    finalScore: number
    chunk_index: number
    created_at: number
    article_title?: string
    section_title?: string
    full_title?: string
    hierarchy?: string
    document_id?: string
    content_type?: string
  }> {
    return results
      .map((result) => {
        let finalScore = result.score
@ -851,6 +894,37 @@ export class RagService {
      .sort((a, b) => b.finalScore - a.finalScore)
  }
  /**
   * Down-weights results that share a source with results listed before them.
   *
   * Walks the input in order; the n-th repeat of a source (n = how many results
   * from that source were already visited) has its finalScore multiplied by
   * 0.85^n. Sources are identified by document_id, then source, then the
   * literal 'unknown'. The adjusted list is returned sorted by finalScore,
   * highest first.
   */
  private applySourceDiversity(
    results: Array<RerankedRAGResult>
  ) {
    const DIVERSITY_PENALTY = 0.85
    const seenPerSource = new Map<string, number>()

    const penalize = (entry: RerankedRAGResult) => {
      const sourceKey = entry.document_id || entry.source || 'unknown'
      const timesSeen = seenPerSource.get(sourceKey) || 0
      const diverseScore = entry.finalScore * Math.pow(DIVERSITY_PENALTY, timesSeen)
      seenPerSource.set(sourceKey, timesSeen + 1)

      // The first hit from a source is unpenalized, so there is nothing to report for it.
      if (timesSeen > 0) {
        logger.debug(
          `[RAG] Source diversity penalty for "${sourceKey}": ${entry.finalScore.toFixed(4)} → ${diverseScore.toFixed(4)} (seen ${timesSeen}x)`
        )
      }

      return { ...entry, finalScore: diverseScore }
    }

    const penalized = results.map(penalize)
    penalized.sort((left, right) => right.finalScore - left.finalScore)
    return penalized
  }
  /**
   * Retrieve all unique source files that have been stored in the knowledge base.
   * @returns Array of unique full source paths
@ -866,12 +940,12 @@ export class RagService {
      let offset: string | number | null | Record<string, unknown> = null
      const batchSize = 100
-      // Scroll through all points in the collection
+      // Scroll through all points in the collection (only fetch source field)
      do {
        const scrollResult = await this.qdrant!.scroll(RagService.CONTENT_COLLECTION_NAME, {
          limit: batchSize,
          offset: offset,
-          with_payload: true,
+          with_payload: ['source'],
          with_vector: false,
        })
--- a/admin/constants/ollama.ts
+++ b/admin/constants/ollama.ts
@ -64,6 +64,16 @@ export const FALLBACK_RECOMMENDED_OLLAMA_MODELS: NomadOllamaModel[] = [
 export const DEFAULT_QUERY_REWRITE_MODEL = 'qwen2.5:3b' // default to qwen2.5 for query rewriting with good balance of text task performance and resource usage
 /**
  * RAG context budget tiers keyed by model size (billions of parameters).
  * Tiers are listed in ascending order; the first tier whose `maxParams` is at
  * least the model's size applies. Smaller models get overwhelmed by too much
  * context, so they are given fewer results and a token cap.
  * A `maxTokens` of 0 means no token cap.
  */
 export const RAG_CONTEXT_LIMITS: Array<{ maxParams: number; maxResults: number; maxTokens: number }> = [
  // Models up to 3B
  { maxParams: 3, maxResults: 2, maxTokens: 1000 },
  // Models above 3B, up to 8B
  { maxParams: 8, maxResults: 4, maxTokens: 2500 },
  // Everything larger (no token cap)
  { maxParams: Number.POSITIVE_INFINITY, maxResults: 5, maxTokens: 0 },
 ]
 export const SYSTEM_PROMPTS = {
  default: `
 Format all responses using markdown for better readability. Vanilla markdown or GitHub-flavored markdown is preferred.
@ -113,7 +123,7 @@ Ensure that your suggestions are comma-seperated with no conjunctions like "and"
 Do not use line breaks, new lines, or extra spacing to separate the suggestions.
 Format: suggestion1, suggestion2, suggestion3
 `,
-  title_generation: `You are a title generator. Given the start of a conversation, generate a concise, descriptive title under 60 characters. Return ONLY the title text with no quotes, punctuation wrapping, or extra formatting.`,
+  title_generation: `You are a title generator. Given the start of a conversation, generate a concise, descriptive title under 50 characters. Return ONLY the title text with no quotes, punctuation wrapping, or extra formatting.`,
  query_rewrite: `
 You are a query rewriting assistant. Your task is to reformulate the user's latest question to include relevant context from the conversation history.
--- a/admin/docs/release-notes.md
+++ b/admin/docs/release-notes.md
@ -4,13 +4,16 @@
 ### Features
 - **AI Assistant**: Added improved user guidance for troubleshooting GPU pass-through issues
 - **AI Assistant**: The last used model is now automatically selected when a new chat is started
 - **Settings**: Nomad now automatically performs nightly checks for available app updates, and users can select and apply updates from the Apps page in Settings
 ### Bug Fixes
 - **Settings**: Fixed an issue where the AI Assistant settings page would be shown in navigation even if the AI Assistant was not installed, thus causing 404 errors when clicked
 - **Security**: Path traversal and SSRF mitigations
 - **AI Assistant**: Fixed an issue that was causing intermittent failures saving chat session titles
 ### Improvements
 - **AI Assistant**: Extensive performance improvements and improved RAG intelligence/context usage
 ## Version 1.28.0 - March 5, 2026
--- a/admin/types/rag.ts
+++ b/admin/types/rag.ts
@ -5,3 +5,32 @@ export type EmbedJobWithProgress = {
  progress: number
  status: string
 }
 /**
  * Result returned by RagService's file processing/embedding entry point.
  * Only `success` and `message` are always present; the remaining fields are optional.
  */
 export type ProcessAndEmbedFileResponse = {
  /** Whether processing succeeded. */
  success: boolean
  /** Human-readable status or error description. */
  message: string
  // NOTE(review): presumably the number of chunks embedded in this call — confirm against the producer.
  chunks?: number
  // NOTE(review): presumably set when the file is processed in batches and another batch remains — confirm.
  hasMoreBatches?: boolean
  // NOTE(review): presumably batch progress counters (articles done so far) — confirm.
  articlesProcessed?: number
  // NOTE(review): presumably the total article count of the file being processed — confirm.
  totalArticles?: number
 }
 /** Result of ZIM file processing; identical in shape to ProcessAndEmbedFileResponse. */
 export type ProcessZIMFileResponse = ProcessAndEmbedFileResponse
 /**
  * One vector-search hit, flattened from the search result and its payload
  * into the shape consumed by reranking.
  */
 export type RAGResult = {
  /** Chunk text from the point payload; empty string when the payload has none. */
  text: string
  /** Similarity score reported by the vector search. */
  score: number
  /** Keywords stored in the payload; empty string when absent. */
  keywords: string
  // NOTE(review): presumably the chunk's position within its document — confirm against the embedding code.
  chunk_index: number
  // NOTE(review): presumably a creation timestamp; the unit is not visible here — confirm.
  created_at: number
  // Optional title/structure metadata copied from the payload when present.
  article_title?: string
  section_title?: string
  full_title?: string
  hierarchy?: string
  /** Payload document identifier; preferred key when grouping results by source. */
  document_id?: string
  content_type?: string
  /** Payload source value; used to group results by source when `document_id` is absent. */
  source?: string
 }
 /**
  * A RAGResult after reranking: the `keywords` field is dropped and a
  * `finalScore` is added alongside the original `score`.
  */
 export type RerankedRAGResult = Omit<RAGResult, 'keywords'> & {
  /** Score after reranking adjustments; results are ordered by this value, highest first. */
  finalScore: number
 }