feat(RAG): initial beta with preprocessing, embedding, semantic retrieval, and ctx passage

This commit is contained in:
Jake Turner 2026-02-01 23:59:21 +00:00
parent 1923cd4cde
commit d1f40663d3
10 changed files with 612 additions and 60 deletions

View File

@ -41,16 +41,17 @@ export default class OllamaController {
if (lastUserMessage) {
// Search for relevant context in the knowledge base
// Using lower threshold (0.3) with improved hybrid search
const relevantDocs = await this.ragService.searchSimilarDocuments(
lastUserMessage.content,
5, // Retrieve top 5 most relevant chunks
0.7 // Minimum similarity score of 0.7
0.3 // Minimum similarity score of 0.3 (lowered from 0.7 for better recall)
)
// If relevant context is found, inject as a system message
if (relevantDocs.length > 0) {
const contextText = relevantDocs
.map((doc, idx) => `[Context ${idx + 1}]\n${doc.text}`)
.map((doc, idx) => `[Context ${idx + 1}] (Relevance: ${(doc.score * 100).toFixed(1)}%)\n${doc.text}`)
.join('\n\n')
const systemMessage = {

View File

@ -1,9 +1,12 @@
import { RagService } from '#services/rag_service'
import { EmbedFileJob } from '#jobs/embed_file_job'
import { inject } from '@adonisjs/core'
import type { HttpContext } from '@adonisjs/core/http'
import app from '@adonisjs/core/services/app'
import { randomBytes } from 'node:crypto'
import { sanitizeFilename } from '../utils/fs.js'
import { stat } from 'node:fs/promises'
import { getJobStatusSchema } from '#validators/rag'
@inject()
export default class RagController {
@ -19,19 +22,48 @@ export default class RagController {
const sanitizedName = sanitizeFilename(uploadedFile.clientName)
const fileName = `${sanitizedName}-${randomSuffix}.${uploadedFile.extname || 'txt'}`
const fullPath = app.makePath('storage/uploads', fileName)
const fullPath = app.makePath(RagService.UPLOADS_STORAGE_PATH, fileName)
await uploadedFile.move(app.makePath('storage/uploads'), {
await uploadedFile.move(app.makePath(RagService.UPLOADS_STORAGE_PATH), {
name: fileName,
})
// Don't await this - process in background
this.ragService.processAndEmbedFile(fullPath)
// Get file size for tracking
let fileSize: number | undefined = undefined
try {
const stats = await stat(fullPath)
fileSize = stats.size
} catch (error) {
// Not critical if we can't get file size, just swallow the error
}
return response.status(200).json({
message: 'File has been uploaded and queued for processing.',
file_path: `/uploads/${fileName}`,
// Dispatch background job for embedding
const result = await EmbedFileJob.dispatch({
filePath: fullPath,
fileName,
fileSize,
})
return response.status(202).json({
message: result.message,
jobId: result.jobId,
fileName,
filePath: `/${RagService.UPLOADS_STORAGE_PATH}/${fileName}`,
alreadyProcessing: !result.created,
})
}
public async getJobStatus({ request, response }: HttpContext) {
const reqData = await request.validateUsing(getJobStatusSchema)
const fullPath = app.makePath(RagService.UPLOADS_STORAGE_PATH, reqData.filePath)
const status = await EmbedFileJob.getStatus(fullPath)
if (!status.exists) {
return response.status(404).json({ error: 'Job not found for this file' })
}
return response.status(200).json(status)
}
public async getStoredFiles({ response }: HttpContext) {

View File

@ -0,0 +1,161 @@
import { Job } from 'bullmq'
import { QueueService } from '#services/queue_service'
import { RagService } from '#services/rag_service'
import { DockerService } from '#services/docker_service'
import { OllamaService } from '#services/ollama_service'
import { createHash } from 'crypto'
import logger from '@adonisjs/core/services/logger'
export interface EmbedFileJobParams {
  filePath: string
  fileName: string
  fileSize?: number
}

/**
 * Background job that chunks and embeds an uploaded file into the
 * knowledge base via RagService.
 *
 * Job ids are derived deterministically from the file path, so the same
 * file cannot be queued twice concurrently (BullMQ deduplicates on jobId).
 */
export class EmbedFileJob {
  /** BullMQ queue this job runs on. */
  static get queue() {
    return 'file-embeddings'
  }

  /** Job name used when registering the worker handler. */
  static get key() {
    return 'embed-file'
  }

  /** Derive a stable 16-hex-char job id from the file path. */
  static getJobId(filePath: string): string {
    return createHash('sha256').update(filePath).digest('hex').slice(0, 16)
  }

  /**
   * Worker entry point: embeds the file and records progress/status on the
   * job's data so getStatus() can report it. Rethrows on failure so BullMQ
   * applies the retry/backoff policy configured in dispatch().
   */
  async handle(job: Job) {
    const { filePath, fileName } = job.data as EmbedFileJobParams
    logger.info(`[EmbedFileJob] Starting embedding process for: ${fileName}`)

    // Services are constructed manually because the worker process runs
    // outside the HTTP request's IoC container.
    const dockerService = new DockerService()
    const ollamaService = new OllamaService()
    const ragService = new RagService(dockerService, ollamaService)

    try {
      // Mark the job as started before the (potentially long) embedding run
      await job.updateProgress(0)
      await job.updateData({
        ...job.data,
        status: 'processing',
        startedAt: Date.now(),
      })

      logger.info(`[EmbedFileJob] Processing file: ${filePath}`)
      // Process and embed the file
      const result = await ragService.processAndEmbedFile(filePath)
      if (!result.success) {
        logger.error(`[EmbedFileJob] Failed to process file ${fileName}: ${result.message}`)
        throw new Error(result.message)
      }

      // Update progress complete
      await job.updateProgress(100)
      await job.updateData({
        ...job.data,
        status: 'completed',
        completedAt: Date.now(),
        chunks: result.chunks,
      })

      logger.info(
        `[EmbedFileJob] Successfully embedded ${result.chunks} chunks from file: ${fileName}`
      )
      return {
        success: true,
        fileName,
        filePath,
        chunks: result.chunks,
        message: `Successfully embedded ${result.chunks} chunks`,
      }
    } catch (error) {
      logger.error(`[EmbedFileJob] Error embedding file ${fileName}:`, error)
      await job.updateData({
        ...job.data,
        status: 'failed',
        failedAt: Date.now(),
        error: error instanceof Error ? error.message : 'Unknown error',
      })
      throw error
    }
  }

  /** Look up the job (if any) previously dispatched for this file path. */
  static async getByFilePath(filePath: string): Promise<Job | undefined> {
    const queueService = new QueueService()
    const queue = queueService.getQueue(this.queue)
    const jobId = this.getJobId(filePath)
    return await queue.getJob(jobId)
  }

  /**
   * Enqueue an embedding job for a file. Returns the created (or already
   * existing) job plus a `created` flag the controller uses to report
   * "already processing".
   */
  static async dispatch(params: EmbedFileJobParams) {
    const queueService = new QueueService()
    const queue = queueService.getQueue(this.queue)
    const jobId = this.getJobId(params.filePath)

    try {
      const job = await queue.add(this.key, params, {
        jobId,
        attempts: 3,
        backoff: {
          type: 'exponential',
          delay: 5000, // Delay 5 seconds before retrying
        },
        removeOnComplete: { count: 50 }, // Keep last 50 completed jobs for history
        removeOnFail: { count: 20 }, // Keep last 20 failed jobs for debugging
      })
      logger.info(`[EmbedFileJob] Dispatched embedding job for file: ${params.fileName}`)
      return {
        job,
        created: true,
        jobId,
        message: `File queued for embedding: ${params.fileName}`,
      }
    } catch (error) {
      // `error` is `unknown` in a catch clause under strict TS; narrow before
      // touching `.message` (the original accessed error.message unguarded,
      // which fails under useUnknownInCatchVariables and throws on non-Error
      // rejection values).
      const message = error instanceof Error ? error.message : String(error)
      if (message.includes('job already exists')) {
        const existing = await queue.getJob(jobId)
        logger.info(`[EmbedFileJob] Job already exists for file: ${params.fileName}`)
        return {
          job: existing,
          created: false,
          jobId,
          message: `Embedding job already exists for: ${params.fileName}`,
        }
      }
      throw error
    }
  }

  /**
   * Report the status of the embedding job for a file, merging BullMQ's
   * state with the richer status string stored on job data.
   */
  static async getStatus(filePath: string): Promise<{
    exists: boolean
    status?: string
    progress?: number
    chunks?: number
    error?: string
  }> {
    const job = await this.getByFilePath(filePath)
    if (!job) {
      return { exists: false }
    }
    const state = await job.getState()
    const data = job.data
    return {
      exists: true,
      status: data.status || state,
      progress: typeof job.progress === 'number' ? job.progress : undefined,
      chunks: data.chunks,
      error: data.error,
    }
  }
}

View File

@ -2,28 +2,115 @@ import { QdrantClient } from '@qdrant/js-client-rest'
import { DockerService } from './docker_service.js'
import { inject } from '@adonisjs/core'
import logger from '@adonisjs/core/services/logger'
import { chunk } from 'llm-chunk'
import { TokenChunker } from '@chonkiejs/core'
import sharp from 'sharp'
import { determineFileType, getFile } from '../utils/fs.js'
import { deleteFileIfExists, determineFileType, getFile } from '../utils/fs.js'
import { PDFParse } from 'pdf-parse'
import { createWorker } from 'tesseract.js'
import { fromBuffer } from 'pdf2pic'
import { OllamaService } from './ollama_service.js'
import { SERVICE_NAMES } from '../../constants/service_names.js'
import { removeStopwords } from 'stopword'
import { randomUUID } from 'node:crypto'
@inject()
export class RagService {
private qdrant: QdrantClient | null = null
private qdrantInitPromise: Promise<void> | null = null
public static CONTENT_COLLECTION_NAME = 'open-webui_knowledge' // This is the collection name OWUI uses for uploaded knowledge
public static UPLOADS_STORAGE_PATH = 'storage/kb_uploads'
public static CONTENT_COLLECTION_NAME = 'nomad_knowledge_base'
public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
public static EMBEDDING_DIMENSION = 768 // Nomic Embed Text v1.5 dimension is 768
public static MODEL_CONTEXT_LENGTH = 2048 // nomic-embed-text has 2K token context
public static MAX_SAFE_TOKENS = 1800 // Leave buffer for prefix and tokenization variance
public static TARGET_TOKENS_PER_CHUNK = 1700 // Target 1700 tokens per chunk for embedding
public static PREFIX_TOKEN_BUDGET = 10 // Reserve ~10 tokens for prefixes
public static CHAR_TO_TOKEN_RATIO = 3 // Approximate chars per token
// Nomic Embed Text v1.5 uses task-specific prefixes for optimal performance
public static SEARCH_DOCUMENT_PREFIX = 'search_document: '
public static SEARCH_QUERY_PREFIX = 'search_query: '
constructor(
private dockerService: DockerService,
private ollamaService: OllamaService
) {}
/**
* Estimates token count for text. This is a conservative approximation:
* - English text: ~1 token per 3 characters
* - Adds buffer for special characters and tokenization variance
*
* Note: This is approximate and realistic english
* tokenization is ~4 chars/token, but we use 3 here to be safe.
* Actual tokenization may differ, but being
* conservative prevents context length errors.
*/
private estimateTokenCount(text: string): number {
// This accounts for special characters, numbers, and punctuation
return Math.ceil(text.length / RagService.CHAR_TO_TOKEN_RATIO)
}
/**
* Truncates text to fit within token limit, preserving word boundaries.
* Ensures the text + prefix won't exceed the model's context window.
*/
private truncateToTokenLimit(text: string, maxTokens: number): string {
const estimatedTokens = this.estimateTokenCount(text)
if (estimatedTokens <= maxTokens) {
return text
}
// Calculate how many characters we can keep using our ratio
const maxChars = Math.floor(maxTokens * RagService.CHAR_TO_TOKEN_RATIO)
// Truncate at word boundary
let truncated = text.substring(0, maxChars)
const lastSpace = truncated.lastIndexOf(' ')
if (lastSpace > maxChars * 0.8) {
// If we found a space in the last 20%, use it
truncated = truncated.substring(0, lastSpace)
}
logger.warn(
`[RAG] Truncated text from ${text.length} to ${truncated.length} chars (est. ${estimatedTokens}${this.estimateTokenCount(truncated)} tokens)`
)
return truncated
}
/**
* Preprocesses a query to improve retrieval by expanding it with context.
* This helps match documents even when using different terminology.
*/
private preprocessQuery(query: string): string {
// Future: this is a placeholder for more advanced query expansion techniques.
// For now, we simply trim whitespace. Improvements could include:
// - Synonym expansion using a thesaurus
// - Adding related terms based on domain knowledge
// - Using a language model to rephrase or elaborate the query
const expanded = query.trim()
logger.debug(`[RAG] Original query: "${query}"`)
logger.debug(`[RAG] Preprocessed query: "${expanded}"`)
return expanded
}
/**
* Extract keywords from query for hybrid search
*/
private extractKeywords(query: string): string[] {
const split = query.split(' ')
const noStopWords = removeStopwords(split)
// Future: This is basic normalization, could be improved with stemming/lemmatization later
const keywords = noStopWords
.map((word) => word.replace(/[^\w]/g, '').toLowerCase())
.filter((word) => word.length > 2)
return [...new Set(keywords)]
}
private async _initializeQdrantClient() {
if (!this.qdrantInitPromise) {
this.qdrantInitPromise = (async () => {
@ -84,43 +171,87 @@ export class RagService {
throw new Error(`${RagService.EMBEDDING_MODEL} does not exist and could not be downloaded.`)
}
const chunks = chunk(text, {
// These settings should provide a good balance between context and precision
minLength: 512,
maxLength: 1024,
overlap: 200,
// TokenChunker uses character-based tokenization (1 char = 1 token)
// We need to convert our embedding model's token counts to character counts
// since nomic-embed-text tokenizer uses ~3 chars per token
const targetCharsPerChunk = Math.floor(RagService.TARGET_TOKENS_PER_CHUNK * RagService.CHAR_TO_TOKEN_RATIO)
const overlapChars = Math.floor(150 * RagService.CHAR_TO_TOKEN_RATIO)
const chunker = await TokenChunker.create({
chunkSize: targetCharsPerChunk,
chunkOverlap: overlapChars,
})
if (!chunks || chunks.length === 0) {
const chunkResults = await chunker.chunk(text)
if (!chunkResults || chunkResults.length === 0) {
throw new Error('No text chunks generated for embedding.')
}
const embeddings: number[][] = []
// Extract text from chunk results
const chunks = chunkResults.map((chunk) => chunk.text)
const ollamaClient = await this.ollamaService.getClient()
for (const chunkText of chunks) {
const embeddings: number[][] = []
for (let i = 0; i < chunks.length; i++) {
let chunkText = chunks[i]
// Final safety check: ensure chunk + prefix fits
const prefixText = RagService.SEARCH_DOCUMENT_PREFIX
const withPrefix = prefixText + chunkText
const estimatedTokens = this.estimateTokenCount(withPrefix)
if (estimatedTokens > RagService.MAX_SAFE_TOKENS) {
// This should be rare - log for debugging if it's occurring frequently
const prefixTokens = this.estimateTokenCount(prefixText)
const maxTokensForText = RagService.MAX_SAFE_TOKENS - prefixTokens
logger.warn(
`[RAG] Chunk ${i} estimated at ${estimatedTokens} tokens (${chunkText.length} chars), truncating to ${maxTokensForText} tokens`
)
chunkText = this.truncateToTokenLimit(chunkText, maxTokensForText)
}
logger.debug(`[RAG] Generating embedding for chunk ${i + 1}/${chunks.length}`)
const response = await ollamaClient.embeddings({
model: RagService.EMBEDDING_MODEL,
prompt: chunkText,
prompt: RagService.SEARCH_DOCUMENT_PREFIX + chunkText,
})
embeddings.push(response.embedding)
}
const points = chunks.map((chunkText, index) => ({
id: `${Date.now()}_${index}`,
vector: embeddings[index],
payload: {
...metadata,
text: chunkText,
chunk_index: index,
},
}))
const timestamp = Date.now()
const points = chunks.map((chunkText, index) => {
// Extract keywords for hybrid search
const keywords = this.extractKeywords(chunkText)
logger.debug(`[RAG] Extracted keywords for chunk ${index}: [${keywords.join(', ')}]`)
return {
id: randomUUID(), // qdrant requires either uuid or unsigned int
vector: embeddings[index],
payload: {
...metadata,
text: chunkText,
chunk_index: index,
total_chunks: chunks.length,
keywords: keywords.join(' '), // Store as space-separated string for text search
char_count: chunkText.length,
created_at: timestamp,
source: metadata.source || 'unknown'
},
}
})
await this.qdrant!.upsert(RagService.CONTENT_COLLECTION_NAME, { points })
logger.debug(`[RAG] Successfully embedded and stored ${chunks.length} chunks`)
logger.debug(`[RAG] First chunk preview: "${chunks[0].substring(0, 100)}..."`)
return { chunks: chunks.length }
} catch (error) {
logger.error('Error embedding text:', error)
console.error(error)
logger.error('[RAG] Error embedding text:', error)
return null
}
}
@ -195,7 +326,7 @@ export class RagService {
* This includes text extraction, chunking, embedding, and storing in Qdrant.
*/
public async processAndEmbedFile(
filepath: string
filepath: string // Should already be the full path to the uploaded file
): Promise<{ success: boolean; message: string; chunks?: number }> {
try {
const fileType = determineFileType(filepath)
@ -233,7 +364,13 @@ export class RagService {
return { success: false, message: 'No text could be extracted from the file.' }
}
const embedResult = await this.embedAndStoreText(extractedText, {})
const embedResult = await this.embedAndStoreText(extractedText, {
source: filepath
})
// Cleanup the file from disk
logger.info(`[RAG] Embedding complete, deleting uploaded file: ${filepath}`)
await deleteFileIfExists(filepath)
return {
success: true,
@ -248,60 +385,230 @@ export class RagService {
/**
* Search for documents similar to the query text in the Qdrant knowledge base.
* Returns the most relevant text chunks based on semantic similarity.
* Uses a hybrid approach combining semantic similarity and keyword matching.
* Implements adaptive thresholds and result reranking for optimal retrieval.
* @param query - The search query text
* @param limit - Maximum number of results to return (default: 5)
* @param scoreThreshold - Minimum similarity score threshold (default: 0.7)
* @param scoreThreshold - Minimum similarity score threshold (default: 0.3, much lower than before)
* @returns Array of relevant text chunks with their scores
*/
public async searchSimilarDocuments(
query: string,
limit: number = 5,
scoreThreshold: number = 0.7
): Promise<Array<{ text: string; score: number }>> {
scoreThreshold: number = 0.3 // Lower default threshold - was 0.7, now 0.3
): Promise<Array<{ text: string; score: number; metadata?: Record<string, any> }>> {
try {
logger.debug(`[RAG] Starting similarity search for query: "${query}"`)
await this._ensureCollection(
RagService.CONTENT_COLLECTION_NAME,
RagService.EMBEDDING_DIMENSION
)
// Check if collection has any points
const collectionInfo = await this.qdrant!.getCollection(RagService.CONTENT_COLLECTION_NAME)
const pointCount = collectionInfo.points_count || 0
logger.debug(`[RAG] Knowledge base contains ${pointCount} document chunks`)
if (pointCount === 0) {
logger.debug('[RAG] Knowledge base is empty. Could not perform search.')
return []
}
const allModels = await this.ollamaService.getModels(true)
const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
if (!embeddingModel) {
logger.warn(
`${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
`[RAG] ${RagService.EMBEDDING_MODEL} not found. Cannot perform similarity search.`
)
return []
}
// Generate embedding for the query
// Preprocess query for better matching
const processedQuery = this.preprocessQuery(query)
const keywords = this.extractKeywords(processedQuery)
logger.debug(`[RAG] Extracted keywords: [${keywords.join(', ')}]`)
// Generate embedding for the query with search_query prefix
const ollamaClient = await this.ollamaService.getClient()
// Ensure query doesn't exceed token limit
const prefixTokens = this.estimateTokenCount(RagService.SEARCH_QUERY_PREFIX)
const maxQueryTokens = RagService.MAX_SAFE_TOKENS - prefixTokens
const truncatedQuery = this.truncateToTokenLimit(processedQuery, maxQueryTokens)
const prefixedQuery = RagService.SEARCH_QUERY_PREFIX + truncatedQuery
logger.debug(`[RAG] Generating embedding with prefix: "${RagService.SEARCH_QUERY_PREFIX}"`)
// Validate final token count
const queryTokenCount = this.estimateTokenCount(prefixedQuery)
if (queryTokenCount > RagService.MAX_SAFE_TOKENS) {
logger.error(
`[RAG] Query too long even after truncation: ${queryTokenCount} tokens (max: ${RagService.MAX_SAFE_TOKENS})`
)
return []
}
const response = await ollamaClient.embeddings({
model: RagService.EMBEDDING_MODEL,
prompt: query,
prompt: prefixedQuery,
})
// Search for similar vectors in Qdrant
// Perform semantic search with a higher limit to enable reranking
const searchLimit = limit * 3 // Get more results for reranking
logger.debug(
`[RAG] Searching for top ${searchLimit} semantic matches (threshold: ${scoreThreshold})`
)
const searchResults = await this.qdrant!.search(RagService.CONTENT_COLLECTION_NAME, {
vector: response.embedding,
limit: limit,
limit: searchLimit,
score_threshold: scoreThreshold,
with_payload: true,
})
console.log("Got search results:", searchResults);
logger.debug(`[RAG] Found ${searchResults.length} results above threshold ${scoreThreshold}`)
return searchResults.map((result) => ({
// Map results with metadata for reranking
const resultsWithMetadata = searchResults.map((result) => ({
text: (result.payload?.text as string) || '',
score: result.score,
keywords: (result.payload?.keywords as string) || '',
chunk_index: (result.payload?.chunk_index as number) || 0,
created_at: (result.payload?.created_at as number) || 0,
}))
const rerankedResults = this.rerankResults(resultsWithMetadata, keywords, query)
logger.debug(`[RAG] Top 3 results after reranking:`)
rerankedResults.slice(0, 3).forEach((result, idx) => {
logger.debug(
`[RAG] ${idx + 1}. Score: ${result.finalScore.toFixed(4)} (semantic: ${result.score.toFixed(4)}) - "${result.text.substring(0, 100)}..."`
)
})
// Return top N results
return rerankedResults.slice(0, limit).map((result) => ({
text: result.text,
score: result.finalScore,
metadata: {
chunk_index: result.chunk_index,
created_at: result.created_at,
semantic_score: result.score,
},
}))
} catch (error) {
logger.error('Error searching similar documents:', error)
logger.error('[RAG] Error searching similar documents:', error)
return []
}
}
  /**
   * Rerank search results using hybrid scoring that combines:
   * 1. Semantic similarity score (primary signal)
   * 2. Keyword overlap bonus (conservative, quality-gated)
   * 3. Direct term matches (conservative)
   *
   * Tries to boost only already-relevant results, not promote
   * low-quality results just because they have keyword matches.
   * Boosts are scaled by the base semantic score and the combined
   * total is clamped to 1.0, so keyword matches can reorder near
   * ties but never dominate the semantic signal.
   *
   * Future: this is a decent feature-based approach, but we could
   * switch to a python-based reranker in the future if the benefits
   * outweigh the overhead.
   *
   * @param results - Semantic search hits (text, score, stored keywords, chunk metadata)
   * @param queryKeywords - Keywords extracted from the preprocessed query
   * @param originalQuery - The raw user query, used for direct term matching
   * @returns Results annotated with `finalScore`, sorted descending by it
   */
  private rerankResults(
    results: Array<{
      text: string
      score: number
      keywords: string
      chunk_index: number
      created_at: number
    }>,
    queryKeywords: string[],
    originalQuery: string
  ): Array<{
    text: string
    score: number
    finalScore: number
    chunk_index: number
    created_at: number
  }> {
    return results
      .map((result) => {
        let finalScore = result.score
        // Quality gate: Only apply boosts if semantic score is reasonable
        // Try to prevent promoting irrelevant results that just happen to have keyword matches
        const MIN_SEMANTIC_THRESHOLD = 0.35
        if (result.score < MIN_SEMANTIC_THRESHOLD) {
          // For low-scoring results, use semantic score as-is
          // This prevents false positives from keyword gaming
          logger.debug(
            `[RAG] Skipping boost for low semantic score: ${result.score.toFixed(3)} (threshold: ${MIN_SEMANTIC_THRESHOLD})`
          )
          return {
            ...result,
            finalScore,
          }
        }
        // Boost score based on keyword overlap (diminishing returns - overlap goes down, so does boost)
        // A keyword "matches" if it appears in the chunk's stored keyword list
        // OR anywhere in the chunk text itself.
        const docKeywords = result.keywords
          .toLowerCase()
          .split(' ')
          .filter((k) => k.length > 0)
        const matchingKeywords = queryKeywords.filter(
          (kw) =>
            docKeywords.includes(kw.toLowerCase()) ||
            result.text.toLowerCase().includes(kw.toLowerCase())
        )
        const keywordOverlap = matchingKeywords.length / Math.max(queryKeywords.length, 1)
        // Use square root for diminishing returns: 100% overlap = sqrt(1.0) = 1.0, 25% = 0.5
        // Then scale conservatively (max 10% boost instead of 20%)
        const keywordBoost = Math.sqrt(keywordOverlap) * 0.1 * result.score
        if (keywordOverlap > 0) {
          logger.debug(
            `[RAG] Keyword overlap: ${matchingKeywords.length}/${queryKeywords.length} - Boost: ${keywordBoost.toFixed(3)}`
          )
        }
        // Boost if original query terms appear in text (case-insensitive)
        // Scale boost proportionally to base score to avoid over-promoting weak matches
        const queryTerms = originalQuery
          .toLowerCase()
          .split(/\s+/)
          .filter((t) => t.length > 3)
        const directMatches = queryTerms.filter((term) =>
          result.text.toLowerCase().includes(term)
        ).length
        if (queryTerms.length > 0) {
          const directMatchRatio = directMatches / queryTerms.length
          // Conservative boost: max 7.5% of the base score
          const directMatchBoost = Math.sqrt(directMatchRatio) * 0.075 * result.score
          if (directMatches > 0) {
            logger.debug(
              `[RAG] Direct term matches: ${directMatches}/${queryTerms.length} - Boost: ${directMatchBoost.toFixed(3)}`
            )
            finalScore += directMatchBoost
          }
        }
        // keywordBoost is applied last (the direct-match boost was added above)
        // and the combined score is clamped to the valid similarity range [0, 1]
        finalScore = Math.min(1.0, finalScore + keywordBoost)
        return {
          ...result,
          finalScore,
        }
      })
      .sort((a, b) => b.finalScore - a.finalScore)
  }
/**
* Retrieve all unique source files that have been stored in the knowledge base.
* @returns Array of unique source file identifiers
@ -328,9 +635,8 @@ export class RagService {
// Extract unique source values from payloads
scrollResult.points.forEach((point) => {
const metadata = point.payload?.metadata
if (metadata && typeof metadata === 'object' && 'source' in metadata) {
const source = metadata.source as string
const source = point.payload?.source
if (source && typeof source === 'string') {
sources.add(source)
}
})
@ -338,7 +644,13 @@ export class RagService {
offset = scrollResult.next_page_offset || null
} while (offset !== null)
return Array.from(sources)
const sourcesArr = Array.from(sources)
// The source is a full path - only extract the filename for display
return sourcesArr.map((src) => {
const parts = src.split(/[/\\]/)
return parts[parts.length - 1] // Return the last part as filename
})
} catch (error) {
logger.error('Error retrieving stored files:', error)
return []

View File

@ -0,0 +1,7 @@
import vine from '@vinejs/vine'
/**
 * Validates the payload for the RAG job-status endpoint.
 */
export const getJobStatusSchema = vine.compile(
  vine.object({
    // Restrict to a bare file name: the controller joins this value onto the
    // uploads directory with app.makePath, so allowing path separators or a
    // dots-only name would let a caller probe arbitrary paths (path traversal).
    filePath: vine.string().regex(/^(?!\.+$)[^/\\]+$/),
  })
)

View File

@ -5,6 +5,7 @@ import queueConfig from '#config/queue'
import { RunDownloadJob } from '#jobs/run_download_job'
import { DownloadModelJob } from '#jobs/download_model_job'
import { RunBenchmarkJob } from '#jobs/run_benchmark_job'
import { EmbedFileJob } from '#jobs/embed_file_job'
export default class QueueWork extends BaseCommand {
static commandName = 'queue:work'
@ -90,10 +91,12 @@ export default class QueueWork extends BaseCommand {
handlers.set(RunDownloadJob.key, new RunDownloadJob())
handlers.set(DownloadModelJob.key, new DownloadModelJob())
handlers.set(RunBenchmarkJob.key, new RunBenchmarkJob())
handlers.set(EmbedFileJob.key, new EmbedFileJob())
queues.set(RunDownloadJob.key, RunDownloadJob.queue)
queues.set(DownloadModelJob.key, DownloadModelJob.queue)
queues.set(RunBenchmarkJob.key, RunBenchmarkJob.queue)
queues.set(EmbedFileJob.key, EmbedFileJob.queue)
return [handlers, queues]
}
@ -107,6 +110,7 @@ export default class QueueWork extends BaseCommand {
[RunDownloadJob.queue]: 3,
[DownloadModelJob.queue]: 2, // Lower concurrency for resource-intensive model downloads
[RunBenchmarkJob.queue]: 1, // Run benchmarks one at a time for accurate results
[EmbedFileJob.queue]: 2, // Lower concurrency for embedding jobs, can be resource intensive
default: 3,
}

View File

@ -66,12 +66,20 @@ export const SYSTEM_PROMPTS = {
- Use tables when presenting structured data.
`,
rag_context: (context: string) => `
You have access to the following relevant information from the knowledge base. Use this context to provide accurate and informed responses when relevant:
You have access to relevant information from the knowledge base. This context has been retrieved based on semantic similarity to the user's question.
[Context]
[Knowledge Base Context]
${context}
If the user's question is related to this context, incorporate it into your response. Otherwise, respond normally.
IMPORTANT INSTRUCTIONS:
1. If the user's question is directly related to the context above, use this information to provide accurate, detailed answers.
2. Always cite or reference the context when using it (e.g., "According to the information available..." or "Based on the knowledge base...").
3. If the context is only partially relevant, combine it with your general knowledge but be clear about what comes from the knowledge base.
4. If the context is not relevant to the user's question, you can respond using your general knowledge without forcing the context into your answer.
5. Never fabricate information that isn't in the context or your training data.
6. If you're unsure or the context doesn't contain enough information, acknowledge the limitations.
Format your response using markdown for readability.
`,
chat_suggestions: `
You are a helpful assistant that generates conversation starter suggestions for a survivalist/prepper using an AI assistant.

View File

@ -20,6 +20,7 @@
"@adonisjs/transmit": "^2.0.2",
"@adonisjs/transmit-client": "^1.0.0",
"@adonisjs/vite": "^4.0.0",
"@chonkiejs/core": "^0.0.7",
"@headlessui/react": "^2.2.4",
"@inertiajs/react": "^2.0.13",
"@markdoc/markdoc": "^0.5.2",
@ -42,7 +43,6 @@
"dockerode": "^4.0.7",
"edge.js": "^6.2.1",
"fast-xml-parser": "^5.2.5",
"llm-chunk": "^0.0.1",
"luxon": "^3.6.1",
"maplibre-gl": "^4.7.1",
"mysql2": "^3.14.1",
@ -60,6 +60,7 @@
"reflect-metadata": "^0.2.2",
"remark-gfm": "^4.0.1",
"sharp": "^0.34.5",
"stopword": "^3.1.5",
"systeminformation": "^5.27.14",
"tailwindcss": "^4.1.10",
"tar": "^7.5.6",
@ -83,6 +84,7 @@
"@types/node": "^22.15.18",
"@types/react": "^19.1.8",
"@types/react-dom": "^19.1.6",
"@types/stopword": "^2.0.3",
"eslint": "^9.26.0",
"hot-hook": "^0.4.0",
"prettier": "^3.5.3",
@ -1230,6 +1232,21 @@
"dev": true,
"license": "Apache-2.0"
},
"node_modules/@chonkiejs/chunk": {
"version": "0.9.3",
"resolved": "https://registry.npmjs.org/@chonkiejs/chunk/-/chunk-0.9.3.tgz",
"integrity": "sha512-uUOeoFGY3s6kzAoKskI50weZN0zvW3oLwUijA1uX7Wxuy9yZStF2IvGuXRigMgP2g/L85lsotYGkjpBMLjQnrg==",
"license": "MIT OR Apache-2.0"
},
"node_modules/@chonkiejs/core": {
"version": "0.0.7",
"resolved": "https://registry.npmjs.org/@chonkiejs/core/-/core-0.0.7.tgz",
"integrity": "sha512-R17OW9TT1x7B6lDKTCaMd6NluAObleN/cCQtUbMK2UcFOguJtQz/cL0n1t0AzJWBFMVgYP8EcqTFn/fcKhzPiA==",
"license": "MIT",
"dependencies": {
"@chonkiejs/chunk": "^0.9.3"
}
},
"node_modules/@colors/colors": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz",
@ -5084,6 +5101,13 @@
"dev": true,
"license": "MIT"
},
"node_modules/@types/stopword": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/@types/stopword/-/stopword-2.0.3.tgz",
"integrity": "sha512-hioMj0lOvISM+EDevf7ijG8EMbU+J3pj4SstCyfQC1t39uPYpAe7beSfBdU6c1d9jeECTQQtR3UJWtVoUO8Weg==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/supercluster": {
"version": "7.1.3",
"resolved": "https://registry.npmjs.org/@types/supercluster/-/supercluster-7.1.3.tgz",
@ -9866,12 +9890,6 @@
"url": "https://opencollective.com/parcel"
}
},
"node_modules/llm-chunk": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/llm-chunk/-/llm-chunk-0.0.1.tgz",
"integrity": "sha512-n9fHgsSiJb7vXZiC5c4XV6rme+tC7WX/cWH6EJvPPmMOMwOZ9xdg/U9LY5Qhmixd3K1PdRB0FVOdzoJF2HUZbg==",
"license": "MIT"
},
"node_modules/locate-path": {
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-7.2.0.tgz",
@ -13619,6 +13637,12 @@
"node": ">= 0.8"
}
},
"node_modules/stopword": {
"version": "3.1.5",
"resolved": "https://registry.npmjs.org/stopword/-/stopword-3.1.5.tgz",
"integrity": "sha512-OgLYGVFCNa430WOrj9tYZhQge5yg6vd6JsKredveAqEhdLVQkfrpnQIGjx0L9lLqzL4Kq4J8yNTcfQR/MpBwhg==",
"license": "MIT"
},
"node_modules/string_decoder": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",

View File

@ -52,6 +52,7 @@
"@types/node": "^22.15.18",
"@types/react": "^19.1.8",
"@types/react-dom": "^19.1.6",
"@types/stopword": "^2.0.3",
"eslint": "^9.26.0",
"hot-hook": "^0.4.0",
"prettier": "^3.5.3",
@ -71,6 +72,7 @@
"@adonisjs/transmit": "^2.0.2",
"@adonisjs/transmit-client": "^1.0.0",
"@adonisjs/vite": "^4.0.0",
"@chonkiejs/core": "^0.0.7",
"@headlessui/react": "^2.2.4",
"@inertiajs/react": "^2.0.13",
"@markdoc/markdoc": "^0.5.2",
@ -93,7 +95,6 @@
"dockerode": "^4.0.7",
"edge.js": "^6.2.1",
"fast-xml-parser": "^5.2.5",
"llm-chunk": "^0.0.1",
"luxon": "^3.6.1",
"maplibre-gl": "^4.7.1",
"mysql2": "^3.14.1",
@ -111,6 +112,7 @@
"reflect-metadata": "^0.2.2",
"remark-gfm": "^4.0.1",
"sharp": "^0.34.5",
"stopword": "^3.1.5",
"systeminformation": "^5.27.14",
"tailwindcss": "^4.1.10",
"tar": "^7.5.6",

View File

@ -119,6 +119,7 @@ router
.group(() => {
router.post('/upload', [RagController, 'upload'])
router.get('/files', [RagController, 'getStoredFiles'])
router.get('/job-status', [RagController, 'getJobStatus'])
})
.prefix('/api/rag')