From c8ce28a84fffd63ceb202c187995428c15ef0162 Mon Sep 17 00:00:00 2001
From: Henry Estela <hestela@mailbox.org>
Date: Fri, 13 Mar 2026 12:01:48 -0700
Subject: [PATCH] fix(ai-chat): ingestion of documents with openai and add
 cleanup button

Added a "Clean Up Failed" button to the Processing Queue section of the
Knowledge Base, since documents that fail to process tend to get stuck
and then can't be cleared.

Fixed document ingestion when using an OpenAI-compatible server.

Updated some text in the chat and chat settings, since users need to
manually download models when using a non-Ollama remote GPU server.
---
 admin/app/controllers/rag_controller.ts       | 13 +++++
 admin/app/jobs/embed_file_job.ts              | 47 +++++++++++++++++++
 admin/app/services/ollama_service.ts          | 28 +++++++++--
 admin/app/services/rag_service.ts             | 23 ++++++---
 .../inertia/components/chat/ChatInterface.tsx |  2 +-
 .../components/chat/KnowledgeBaseModal.tsx    | 26 +++++++++-
 admin/inertia/lib/api.ts                      | 14 ++++++
 admin/inertia/pages/settings/models.tsx       |  3 ++
 admin/start/routes.ts                         |  2 +
 admin/types/rag.ts                            |  1 +
 10 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/admin/app/controllers/rag_controller.ts b/admin/app/controllers/rag_controller.ts
index ce94876..55b5ef6 100644
--- a/admin/app/controllers/rag_controller.ts
+++ b/admin/app/controllers/rag_controller.ts
@@ -74,6 +74,19 @@ export default class RagController {
     return response.status(200).json({ message: result.message })
   }
 
+  /** Responds 200 with the embed jobs returned by EmbedFileJob.listFailedJobs(). */
+  public async getFailedJobs({ response }: HttpContext) {
+    return response.status(200).json(await EmbedFileJob.listFailedJobs())
+  }
+
+  /** Clears failed embed jobs and returns a human-readable summary alongside the raw counts. */
+  public async cleanupFailedJobs({ response }: HttpContext) {
+    const result = await EmbedFileJob.cleanupFailedJobs()
+    const plural = (count: number) => (count !== 1 ? 's' : '')
+    const files = result.filesDeleted > 0 ? `, deleted ${result.filesDeleted} file${plural(result.filesDeleted)}` : ''
+    return response.status(200).json({ message: `Cleaned up ${result.cleaned} failed job${plural(result.cleaned)}${files}.`, ...result })
+  }
+
   public async scanAndSync({ response }: HttpContext) {
     try {
       const syncResult = await this.ragService.scanAndSyncStorage()
diff --git a/admin/app/jobs/embed_file_job.ts b/admin/app/jobs/embed_file_job.ts
index 0c0a12f..83a61bf 100644
--- a/admin/app/jobs/embed_file_job.ts
+++ b/admin/app/jobs/embed_file_job.ts
@@ -6,6 +6,7 @@ import { DockerService } from '#services/docker_service'
 import { OllamaService } from '#services/ollama_service'
 import { createHash } from 'crypto'
 import logger from '@adonisjs/core/services/logger'
+import fs from 'node:fs/promises'
 
 export interface EmbedFileJobParams {
   filePath: string
@@ -232,6 +233,52 @@ export class EmbedFileJob {
     }
   }
 
+  /**
+   * Lists embed jobs whose payload carries `status === 'failed'` (flag reportedly set in handle()'s catch block — not visible here).
+   * Scans the 'waiting', 'delayed' and 'failed' states because a job that failed an
+   * attempt may be queued for a retry rather than sitting in the terminal 'failed' state.
+   */
+  static async listFailedJobs(): Promise<EmbedJobWithProgress[]> {
+    const queue = new QueueService().getQueue(this.queue)
+    const candidates = await queue.getJobs(['waiting', 'delayed', 'failed'])
+
+    const failed: EmbedJobWithProgress[] = []
+    for (const job of candidates) {
+      const data = job.data as EmbedFileJobParams & { status?: string; error?: string }
+      if (data.status !== 'failed') continue
+      const { fileName, filePath, error } = data
+      failed.push({ jobId: job.id!.toString(), fileName, filePath, progress: 0, status: 'failed', error })
+    }
+    return failed
+  }
+
+  /** Removes all jobs flagged `status === 'failed'` plus their uploaded files; per-job errors are logged, not fatal. */
+  static async cleanupFailedJobs(): Promise<{ cleaned: number; filesDeleted: number }> {
+    const allJobs = await new QueueService().getQueue(this.queue).getJobs(['waiting', 'delayed', 'failed'])
+    let cleaned = 0
+    let filesDeleted = 0
+    for (const job of allJobs.filter((candidate) => (candidate.data as any).status === 'failed')) {
+      const filePath = (job.data as EmbedFileJobParams).filePath
+      try {
+        if (filePath && filePath.includes(RagService.UPLOADS_STORAGE_PATH)) {
+          await fs.unlink(filePath)
+          filesDeleted++
+        }
+      } catch (error) {
+        // An already-missing upload is fine; any other I/O error is surfaced in the log.
+        if ((error as NodeJS.ErrnoException)?.code !== 'ENOENT') logger.warn(`[EmbedFileJob] Could not delete ${filePath}: ${String(error)}`)
+      }
+      try {
+        await job.remove()
+        cleaned++
+      } catch (error) {
+        logger.warn(`[EmbedFileJob] Could not remove failed job ${job.id}: ${String(error)}`)
+      }
+    }
+    logger.info(`[EmbedFileJob] Cleaned up ${cleaned} failed jobs, deleted ${filesDeleted} files`)
+    return { cleaned, filesDeleted }
+  }
+
   static async getStatus(filePath: string): Promise<{
     exists: boolean
     status?: string
diff --git a/admin/app/services/ollama_service.ts b/admin/app/services/ollama_service.ts
index e4aa72a..faff9cd 100644
--- a/admin/app/services/ollama_service.ts
+++ b/admin/app/services/ollama_service.ts
@@ -51,6 +51,7 @@ export class OllamaService {
   private openai: OpenAI | null = null
   private baseUrl: string | null = null
   private initPromise: Promise<void> | null = null
+  private isOllamaNative: boolean | null = null
 
   constructor() {}
 
@@ -107,6 +108,18 @@ export class OllamaService {
         return { success: true, message: 'Model is already installed.' }
       }
 
+      // Model pulling is an Ollama-only operation. Non-Ollama backends (LM Studio, llama.cpp, etc.)
+      // return HTTP 200 for unknown endpoints, so the pull would appear to succeed but do nothing.
+      if (this.isOllamaNative === false) {
+        logger.warn(
+          `[OllamaService] Non-Ollama backend detected — skipping model pull for "${model}". Load the model manually in your AI host.`
+        )
+        return {
+          success: false,
+          message: `Model "${model}" is not available in your AI host. Please load it manually (model pulling is only supported for Ollama backends).`,
+        }
+      }
+
       // Stream pull via Ollama native API
       const pullResponse = await axios.post(
         `${this.baseUrl}/api/pull`,
@@ -298,12 +311,19 @@ export class OllamaService {
         { model, input },
         { timeout: 60000 }
       )
+      // Some backends (e.g. LM Studio) return HTTP 200 for unknown endpoints with an incompatible
+      // body — validate explicitly before accepting the result.
+      if (!Array.isArray(response.data?.embeddings)) {
+        throw new Error('Invalid /api/embed response — missing embeddings array')
+      }
       return { embeddings: response.data.embeddings }
     } catch {
-      // Fall back to OpenAI-compatible /v1/embeddings (processes one at a time then batches)
+      // Fall back to OpenAI-compatible /v1/embeddings
+      // Explicitly request float format — some backends (e.g. LM Studio) don't reliably
+      // implement the base64 encoding the OpenAI SDK requests by default.
       logger.info('[OllamaService] /api/embed unavailable, falling back to /v1/embeddings')
-      const results = await this.openai.embeddings.create({ model, input })
-      return { embeddings: results.data.map((e) => e.embedding) }
+      const results = await this.openai.embeddings.create({ model, input, encoding_format: 'float' })
+      return { embeddings: results.data.map((e) => e.embedding as number[]) }
     }
   }
 
@@ -320,11 +340,13 @@ export class OllamaService {
       if (!Array.isArray(response.data?.models)) {
         throw new Error('Not an Ollama-compatible /api/tags response')
       }
+      this.isOllamaNative = true
       const models: NomadInstalledModel[] = response.data.models
       if (includeEmbeddings) return models
       return models.filter((m) => !m.name.includes('embed'))
     } catch {
       // Fall back to the OpenAI-compatible /v1/models endpoint (LM Studio, llama.cpp, etc.)
+      this.isOllamaNative = false
       logger.info('[OllamaService] /api/tags unavailable, falling back to /v1/models')
       try {
         const modelList = await this.openai!.models.list()
diff --git a/admin/app/services/rag_service.ts b/admin/app/services/rag_service.ts
index 167be35..216caa1 100644
--- a/admin/app/services/rag_service.ts
+++ b/admin/app/services/rag_service.ts
@@ -23,15 +23,18 @@ export class RagService {
   private qdrant: QdrantClient | null = null
   private qdrantInitPromise: Promise<void> | null = null
   private embeddingModelVerified = false
+  private resolvedEmbeddingModel: string | null = null
   public static UPLOADS_STORAGE_PATH = 'storage/kb_uploads'
   public static CONTENT_COLLECTION_NAME = 'nomad_knowledge_base'
   public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
   public static EMBEDDING_DIMENSION = 768 // Nomic Embed Text v1.5 dimension is 768
   public static MODEL_CONTEXT_LENGTH = 2048 // nomic-embed-text has 2K token context
-  public static MAX_SAFE_TOKENS = 1800 // Leave buffer for prefix and tokenization variance
-  public static TARGET_TOKENS_PER_CHUNK = 1700 // Target 1700 tokens per chunk for embedding
+  public static MAX_SAFE_TOKENS = 1600 // Leave buffer for prefix and tokenization variance
+  public static TARGET_TOKENS_PER_CHUNK = 1500 // Target 1500 tokens per chunk for embedding
   public static PREFIX_TOKEN_BUDGET = 10 // Reserve ~10 tokens for prefixes
-  public static CHAR_TO_TOKEN_RATIO = 3 // Approximate chars per token
+  public static CHAR_TO_TOKEN_RATIO = 2 // Conservative chars-per-token estimate; technical docs
+                                         // (numbers, symbols, abbreviations) tokenize denser
+                                         // than plain prose (~3), so 2 avoids context overflows
   // Nomic Embed Text v1.5 uses task-specific prefixes for optimal performance
   public static SEARCH_DOCUMENT_PREFIX = 'search_document: '
   public static SEARCH_QUERY_PREFIX = 'search_query: '
@@ -245,7 +248,9 @@ export class RagService {
 
       if (!this.embeddingModelVerified) {
         const allModels = await this.ollamaService.getModels(true)
-        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const embeddingModel =
+          allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) ??
+          allModels.find((model) => model.name.toLowerCase().includes('nomic-embed-text'))
 
         if (!embeddingModel) {
           try {
@@ -262,6 +267,7 @@ export class RagService {
             return null
           }
         }
+        this.resolvedEmbeddingModel = embeddingModel?.name ?? RagService.EMBEDDING_MODEL
         this.embeddingModelVerified = true
       }
 
@@ -318,7 +324,7 @@ export class RagService {
 
         logger.debug(`[RAG] Embedding batch ${batchIdx + 1}/${totalBatches} (${batch.length} chunks)`)
 
-        const response = await this.ollamaService.embed(RagService.EMBEDDING_MODEL, batch)
+        const response = await this.ollamaService.embed(this.resolvedEmbeddingModel ?? RagService.EMBEDDING_MODEL, batch)
 
         embeddings.push(...response.embeddings)
 
@@ -687,7 +693,9 @@ export class RagService {
 
       if (!this.embeddingModelVerified) {
         const allModels = await this.ollamaService.getModels(true)
-        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const embeddingModel =
+          allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) ??
+          allModels.find((model) => model.name.toLowerCase().includes('nomic-embed-text'))
 
         if (!embeddingModel) {
           logger.warn(
@@ -696,6 +704,7 @@ export class RagService {
           this.embeddingModelVerified = false
           return []
         }
+        this.resolvedEmbeddingModel = embeddingModel.name
         this.embeddingModelVerified = true
       }
 
@@ -722,7 +731,7 @@ export class RagService {
         return []
       }
 
-      const response = await this.ollamaService.embed(RagService.EMBEDDING_MODEL, [prefixedQuery])
+      const response = await this.ollamaService.embed(this.resolvedEmbeddingModel ?? RagService.EMBEDDING_MODEL, [prefixedQuery])
 
       // Perform semantic search with a higher limit to enable reranking
       const searchLimit = limit * 3 // Get more results for reranking
diff --git a/admin/inertia/components/chat/ChatInterface.tsx b/admin/inertia/components/chat/ChatInterface.tsx
index ffdd017..57f9d64 100644
--- a/admin/inertia/components/chat/ChatInterface.tsx
+++ b/admin/inertia/components/chat/ChatInterface.tsx
@@ -213,7 +213,7 @@ export default function ChatInterface({
           <p className="text-text-primary">
             This will dispatch a background download job for{' '}
             <span className="font-mono font-medium">{DEFAULT_QUERY_REWRITE_MODEL}</span> and may take some time to complete. The model
-            will be used to rewrite queries for improved RAG retrieval performance.
+            will be used to rewrite queries for improved RAG retrieval performance. Note that download is only supported when using Ollama. If using an OpenAI API interface, please download the model with that software.
           </p>
         </StyledModal>
       </div>
diff --git a/admin/inertia/components/chat/KnowledgeBaseModal.tsx b/admin/inertia/components/chat/KnowledgeBaseModal.tsx
index 38a2948..508cf1e 100644
--- a/admin/inertia/components/chat/KnowledgeBaseModal.tsx
+++ b/admin/inertia/components/chat/KnowledgeBaseModal.tsx
@@ -68,6 +68,17 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
     },
   })
 
+  // Deletes failed embed jobs server-side. NOTE(review): `result` may be undefined per the API type — confirm that still means success.
+  const cleanupFailedMutation = useMutation({
+    mutationFn: () => api.cleanupFailedEmbedJobs(),
+    onSuccess: (result) => {
+      const message = result?.message || 'Failed jobs cleaned up.'
+      addNotification({ type: 'success', message })
+      queryClient.invalidateQueries({ queryKey: ['failedEmbedJobs'] })
+    },
+    onError: (err: any) => void addNotification({ type: 'error', message: err?.message || 'Failed to clean up jobs.' }),
+  })
+
   const syncMutation = useMutation({
     mutationFn: () => api.syncRAGStorage(),
     onSuccess: (data) => {
@@ -207,7 +218,20 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
             </div>
           </div>
           <div className="my-8">
-            <ActiveEmbedJobs withHeader={true} />
+            <div className="flex items-center justify-between mb-4">
+              <StyledSectionHeader title="Processing Queue" className="!mb-0" />
+              <StyledButton
+                variant="danger"
+                size="md"
+                icon="IconTrash"
+                onClick={() => cleanupFailedMutation.mutate()}
+                loading={cleanupFailedMutation.isPending}
+                disabled={cleanupFailedMutation.isPending}
+              >
+                Clean Up Failed
+              </StyledButton>
+            </div>
+            <ActiveEmbedJobs withHeader={false} />
           </div>
 
           <div className="my-12">
diff --git a/admin/inertia/lib/api.ts b/admin/inertia/lib/api.ts
index 8b865a7..aa28c3c 100644
--- a/admin/inertia/lib/api.ts
+++ b/admin/inertia/lib/api.ts
@@ -428,6 +428,20 @@ class API {
     })()
   }
 
+  /** Fetches the embed jobs currently flagged as failed (GET /rag/failed-jobs). */
+  async getFailedEmbedJobs(): Promise<EmbedJobWithProgress[] | undefined> {
+    const fetchFailed = async () =>
+      (await this.client.get<EmbedJobWithProgress[]>('/rag/failed-jobs')).data
+    return catchInternal(fetchFailed)()
+  }
+
+  /** Asks the server to delete all failed embed jobs (DELETE /rag/failed-jobs). */
+  async cleanupFailedEmbedJobs(): Promise<{ message: string; cleaned: number; filesDeleted: number } | undefined> {
+    type CleanupResult = { message: string; cleaned: number; filesDeleted: number }
+    const requestCleanup = async () => (await this.client.delete<CleanupResult>('/rag/failed-jobs')).data
+    return catchInternal(requestCleanup)()
+  }
+
   async getStoredRAGFiles() {
     return catchInternal(async () => {
       const response = await this.client.get<{ files: string[] }>('/rag/files')
diff --git a/admin/inertia/pages/settings/models.tsx b/admin/inertia/pages/settings/models.tsx
index c1d8bc4..2c3d652 100644
--- a/admin/inertia/pages/settings/models.tsx
+++ b/admin/inertia/pages/settings/models.tsx
@@ -376,6 +376,9 @@ export default function ModelsPage(props: {
           <ActiveModelDownloads withHeader />
 
           <StyledSectionHeader title="Models" className="mt-12 mb-4" />
+          <p className="text-sm text-desert-stone mb-4">
+            Model downloading is only supported when using a local Ollama backend. If you are connected to a remote AI host (e.g. LM Studio), download models directly in that application.
+          </p>
           <div className="flex justify-start items-center gap-3 mt-4">
             <Input
               name="search"
diff --git a/admin/start/routes.ts b/admin/start/routes.ts
index ab86629..dcc3174 100644
--- a/admin/start/routes.ts
+++ b/admin/start/routes.ts
@@ -131,6 +131,8 @@ router
     router.get('/files', [RagController, 'getStoredFiles'])
     router.delete('/files', [RagController, 'deleteFile'])
     router.get('/active-jobs', [RagController, 'getActiveJobs'])
+    router.get('/failed-jobs', [RagController, 'getFailedJobs'])
+    router.delete('/failed-jobs', [RagController, 'cleanupFailedJobs'])
     router.get('/job-status', [RagController, 'getJobStatus'])
     router.post('/sync', [RagController, 'scanAndSync'])
   })
diff --git a/admin/types/rag.ts b/admin/types/rag.ts
index 1d429ea..e84f349 100644
--- a/admin/types/rag.ts
+++ b/admin/types/rag.ts
@@ -4,6 +4,7 @@ export type EmbedJobWithProgress = {
   filePath: string
   progress: number
   status: string
+  error?: string
 }
 
 export type ProcessAndEmbedFileResponse = {