Merge dd7d5b0230 into 5c92c89813

2026-03-28 03:29:25 +01:00 · 2026-03-26 00:21:17 +00:00 · 2026-03-26 00:21:17 +00:00 · 58121e2be6
commit 58121e2be6
parent 5c92c89813 dd7d5b0230
25 changed files with 1107 additions and 275 deletions
--- a/README.md
+++ b/README.md
@ -37,7 +37,7 @@ For more control over the installation process, copy and paste the [Docker Compo
 N.O.M.A.D. is a management UI ("Command Center") and API that orchestrates a collection of containerized tools and resources via [Docker](https://www.docker.com/). It handles installation, configuration, and updates for everything — so you don't have to.

 **Built-in capabilities include:**
- **AI Chat with Knowledge Base** — local AI chat powered by [Ollama](https://ollama.com/), with document upload and semantic search (RAG via [Qdrant](https://qdrant.tech/))
+- **AI Chat with Knowledge Base** — local AI chat powered by [Ollama](https://ollama.com/) or any OpenAI API-compatible server (such as LM Studio or llama.cpp), with document upload and semantic search (RAG via [Qdrant](https://qdrant.tech/))
 - **Information Library** — offline Wikipedia, medical references, ebooks, and more via [Kiwix](https://kiwix.org/)
 - **Education Platform** — Khan Academy courses with progress tracking via [Kolibri](https://learningequality.org/kolibri/)
 - **Offline Maps** — downloadable regional maps via [ProtoMaps](https://protomaps.com)
@ -89,6 +89,12 @@ To run LLM's and other included AI tools:

 Again, Project N.O.M.A.D. itself is quite lightweight - it's the tools and resources you choose to install with N.O.M.A.D. that will determine the specs required for your unique deployment

+#### Running AI models on a different host
+By default, N.O.M.A.D.'s installer will attempt to set up Ollama on the host when the AI Assistant is installed. However, if you would like to run the AI model on a different host, you can go to the settings of the AI Assistant and enter a URL for either an Ollama or an OpenAI-compatible API server (such as LM Studio).  
+Note that if you use Ollama on a different host, you must start the server with the option `OLLAMA_HOST=0.0.0.0`.  
+Ollama is the preferred way to use the AI Assistant, as it has features such as model download that the OpenAI API does not support. When using LM Studio, for example, you will have to download models through LM Studio itself.  
+You are responsible for setting up the Ollama or OpenAI-compatible server on the other host.
+
 ## Frequently Asked Questions (FAQ)
 For answers to common questions about Project N.O.M.A.D., please see our [FAQ](FAQ.md) page.

--- a/admin/app/controllers/easy_setup_controller.ts
+++ b/admin/app/controllers/easy_setup_controller.ts
@ -1,6 +1,7 @@
 import { SystemService } from '#services/system_service'
 import { ZimService } from '#services/zim_service'
 import { CollectionManifestService } from '#services/collection_manifest_service'
+import KVStore from '#models/kv_store'
 import { inject } from '@adonisjs/core'
 import type { HttpContext } from '@adonisjs/core/http'

@ -12,10 +13,14 @@ export default class EasySetupController {
  ) {}

  async index({ inertia }: HttpContext) {
-    const services = await this.systemService.getServices({ installedOnly: false })
+    const [services, remoteOllamaUrl] = await Promise.all([
+      this.systemService.getServices({ installedOnly: false }),
+      KVStore.getValue('ai.remoteOllamaUrl'),
+    ])
    return inertia.render('easy-setup/index', {
      system: {
        services: services,
+        remoteOllamaUrl: remoteOllamaUrl ?? '',
      },
    })
  }
--- a/admin/app/controllers/ollama_controller.ts
+++ b/admin/app/controllers/ollama_controller.ts
@ -1,18 +1,23 @@
 import { ChatService } from '#services/chat_service'
+import { DockerService } from '#services/docker_service'
 import { OllamaService } from '#services/ollama_service'
 import { RagService } from '#services/rag_service'
+import Service from '#models/service'
+import KVStore from '#models/kv_store'
 import { modelNameSchema } from '#validators/download'
 import { chatSchema, getAvailableModelsSchema } from '#validators/ollama'
 import { inject } from '@adonisjs/core'
 import type { HttpContext } from '@adonisjs/core/http'
 import { DEFAULT_QUERY_REWRITE_MODEL, RAG_CONTEXT_LIMITS, SYSTEM_PROMPTS } from '../../constants/ollama.js'
+import { SERVICE_NAMES } from '../../constants/service_names.js'
 import logger from '@adonisjs/core/services/logger'
-import type { Message } from 'ollama'
+type Message = { role: 'system' | 'user' | 'assistant'; content: string }

@inject()
 export default class OllamaController {
  constructor(
    private chatService: ChatService,
+    private dockerService: DockerService,
    private ollamaService: OllamaService,
    private ragService: RagService
  ) { }
@ -72,10 +77,10 @@ export default class OllamaController {
          const { maxResults, maxTokens } = this.getContextLimitsForModel(reqData.model)
          let trimmedDocs = relevantDocs.slice(0, maxResults)

-          // Apply token cap if set (estimate ~4 chars per token)
+          // Apply token cap if set (estimate ~3.5 chars per token)
          // Always include the first (most relevant) result — the cap only gates subsequent results
          if (maxTokens > 0) {
-            const charCap = maxTokens * 4
+            const charCap = maxTokens * 3.5
            let totalChars = 0
            trimmedDocs = trimmedDocs.filter((doc, idx) => {
              totalChars += doc.text.length
@ -103,6 +108,19 @@ export default class OllamaController {
        }
      }

+      // If system messages are large (e.g. due to RAG context), request a context window big
+      // enough to fit them. Ollama respects num_ctx per-request; LM Studio ignores it gracefully.
+      const systemChars = reqData.messages
+        .filter((m) => m.role === 'system')
+        .reduce((sum, m) => sum + m.content.length, 0)
+      const estimatedSystemTokens = Math.ceil(systemChars / 3.5)
+      let numCtx: number | undefined
+      if (estimatedSystemTokens > 3000) {
+        const needed = estimatedSystemTokens + 2048 // leave room for conversation + response
+        numCtx = [8192, 16384, 32768, 65536].find((n) => n >= needed) ?? 65536
+        logger.debug(`[OllamaController] Large system prompt (~${estimatedSystemTokens} tokens), requesting num_ctx: ${numCtx}`)
+      }
+
      // Check if the model supports "thinking" capability for enhanced response generation
      // If gpt-oss model, it requires a text param for "think" https://docs.ollama.com/api/chat
      const thinkingCapability = await this.ollamaService.checkModelHasThinking(reqData.model)
@ -124,7 +142,7 @@ export default class OllamaController {
      if (reqData.stream) {
        logger.debug(`[OllamaController] Initiating streaming response for model: "${reqData.model}" with think: ${think}`)
        // Headers already flushed above
-        const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think })
+        const stream = await this.ollamaService.chatStream({ ...ollamaRequest, think, numCtx })
        let fullContent = ''
        for await (const chunk of stream) {
          if (chunk.message?.content) {
@ -148,7 +166,7 @@ export default class OllamaController {
      }

      // Non-streaming (legacy) path
-      const result = await this.ollamaService.chat({ ...ollamaRequest, think })
+      const result = await this.ollamaService.chat({ ...ollamaRequest, think, numCtx })

      if (sessionId && result?.message?.content) {
        await this.chatService.addMessage(sessionId, 'assistant', result.message.content)
@ -171,6 +189,87 @@ export default class OllamaController {
    }
  }

+  async remoteStatus() {
+    const remoteUrl = await KVStore.getValue('ai.remoteOllamaUrl')
+    if (!remoteUrl) {
+      return { configured: false, connected: false }
+    }
+    try {
+      const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
+        signal: AbortSignal.timeout(3000),
+      })
+      return { configured: true, connected: testResponse.ok }
+    } catch {
+      return { configured: true, connected: false }
+    }
+  }
+
+  async configureRemote({ request, response }: HttpContext) {
+    const remoteUrl: string | null = request.input('remoteUrl', null)
+
+    const ollamaService = await Service.query().where('service_name', SERVICE_NAMES.OLLAMA).first()
+    if (!ollamaService) {
+      return response.status(404).send({ success: false, message: 'Ollama service record not found.' })
+    }
+
+    // Clear path: null or empty URL removes remote config and marks service as not installed
+    if (!remoteUrl || remoteUrl.trim() === '') {
+      await KVStore.clearValue('ai.remoteOllamaUrl')
+      ollamaService.installed = false
+      ollamaService.installation_status = 'idle'
+      await ollamaService.save()
+      return { success: true, message: 'Remote Ollama configuration cleared.' }
+    }
+
+    // Validate URL format
+    if (!remoteUrl.startsWith('http')) {
+      return response.status(400).send({
+        success: false,
+        message: 'Invalid URL. Must start with http:// or https://',
+      })
+    }
+
+    // Test connectivity via OpenAI-compatible /v1/models endpoint (works with Ollama, LM Studio, llama.cpp, etc.)
+    try {
+      const testResponse = await fetch(`${remoteUrl.replace(/\/$/, '')}/v1/models`, {
+        signal: AbortSignal.timeout(5000),
+      })
+      if (!testResponse.ok) {
+        return response.status(400).send({
+          success: false,
+          message: `Could not connect to ${remoteUrl} (HTTP ${testResponse.status}). Make sure the server is running and accessible. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
+        })
+      }
+    } catch (error) {
+      return response.status(400).send({
+        success: false,
+        message: `Could not connect to ${remoteUrl}. Make sure the server is running and reachable. For Ollama, start it with OLLAMA_HOST=0.0.0.0.`,
+      })
+    }
+
+    // Save remote URL and mark service as installed
+    await KVStore.setValue('ai.remoteOllamaUrl', remoteUrl.trim())
+    ollamaService.installed = true
+    ollamaService.installation_status = 'idle'
+    await ollamaService.save()
+
+    // Install Qdrant if not already installed (fire-and-forget)
+    const qdrantService = await Service.query().where('service_name', SERVICE_NAMES.QDRANT).first()
+    if (qdrantService && !qdrantService.installed) {
+      this.dockerService.createContainerPreflight(SERVICE_NAMES.QDRANT).catch((error) => {
+        logger.error('[OllamaController] Failed to start Qdrant preflight:', error)
+      })
+    }
+
+    // Mirror post-install side effects: disable suggestions, trigger docs discovery
+    await KVStore.setValue('chat.suggestionsEnabled', false)
+    this.ragService.discoverNomadDocs().catch((error) => {
+      logger.error('[OllamaController] Failed to discover Nomad docs:', error)
+    })
+
+    return { success: true, message: 'Remote Ollama configured.' }
+  }
+
  async deleteModel({ request }: HttpContext) {
    const reqData = await request.validateUsing(modelNameSchema)
    await this.ollamaService.deleteModel(reqData.model)
--- a/admin/app/controllers/rag_controller.ts
+++ b/admin/app/controllers/rag_controller.ts
@ -74,6 +74,19 @@ export default class RagController {
    return response.status(200).json({ message: result.message })
  }

+  public async getFailedJobs({ response }: HttpContext) {
+    const jobs = await EmbedFileJob.listFailedJobs()
+    return response.status(200).json(jobs)
+  }
+
+  public async cleanupFailedJobs({ response }: HttpContext) {
+    const result = await EmbedFileJob.cleanupFailedJobs()
+    return response.status(200).json({
+      message: `Cleaned up ${result.cleaned} failed job${result.cleaned !== 1 ? 's' : ''}${result.filesDeleted > 0 ? `, deleted ${result.filesDeleted} file${result.filesDeleted !== 1 ? 's' : ''}` : ''}.`,
+      ...result,
+    })
+  }
+
  public async scanAndSync({ response }: HttpContext) {
    try {
      const syncResult = await this.ragService.scanAndSyncStorage()
--- a/admin/app/controllers/settings_controller.ts
+++ b/admin/app/controllers/settings_controller.ts
@ -1,12 +1,12 @@
-import KVStore from '#models/kv_store';
-import { BenchmarkService } from '#services/benchmark_service';
-import { MapService } from '#services/map_service';
-import { OllamaService } from '#services/ollama_service';
-import { SystemService } from '#services/system_service';
-import { updateSettingSchema } from '#validators/settings';
-import { inject } from '@adonisjs/core';
+import KVStore from '#models/kv_store'
+import { BenchmarkService } from '#services/benchmark_service'
+import { MapService } from '#services/map_service'
+import { OllamaService } from '#services/ollama_service'
+import { SystemService } from '#services/system_service'
+import { updateSettingSchema } from '#validators/settings'
+import { inject } from '@adonisjs/core'
 import type { HttpContext } from '@adonisjs/core/http'
-import type { KVStoreKey } from '../../types/kv_store.js';
+import type { KVStoreKey } from '../../types/kv_store.js'

@inject()
 export default class SettingsController {
@ -18,47 +18,53 @@ export default class SettingsController {
  ) {}

  async system({ inertia }: HttpContext) {
-        const systemInfo = await this.systemService.getSystemInfo();
+    const systemInfo = await this.systemService.getSystemInfo()
    return inertia.render('settings/system', {
      system: {
-                info: systemInfo
-            }
-        });
+        info: systemInfo,
+      },
+    })
  }

  async apps({ inertia }: HttpContext) {
-        const services = await this.systemService.getServices({ installedOnly: false });
+    const services = await this.systemService.getServices({ installedOnly: false })
    return inertia.render('settings/apps', {
      system: {
-                services
-            }
-        });
+        services,
+      },
+    })
  }

  async legal({ inertia }: HttpContext) {
-        return inertia.render('settings/legal');
+    return inertia.render('settings/legal')
  }

  async support({ inertia }: HttpContext) {
-        return inertia.render('settings/support');
+    return inertia.render('settings/support')
  }

  async maps({ inertia }: HttpContext) {
-        const baseAssetsCheck = await this.mapService.ensureBaseAssets();
-        const regionFiles = await this.mapService.listRegions();
+    const baseAssetsCheck = await this.mapService.ensureBaseAssets()
+    const regionFiles = await this.mapService.listRegions()
    return inertia.render('settings/maps', {
      maps: {
        baseAssetsExist: baseAssetsCheck,
-                regionFiles: regionFiles.files
-            }
-        });
+        regionFiles: regionFiles.files,
+      },
+    })
  }

  async models({ inertia }: HttpContext) {
-        const availableModels = await this.ollamaService.getAvailableModels({ sort: 'pulls', recommendedOnly: false, query: null, limit: 15 });
-        const installedModels = await this.ollamaService.getModels();
+    const availableModels = await this.ollamaService.getAvailableModels({
+      sort: 'pulls',
+      recommendedOnly: false,
+      query: null,
+      limit: 15,
+    })
+    const installedModels = await this.ollamaService.getModels().catch(() => [])
    const chatSuggestionsEnabled = await KVStore.getValue('chat.suggestionsEnabled')
    const aiAssistantCustomName = await KVStore.getValue('ai.assistantCustomName')
+    const remoteOllamaUrl = await KVStore.getValue('ai.remoteOllamaUrl')
    return inertia.render('settings/models', {
      models: {
        availableModels: availableModels?.models || [],
@ -66,20 +72,21 @@ export default class SettingsController {
        settings: {
          chatSuggestionsEnabled: chatSuggestionsEnabled ?? false,
          aiAssistantCustomName: aiAssistantCustomName ?? '',
-                }
-            }
-        });
+          remoteOllamaUrl: remoteOllamaUrl ?? '',
+        },
+      },
+    })
  }

  async update({ inertia }: HttpContext) {
-        const updateInfo = await this.systemService.checkLatestVersion();
+    const updateInfo = await this.systemService.checkLatestVersion()
    return inertia.render('settings/update', {
      system: {
        updateAvailable: updateInfo.updateAvailable,
        latestVersion: updateInfo.latestVersion,
-                currentVersion: updateInfo.currentVersion
-            }
-        });
+        currentVersion: updateInfo.currentVersion,
+      },
+    })
  }

  async zim({ inertia }: HttpContext) {
@ -87,30 +94,30 @@ export default class SettingsController {
  }

  async zimRemote({ inertia }: HttpContext) {
-        return inertia.render('settings/zim/remote-explorer');
+    return inertia.render('settings/zim/remote-explorer')
  }

  async benchmark({ inertia }: HttpContext) {
-        const latestResult = await this.benchmarkService.getLatestResult();
-        const status = this.benchmarkService.getStatus();
+    const latestResult = await this.benchmarkService.getLatestResult()
+    const status = this.benchmarkService.getStatus()
    return inertia.render('settings/benchmark', {
      benchmark: {
        latestResult,
        status: status.status,
-                currentBenchmarkId: status.benchmarkId
-            }
-        });
+        currentBenchmarkId: status.benchmarkId,
+      },
+    })
  }

  async getSetting({ request, response }: HttpContext) {
-        const key = request.qs().key;
-        const value = await KVStore.getValue(key as KVStoreKey);
-        return response.status(200).send({ key, value });
+    const key = request.qs().key
+    const value = await KVStore.getValue(key as KVStoreKey)
+    return response.status(200).send({ key, value })
  }

  async updateSetting({ request, response }: HttpContext) {
-        const reqData = await request.validateUsing(updateSettingSchema);
-        await this.systemService.updateSetting(reqData.key, reqData.value);
-        return response.status(200).send({ success: true, message: 'Setting updated successfully' });
+    const reqData = await request.validateUsing(updateSettingSchema)
+    await this.systemService.updateSetting(reqData.key, reqData.value)
+    return response.status(200).send({ success: true, message: 'Setting updated successfully' })
  }
 }
--- a/admin/app/jobs/embed_file_job.ts
+++ b/admin/app/jobs/embed_file_job.ts
@ -6,6 +6,7 @@ import { DockerService } from '#services/docker_service'
 import { OllamaService } from '#services/ollama_service'
 import { createHash } from 'crypto'
 import logger from '@adonisjs/core/services/logger'
+import fs from 'node:fs/promises'

 export interface EmbedFileJobParams {
  filePath: string
@ -232,6 +233,52 @@ export class EmbedFileJob {
    }
  }

+  static async listFailedJobs(): Promise<EmbedJobWithProgress[]> {
+    const queueService = new QueueService()
+    const queue = queueService.getQueue(this.queue)
+    // Jobs that have failed at least once are in 'delayed' (retrying) or terminal 'failed' state.
+    // We identify them by job.data.status === 'failed' set in the catch block of handle().
+    const jobs = await queue.getJobs(['waiting', 'delayed', 'failed'])
+
+    return jobs
+      .filter((job) => (job.data as any).status === 'failed')
+      .map((job) => ({
+        jobId: job.id!.toString(),
+        fileName: (job.data as EmbedFileJobParams).fileName,
+        filePath: (job.data as EmbedFileJobParams).filePath,
+        progress: 0,
+        status: 'failed',
+        error: (job.data as any).error,
+      }))
+  }
+
+  static async cleanupFailedJobs(): Promise<{ cleaned: number; filesDeleted: number }> {
+    const queueService = new QueueService()
+    const queue = queueService.getQueue(this.queue)
+    const allJobs = await queue.getJobs(['waiting', 'delayed', 'failed'])
+    const failedJobs = allJobs.filter((job) => (job.data as any).status === 'failed')
+
+    let cleaned = 0
+    let filesDeleted = 0
+
+    for (const job of failedJobs) {
+      const filePath = (job.data as EmbedFileJobParams).filePath
+      if (filePath && filePath.includes(RagService.UPLOADS_STORAGE_PATH)) {
+        try {
+          await fs.unlink(filePath)
+          filesDeleted++
+        } catch {
+          // File may already be deleted — that's fine
+        }
+      }
+      await job.remove()
+      cleaned++
+    }
+
+    logger.info(`[EmbedFileJob] Cleaned up ${cleaned} failed jobs, deleted ${filesDeleted} files`)
+    return { cleaned, filesDeleted }
+  }
+
  static async getStatus(filePath: string): Promise<{
    exists: boolean
    status?: string
--- a/admin/app/services/docker_service.ts
+++ b/admin/app/services/docker_service.ts
@ -140,6 +140,11 @@ export class DockerService {
      return null
    }

+    if (serviceName === SERVICE_NAMES.OLLAMA) {
+      const remoteUrl = await KVStore.getValue('ai.remoteOllamaUrl')
+      if (remoteUrl) return remoteUrl
+    }
+
    const service = await Service.query()
      .where('service_name', serviceName)
      .andWhere('installed', true)
--- a/admin/app/services/ollama_service.ts
+++ b/admin/app/services/ollama_service.ts
@ -1,5 +1,7 @@
 import { inject } from '@adonisjs/core'
-import { ChatRequest, Ollama } from 'ollama'
+import OpenAI from 'openai'
+import type { ChatCompletionChunk, ChatCompletionMessageParam } from 'openai/resources/chat/completions.js'
+import type { Stream } from 'openai/streaming.js'
 import { NomadOllamaModel } from '../../types/ollama.js'
 import { FALLBACK_RECOMMENDED_OLLAMA_MODELS } from '../../constants/ollama.js'
 import fs from 'node:fs/promises'
@ -13,51 +15,93 @@ import Fuse, { IFuseOptions } from 'fuse.js'
 import { BROADCAST_CHANNELS } from '../../constants/broadcast.js'
 import env from '#start/env'
 import { NOMAD_API_DEFAULT_BASE_URL } from '../../constants/misc.js'
+import KVStore from '#models/kv_store'

 const NOMAD_MODELS_API_PATH = '/api/v1/ollama/models'
 const MODELS_CACHE_FILE = path.join(process.cwd(), 'storage', 'ollama-models-cache.json')
 const CACHE_MAX_AGE_MS = 24 * 60 * 60 * 1000 // 24 hours

+export type NomadInstalledModel = {
+  name: string
+  size: number
+  digest?: string
+  details?: Record<string, any>
+}
+
+export type NomadChatResponse = {
+  message: { content: string; thinking?: string }
+  done: boolean
+  model: string
+}
+
+export type NomadChatStreamChunk = {
+  message: { content: string; thinking?: string }
+  done: boolean
+}
+
+type ChatInput = {
+  model: string
+  messages: Array<{ role: 'system' | 'user' | 'assistant'; content: string }>
+  think?: boolean | 'medium'
+  stream?: boolean
+  numCtx?: number
+}
+
@inject()
 export class OllamaService {
-  private ollama: Ollama | null = null
-  private ollamaInitPromise: Promise<void> | null = null
+  private openai: OpenAI | null = null
+  private baseUrl: string | null = null
+  private initPromise: Promise<void> | null = null
+  private isOllamaNative: boolean | null = null

  constructor() {}

-  private async _initializeOllamaClient() {
-    if (!this.ollamaInitPromise) {
-      this.ollamaInitPromise = (async () => {
+  private async _initialize() {
+    if (!this.initPromise) {
+      this.initPromise = (async () => {
+        // Check KVStore for a custom base URL (remote Ollama, LM Studio, llama.cpp, etc.)
+        const customUrl = (await KVStore.getValue('ai.remoteOllamaUrl')) as string | null
+        if (customUrl && customUrl.trim()) {
+          this.baseUrl = customUrl.trim().replace(/\/$/, '')
+        } else {
+          // Fall back to the local Ollama container managed by Docker
          const dockerService = new (await import('./docker_service.js')).DockerService()
-        const qdrantUrl = await dockerService.getServiceURL(SERVICE_NAMES.OLLAMA)
-        if (!qdrantUrl) {
+          const ollamaUrl = await dockerService.getServiceURL(SERVICE_NAMES.OLLAMA)
+          if (!ollamaUrl) {
            throw new Error('Ollama service is not installed or running.')
          }
-        this.ollama = new Ollama({ host: qdrantUrl })
+          this.baseUrl = ollamaUrl.trim().replace(/\/$/, '')
+        }
+
+        this.openai = new OpenAI({
+          apiKey: 'nomad', // Required by SDK; not validated by Ollama/LM Studio/llama.cpp
+          baseURL: `${this.baseUrl}/v1`,
+        })
      })()
    }
-    return this.ollamaInitPromise
+    return this.initPromise
  }

  private async _ensureDependencies() {
-    if (!this.ollama) {
-      await this._initializeOllamaClient()
+    if (!this.openai) {
+      await this._initialize()
    }
  }

  /**
-   * Downloads a model from the Ollama service with progress tracking. Where possible,
-   * one should dispatch a background job instead of calling this method directly to avoid long blocking.
-   * @param model Model name to download
-   * @returns Success status and message
+   * Downloads a model from Ollama with progress tracking. Only works with Ollama backends.
+   * Use dispatchModelDownload() for background job processing where possible.
   */
-  async downloadModel(model: string, progressCallback?: (percent: number) => void): Promise<{ success: boolean; message: string; retryable?: boolean }> {
-    try {
+  async downloadModel(
+    model: string,
+    progressCallback?: (percent: number) => void
+  ): Promise<{ success: boolean; message: string; retryable?: boolean }> {
    await this._ensureDependencies()
-      if (!this.ollama) {
-        throw new Error('Ollama client is not initialized.')
+    if (!this.baseUrl) {
+      return { success: false, message: 'AI service is not initialized.' }
    }

+    try {
      // See if model is already installed
      const installedModels = await this.getModels()
      if (installedModels && installedModels.some((m) => m.name === model)) {
@ -65,23 +109,48 @@ export class OllamaService {
        return { success: true, message: 'Model is already installed.' }
      }

-      // Returns AbortableAsyncIterator<ProgressResponse>
-      const downloadStream = await this.ollama.pull({
-        model,
-        stream: true,
+      // Model pulling is an Ollama-only operation. Non-Ollama backends (LM Studio, llama.cpp, etc.)
+      // return HTTP 200 for unknown endpoints, so the pull would appear to succeed but do nothing.
+      if (this.isOllamaNative === false) {
+        logger.warn(
+          `[OllamaService] Non-Ollama backend detected — skipping model pull for "${model}". Load the model manually in your AI host.`
+        )
+        return {
+          success: false,
+          message: `Model "${model}" is not available in your AI host. Please load it manually (model pulling is only supported for Ollama backends).`,
+        }
+      }
+
+      // Stream pull via Ollama native API
+      const pullResponse = await axios.post(
+        `${this.baseUrl}/api/pull`,
+        { model, stream: true },
+        { responseType: 'stream', timeout: 0 }
+      )
+
+      await new Promise<void>((resolve, reject) => {
+        let buffer = ''
+        pullResponse.data.on('data', (chunk: Buffer) => {
+          buffer += chunk.toString()
+          const lines = buffer.split('\n')
+          buffer = lines.pop() || ''
+          for (const line of lines) {
+            if (!line.trim()) continue
+            try {
+              const parsed = JSON.parse(line)
+              if (parsed.completed && parsed.total) {
+                const percent = parseFloat(((parsed.completed / parsed.total) * 100).toFixed(2))
+                this.broadcastDownloadProgress(model, percent)
+                if (progressCallback) progressCallback(percent)
+              }
+            } catch {
+              // ignore parse errors on partial lines
+            }
+          }
+        })
+        pullResponse.data.on('end', resolve)
+        pullResponse.data.on('error', reject)
      })
-
-      for await (const chunk of downloadStream) {
-        if (chunk.completed && chunk.total) {
-          const percent = ((chunk.completed / chunk.total) * 100).toFixed(2)
-          const percentNum = parseFloat(percent)
-
-          this.broadcastDownloadProgress(model, percentNum)
-          if (progressCallback) {
-            progressCallback(percentNum)
-          }
-        }
-      }

      logger.info(`[OllamaService] Model "${model}" downloaded successfully.`)
      return { success: true, message: 'Model downloaded successfully.' }
@ -128,88 +197,257 @@ export class OllamaService {
    }
  }

-  public async getClient() {
+  /**
+   * Run a single, non-streaming chat completion.
+   *
+   * `think` and `numCtx` are forwarded as the extra request fields `think` and
+   * `num_ctx` only when they are truthy.
+   *
+   * @param chatRequest Model, messages and optional `think` / `numCtx` settings.
+   * @returns The first completion choice; `content` defaults to an empty string.
+   * @throws Error if the AI client is not initialized or the backend returns no choices.
+   */
+  public async chat(chatRequest: ChatInput): Promise<NomadChatResponse> {
    await this._ensureDependencies()
-    return this.ollama!
+    if (!this.openai) {
+      throw new Error('AI client is not initialized.')
    }

-  public async chat(chatRequest: ChatRequest & { stream?: boolean }) {
-    await this._ensureDependencies()
-    if (!this.ollama) {
-      throw new Error('Ollama client is not initialized.')
-    }
-    return await this.ollama.chat({
-      ...chatRequest,
+    const params: any = {
+      model: chatRequest.model,
+      messages: chatRequest.messages as ChatCompletionMessageParam[],
      stream: false,
-    })
+    }
+    if (chatRequest.think) {
+      params.think = chatRequest.think
+    }
+    if (chatRequest.numCtx) {
+      params.num_ctx = chatRequest.numCtx
    }

-  public async chatStream(chatRequest: ChatRequest) {
-    await this._ensureDependencies()
-    if (!this.ollama) {
-      throw new Error('Ollama client is not initialized.')
+    const response = await this.openai.chat.completions.create(params)
+    // A non-conforming backend may answer without any choices; fail with a clear
+    // message instead of a TypeError on `choice.message` below.
+    const choice = response.choices?.[0]
+    if (!choice) {
+      throw new Error('AI backend returned no completion choices.')
+    }
+
+    return {
+      message: {
+        content: choice.message.content ?? '',
+        thinking: (choice.message as any).thinking ?? undefined,
+      },
+      done: true,
+      model: response.model,
    }
-    return await this.ollama.chat({
-      ...chatRequest,
+  }
+
+  /**
+   * Stream a chat completion as normalized chunks.
+   *
+   * Thinking text is taken from `delta.thinking` when the backend provides it, and is
+   * also extracted from inline `<think>...</think>` tags found in `delta.content`.
+   * Tags may be split across chunks, so a trailing partial tag is held back until the
+   * next chunk arrives; anything still held when the stream finishes is flushed as
+   * plain text so no characters are lost.
+   *
+   * @param chatRequest Model, messages and optional `think` / `numCtx` settings.
+   * @returns An async iterable of chunks with `content` and `thinking` separated.
+   * @throws Error if the AI client is not initialized.
+   */
+  public async chatStream(chatRequest: ChatInput): Promise<AsyncIterable<NomadChatStreamChunk>> {
+    await this._ensureDependencies()
+    if (!this.openai) {
+      throw new Error('AI client is not initialized.')
+    }
+
+    const params: any = {
+      model: chatRequest.model,
+      messages: chatRequest.messages as ChatCompletionMessageParam[],
      stream: true,
-    })
+    }
+    if (chatRequest.think) {
+      params.think = chatRequest.think
+    }
+    if (chatRequest.numCtx) {
+      params.num_ctx = chatRequest.numCtx
+    }
+
+    const stream = (await this.openai.chat.completions.create(params)) as unknown as Stream<ChatCompletionChunk>
+
+    // Returns how many trailing chars of `text` could be the start of `tag`
+    function partialTagSuffix(tag: string, text: string): number {
+      for (let len = Math.min(tag.length - 1, text.length); len >= 1; len--) {
+        if (text.endsWith(tag.slice(0, len))) return len
+      }
+      return 0
+    }
+
+    async function* normalize(): AsyncGenerator<NomadChatStreamChunk> {
+      // Stateful parser for <think>...</think> tags that may be split across chunks.
+      // Ollama provides thinking natively via delta.thinking; OpenAI-compatible backends
+      // (LM Studio, llama.cpp, etc.) embed them inline in delta.content.
+      let tagBuffer = ''
+      let inThink = false
+
+      for await (const chunk of stream) {
+        const choice = chunk.choices[0]
+        const delta = choice?.delta
+        const nativeThinking: string = (delta as any)?.thinking ?? ''
+        const rawContent: string = delta?.content ?? ''
+        const done = choice?.finish_reason !== null && choice?.finish_reason !== undefined
+
+        // Parse <think> tags out of the content stream
+        tagBuffer += rawContent
+        let parsedContent = ''
+        let parsedThinking = ''
+
+        while (tagBuffer.length > 0) {
+          if (inThink) {
+            const closeIdx = tagBuffer.indexOf('</think>')
+            if (closeIdx !== -1) {
+              parsedThinking += tagBuffer.slice(0, closeIdx)
+              tagBuffer = tagBuffer.slice(closeIdx + 8)
+              inThink = false
+            } else {
+              // On the finishing chunk nothing more can complete a partial tag, so
+              // emit the whole buffer instead of holding the suffix back forever.
+              const hold = done ? 0 : partialTagSuffix('</think>', tagBuffer)
+              parsedThinking += tagBuffer.slice(0, tagBuffer.length - hold)
+              tagBuffer = tagBuffer.slice(tagBuffer.length - hold)
+              break
+            }
+          } else {
+            const openIdx = tagBuffer.indexOf('<think>')
+            if (openIdx !== -1) {
+              parsedContent += tagBuffer.slice(0, openIdx)
+              tagBuffer = tagBuffer.slice(openIdx + 7)
+              inThink = true
+            } else {
+              const hold = done ? 0 : partialTagSuffix('<think>', tagBuffer)
+              parsedContent += tagBuffer.slice(0, tagBuffer.length - hold)
+              tagBuffer = tagBuffer.slice(tagBuffer.length - hold)
+              break
+            }
+          }
+        }
+
+        yield {
+          message: {
+            content: parsedContent,
+            thinking: nativeThinking + parsedThinking,
+          },
+          done,
+        }
+      }
+
+      // The stream ended while text was still held back as a possible partial tag
+      // (no finish_reason was seen after it). Emit it so the tail of the reply is kept.
+      if (tagBuffer.length > 0) {
+        yield {
+          message: {
+            content: inThink ? '' : tagBuffer,
+            thinking: inThink ? tagBuffer : '',
+          },
+          done: true,
+        }
+      }
+    }
+
+    return normalize()
  }

  public async checkModelHasThinking(modelName: string): Promise<boolean> {
    await this._ensureDependencies()
-    if (!this.ollama) {
-      throw new Error('Ollama client is not initialized.')
+    if (!this.baseUrl) return false
+
+    try {
+      const response = await axios.post(
+        `${this.baseUrl}/api/show`,
+        { model: modelName },
+        { timeout: 5000 }
+      )
+      return Array.isArray(response.data?.capabilities) && response.data.capabilities.includes('thinking')
+    } catch {
+      // Non-Ollama backends don't expose /api/show — assume no thinking support
+      return false
+    }
  }

-    const modelInfo = await this.ollama.show({
-      model: modelName,
-    })
-
-    return modelInfo.capabilities.includes('thinking')
-  }
-
-  public async deleteModel(modelName: string) {
+  public async deleteModel(modelName: string): Promise<{ success: boolean; message: string }> {
    await this._ensureDependencies()
-    if (!this.ollama) {
-      throw new Error('Ollama client is not initialized.')
+    if (!this.baseUrl) {
+      return { success: false, message: 'AI service is not initialized.' }
    }

-    return await this.ollama.delete({
-      model: modelName,
+    try {
+      await axios.delete(`${this.baseUrl}/api/delete`, {
+        data: { model: modelName },
+        timeout: 10000,
      })
+      return { success: true, message: `Model "${modelName}" deleted.` }
+    } catch (error) {
+      logger.error(
+        `[OllamaService] Failed to delete model "${modelName}": ${error instanceof Error ? error.message : error}`
+      )
+      return { success: false, message: 'Failed to delete model. This may not be an Ollama backend.' }
+    }
  }

-  public async getModels(includeEmbeddings = false) {
+  /**
+   * Generate embeddings for the given input strings.
+   * Tries the Ollama native /api/embed endpoint first, falls back to /v1/embeddings.
+   *
+   * @param model Name of the embedding model to use.
+   * @param input Strings to embed; sent to the backend as one batch.
+   * @returns The embedding vectors returned by the backend.
+   * @throws Error if the service is not initialized, or if the /v1/embeddings fallback fails.
+   */
+  public async embed(model: string, input: string[]): Promise<{ embeddings: number[][] }> {
    await this._ensureDependencies()
-    if (!this.ollama) {
-      throw new Error('Ollama client is not initialized.')
+    if (!this.baseUrl || !this.openai) {
+      throw new Error('AI service is not initialized.')
+    }
+
+    try {
+      // Prefer Ollama native endpoint (supports batch input natively)
+      const response = await axios.post(
+        `${this.baseUrl}/api/embed`,
+        { model, input },
+        { timeout: 60000 }
+      )
+      // Some backends (e.g. LM Studio) return HTTP 200 for unknown endpoints with an incompatible
+      // body — validate explicitly before accepting the result.
+      if (!Array.isArray(response.data?.embeddings)) {
+        throw new Error('Invalid /api/embed response — missing embeddings array')
+      }
+      return { embeddings: response.data.embeddings }
+    } catch (error) {
+      // Fall back to OpenAI-compatible /v1/embeddings
+      // Explicitly request float format — some backends (e.g. LM Studio) don't reliably
+      // implement the base64 encoding the OpenAI SDK requests by default.
+      // The failure reason is logged so a timeout or server error on /api/embed is not
+      // mistaken for the endpoint simply being absent.
+      logger.info(
+        `[OllamaService] /api/embed unavailable (${error instanceof Error ? error.message : error}), falling back to /v1/embeddings`
+      )
+      const results = await this.openai.embeddings.create({ model, input, encoding_format: 'float' })
+      return { embeddings: results.data.map((e) => e.embedding as number[]) }
+    }
+  }
+
+  public async getModels(includeEmbeddings = false): Promise<NomadInstalledModel[]> {
+    await this._ensureDependencies()
+    if (!this.baseUrl) {
+      throw new Error('AI service is not initialized.')
+    }
+
+    try {
+      // Prefer the Ollama native endpoint which includes size and metadata
+      const response = await axios.get(`${this.baseUrl}/api/tags`, { timeout: 5000 })
+      // LM Studio returns HTTP 200 for unknown endpoints with an incompatible body — validate explicitly
+      if (!Array.isArray(response.data?.models)) {
+        throw new Error('Not an Ollama-compatible /api/tags response')
+      }
+      this.isOllamaNative = true
+      const models: NomadInstalledModel[] = response.data.models
+      if (includeEmbeddings) return models
+      return models.filter((m) => !m.name.includes('embed'))
+    } catch {
+      // Fall back to the OpenAI-compatible /v1/models endpoint (LM Studio, llama.cpp, etc.)
+      this.isOllamaNative = false
+      logger.info('[OllamaService] /api/tags unavailable, falling back to /v1/models')
+      try {
+        const modelList = await this.openai!.models.list()
+        const models: NomadInstalledModel[] = modelList.data.map((m) => ({ name: m.id, size: 0 }))
+        if (includeEmbeddings) return models
+        return models.filter((m) => !m.name.includes('embed'))
+      } catch (err) {
+        logger.error(
+          `[OllamaService] Failed to list models: ${err instanceof Error ? err.message : err}`
+        )
+        return []
      }
-    const response = await this.ollama.list()
-    if (includeEmbeddings) {
-      return response.models
    }
-    // Filter out embedding models
-    return response.models.filter((model) => !model.name.includes('embed'))
  }

  async getAvailableModels(
-    { sort, recommendedOnly, query, limit, force }: { sort?: 'pulls' | 'name'; recommendedOnly?: boolean, query: string | null, limit?: number, force?: boolean } = {
+    {
+      sort,
+      recommendedOnly,
+      query,
+      limit,
+      force,
+    }: {
+      sort?: 'pulls' | 'name'
+      recommendedOnly?: boolean
+      query: string | null
+      limit?: number
+      force?: boolean
+    } = {
      sort: 'pulls',
      recommendedOnly: false,
      query: null,
      limit: 15,
    }
-  ): Promise<{ models: NomadOllamaModel[], hasMore: boolean } | null> {
+  ): Promise<{ models: NomadOllamaModel[]; hasMore: boolean } | null> {
    try {
      const models = await this.retrieveAndRefreshModels(sort, force)
      if (!models) {
-        // If we fail to get models from the API, return the fallback recommended models
        logger.warn(
          '[OllamaService] Returning fallback recommended models due to failure in fetching available models'
        )
        return {
          models: FALLBACK_RECOMMENDED_OLLAMA_MODELS,
-          hasMore: false
+          hasMore: false,
        }
      }

@ -217,15 +455,13 @@ export class OllamaService {
        const filteredModels = query ? this.fuseSearchModels(models, query) : models
        return {
          models: filteredModels.slice(0, limit || 15),
-          hasMore: filteredModels.length > (limit || 15)
+          hasMore: filteredModels.length > (limit || 15),
        }
      }

-      // If recommendedOnly is true, only return the first three models (if sorted by pulls, these will be the top 3)
      const sortedByPulls = sort === 'pulls' ? models : this.sortModels(models, 'pulls')
      const firstThree = sortedByPulls.slice(0, 3)

-      // Only return the first tag of each of these models (should be the most lightweight variant)
      const recommendedModels = firstThree.map((model) => {
        return {
          ...model,
@ -237,13 +473,13 @@ export class OllamaService {
        const filteredRecommendedModels = this.fuseSearchModels(recommendedModels, query)
        return {
          models: filteredRecommendedModels,
-          hasMore: filteredRecommendedModels.length > (limit || 15)
+          hasMore: filteredRecommendedModels.length > (limit || 15),
        }
      }

      return {
        models: recommendedModels,
-        hasMore: recommendedModels.length > (limit || 15)
+        hasMore: recommendedModels.length > (limit || 15),
      }
    } catch (error) {
      logger.error(
@ -283,7 +519,6 @@ export class OllamaService {

      const rawModels = response.data.models as NomadOllamaModel[]

-      // Filter out tags where cloud is truthy, then remove models with no remaining tags
      const noCloud = rawModels
        .map((model) => ({
          ...model,
@ -295,8 +530,7 @@ export class OllamaService {
      return this.sortModels(noCloud, sort)
    } catch (error) {
      logger.error(
-        `[OllamaService] Failed to retrieve models from Nomad API: ${error instanceof Error ? error.message : error
-        }`
+        `[OllamaService] Failed to retrieve models from Nomad API: ${error instanceof Error ? error.message : error}`
      )
      return null
    }
@ -322,7 +556,6 @@ export class OllamaService {

      return models
    } catch (error) {
-      // Cache doesn't exist or is invalid
      if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
        logger.warn(
          `[OllamaService] Error reading cache: ${error instanceof Error ? error.message : error}`
@ -346,7 +579,6 @@ export class OllamaService {

  private sortModels(models: NomadOllamaModel[], sort?: 'pulls' | 'name'): NomadOllamaModel[] {
    if (sort === 'pulls') {
-      // Sort by estimated pulls (it should be a string like "1.2K", "500", "4M" etc.)
      models.sort((a, b) => {
        const parsePulls = (pulls: string) => {
          const multiplier = pulls.endsWith('K')
@ -364,8 +596,6 @@ export class OllamaService {
      models.sort((a, b) => a.name.localeCompare(b.name))
    }

-    // Always sort model.tags by the size field in descending order
-    // Size is a string like '75GB', '8.5GB', '2GB' etc. Smaller models first
    models.forEach((model) => {
      if (model.tags && Array.isArray(model.tags)) {
        model.tags.sort((a, b) => {
@ -378,7 +608,7 @@ export class OllamaService {
                  ? 1
                  : size.endsWith('TB')
                    ? 1_000
-                    : 0 // Unknown size format
+                    : 0
            return parseFloat(size) * multiplier
          }
          return parseSize(a.size) - parseSize(b.size)
@ -411,11 +641,11 @@ export class OllamaService {
    const options: IFuseOptions<NomadOllamaModel> = {
      ignoreDiacritics: true,
      keys: ['name', 'description', 'tags.name'],
-      threshold: 0.3, // lower threshold for stricter matching
+      threshold: 0.3,
    }

    const fuse = new Fuse(models, options)

-    return fuse.search(query).map(result => result.item)
+    return fuse.search(query).map((result) => result.item)
  }
 }
--- a/admin/app/services/rag_service.ts
+++ b/admin/app/services/rag_service.ts
@ -23,15 +23,18 @@ export class RagService {
  private qdrant: QdrantClient | null = null
  private qdrantInitPromise: Promise<void> | null = null
  private embeddingModelVerified = false
+  private resolvedEmbeddingModel: string | null = null
  public static UPLOADS_STORAGE_PATH = 'storage/kb_uploads'
  public static CONTENT_COLLECTION_NAME = 'nomad_knowledge_base'
  public static EMBEDDING_MODEL = 'nomic-embed-text:v1.5'
  public static EMBEDDING_DIMENSION = 768 // Nomic Embed Text v1.5 dimension is 768
  public static MODEL_CONTEXT_LENGTH = 2048 // nomic-embed-text has 2K token context
-  public static MAX_SAFE_TOKENS = 1800 // Leave buffer for prefix and tokenization variance
-  public static TARGET_TOKENS_PER_CHUNK = 1700 // Target 1700 tokens per chunk for embedding
+  public static MAX_SAFE_TOKENS = 1600 // Leave buffer for prefix and tokenization variance
+  public static TARGET_TOKENS_PER_CHUNK = 1500 // Target 1500 tokens per chunk for embedding
  public static PREFIX_TOKEN_BUDGET = 10 // Reserve ~10 tokens for prefixes
-  public static CHAR_TO_TOKEN_RATIO = 3 // Approximate chars per token
+  public static CHAR_TO_TOKEN_RATIO = 2 // Conservative chars-per-token estimate; technical docs
+                                         // (numbers, symbols, abbreviations) tokenize denser
+                                         // than plain prose (~3), so 2 avoids context overflows
  // Nomic Embed Text v1.5 uses task-specific prefixes for optimal performance
  public static SEARCH_DOCUMENT_PREFIX = 'search_document: '
  public static SEARCH_QUERY_PREFIX = 'search_query: '
@ -245,7 +248,9 @@ export class RagService {

      if (!this.embeddingModelVerified) {
        const allModels = await this.ollamaService.getModels(true)
-        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const embeddingModel =
+          allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) ??
+          allModels.find((model) => model.name.toLowerCase().includes('nomic-embed-text'))

        if (!embeddingModel) {
          try {
@ -262,6 +267,7 @@ export class RagService {
            return null
          }
        }
+        this.resolvedEmbeddingModel = embeddingModel?.name ?? RagService.EMBEDDING_MODEL
        this.embeddingModelVerified = true
      }

@ -285,8 +291,6 @@ export class RagService {
      // Extract text from chunk results
      const chunks = chunkResults.map((chunk) => chunk.text)

-      const ollamaClient = await this.ollamaService.getClient()
-
      // Prepare all chunk texts with prefix and truncation
      const prefixedChunks: string[] = []
      for (let i = 0; i < chunks.length; i++) {
@ -320,10 +324,7 @@ export class RagService {

        logger.debug(`[RAG] Embedding batch ${batchIdx + 1}/${totalBatches} (${batch.length} chunks)`)

-        const response = await ollamaClient.embed({
-          model: RagService.EMBEDDING_MODEL,
-          input: batch,
-        })
+        const response = await this.ollamaService.embed(this.resolvedEmbeddingModel ?? RagService.EMBEDDING_MODEL, batch)

        embeddings.push(...response.embeddings)

@ -692,7 +693,9 @@ export class RagService {

      if (!this.embeddingModelVerified) {
        const allModels = await this.ollamaService.getModels(true)
-        const embeddingModel = allModels.find((model) => model.name === RagService.EMBEDDING_MODEL)
+        const embeddingModel =
+          allModels.find((model) => model.name === RagService.EMBEDDING_MODEL) ??
+          allModels.find((model) => model.name.toLowerCase().includes('nomic-embed-text'))

        if (!embeddingModel) {
          logger.warn(
@ -701,6 +704,7 @@ export class RagService {
          this.embeddingModelVerified = false
          return []
        }
+        this.resolvedEmbeddingModel = embeddingModel.name
        this.embeddingModelVerified = true
      }

@ -710,8 +714,6 @@ export class RagService {
      logger.debug(`[RAG] Extracted keywords: [${keywords.join(', ')}]`)

      // Generate embedding for the query with search_query prefix
-      const ollamaClient = await this.ollamaService.getClient()
-
      // Ensure query doesn't exceed token limit
      const prefixTokens = this.estimateTokenCount(RagService.SEARCH_QUERY_PREFIX)
      const maxQueryTokens = RagService.MAX_SAFE_TOKENS - prefixTokens
@ -729,10 +731,7 @@ export class RagService {
        return []
      }

-      const response = await ollamaClient.embed({
-        model: RagService.EMBEDDING_MODEL,
-        input: [prefixedQuery],
-      })
+      const response = await this.ollamaService.embed(this.resolvedEmbeddingModel ?? RagService.EMBEDDING_MODEL, [prefixedQuery])

      // Perform semantic search with a higher limit to enable reranking
      const searchLimit = limit * 3 // Get more results for reranking
--- a/admin/app/services/system_service.ts
+++ b/admin/app/services/system_service.ts
@ -4,10 +4,15 @@ import { DockerService } from '#services/docker_service'
 import { ServiceSlim } from '../../types/services.js'
 import logger from '@adonisjs/core/services/logger'
 import si from 'systeminformation'
-import { GpuHealthStatus, NomadDiskInfo, NomadDiskInfoRaw, SystemInformationResponse } from '../../types/system.js'
+import {
+  GpuHealthStatus,
+  NomadDiskInfo,
+  NomadDiskInfoRaw,
+  SystemInformationResponse,
+} from '../../types/system.js'
 import { SERVICE_NAMES } from '../../constants/service_names.js'
-import { readFileSync } from 'fs'
-import path, { join } from 'path'
+import { readFileSync } from 'node:fs'
+import path, { join } from 'node:path'
 import { getAllFilesystems, getFile } from '../utils/fs.js'
 import axios from 'axios'
 import env from '#start/env'
@ -15,7 +20,6 @@ import KVStore from '#models/kv_store'
 import { KV_STORE_SCHEMA, KVStoreKey } from '../../types/kv_store.js'
 import { isNewerVersion } from '../utils/version.js'

-
@inject()
 export class SystemService {
  private static appVersion: string | null = null
@ -24,8 +28,8 @@ export class SystemService {
  constructor(private dockerService: DockerService) {}

  async checkServiceInstalled(serviceName: string): Promise<boolean> {
-    const services = await this.getServices({ installedOnly: true });
-    return services.some(service => service.service_name === serviceName);
+    const services = await this.getServices({ installedOnly: true })
+    return services.some((service) => service.service_name === serviceName)
  }

  async getInternetStatus(): Promise<boolean> {
@ -67,14 +71,20 @@ export class SystemService {
    return false
  }

-  async getNvidiaSmiInfo(): Promise<Array<{ vendor: string; model: string; vram: number; }> | { error: string } | 'OLLAMA_NOT_FOUND' | 'BAD_RESPONSE' | 'UNKNOWN_ERROR'> {
+  async getNvidiaSmiInfo(): Promise<
+    | Array<{ vendor: string; model: string; vram: number }>
+    | { error: string }
+    | 'OLLAMA_NOT_FOUND'
+    | 'BAD_RESPONSE'
+    | 'UNKNOWN_ERROR'
+  > {
    try {
      const containers = await this.dockerService.docker.listContainers({ all: false })
-      const ollamaContainer = containers.find((c) =>
-        c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`)
-      )
+      const ollamaContainer = containers.find((c) => c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`))
      if (!ollamaContainer) {
-        logger.info('Ollama container not found for nvidia-smi info retrieval. This is expected if Ollama is not installed.')
+        logger.info(
+          'Ollama container not found for nvidia-smi info retrieval. This is expected if Ollama is not installed.'
+        )
        return 'OLLAMA_NOT_FOUND'
      }

@ -92,23 +102,35 @@ export class SystemService {
      const output = await new Promise<string>((resolve) => {
        let data = ''
        const timeout = setTimeout(() => resolve(data), 5000)
-        stream.on('data', (chunk: Buffer) => { data += chunk.toString() })
-        stream.on('end', () => { clearTimeout(timeout); resolve(data) })
+        stream.on('data', (chunk: Buffer) => {
+          data += chunk.toString()
+        })
+        stream.on('end', () => {
+          clearTimeout(timeout)
+          resolve(data)
+        })
      })

      // Remove any non-printable characters and trim the output
-      const cleaned = output.replace(/[\x00-\x08]/g, '').trim()
-      if (cleaned && !cleaned.toLowerCase().includes('error') && !cleaned.toLowerCase().includes('not found')) {
+      const cleaned = Array.from(output)
+        .filter((character) => character.charCodeAt(0) > 8)
+        .join('')
+        .trim()
+      if (
+        cleaned &&
+        !cleaned.toLowerCase().includes('error') &&
+        !cleaned.toLowerCase().includes('not found')
+      ) {
        // Split by newlines to handle multiple GPUs installed
-        const lines = cleaned.split('\n').filter(line => line.trim())
+        const lines = cleaned.split('\n').filter((line) => line.trim())

        // Map each line out to a useful structure for us
-        const gpus = lines.map(line => {
+        const gpus = lines.map((line) => {
          const parts = line.split(',').map((s) => s.trim())
          return {
            vendor: 'NVIDIA',
            model: parts[0] || 'NVIDIA GPU',
-            vram: parts[1] ? parseInt(parts[1], 10) : 0,
+            vram: parts[1] ? Number.parseInt(parts[1], 10) : 0,
          }
        })

@ -117,8 +139,7 @@ export class SystemService {

      // If we got output but looks like an error, consider it a bad response from nvidia-smi
      return 'BAD_RESPONSE'
-    }
-    catch (error) {
+    } catch (error) {
      logger.error('Error getting nvidia-smi info:', error)
      if (error instanceof Error && error.message) {
        return { error: error.message }
@ -127,6 +148,63 @@ export class SystemService {
    }
  }

+  /**
+   * Probe an Ollama endpoint over HTTP and report it as a GPU entry.
+   *
+   * The configured `ai.remoteOllamaUrl` is used when set. Otherwise the local Ollama
+   * container's service URL is used, but only when that container exists and is not
+   * running an `ollama/ollama` / `ollama:` image.
+   *
+   * The endpoint must answer `/api/tags`. VRAM is then read best-effort from `/api/ps`
+   * as the largest `size_vram` among the listed models, converted from bytes to MiB.
+   *
+   * @returns A single-entry array describing the external GPU (`vram` is 0 when no
+   *   allocation could be read), or `null` when no endpoint applies or the probe fails.
+   */
+  async getExternalOllamaGpuInfo(): Promise<Array<{
+    vendor: string
+    model: string
+    vram: number
+  }> | null> {
+    try {
+      // If a remote Ollama URL is configured, use it directly without requiring a local container
+      const remoteOllamaUrl = await KVStore.getValue('ai.remoteOllamaUrl')
+      if (!remoteOllamaUrl) {
+        const containers = await this.dockerService.docker.listContainers({ all: false })
+        const ollamaContainer = containers.find((c) => c.Names.includes(`/${SERVICE_NAMES.OLLAMA}`))
+        if (!ollamaContainer) {
+          return null
+        }
+
+        const actualImage = (ollamaContainer.Image || '').toLowerCase()
+        if (actualImage.includes('ollama/ollama') || actualImage.startsWith('ollama:')) {
+          return null
+        }
+      }
+
+      const ollamaUrl = remoteOllamaUrl || (await this.dockerService.getServiceURL(SERVICE_NAMES.OLLAMA))
+      if (!ollamaUrl) {
+        return null
+      }
+
+      // Reachability check: a failure here throws and is reported by the outer catch
+      await axios.get(new URL('/api/tags', ollamaUrl).toString(), { timeout: 3000 })
+
+      let vramMb = 0
+      try {
+        const psResponse = await axios.get(new URL('/api/ps', ollamaUrl).toString(), {
+          timeout: 3000,
+        })
+        const loadedModels = Array.isArray(psResponse.data?.models) ? psResponse.data.models : []
+        const largestAllocation = loadedModels.reduce(
+          (max: number, model: { size_vram?: number | string }) =>
+            Math.max(max, Number(model.size_vram) || 0),
+          0
+        )
+        vramMb = largestAllocation > 0 ? Math.round(largestAllocation / (1024 * 1024)) : 0
+      } catch (error) {
+        // VRAM is optional detail, so the probe stays best-effort — but leave a trace
+        // instead of swallowing the failure silently.
+        logger.debug(
+          `[SystemService] External Ollama /api/ps probe failed: ${error instanceof Error ? error.message : error}`
+        )
+      }
+
+      // NOTE(review): vendor/model are fixed labels; this probe does not verify which
+      // GPU (if any) the external host actually has.
+      return [
+        {
+          vendor: 'NVIDIA',
+          model: 'NVIDIA GPU (external Ollama)',
+          vram: vramMb,
+        },
+      ]
+    } catch (error) {
+      logger.info(
+        `[SystemService] External Ollama GPU probe failed: ${error instanceof Error ? error.message : error}`
+      )
+      return null
+    }
+  }
+
  async getServices({ installedOnly = true }: { installedOnly?: boolean }): Promise<ServiceSlim[]> {
    await this._syncContainersWithDatabase() // Sync up before fetching to ensure we have the latest status

@ -273,17 +351,46 @@ export class SystemService {
              graphics.controllers = nvidiaInfo.map((gpu) => ({
                model: gpu.model,
                vendor: gpu.vendor,
-                bus: "",
+                bus: '',
                vram: gpu.vram,
                vramDynamic: false, // assume false here, we don't actually use this field for our purposes.
              }))
              gpuHealth.status = 'ok'
              gpuHealth.ollamaGpuAccessible = true
            } else if (nvidiaInfo === 'OLLAMA_NOT_FOUND') {
+              // No local Ollama container — check if a remote Ollama URL is configured
+              const externalOllamaGpu = await this.getExternalOllamaGpuInfo()
+              if (externalOllamaGpu) {
+                graphics.controllers = externalOllamaGpu.map((gpu) => ({
+                  model: gpu.model,
+                  vendor: gpu.vendor,
+                  bus: '',
+                  vram: gpu.vram,
+                  vramDynamic: false,
+                }))
+                gpuHealth.status = 'ok'
+                gpuHealth.ollamaGpuAccessible = true
+              } else {
                gpuHealth.status = 'ollama_not_installed'
+              }
+            } else {
+              const externalOllamaGpu = await this.getExternalOllamaGpuInfo()
+              if (externalOllamaGpu) {
+                graphics.controllers = externalOllamaGpu.map((gpu) => ({
+                  model: gpu.model,
+                  vendor: gpu.vendor,
+                  bus: '',
+                  vram: gpu.vram,
+                  vramDynamic: false,
+                }))
+                gpuHealth.status = 'ok'
+                gpuHealth.ollamaGpuAccessible = true
              } else {
                gpuHealth.status = 'passthrough_failed'
-              logger.warn(`NVIDIA runtime detected but GPU passthrough failed: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
+                logger.warn(
+                  `NVIDIA runtime detected but GPU passthrough failed: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`
+                )
+              }
            }
          }
        } else {
@ -356,7 +463,8 @@ export class SystemService {

      logger.info(`Current version: ${currentVersion}, Latest version: ${latestVersion}`)

-      const updateAvailable = process.env.NODE_ENV === 'development'
+      const updateAvailable =
+        process.env.NODE_ENV === 'development'
          ? false
          : isNewerVersion(latestVersion, currentVersion.trim(), earlyAccess)

@ -518,11 +626,14 @@ export class SystemService {
    const k = 1024
    const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']
    const i = Math.floor(Math.log(bytes) / Math.log(k))
-    return parseFloat((bytes / Math.pow(k, i)).toFixed(decimals)) + ' ' + sizes[i]
+    return Number.parseFloat((bytes / Math.pow(k, i)).toFixed(decimals)) + ' ' + sizes[i]
  }

  async updateSetting(key: KVStoreKey, value: any): Promise<void> {
-    if ((value === '' || value === undefined || value === null) && KV_STORE_SCHEMA[key] === 'string') {
+    if (
+      (value === '' || value === undefined || value === null) &&
+      KV_STORE_SCHEMA[key] === 'string'
+    ) {
      await KVStore.clearValue(key)
    } else {
      await KVStore.setValue(key, value)
@ -548,6 +659,11 @@ export class SystemService {
        if (service.installed) {
          // If marked as installed but container doesn't exist, mark as not installed
          if (!containerExists) {
+            // Exception: remote Ollama is configured without a local container — don't reset it
+            if (service.service_name === SERVICE_NAMES.OLLAMA) {
+              const remoteUrl = await KVStore.getValue('ai.remoteOllamaUrl')
+              if (remoteUrl) continue
+            }
            logger.warn(
              `Service ${service.service_name} is marked as installed but container does not exist. Marking as not installed.`
            )
@ -620,5 +736,4 @@ export class SystemService {
        }
      })
  }
-
 }
--- a/admin/constants/kv_store.ts
+++ b/admin/constants/kv_store.ts
@ -1,3 +1,3 @@
 import { KVStoreKey } from "../types/kv_store.js";

-export const SETTINGS_KEYS: KVStoreKey[] = ['chat.suggestionsEnabled', 'chat.lastModel', 'ui.hasVisitedEasySetup', 'ui.theme', 'system.earlyAccess', 'ai.assistantCustomName'];
+export const SETTINGS_KEYS: KVStoreKey[] = ['chat.suggestionsEnabled', 'chat.lastModel', 'ui.hasVisitedEasySetup', 'ui.theme', 'system.earlyAccess', 'ai.assistantCustomName', 'ai.remoteOllamaUrl'];
--- a/admin/inertia/components/InstallActivityFeed.tsx
+++ b/admin/inertia/components/InstallActivityFeed.tsx
@ -12,16 +12,30 @@ export type InstallActivityFeedProps = {
      | 'created'
      | 'preinstall'
      | 'preinstall-complete'
+      | 'preinstall-error'
      | 'starting'
      | 'started'
      | 'finalizing'
      | 'completed'
+      | 'checking-dependencies'
+      | 'dependency-installed'
+      | 'image-exists'
+      | 'gpu-config'
+      | 'stopping'
+      | 'removing'
+      | 'recreating'
+      | 'cleanup-warning'
+      | 'no-volumes'
+      | 'volume-removed'
+      | 'volume-cleanup-warning'
+      | 'error'
      | 'update-pulling'
      | 'update-stopping'
      | 'update-creating'
      | 'update-starting'
      | 'update-complete'
      | 'update-rollback'
+      | (string & {})
    timestamp: string
    message: string
  }>
@ -48,7 +62,7 @@ const InstallActivityFeed: React.FC<InstallActivityFeedProps> = ({ activity, cla
              <div className="relative flex size-6 flex-none items-center justify-center bg-transparent">
                {activityItem.type === 'completed' || activityItem.type === 'update-complete' ? (
                  <IconCircleCheck aria-hidden="true" className="size-6 text-indigo-600" />
-                ) : activityItem.type === 'update-rollback' ? (
+                ) : activityItem.type === 'error' || activityItem.type === 'update-rollback' || activityItem.type === 'preinstall-error' ? (
                  <IconCircleX aria-hidden="true" className="size-6 text-red-500" />
                ) : (
                  <div className="size-1.5 rounded-full bg-surface-secondary ring-1 ring-border-default" />
@ -56,7 +70,7 @@ const InstallActivityFeed: React.FC<InstallActivityFeedProps> = ({ activity, cla
              </div>
              <p className="flex-auto py-0.5 text-xs/5 text-text-muted">
                <span className="font-semibold text-text-primary">{activityItem.service_name}</span> -{' '}
-                {activityItem.type.charAt(0).toUpperCase() + activityItem.type.slice(1)}
+                {activityItem.message || activityItem.type.charAt(0).toUpperCase() + activityItem.type.slice(1)}
              </p>
              <time
                dateTime={activityItem.timestamp}
--- a/admin/inertia/components/chat/ChatInterface.tsx
+++ b/admin/inertia/components/chat/ChatInterface.tsx
@ -213,7 +213,7 @@ export default function ChatInterface({
          <p className="text-text-primary">
            This will dispatch a background download job for{' '}
            <span className="font-mono font-medium">{DEFAULT_QUERY_REWRITE_MODEL}</span> and may take some time to complete. The model
-            will be used to rewrite queries for improved RAG retrieval performance.
+            will be used to rewrite queries for improved RAG retrieval performance. Note that download is only supported when using Ollama. If using an OpenAI API interface, please download the model with that software.
          </p>
        </StyledModal>
      </div>
--- a/admin/inertia/components/chat/KnowledgeBaseModal.tsx
+++ b/admin/inertia/components/chat/KnowledgeBaseModal.tsx
@ -46,6 +46,7 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
      if (fileUploaderRef.current) {
        fileUploaderRef.current.clear()
      }
+      queryClient.invalidateQueries({ queryKey: ['embed-jobs'] })
    },
    onError: (error: any) => {
      addNotification({
@ -68,6 +69,17 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
    },
  })

+  // Mutation behind the "Clean Up Failed" button: asks the backend to remove failed embed jobs
+  // (api.cleanupFailedEmbedJobs) and reports the outcome through a notification.
+  const cleanupFailedMutation = useMutation({
+    mutationFn: () => api.cleanupFailedEmbedJobs(),
+    onSuccess: (data) => {
+      addNotification({ type: 'success', message: data?.message || 'Failed jobs cleaned up.' })
+      queryClient.invalidateQueries({ queryKey: ['failedEmbedJobs'] })
+      // Also refresh the embed-job list (same key the upload handler invalidates) so entries
+      // removed by the cleanup do not linger in the Processing Queue until the next refetch.
+      // NOTE(review): presumably ActiveEmbedJobs reads from ['embed-jobs'] — confirm.
+      queryClient.invalidateQueries({ queryKey: ['embed-jobs'] })
+    },
+    onError: (error: any) => {
+      addNotification({ type: 'error', message: error?.message || 'Failed to clean up jobs.' })
+    },
+  })
+
  const syncMutation = useMutation({
    mutationFn: () => api.syncRAGStorage(),
    onSuccess: (data) => {
@ -207,7 +219,20 @@ export default function KnowledgeBaseModal({ aiAssistantName = "AI Assistant", o
            </div>
          </div>
          <div className="my-8">
-            <ActiveEmbedJobs withHeader={true} />
+            <div className="flex items-center justify-between mb-4">
+              <StyledSectionHeader title="Processing Queue" className="!mb-0" />
+              <StyledButton
+                variant="danger"
+                size="md"
+                icon="IconTrash"
+                onClick={() => cleanupFailedMutation.mutate()}
+                loading={cleanupFailedMutation.isPending}
+                disabled={cleanupFailedMutation.isPending}
+              >
+                Clean Up Failed
+              </StyledButton>
+            </div>
+            <ActiveEmbedJobs withHeader={false} />
          </div>

          <div className="my-12">
--- a/admin/inertia/components/chat/index.tsx
+++ b/admin/inertia/components/chat/index.tsx
@ -53,6 +53,14 @@ export default function Chat({
  const activeSession = sessions.find((s) => s.id === activeSessionId)

  const { data: lastModelSetting } = useSystemSetting({ key: 'chat.lastModel', enabled })
+  const { data: remoteOllamaUrlSetting } = useSystemSetting({ key: 'ai.remoteOllamaUrl', enabled })
+
+  const { data: remoteStatus } = useQuery({
+    queryKey: ['remoteOllamaStatus'],
+    queryFn: () => api.getRemoteOllamaStatus(),
+    enabled: enabled && !!remoteOllamaUrlSetting?.value,
+    refetchInterval: 15000,
+  })

  const { data: installedModels = [], isLoading: isLoadingModels } = useQuery({
    queryKey: ['installedModels'],
@ -363,6 +371,18 @@ export default function Chat({
            {activeSession?.title || 'New Chat'}
          </h2>
          <div className="flex items-center gap-4">
+            {remoteOllamaUrlSetting?.value && (
+              <span
+                className={classNames(
+                  'text-xs rounded px-2 py-1 font-medium',
+                  remoteStatus?.connected === false
+                    ? 'text-red-700 bg-red-50 border border-red-200'
+                    : 'text-green-700 bg-green-50 border border-green-200'
+                )}
+              >
+                {remoteStatus?.connected === false ? 'Remote Disconnected' : 'Remote Connected'}
+              </span>
+            )}
            <div className="flex items-center gap-2">
              <label htmlFor="model-select" className="text-sm text-text-secondary">
                Model:
@ -380,7 +400,7 @@ export default function Chat({
                >
                  {installedModels.map((model) => (
                    <option key={model.name} value={model.name}>
-                      {model.name} ({formatBytes(model.size)})
+                      {model.name}{model.size > 0 ? ` (${formatBytes(model.size)})` : ''}
                    </option>
                  ))}
                </select>
--- a/admin/inertia/hooks/useEmbedJobs.ts
+++ b/admin/inertia/hooks/useEmbedJobs.ts
@ -1,8 +1,10 @@
+import { useEffect, useRef } from 'react'
 import { useQuery, useQueryClient } from '@tanstack/react-query'
 import api from '~/lib/api'

 const useEmbedJobs = (props: { enabled?: boolean } = {}) => {
  const queryClient = useQueryClient()
+  const prevCountRef = useRef<number>(0)

  const queryData = useQuery({
    queryKey: ['embed-jobs'],
@ -15,6 +17,15 @@ const useEmbedJobs = (props: { enabled?: boolean } = {}) => {
    enabled: props.enabled ?? true,
  })

+  // Detect the transition "some jobs -> no jobs" and refresh the stored-files query at that
+  // moment, so newly embedded files show up without the modal having to be reopened.
+  useEffect(() => {
+    const remaining = queryData.data?.length ?? 0
+    const justDrained = prevCountRef.current > 0 && remaining === 0
+    if (justDrained) {
+      queryClient.invalidateQueries({ queryKey: ['storedFiles'] })
+    }
+    // Remember this render's count for the next comparison.
+    prevCountRef.current = remaining
+  }, [queryData.data, queryClient])
+
  const invalidate = () => {
    queryClient.invalidateQueries({ queryKey: ['embed-jobs'] })
  }
--- a/admin/inertia/lib/api.ts
+++ b/admin/inertia/lib/api.ts
@ -7,8 +7,7 @@ import { DownloadJobWithProgress, WikipediaState } from '../../types/downloads'
 import { EmbedJobWithProgress } from '../../types/rag'
 import type { CategoryWithStatus, CollectionWithStatus, ContentUpdateCheckResult, ResourceUpdateInfo } from '../../types/collections'
 import { catchInternal } from './util'
-import { NomadOllamaModel, OllamaChatRequest } from '../../types/ollama'
-import { ChatResponse, ModelResponse } from 'ollama'
+import { NomadChatResponse, NomadInstalledModel, NomadOllamaModel, OllamaChatRequest } from '../../types/ollama'
 import BenchmarkResult from '#models/benchmark_result'
 import { BenchmarkType, RunBenchmarkResponse, SubmitBenchmarkResponse, UpdateBuilderTagResponse } from '../../types/benchmark'

@ -49,6 +48,25 @@ class API {
    })()
  }

+  /**
+   * Requests `/ollama/remote-status` through the shared client.
+   * @returns the `configured` / `connected` flags reported by the server.
+   */
+  async getRemoteOllamaStatus(): Promise<{ configured: boolean; connected: boolean }> {
+    const fetchStatus = catchInternal(async () => {
+      const { data } = await this.client.get<{ configured: boolean; connected: boolean }>(
+        '/ollama/remote-status'
+      )
+      return data
+    })
+    return fetchStatus()
+  }
+
+  /**
+   * Posts `{ remoteUrl }` to `/ollama/configure-remote`.
+   * @param remoteUrl URL of the remote server, or `null` (the settings page passes `null` to
+   *   clear the configuration).
+   * @returns the server's `{ success, message }` payload.
+   */
+  async configureRemoteOllama(remoteUrl: string | null): Promise<{ success: boolean; message: string }> {
+    const sendConfig = catchInternal(async () => {
+      const payload = { remoteUrl }
+      const { data } = await this.client.post<{ success: boolean; message: string }>(
+        '/ollama/configure-remote',
+        payload
+      )
+      return data
+    })
+    return sendConfig()
+  }
+
  async deleteModel(model: string): Promise<{ success: boolean; message: string }> {
    return catchInternal(async () => {
      const response = await this.client.delete('/ollama/models', { data: { model } })
@ -239,7 +257,7 @@ class API {

  async getInstalledModels() {
    return catchInternal(async () => {
-      const response = await this.client.get<ModelResponse[]>('/ollama/installed-models')
+      const response = await this.client.get<NomadInstalledModel[]>('/ollama/installed-models')
      return response.data
    })()
  }
@ -258,7 +276,7 @@ class API {

  async sendChatMessage(chatRequest: OllamaChatRequest) {
    return catchInternal(async () => {
-      const response = await this.client.post<ChatResponse>('/ollama/chat', chatRequest)
+      const response = await this.client.post<NomadChatResponse>('/ollama/chat', chatRequest)
      return response.data
    })()
  }
@ -419,6 +437,20 @@ class API {
    })()
  }

+  /**
+   * Requests the list of failed embed jobs from `/rag/failed-jobs`.
+   * @returns the jobs sent by the server; the declared type also allows `undefined`.
+   */
+  async getFailedEmbedJobs(): Promise<EmbedJobWithProgress[] | undefined> {
+    const fetchFailed = catchInternal(async () => {
+      const { data } = await this.client.get<EmbedJobWithProgress[]>('/rag/failed-jobs')
+      return data
+    })
+    return fetchFailed()
+  }
+
+  /**
+   * Issues a DELETE against `/rag/failed-jobs`.
+   * @returns the server's summary (`message`, `cleaned`, `filesDeleted`); the declared type
+   *   also allows `undefined`.
+   */
+  async cleanupFailedEmbedJobs(): Promise<{ message: string; cleaned: number; filesDeleted: number } | undefined> {
+    const runCleanup = catchInternal(async () => {
+      const { data } = await this.client.delete<{
+        message: string
+        cleaned: number
+        filesDeleted: number
+      }>('/rag/failed-jobs')
+      return data
+    })
+    return runCleanup()
+  }
+
  async getStoredRAGFiles() {
    return catchInternal(async () => {
      const response = await this.client.get<{ files: string[] }>('/rag/files')
--- a/admin/inertia/pages/easy-setup/index.tsx
+++ b/admin/inertia/pages/easy-setup/index.tsx
@ -112,7 +112,9 @@ const CURATED_MAP_COLLECTIONS_KEY = 'curated-map-collections'
 const CURATED_CATEGORIES_KEY = 'curated-categories'
 const WIKIPEDIA_STATE_KEY = 'wikipedia-state'

-export default function EasySetupWizard(props: { system: { services: ServiceSlim[] } }) {
+export default function EasySetupWizard(props: {
+  system: { services: ServiceSlim[]; remoteOllamaUrl: string }
+}) {
  const { aiAssistantName } = usePage<{ aiAssistantName: string }>().props
  const CORE_CAPABILITIES = buildCoreCapabilities(aiAssistantName)

@ -122,6 +124,11 @@ export default function EasySetupWizard(props: { system: { services: ServiceSlim
  const [selectedAiModels, setSelectedAiModels] = useState<string[]>([])
  const [isProcessing, setIsProcessing] = useState(false)
  const [showAdditionalTools, setShowAdditionalTools] = useState(false)
+  const [remoteOllamaEnabled, setRemoteOllamaEnabled] = useState(
+    () => !!props.system.remoteOllamaUrl
+  )
+  const [remoteOllamaUrl, setRemoteOllamaUrl] = useState(() => props.system.remoteOllamaUrl ?? '')
+  const [remoteOllamaUrlError, setRemoteOllamaUrlError] = useState<string | null>(null)

  // Category/tier selection state
  const [selectedTiers, setSelectedTiers] = useState<Map<string, SpecTier>>(new Map())
@ -331,8 +338,24 @@ export default function EasySetupWizard(props: { system: { services: ServiceSlim
    setIsProcessing(true)

    try {
+      // If using remote Ollama, configure it first before other installs
+      if (remoteOllamaEnabled && remoteOllamaUrl) {
+        const remoteResult = await api.configureRemoteOllama(remoteOllamaUrl)
+        if (!remoteResult?.success) {
+          const msg = (remoteResult as any)?.message || 'Failed to configure remote Ollama.'
+          setRemoteOllamaUrlError(msg)
+          setIsProcessing(false)
+          setCurrentStep(1)
+          return
+        }
+      }
+
      // All of these ops don't actually wait for completion, they just kick off the process, so we can run them in parallel without awaiting each one sequentially
-      const installPromises = selectedServices.map((serviceName) => api.installService(serviceName))
+      // Exclude Ollama from local install only when remote mode is really in effect: the checkbox
+      // is ticked AND a URL was entered (the same condition that triggers the remote configuration
+      // above). With the box ticked but the URL left blank nothing remote is configured, so Ollama
+      // must still be installed locally or the AI capability would end up without any backend.
+      const servicesToInstall =
+        remoteOllamaEnabled && remoteOllamaUrl
+          ? selectedServices.filter((s) => s !== SERVICE_NAMES.OLLAMA)
+          : selectedServices
+      const installPromises = servicesToInstall.map((serviceName) => api.installService(serviceName))

      await Promise.all(installPromises)

@ -661,10 +684,54 @@ export default function EasySetupWizard(props: { system: { services: ServiceSlim
              <div>
                <h3 className="text-lg font-semibold text-text-primary mb-4">Core Capabilities</h3>
                <div className="grid grid-cols-1 lg:grid-cols-3 gap-4">
-                  {existingCoreCapabilities.map((capability) =>
-                    renderCapabilityCard(capability, true)
+                  {existingCoreCapabilities.map((capability) => {
+                    if (capability.id === 'ai') {
+                      const isAiSelected = isCapabilitySelected(capability)
+                      return (
+                        <div key={capability.id}>
+                          {renderCapabilityCard(capability, true)}
+                          {isAiSelected && !isCapabilityInstalled(capability) && (
+                            <div
+                              className="mt-2 p-4 bg-gray-50 rounded-lg border border-gray-200"
+                              onClick={(e) => e.stopPropagation()}
+                            >
+                              <label className="flex items-center gap-2 cursor-pointer select-none">
+                                <input
+                                  type="checkbox"
+                                  checked={remoteOllamaEnabled}
+                                  onChange={(e) => {
+                                    setRemoteOllamaEnabled(e.target.checked)
+                                    setRemoteOllamaUrlError(null)
+                                  }}
+                                  className="w-4 h-4 accent-desert-green"
+                                />
+                                <span className="text-sm font-medium text-gray-700">Use remote Ollama instance</span>
+                              </label>
+                              {remoteOllamaEnabled && (
+                                <div className="mt-3">
+                                  <input
+                                    type="text"
+                                    value={remoteOllamaUrl}
+                                    onChange={(e) => {
+                                      setRemoteOllamaUrl(e.target.value)
+                                      setRemoteOllamaUrlError(null)
+                                    }}
+                                    placeholder="http://192.168.1.100:11434"
+                                    className="w-full px-3 py-2 text-sm border border-gray-300 rounded-md focus:outline-none focus:ring-1 focus:ring-desert-green"
+                                  />
+                                  {remoteOllamaUrlError && (
+                                    <p className="mt-1 text-xs text-red-600">{remoteOllamaUrlError}</p>
                                  )}
                                </div>
+                              )}
+                            </div>
+                          )}
+                        </div>
+                      )
+                    }
+                    return renderCapabilityCard(capability, true)
+                  })}
+                </div>
              </div>
            )}

@ -777,8 +844,14 @@ export default function EasySetupWizard(props: { system: { services: ServiceSlim
                <p className="text-sm text-text-muted">Select models to download for offline AI</p>
              </div>
            </div>
-
-            {isLoadingRecommendedModels ? (
+            {remoteOllamaEnabled && remoteOllamaUrl ? (
+              <Alert
+                title="Remote Ollama selected"
+                message="Models are managed on the remote machine. You can add models from Settings > AI Assistant after setup, note this is only supported when using Ollama, not LM Studio and other OpenAI API software."
+                type="info"
+                variant="bordered"
+              />
+            ) : isLoadingRecommendedModels ? (
              <div className="flex justify-center py-12">
                <LoadingSpinner />
              </div>
--- a/admin/inertia/pages/settings/models.tsx
+++ b/admin/inertia/pages/settings/models.tsx
@ -10,7 +10,7 @@ import { useNotifications } from '~/context/NotificationContext'
 import api from '~/lib/api'
 import { useModals } from '~/context/ModalContext'
 import StyledModal from '~/components/StyledModal'
-import { ModelResponse } from 'ollama'
+import type { NomadInstalledModel } from '../../../types/ollama'
 import { SERVICE_NAMES } from '../../../constants/service_names'
 import Switch from '~/components/inputs/Switch'
 import StyledSectionHeader from '~/components/StyledSectionHeader'
@ -24,8 +24,8 @@ import { useSystemInfo } from '~/hooks/useSystemInfo'
 export default function ModelsPage(props: {
  models: {
    availableModels: NomadOllamaModel[]
-    installedModels: ModelResponse[]
-    settings: { chatSuggestionsEnabled: boolean; aiAssistantCustomName: string }
+    installedModels: NomadInstalledModel[]
+    settings: { chatSuggestionsEnabled: boolean; aiAssistantCustomName: string; remoteOllamaUrl: string }
  }
 }) {
  const { aiAssistantName } = usePage<{ aiAssistantName: string }>().props
@ -97,6 +97,43 @@ export default function ModelsPage(props: {
  const [aiAssistantCustomName, setAiAssistantCustomName] = useState(
    props.models.settings.aiAssistantCustomName
  )
+  const [remoteOllamaUrl, setRemoteOllamaUrl] = useState(props.models.settings.remoteOllamaUrl)
+  const [remoteOllamaError, setRemoteOllamaError] = useState<string | null>(null)
+  const [remoteOllamaSaving, setRemoteOllamaSaving] = useState(false)
+
+  /**
+   * Click handler for the "Save & Test" button: sends the entered URL to the backend via
+   * `api.configureRemoteOllama`. On success a notification is shown and the page is reloaded;
+   * every failure path surfaces a message through `remoteOllamaError`.
+   */
+  async function handleSaveRemoteOllama() {
+    setRemoteOllamaError(null)
+    setRemoteOllamaSaving(true)
+    try {
+      const res = await api.configureRemoteOllama(remoteOllamaUrl || null)
+      if (res?.success) {
+        addNotification({ message: res.message, type: 'success' })
+        router.reload()
+      } else {
+        // A missing or unsuccessful response must not be dropped silently; otherwise the
+        // spinner just stops and the user gets no feedback at all.
+        setRemoteOllamaError(res?.message || 'Failed to configure remote Ollama.')
+      }
+    } catch (error: any) {
+      const msg = error?.response?.data?.message || error?.message || 'Failed to configure remote Ollama.'
+      setRemoteOllamaError(msg)
+    } finally {
+      // Always release the saving flag so the buttons become usable again.
+      setRemoteOllamaSaving(false)
+    }
+  }
+
+  /**
+   * Click handler for the "Clear" button: sends `null` to `api.configureRemoteOllama`.
+   * On success the input is emptied, a notification is shown and the page is reloaded;
+   * every failure path surfaces a message through `remoteOllamaError`.
+   */
+  async function handleClearRemoteOllama() {
+    setRemoteOllamaError(null)
+    setRemoteOllamaSaving(true)
+    try {
+      const res = await api.configureRemoteOllama(null)
+      if (res?.success) {
+        setRemoteOllamaUrl('')
+        addNotification({ message: 'Remote Ollama configuration cleared.', type: 'success' })
+        router.reload()
+      } else {
+        // A missing or unsuccessful response must not be dropped silently; otherwise the
+        // user cannot tell that the configuration is still in place.
+        setRemoteOllamaError(res?.message || 'Failed to clear remote Ollama.')
+      }
+    } catch (error: any) {
+      setRemoteOllamaError(error?.message || 'Failed to clear remote Ollama.')
+    } finally {
+      // Always release the saving flag so the buttons become usable again.
+      setRemoteOllamaSaving(false)
+    }
+  }

  const [query, setQuery] = useState('')
  const [queryUI, setQueryUI] = useState('')
@ -286,9 +323,61 @@ export default function ModelsPage(props: {
              />
            </div>
          </div>
+          <StyledSectionHeader title="Remote Connection" className="mt-8 mb-4" />
+          <div className="bg-surface-primary rounded-lg border-2 border-border-subtle p-6">
+            <p className="text-sm text-text-secondary mb-4">
+              Connect to any OpenAI-compatible API server — Ollama, LM Studio, llama.cpp, and others are all supported.
+              For remote Ollama instances, the host must be started with <code className="bg-surface-secondary px-1 rounded">OLLAMA_HOST=0.0.0.0</code>.
+            </p>
+            <div className="flex items-end gap-3">
+              <div className="flex-1">
+                <Input
+                  name="remoteOllamaUrl"
+                  label="Remote Ollama/OpenAI API URL"
+                  placeholder="http://192.168.1.100:11434  (or :1234 for OpenAI API Compatible Apps)"
+                  value={remoteOllamaUrl}
+                  onChange={(e) => {
+                    setRemoteOllamaUrl(e.target.value)
+                    setRemoteOllamaError(null)
+                  }}
+                />
+                {remoteOllamaError && (
+                  <p className="text-sm text-red-600 mt-1">{remoteOllamaError}</p>
+                )}
+              </div>
+              <StyledButton
+                variant="primary"
+                onClick={handleSaveRemoteOllama}
+                loading={remoteOllamaSaving}
+                disabled={remoteOllamaSaving || !remoteOllamaUrl}
+                className="mb-0.5"
+              >
+                Save &amp; Test
+              </StyledButton>
+              {props.models.settings.remoteOllamaUrl && (
+                <StyledButton
+                  variant="danger"
+                  onClick={handleClearRemoteOllama}
+                  loading={remoteOllamaSaving}
+                  disabled={remoteOllamaSaving}
+                  className="mb-0.5"
+                >
+                  Clear
+                </StyledButton>
+              )}
+            </div>
+          </div>
+
          <ActiveModelDownloads withHeader />

          <StyledSectionHeader title="Models" className="mt-12 mb-4" />
+          <Alert
+            type="info"
+            variant="bordered"
+            title="Model downloading is only supported when using a Ollama backend."
+            message="If you are connected to an OpenAI API host (e.g. LM Studio), please download models directly in that application."
+            className="mb-4"
+          />
          <div className="flex justify-start items-center gap-3 mt-4">
            <Input
              name="search"
--- a/admin/package-lock.json
+++ b/admin/package-lock.json
@ -50,6 +50,7 @@
        "maplibre-gl": "^4.7.1",
        "mysql2": "^3.14.1",
        "ollama": "^0.6.3",
+        "openai": "^6.27.0",
        "pdf-parse": "^2.4.5",
        "pdf2pic": "^3.2.0",
        "pino-pretty": "^13.0.0",
@ -12640,6 +12641,27 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
+    "node_modules/openai": {
+      "version": "6.27.0",
+      "resolved": "https://registry.npmjs.org/openai/-/openai-6.27.0.tgz",
+      "integrity": "sha512-osTKySlrdYrLYTt0zjhY8yp0JUBmWDCN+Q+QxsV4xMQnnoVFpylgKGgxwN8sSdTNw0G4y+WUXs4eCMWpyDNWZQ==",
+      "license": "Apache-2.0",
+      "bin": {
+        "openai": "bin/cli"
+      },
+      "peerDependencies": {
+        "ws": "^8.18.0",
+        "zod": "^3.25 || ^4.0"
+      },
+      "peerDependenciesMeta": {
+        "ws": {
+          "optional": true
+        },
+        "zod": {
+          "optional": true
+        }
+      }
+    },
    "node_modules/opencollective-postinstall": {
      "version": "2.0.3",
      "resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz",
--- a/admin/package.json
+++ b/admin/package.json
@ -102,6 +102,7 @@
    "maplibre-gl": "^4.7.1",
    "mysql2": "^3.14.1",
    "ollama": "^0.6.3",
+    "openai": "^6.27.0",
    "pdf-parse": "^2.4.5",
    "pdf2pic": "^3.2.0",
    "pino-pretty": "^13.0.0",
--- a/admin/start/routes.ts
+++ b/admin/start/routes.ts
@ -107,6 +107,8 @@ router
    router.post('/models', [OllamaController, 'dispatchModelDownload'])
    router.delete('/models', [OllamaController, 'deleteModel'])
    router.get('/installed-models', [OllamaController, 'installedModels'])
+    router.post('/configure-remote', [OllamaController, 'configureRemote'])
+    router.get('/remote-status', [OllamaController, 'remoteStatus'])
  })
  .prefix('/api/ollama')

@ -130,6 +132,8 @@ router
    router.get('/files', [RagController, 'getStoredFiles'])
    router.delete('/files', [RagController, 'deleteFile'])
    router.get('/active-jobs', [RagController, 'getActiveJobs'])
+    router.get('/failed-jobs', [RagController, 'getFailedJobs'])
+    router.delete('/failed-jobs', [RagController, 'cleanupFailedJobs'])
    router.get('/job-status', [RagController, 'getJobStatus'])
    router.post('/sync', [RagController, 'scanAndSync'])
  })
--- a/admin/types/kv_store.ts
+++ b/admin/types/kv_store.ts
@ -10,6 +10,7 @@ export const KV_STORE_SCHEMA = {
  'ui.theme':                   'string',
  'ai.assistantCustomName':     'string',
  'gpu.type':                   'string',
+  'ai.remoteOllamaUrl':         'string',
 } as const

 type KVTagToType<T extends string> = T extends 'boolean' ? boolean : string
--- a/admin/types/ollama.ts
+++ b/admin/types/ollama.ts
@ -44,3 +44,16 @@ export type OllamaChatResponse = {
  }
  done: boolean
 }
+
+/**
+ * Shape of one entry in the installed-models list returned by `/ollama/installed-models`
+ * and passed to the models settings page.
+ */
+export type NomadInstalledModel = {
+  name: string
+  // The chat model picker only shows a size label when this is greater than 0.
+  // NOTE(review): presumably a byte count (it is passed to formatBytes) — confirm.
+  size: number
+  digest?: string
+  details?: Record<string, any>
+}
+
+/**
+ * Response type the frontend API client uses for `POST /ollama/chat`.
+ */
+export type NomadChatResponse = {
+  // `thinking` is optional; `content` is always present.
+  message: { content: string; thinking?: string }
+  done: boolean
+  model: string
+}
--- a/admin/types/rag.ts
+++ b/admin/types/rag.ts
@ -4,6 +4,7 @@ export type EmbedJobWithProgress = {
  filePath: string
  progress: number
  status: string
+  error?: string
 }

 export type ProcessAndEmbedFileResponse = {