From 1e1da483e948a77c0f4e7662d91ab36a4f2fa52d Mon Sep 17 00:00:00 2001 From: Henry Estela Date: Thu, 2 Apr 2026 02:52:11 +0000 Subject: [PATCH] feat(AI): enable flash_attn by default and disable ollama cloud (#616) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New defaults: OLLAMA_NO_CLOUD=1 - "Ollama can run in local only mode by disabling Ollama’s cloud features. By turning off Ollama’s cloud features, you will lose the ability to use Ollama’s cloud models and web search." https://ollama.com/blog/web-search https://docs.ollama.com/faq#how-do-i-disable-ollama%E2%80%99s-cloud-features example output: ``` ollama run minimax-m2.7:cloud Error: ollama cloud is disabled: remote model details are unavailable ``` This setting can be safely disabled as you have to click on a link to log in to ollama cloud and there's no real way to do that in nomad outside of looking at the nomad_ollama logs. This one can be disabled in settings in case there's a model out there that doesn't play nice, but that doesn't seem necessary so far. OLLAMA_FLASH_ATTENTION=1 - "Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. 
" Tested with llama3.2: ``` docker logs nomad_ollama --tail 1000 2>&1 |grep --color -i flash_attn llama_context: flash_attn = enabled ``` And with second_constantine/deepseek-coder-v2 with is based on https://huggingface.co/lmstudio-community/DeepSeek-Coder-V2-Lite-Instruct-GGUF which is a model that specifically calls out that you should disable flash attention, but during testing it seems ollama can do this for you automatically: ``` docker logs nomad_ollama --tail 1000 2>&1 |grep --color -i flash_attn llama_context: flash_attn = disabled ``` --- admin/app/controllers/settings_controller.ts | 2 ++ admin/app/services/docker_service.ts | 11 ++++++++++- admin/constants/kv_store.ts | 2 +- admin/inertia/pages/settings/models.tsx | 14 +++++++++++++- admin/types/kv_store.ts | 1 + 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/admin/app/controllers/settings_controller.ts b/admin/app/controllers/settings_controller.ts index c21ddd6..24cb4ce 100644 --- a/admin/app/controllers/settings_controller.ts +++ b/admin/app/controllers/settings_controller.ts @@ -64,6 +64,7 @@ export default class SettingsController { const chatSuggestionsEnabled = await KVStore.getValue('chat.suggestionsEnabled') const aiAssistantCustomName = await KVStore.getValue('ai.assistantCustomName') const remoteOllamaUrl = await KVStore.getValue('ai.remoteOllamaUrl') + const ollamaFlashAttention = await KVStore.getValue('ai.ollamaFlashAttention') return inertia.render('settings/models', { models: { availableModels: availableModels?.models || [], @@ -72,6 +73,7 @@ export default class SettingsController { chatSuggestionsEnabled: chatSuggestionsEnabled ?? false, aiAssistantCustomName: aiAssistantCustomName ?? '', remoteOllamaUrl: remoteOllamaUrl ?? '', + ollamaFlashAttention: ollamaFlashAttention ?? 
true, }, }, }) diff --git a/admin/app/services/docker_service.ts b/admin/app/services/docker_service.ts index d3edb5d..3c44332 100644 --- a/admin/app/services/docker_service.ts +++ b/admin/app/services/docker_service.ts @@ -505,6 +505,15 @@ export class DockerService { } } + const ollamaEnv: string[] = [] + if (service.service_name === SERVICE_NAMES.OLLAMA) { + ollamaEnv.push('OLLAMA_NO_CLOUD=1') + const flashAttentionEnabled = await KVStore.getValue('ai.ollamaFlashAttention') + if (flashAttentionEnabled !== false) { + ollamaEnv.push('OLLAMA_FLASH_ATTENTION=1') + } + } + this._broadcast( service.service_name, 'creating', @@ -522,7 +531,7 @@ export class DockerService { HostConfig: gpuHostConfig, ...(containerConfig?.WorkingDir && { WorkingDir: containerConfig.WorkingDir }), ...(containerConfig?.ExposedPorts && { ExposedPorts: containerConfig.ExposedPorts }), - ...(containerConfig?.Env && { Env: containerConfig.Env }), + Env: [...(containerConfig?.Env ?? []), ...ollamaEnv], ...(service.container_command ? 
{ Cmd: service.container_command.split(' ') } : {}), // Ensure container is attached to the Nomad docker network in production ...(process.env.NODE_ENV === 'production' && { diff --git a/admin/constants/kv_store.ts b/admin/constants/kv_store.ts index 1085723..c49416c 100644 --- a/admin/constants/kv_store.ts +++ b/admin/constants/kv_store.ts @@ -1,3 +1,3 @@ import { KVStoreKey } from "../types/kv_store.js"; -export const SETTINGS_KEYS: KVStoreKey[] = ['chat.suggestionsEnabled', 'chat.lastModel', 'ui.hasVisitedEasySetup', 'ui.theme', 'system.earlyAccess', 'ai.assistantCustomName', 'ai.remoteOllamaUrl']; \ No newline at end of file +export const SETTINGS_KEYS: KVStoreKey[] = ['chat.suggestionsEnabled', 'chat.lastModel', 'ui.hasVisitedEasySetup', 'ui.theme', 'system.earlyAccess', 'ai.assistantCustomName', 'ai.remoteOllamaUrl', 'ai.ollamaFlashAttention']; \ No newline at end of file diff --git a/admin/inertia/pages/settings/models.tsx b/admin/inertia/pages/settings/models.tsx index 08d9616..d4124fe 100644 --- a/admin/inertia/pages/settings/models.tsx +++ b/admin/inertia/pages/settings/models.tsx @@ -26,7 +26,7 @@ export default function ModelsPage(props: { models: { availableModels: NomadOllamaModel[] installedModels: NomadInstalledModel[] - settings: { chatSuggestionsEnabled: boolean; aiAssistantCustomName: string; remoteOllamaUrl: string } + settings: { chatSuggestionsEnabled: boolean; aiAssistantCustomName: string; remoteOllamaUrl: string; ollamaFlashAttention: boolean } } }) { const { aiAssistantName } = usePage<{ aiAssistantName: string }>().props @@ -95,6 +95,9 @@ export default function ModelsPage(props: { const [chatSuggestionsEnabled, setChatSuggestionsEnabled] = useState( props.models.settings.chatSuggestionsEnabled ) + const [ollamaFlashAttention, setOllamaFlashAttention] = useState( + props.models.settings.ollamaFlashAttention + ) const [aiAssistantCustomName, setAiAssistantCustomName] = useState( props.models.settings.aiAssistantCustomName ) @@ -308,6 
+311,15 @@ export default function ModelsPage(props: { label="Chat Suggestions" description="Display AI-generated conversation starters in the chat interface" /> + { + setOllamaFlashAttention(newVal) + updateSettingMutation.mutate({ key: 'ai.ollamaFlashAttention', value: newVal }) + }} + label="Flash Attention" + description="Enables OLLAMA_FLASH_ATTENTION=1 for improved memory efficiency. Disable if you experience instability. Takes effect after reinstalling the AI Assistant." + /> = T extends 'boolean' ? boolean : string