diff --git a/admin/app/services/system_service.ts b/admin/app/services/system_service.ts index 0d5d3a6..13cee1b 100644 --- a/admin/app/services/system_service.ts +++ b/admin/app/services/system_service.ts @@ -4,7 +4,7 @@ import { DockerService } from '#services/docker_service' import { ServiceSlim } from '../../types/services.js' import logger from '@adonisjs/core/services/logger' import si from 'systeminformation' -import { NomadDiskInfo, NomadDiskInfoRaw, SystemInformationResponse } from '../../types/system.js' +import { GpuHealthStatus, NomadDiskInfo, NomadDiskInfoRaw, SystemInformationResponse } from '../../types/system.js' import { SERVICE_NAMES } from '../../constants/service_names.js' import { readFileSync } from 'fs' import path, { join } from 'path' @@ -235,6 +235,13 @@ export class SystemService { logger.error('Error reading disk info file:', error) } + // GPU health tracking — detect when host has NVIDIA GPU but Ollama can't access it + let gpuHealth: GpuHealthStatus = { + status: 'no_gpu', + hasNvidiaRuntime: false, + ollamaGpuAccessible: false, + } + // Query Docker API for host-level info (hostname, OS, GPU runtime) // si.osInfo() returns the container's info inside Docker, not the host's try { @@ -255,6 +262,7 @@ export class SystemService { if (!graphics.controllers || graphics.controllers.length === 0) { const runtimes = dockerInfo.Runtimes || {} if ('nvidia' in runtimes) { + gpuHealth.hasNvidiaRuntime = true const nvidiaInfo = await this.getNvidiaSmiInfo() if (Array.isArray(nvidiaInfo)) { graphics.controllers = nvidiaInfo.map((gpu) => ({ @@ -264,10 +272,19 @@ export class SystemService { vram: gpu.vram, vramDynamic: false, // assume false here, we don't actually use this field for our purposes. })) + gpuHealth.status = 'ok' + gpuHealth.ollamaGpuAccessible = true + } else if (nvidiaInfo === 'OLLAMA_NOT_FOUND') { + gpuHealth.status = 'ollama_not_installed' } else { - logger.warn(`NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`) + gpuHealth.status = 'passthrough_failed' + logger.warn(`NVIDIA runtime detected but GPU passthrough failed: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`) } } + } else { + // si.graphics() returned controllers (host install, not Docker) — GPU is working + gpuHealth.status = 'ok' + gpuHealth.ollamaGpuAccessible = true } } catch { // Docker info query failed, skip host-level enrichment @@ -282,6 +299,7 @@ export class SystemService { fsSize, uptime, graphics, + gpuHealth, } } catch (error) { logger.error('Error getting system info:', error) diff --git a/admin/inertia/pages/settings/models.tsx b/admin/inertia/pages/settings/models.tsx index f21963f..a351fc9 100644 --- a/admin/inertia/pages/settings/models.tsx +++ b/admin/inertia/pages/settings/models.tsx @@ -19,6 +19,7 @@ import Input from '~/components/inputs/Input' import { IconSearch } from '@tabler/icons-react' import useDebounce from '~/hooks/useDebounce' import ActiveModelDownloads from '~/components/ActiveModelDownloads' +import { useSystemInfo } from '~/hooks/useSystemInfo' export default function ModelsPage(props: { models: { @@ -32,6 +33,64 @@ export default function ModelsPage(props: { const { addNotification } = useNotifications() const { openModal, closeAllModals } = useModals() const { debounce } = useDebounce() + const { data: systemInfo } = useSystemInfo({}) + + const [gpuBannerDismissed, setGpuBannerDismissed] = useState(() => { + try { + return localStorage.getItem('nomad:gpu-banner-dismissed') === 'true' + } catch { + return false + } + }) + const [reinstalling, setReinstalling] = useState(false) + + const handleDismissGpuBanner = () => { + setGpuBannerDismissed(true) + try { + localStorage.setItem('nomad:gpu-banner-dismissed', 'true') + } catch {} + } + + const handleForceReinstallOllama = () => { + openModal( + { + closeAllModals() + setReinstalling(true) + try { + const response = await api.forceReinstallService('nomad_ollama') + if (!response || !response.success) { + throw new Error(response?.message || 'Force reinstall failed') + } + addNotification({ + message: `${aiAssistantName} is being reinstalled with GPU support. This page will reload shortly.`, + type: 'success', + }) + try { localStorage.removeItem('nomad:gpu-banner-dismissed') } catch {} + setTimeout(() => window.location.reload(), 5000) + } catch (error) { + addNotification({ + message: `Failed to reinstall: ${error instanceof Error ? error.message : 'Unknown error'}`, + type: 'error', + }) + setReinstalling(false) + } + }} + onCancel={closeAllModals} + open={true} + confirmText="Reinstall" + cancelText="Cancel" + > +

+ This will recreate the {aiAssistantName} container with GPU support enabled. + Your downloaded models will be preserved. The service will be briefly + unavailable during reinstall. +

+
, + 'gpu-health-force-reinstall-modal' + ) + } const [chatSuggestionsEnabled, setChatSuggestionsEnabled] = useState( props.models.settings.chatSuggestionsEnabled ) @@ -164,6 +223,26 @@ export default function ModelsPage(props: { className="!mt-6" /> )} + {isInstalled && systemInfo?.gpuHealth?.status === 'passthrough_failed' && !gpuBannerDismissed && ( + + )}
diff --git a/admin/inertia/pages/settings/system.tsx b/admin/inertia/pages/settings/system.tsx index f0aaf97..b2891b2 100644 --- a/admin/inertia/pages/settings/system.tsx +++ b/admin/inertia/pages/settings/system.tsx @@ -1,3 +1,4 @@ +import { useState } from 'react' import { Head } from '@inertiajs/react' import SettingsLayout from '~/layouts/SettingsLayout' import { SystemInformationResponse } from '../../../types/system' @@ -6,7 +7,11 @@ import CircularGauge from '~/components/systeminfo/CircularGauge' import HorizontalBarChart from '~/components/HorizontalBarChart' import InfoCard from '~/components/systeminfo/InfoCard' import Alert from '~/components/Alert' +import StyledModal from '~/components/StyledModal' import { useSystemInfo } from '~/hooks/useSystemInfo' +import { useNotifications } from '~/context/NotificationContext' +import { useModals } from '~/context/ModalContext' +import api from '~/lib/api' import StatusCard from '~/components/systeminfo/StatusCard' import { IconCpu, IconDatabase, IconServer, IconDeviceDesktop, IconComponents } from '@tabler/icons-react' @@ -16,6 +21,65 @@ export default function SettingsPage(props: { const { data: info } = useSystemInfo({ initialData: props.system.info, }) + const { addNotification } = useNotifications() + const { openModal, closeAllModals } = useModals() + + const [gpuBannerDismissed, setGpuBannerDismissed] = useState(() => { + try { + return localStorage.getItem('nomad:gpu-banner-dismissed') === 'true' + } catch { + return false + } + }) + const [reinstalling, setReinstalling] = useState(false) + + const handleDismissGpuBanner = () => { + setGpuBannerDismissed(true) + try { + localStorage.setItem('nomad:gpu-banner-dismissed', 'true') + } catch {} + } + + const handleForceReinstallOllama = () => { + openModal( + { + closeAllModals() + setReinstalling(true) + try { + const response = await api.forceReinstallService('nomad_ollama') + if (!response || !response.success) { + throw new Error(response?.message || 'Force reinstall failed') + } + addNotification({ + message: 'AI Assistant is being reinstalled with GPU support. This page will reload shortly.', + type: 'success', + }) + try { localStorage.removeItem('nomad:gpu-banner-dismissed') } catch {} + setTimeout(() => window.location.reload(), 5000) + } catch (error) { + addNotification({ + message: `Failed to reinstall: ${error instanceof Error ? error.message : 'Unknown error'}`, + type: 'error', + }) + setReinstalling(false) + } + }} + onCancel={closeAllModals} + open={true} + confirmText="Reinstall" + cancelText="Cancel" + > +

+ This will recreate the AI Assistant container with GPU support enabled. + Your downloaded models will be preserved. The service will be briefly + unavailable during reinstall. +

+
, + 'gpu-health-force-reinstall-modal' + ) + } // Use (total - available) to reflect actual memory pressure. // mem.used includes reclaimable buff/cache on Linux, which inflates the number. @@ -173,6 +237,27 @@ export default function SettingsPage(props: { }, ]} /> + {info?.gpuHealth?.status === 'passthrough_failed' && !gpuBannerDismissed && ( +
+ +
+ )} {info?.graphics?.controllers && info.graphics.controllers.length > 0 && (