mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-03-28 03:29:25 +01:00
feat(GPU): warn when GPU passthrough not working and offer one-click fix
Ollama can silently run on CPU even when the host has an NVIDIA GPU, resulting in ~3 tok/s instead of ~167 tok/s. This happens when Ollama was installed before the GPU toolkit, or when the container was recreated without proper DeviceRequests. Users had zero indication. Adds a GPU health check to the system info API response that detects when the host has an NVIDIA runtime but nvidia-smi fails inside the Ollama container. Shows a warning banner on the System Information and AI Settings pages with a one-click "Reinstall AI Assistant" button that force-reinstalls Ollama with GPU passthrough. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d314e82d17
commit
f7515d8e19
|
|
@ -4,7 +4,7 @@ import { DockerService } from '#services/docker_service'
|
|||
import { ServiceSlim } from '../../types/services.js'
|
||||
import logger from '@adonisjs/core/services/logger'
|
||||
import si from 'systeminformation'
|
||||
import { NomadDiskInfo, NomadDiskInfoRaw, SystemInformationResponse } from '../../types/system.js'
|
||||
import { GpuHealthStatus, NomadDiskInfo, NomadDiskInfoRaw, SystemInformationResponse } from '../../types/system.js'
|
||||
import { SERVICE_NAMES } from '../../constants/service_names.js'
|
||||
import { readFileSync } from 'fs'
|
||||
import path, { join } from 'path'
|
||||
|
|
@ -235,6 +235,13 @@ export class SystemService {
|
|||
logger.error('Error reading disk info file:', error)
|
||||
}
|
||||
|
||||
// GPU health tracking — detect when host has NVIDIA GPU but Ollama can't access it
|
||||
let gpuHealth: GpuHealthStatus = {
|
||||
status: 'no_gpu',
|
||||
hasNvidiaRuntime: false,
|
||||
ollamaGpuAccessible: false,
|
||||
}
|
||||
|
||||
// Query Docker API for host-level info (hostname, OS, GPU runtime)
|
||||
// si.osInfo() returns the container's info inside Docker, not the host's
|
||||
try {
|
||||
|
|
@ -255,6 +262,7 @@ export class SystemService {
|
|||
if (!graphics.controllers || graphics.controllers.length === 0) {
|
||||
const runtimes = dockerInfo.Runtimes || {}
|
||||
if ('nvidia' in runtimes) {
|
||||
gpuHealth.hasNvidiaRuntime = true
|
||||
const nvidiaInfo = await this.getNvidiaSmiInfo()
|
||||
if (Array.isArray(nvidiaInfo)) {
|
||||
graphics.controllers = nvidiaInfo.map((gpu) => ({
|
||||
|
|
@ -264,10 +272,19 @@ export class SystemService {
|
|||
vram: gpu.vram,
|
||||
vramDynamic: false, // assume false here, we don't actually use this field for our purposes.
|
||||
}))
|
||||
gpuHealth.status = 'ok'
|
||||
gpuHealth.ollamaGpuAccessible = true
|
||||
} else if (nvidiaInfo === 'OLLAMA_NOT_FOUND') {
|
||||
gpuHealth.status = 'ollama_not_installed'
|
||||
} else {
|
||||
logger.warn(`NVIDIA runtime detected but failed to get GPU info: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||
gpuHealth.status = 'passthrough_failed'
|
||||
logger.warn(`NVIDIA runtime detected but GPU passthrough failed: ${typeof nvidiaInfo === 'string' ? nvidiaInfo : JSON.stringify(nvidiaInfo)}`)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// si.graphics() returned controllers (host install, not Docker) — GPU is working
|
||||
gpuHealth.status = 'ok'
|
||||
gpuHealth.ollamaGpuAccessible = true
|
||||
}
|
||||
} catch {
|
||||
// Docker info query failed, skip host-level enrichment
|
||||
|
|
@ -282,6 +299,7 @@ export class SystemService {
|
|||
fsSize,
|
||||
uptime,
|
||||
graphics,
|
||||
gpuHealth,
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error getting system info:', error)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ import Input from '~/components/inputs/Input'
|
|||
import { IconSearch, IconRefresh } from '@tabler/icons-react'
|
||||
import useDebounce from '~/hooks/useDebounce'
|
||||
import ActiveModelDownloads from '~/components/ActiveModelDownloads'
|
||||
import { useSystemInfo } from '~/hooks/useSystemInfo'
|
||||
|
||||
export default function ModelsPage(props: {
|
||||
models: {
|
||||
|
|
@ -32,6 +33,64 @@ export default function ModelsPage(props: {
|
|||
const { addNotification } = useNotifications()
|
||||
const { openModal, closeAllModals } = useModals()
|
||||
const { debounce } = useDebounce()
|
||||
const { data: systemInfo } = useSystemInfo({})
|
||||
|
||||
const [gpuBannerDismissed, setGpuBannerDismissed] = useState(() => {
|
||||
try {
|
||||
return localStorage.getItem('nomad:gpu-banner-dismissed') === 'true'
|
||||
} catch {
|
||||
return false
|
||||
}
|
||||
})
|
||||
const [reinstalling, setReinstalling] = useState(false)
|
||||
|
||||
const handleDismissGpuBanner = () => {
|
||||
setGpuBannerDismissed(true)
|
||||
try {
|
||||
localStorage.setItem('nomad:gpu-banner-dismissed', 'true')
|
||||
} catch {}
|
||||
}
|
||||
|
||||
const handleForceReinstallOllama = () => {
|
||||
openModal(
|
||||
<StyledModal
|
||||
title="Reinstall AI Assistant?"
|
||||
onConfirm={async () => {
|
||||
closeAllModals()
|
||||
setReinstalling(true)
|
||||
try {
|
||||
const response = await api.forceReinstallService('nomad_ollama')
|
||||
if (!response || !response.success) {
|
||||
throw new Error(response?.message || 'Force reinstall failed')
|
||||
}
|
||||
addNotification({
|
||||
message: `${aiAssistantName} is being reinstalled with GPU support. This page will reload shortly.`,
|
||||
type: 'success',
|
||||
})
|
||||
try { localStorage.removeItem('nomad:gpu-banner-dismissed') } catch {}
|
||||
setTimeout(() => window.location.reload(), 5000)
|
||||
} catch (error) {
|
||||
addNotification({
|
||||
message: `Failed to reinstall: ${error instanceof Error ? error.message : 'Unknown error'}`,
|
||||
type: 'error',
|
||||
})
|
||||
setReinstalling(false)
|
||||
}
|
||||
}}
|
||||
onCancel={closeAllModals}
|
||||
open={true}
|
||||
confirmText="Reinstall"
|
||||
cancelText="Cancel"
|
||||
>
|
||||
<p className="text-gray-700">
|
||||
This will recreate the {aiAssistantName} container with GPU support enabled.
|
||||
Your downloaded models will be preserved. The service will be briefly
|
||||
unavailable during reinstall.
|
||||
</p>
|
||||
</StyledModal>,
|
||||
'gpu-health-force-reinstall-modal'
|
||||
)
|
||||
}
|
||||
const [chatSuggestionsEnabled, setChatSuggestionsEnabled] = useState(
|
||||
props.models.settings.chatSuggestionsEnabled
|
||||
)
|
||||
|
|
@ -178,6 +237,26 @@ export default function ModelsPage(props: {
|
|||
className="!mt-6"
|
||||
/>
|
||||
)}
|
||||
{isInstalled && systemInfo?.gpuHealth?.status === 'passthrough_failed' && !gpuBannerDismissed && (
|
||||
<Alert
|
||||
type="warning"
|
||||
variant="bordered"
|
||||
title="GPU Not Accessible"
|
||||
message={`Your system has an NVIDIA GPU, but ${aiAssistantName} can't access it. AI is running on CPU only, which is significantly slower.`}
|
||||
className="!mt-6"
|
||||
dismissible={true}
|
||||
onDismiss={handleDismissGpuBanner}
|
||||
buttonProps={{
|
||||
children: `Fix: Reinstall ${aiAssistantName}`,
|
||||
icon: 'IconRefresh',
|
||||
variant: 'action',
|
||||
size: 'sm',
|
||||
onClick: handleForceReinstallOllama,
|
||||
loading: reinstalling,
|
||||
disabled: reinstalling,
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
|
||||
<StyledSectionHeader title="Settings" className="mt-8 mb-4" />
|
||||
<div className="bg-white rounded-lg border-2 border-gray-200 p-6">
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import { useState } from 'react'
|
||||
import { Head } from '@inertiajs/react'
|
||||
import SettingsLayout from '~/layouts/SettingsLayout'
|
||||
import { SystemInformationResponse } from '../../../types/system'
|
||||
|
|
@ -6,7 +7,11 @@ import CircularGauge from '~/components/systeminfo/CircularGauge'
|
|||
import HorizontalBarChart from '~/components/HorizontalBarChart'
|
||||
import InfoCard from '~/components/systeminfo/InfoCard'
|
||||
import Alert from '~/components/Alert'
|
||||
import StyledModal from '~/components/StyledModal'
|
||||
import { useSystemInfo } from '~/hooks/useSystemInfo'
|
||||
import { useNotifications } from '~/context/NotificationContext'
|
||||
import { useModals } from '~/context/ModalContext'
|
||||
import api from '~/lib/api'
|
||||
import StatusCard from '~/components/systeminfo/StatusCard'
|
||||
import { IconCpu, IconDatabase, IconServer, IconDeviceDesktop, IconComponents } from '@tabler/icons-react'
|
||||
|
||||
|
|
@ -16,6 +21,65 @@ export default function SettingsPage(props: {
|
|||
const { data: info } = useSystemInfo({
|
||||
initialData: props.system.info,
|
||||
})
|
||||
const { addNotification } = useNotifications()
|
||||
const { openModal, closeAllModals } = useModals()
|
||||
|
||||
const [gpuBannerDismissed, setGpuBannerDismissed] = useState(() => {
|
||||
try {
|
||||
return localStorage.getItem('nomad:gpu-banner-dismissed') === 'true'
|
||||
} catch {
|
||||
return false
|
||||
}
|
||||
})
|
||||
const [reinstalling, setReinstalling] = useState(false)
|
||||
|
||||
const handleDismissGpuBanner = () => {
|
||||
setGpuBannerDismissed(true)
|
||||
try {
|
||||
localStorage.setItem('nomad:gpu-banner-dismissed', 'true')
|
||||
} catch {}
|
||||
}
|
||||
|
||||
const handleForceReinstallOllama = () => {
|
||||
openModal(
|
||||
<StyledModal
|
||||
title="Reinstall AI Assistant?"
|
||||
onConfirm={async () => {
|
||||
closeAllModals()
|
||||
setReinstalling(true)
|
||||
try {
|
||||
const response = await api.forceReinstallService('nomad_ollama')
|
||||
if (!response || !response.success) {
|
||||
throw new Error(response?.message || 'Force reinstall failed')
|
||||
}
|
||||
addNotification({
|
||||
message: 'AI Assistant is being reinstalled with GPU support. This page will reload shortly.',
|
||||
type: 'success',
|
||||
})
|
||||
try { localStorage.removeItem('nomad:gpu-banner-dismissed') } catch {}
|
||||
setTimeout(() => window.location.reload(), 5000)
|
||||
} catch (error) {
|
||||
addNotification({
|
||||
message: `Failed to reinstall: ${error instanceof Error ? error.message : 'Unknown error'}`,
|
||||
type: 'error',
|
||||
})
|
||||
setReinstalling(false)
|
||||
}
|
||||
}}
|
||||
onCancel={closeAllModals}
|
||||
open={true}
|
||||
confirmText="Reinstall"
|
||||
cancelText="Cancel"
|
||||
>
|
||||
<p className="text-gray-700">
|
||||
This will recreate the AI Assistant container with GPU support enabled.
|
||||
Your downloaded models will be preserved. The service will be briefly
|
||||
unavailable during reinstall.
|
||||
</p>
|
||||
</StyledModal>,
|
||||
'gpu-health-force-reinstall-modal'
|
||||
)
|
||||
}
|
||||
|
||||
// Use (total - available) to reflect actual memory pressure.
|
||||
// mem.used includes reclaimable buff/cache on Linux, which inflates the number.
|
||||
|
|
@ -173,6 +237,27 @@ export default function SettingsPage(props: {
|
|||
},
|
||||
]}
|
||||
/>
|
||||
{info?.gpuHealth?.status === 'passthrough_failed' && !gpuBannerDismissed && (
|
||||
<div className="lg:col-span-2">
|
||||
<Alert
|
||||
type="warning"
|
||||
variant="bordered"
|
||||
title="GPU Not Accessible to AI Assistant"
|
||||
message="Your system has an NVIDIA GPU, but the AI Assistant can't access it. AI is running on CPU only, which is significantly slower."
|
||||
dismissible={true}
|
||||
onDismiss={handleDismissGpuBanner}
|
||||
buttonProps={{
|
||||
children: 'Fix: Reinstall AI Assistant',
|
||||
icon: 'IconRefresh',
|
||||
variant: 'action',
|
||||
size: 'sm',
|
||||
onClick: handleForceReinstallOllama,
|
||||
loading: reinstalling,
|
||||
disabled: reinstalling,
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
{info?.graphics?.controllers && info.graphics.controllers.length > 0 && (
|
||||
<InfoCard
|
||||
title="Graphics"
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
import { Systeminformation } from 'systeminformation'
|
||||
|
||||
export type GpuHealthStatus = {
|
||||
status: 'ok' | 'passthrough_failed' | 'no_gpu' | 'ollama_not_installed'
|
||||
hasNvidiaRuntime: boolean
|
||||
ollamaGpuAccessible: boolean
|
||||
}
|
||||
|
||||
export type SystemInformationResponse = {
|
||||
cpu: Systeminformation.CpuData
|
||||
mem: Systeminformation.MemData
|
||||
|
|
@ -9,6 +15,7 @@ export type SystemInformationResponse = {
|
|||
fsSize: Systeminformation.FsSizeData[]
|
||||
uptime: Systeminformation.TimeData
|
||||
graphics: Systeminformation.GraphicsData
|
||||
gpuHealth?: GpuHealthStatus
|
||||
}
|
||||
|
||||
// Type inference is not working properly with usePage and shared props, so we define this type manually
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user