diff --git a/admin/app/services/benchmark_service.ts b/admin/app/services/benchmark_service.ts
index 80247f7..47e4cf1 100644
--- a/admin/app/services/benchmark_service.ts
+++ b/admin/app/services/benchmark_service.ts
@@ -317,6 +317,25 @@ export class BenchmarkService {
       }
     }
 
+    // Fallback: AMD discrete cards. si.graphics() returns empty inside Docker for AMD,
+    // the nvidia-smi path doesn't apply, and the APU regex only catches integrated parts.
+    // SystemService.getSystemInfo() already handles AMD via the marker file + Ollama log
+    // probe added in PR #804, so reuse that plumbing rather than duplicating it here.
+    if (!gpuModel) {
+      try {
+        const { SystemService } = await import('./system_service.js')
+        const systemService = new SystemService(this.dockerService)
+        const sysInfo = await systemService.getSystemInfo()
+        const sysGpuModel = sysInfo?.graphics?.controllers?.[0]?.model
+        if (sysGpuModel) {
+          gpuModel = sysGpuModel
+        }
+      } catch (sysError: unknown) {
+        const reason = sysError instanceof Error ? sysError.message : String(sysError)
+        logger.warn(`[BenchmarkService] system_service AMD fallback failed: ${reason}`)
+      }
+    }
+
     return {
       cpu_model: `${cpu.manufacturer} ${cpu.brand}`,
       cpu_cores: cpu.physicalCores,
diff --git a/admin/app/services/docker_service.ts b/admin/app/services/docker_service.ts
index 7501e78..7e8fc4e 100644
--- a/admin/app/services/docker_service.ts
+++ b/admin/app/services/docker_service.ts
@@ -592,10 +592,12 @@ export class DockerService {
           ollamaEnv.push('OLLAMA_FLASH_ATTENTION=1')
         }
         if (amdGpuConfigured) {
-          // RDNA3 iGPUs (gfx1103: 780M, 880M, 890M, ...) aren't on AMD's official ROCm
-          // allowlist but work when forced to identify as gfx1100 via HSA_OVERRIDE_GFX_VERSION.
-          // Harmless on supported discrete cards (gfx1030 RX 6800, etc.) — they ignore the override.
-          ollamaEnv.push('HSA_OVERRIDE_GFX_VERSION=11.0.0')
+          // gfx-aware HSA override — only set for cards that actually need it. See
+          // _resolveAmdHsaOverride() for the resolution order and gfx → version mapping.
+          const hsaOverride = await this._resolveAmdHsaOverride()
+          if (hsaOverride) {
+            ollamaEnv.push(`HSA_OVERRIDE_GFX_VERSION=${hsaOverride}`)
+          }
         }
       }
 
@@ -999,6 +1001,83 @@ export class DockerService {
     }
   }
 
+  /**
+   * Resolve the HSA_OVERRIDE_GFX_VERSION value for the host's AMD GPU.
+   *
+   * gfx1030 (RX 6800/6900), gfx1100/1101/1102 (RX 7900/7800/7600) are on AMD's
+   * official ROCm allowlist — forcing an override on these breaks GPU discovery.
+   * gfx1031..gfx1036 (remaining RDNA 2 parts, e.g. RX 6700 XT or the 680M iGPU) need 10.3.0
+   * to coerce to gfx1030.
+   * gfx1103 / gfx1150 / gfx1151 (RDNA 3/3.5 iGPUs like 780M / 890M / Strix Halo) need 11.0.0.
+   *
+   * Resolution order:
+   * 1. KV `ai.amdHsaOverride` — manual user override; accepts 'none' / 'off' / 'false' (disable)
+   *    or a semver-style value.
+   * 2. Marker file `/app/storage/.nomad-amd-gfx` written by install_nomad.sh.
+   * 3. Default: '11.0.0' — preserves prior behavior so existing iGPU users don't regress on
+   *    upgrade. Discrete-card users on existing installs can opt out via the KV.
+   *
+   * @returns the override value, or null when no override should be applied.
+   */
+  private async _resolveAmdHsaOverride(): Promise<string | null> {
+    const manualRaw = await KVStore.getValue('ai.amdHsaOverride')
+    if (manualRaw !== null && manualRaw !== undefined && String(manualRaw).trim() !== '') {
+      const manual = String(manualRaw).trim().toLowerCase()
+      if (manual === 'none' || manual === 'off' || manual === 'false') {
+        logger.info('[DockerService] HSA override disabled via ai.amdHsaOverride')
+        return null
+      }
+      if (/^\d+\.\d+\.\d+$/.test(manual)) {
+        logger.info(`[DockerService] HSA override forced to ${manual} via ai.amdHsaOverride`)
+        return manual
+      }
+      logger.warn(`[DockerService] Ignoring invalid ai.amdHsaOverride value: ${manualRaw}`)
+    }
+
+    try {
+      // Normalize case so a hand-edited marker ('GFX1030') still maps correctly.
+      const gfx = (await readFile('/app/storage/.nomad-amd-gfx', 'utf8')).trim().toLowerCase()
+      if (gfx !== '') {
+        const mapped = this._mapGfxToHsaOverride(gfx)
+        logger.info(`[DockerService] AMD gfx marker '${gfx}' → HSA override ${mapped ?? 'none'}`)
+        return mapped
+      }
+      logger.warn('[DockerService] AMD gfx marker is empty; ignoring it')
+    } catch (error: unknown) {
+      // ENOENT means the marker is absent — most likely an existing install upgraded without
+      // re-running install_nomad.sh. Any other failure (EACCES, EISDIR, ...) is surfaced so a
+      // broken marker doesn't silently degrade to the default.
+      const code = error instanceof Error && 'code' in error ? error.code : undefined
+      if (code !== 'ENOENT') {
+        const reason = error instanceof Error ? error.message : String(error)
+        logger.warn(`[DockerService] Could not read AMD gfx marker: ${reason}`)
+      }
+    }
+
+    logger.info('[DockerService] No AMD gfx marker; defaulting HSA override to 11.0.0 for backward compatibility')
+    return '11.0.0'
+  }
+
+  /**
+   * Map a gfx target name (e.g. 'gfx1103') to the HSA_OVERRIDE_GFX_VERSION it needs.
+   *
+   * @param gfx lower-case gfx target as written to the marker file.
+   * @returns the override value, or null when the target needs no override.
+   */
+  private _mapGfxToHsaOverride(gfx: string): string | null {
+    // Officially supported by ROCm — no override needed
+    if (gfx === 'gfx1030' || gfx === 'gfx1100' || gfx === 'gfx1101' || gfx === 'gfx1102') {
+      return null
+    }
+    // RDNA 2 variants + iGPUs (gfx1031..gfx1036, e.g. Rembrandt 680M)
+    if (/^gfx103[1-6]$/.test(gfx)) {
+      return '10.3.0'
+    }
+    // RDNA 3 / 3.5 mobile parts (Phoenix 780M = gfx1103, Strix 890M = gfx1150, Strix Halo = gfx1151)
+    // need 11.0.0; unrecognized values get the same value, matching the no-marker default.
+    return '11.0.0'
+  }
+
   /**
    * Build the Docker Devices array for AMD GPU passthrough.
    *
@@ -1132,12 +1211,14 @@ export class DockerService {
       // and whether HSA_OVERRIDE needs injection. For AMD, replace any prior HSA_OVERRIDE in
       // the inspect-captured env so updates from older containers pick up the current value.
       const baseEnv = inspectData.Config?.Env || []
-      const finalEnv = updatedAmdGpuConfigured
-        ? [
-            ...baseEnv.filter((e: string) => !e.startsWith('HSA_OVERRIDE_GFX_VERSION=')),
-            'HSA_OVERRIDE_GFX_VERSION=11.0.0',
-          ]
-        : baseEnv
+      let finalEnv = baseEnv
+      if (updatedAmdGpuConfigured) {
+        const hsaOverride = await this._resolveAmdHsaOverride()
+        finalEnv = baseEnv.filter((e: string) => !e.startsWith('HSA_OVERRIDE_GFX_VERSION='))
+        if (hsaOverride) {
+          finalEnv.push(`HSA_OVERRIDE_GFX_VERSION=${hsaOverride}`)
+        }
+      }
 
       const newContainerConfig: any = {
         Image: newImage,
diff --git a/admin/types/kv_store.ts b/admin/types/kv_store.ts
index a3632ab..7974e95 100644
--- a/admin/types/kv_store.ts
+++ b/admin/types/kv_store.ts
@@ -13,6 +13,7 @@ export const KV_STORE_SCHEMA = {
   'ai.remoteOllamaUrl': 'string',
   'ai.ollamaFlashAttention': 'boolean',
   'ai.amdGpuAcceleration': 'boolean',
+  'ai.amdHsaOverride': 'string',
 } as const
 
 type KVTagToType<T extends string> = T extends 'boolean' ? boolean : string
diff --git a/install/install_nomad.sh b/install/install_nomad.sh
index ef501a0..484cea1 100644
--- a/install/install_nomad.sh
+++ b/install/install_nomad.sh
@@ -520,10 +520,42 @@ verify_gpu_setup() {
   # Check for AMD GPU — restrict to display controller classes to avoid false positives
   # from AMD CPU host bridges, PCI bridges, and chipset devices.
   local has_amd_gpu='false'
+  local amd_gfx_version=''
   if command -v lspci &> /dev/null; then
     if lspci 2>/dev/null | grep -iE "VGA|3D controller|Display" | grep -iE "amd|radeon" &> /dev/null; then
       has_amd_gpu='true'
       echo -e "${GREEN}✓${RESET} AMD GPU detected — ROCm acceleration will be configured automatically when AI Assistant is installed.\\n"
+
+      # Map AMD codename → gfx version so the admin can pick the right HSA_OVERRIDE_GFX_VERSION.
+      # gfx1030/1100/1101/1102 are on AMD's official ROCm allowlist and need NO override —
+      # forcing one (e.g. 11.0.0) breaks GPU discovery on these. Other variants do need it.
+      # Discrete cards are matched before APUs so a dGPU + APU host is keyed off the dGPU,
+      # and 'Strix Halo' is matched before the broader 'Strix'.
+      local amd_devices
+      amd_devices=$(lspci -vmm 2>/dev/null | awk -F'\t' '/^Class:.*(VGA|3D|Display)/{c=1} c && /^Device:/{print $2; c=0}')
+      if echo "${amd_devices}" | grep -iq 'Navi 21'; then
+        amd_gfx_version='gfx1030'
+      elif echo "${amd_devices}" | grep -iq 'Navi 22'; then
+        amd_gfx_version='gfx1031'
+      elif echo "${amd_devices}" | grep -iq 'Navi 23'; then
+        amd_gfx_version='gfx1032'
+      elif echo "${amd_devices}" | grep -iq 'Navi 24'; then
+        amd_gfx_version='gfx1034'
+      elif echo "${amd_devices}" | grep -iq 'Navi 31'; then
+        amd_gfx_version='gfx1100'
+      elif echo "${amd_devices}" | grep -iq 'Navi 32'; then
+        amd_gfx_version='gfx1101'
+      elif echo "${amd_devices}" | grep -iq 'Navi 33'; then
+        amd_gfx_version='gfx1102'
+      elif echo "${amd_devices}" | grep -iq 'Rembrandt'; then
+        amd_gfx_version='gfx1035'
+      elif echo "${amd_devices}" | grep -iq 'Phoenix'; then
+        amd_gfx_version='gfx1103'
+      elif echo "${amd_devices}" | grep -iq 'Strix Halo'; then
+        amd_gfx_version='gfx1151'
+      elif echo "${amd_devices}" | grep -iq 'Strix'; then
+        amd_gfx_version='gfx1150'
+      fi
     fi
   fi
 
@@ -539,6 +571,16 @@ verify_gpu_setup() {
     sudo rm -f "${gpu_marker_path}" 2>/dev/null || true
   fi
 
+  # Companion marker used by the admin to pick the right HSA_OVERRIDE_GFX_VERSION for
+  # the detected card. Absence of this file means "unknown gfx" — the admin falls back
+  # to its built-in default. Always rewrite (or remove) on install to keep state fresh.
+  local amd_gfx_marker_path="${NOMAD_DIR}/storage/.nomad-amd-gfx"
+  if [[ -n "${amd_gfx_version}" ]]; then
+    echo "${amd_gfx_version}" | sudo tee "${amd_gfx_marker_path}" > /dev/null 2>&1 || true
+  else
+    sudo rm -f "${amd_gfx_marker_path}" 2>/dev/null || true
+  fi
+
   echo -e "${YELLOW}===========================================${RESET}\\n"
 
   # Summary