mirror of
https://github.com/Crosstalk-Solutions/project-nomad.git
synced 2026-05-12 16:10:11 +02:00
fix(AI): vendor-aware AMD HSA override + benchmark discrete-GPU detection
Closes #810.

## Bug A: HSA_OVERRIDE_GFX_VERSION=11.0.0 was unconditional

PR #804 set HSA_OVERRIDE_GFX_VERSION=11.0.0 for any AMD GPU. The inline comment claimed this was harmless on supported discrete cards (gfx1030 RX 6800, etc.) — empirically false. With the override, Ollama crashes during GPU discovery on gfx1030 and silently falls back to CPU. This affects every NOMAD user with an RX 6800 or other RDNA 2 discrete card.

The correct value depends on the gfx version:

- gfx1030, gfx1100, gfx1101, gfx1102: officially supported by ROCm — no override
- gfx1031..gfx1036 (RDNA 2 variants + iGPUs like Rembrandt 680M): 10.3.0
- gfx1103, gfx1150, gfx1151 (Phoenix 780M, Strix 890M, Strix Halo): 11.0.0

### Resolution chain in `_resolveAmdHsaOverride()`

1. KV `ai.amdHsaOverride` — manual override; accepts 'none' to disable, or a semver-style value to force.
2. Marker file `/app/storage/.nomad-amd-gfx` — written by install_nomad.sh based on the lspci codename. Mapped to an override via `_mapGfxToHsaOverride()`.
3. Default: `11.0.0` — preserves prior behavior so existing iGPU users (780M / 890M, the dominant AMD population today) don't regress on upgrade.

Discrete RDNA 2 users on existing installs can opt out via `ai.amdHsaOverride='none'` and force-reinstall AI Assistant, OR re-run install_nomad.sh to refresh the marker file.

The helper is used in both the `createContainer` (initial install) and `updateContainer` (image update) paths, replacing the unconditional push.

## Bug B: BenchmarkService had no AMD discrete detection path

`BenchmarkService.getHardwareInfo()` had three GPU detection fallbacks:

1. `si.graphics()` — empty inside Docker for AMD
2. nvidia-smi — NVIDIA only
3. AMD APU regex from CPU model — integrated only

Result: AMD discrete cards (RX 6800, RX 7900 XTX, etc.) showed up as "GPU: Not detected" on the leaderboard despite ROCm working. This corrupts leaderboard data quality for that population.
Fix: after the existing fallbacks, call `SystemService.getSystemInfo()` and read `graphics.controllers[0].model`. That path already handles AMD via the marker file + Ollama log probe added in PR #804, so we're reusing existing plumbing rather than duplicating detection logic.

## install_nomad.sh changes

The existing AMD detection block already runs lspci. Added a codename parse step that maps Navi 21/22/23/24, Rembrandt, Phoenix1/Phoenix2, Strix/Strix Point/Strix Halo, and Navi 31/32/33 to gfx versions, then writes `/opt/project-nomad/storage/.nomad-amd-gfx`. Unknown codenames write nothing (the admin handles the missing-marker case via the backward-compat default).

## Validation

Both bugs were originally surfaced and validated empirically on RX 6800 / gfx1030 / Ubuntu 24.04 + kernel 6.17 + ollama/ollama:rocm during the #810 filing. Validation grid from that report:

| Run                                         | NOMAD Score | tok/s | GPU detected         |
|---------------------------------------------|-------------|-------|----------------------|
| Pre-fix (Bug A active)                      | n/a         | 0     | yes, but library=cpu |
| HSA_OVERRIDE removed, Bug B unfixed         | 73.8        | 221.6 | "Not detected"       |
| Both fixes hot-patched (this PR's behavior) | 73.7        | 216.0 | AMD Radeon RX 6800   |

Local checks: `npm run typecheck` clean, `npm run build` clean.
This commit is contained in:
parent
63282565a9
commit
0b25638a3e
|
|
@ -317,6 +317,23 @@ export class BenchmarkService {
|
|||
}
|
||||
}
|
||||
|
||||
// Fallback: AMD discrete cards. si.graphics() returns empty inside Docker for AMD,
|
||||
// the nvidia-smi path doesn't apply, and the APU regex only catches integrated parts.
|
||||
// SystemService.getSystemInfo() already handles AMD via the marker file + Ollama log
|
||||
// probe added in PR #804, so reuse that plumbing rather than duplicating it here.
|
||||
if (!gpuModel) {
|
||||
try {
|
||||
const systemService = new (await import('./system_service.js')).SystemService(this.dockerService)
|
||||
const sysInfo = await systemService.getSystemInfo()
|
||||
const sysGpuModel = sysInfo?.graphics?.controllers?.[0]?.model
|
||||
if (sysGpuModel) {
|
||||
gpuModel = sysGpuModel
|
||||
}
|
||||
} catch (sysError: any) {
|
||||
logger.warn(`[BenchmarkService] system_service AMD fallback failed: ${sysError.message}`)
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
cpu_model: `${cpu.manufacturer} ${cpu.brand}`,
|
||||
cpu_cores: cpu.physicalCores,
|
||||
|
|
|
|||
|
|
@ -592,10 +592,12 @@ export class DockerService {
|
|||
ollamaEnv.push('OLLAMA_FLASH_ATTENTION=1')
|
||||
}
|
||||
if (amdGpuConfigured) {
|
||||
// RDNA3 iGPUs (gfx1103: 780M, 880M, 890M, ...) aren't on AMD's official ROCm
|
||||
// allowlist but work when forced to identify as gfx1100 via HSA_OVERRIDE_GFX_VERSION.
|
||||
// Harmless on supported discrete cards (gfx1030 RX 6800, etc.) — they ignore the override.
|
||||
ollamaEnv.push('HSA_OVERRIDE_GFX_VERSION=11.0.0')
|
||||
// gfx-aware HSA override — only set for cards that actually need it. See
|
||||
// _resolveAmdHsaOverride() for the resolution order and gfx → version mapping.
|
||||
const hsaOverride = await this._resolveAmdHsaOverride()
|
||||
if (hsaOverride) {
|
||||
ollamaEnv.push(`HSA_OVERRIDE_GFX_VERSION=${hsaOverride}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -999,6 +1001,67 @@ export class DockerService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Resolve the HSA_OVERRIDE_GFX_VERSION value for the host's AMD GPU.
 *
 * gfx1030 (e.g. RX 6800) and gfx1100/1101/1102 (RX 7900/7800/7600) are on AMD's
 * official ROCm allowlist — forcing an override on these breaks GPU discovery.
 * gfx1031–gfx1036 (RDNA 2 variants and iGPUs like the 680M) need 10.3.0 to coerce to gfx1030.
 * gfx1103 / gfx1150 / gfx1151 (RDNA 3/3.5 iGPUs like 780M / 890M / Strix Halo) need 11.0.0.
 *
 * Resolution order:
 *   1. KV `ai.amdHsaOverride` — manual user override; accepts 'none' / 'off' / 'false'
 *      (disable) or a semver-style value such as '10.3.0'. Any other non-empty value is
 *      logged as invalid and ignored.
 *   2. Marker file `/app/storage/.nomad-amd-gfx` written by install_nomad.sh, mapped
 *      through `_mapGfxToHsaOverride()`. An empty marker is treated as absent.
 *   3. Default: '11.0.0' — preserves prior behavior so existing iGPU users don't regress on
 *      upgrade. Discrete-card users on existing installs can opt out via the KV.
 *
 * @returns The override value to inject, or null when no override should be applied.
 */
private async _resolveAmdHsaOverride(): Promise<string | null> {
  // 1. Manual override from the KV store. `== null` covers both null and undefined.
  const manualRaw = await KVStore.getValue('ai.amdHsaOverride')
  const manual = manualRaw == null ? '' : String(manualRaw).trim().toLowerCase()
  if (manual !== '') {
    if (manual === 'none' || manual === 'off' || manual === 'false') {
      logger.info('[DockerService] HSA override disabled via ai.amdHsaOverride')
      return null
    }
    if (/^\d+\.\d+\.\d+$/.test(manual)) {
      logger.info(`[DockerService] HSA override forced to ${manual} via ai.amdHsaOverride`)
      return manual
    }
    // Invalid values are not fatal: warn and continue with automatic resolution.
    logger.warn(`[DockerService] Ignoring invalid ai.amdHsaOverride value: ${manualRaw}`)
  }

  // 2. Marker file written by the installer.
  let gfx = ''
  try {
    gfx = (await readFile('/app/storage/.nomad-amd-gfx', 'utf8')).trim()
  } catch (err: unknown) {
    // ENOENT is the expected case (existing install upgraded without re-running
    // install_nomad.sh). Anything else (permissions, I/O) is worth surfacing, but we
    // still fall through to the default rather than failing the container operation.
    const code =
      typeof err === 'object' && err !== null && 'code' in err
        ? (err as { code?: unknown }).code
        : undefined
    if (code !== 'ENOENT') {
      const detail = err instanceof Error ? err.message : String(err)
      logger.warn(`[DockerService] Could not read AMD gfx marker: ${detail}`)
    }
  }

  if (gfx !== '') {
    const mapped = this._mapGfxToHsaOverride(gfx)
    logger.info(`[DockerService] AMD gfx marker '${gfx}' → HSA override ${mapped ?? 'none'}`)
    return mapped
  }

  // 3. Backward-compatible default.
  logger.info('[DockerService] No AMD gfx marker; defaulting HSA override to 11.0.0 for backward compatibility')
  return '11.0.0'
}
|
||||
|
||||
/**
 * Map a gfx target name (e.g. 'gfx1030') to the HSA_OVERRIDE_GFX_VERSION value it needs.
 *
 * Matching ignores surrounding whitespace and letter case so a hand-edited marker such
 * as 'GFX1030' is not misread as an unknown target and given an override.
 *
 * @param gfx gfx target name, typically the contents of the `.nomad-amd-gfx` marker file.
 * @returns null when no override should be set, otherwise the override version string.
 */
private _mapGfxToHsaOverride(gfx: string): string | null {
  const target = gfx.trim().toLowerCase()

  // Officially supported by ROCm — no override needed
  if (target === 'gfx1030' || target === 'gfx1100' || target === 'gfx1101' || target === 'gfx1102') {
    return null
  }

  // RDNA 2 variants + iGPUs (gfx1031..gfx1036, e.g. Rembrandt 680M)
  if (/^gfx103[1-6]$/.test(target)) {
    return '10.3.0'
  }

  // RDNA 3 / 3.5 mobile parts (Phoenix 780M = gfx1103, Strix 890M = gfx1150,
  // Strix Halo = gfx1151) need 11.0.0. Unrecognized targets get the same value,
  // matching the pre-existing unconditional default.
  return '11.0.0'
}
|
||||
|
||||
/**
|
||||
* Build the Docker Devices array for AMD GPU passthrough.
|
||||
*
|
||||
|
|
@ -1132,12 +1195,14 @@ export class DockerService {
|
|||
// and whether HSA_OVERRIDE needs injection. For AMD, replace any prior HSA_OVERRIDE in
|
||||
// the inspect-captured env so updates from older containers pick up the current value.
|
||||
const baseEnv = inspectData.Config?.Env || []
|
||||
const finalEnv = updatedAmdGpuConfigured
|
||||
? [
|
||||
...baseEnv.filter((e: string) => !e.startsWith('HSA_OVERRIDE_GFX_VERSION=')),
|
||||
'HSA_OVERRIDE_GFX_VERSION=11.0.0',
|
||||
]
|
||||
: baseEnv
|
||||
let finalEnv = baseEnv
|
||||
if (updatedAmdGpuConfigured) {
|
||||
const hsaOverride = await this._resolveAmdHsaOverride()
|
||||
finalEnv = baseEnv.filter((e: string) => !e.startsWith('HSA_OVERRIDE_GFX_VERSION='))
|
||||
if (hsaOverride) {
|
||||
finalEnv.push(`HSA_OVERRIDE_GFX_VERSION=${hsaOverride}`)
|
||||
}
|
||||
}
|
||||
|
||||
const newContainerConfig: any = {
|
||||
Image: newImage,
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ export const KV_STORE_SCHEMA = {
|
|||
'ai.remoteOllamaUrl': 'string',
|
||||
'ai.ollamaFlashAttention': 'boolean',
|
||||
'ai.amdGpuAcceleration': 'boolean',
|
||||
'ai.amdHsaOverride': 'string',
|
||||
} as const
|
||||
|
||||
type KVTagToType<T extends string> = T extends 'boolean' ? boolean : string
|
||||
|
|
|
|||
|
|
@ -520,10 +520,40 @@ verify_gpu_setup() {
|
|||
# Check for AMD GPU — restrict to display controller classes to avoid false positives
|
||||
# from AMD CPU host bridges, PCI bridges, and chipset devices.
|
||||
local has_amd_gpu='false'
|
||||
local amd_gfx_version=''
|
||||
if command -v lspci &> /dev/null; then
|
||||
if lspci 2>/dev/null | grep -iE "VGA|3D controller|Display" | grep -iE "amd|radeon" &> /dev/null; then
|
||||
has_amd_gpu='true'
|
||||
echo -e "${GREEN}✓${RESET} AMD GPU detected — ROCm acceleration will be configured automatically when AI Assistant is installed.\\n"
|
||||
|
||||
# Map AMD codename → gfx version so the admin can pick the right HSA_OVERRIDE_GFX_VERSION.
|
||||
# gfx1030/1100/1101/1102 are on AMD's official ROCm allowlist and need NO override —
|
||||
# forcing one (e.g. 11.0.0) breaks GPU discovery on these. Other variants do need it.
|
||||
local amd_devices
|
||||
amd_devices=$(lspci -vmm 2>/dev/null | awk -F'\t' '/^Class:.*(VGA|3D|Display)/{c=1} c && /^Device:/{print $2; c=0}')
|
||||
if echo "${amd_devices}" | grep -iq 'Navi 21'; then
|
||||
amd_gfx_version='gfx1030'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 22'; then
|
||||
amd_gfx_version='gfx1031'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 23'; then
|
||||
amd_gfx_version='gfx1032'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 24'; then
|
||||
amd_gfx_version='gfx1034'
|
||||
elif echo "${amd_devices}" | grep -iq 'Rembrandt'; then
|
||||
amd_gfx_version='gfx1035'
|
||||
elif echo "${amd_devices}" | grep -iEq 'Phoenix1?|Phoenix2'; then
|
||||
amd_gfx_version='gfx1103'
|
||||
elif echo "${amd_devices}" | grep -iEq 'Strix Halo'; then
|
||||
amd_gfx_version='gfx1151'
|
||||
elif echo "${amd_devices}" | grep -iEq 'Strix( Point)?'; then
|
||||
amd_gfx_version='gfx1150'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 31'; then
|
||||
amd_gfx_version='gfx1100'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 32'; then
|
||||
amd_gfx_version='gfx1101'
|
||||
elif echo "${amd_devices}" | grep -iq 'Navi 33'; then
|
||||
amd_gfx_version='gfx1102'
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
@ -539,6 +569,16 @@ verify_gpu_setup() {
|
|||
sudo rm -f "${gpu_marker_path}" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Companion marker used by the admin to pick the right HSA_OVERRIDE_GFX_VERSION for
|
||||
# the detected card. Absence of this file means "unknown gfx" — the admin falls back
|
||||
# to its built-in default. Always rewrite (or remove) on install to keep state fresh.
|
||||
local amd_gfx_marker_path="${NOMAD_DIR}/storage/.nomad-amd-gfx"
|
||||
if [[ -n "${amd_gfx_version}" ]]; then
|
||||
echo "${amd_gfx_version}" | sudo tee "${amd_gfx_marker_path}" > /dev/null 2>&1 || true
|
||||
else
|
||||
sudo rm -f "${amd_gfx_marker_path}" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo -e "${YELLOW}===========================================${RESET}\\n"
|
||||
|
||||
# Summary
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user