drm/amdgpu: adjust enforce_isolation handling

Switch from a bool to an enum and allow more options
for enforce isolation.  There are now 3 modes of operation:
- Disabled (0)
- Enabled (serialization and cleaner shader) (1)
- Enabled in legacy mode (no serialization or cleaner shader) (2)
This provides better flexibility for more use cases.

Acked-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher 2025-04-08 10:39:06 -04:00
parent b86fd212f3
commit 2e0454b730
12 changed files with 93 additions and 30 deletions

View File

@ -230,7 +230,7 @@ extern int amdgpu_force_asic_type;
extern int amdgpu_smartshift_bias;
extern int amdgpu_use_xgmi_p2p;
extern int amdgpu_mtype_local;
extern bool enforce_isolation;
extern int amdgpu_enforce_isolation;
#ifdef CONFIG_HSA_AMD
extern int sched_policy;
extern bool debug_evictions;
@ -873,6 +873,13 @@ struct amdgpu_init_level {
struct amdgpu_reset_domain;
struct amdgpu_fru_info;
enum amdgpu_enforce_isolation_mode {
AMDGPU_ENFORCE_ISOLATION_DISABLE = 0,
AMDGPU_ENFORCE_ISOLATION_ENABLE = 1,
AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2,
};
/*
* Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
*/
@ -1225,7 +1232,7 @@ struct amdgpu_device {
/* Protection for the following isolation structure */
struct mutex enforce_isolation_mutex;
bool enforce_isolation[MAX_XCP];
enum amdgpu_enforce_isolation_mode enforce_isolation[MAX_XCP];
struct amdgpu_isolation {
void *owner;
struct dma_fence *spearhead;

View File

@ -296,7 +296,21 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
num_ibs[i], &p->jobs[i]);
if (ret)
goto free_all_kdata;
p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id];
switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
case AMDGPU_ENFORCE_ISOLATION_DISABLE:
default:
p->jobs[i]->enforce_isolation = false;
p->jobs[i]->run_cleaner_shader = false;
break;
case AMDGPU_ENFORCE_ISOLATION_ENABLE:
p->jobs[i]->enforce_isolation = true;
p->jobs[i]->run_cleaner_shader = true;
break;
case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
p->jobs[i]->enforce_isolation = true;
p->jobs[i]->run_cleaner_shader = false;
break;
}
}
p->gang_leader = p->jobs[p->gang_leader_idx];

View File

@ -2145,8 +2145,26 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
for (i = 0; i < MAX_XCP; i++)
adev->enforce_isolation[i] = !!enforce_isolation;
for (i = 0; i < MAX_XCP; i++) {
switch (amdgpu_enforce_isolation) {
case -1:
case 0:
default:
/* disable */
adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
break;
case 1:
/* enable */
adev->enforce_isolation[i] =
AMDGPU_ENFORCE_ISOLATION_ENABLE;
break;
case 2:
/* enable legacy mode */
adev->enforce_isolation[i] =
AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
break;
}
}
return 0;
}

View File

@ -179,7 +179,7 @@ uint amdgpu_pg_mask = 0xffffffff;
uint amdgpu_sdma_phase_quantum = 32;
char *amdgpu_disable_cu;
char *amdgpu_virtual_display;
bool enforce_isolation;
int amdgpu_enforce_isolation = -1;
int amdgpu_modeset = -1;
/* Specifies the default granularity for SVM, used in buffer
@ -1038,11 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444);
/**
* DOC: enforce_isolation (bool)
* enforce process isolation between graphics and compute via using the same reserved vmid.
* DOC: enforce_isolation (int)
* enforce process isolation between graphics and compute.
* (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)
*/
module_param(enforce_isolation, bool, 0444);
MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on");
module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444);
MODULE_PARM_DESC(enforce_isolation,
"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)");
/**
* DOC: modeset (int)

View File

@ -1468,6 +1468,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring)
goto err;
job->enforce_isolation = true;
/* always run the cleaner shader */
job->run_cleaner_shader = true;
ib = &job->ibs[0];
for (i = 0; i <= ring->funcs->align_mask; ++i)
@ -1599,7 +1601,7 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
* Provides the sysfs read interface to get the current settings of the 'enforce_isolation'
* feature for each GPU partition. Reading from the 'enforce_isolation'
* sysfs file returns the isolation settings for all partitions, where '0'
* indicates disabled and '1' indicates enabled.
* indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode.
*
* Return: The number of bytes read from the sysfs file.
*/
@ -1634,9 +1636,10 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
* @count: The size of the input data
*
* This function allows control over the 'enforce_isolation' feature, which
* serializes access to the graphics engine. Writing '1' or '0' to the
* 'enforce_isolation' sysfs file enables or disables process isolation for
* each partition. The input should specify the setting for all partitions.
* serializes access to the graphics engine. Writing '1', '2', or '0' to the
* 'enforce_isolation' sysfs file enables (full or legacy) or disables process
* isolation for each partition. The input should specify the setting for all
* partitions.
*
* Return: The number of bytes written to the sysfs file.
*/
@ -1673,13 +1676,29 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
return -EINVAL;
for (i = 0; i < num_partitions; i++) {
if (partition_values[i] != 0 && partition_values[i] != 1)
if (partition_values[i] != 0 &&
partition_values[i] != 1 &&
partition_values[i] != 2)
return -EINVAL;
}
mutex_lock(&adev->enforce_isolation_mutex);
for (i = 0; i < num_partitions; i++)
adev->enforce_isolation[i] = partition_values[i];
for (i = 0; i < num_partitions; i++) {
switch (partition_values[i]) {
case 0:
default:
adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
break;
case 1:
adev->enforce_isolation[i] =
AMDGPU_ENFORCE_ISOLATION_ENABLE;
break;
case 2:
adev->enforce_isolation[i] =
AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
break;
}
}
mutex_unlock(&adev->enforce_isolation_mutex);
amdgpu_mes_update_enforce_isolation(adev);
@ -2034,7 +2053,7 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
bool wait = false;
mutex_lock(&adev->enforce_isolation_mutex);
if (adev->enforce_isolation[idx]) {
if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
/* set the initial values if nothing is set */
if (!adev->gfx.enforce_isolation_jiffies[idx]) {
adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
@ -2101,7 +2120,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
mutex_lock(&adev->enforce_isolation_mutex);
if (adev->enforce_isolation[idx]) {
if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
if (adev->kfd.init_complete)
sched_work = true;
}
@ -2138,7 +2157,7 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
return;
mutex_lock(&adev->enforce_isolation_mutex);
if (adev->enforce_isolation[idx]) {
if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
if (adev->kfd.init_complete)
sched_work = true;
}

View File

@ -588,7 +588,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev)
}
/* alloc a default reserved vmid to enforce isolation */
for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
if (adev->enforce_isolation[i])
if (adev->enforce_isolation[i] != AMDGPU_ENFORCE_ISOLATION_DISABLE)
amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
}
}

View File

@ -78,6 +78,7 @@ struct amdgpu_job {
/* enforce isolation */
bool enforce_isolation;
bool run_cleaner_shader;
uint32_t num_ibs;
struct amdgpu_ib ibs[];

View File

@ -768,7 +768,7 @@ int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
mutex_lock(&adev->enforce_isolation_mutex);
for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
if (adev->enforce_isolation[i])
if (adev->enforce_isolation[i] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
else
r |= amdgpu_mes_set_enforce_isolation(adev, i, false);

View File

@ -787,7 +787,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
ring->funcs->emit_wreg;
cleaner_shader_needed = adev->gfx.enable_cleaner_shader &&
cleaner_shader_needed = job->run_cleaner_shader &&
adev->gfx.enable_cleaner_shader &&
ring->funcs->emit_cleaner_shader && job->base.s_fence &&
&job->base.s_fence->scheduled == isolation->spearhead;

View File

@ -724,7 +724,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
mes->event_log_gpu_addr;
}
if (adev->enforce_isolation[0])
if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
mes_set_hw_res_pkt.limit_single_process = 1;
return mes_v11_0_submit_pkt_and_poll_completion(mes,

View File

@ -762,7 +762,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
pipe * (AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE);
}
if (adev->enforce_isolation[0])
if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
mes_set_hw_res_pkt.limit_single_process = 1;
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,

View File

@ -43,7 +43,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
memset(buffer, 0, sizeof(struct pm4_mes_map_process));
packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
sizeof(struct pm4_mes_map_process));
if (adev->enforce_isolation[kfd->node_id])
if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
packet->bitfields2.exec_cleaner_shader = 1;
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 10;
@ -102,7 +102,8 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
sizeof(struct pm4_mes_map_process_aldebaran));
if (adev->enforce_isolation[knode->node_id])
if (adev->enforce_isolation[knode->node_id] ==
AMDGPU_ENFORCE_ISOLATION_ENABLE)
packet->bitfields2.exec_cleaner_shader = 1;
packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
packet->bitfields2.process_quantum = 10;
@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
* hws_max_conc_proc has been done in
* kgd2kfd_device_init().
*/
concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ?
1 : min(pm->dqm->processes_count,
kfd->max_proc_per_quantum);
concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] ==
AMDGPU_ENFORCE_ISOLATION_ENABLE) ?
1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum);
packet = (struct pm4_mes_runlist *)buffer;