mirror of
https://github.com/torvalds/linux.git
synced 2026-05-25 15:41:52 +02:00
drm/amdkfd: Fix checkpoint-restore on multi-xcc
GPUs with multi-xcc have multiple MQDs per queue. This patch saves and
restores all the MQDs within the partition.

Signed-off-by: David Yat Sin <David.YatSin@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
8f249ba6ec
commit
a578f2a58c
|
|
@ -2725,7 +2725,7 @@ static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
|
|||
|
||||
dqm_lock(dqm);
|
||||
mqd_mgr = dqm->mqd_mgrs[mqd_type];
|
||||
*mqd_size = mqd_mgr->mqd_size;
|
||||
*mqd_size = mqd_mgr->mqd_size * NUM_XCC(mqd_mgr->dev->xcc_mask);
|
||||
*ctl_stack_size = 0;
|
||||
|
||||
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE && mqd_mgr->get_checkpoint_info)
|
||||
|
|
|
|||
|
|
@ -373,7 +373,7 @@ static void get_checkpoint_info(struct mqd_manager *mm, void *mqd, u32 *ctl_stac
|
|||
{
|
||||
struct v9_mqd *m = get_mqd(mqd);
|
||||
|
||||
*ctl_stack_size = m->cp_hqd_cntl_stack_size;
|
||||
*ctl_stack_size = m->cp_hqd_cntl_stack_size * NUM_XCC(mm->dev->xcc_mask);
|
||||
}
|
||||
|
||||
static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, void *ctl_stack_dst)
|
||||
|
|
@ -388,6 +388,24 @@ static void checkpoint_mqd(struct mqd_manager *mm, void *mqd, void *mqd_dst, voi
|
|||
memcpy(ctl_stack_dst, ctl_stack, m->cp_hqd_cntl_stack_size);
|
||||
}
|
||||
|
||||
static void checkpoint_mqd_v9_4_3(struct mqd_manager *mm,
|
||||
void *mqd,
|
||||
void *mqd_dst,
|
||||
void *ctl_stack_dst)
|
||||
{
|
||||
struct v9_mqd *m;
|
||||
int xcc;
|
||||
uint64_t size = get_mqd(mqd)->cp_mqd_stride_size;
|
||||
|
||||
for (xcc = 0; xcc < NUM_XCC(mm->dev->xcc_mask); xcc++) {
|
||||
m = get_mqd(mqd + size * xcc);
|
||||
|
||||
checkpoint_mqd(mm, m,
|
||||
(uint8_t *)mqd_dst + sizeof(*m) * xcc,
|
||||
(uint8_t *)ctl_stack_dst + m->cp_hqd_cntl_stack_size * xcc);
|
||||
}
|
||||
}
|
||||
|
||||
static void restore_mqd(struct mqd_manager *mm, void **mqd,
|
||||
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
|
||||
struct queue_properties *qp,
|
||||
|
|
@ -764,13 +782,35 @@ static void restore_mqd_v9_4_3(struct mqd_manager *mm, void **mqd,
|
|||
const void *mqd_src,
|
||||
const void *ctl_stack_src, u32 ctl_stack_size)
|
||||
{
|
||||
restore_mqd(mm, mqd, mqd_mem_obj, gart_addr, qp, mqd_src, ctl_stack_src, ctl_stack_size);
|
||||
if (amdgpu_sriov_multi_vf_mode(mm->dev->adev)) {
|
||||
struct v9_mqd *m;
|
||||
struct kfd_mem_obj xcc_mqd_mem_obj;
|
||||
u32 mqd_ctl_stack_size;
|
||||
struct v9_mqd *m;
|
||||
u32 num_xcc;
|
||||
int xcc;
|
||||
|
||||
m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
|
||||
m->cp_hqd_pq_doorbell_control |= 1 <<
|
||||
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_MODE__SHIFT;
|
||||
uint64_t offset = mm->mqd_stride(mm, qp);
|
||||
|
||||
mm->dev->dqm->current_logical_xcc_start++;
|
||||
|
||||
num_xcc = NUM_XCC(mm->dev->xcc_mask);
|
||||
mqd_ctl_stack_size = ctl_stack_size / num_xcc;
|
||||
|
||||
memset(&xcc_mqd_mem_obj, 0x0, sizeof(struct kfd_mem_obj));
|
||||
|
||||
/* Set the MQD pointer and gart address to XCC0 MQD */
|
||||
*mqd = mqd_mem_obj->cpu_ptr;
|
||||
if (gart_addr)
|
||||
*gart_addr = mqd_mem_obj->gpu_addr;
|
||||
|
||||
for (xcc = 0; xcc < num_xcc; xcc++) {
|
||||
get_xcc_mqd(mqd_mem_obj, &xcc_mqd_mem_obj, offset * xcc);
|
||||
restore_mqd(mm, (void **)&m,
|
||||
&xcc_mqd_mem_obj,
|
||||
NULL,
|
||||
qp,
|
||||
(uint8_t *)mqd_src + xcc * sizeof(*m),
|
||||
(uint8_t *)ctl_stack_src + xcc * mqd_ctl_stack_size,
|
||||
mqd_ctl_stack_size);
|
||||
}
|
||||
}
|
||||
static int destroy_mqd_v9_4_3(struct mqd_manager *mm, void *mqd,
|
||||
|
|
@ -906,7 +946,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
|
|||
mqd->free_mqd = kfd_free_mqd_cp;
|
||||
mqd->is_occupied = kfd_is_occupied_cp;
|
||||
mqd->get_checkpoint_info = get_checkpoint_info;
|
||||
mqd->checkpoint_mqd = checkpoint_mqd;
|
||||
mqd->mqd_size = sizeof(struct v9_mqd);
|
||||
mqd->mqd_stride = mqd_stride_v9;
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
|
@ -918,16 +957,18 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
|
|||
mqd->init_mqd = init_mqd_v9_4_3;
|
||||
mqd->load_mqd = load_mqd_v9_4_3;
|
||||
mqd->update_mqd = update_mqd_v9_4_3;
|
||||
mqd->restore_mqd = restore_mqd_v9_4_3;
|
||||
mqd->destroy_mqd = destroy_mqd_v9_4_3;
|
||||
mqd->get_wave_state = get_wave_state_v9_4_3;
|
||||
mqd->checkpoint_mqd = checkpoint_mqd_v9_4_3;
|
||||
mqd->restore_mqd = restore_mqd_v9_4_3;
|
||||
} else {
|
||||
mqd->init_mqd = init_mqd;
|
||||
mqd->load_mqd = load_mqd;
|
||||
mqd->update_mqd = update_mqd;
|
||||
mqd->restore_mqd = restore_mqd;
|
||||
mqd->destroy_mqd = kfd_destroy_mqd_cp;
|
||||
mqd->get_wave_state = get_wave_state;
|
||||
mqd->checkpoint_mqd = checkpoint_mqd;
|
||||
mqd->restore_mqd = restore_mqd;
|
||||
}
|
||||
break;
|
||||
case KFD_MQD_TYPE_HIQ:
|
||||
|
|
|
|||
|
|
@ -914,7 +914,10 @@ static int criu_checkpoint_queues_device(struct kfd_process_device *pdd,
|
|||
|
||||
q_data = (struct kfd_criu_queue_priv_data *)q_private_data;
|
||||
|
||||
/* data stored in this order: priv_data, mqd, ctl_stack */
|
||||
/*
|
||||
* data stored in this order:
|
||||
* priv_data, mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
|
||||
*/
|
||||
q_data->mqd_size = mqd_size;
|
||||
q_data->ctl_stack_size = ctl_stack_size;
|
||||
|
||||
|
|
@ -963,7 +966,7 @@ int kfd_criu_checkpoint_queues(struct kfd_process *p,
|
|||
}
|
||||
|
||||
static void set_queue_properties_from_criu(struct queue_properties *qp,
|
||||
struct kfd_criu_queue_priv_data *q_data)
|
||||
struct kfd_criu_queue_priv_data *q_data, uint32_t num_xcc)
|
||||
{
|
||||
qp->is_interop = false;
|
||||
qp->queue_percent = q_data->q_percent;
|
||||
|
|
@ -976,7 +979,11 @@ static void set_queue_properties_from_criu(struct queue_properties *qp,
|
|||
qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
|
||||
qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
|
||||
qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
|
||||
qp->ctl_stack_size = q_data->ctl_stack_size;
|
||||
if (q_data->type == KFD_QUEUE_TYPE_COMPUTE)
|
||||
qp->ctl_stack_size = q_data->ctl_stack_size / num_xcc;
|
||||
else
|
||||
qp->ctl_stack_size = q_data->ctl_stack_size;
|
||||
|
||||
qp->type = q_data->type;
|
||||
qp->format = q_data->format;
|
||||
}
|
||||
|
|
@ -1036,12 +1043,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
|
|||
goto exit;
|
||||
}
|
||||
|
||||
/* data stored in this order: mqd, ctl_stack */
|
||||
/*
|
||||
* data stored in this order:
|
||||
* mqd[xcc0], mqd[xcc1],..., ctl_stack[xcc0], ctl_stack[xcc1]...
|
||||
*/
|
||||
mqd = q_extra_data;
|
||||
ctl_stack = mqd + q_data->mqd_size;
|
||||
|
||||
memset(&qp, 0, sizeof(qp));
|
||||
set_queue_properties_from_criu(&qp, q_data);
|
||||
set_queue_properties_from_criu(&qp, q_data, NUM_XCC(pdd->dev->adev->gfx.xcc_mask));
|
||||
|
||||
print_queue_properties(&qp);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user