mirror of
https://github.com/torvalds/linux.git
synced 2026-06-01 02:53:36 +02:00
drm/amdkfd: CRIU add queues support
Add support to the existing CRIU ioctls to save the number of queues and the queue properties for each queue during checkpoint, and to re-create the queues on restore. Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: David Yat Sin <david.yatsin@amd.com> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
cd9f791030
commit
626f7b3190
|
|
@ -2015,19 +2015,36 @@ static int criu_checkpoint_bos(struct kfd_process *p,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void criu_get_process_object_info(struct kfd_process *p,
|
||||
uint32_t *num_bos,
|
||||
uint64_t *objs_priv_size)
|
||||
static int criu_get_process_object_info(struct kfd_process *p,
|
||||
uint32_t *num_bos,
|
||||
uint32_t *num_objects,
|
||||
uint64_t *objs_priv_size)
|
||||
{
|
||||
int ret;
|
||||
uint64_t priv_size;
|
||||
uint32_t num_queues, num_events, num_svm_ranges;
|
||||
uint64_t queues_priv_data_size;
|
||||
|
||||
*num_bos = get_process_num_bos(p);
|
||||
|
||||
ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
num_events = 0; /* TODO: Implement Events */
|
||||
num_svm_ranges = 0; /* TODO: Implement SVM-Ranges */
|
||||
|
||||
*num_objects = num_queues + num_events + num_svm_ranges;
|
||||
|
||||
if (objs_priv_size) {
|
||||
priv_size = sizeof(struct kfd_criu_process_priv_data);
|
||||
priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
|
||||
priv_size += queues_priv_data_size;
|
||||
/* TODO: Add Events priv size */
|
||||
/* TODO: Add SVM ranges priv size */
|
||||
*objs_priv_size = priv_size;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int criu_checkpoint(struct file *filep,
|
||||
|
|
@ -2035,7 +2052,7 @@ static int criu_checkpoint(struct file *filep,
|
|||
struct kfd_ioctl_criu_args *args)
|
||||
{
|
||||
int ret;
|
||||
uint32_t num_bos;
|
||||
uint32_t num_bos, num_objects;
|
||||
uint64_t priv_size, priv_offset = 0;
|
||||
|
||||
if (!args->bos || !args->priv_data)
|
||||
|
|
@ -2057,9 +2074,12 @@ static int criu_checkpoint(struct file *filep,
|
|||
goto exit_unlock;
|
||||
}
|
||||
|
||||
criu_get_process_object_info(p, &num_bos, &priv_size);
|
||||
ret = criu_get_process_object_info(p, &num_bos, &num_objects, &priv_size);
|
||||
if (ret)
|
||||
goto exit_unlock;
|
||||
|
||||
if (num_bos != args->num_bos ||
|
||||
num_objects != args->num_objects ||
|
||||
priv_size != args->priv_data_size) {
|
||||
|
||||
ret = -EINVAL;
|
||||
|
|
@ -2076,6 +2096,17 @@ static int criu_checkpoint(struct file *filep,
|
|||
if (ret)
|
||||
goto exit_unlock;
|
||||
|
||||
if (num_objects) {
|
||||
ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
|
||||
&priv_offset);
|
||||
if (ret)
|
||||
goto exit_unlock;
|
||||
|
||||
/* TODO: Dump Events */
|
||||
|
||||
/* TODO: Dump SVM-Ranges */
|
||||
}
|
||||
|
||||
exit_unlock:
|
||||
mutex_unlock(&p->mutex);
|
||||
if (ret)
|
||||
|
|
@ -2344,6 +2375,62 @@ static int criu_restore_bos(struct kfd_process *p,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int criu_restore_objects(struct file *filep,
|
||||
struct kfd_process *p,
|
||||
struct kfd_ioctl_criu_args *args,
|
||||
uint64_t *priv_offset,
|
||||
uint64_t max_priv_data_size)
|
||||
{
|
||||
int ret = 0;
|
||||
uint32_t i;
|
||||
|
||||
BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
|
||||
BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
|
||||
BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
|
||||
|
||||
for (i = 0; i < args->num_objects; i++) {
|
||||
uint32_t object_type;
|
||||
|
||||
if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
|
||||
pr_err("Invalid private data size\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
|
||||
if (ret) {
|
||||
pr_err("Failed to copy private information from user\n");
|
||||
goto exit;
|
||||
}
|
||||
|
||||
switch (object_type) {
|
||||
case KFD_CRIU_OBJECT_TYPE_QUEUE:
|
||||
ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
|
||||
priv_offset, max_priv_data_size);
|
||||
if (ret)
|
||||
goto exit;
|
||||
break;
|
||||
case KFD_CRIU_OBJECT_TYPE_EVENT:
|
||||
/* TODO: Implement Events */
|
||||
*priv_offset += sizeof(struct kfd_criu_event_priv_data);
|
||||
if (ret)
|
||||
goto exit;
|
||||
break;
|
||||
case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
|
||||
/* TODO: Implement SVM range */
|
||||
*priv_offset += sizeof(struct kfd_criu_svm_range_priv_data);
|
||||
if (ret)
|
||||
goto exit;
|
||||
break;
|
||||
default:
|
||||
pr_err("Invalid object type:%u at index:%d\n", object_type, i);
|
||||
ret = -EINVAL;
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
exit:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int criu_restore(struct file *filep,
|
||||
struct kfd_process *p,
|
||||
struct kfd_ioctl_criu_args *args)
|
||||
|
|
@ -2377,6 +2464,10 @@ static int criu_restore(struct file *filep,
|
|||
if (ret)
|
||||
goto exit_unlock;
|
||||
|
||||
ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
|
||||
if (ret)
|
||||
goto exit_unlock;
|
||||
|
||||
if (priv_offset != args->priv_data_size) {
|
||||
pr_err("Invalid private data size\n");
|
||||
ret = -EINVAL;
|
||||
|
|
@ -2474,9 +2565,14 @@ static int criu_process_info(struct file *filep,
|
|||
args->pid = task_pid_nr_ns(p->lead_thread,
|
||||
task_active_pid_ns(p->lead_thread));
|
||||
|
||||
criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size);
|
||||
ret = criu_get_process_object_info(p, &args->num_bos, &args->num_objects,
|
||||
&args->priv_data_size);
|
||||
if (ret)
|
||||
goto err_unlock;
|
||||
|
||||
dev_dbg(kfd_device, "Num of bos:%u objects:%u priv_data_size:%lld\n",
|
||||
args->num_bos, args->num_objects, args->priv_data_size);
|
||||
|
||||
dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos);
|
||||
err_unlock:
|
||||
if (ret) {
|
||||
kfd_process_restore_queues(p);
|
||||
|
|
|
|||
|
|
@ -1057,6 +1057,16 @@ struct kfd_criu_bo_priv_data {
|
|||
uint32_t mapped_gpuids[MAX_GPU_INSTANCE];
|
||||
};
|
||||
|
||||
/*
 * Discriminator stored in the first 4 bytes (the object_type member) of
 * kfd_criu_queue_priv_data, kfd_criu_event_priv_data and
 * kfd_criu_svm_range_priv_data.
 */
enum kfd_criu_object_type {
	KFD_CRIU_OBJECT_TYPE_QUEUE	= 0,
	KFD_CRIU_OBJECT_TYPE_EVENT	= 1,
	KFD_CRIU_OBJECT_TYPE_SVM_RANGE	= 2,
};
|
||||
|
||||
struct kfd_criu_svm_range_priv_data {
|
||||
uint32_t object_type;
|
||||
uint32_t reserved;
|
||||
|
|
@ -1064,7 +1074,26 @@ struct kfd_criu_svm_range_priv_data {
|
|||
|
||||
/*
 * Private checkpoint record describing one user queue.
 *
 * One record per queue is copied to user space on checkpoint and read back
 * on restore. object_type must remain the first member: the restore path
 * reads the leading 32 bits of a record to identify its type.
 */
struct kfd_criu_queue_priv_data {
	uint32_t object_type;			/* enum kfd_criu_object_type */
	uint32_t reserved;

	/* 64-bit members */
	uint64_t q_address;			/* properties.queue_address */
	uint64_t q_size;			/* properties.queue_size */
	uint64_t read_ptr_addr;			/* properties.read_ptr stored as an integer */
	uint64_t write_ptr_addr;		/* properties.write_ptr stored as an integer */
	uint64_t doorbell_off;
	uint64_t eop_ring_buffer_address;	/* properties.eop_ring_buffer_address */
	uint64_t ctx_save_restore_area_address;	/* properties.ctx_save_restore_area_address */

	/* 32-bit members */
	uint32_t gpu_id;			/* id of the device that owns the queue */
	uint32_t type;				/* properties.type */
	uint32_t format;			/* properties.format */
	uint32_t q_id;				/* properties.queue_id */
	uint32_t priority;			/* properties.priority */
	uint32_t q_percent;			/* properties.queue_percent */
	uint32_t doorbell_id;			/* queue doorbell_id */
	uint32_t is_gws;			/* copied into properties.is_gws on restore */
	uint32_t sdma_id;			/* queue sdma_id */
	uint32_t eop_ring_buffer_size;		/* properties.eop_ring_buffer_size */
	uint32_t ctx_save_restore_area_size;	/* properties.ctx_save_restore_area_size */
	uint32_t ctl_stack_size;		/* copied into properties.ctl_stack_size on restore */
	uint32_t mqd_size;
};
|
||||
|
||||
struct kfd_criu_event_priv_data {
|
||||
|
|
@ -1072,6 +1101,18 @@ struct kfd_criu_event_priv_data {
|
|||
uint32_t reserved;
|
||||
};
|
||||
|
||||
int kfd_process_get_queue_info(struct kfd_process *p,
|
||||
uint32_t *num_queues,
|
||||
uint64_t *priv_data_sizes);
|
||||
|
||||
int kfd_criu_checkpoint_queues(struct kfd_process *p,
|
||||
uint8_t __user *user_priv_data,
|
||||
uint64_t *priv_data_offset);
|
||||
|
||||
int kfd_criu_restore_queue(struct kfd_process *p,
|
||||
uint8_t __user *user_priv_data,
|
||||
uint64_t *priv_data_offset,
|
||||
uint64_t max_priv_data_size);
|
||||
/* CRIU - End */
|
||||
|
||||
/* Queue Context Management */
|
||||
|
|
|
|||
|
|
@ -497,6 +497,214 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
|
|||
save_area_used_size);
|
||||
}
|
||||
|
||||
int kfd_process_get_queue_info(struct kfd_process *p,
|
||||
uint32_t *num_queues,
|
||||
uint64_t *priv_data_sizes)
|
||||
{
|
||||
struct queue *q;
|
||||
int i;
|
||||
|
||||
*num_queues = 0;
|
||||
|
||||
/* Run over all PDDs of the process */
|
||||
for (i = 0; i < p->n_pdds; i++) {
|
||||
struct kfd_process_device *pdd = p->pdds[i];
|
||||
|
||||
list_for_each_entry(q, &pdd->qpd.queues_list, list) {
|
||||
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
|
||||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
|
||||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
|
||||
|
||||
*num_queues = *num_queues + 1;
|
||||
} else {
|
||||
pr_err("Unsupported queue type (%d)\n", q->properties.type);
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
}
|
||||
}
|
||||
*priv_data_sizes = *num_queues * sizeof(struct kfd_criu_queue_priv_data);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void criu_dump_queue(struct kfd_process_device *pdd,
|
||||
struct queue *q,
|
||||
struct kfd_criu_queue_priv_data *q_data)
|
||||
{
|
||||
q_data->gpu_id = pdd->dev->id;
|
||||
q_data->type = q->properties.type;
|
||||
q_data->format = q->properties.format;
|
||||
q_data->q_id = q->properties.queue_id;
|
||||
q_data->q_address = q->properties.queue_address;
|
||||
q_data->q_size = q->properties.queue_size;
|
||||
q_data->priority = q->properties.priority;
|
||||
q_data->q_percent = q->properties.queue_percent;
|
||||
q_data->read_ptr_addr = (uint64_t)q->properties.read_ptr;
|
||||
q_data->write_ptr_addr = (uint64_t)q->properties.write_ptr;
|
||||
q_data->doorbell_id = q->doorbell_id;
|
||||
|
||||
q_data->sdma_id = q->sdma_id;
|
||||
|
||||
q_data->eop_ring_buffer_address =
|
||||
q->properties.eop_ring_buffer_address;
|
||||
|
||||
q_data->eop_ring_buffer_size = q->properties.eop_ring_buffer_size;
|
||||
|
||||
q_data->ctx_save_restore_area_address =
|
||||
q->properties.ctx_save_restore_area_address;
|
||||
|
||||
q_data->ctx_save_restore_area_size =
|
||||
q->properties.ctx_save_restore_area_size;
|
||||
|
||||
pr_debug("Dumping Queue: gpu_id:%x queue_id:%u\n", q_data->gpu_id, q_data->q_id);
|
||||
}
|
||||
|
||||
static int criu_dump_queues_device(struct kfd_process_device *pdd,
|
||||
uint8_t __user *user_priv,
|
||||
unsigned int *q_index,
|
||||
uint64_t *queues_priv_data_offset)
|
||||
{
|
||||
struct kfd_criu_queue_priv_data *q_data;
|
||||
struct queue *q;
|
||||
int ret = 0;
|
||||
|
||||
q_data = kzalloc(sizeof(*q_data), GFP_KERNEL);
|
||||
if (!q_data)
|
||||
return -ENOMEM;
|
||||
|
||||
list_for_each_entry(q, &pdd->qpd.queues_list, list) {
|
||||
if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE &&
|
||||
q->properties.type != KFD_QUEUE_TYPE_SDMA &&
|
||||
q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) {
|
||||
|
||||
pr_err("Unsupported queue type (%d)\n", q->properties.type);
|
||||
ret = -EOPNOTSUPP;
|
||||
break;
|
||||
}
|
||||
|
||||
criu_dump_queue(pdd, q, q_data);
|
||||
|
||||
ret = copy_to_user(user_priv + *queues_priv_data_offset, q_data, sizeof(*q_data));
|
||||
if (ret) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
*queues_priv_data_offset += sizeof(*q_data);
|
||||
*q_index = *q_index + 1;
|
||||
}
|
||||
|
||||
kfree(q_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int kfd_criu_checkpoint_queues(struct kfd_process *p,
|
||||
uint8_t __user *user_priv_data,
|
||||
uint64_t *priv_data_offset)
|
||||
{
|
||||
int ret = 0, pdd_index, q_index = 0;
|
||||
|
||||
for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
|
||||
struct kfd_process_device *pdd = p->pdds[pdd_index];
|
||||
|
||||
/*
|
||||
* criu_dump_queues_device will copy data to user and update q_index and
|
||||
* queues_priv_data_offset
|
||||
*/
|
||||
ret = criu_dump_queues_device(pdd, user_priv_data, &q_index, priv_data_offset);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void set_queue_properties_from_criu(struct queue_properties *qp,
|
||||
struct kfd_criu_queue_priv_data *q_data)
|
||||
{
|
||||
qp->is_interop = false;
|
||||
qp->is_gws = q_data->is_gws;
|
||||
qp->queue_percent = q_data->q_percent;
|
||||
qp->priority = q_data->priority;
|
||||
qp->queue_address = q_data->q_address;
|
||||
qp->queue_size = q_data->q_size;
|
||||
qp->read_ptr = (uint32_t *) q_data->read_ptr_addr;
|
||||
qp->write_ptr = (uint32_t *) q_data->write_ptr_addr;
|
||||
qp->eop_ring_buffer_address = q_data->eop_ring_buffer_address;
|
||||
qp->eop_ring_buffer_size = q_data->eop_ring_buffer_size;
|
||||
qp->ctx_save_restore_area_address = q_data->ctx_save_restore_area_address;
|
||||
qp->ctx_save_restore_area_size = q_data->ctx_save_restore_area_size;
|
||||
qp->ctl_stack_size = q_data->ctl_stack_size;
|
||||
qp->type = q_data->type;
|
||||
qp->format = q_data->format;
|
||||
}
|
||||
|
||||
int kfd_criu_restore_queue(struct kfd_process *p,
|
||||
uint8_t __user *user_priv_ptr,
|
||||
uint64_t *priv_data_offset,
|
||||
uint64_t max_priv_data_size)
|
||||
{
|
||||
struct kfd_criu_queue_priv_data *q_data;
|
||||
struct kfd_process_device *pdd;
|
||||
struct kfd_dev *dev;
|
||||
struct queue_properties qp;
|
||||
unsigned int queue_id;
|
||||
|
||||
int ret = 0;
|
||||
|
||||
if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size)
|
||||
return -EINVAL;
|
||||
|
||||
q_data = kmalloc(sizeof(*q_data), GFP_KERNEL);
|
||||
if (!q_data)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = copy_from_user(q_data, user_priv_ptr + *priv_data_offset, sizeof(*q_data));
|
||||
if (ret) {
|
||||
ret = -EFAULT;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
*priv_data_offset += sizeof(*q_data);
|
||||
|
||||
dev = kfd_device_by_id(q_data->gpu_id);
|
||||
if (!dev) {
|
||||
pr_err("Could not get kfd_dev from gpu_id = 0x%x\n",
|
||||
q_data->gpu_id);
|
||||
|
||||
ret = -EINVAL;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
pdd = kfd_get_process_device_data(dev, p);
|
||||
if (!pdd) {
|
||||
pr_err("Failed to get pdd\n");
|
||||
ret = -EFAULT;
|
||||
return ret;
|
||||
}
|
||||
|
||||
memset(&qp, 0, sizeof(qp));
|
||||
set_queue_properties_from_criu(&qp, q_data);
|
||||
|
||||
print_queue_properties(&qp);
|
||||
|
||||
ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL);
|
||||
if (ret) {
|
||||
pr_err("Failed to create new queue err:%d\n", ret);
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
exit:
|
||||
if (ret)
|
||||
pr_err("Failed to create queue (%d)\n", ret);
|
||||
else
|
||||
pr_debug("Queue id %d was restored successfully\n", queue_id);
|
||||
|
||||
kfree(q_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
||||
int pqm_debugfs_mqds(struct seq_file *m, void *data)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user