mirror of
https://github.com/torvalds/linux.git
synced 2026-05-26 08:02:27 +02:00
drm/amdgpu: Follow up change to previous drm scheduler change.
Align refcount behaviour for the amdgpu_job embedded HW fence with classic pointer-style HW fences by increasing the refcount each time emit is called, so amdgpu code doesn't need to make workarounds using amdgpu_job.job_run_counter to keep the HW fence refcount balanced. Also, since the previous patch resumed setting s_fence->parent to NULL in drm_sched_stop, switch to directly checking whether job->hw_fence is signaled to short-circuit the reset if the job has already signaled. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Tested-by: Yiqing Yao <yiqing.yao@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
45ecaea738
commit
9ae55f030d
|
|
@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
|
||||||
goto err_ib_sched;
|
goto err_ib_sched;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Drop the initial kref_init count (see drm_sched_main as example) */
|
||||||
|
dma_fence_put(f);
|
||||||
ret = dma_fence_wait(f, false);
|
ret = dma_fence_wait(f, false);
|
||||||
|
|
||||||
err_ib_sched:
|
err_ib_sched:
|
||||||
|
|
|
||||||
|
|
@ -5010,16 +5010,32 @@ static void amdgpu_device_recheck_guilty_jobs(
|
||||||
|
|
||||||
/* clear job's guilty and depend on the following step to decide the real one */
|
/* clear job's guilty and depend on the following step to decide the real one */
|
||||||
drm_sched_reset_karma(s_job);
|
drm_sched_reset_karma(s_job);
|
||||||
/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
|
|
||||||
* to make sure fence is balanced */
|
|
||||||
dma_fence_get(s_job->s_fence->parent);
|
|
||||||
drm_sched_resubmit_jobs_ext(&ring->sched, 1);
|
drm_sched_resubmit_jobs_ext(&ring->sched, 1);
|
||||||
|
|
||||||
|
if (!s_job->s_fence->parent) {
|
||||||
|
DRM_WARN("Failed to get a HW fence for job!");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
|
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
|
||||||
if (ret == 0) { /* timeout */
|
if (ret == 0) { /* timeout */
|
||||||
DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
|
DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
|
||||||
ring->sched.name, s_job->id);
|
ring->sched.name, s_job->id);
|
||||||
|
|
||||||
|
|
||||||
|
amdgpu_fence_driver_isr_toggle(adev, true);
|
||||||
|
|
||||||
|
/* Clear this failed job from fence array */
|
||||||
|
amdgpu_fence_driver_clear_job_fences(ring);
|
||||||
|
|
||||||
|
amdgpu_fence_driver_isr_toggle(adev, false);
|
||||||
|
|
||||||
|
/* Since the job won't signal and we go for
|
||||||
|
* another resubmit drop this parent pointer
|
||||||
|
*/
|
||||||
|
dma_fence_put(s_job->s_fence->parent);
|
||||||
|
s_job->s_fence->parent = NULL;
|
||||||
|
|
||||||
/* set guilty */
|
/* set guilty */
|
||||||
drm_sched_increase_karma(s_job);
|
drm_sched_increase_karma(s_job);
|
||||||
retry:
|
retry:
|
||||||
|
|
@ -5048,7 +5064,6 @@ static void amdgpu_device_recheck_guilty_jobs(
|
||||||
|
|
||||||
/* got the hw fence, signal finished fence */
|
/* got the hw fence, signal finished fence */
|
||||||
atomic_dec(ring->sched.score);
|
atomic_dec(ring->sched.score);
|
||||||
dma_fence_put(s_job->s_fence->parent);
|
|
||||||
dma_fence_get(&s_job->s_fence->finished);
|
dma_fence_get(&s_job->s_fence->finished);
|
||||||
dma_fence_signal(&s_job->s_fence->finished);
|
dma_fence_signal(&s_job->s_fence->finished);
|
||||||
dma_fence_put(&s_job->s_fence->finished);
|
dma_fence_put(&s_job->s_fence->finished);
|
||||||
|
|
@ -5221,8 +5236,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||||
*
|
*
|
||||||
* job->base holds a reference to parent fence
|
* job->base holds a reference to parent fence
|
||||||
*/
|
*/
|
||||||
if (job && job->base.s_fence->parent &&
|
if (job && (job->hw_fence.ops != NULL) &&
|
||||||
dma_fence_is_signaled(job->base.s_fence->parent)) {
|
dma_fence_is_signaled(&job->hw_fence)) {
|
||||||
job_signaled = true;
|
job_signaled = true;
|
||||||
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
|
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
|
||||||
goto skip_hw_reset;
|
goto skip_hw_reset;
|
||||||
|
|
|
||||||
|
|
@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
|
||||||
if (job && job->job_run_counter) {
|
if (job && job->job_run_counter) {
|
||||||
/* reinit seq for resubmitted jobs */
|
/* reinit seq for resubmitted jobs */
|
||||||
fence->seqno = seq;
|
fence->seqno = seq;
|
||||||
|
/* To be in line with external fence creation and other drivers */
|
||||||
|
dma_fence_get(fence);
|
||||||
} else {
|
} else {
|
||||||
if (job)
|
if (job) {
|
||||||
dma_fence_init(fence, &amdgpu_job_fence_ops,
|
dma_fence_init(fence, &amdgpu_job_fence_ops,
|
||||||
&ring->fence_drv.lock,
|
&ring->fence_drv.lock,
|
||||||
adev->fence_context + ring->idx, seq);
|
adev->fence_context + ring->idx, seq);
|
||||||
|
/* Against remove in amdgpu_job_{free, free_cb} */
|
||||||
|
dma_fence_get(fence);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
dma_fence_init(fence, &amdgpu_fence_ops,
|
dma_fence_init(fence, &amdgpu_fence_ops,
|
||||||
&ring->fence_drv.lock,
|
&ring->fence_drv.lock,
|
||||||
|
|
|
||||||
|
|
@ -262,10 +262,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
|
||||||
DRM_ERROR("Error scheduling IBs (%d)\n", r);
|
DRM_ERROR("Error scheduling IBs (%d)\n", r);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!job->job_run_counter)
|
|
||||||
dma_fence_get(fence);
|
|
||||||
else if (finished->error < 0)
|
|
||||||
dma_fence_put(&job->hw_fence);
|
|
||||||
job->job_run_counter++;
|
job->job_run_counter++;
|
||||||
amdgpu_job_free_resources(job);
|
amdgpu_job_free_resources(job);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user