mirror of
https://github.com/torvalds/linux.git
synced 2026-05-28 17:13:52 +02:00
drm/amdgpu: Fix the race condition for draining retry fault
Issue: In the scenario where svm_range_restore_pages is called, but svm->checkpoint_ts has not been set and the retry fault has not been drained, svm_range_unmap_from_cpu is triggered and calls svm_range_free. Meanwhile, svm_range_restore_pages continues execution and reaches svm_range_from_addr. This results in a "failed to find prange..." error, causing the page recovery to fail. How to fix: Move the timestamp check code under the protection of svm->lock. v2: Make sure all right locks are released before go out. v3: Directly goto out_unlock_svms, and return -EAGAIN. v4: Refine code. Signed-off-by: Emily Deng <Emily.Deng@amd.com> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
b9e75bcb2b
commit
f844732e3a
|
|
@ -3009,19 +3009,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
|
|||
goto out;
|
||||
}
|
||||
|
||||
/* check if this page fault time stamp is before svms->checkpoint_ts */
|
||||
if (svms->checkpoint_ts[gpuidx] != 0) {
|
||||
if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) {
|
||||
pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
|
||||
r = 0;
|
||||
goto out;
|
||||
} else
|
||||
/* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
|
||||
* to zero to avoid following ts wrap around give wrong comparing
|
||||
*/
|
||||
svms->checkpoint_ts[gpuidx] = 0;
|
||||
}
|
||||
|
||||
if (!p->xnack_enabled) {
|
||||
pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
|
||||
r = -EFAULT;
|
||||
|
|
@ -3041,6 +3028,21 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
|
|||
mmap_read_lock(mm);
|
||||
retry_write_locked:
|
||||
mutex_lock(&svms->lock);
|
||||
|
||||
/* check if this page fault time stamp is before svms->checkpoint_ts */
|
||||
if (svms->checkpoint_ts[gpuidx] != 0) {
|
||||
if (amdgpu_ih_ts_after_or_equal(ts, svms->checkpoint_ts[gpuidx])) {
|
||||
pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
|
||||
r = -EAGAIN;
|
||||
goto out_unlock_svms;
|
||||
} else {
|
||||
/* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
|
||||
* to zero to avoid following ts wrap around give wrong comparing
|
||||
*/
|
||||
svms->checkpoint_ts[gpuidx] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
prange = svm_range_from_addr(svms, addr, NULL);
|
||||
if (!prange) {
|
||||
pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
|
||||
|
|
@ -3166,7 +3168,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
|
|||
mutex_unlock(&svms->lock);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
svm_range_count_fault(node, p, gpuidx);
|
||||
if (r != -EAGAIN)
|
||||
svm_range_count_fault(node, p, gpuidx);
|
||||
|
||||
mmput(mm);
|
||||
out:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user