drm/msm: Temporarily disable stall-on-fault after a page fault

When things go wrong, the GPU is capable of quickly generating millions
of faulting translation requests per second. When that happens, in the
stall-on-fault model each access will stall until it wins the race to
signal the fault and then the RESUME register is written. This slows
processing page faults to a crawl as the GPU can generate faults much
faster than the CPU can acknowledge them. It also means that all
available resources in the SMMU are saturated waiting for the stalled
transactions, so that other transactions such as transactions generated
by the GMU, which shares translation resources with the GPU, cannot
proceed. This causes a GMU watchdog timeout, which leads to a failed
reset (because GX cannot collapse while a transaction is pending) and
a permanently hung GPU.

On older platforms with qcom,smmu-v2, it seems that when one transaction
is stalled subsequent faulting transactions are terminated, which avoids
this problem, but the MMU-500 follows the spec here.

To work around these problems, disable stall-on-fault as soon as we get a
page fault, and keep it disabled until a cooldown period has passed after
page faults stop. This allows the GMU some guaranteed time to continue
working. We only use
stall-on-fault to halt the GPU while we collect a devcoredump and we
always terminate the transaction afterward, so it's fine to miss some
subsequent page faults. We also keep it disabled so long as the current
devcoredump hasn't been deleted, because in that case we likely won't
capture another one if there's a fault.

After this commit HFI messages still occasionally time out, because the
crashdump handler doesn't run fast enough to let the GMU resume, but the
driver seems to recover from it. This will probably go away after the
HFI timeout is increased.

Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Reviewed-by: Rob Clark <robdclark@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/654891/
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
This commit is contained in:
Connor Abbott 2025-05-20 15:08:59 -04:00 committed by Rob Clark
parent dedf404be8
commit b13044092c
9 changed files with 116 additions and 1 deletions

View File

@ -131,6 +131,8 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
struct msm_ringbuffer *ring = submit->ring;
unsigned int i, ibs = 0;
adreno_check_and_reenable_stall(adreno_gpu);
if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) {
ring->cur_ctx_seqno = 0;
a5xx_submit_in_rb(gpu, submit);

View File

@ -212,6 +212,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
struct msm_ringbuffer *ring = submit->ring;
unsigned int i, ibs = 0;
adreno_check_and_reenable_stall(adreno_gpu);
a6xx_set_pagetable(a6xx_gpu, ring, submit);
get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0),
@ -335,6 +337,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
struct msm_ringbuffer *ring = submit->ring;
unsigned int i, ibs = 0;
adreno_check_and_reenable_stall(adreno_gpu);
/*
* Toggle concurrent binning for pagetable switch and set the thread to
* BR since only it can execute the pagetable switch packets.

View File

@ -259,16 +259,54 @@ u64 adreno_private_address_space_size(struct msm_gpu *gpu)
return BIT(ttbr1_cfg->ias) - ADRENO_VM_START;
}
/*
 * Re-enable stall-on-fault if it is currently disabled and it is worth
 * turning back on.
 *
 * Stalling is only restored once the cooldown deadline stored in
 * priv->stall_reenable_time has passed and no crash state is being held,
 * i.e. once a new fault would actually result in a crashdump being collected.
 * All checks and the state change happen under priv->fault_stall_lock.
 */
void adreno_check_and_reenable_stall(struct adreno_gpu *adreno_gpu)
{
	struct msm_gpu *gpu = &adreno_gpu->base;
	struct msm_drm_private *priv = gpu->dev->dev_private;
	unsigned long irq_flags;

	spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);

	/* Already stalling on faults: nothing to restore. */
	if (priv->stall_enabled)
		goto unlock;

	/* Still inside the cooldown period. */
	if (!ktime_after(ktime_get(), priv->stall_reenable_time))
		goto unlock;

	/* A crash state is still held, so keep stall-on-fault disabled. */
	if (READ_ONCE(gpu->crashstate))
		goto unlock;

	priv->stall_enabled = true;
	gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, true);

unlock:
	spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);
}
/*
 * Bit masks applied to the fault status value in
 * struct adreno_smmu_fault_info (info->fsr).
 * NOTE(review): the names presumably mirror the Arm SMMU FSR bit names
 * (TF/PF/EF fault types, SS stalled status) - confirm against the SMMU
 * driver's own definitions.
 */
#define ARM_SMMU_FSR_TF BIT(1)
#define ARM_SMMU_FSR_PF BIT(3)
#define ARM_SMMU_FSR_EF BIT(4)
#define ARM_SMMU_FSR_SS BIT(30)
int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
struct adreno_smmu_fault_info *info, const char *block,
u32 scratch[4])
{
struct msm_drm_private *priv = gpu->dev->dev_private;
const char *type = "UNKNOWN";
bool do_devcoredump = info && !READ_ONCE(gpu->crashstate);
bool do_devcoredump = info && (info->fsr & ARM_SMMU_FSR_SS) &&
!READ_ONCE(gpu->crashstate);
unsigned long irq_flags;
/*
* In case there is a subsequent storm of pagefaults, disable
* stall-on-fault for at least half a second.
*/
spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);
if (priv->stall_enabled) {
priv->stall_enabled = false;
gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, false);
}
priv->stall_reenable_time = ktime_add_ms(ktime_get(), 500);
spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);
/*
* Print a default message if we couldn't get the data from the

View File

@ -636,6 +636,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
struct adreno_smmu_fault_info *info, const char *block,
u32 scratch[4]);
void adreno_check_and_reenable_stall(struct adreno_gpu *gpu);
int adreno_read_speedbin(struct device *dev, u32 *speedbin);
/*

View File

@ -208,6 +208,35 @@ DEFINE_DEBUGFS_ATTRIBUTE(shrink_fops,
shrink_get, shrink_set,
"0x%08llx\n");
/*
 * debugfs getter: report how many microseconds remain until stall-on-fault
 * may be re-enabled.
 *
 * A result of 0 means stall-on-fault is already enabled, or that it will be
 * re-enabled on the next submit (unless there's a leftover devcoredump).
 * Kernel tests that deliberately trigger a fault and then inspect the
 * devcoredump can use this to wait out the cooldown period.
 *
 * Always returns 0.
 */
static int
stall_reenable_time_get(void *data, u64 *val)
{
	struct msm_drm_private *priv = data;
	unsigned long irq_flags;
	long long remaining_us = 0;

	spin_lock_irqsave(&priv->fault_stall_lock, irq_flags);
	if (!priv->stall_enabled) {
		remaining_us = ktime_us_delta(priv->stall_reenable_time,
					      ktime_get());
		/* The deadline may already be in the past; never go negative. */
		if (remaining_us < 0)
			remaining_us = 0;
	}
	spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags);

	*val = remaining_us;
	return 0;
}
/*
 * debugfs attribute backing the stall re-enable countdown. Only a getter is
 * supplied (the setter slot is NULL); the value is printed with "%lld".
 */
DEFINE_DEBUGFS_ATTRIBUTE(stall_reenable_time_fops,
stall_reenable_time_get, NULL,
"%lld\n");
static int msm_gem_show(struct seq_file *m, void *arg)
{
@ -319,6 +348,9 @@ static void msm_debugfs_gpu_init(struct drm_minor *minor)
debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
&priv->disable_err_irq);
debugfs_create_file("stall_reenable_time_us", 0400, minor->debugfs_root,
priv, &stall_reenable_time_fops);
gpu_devfreq = debugfs_create_dir("devfreq", minor->debugfs_root);
debugfs_create_bool("idle_clamp",0600, gpu_devfreq,

View File

@ -245,6 +245,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock);
drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock);
/* Initialize stall-on-fault */
spin_lock_init(&priv->fault_stall_lock);
priv->stall_enabled = true;
/* Teach lockdep about lock ordering wrt. shrinker: */
fs_reclaim_acquire(GFP_KERNEL);
might_lock(&priv->lru.lock);

View File

@ -222,6 +222,29 @@ struct msm_drm_private {
* the sw hangcheck mechanism.
*/
bool disable_err_irq;
/**
* @fault_stall_lock:
*
* Serialize changes to stall-on-fault state.
*/
spinlock_t fault_stall_lock;
/**
* @stall_reenable_time:
*
* If stall_enabled is false, when to reenable stall-on-fault.
* Protected by @fault_stall_lock.
*/
ktime_t stall_reenable_time;
/**
* @stall_enabled:
*
* Whether stall-on-fault is currently enabled. Protected by
* @fault_stall_lock.
*/
bool stall_enabled;
};
const struct msm_format *mdp_get_format(struct msm_kms *kms, uint32_t format, uint64_t modifier);

View File

@ -372,6 +372,14 @@ static int msm_disp_fault_handler(struct iommu_domain *domain, struct device *de
return -ENOSYS;
}
/*
 * msm_mmu_funcs::set_stall implementation: forward the stall-on-fault
 * enable/disable request to the adreno-smmu private interface stored as the
 * MMU device's driver data.
 */
static void msm_iommu_set_stall(struct msm_mmu *mmu, bool enable)
{
	struct adreno_smmu_priv *smmu_priv = dev_get_drvdata(mmu->dev);

	/* The set_stall hook is optional; without it this is a no-op. */
	if (!smmu_priv->set_stall)
		return;

	smmu_priv->set_stall(smmu_priv->cookie, enable);
}
static void msm_iommu_detach(struct msm_mmu *mmu)
{
struct msm_iommu *iommu = to_msm_iommu(mmu);
@ -419,6 +427,7 @@ static const struct msm_mmu_funcs funcs = {
.map = msm_iommu_map,
.unmap = msm_iommu_unmap,
.destroy = msm_iommu_destroy,
.set_stall = msm_iommu_set_stall,
};
struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks)

View File

@ -15,6 +15,7 @@ struct msm_mmu_funcs {
size_t len, int prot);
int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len);
void (*destroy)(struct msm_mmu *mmu);
void (*set_stall)(struct msm_mmu *mmu, bool enable);
};
enum msm_mmu_type {