linux/drivers/gpu/drm/xe/xe_pagefault.c
Linus Torvalds 4a57e0913e drm for v7.1-rc1
mm:
 - two pass MMU interval notifiers
 - add gpu active/reclaim per-node stat counters
 
 math:
 - provide __KERNEL_DIV_ROUND_CLOSEST() in UAPI
 - implement DIV_ROUND_CLOSEST() with __KERNEL_DIV_ROUND_CLOSEST()
 
 rust:
 - shared tag with driver-core: register macro and io infra
 - core: rework DMA coherent API
 - core: add interop::list to interop with C linked lists
 - core: add more num::Bounded operations
 - core: enable generic_arg_infer and add EMSGSIZE
 - workqueue: add ARef<T> support for work and delayed work
 - add GPU buddy allocator abstraction
 - add DRM shmem GEM helper abstraction
 - allow drm:::Device to dispatch work and delayed work items
   to driver private data
 - add dma_resv_lock helper and raw accessors
 
 core:
 - introduce DRM RAS infrastructure over netlink
 - add connector panel_type property
 - fourcc: add ARM interleaved 64k modifier
 - colorop: add destroy helper
 - suballoc: split into alloc and init helpers
 - mode: provide DRM_ARGB_GET*() macros for reading color components
 
 edid:
 - provide drm_output_color_Format
 
 dma-buf:
 - provide revoke mechanism for shared buffers
 - rename move_notify to invalidate_mappings
 - always enable move_notify
 - protect dma_fence_ops with RCU and improve locking
 - clean pages with helpers
 
 atomic:
 - allocate drm_private_state via callback
 - helper: use system_percpu_wq
 
 buddy:
 - make buddy allocator available to gpu level
 - add kernel-doc for buddy allocator
 - improve aligned allocation
 
 ttm:
 - fix fence signalling
 - improve tests and docs
 - improve handling of gfp_retry_mayfail
 - use per-node stat counters to track memory allocations
 - port pool to use list_lru
 - drop NUMA specific pools
 - make pool shrinker numa aware
 - track allocated pages per numa node
 
 coreboot:
 - cleanup coreboot framebuffer support
 
 sched:
 - fix race condition in drm_sched_fini
 
 pagemap:
 - enable THP support
 - pass pagemap_addr by reference
 
 gem-shmem:
 - Track page accessed/dirty status across mmap/vmap
 
 gpusvm:
 - reenable device to device migration
 - fix unbalanced unclock
 
 bridge:
 - anx7625: Support USB-C plus DT bindings
 - connector: Fix EDID detection
 - dw-hdmi-qp: Support Vendor-Specfic and SDP Infoframes; improve others
 - fsl-ldb: Fix visual artifacts plus related DT property 'enable-termination-resistor'
 - imx8qxp-pixel-link: Improve bridge reference handling
 - lt9611: Support Port-B-only input plus DT bindings
 - tda998x: Support DRM_BRIDGE_ATTACH_NO_CONNECTOR; Clean up
 - Support TH1520 HDMI plus DT bindings
 - waveshare-dsi: Fix register and attach; Support 1..4 DSI lanes plus DT bindings
 - anx7625: Fix USB Type-C handling
 - cdns-mhdp8546-core: Handle HDCP state in bridge atomic_check
 - Support Lontium LT8713SX DP MST bridge plus DT bindings
 - analogix_dp: Use DP helpers for link training
 
 panel:
 - panel-jdi-lt070me05000: Use mipi-dsi multi functions
 - panel-edp: Support Add AUO B116XAT04.1 (HW: 1A); Support CMN N116BCL-EAK (C2); Support FriendlyELEC plus DT changes
 - panel-edp: Fix timings for BOE NV140WUM-N64
 - ilitek-ili9882t: Allow GPIO calls to sleep
 - jadard: Support TAIGUAN XTI05101-01A
 - lxd: Support LXD M9189A plus DT bindings
 - mantix: Fix pixel clock; Clean up
 - motorola: Support Motorola Atrix 4G and Droid X2 plus DT bindings
 - novatek: Support Novatek/Tianma NT37700F plus DT bindings
 - simple: Support EDT ET057023UDBA plus DT bindings; Support Powertip
   PH800480T032-ZHC19 plus DT bindings; Support Waveshare 13.3"
 - novatek-nt36672a: Use mipi_dsi_*_multi() functions
 - panel-edp: Support BOE NV153WUM-N42, CMN N153JCA-ELK, CSW MNF307QS3-2
 - support Himax HX83121A plus DT bindings
 - support JuTouch JT070TM041 plus DT bindings
 - support Samsung S6E8FC0 plus DT bindings
 - himax-hx83102c: support Samsung S6E8FC0 plus DT bindings; support backlight
 - ili9806e: support Rocktech RK050HR345-CT106A plus DT bindings
 - simple: support Tianma TM050RDH03 plus DT bindings
 
 amdgpu:
 - enable DC by default on CIK APUs
 - userq fence ioctl param size fixes
 - set panel_type to OLED for eDP
 - refactor DC i2c code
 - FAMS2 update
 - rework ttm handling to allow multiple engines
 - DC DCE 6.x cleanup
 - DC support for NUTMEG/TRAVIS DP bridge
 - DCN 4.2 support
 - GC12 idle power fix for compute
 - use struct drm_edid in non-DC code
 - enable NV12/P010 support on primary planes
 - support newer IP discovery tables
 - VCN/JPEG 5.0.2 support
 - GC/MES 12.1 updates
 - USERQ fixes
 - add DC idle state manager
 - eDP DSC seamless boot
 
 amdkfd:
 - GC 12.1 updates
 - non 4K page fixes
 
 xe:
 - basic Xe3p_LPG and NVL-P enabling patches
 - allow VM_BIND decompress support
 - add purgeable buffer object support
 - add xe_vm_get_property_ioctl
 - restrict multi-lrc to VCS/VECS engines
 - allow disabling VM overcommit in fault mode
 - dGPU memory optimizations
 - Workaround cleanups and simplification
 - Allow VFs VRAM quote changes using sysfs
 - convert GT stats to per-cpu counters
 - pagefault refactors
 - enable multi-queue on xe3p_xpc
 - disable DCC on PTL
 - make MMIO communication more robust
 - disable D3Cold for BMG on specific platforms
 - vfio: improve FLR sync for Xe VFIO
 
 i915/display:
 - C10/C20/LT PHY PLL divider verification
 - use trans push mechanism to generate PSR frame change on LNL+
 - refactor DP DSC slice config
 - VGA decode refactoring
 - refactor DPT, gen2-4 overlay, masked field register macro helpers
 - refactor stolen memory allocation decisions
 - prepare for UHBR DP tunnels
 - refactor LT PHY PLL to use DPLL framework
 - implement register polling/waiting in display code
 - add shared stepping header between i915 and display
 
 i915:
 - fix potential overflow of shmem scatterlist length
 
 nouveau:
 - provide Z cull info to userspace
 - initial GA100 support
 - shutdown on PCI device shutdown
 
 nova-core:
 - harden GSP command queue
 - add support for large RPCs
 - simplify GSP sequencer and message handling
 - refactor falcon firmware handling
 - convert to new register macro
 - conver to new DMA coherent API
 - use checked arithmetic
 - add debugfs support for gsp-rm log buffers
 - fix aux device registration for multi-GPU
 
 msm:
 - CI:
   - Uprev mesa
   - Restore CI jobs for Qualcomm APQ8016 and APQ8096 devices
 - Core:
   - Switched to of_get_available_child_by_name()
 - DPU:
   - Fixes for DSC panels
   - Fixed brownout because of the frequency / OPP mismatch
   - Quad pipe preparation (not enabled yet)
   - Switched to virtual planes by default
   - Dropped VBIF_NRT support
   - Added support for Eliza platform
   - Reworked alpha handling
   - Switched to correct CWB definitions on Eliza
   - Dropped dummy INTF_0 on MSM8953
   - Corrected INTFs related to DP-MST
 - DP:
   - Removed debug prints looking into PHY internals
 - DSI:
   - Fixes for DSC panels
   - RGB101010 support
   - Support for SC8280XP
   - Moved PHY bindings from display/ to phy/
 - GPU:
   - Preemption support for x2-85 and a840
   - IFPC support for a840
   - SKU detection support for x2-85 and a840
   - Expose AQE support (VK ray-pipeline)
   - Avoid locking in VM_BIND fence signaling path
   - Fix to avoid reclaim in GPU snapshot path
   - Disallow foreign mapping of _NO_SHARE BOs
 - HDMI:
   - Fixed infoframes programming
 - MDP5:
   - Dropped support for MSM8974v1
   - Dropped now unused code for MSM8974 v1 and SDM660 / MSM8998
 
 panthor:
 - add tracepoints for power and IRQs
 - fix fence handling
 - extend timestamp query with flags
 - support various sources for timestamp queries
 
 tyr:
 - fix names and model/versions
 
 rockchip:
 - vop2: use drm logging function
 - rk3576 displayport support
 - support CRTC background color
 
 atmel-hlcdc:
 - support sana5d65 LCD controller
 
 tilcdc:
 - use DT bindings schema
 - use managed DRM interfaces
 - support DRM_BRIDGE_ATTACH_NO_CONNECTOR
 
 verisilicon:
 - support DC8200 + DT bindings
 
 virtgpu:
 - support PRIME import with 3D enabled
 
 komeda:
 - fix integer overflow in AFBC checks
 
 mcde:
 - improve bridge handling
 
 gma500:
 - use drm client buffer for fbdev framebuffer
 
 amdxdna:
 - add sensors ioctls
 - provide NPU power estimate
 - support column utilization sensor
 - allow forcing DMA through IOMMU IOVA
 - support per-BO mem usage queries
 - refactor GEM implementation
 
 ivpu:
 - update boot API to v3.29.4
 - limit per-user number of doorbells/contexts
 - perform engine reset on TDR error
 
 loongson:
 - replace custom code with drm_gem_ttm_dumb_map_offset()
 
 imx:
 - support planes behind the primary plane
 - fix bus-format selection
 
 vkms:
 - support CRTC background color
 
 v3d:
 - improve handling of struct v3d_stats
 
 komeda:
 - support Arm China Linlon D6 plus DT bindings
 
 imagination:
 - improve power-off sequence
 - support context-reset notification from firmware
 
 mediatek:
 - mtk_dsi: enable hs clock during pre-enable
 - Remove all conflicting aperture devices during probe
 - Add support for mt8167 display blocks
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEEKbZHaGwW9KfbeusDHTzWXnEhr4FAmnfMHMACgkQDHTzWXnE
 hr4gEg/+Oaf6KBcvqNKPLwDlNeOvHap1n8oiy7SXvOKN2/KEAu/zGpEciJ7GsSge
 qdqY4xhEfp0JZLrTZiIIzFr38uzkanfOLdF2AQCVrfCRhlO7QLiUDxAAdDZUyINe
 kKLvNunxMwhzwsmRHEDL85cgPkhsxt2ux+tUOYZrEQ/ZbdupNrFw9q5ewmuYzGng
 HY8bsnB0jVwQ9IU/X6h+Xzr/19623/CZyUWJSuY1foKMhHMceyrCmpAFEqjFWn71
 7zNYFlPEQtqa6qtIZXVbJB4mhd7NbmMW6s367xx+Sx+UJDDNfS6ku+hpISwxNuVX
 7fOoEkhQ+ynIcxGkfOi5Q9j2/mV/WL/GEA/IUWfmX8l219WOrKY4w0NtCE4C78r7
 QFGUR6w8Vi97FCP8NuA7Kix4J9eSr/FAzqoG0snAOQbVdaTSBr1hL0PeewD8BRry
 PUkCCh6J7jKA6POt4JZeU6mbJ3AMoOwS9BICi10R1R6EnIKNpKGVpAuYHk4B5+u3
 X5vd1ds+8dJN/etaFYgIbirUocKx6zt9rT5i4/wPZIDPoCgZNofePtPCiJoTcnNN
 PUZUngcWLpftwW+kCUdc4lF1Q7nguQpXVpX0WJiSfqejshUTPXHPlmJV81GoNSHo
 fQMUXIjO5cAX0FKPBakSxxwFnOQFq4aZb6kRBt4lYgt+RJfzo3s=
 =GX7Q
 -----END PGP SIGNATURE-----

Merge tag 'drm-next-2026-04-15' of https://gitlab.freedesktop.org/drm/kernel

Pull drm updates from Dave Airlie:
 "Highlights:
   - new DRM RAS infrastructure using netlink
   - amdgpu: enable DC on CIK APUs, and more IP enablement, and more
     user queue work
   - xe: purgeable BO support, and new hw enablement
   - dma-buf : add revocable operations

  Full summary:

  mm:
   - two-pass MMU interval notifiers
   - add gpu active/reclaim per-node stat counters

  math:
   - provide __KERNEL_DIV_ROUND_CLOSEST() in UAPI
   - implement DIV_ROUND_CLOSEST() with __KERNEL_DIV_ROUND_CLOSEST()

  rust:
   - shared tag with driver-core: register macro and io infra
   - core: rework DMA coherent API
   - core: add interop::list to interop with C linked lists
   - core: add more num::Bounded operations
   - core: enable generic_arg_infer and add EMSGSIZE
   - workqueue: add ARef<T> support for work and delayed work
   - add GPU buddy allocator abstraction
   - add DRM shmem GEM helper abstraction
   - allow drm:::Device to dispatch work and delayed work items
     to driver private data
   - add dma_resv_lock helper and raw accessors

  core:
   - introduce DRM RAS infrastructure over netlink
   - add connector panel_type property
   - fourcc: add ARM interleaved 64k modifier
   - colorop: add destroy helper
   - suballoc: split into alloc and init helpers
   - mode: provide DRM_ARGB_GET*() macros for reading color components

  edid:
   - provide drm_output_color_Format

  dma-buf:
   - provide revoke mechanism for shared buffers
   - rename move_notify to invalidate_mappings
   - always enable move_notify
   - protect dma_fence_ops with RCU and improve locking
   - clean pages with helpers

  atomic:
   - allocate drm_private_state via callback
   - helper: use system_percpu_wq

  buddy:
   - make buddy allocator available to gpu level
   - add kernel-doc for buddy allocator
   - improve aligned allocation

  ttm:
   - fix fence signalling
   - improve tests and docs
   - improve handling of gfp_retry_mayfail
   - use per-node stat counters to track memory allocations
   - port pool to use list_lru
   - drop NUMA specific pools
   - make pool shrinker numa aware
   - track allocated pages per numa node

  coreboot:
   - cleanup coreboot framebuffer support

  sched:
   - fix race condition in drm_sched_fini

  pagemap:
   - enable THP support
   - pass pagemap_addr by reference

  gem-shmem:
   - Track page accessed/dirty status across mmap/vmap

  gpusvm:
   - reenable device to device migration
   - fix unbalanced unclock

  bridge:
   - anx7625: Support USB-C plus DT bindings
   - connector: Fix EDID detection
   - dw-hdmi-qp: Support Vendor-Specfic and SDP Infoframes; improve
     others
   - fsl-ldb: Fix visual artifacts plus related DT property
     'enable-termination-resistor'
   - imx8qxp-pixel-link: Improve bridge reference handling
   - lt9611: Support Port-B-only input plus DT bindings
   - tda998x: Support DRM_BRIDGE_ATTACH_NO_CONNECTOR; Clean up
   - Support TH1520 HDMI plus DT bindings
   - waveshare-dsi: Fix register and attach; Support 1..4 DSI lanes plus
     DT bindings
   - anx7625: Fix USB Type-C handling
   - cdns-mhdp8546-core: Handle HDCP state in bridge atomic_check
   - Support Lontium LT8713SX DP MST bridge plus DT bindings
   - analogix_dp: Use DP helpers for link training

  panel:
   - panel-jdi-lt070me05000: Use mipi-dsi multi functions
   - panel-edp: Support Add AUO B116XAT04.1 (HW: 1A); Support CMN
     N116BCL-EAK (C2); Support FriendlyELEC plus DT changes
   - panel-edp: Fix timings for BOE NV140WUM-N64
   - ilitek-ili9882t: Allow GPIO calls to sleep
   - jadard: Support TAIGUAN XTI05101-01A
   - lxd: Support LXD M9189A plus DT bindings
   - mantix: Fix pixel clock; Clean up
   - motorola: Support Motorola Atrix 4G and Droid X2 plus DT bindings
   - novatek: Support Novatek/Tianma NT37700F plus DT bindings
   - simple: Support EDT ET057023UDBA plus DT bindings; Support Powertip
     PH800480T032-ZHC19 plus DT bindings; Support Waveshare 13.3"
   - novatek-nt36672a: Use mipi_dsi_*_multi() functions
   - panel-edp: Support BOE NV153WUM-N42, CMN N153JCA-ELK, CSW
     MNF307QS3-2
   - support Himax HX83121A plus DT bindings
   - support JuTouch JT070TM041 plus DT bindings
   - support Samsung S6E8FC0 plus DT bindings
   - himax-hx83102c: support Samsung S6E8FC0 plus DT bindings; support
     backlight
   - ili9806e: support Rocktech RK050HR345-CT106A plus DT bindings
   - simple: support Tianma TM050RDH03 plus DT bindings

  amdgpu:
   - enable DC by default on CIK APUs
   - userq fence ioctl param size fixes
   - set panel_type to OLED for eDP
   - refactor DC i2c code
   - FAMS2 update
   - rework ttm handling to allow multiple engines
   - DC DCE 6.x cleanup
   - DC support for NUTMEG/TRAVIS DP bridge
   - DCN 4.2 support
   - GC12 idle power fix for compute
   - use struct drm_edid in non-DC code
   - enable NV12/P010 support on primary planes
   - support newer IP discovery tables
   - VCN/JPEG 5.0.2 support
   - GC/MES 12.1 updates
   - USERQ fixes
   - add DC idle state manager
   - eDP DSC seamless boot

  amdkfd:
   - GC 12.1 updates
   - non 4K page fixes

  xe:
   - basic Xe3p_LPG and NVL-P enabling patches
   - allow VM_BIND decompress support
   - add purgeable buffer object support
   - add xe_vm_get_property_ioctl
   - restrict multi-lrc to VCS/VECS engines
   - allow disabling VM overcommit in fault mode
   - dGPU memory optimizations
   - Workaround cleanups and simplification
   - Allow VFs VRAM quote changes using sysfs
   - convert GT stats to per-cpu counters
   - pagefault refactors
   - enable multi-queue on xe3p_xpc
   - disable DCC on PTL
   - make MMIO communication more robust
   - disable D3Cold for BMG on specific platforms
   - vfio: improve FLR sync for Xe VFIO

  i915/display:
   - C10/C20/LT PHY PLL divider verification
   - use trans push mechanism to generate PSR frame change on LNL+
   - refactor DP DSC slice config
   - VGA decode refactoring
   - refactor DPT, gen2-4 overlay, masked field register macro helpers
   - refactor stolen memory allocation decisions
   - prepare for UHBR DP tunnels
   - refactor LT PHY PLL to use DPLL framework
   - implement register polling/waiting in display code
   - add shared stepping header between i915 and display

  i915:
   - fix potential overflow of shmem scatterlist length

  nouveau:
   - provide Z cull info to userspace
   - initial GA100 support
   - shutdown on PCI device shutdown

  nova-core:
   - harden GSP command queue
   - add support for large RPCs
   - simplify GSP sequencer and message handling
   - refactor falcon firmware handling
   - convert to new register macro
   - conver to new DMA coherent API
   - use checked arithmetic
   - add debugfs support for gsp-rm log buffers
   - fix aux device registration for multi-GPU

  msm:
   - CI:
      - Uprev mesa
      - Restore CI jobs for Qualcomm APQ8016 and APQ8096 devices
   - Core:
      - Switched to of_get_available_child_by_name()
   - DPU:
      - Fixes for DSC panels
      - Fixed brownout because of the frequency / OPP mismatch
      - Quad pipe preparation (not enabled yet)
      - Switched to virtual planes by default
      - Dropped VBIF_NRT support
      - Added support for Eliza platform
      - Reworked alpha handling
      - Switched to correct CWB definitions on Eliza
      - Dropped dummy INTF_0 on MSM8953
      - Corrected INTFs related to DP-MST
   - DP:
      - Removed debug prints looking into PHY internals
   - DSI:
      - Fixes for DSC panels
      - RGB101010 support
      - Support for SC8280XP
      - Moved PHY bindings from display/ to phy/
   - GPU:
      - Preemption support for x2-85 and a840
      - IFPC support for a840
      - SKU detection support for x2-85 and a840
      - Expose AQE support (VK ray-pipeline)
      - Avoid locking in VM_BIND fence signaling path
      - Fix to avoid reclaim in GPU snapshot path
      - Disallow foreign mapping of _NO_SHARE BOs
   - HDMI:
      - Fixed infoframes programming
   - MDP5:
      - Dropped support for MSM8974v1
      - Dropped now unused code for MSM8974 v1 and SDM660 / MSM8998

  panthor:
   - add tracepoints for power and IRQs
   - fix fence handling
   - extend timestamp query with flags
   - support various sources for timestamp queries

  tyr:
   - fix names and model/versions

  rockchip:
   - vop2: use drm logging function
   - rk3576 displayport support
   - support CRTC background color

  atmel-hlcdc:
   - support sana5d65 LCD controller

  tilcdc:
   - use DT bindings schema
   - use managed DRM interfaces
   - support DRM_BRIDGE_ATTACH_NO_CONNECTOR

  verisilicon:
   - support DC8200 + DT bindings

  virtgpu:
   - support PRIME import with 3D enabled

  komeda:
   - fix integer overflow in AFBC checks

  mcde:
   - improve bridge handling

  gma500:
   - use drm client buffer for fbdev framebuffer

  amdxdna:
   - add sensors ioctls
   - provide NPU power estimate
   - support column utilization sensor
   - allow forcing DMA through IOMMU IOVA
   - support per-BO mem usage queries
   - refactor GEM implementation

  ivpu:
   - update boot API to v3.29.4
   - limit per-user number of doorbells/contexts
   - perform engine reset on TDR error

  loongson:
   - replace custom code with drm_gem_ttm_dumb_map_offset()

  imx:
   - support planes behind the primary plane
   - fix bus-format selection

  vkms:
   - support CRTC background color

  v3d:
   - improve handling of struct v3d_stats

  komeda:
   - support Arm China Linlon D6 plus DT bindings

  imagination:
   - improve power-off sequence
   - support context-reset notification from firmware

  mediatek:
   - mtk_dsi: enable hs clock during pre-enable
   - Remove all conflicting aperture devices during probe
   - Add support for mt8167 display blocks"

* tag 'drm-next-2026-04-15' of https://gitlab.freedesktop.org/drm/kernel: (1735 commits)
  drm/ttm/tests: Remove checks from ttm_pool_free_no_dma_alloc
  drm/ttm/tests: fix lru_count ASSERT
  drm/vram: remove DRM_VRAM_MM_FILE_OPERATIONS from docs
  drm/fb-helper: Fix a locking bug in an error path
  dma-fence: correct kernel-doc function parameter @flags
  ttm/pool: track allocated_pages per numa node.
  ttm/pool: make pool shrinker NUMA aware (v2)
  ttm/pool: drop numa specific pools
  ttm/pool: port to list_lru. (v2)
  drm/ttm: use gpu mm stats to track gpu memory allocations. (v4)
  mm: add gpu active/reclaim per-node stat counters (v2)
  gpu: nova-core: fix missing colon in SEC2 boot debug message
  gpu: nova-core: vbios: use from_le_bytes() for PCI ROM header parsing
  gpu: nova-core: bitfield: fix broken Default implementation
  gpu: nova-core: falcon: pad firmware DMA object size to required block alignment
  gpu: nova-core: gsp: fix undefined behavior in command queue code
  drm/shmem_helper: Make sure PMD entries get the writeable upgrade
  accel/ivpu: Trigger recovery on TDR with OS scheduling
  drm/msm: Use of_get_available_child_by_name()
  dt-bindings: display/msm: move DSI PHY bindings to phy/ subdir
  ...
2026-04-15 08:45:00 -07:00

498 lines
13 KiB
C

// SPDX-License-Identifier: MIT
/*
* Copyright © 2025 Intel Corporation
*/
#include <linux/circ_buf.h>
#include <drm/drm_exec.h>
#include <drm/drm_managed.h>
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_printk.h"
#include "xe_gt_types.h"
#include "xe_gt_stats.h"
#include "xe_hw_engine.h"
#include "xe_pagefault.h"
#include "xe_pagefault_types.h"
#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"
/**
* DOC: Xe page faults
*
* Xe page faults are handled in two layers. The producer layer interacts with
* hardware or firmware to receive and parse faults into struct xe_pagefault,
* then forwards them to the consumer. The consumer layer services the faults
* (e.g., memory migration, page table updates) and acknowledges the result back
* to the producer, which then forwards the results to the hardware or firmware.
* The consumer uses a page fault queue sized to absorb all potential faults and
* a multi-threaded worker to process them. Multiple producers are supported,
* with a single shared consumer.
*
* xe_pagefault.c implements the consumer layer.
*/
static int xe_pagefault_entry_size(void)
{
/*
* Power of two alignment is not a hardware requirement, rather a
* software restriction which makes the math for page fault queue
* management simplier.
*/
return roundup_pow_of_two(sizeof(struct xe_pagefault));
}
static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
struct xe_vram_region *vram, bool need_vram_move)
{
struct xe_bo *bo = xe_vma_bo(vma);
struct xe_vm *vm = xe_vma_vm(vma);
int err;
err = xe_vm_lock_vma(exec, vma);
if (err)
return err;
if (!bo)
return 0;
/*
* Skip validate/migrate for DONTNEED/purged BOs - repopulating
* their pages would prevent the shrinker from reclaiming them.
* For non-scratch VMs there is no safe fallback so fail the fault.
* For scratch VMs let xe_vma_rebind() run normally; it will install
* scratch PTEs so the GPU gets safe zero reads instead of faulting.
*/
if (unlikely(xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo))) {
if (!xe_vm_has_scratch(vm))
return -EACCES;
return 0;
}
return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
xe_bo_validate(bo, vm, true, exec);
}
static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma,
bool atomic)
{
struct xe_vm *vm = xe_vma_vm(vma);
struct xe_tile *tile = gt_to_tile(gt);
struct xe_validation_ctx ctx;
struct drm_exec exec;
struct dma_fence *fence;
int err, needs_vram;
lockdep_assert_held_write(&vm->lock);
needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic);
if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma)))
return needs_vram < 0 ? needs_vram : -EACCES;
xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
xe_vma_size(vma) / SZ_1K);
trace_xe_vma_pagefault(vma);
/* Check if VMA is valid, opportunistic check only */
if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present,
vma->tile_invalidated) && !atomic)
return 0;
retry_userptr:
if (xe_vma_is_userptr(vma) &&
xe_vma_userptr_check_repin(to_userptr_vma(vma))) {
struct xe_userptr_vma *uvma = to_userptr_vma(vma);
err = xe_vma_userptr_pin_pages(uvma);
if (err)
return err;
}
/* Lock VM and BOs dma-resv */
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
drm_exec_until_all_locked(&exec) {
err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
needs_vram == 1);
drm_exec_retry_on_contention(&exec);
xe_validation_retry_on_oom(&ctx, &err);
if (err)
goto unlock_dma_resv;
/* Bind VMA only to the GT that has faulted */
trace_xe_vma_pf_bind(vma);
xe_vm_set_validation_exec(vm, &exec);
fence = xe_vma_rebind(vm, vma, BIT(tile->id));
xe_vm_set_validation_exec(vm, NULL);
if (IS_ERR(fence)) {
err = PTR_ERR(fence);
xe_validation_retry_on_oom(&ctx, &err);
goto unlock_dma_resv;
}
}
dma_fence_wait(fence, false);
dma_fence_put(fence);
unlock_dma_resv:
xe_validation_ctx_fini(&ctx);
if (err == -EAGAIN)
goto retry_userptr;
return err;
}
static bool
xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type)
{
return (access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK) == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC;
}
static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)
{
struct xe_vm *vm;
down_read(&xe->usm.lock);
vm = xa_load(&xe->usm.asid_to_vm, asid);
if (vm && (xe_vm_in_fault_mode(vm) || xe_vm_has_scratch(vm)))
xe_vm_get(vm);
else
vm = ERR_PTR(-EINVAL);
up_read(&xe->usm.lock);
return vm;
}
static int xe_pagefault_service(struct xe_pagefault *pf)
{
struct xe_gt *gt = pf->gt;
struct xe_device *xe = gt_to_xe(gt);
struct xe_vm *vm;
struct xe_vma *vma = NULL;
int err;
bool atomic;
/* Producer flagged this fault to be nacked */
if (pf->consumer.fault_type_level == XE_PAGEFAULT_TYPE_LEVEL_NACK)
return -EFAULT;
vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid);
if (IS_ERR(vm))
return PTR_ERR(vm);
/*
* TODO: Change to read lock? Using write lock for simplicity.
*/
down_write(&vm->lock);
if (xe_vm_is_closed(vm)) {
err = -ENOENT;
goto unlock_vm;
}
vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr);
if (!vma) {
err = -EINVAL;
goto unlock_vm;
}
if (xe_vma_read_only(vma) &&
pf->consumer.access_type != XE_PAGEFAULT_ACCESS_TYPE_READ) {
err = -EPERM;
goto unlock_vm;
}
atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type);
if (xe_vma_is_cpu_addr_mirror(vma))
err = xe_svm_handle_pagefault(vm, vma, gt,
pf->consumer.page_addr, atomic);
else
err = xe_pagefault_handle_vma(gt, vma, atomic);
unlock_vm:
if (!err)
vm->usm.last_fault_vma = vma;
up_write(&vm->lock);
xe_vm_put(vm);
return err;
}
static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
struct xe_pagefault *pf)
{
bool found_fault = false;
spin_lock_irq(&pf_queue->lock);
if (pf_queue->tail != pf_queue->head) {
memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf));
pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) %
pf_queue->size;
found_fault = true;
}
spin_unlock_irq(&pf_queue->lock);
return found_fault;
}
static void xe_pagefault_print(struct xe_pagefault *pf)
{
xe_gt_info(pf->gt, "\n\tASID: %d\n"
"\tFaulted Address: 0x%08x%08x\n"
"\tFaultType: %lu\n"
"\tAccessType: %lu\n"
"\tFaultLevel: %lu\n"
"\tEngineClass: %d %s\n"
"\tEngineInstance: %d\n",
pf->consumer.asid,
upper_32_bits(pf->consumer.page_addr),
lower_32_bits(pf->consumer.page_addr),
FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
pf->consumer.fault_type_level),
FIELD_GET(XE_PAGEFAULT_ACCESS_TYPE_MASK,
pf->consumer.access_type),
FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
pf->consumer.fault_type_level),
pf->consumer.engine_class,
xe_hw_engine_class_to_str(pf->consumer.engine_class),
pf->consumer.engine_instance);
}
static void xe_pagefault_save_to_vm(struct xe_device *xe, struct xe_pagefault *pf)
{
struct xe_vm *vm;
/*
* Pagefault may be asociated to VM that is not in fault mode.
* Perform asid_to_vm behavior, except if VM is not in fault
* mode, return VM anyways.
*/
down_read(&xe->usm.lock);
vm = xa_load(&xe->usm.asid_to_vm, pf->consumer.asid);
if (vm)
xe_vm_get(vm);
else
vm = ERR_PTR(-EINVAL);
up_read(&xe->usm.lock);
if (IS_ERR(vm))
return;
xe_vm_add_fault_entry_pf(vm, pf);
xe_vm_put(vm);
}
static void xe_pagefault_queue_work(struct work_struct *w)
{
struct xe_pagefault_queue *pf_queue =
container_of(w, typeof(*pf_queue), worker);
struct xe_pagefault pf;
unsigned long threshold;
#define USM_QUEUE_MAX_RUNTIME_MS 20
threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);
while (xe_pagefault_queue_pop(pf_queue, &pf)) {
int err;
if (!pf.gt) /* Fault squashed during reset */
continue;
err = xe_pagefault_service(&pf);
if (err) {
xe_pagefault_save_to_vm(gt_to_xe(pf.gt), &pf);
if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
xe_pagefault_print(&pf);
xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",
ERR_PTR(err));
} else {
xe_gt_stats_incr(pf.gt, XE_GT_STATS_ID_INVALID_PREFETCH_PAGEFAULT_COUNT, 1);
xe_gt_dbg(pf.gt, "Prefetch Fault response: Unsuccessful %pe\n",
ERR_PTR(err));
}
}
pf.producer.ops->ack_fault(&pf, err);
if (time_after(jiffies, threshold)) {
queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
break;
}
}
#undef USM_QUEUE_MAX_RUNTIME_MS
}
static int xe_pagefault_queue_init(struct xe_device *xe,
struct xe_pagefault_queue *pf_queue)
{
struct xe_gt *gt;
int total_num_eus = 0;
u8 id;
for_each_gt(gt, xe, id) {
xe_dss_mask_t all_dss;
int num_dss, num_eus;
num_dss = bitmap_weighted_or(all_dss, gt->fuse_topo.g_dss_mask,
gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS);
num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
XE_MAX_EU_FUSE_BITS) * num_dss;
total_num_eus += num_eus;
}
xe_assert(xe, total_num_eus);
/*
* user can issue separate page faults per EU and per CS
*
* XXX: Multiplier required as compute UMD are getting PF queue errors
* without it. Follow on why this multiplier is required.
*/
#define PF_MULTIPLIER 8
pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) *
xe_pagefault_entry_size() * PF_MULTIPLIER;
pf_queue->size = roundup_pow_of_two(pf_queue->size);
#undef PF_MULTIPLIER
drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u",
xe_pagefault_entry_size(), total_num_eus, pf_queue->size);
spin_lock_init(&pf_queue->lock);
INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);
pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
if (!pf_queue->data)
return -ENOMEM;
return 0;
}
static void xe_pagefault_fini(void *arg)
{
struct xe_device *xe = arg;
destroy_workqueue(xe->usm.pf_wq);
}
/**
* xe_pagefault_init() - Page fault init
* @xe: xe device instance
*
* Initialize Xe page fault state. Must be done after reading fuses.
*
* Return: 0 on Success, errno on failure
*/
int xe_pagefault_init(struct xe_device *xe)
{
int err, i;
if (!xe->info.has_usm)
return 0;
xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
WQ_UNBOUND | WQ_HIGHPRI,
XE_PAGEFAULT_QUEUE_COUNT);
if (!xe->usm.pf_wq)
return -ENOMEM;
for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
if (err)
goto err_out;
}
return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);
err_out:
destroy_workqueue(xe->usm.pf_wq);
return err;
}
static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
struct xe_pagefault_queue *pf_queue)
{
u32 i;
/* Driver load failure guard / USM not enabled guard */
if (!pf_queue->data)
return;
/* Squash all pending faults on the GT */
spin_lock_irq(&pf_queue->lock);
for (i = pf_queue->tail; i != pf_queue->head;
i = (i + xe_pagefault_entry_size()) % pf_queue->size) {
struct xe_pagefault *pf = pf_queue->data + i;
if (pf->gt == gt)
pf->gt = NULL;
}
spin_unlock_irq(&pf_queue->lock);
}
/**
* xe_pagefault_reset() - Page fault reset for a GT
* @xe: xe device instance
* @gt: GT being reset
*
* Reset the Xe page fault state for a GT; that is, squash any pending faults on
* the GT.
*/
void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
{
int i;
for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
}
static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
{
lockdep_assert_held(&pf_queue->lock);
return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <=
xe_pagefault_entry_size();
}
/**
* xe_pagefault_handler() - Page fault handler
* @xe: xe device instance
* @pf: Page fault
*
* Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to
* service it. Safe to be called from IRQ or process context. Reclaim safe.
*
* Return: 0 on success, errno on failure
*/
int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
{
struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue +
(pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
unsigned long flags;
bool full;
spin_lock_irqsave(&pf_queue->lock, flags);
full = xe_pagefault_queue_full(pf_queue);
if (!full) {
memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
pf_queue->size;
queue_work(xe->usm.pf_wq, &pf_queue->worker);
} else {
drm_warn(&xe->drm,
"PageFault Queue (%d) full, shouldn't be possible\n",
pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
}
spin_unlock_irqrestore(&pf_queue->lock, flags);
return full ? -ENOSPC : 0;
}